{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999398785546805, "eval_steps": 500, "global_step": 8316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.05084274, "auxiliary_loss_mlp": 0.02254236, "balance_loss_clip": 2.43166804, "balance_loss_mlp": 1.82355666, "epoch": 0.00012024289063909097, "flos": 24932483919360.0, "grad_norm": 41.035222361724514, "language_loss": 2.57854795, "learning_rate": 0.0, "loss": 1.90186608, "num_input_tokens_seen": 20375, "step": 1, "time_per_iteration": 16.504486322402954 }, { "auxiliary_loss_clip": 0.03335267, "auxiliary_loss_mlp": 0.01466756, "balance_loss_clip": 1.62033272, "balance_loss_mlp": 1.1838963, "epoch": 0.00024048578127818193, "flos": 30664624377600.0, "grad_norm": 54.81833791861545, "language_loss": 1.88862348, "learning_rate": 5.021476677069823e-07, "loss": 1.93664372, "num_input_tokens_seen": 39035, "step": 2, "time_per_iteration": 2.851646661758423 }, { "auxiliary_loss_clip": 0.03358685, "auxiliary_loss_mlp": 0.01431997, "balance_loss_clip": 1.6263535, "balance_loss_mlp": 1.15657592, "epoch": 0.0003607286719172729, "flos": 19026227969280.0, "grad_norm": 62.6136514731849, "language_loss": 1.61428607, "learning_rate": 7.958852231401551e-07, "loss": 1.66219282, "num_input_tokens_seen": 57600, "step": 3, "time_per_iteration": 2.5824859142303467 }, { "auxiliary_loss_clip": 0.03304018, "auxiliary_loss_mlp": 0.01431645, "balance_loss_clip": 1.61422896, "balance_loss_mlp": 1.17510688, "epoch": 0.00048097156255636386, "flos": 19316314206720.0, "grad_norm": 36.58156463625064, "language_loss": 1.64504576, "learning_rate": 1.0042953354139647e-06, "loss": 1.69240236, "num_input_tokens_seen": 76465, "step": 4, "time_per_iteration": 2.572399139404297 }, { "auxiliary_loss_clip": 0.03368428, "auxiliary_loss_mlp": 0.01464796, "balance_loss_clip": 1.61386895, "balance_loss_mlp": 1.20005631, "epoch": 0.0006012144531954548, "flos": 13991264893440.0, "grad_norm": 55.43643585899937, "language_loss": 1.93859792, "learning_rate": 1.1659507774310057e-06, "loss": 1.98693001, "num_input_tokens_seen": 94350, "step": 5, "time_per_iteration": 2.8182880878448486 }, { "auxiliary_loss_clip": 0.03384572, "auxiliary_loss_mlp": 0.01438663, "balance_loss_clip": 1.61456013, "balance_loss_mlp": 1.16133451, "epoch": 0.0007214573438345458, "flos": 23148988225920.0, "grad_norm": 44.676575760417485, "language_loss": 1.60970354, "learning_rate": 1.2980328908471373e-06, "loss": 1.65793586, "num_input_tokens_seen": 114595, "step": 6, "time_per_iteration": 2.9074580669403076 }, { "auxiliary_loss_clip": 0.03971822, "auxiliary_loss_mlp": 0.02046369, "balance_loss_clip": 1.62586808, "balance_loss_mlp": 1.80833173, "epoch": 0.0008417002344736367, "flos": 67663246170240.0, "grad_norm": 4.666591226554287, "language_loss": 0.81483674, "learning_rate": 1.4097067265369432e-06, "loss": 0.87501872, "num_input_tokens_seen": 179590, "step": 7, "time_per_iteration": 3.262604236602783 }, { "auxiliary_loss_clip": 0.03325577, "auxiliary_loss_mlp": 0.01492432, "balance_loss_clip": 1.60716343, "balance_loss_mlp": 1.20690179, "epoch": 0.0009619431251127277, "flos": 21281381504640.0, "grad_norm": 40.94020928945012, "language_loss": 1.58690071, "learning_rate": 1.506443003120947e-06, "loss": 1.63508081, "num_input_tokens_seen": 195090, "step": 8, "time_per_iteration": 2.847315788269043 }, { "auxiliary_loss_clip": 0.03325803, "auxiliary_loss_mlp": 0.01471258, "balance_loss_clip": 1.61823845, "balance_loss_mlp": 1.2049917, "epoch": 0.0010821860157518186, "flos": 23331342597120.0, "grad_norm": 17.669327825642622, "language_loss": 1.47570503, "learning_rate": 1.5917704462803102e-06, "loss": 1.52367568, "num_input_tokens_seen": 211635, "step": 9, "time_per_iteration": 2.8853163719177246 }, { "auxiliary_loss_clip": 0.03281426, "auxiliary_loss_mlp": 0.01447367, "balance_loss_clip": 1.62637532, "balance_loss_mlp": 1.18396187, "epoch": 0.0012024289063909096, "flos": 17010166337280.0, "grad_norm": 31.808818334857744, "language_loss": 1.53146791, "learning_rate": 1.6680984451379884e-06, "loss": 1.57875597, "num_input_tokens_seen": 224705, "step": 10, "time_per_iteration": 2.7462384700775146 }, { "auxiliary_loss_clip": 0.03269312, "auxiliary_loss_mlp": 0.01438452, "balance_loss_clip": 1.6132493, "balance_loss_mlp": 1.17638206, "epoch": 0.0013226717970300007, "flos": 21288133261440.0, "grad_norm": 13.861819211108713, "language_loss": 1.32816601, "learning_rate": 1.7371455188905097e-06, "loss": 1.37524366, "num_input_tokens_seen": 244635, "step": 11, "time_per_iteration": 2.8847248554229736 }, { "auxiliary_loss_clip": 0.03289533, "auxiliary_loss_mlp": 0.01433641, "balance_loss_clip": 1.60499752, "balance_loss_mlp": 1.16394162, "epoch": 0.0014429146876690916, "flos": 27237884935680.0, "grad_norm": 12.835011398445856, "language_loss": 1.25528884, "learning_rate": 1.8001805585541196e-06, "loss": 1.30252051, "num_input_tokens_seen": 265765, "step": 12, "time_per_iteration": 2.913663148880005 }, { "auxiliary_loss_clip": 0.03257225, "auxiliary_loss_mlp": 0.01407508, "balance_loss_clip": 1.60728574, "balance_loss_mlp": 1.15306783, "epoch": 0.0015631575783081825, "flos": 19062174504960.0, "grad_norm": 6.879521066989549, "language_loss": 1.29708052, "learning_rate": 1.8581671739548328e-06, "loss": 1.34372783, "num_input_tokens_seen": 283500, "step": 13, "time_per_iteration": 2.8478751182556152 }, { "auxiliary_loss_clip": 0.03257836, "auxiliary_loss_mlp": 0.01423658, "balance_loss_clip": 1.60812759, "balance_loss_mlp": 1.15281415, "epoch": 0.0016834004689472734, "flos": 48139473985920.0, "grad_norm": 6.352374508626791, "language_loss": 1.1404264, "learning_rate": 1.9118543942439254e-06, "loss": 1.18724132, "num_input_tokens_seen": 305685, "step": 14, "time_per_iteration": 5.091363906860352 }, { "auxiliary_loss_clip": 0.03235105, "auxiliary_loss_mlp": 0.01420053, "balance_loss_clip": 1.61689138, "balance_loss_mlp": 1.1558851, "epoch": 0.0018036433595863645, "flos": 34970026314240.0, "grad_norm": 5.645105787676996, "language_loss": 1.12964833, "learning_rate": 1.961836000571161e-06, "loss": 1.17619991, "num_input_tokens_seen": 327340, "step": 15, "time_per_iteration": 3.9750008583068848 }, { "auxiliary_loss_clip": 0.03620182, "auxiliary_loss_mlp": 0.01721494, "balance_loss_clip": 1.6278646, "balance_loss_mlp": 1.52923274, "epoch": 0.0019238862502254555, "flos": 59768284440960.0, "grad_norm": 3.83599289912613, "language_loss": 0.64644313, "learning_rate": 2.0085906708279293e-06, "loss": 0.69985992, "num_input_tokens_seen": 382710, "step": 16, "time_per_iteration": 3.27301025390625 }, { "auxiliary_loss_clip": 0.03127777, "auxiliary_loss_mlp": 0.01410646, "balance_loss_clip": 1.60285556, "balance_loss_mlp": 1.1665051, "epoch": 0.0020441291408645466, "flos": 20814543417600.0, "grad_norm": 4.543759575932944, "language_loss": 1.16360712, "learning_rate": 2.0525099325728135e-06, "loss": 1.20899129, "num_input_tokens_seen": 400890, "step": 17, "time_per_iteration": 2.9307589530944824 }, { "auxiliary_loss_clip": 0.03479199, "auxiliary_loss_mlp": 0.01591912, "balance_loss_clip": 1.63058305, "balance_loss_mlp": 1.41643572, "epoch": 0.0021643720315036373, "flos": 63857001582720.0, "grad_norm": 3.6002187975815487, "language_loss": 0.72157103, "learning_rate": 2.0939181139872922e-06, "loss": 0.77228218, "num_input_tokens_seen": 462605, "step": 18, "time_per_iteration": 3.3329527378082275 }, { "auxiliary_loss_clip": 0.03136248, "auxiliary_loss_mlp": 0.01386216, "balance_loss_clip": 1.59969115, "balance_loss_mlp": 1.13902378, "epoch": 0.0022846149221427284, "flos": 31284981192960.0, "grad_norm": 5.056231311387775, "language_loss": 1.01778054, "learning_rate": 2.1330868934640175e-06, "loss": 1.06300509, "num_input_tokens_seen": 483280, "step": 19, "time_per_iteration": 2.9027557373046875 }, { "auxiliary_loss_clip": 0.03299525, "auxiliary_loss_mlp": 0.01471595, "balance_loss_clip": 1.63628852, "balance_loss_mlp": 1.31442964, "epoch": 0.002404857812781819, "flos": 51083648161920.0, "grad_norm": 3.614105543847463, "language_loss": 0.7646513, "learning_rate": 2.170246112844971e-06, "loss": 0.81236255, "num_input_tokens_seen": 537620, "step": 20, "time_per_iteration": 3.0472071170806885 }, { "auxiliary_loss_clip": 0.03049564, "auxiliary_loss_mlp": 0.01349237, "balance_loss_clip": 1.60200858, "balance_loss_mlp": 1.12130916, "epoch": 0.0025251007034209102, "flos": 15815347309440.0, "grad_norm": 4.456530861809907, "language_loss": 1.01824117, "learning_rate": 2.2055919496770983e-06, "loss": 1.06222916, "num_input_tokens_seen": 555760, "step": 21, "time_per_iteration": 2.84885835647583 }, { "auxiliary_loss_clip": 0.03017215, "auxiliary_loss_mlp": 0.01372378, "balance_loss_clip": 1.5853672, "balance_loss_mlp": 1.13710666, "epoch": 0.0026453435940600014, "flos": 37851857458560.0, "grad_norm": 3.605010044608876, "language_loss": 0.89627987, "learning_rate": 2.2392931865974923e-06, "loss": 0.94017589, "num_input_tokens_seen": 578450, "step": 22, "time_per_iteration": 2.935797691345215 }, { "auxiliary_loss_clip": 0.0290978, "auxiliary_loss_mlp": 0.01349508, "balance_loss_clip": 1.57212663, "balance_loss_mlp": 1.12367773, "epoch": 0.002765586484699092, "flos": 21141976821120.0, "grad_norm": 4.237647755646384, "language_loss": 1.01970434, "learning_rate": 2.271496085962064e-06, "loss": 1.06229722, "num_input_tokens_seen": 596145, "step": 23, "time_per_iteration": 2.902651309967041 }, { "auxiliary_loss_clip": 0.02875604, "auxiliary_loss_mlp": 0.01318482, "balance_loss_clip": 1.56570625, "balance_loss_mlp": 1.11220229, "epoch": 0.002885829375338183, "flos": 20667381396480.0, "grad_norm": 3.236563185420304, "language_loss": 1.02802539, "learning_rate": 2.3023282262611022e-06, "loss": 1.06996608, "num_input_tokens_seen": 614920, "step": 24, "time_per_iteration": 3.1192309856414795 }, { "auxiliary_loss_clip": 0.02843272, "auxiliary_loss_mlp": 0.01339698, "balance_loss_clip": 1.56808627, "balance_loss_mlp": 1.13494372, "epoch": 0.003006072265977274, "flos": 34823869873920.0, "grad_norm": 3.0266340604026856, "language_loss": 0.92584801, "learning_rate": 2.3319015548620114e-06, "loss": 0.96767771, "num_input_tokens_seen": 636060, "step": 25, "time_per_iteration": 3.0139873027801514 }, { "auxiliary_loss_clip": 0.027992, "auxiliary_loss_mlp": 0.01318286, "balance_loss_clip": 1.56671047, "balance_loss_mlp": 1.12535763, "epoch": 0.003126315156616365, "flos": 24422021118720.0, "grad_norm": 2.152138444843957, "language_loss": 0.92741185, "learning_rate": 2.3603148416618152e-06, "loss": 0.9685868, "num_input_tokens_seen": 655575, "step": 26, "time_per_iteration": 2.7991855144500732 }, { "auxiliary_loss_clip": 0.0271718, "auxiliary_loss_mlp": 0.01308848, "balance_loss_clip": 1.55494106, "balance_loss_mlp": 1.12278628, "epoch": 0.003246558047255456, "flos": 23622326674560.0, "grad_norm": 2.5680702486932527, "language_loss": 1.00742495, "learning_rate": 2.3876556694204647e-06, "loss": 1.04768527, "num_input_tokens_seen": 675730, "step": 27, "time_per_iteration": 2.900230646133423 }, { "auxiliary_loss_clip": 0.02674927, "auxiliary_loss_mlp": 0.01319954, "balance_loss_clip": 1.53854811, "balance_loss_mlp": 1.12826586, "epoch": 0.003366800937894547, "flos": 17820275725440.0, "grad_norm": 2.4417946207158256, "language_loss": 0.90716541, "learning_rate": 2.414002061950908e-06, "loss": 0.94711423, "num_input_tokens_seen": 694605, "step": 28, "time_per_iteration": 2.8757548332214355 }, { "auxiliary_loss_clip": 0.02669568, "auxiliary_loss_mlp": 0.01313851, "balance_loss_clip": 1.53960049, "balance_loss_mlp": 1.13045943, "epoch": 0.003487043828533638, "flos": 24426115269120.0, "grad_norm": 2.263848833436222, "language_loss": 0.99730211, "learning_rate": 2.4394238264681557e-06, "loss": 1.0371362, "num_input_tokens_seen": 714340, "step": 29, "time_per_iteration": 2.9203038215637207 }, { "auxiliary_loss_clip": 0.0260199, "auxiliary_loss_mlp": 0.01286828, "balance_loss_clip": 1.52738476, "balance_loss_mlp": 1.11612034, "epoch": 0.003607286719172729, "flos": 26140311002880.0, "grad_norm": 2.209095334594582, "language_loss": 0.99374574, "learning_rate": 2.4639836682781433e-06, "loss": 1.03263402, "num_input_tokens_seen": 734470, "step": 30, "time_per_iteration": 2.928110361099243 }, { "auxiliary_loss_clip": 0.02542466, "auxiliary_loss_mlp": 0.01333146, "balance_loss_clip": 1.51784396, "balance_loss_mlp": 1.16568041, "epoch": 0.00372752960981182, "flos": 20593082113920.0, "grad_norm": 4.250415677151068, "language_loss": 1.00061333, "learning_rate": 2.487738122623307e-06, "loss": 1.03936934, "num_input_tokens_seen": 753380, "step": 31, "time_per_iteration": 2.898010015487671 }, { "auxiliary_loss_clip": 0.02507872, "auxiliary_loss_mlp": 0.012824, "balance_loss_clip": 1.49890924, "balance_loss_mlp": 1.11731875, "epoch": 0.003847772500450911, "flos": 22674608282880.0, "grad_norm": 3.6140494412754522, "language_loss": 0.9893018, "learning_rate": 2.510738338534912e-06, "loss": 1.02720451, "num_input_tokens_seen": 772105, "step": 32, "time_per_iteration": 2.8764238357543945 }, { "auxiliary_loss_clip": 0.02419353, "auxiliary_loss_mlp": 0.01320894, "balance_loss_clip": 1.4832859, "balance_loss_mlp": 1.15295208, "epoch": 0.003968015391090002, "flos": 17967796882560.0, "grad_norm": 2.7590489081104512, "language_loss": 1.02513504, "learning_rate": 2.5330307420306648e-06, "loss": 1.06253743, "num_input_tokens_seen": 788955, "step": 33, "time_per_iteration": 2.898483991622925 }, { "auxiliary_loss_clip": 0.02370646, "auxiliary_loss_mlp": 0.01272005, "balance_loss_clip": 1.47986639, "balance_loss_mlp": 1.11646056, "epoch": 0.004088258281729093, "flos": 27304103658240.0, "grad_norm": 2.1238401713597397, "language_loss": 0.88053989, "learning_rate": 2.554657600279796e-06, "loss": 0.91696638, "num_input_tokens_seen": 810230, "step": 34, "time_per_iteration": 2.960326910018921 }, { "auxiliary_loss_clip": 0.02334296, "auxiliary_loss_mlp": 0.01319808, "balance_loss_clip": 1.47286642, "balance_loss_mlp": 1.17074871, "epoch": 0.004208501172368184, "flos": 23258587599360.0, "grad_norm": 3.0105322498400393, "language_loss": 1.03399789, "learning_rate": 2.5756575039679493e-06, "loss": 1.070539, "num_input_tokens_seen": 829780, "step": 35, "time_per_iteration": 2.915315866470337 }, { "auxiliary_loss_clip": 0.02265946, "auxiliary_loss_mlp": 0.01282236, "balance_loss_clip": 1.4507575, "balance_loss_mlp": 1.14376283, "epoch": 0.0043287440630072746, "flos": 17312104062720.0, "grad_norm": 1.9271726088564434, "language_loss": 0.95061779, "learning_rate": 2.5960657816942747e-06, "loss": 0.98609966, "num_input_tokens_seen": 848695, "step": 36, "time_per_iteration": 2.7901387214660645 }, { "auxiliary_loss_clip": 0.02070546, "auxiliary_loss_mlp": 0.01669971, "balance_loss_clip": 1.58071017, "balance_loss_mlp": 1.61465836, "epoch": 0.004448986953646365, "flos": 53092491160320.0, "grad_norm": 1.4046274014691424, "language_loss": 0.6099329, "learning_rate": 2.6159148575788668e-06, "loss": 0.64733809, "num_input_tokens_seen": 906730, "step": 37, "time_per_iteration": 3.241480588912964 }, { "auxiliary_loss_clip": 0.02213789, "auxiliary_loss_mlp": 0.01313267, "balance_loss_clip": 1.43542302, "balance_loss_mlp": 1.17407823, "epoch": 0.004569229844285457, "flos": 13444165866240.0, "grad_norm": 2.6430328468858217, "language_loss": 0.98691249, "learning_rate": 2.635234561171e-06, "loss": 1.02218294, "num_input_tokens_seen": 925125, "step": 38, "time_per_iteration": 2.752657890319824 }, { "auxiliary_loss_clip": 0.02152026, "auxiliary_loss_mlp": 0.01241155, "balance_loss_clip": 1.42553687, "balance_loss_mlp": 1.12657106, "epoch": 0.0046894727349245475, "flos": 16209609966720.0, "grad_norm": 2.1681304221458895, "language_loss": 0.94087696, "learning_rate": 2.6540523970949877e-06, "loss": 0.97480875, "num_input_tokens_seen": 939970, "step": 39, "time_per_iteration": 2.9437460899353027 }, { "auxiliary_loss_clip": 0.021422, "auxiliary_loss_mlp": 0.01274059, "balance_loss_clip": 1.42657876, "balance_loss_mlp": 1.15995169, "epoch": 0.004809715625563638, "flos": 23914244505600.0, "grad_norm": 3.6366124118145824, "language_loss": 0.92239946, "learning_rate": 2.6723937805519533e-06, "loss": 0.95656204, "num_input_tokens_seen": 957470, "step": 40, "time_per_iteration": 3.8722739219665527 }, { "auxiliary_loss_clip": 0.02129783, "auxiliary_loss_mlp": 0.01256825, "balance_loss_clip": 1.41601241, "balance_loss_mlp": 1.14095342, "epoch": 0.00492995851620273, "flos": 20773030273920.0, "grad_norm": 3.108383559292718, "language_loss": 0.92929631, "learning_rate": 2.690282243737839e-06, "loss": 0.96316248, "num_input_tokens_seen": 976405, "step": 41, "time_per_iteration": 4.631072998046875 }, { "auxiliary_loss_clip": 0.02094824, "auxiliary_loss_mlp": 0.01242514, "balance_loss_clip": 1.40109873, "balance_loss_mlp": 1.12721503, "epoch": 0.0050502014068418205, "flos": 20338655103360.0, "grad_norm": 4.147612017853385, "language_loss": 0.99205863, "learning_rate": 2.7077396173840807e-06, "loss": 1.02543211, "num_input_tokens_seen": 994690, "step": 42, "time_per_iteration": 2.7396128177642822 }, { "auxiliary_loss_clip": 0.02043083, "auxiliary_loss_mlp": 0.01236164, "balance_loss_clip": 1.3936727, "balance_loss_mlp": 1.13249993, "epoch": 0.005170444297480911, "flos": 25994872834560.0, "grad_norm": 3.4184737559657763, "language_loss": 0.92788184, "learning_rate": 2.7247861909342594e-06, "loss": 0.96067429, "num_input_tokens_seen": 1015615, "step": 43, "time_per_iteration": 2.8920655250549316 }, { "auxiliary_loss_clip": 0.02024822, "auxiliary_loss_mlp": 0.01234756, "balance_loss_clip": 1.39083123, "balance_loss_mlp": 1.13767171, "epoch": 0.005290687188120003, "flos": 20954055841920.0, "grad_norm": 2.5764979142547837, "language_loss": 0.83133996, "learning_rate": 2.7414408543044743e-06, "loss": 0.86393571, "num_input_tokens_seen": 1031255, "step": 44, "time_per_iteration": 2.7814130783081055 }, { "auxiliary_loss_clip": 0.02028156, "auxiliary_loss_mlp": 0.01252775, "balance_loss_clip": 1.38848913, "balance_loss_mlp": 1.156883, "epoch": 0.005410930078759093, "flos": 15851401585920.0, "grad_norm": 7.70277680582527, "language_loss": 0.79361689, "learning_rate": 2.7577212237113157e-06, "loss": 0.82642615, "num_input_tokens_seen": 1048295, "step": 45, "time_per_iteration": 2.9958226680755615 }, { "auxiliary_loss_clip": 0.01987815, "auxiliary_loss_mlp": 0.01232928, "balance_loss_clip": 1.38572145, "balance_loss_mlp": 1.14065981, "epoch": 0.005531172969398184, "flos": 21104988791040.0, "grad_norm": 2.1786450099359427, "language_loss": 1.04210234, "learning_rate": 2.7736437536690466e-06, "loss": 1.07430971, "num_input_tokens_seen": 1067925, "step": 46, "time_per_iteration": 2.8643085956573486 }, { "auxiliary_loss_clip": 0.01989643, "auxiliary_loss_mlp": 0.01250456, "balance_loss_clip": 1.37670767, "balance_loss_mlp": 1.15823603, "epoch": 0.005651415860037276, "flos": 20844887431680.0, "grad_norm": 2.0461861412179037, "language_loss": 1.07959032, "learning_rate": 2.789223836941131e-06, "loss": 1.11199129, "num_input_tokens_seen": 1088060, "step": 47, "time_per_iteration": 2.8597664833068848 }, { "auxiliary_loss_clip": 0.01951419, "auxiliary_loss_mlp": 0.01246963, "balance_loss_clip": 1.36866188, "balance_loss_mlp": 1.15617335, "epoch": 0.005771658750676366, "flos": 13260195383040.0, "grad_norm": 2.4670111321824124, "language_loss": 1.08802879, "learning_rate": 2.8044758939680847e-06, "loss": 1.12001252, "num_input_tokens_seen": 1104130, "step": 48, "time_per_iteration": 2.7791342735290527 }, { "auxiliary_loss_clip": 0.01940159, "auxiliary_loss_mlp": 0.01224888, "balance_loss_clip": 1.37131596, "balance_loss_mlp": 1.14053535, "epoch": 0.005891901641315457, "flos": 24425396997120.0, "grad_norm": 2.707405011373264, "language_loss": 1.02092004, "learning_rate": 2.8194134530738863e-06, "loss": 1.05257058, "num_input_tokens_seen": 1122900, "step": 49, "time_per_iteration": 2.757429599761963 }, { "auxiliary_loss_clip": 0.01917329, "auxiliary_loss_mlp": 0.01243547, "balance_loss_clip": 1.36122715, "balance_loss_mlp": 1.16534531, "epoch": 0.006012144531954548, "flos": 23076197314560.0, "grad_norm": 2.897336187082976, "language_loss": 0.90240735, "learning_rate": 2.834049222568994e-06, "loss": 0.93401611, "num_input_tokens_seen": 1140250, "step": 50, "time_per_iteration": 2.8517069816589355 }, { "auxiliary_loss_clip": 0.01915676, "auxiliary_loss_mlp": 0.01204528, "balance_loss_clip": 1.35753834, "balance_loss_mlp": 1.12761474, "epoch": 0.006132387422593639, "flos": 22528775064960.0, "grad_norm": 2.3910706310221035, "language_loss": 0.92570543, "learning_rate": 2.848395155712969e-06, "loss": 0.95690751, "num_input_tokens_seen": 1160470, "step": 51, "time_per_iteration": 2.767319440841675 }, { "auxiliary_loss_clip": 0.01910232, "auxiliary_loss_mlp": 0.01200104, "balance_loss_clip": 1.36269033, "balance_loss_mlp": 1.12428689, "epoch": 0.00625263031323273, "flos": 27628340751360.0, "grad_norm": 2.3157634998475434, "language_loss": 0.97874039, "learning_rate": 2.8624625093687977e-06, "loss": 1.00984383, "num_input_tokens_seen": 1177605, "step": 52, "time_per_iteration": 2.8875205516815186 }, { "auxiliary_loss_clip": 0.01886809, "auxiliary_loss_mlp": 0.01192955, "balance_loss_clip": 1.34572816, "balance_loss_mlp": 1.11551666, "epoch": 0.006372873203871821, "flos": 23110671392640.0, "grad_norm": 2.0281964108903465, "language_loss": 0.89101601, "learning_rate": 2.876261897070029e-06, "loss": 0.92181373, "num_input_tokens_seen": 1197735, "step": 53, "time_per_iteration": 2.7452077865600586 }, { "auxiliary_loss_clip": 0.0189057, "auxiliary_loss_mlp": 0.0117983, "balance_loss_clip": 1.34766769, "balance_loss_mlp": 1.10591984, "epoch": 0.006493116094510912, "flos": 22856028900480.0, "grad_norm": 2.446093441390119, "language_loss": 0.92533863, "learning_rate": 2.889803337127447e-06, "loss": 0.95604265, "num_input_tokens_seen": 1216335, "step": 54, "time_per_iteration": 2.8024685382843018 }, { "auxiliary_loss_clip": 0.0188708, "auxiliary_loss_mlp": 0.01175438, "balance_loss_clip": 1.34748721, "balance_loss_mlp": 1.09685552, "epoch": 0.006613358985150003, "flos": 23071708114560.0, "grad_norm": 2.33745715683638, "language_loss": 0.84688163, "learning_rate": 2.903096296321516e-06, "loss": 0.87750673, "num_input_tokens_seen": 1234480, "step": 55, "time_per_iteration": 2.8023979663848877 }, { "auxiliary_loss_clip": 0.01853927, "auxiliary_loss_mlp": 0.01199806, "balance_loss_clip": 1.33698785, "balance_loss_mlp": 1.12284482, "epoch": 0.006733601875789094, "flos": 26537662229760.0, "grad_norm": 1.9782388256378645, "language_loss": 0.91572773, "learning_rate": 2.9161497296578907e-06, "loss": 0.9462651, "num_input_tokens_seen": 1253870, "step": 56, "time_per_iteration": 2.8221585750579834 }, { "auxiliary_loss_clip": 0.01847313, "auxiliary_loss_mlp": 0.0117828, "balance_loss_clip": 1.32913947, "balance_loss_mlp": 1.10208106, "epoch": 0.006853844766428185, "flos": 15523178083200.0, "grad_norm": 2.2980294843603097, "language_loss": 0.86154032, "learning_rate": 2.928972116604173e-06, "loss": 0.89179623, "num_input_tokens_seen": 1270145, "step": 57, "time_per_iteration": 2.8803305625915527 }, { "auxiliary_loss_clip": 0.01832548, "auxiliary_loss_mlp": 0.01146651, "balance_loss_clip": 1.32523549, "balance_loss_mlp": 1.07321763, "epoch": 0.006974087657067276, "flos": 24243760897920.0, "grad_norm": 2.2321688098214185, "language_loss": 1.02140403, "learning_rate": 2.9415714941751377e-06, "loss": 1.05119598, "num_input_tokens_seen": 1291365, "step": 58, "time_per_iteration": 2.759227752685547 }, { "auxiliary_loss_clip": 0.01824828, "auxiliary_loss_mlp": 0.0115526, "balance_loss_clip": 1.32039309, "balance_loss_mlp": 1.08082545, "epoch": 0.007094330547706367, "flos": 25772513690880.0, "grad_norm": 6.65063640845979, "language_loss": 0.93592507, "learning_rate": 2.9539554871897396e-06, "loss": 0.9657259, "num_input_tokens_seen": 1311535, "step": 59, "time_per_iteration": 2.876073122024536 }, { "auxiliary_loss_clip": 0.01817741, "auxiliary_loss_mlp": 0.01170666, "balance_loss_clip": 1.32258487, "balance_loss_mlp": 1.10114312, "epoch": 0.007214573438345458, "flos": 21319015979520.0, "grad_norm": 1.9735425732014875, "language_loss": 0.97401845, "learning_rate": 2.9661313359851253e-06, "loss": 1.00390255, "num_input_tokens_seen": 1329420, "step": 60, "time_per_iteration": 2.730991840362549 }, { "auxiliary_loss_clip": 0.01821031, "auxiliary_loss_mlp": 0.01155022, "balance_loss_clip": 1.32134795, "balance_loss_mlp": 1.0838778, "epoch": 0.007334816328984549, "flos": 24937088192640.0, "grad_norm": 2.0123991453314685, "language_loss": 0.93950438, "learning_rate": 2.978105921839922e-06, "loss": 0.96926498, "num_input_tokens_seen": 1349965, "step": 61, "time_per_iteration": 2.8542513847351074 }, { "auxiliary_loss_clip": 0.01807913, "auxiliary_loss_mlp": 0.01180042, "balance_loss_clip": 1.31576252, "balance_loss_mlp": 1.10703826, "epoch": 0.00745505921962364, "flos": 18510586277760.0, "grad_norm": 2.933304325943544, "language_loss": 0.72181046, "learning_rate": 2.9898857903302893e-06, "loss": 0.75169003, "num_input_tokens_seen": 1368915, "step": 62, "time_per_iteration": 2.8001906871795654 }, { "auxiliary_loss_clip": 0.01814594, "auxiliary_loss_mlp": 0.01181382, "balance_loss_clip": 1.31941438, "balance_loss_mlp": 1.11295581, "epoch": 0.007575302110262731, "flos": 18477656484480.0, "grad_norm": 3.3240644690357817, "language_loss": 0.87861896, "learning_rate": 3.001477172817253e-06, "loss": 0.90857875, "num_input_tokens_seen": 1386805, "step": 63, "time_per_iteration": 2.751695156097412 }, { "auxiliary_loss_clip": 0.01793206, "auxiliary_loss_mlp": 0.01175256, "balance_loss_clip": 1.31216371, "balance_loss_mlp": 1.10616255, "epoch": 0.007695545000901822, "flos": 24973178382720.0, "grad_norm": 2.674589923509647, "language_loss": 0.96264207, "learning_rate": 3.012886006241894e-06, "loss": 0.99232668, "num_input_tokens_seen": 1406190, "step": 64, "time_per_iteration": 2.8714640140533447 }, { "auxiliary_loss_clip": 0.01784283, "auxiliary_loss_mlp": 0.01162142, "balance_loss_clip": 1.30505097, "balance_loss_mlp": 1.09128451, "epoch": 0.007815787891540913, "flos": 21324223451520.0, "grad_norm": 2.004198919056353, "language_loss": 0.88107461, "learning_rate": 3.0241179513858383e-06, "loss": 0.91053885, "num_input_tokens_seen": 1425500, "step": 65, "time_per_iteration": 2.776402711868286 }, { "auxiliary_loss_clip": 0.0177837, "auxiliary_loss_mlp": 0.01156688, "balance_loss_clip": 1.30163693, "balance_loss_mlp": 1.08444726, "epoch": 0.007936030782180003, "flos": 21575777374080.0, "grad_norm": 4.743545553181586, "language_loss": 0.87936246, "learning_rate": 3.035178409737647e-06, "loss": 0.90871298, "num_input_tokens_seen": 1442950, "step": 66, "time_per_iteration": 3.7188055515289307 }, { "auxiliary_loss_clip": 0.01758108, "auxiliary_loss_mlp": 0.01133607, "balance_loss_clip": 1.2895416, "balance_loss_mlp": 1.06746984, "epoch": 0.008056273672819095, "flos": 20120785159680.0, "grad_norm": 2.991854226992496, "language_loss": 0.88594723, "learning_rate": 3.046072539090907e-06, "loss": 0.91486448, "num_input_tokens_seen": 1460915, "step": 67, "time_per_iteration": 2.787823438644409 }, { "auxiliary_loss_clip": 0.01763524, "auxiliary_loss_mlp": 0.01147381, "balance_loss_clip": 1.28915238, "balance_loss_mlp": 1.07499671, "epoch": 0.008176516563458186, "flos": 18333116156160.0, "grad_norm": 2.995905257531014, "language_loss": 1.04675806, "learning_rate": 3.056805267986779e-06, "loss": 1.07586706, "num_input_tokens_seen": 1478385, "step": 68, "time_per_iteration": 3.679590940475464 }, { "auxiliary_loss_clip": 0.01759212, "auxiliary_loss_mlp": 0.01152511, "balance_loss_clip": 1.29314399, "balance_loss_mlp": 1.08150995, "epoch": 0.008296759454097276, "flos": 21872076664320.0, "grad_norm": 3.1060327013887252, "language_loss": 0.95242345, "learning_rate": 3.0673813091022194e-06, "loss": 0.98154068, "num_input_tokens_seen": 1497605, "step": 69, "time_per_iteration": 2.8308873176574707 }, { "auxiliary_loss_clip": 0.01889812, "auxiliary_loss_mlp": 0.0147558, "balance_loss_clip": 1.46816754, "balance_loss_mlp": 1.4191227, "epoch": 0.008417002344736368, "flos": 63408228036480.0, "grad_norm": 1.3497087895668647, "language_loss": 0.62161648, "learning_rate": 3.0778051716749317e-06, "loss": 0.65527046, "num_input_tokens_seen": 1561150, "step": 70, "time_per_iteration": 3.3654112815856934 }, { "auxiliary_loss_clip": 0.01744628, "auxiliary_loss_mlp": 0.01164401, "balance_loss_clip": 1.28471661, "balance_loss_mlp": 1.0946877, "epoch": 0.008537245235375458, "flos": 22966454286720.0, "grad_norm": 2.944399830722267, "language_loss": 0.90495431, "learning_rate": 3.0880811730470094e-06, "loss": 0.9340446, "num_input_tokens_seen": 1580605, "step": 71, "time_per_iteration": 2.801382303237915 }, { "auxiliary_loss_clip": 0.01830504, "auxiliary_loss_mlp": 0.01341405, "balance_loss_clip": 1.45409942, "balance_loss_mlp": 1.28189564, "epoch": 0.008657488126014549, "flos": 61984046712960.0, "grad_norm": 1.2360273299777773, "language_loss": 0.58665597, "learning_rate": 3.098213449401257e-06, "loss": 0.61837506, "num_input_tokens_seen": 1647535, "step": 72, "time_per_iteration": 3.2549498081207275 }, { "auxiliary_loss_clip": 0.01733719, "auxiliary_loss_mlp": 0.0114991, "balance_loss_clip": 1.2710439, "balance_loss_mlp": 1.07833719, "epoch": 0.00877773101665364, "flos": 30296791152000.0, "grad_norm": 2.33549699187498, "language_loss": 0.9903354, "learning_rate": 3.1082059657570015e-06, "loss": 1.01917171, "num_input_tokens_seen": 1666770, "step": 73, "time_per_iteration": 2.9593331813812256 }, { "auxiliary_loss_clip": 0.0172517, "auxiliary_loss_mlp": 0.01146188, "balance_loss_clip": 1.27122092, "balance_loss_mlp": 1.07170594, "epoch": 0.00889797390729273, "flos": 23514056104320.0, "grad_norm": 2.177419390526778, "language_loss": 0.96642154, "learning_rate": 3.1180625252858496e-06, "loss": 0.99513507, "num_input_tokens_seen": 1685200, "step": 74, "time_per_iteration": 2.862757921218872 }, { "auxiliary_loss_clip": 0.01720821, "auxiliary_loss_mlp": 0.01169379, "balance_loss_clip": 1.26902127, "balance_loss_mlp": 1.09909344, "epoch": 0.009018216797931822, "flos": 23075838178560.0, "grad_norm": 2.6976722691736597, "language_loss": 0.80077773, "learning_rate": 3.1277867780021663e-06, "loss": 0.82967973, "num_input_tokens_seen": 1701835, "step": 75, "time_per_iteration": 2.8432583808898926 }, { "auxiliary_loss_clip": 0.01712036, "auxiliary_loss_mlp": 0.01180633, "balance_loss_clip": 1.26328814, "balance_loss_mlp": 1.10967982, "epoch": 0.009138459688570914, "flos": 15918877284480.0, "grad_norm": 1.9737132892412041, "language_loss": 0.95597613, "learning_rate": 3.1373822288779824e-06, "loss": 0.9849028, "num_input_tokens_seen": 1718415, "step": 76, "time_per_iteration": 2.836852788925171 }, { "auxiliary_loss_clip": 0.01725577, "auxiliary_loss_mlp": 0.01204886, "balance_loss_clip": 1.26497865, "balance_loss_mlp": 1.13135815, "epoch": 0.009258702579210003, "flos": 27016531372800.0, "grad_norm": 1.824315950722705, "language_loss": 0.79585528, "learning_rate": 3.1468522454274533e-06, "loss": 0.82515991, "num_input_tokens_seen": 1738770, "step": 77, "time_per_iteration": 3.075599193572998 }, { "auxiliary_loss_clip": 0.01710267, "auxiliary_loss_mlp": 0.01189526, "balance_loss_clip": 1.26535118, "balance_loss_mlp": 1.11604595, "epoch": 0.009378945469849095, "flos": 26903196984960.0, "grad_norm": 2.2110338671384553, "language_loss": 0.91798109, "learning_rate": 3.15620006480197e-06, "loss": 0.94697905, "num_input_tokens_seen": 1758040, "step": 78, "time_per_iteration": 2.823390007019043 }, { "auxiliary_loss_clip": 0.0172489, "auxiliary_loss_mlp": 0.01204158, "balance_loss_clip": 1.26378787, "balance_loss_mlp": 1.12943769, "epoch": 0.009499188360488187, "flos": 35694236327040.0, "grad_norm": 3.008564166277036, "language_loss": 0.74929547, "learning_rate": 3.1654288004333087e-06, "loss": 0.77858597, "num_input_tokens_seen": 1776705, "step": 79, "time_per_iteration": 2.9955408573150635 }, { "auxiliary_loss_clip": 0.01688551, "auxiliary_loss_mlp": 0.01163619, "balance_loss_clip": 1.25702953, "balance_loss_mlp": 1.08823097, "epoch": 0.009619431251127276, "flos": 21503201944320.0, "grad_norm": 2.5260453524938797, "language_loss": 0.7571671, "learning_rate": 3.1745414482589353e-06, "loss": 0.78568876, "num_input_tokens_seen": 1795915, "step": 80, "time_per_iteration": 2.7739596366882324 }, { "auxiliary_loss_clip": 0.016757, "auxiliary_loss_mlp": 0.01190715, "balance_loss_clip": 1.25001729, "balance_loss_mlp": 1.1217171, "epoch": 0.009739674141766368, "flos": 17421056991360.0, "grad_norm": 2.3003059867699887, "language_loss": 0.87165499, "learning_rate": 3.1835408925606204e-06, "loss": 0.90031916, "num_input_tokens_seen": 1814055, "step": 81, "time_per_iteration": 2.8769538402557373 }, { "auxiliary_loss_clip": 0.01671292, "auxiliary_loss_mlp": 0.01166301, "balance_loss_clip": 1.24912179, "balance_loss_mlp": 1.09520483, "epoch": 0.00985991703240546, "flos": 27527109246720.0, "grad_norm": 2.270044486245301, "language_loss": 0.89176399, "learning_rate": 3.1924299114448214e-06, "loss": 0.92013991, "num_input_tokens_seen": 1834535, "step": 82, "time_per_iteration": 2.8041391372680664 }, { "auxiliary_loss_clip": 0.0167448, "auxiliary_loss_mlp": 0.01174406, "balance_loss_clip": 1.24361145, "balance_loss_mlp": 1.09877968, "epoch": 0.00998015992304455, "flos": 13808084509440.0, "grad_norm": 4.17884093375585, "language_loss": 0.83568299, "learning_rate": 3.2012111819909055e-06, "loss": 0.86417186, "num_input_tokens_seen": 1851865, "step": 83, "time_per_iteration": 2.8329153060913086 }, { "auxiliary_loss_clip": 0.01655311, "auxiliary_loss_mlp": 0.01182735, "balance_loss_clip": 1.23612356, "balance_loss_mlp": 1.11216259, "epoch": 0.010100402813683641, "flos": 20191385341440.0, "grad_norm": 2.860076002272179, "language_loss": 0.9502579, "learning_rate": 3.2098872850910627e-06, "loss": 0.97863829, "num_input_tokens_seen": 1868540, "step": 84, "time_per_iteration": 2.7887182235717773 }, { "auxiliary_loss_clip": 0.01658729, "auxiliary_loss_mlp": 0.01200792, "balance_loss_clip": 1.23590016, "balance_loss_mlp": 1.13103032, "epoch": 0.010220645704322733, "flos": 17201642762880.0, "grad_norm": 1.9729954338837135, "language_loss": 0.8922255, "learning_rate": 3.2184607100038194e-06, "loss": 0.92082071, "num_input_tokens_seen": 1887180, "step": 85, "time_per_iteration": 2.784813165664673 }, { "auxiliary_loss_clip": 0.01648984, "auxiliary_loss_mlp": 0.01150347, "balance_loss_clip": 1.23435235, "balance_loss_mlp": 1.0770092, "epoch": 0.010340888594961822, "flos": 21470415805440.0, "grad_norm": 2.295229183229289, "language_loss": 0.93224466, "learning_rate": 3.2269338586412414e-06, "loss": 0.96023786, "num_input_tokens_seen": 1904765, "step": 86, "time_per_iteration": 2.8457841873168945 }, { "auxiliary_loss_clip": 0.01641411, "auxiliary_loss_mlp": 0.01165366, "balance_loss_clip": 1.23084843, "balance_loss_mlp": 1.0957005, "epoch": 0.010461131485600914, "flos": 23002831785600.0, "grad_norm": 2.4135347811272037, "language_loss": 0.96337414, "learning_rate": 3.2353090496083106e-06, "loss": 0.99144191, "num_input_tokens_seen": 1922600, "step": 87, "time_per_iteration": 2.8154499530792236 }, { "auxiliary_loss_clip": 0.01626309, "auxiliary_loss_mlp": 0.01168603, "balance_loss_clip": 1.23011029, "balance_loss_mlp": 1.10027242, "epoch": 0.010581374376240005, "flos": 33546850571520.0, "grad_norm": 2.039026261273289, "language_loss": 0.81172186, "learning_rate": 3.2435885220114572e-06, "loss": 0.83967102, "num_input_tokens_seen": 1943950, "step": 88, "time_per_iteration": 3.0532946586608887 }, { "auxiliary_loss_clip": 0.01630592, "auxiliary_loss_mlp": 0.0117884, "balance_loss_clip": 1.23088908, "balance_loss_mlp": 1.10678959, "epoch": 0.010701617266879095, "flos": 21763087822080.0, "grad_norm": 2.2312040707743668, "language_loss": 0.93818581, "learning_rate": 3.2517744390519113e-06, "loss": 0.9662801, "num_input_tokens_seen": 1962815, "step": 89, "time_per_iteration": 2.888662576675415 }, { "auxiliary_loss_clip": 0.01624806, "auxiliary_loss_mlp": 0.01137533, "balance_loss_clip": 1.22400641, "balance_loss_mlp": 1.07068026, "epoch": 0.010821860157518187, "flos": 19060199256960.0, "grad_norm": 2.1259664256747444, "language_loss": 0.75226182, "learning_rate": 3.259868891418298e-06, "loss": 0.77988517, "num_input_tokens_seen": 1980580, "step": 90, "time_per_iteration": 2.838543176651001 }, { "auxiliary_loss_clip": 0.01626479, "auxiliary_loss_mlp": 0.01158512, "balance_loss_clip": 1.22617984, "balance_loss_mlp": 1.08946574, "epoch": 0.010942103048157278, "flos": 25447378757760.0, "grad_norm": 2.3973534014720057, "language_loss": 0.84942991, "learning_rate": 3.2678739004917757e-06, "loss": 0.87727976, "num_input_tokens_seen": 2000315, "step": 91, "time_per_iteration": 2.8729374408721924 }, { "auxiliary_loss_clip": 0.01616283, "auxiliary_loss_mlp": 0.01155713, "balance_loss_clip": 1.22478735, "balance_loss_mlp": 1.08604741, "epoch": 0.011062345938796368, "flos": 27493928058240.0, "grad_norm": 1.6587880518389706, "language_loss": 0.92144895, "learning_rate": 3.275791421376029e-06, "loss": 0.94916892, "num_input_tokens_seen": 2023760, "step": 92, "time_per_iteration": 3.8022279739379883 }, { "auxiliary_loss_clip": 0.01599859, "auxiliary_loss_mlp": 0.01147065, "balance_loss_clip": 1.21308899, "balance_loss_mlp": 1.07911527, "epoch": 0.01118258882943546, "flos": 16071210864000.0, "grad_norm": 2.066719558751639, "language_loss": 0.96079755, "learning_rate": 3.2836233457634622e-06, "loss": 0.98826683, "num_input_tokens_seen": 2041895, "step": 93, "time_per_iteration": 4.55079984664917 }, { "auxiliary_loss_clip": 0.01605707, "auxiliary_loss_mlp": 0.011638, "balance_loss_clip": 1.21566129, "balance_loss_mlp": 1.09742391, "epoch": 0.011302831720074551, "flos": 20668602458880.0, "grad_norm": 2.272503230189778, "language_loss": 0.85331786, "learning_rate": 3.2913715046481135e-06, "loss": 0.88101292, "num_input_tokens_seen": 2061640, "step": 94, "time_per_iteration": 3.7886385917663574 }, { "auxiliary_loss_clip": 0.01602328, "auxiliary_loss_mlp": 0.01151288, "balance_loss_clip": 1.2111752, "balance_loss_mlp": 1.08558011, "epoch": 0.011423074610713641, "flos": 13072238490240.0, "grad_norm": 2.8695928805867466, "language_loss": 0.88906336, "learning_rate": 3.299037670895023e-06, "loss": 0.91659951, "num_input_tokens_seen": 2078255, "step": 95, "time_per_iteration": 2.806264877319336 }, { "auxiliary_loss_clip": 0.01594583, "auxiliary_loss_mlp": 0.01153479, "balance_loss_clip": 1.20743704, "balance_loss_mlp": 1.08901072, "epoch": 0.011543317501352733, "flos": 30335646689280.0, "grad_norm": 1.6967036906255069, "language_loss": 0.80264723, "learning_rate": 3.3066235616750667e-06, "loss": 0.83012784, "num_input_tokens_seen": 2099490, "step": 96, "time_per_iteration": 2.8389320373535156 }, { "auxiliary_loss_clip": 0.01585566, "auxiliary_loss_mlp": 0.01159397, "balance_loss_clip": 1.20764208, "balance_loss_mlp": 1.0892067, "epoch": 0.011663560391991824, "flos": 15522962601600.0, "grad_norm": 2.475146768366308, "language_loss": 0.92389119, "learning_rate": 3.3141308407736276e-06, "loss": 0.95134091, "num_input_tokens_seen": 2116125, "step": 97, "time_per_iteration": 2.834036350250244 }, { "auxiliary_loss_clip": 0.01592278, "auxiliary_loss_mlp": 0.01156619, "balance_loss_clip": 1.20629048, "balance_loss_mlp": 1.09043419, "epoch": 0.011783803282630914, "flos": 19902125116800.0, "grad_norm": 1.9674529660571831, "language_loss": 0.86601621, "learning_rate": 3.321561120780869e-06, "loss": 0.89350516, "num_input_tokens_seen": 2134835, "step": 98, "time_per_iteration": 2.766657829284668 }, { "auxiliary_loss_clip": 0.01584418, "auxiliary_loss_mlp": 0.01140924, "balance_loss_clip": 1.20817077, "balance_loss_mlp": 1.07430959, "epoch": 0.011904046173270006, "flos": 22340674517760.0, "grad_norm": 2.3999173512951057, "language_loss": 1.01483297, "learning_rate": 3.3289159651708192e-06, "loss": 1.04208636, "num_input_tokens_seen": 2152410, "step": 99, "time_per_iteration": 2.9116318225860596 }, { "auxiliary_loss_clip": 0.01569535, "auxiliary_loss_mlp": 0.01133127, "balance_loss_clip": 1.19647515, "balance_loss_mlp": 1.07051873, "epoch": 0.012024289063909096, "flos": 19100060375040.0, "grad_norm": 2.0756844672527617, "language_loss": 0.97481191, "learning_rate": 3.3361968902759768e-06, "loss": 1.00183845, "num_input_tokens_seen": 2172090, "step": 100, "time_per_iteration": 2.786073923110962 }, { "auxiliary_loss_clip": 0.01564817, "auxiliary_loss_mlp": 0.01141749, "balance_loss_clip": 1.19712102, "balance_loss_mlp": 1.07866383, "epoch": 0.012144531954548187, "flos": 15012205159680.0, "grad_norm": 2.1837100280621913, "language_loss": 0.93985009, "learning_rate": 3.343405367163663e-06, "loss": 0.96691579, "num_input_tokens_seen": 2189020, "step": 101, "time_per_iteration": 2.8302383422851562 }, { "auxiliary_loss_clip": 0.01569676, "auxiliary_loss_mlp": 0.01145526, "balance_loss_clip": 1.19740772, "balance_loss_mlp": 1.07624197, "epoch": 0.012264774845187279, "flos": 15122020014720.0, "grad_norm": 2.1947885740938977, "language_loss": 0.81309807, "learning_rate": 3.350542823419951e-06, "loss": 0.84025002, "num_input_tokens_seen": 2205620, "step": 102, "time_per_iteration": 2.8699703216552734 }, { "auxiliary_loss_clip": 0.01576774, "auxiliary_loss_mlp": 0.01156405, "balance_loss_clip": 1.20293522, "balance_loss_mlp": 1.09522629, "epoch": 0.012385017735826368, "flos": 13949248959360.0, "grad_norm": 4.156516739297177, "language_loss": 0.87278938, "learning_rate": 3.3576106448465615e-06, "loss": 0.90012115, "num_input_tokens_seen": 2219000, "step": 103, "time_per_iteration": 2.705120801925659 }, { "auxiliary_loss_clip": 0.01566938, "auxiliary_loss_mlp": 0.01145227, "balance_loss_clip": 1.19258022, "balance_loss_mlp": 1.07856536, "epoch": 0.01250526062646546, "flos": 23623260428160.0, "grad_norm": 1.9868011199485218, "language_loss": 0.88062906, "learning_rate": 3.3646101770757797e-06, "loss": 0.90775079, "num_input_tokens_seen": 2237790, "step": 104, "time_per_iteration": 2.8415415287017822 }, { "auxiliary_loss_clip": 0.01552868, "auxiliary_loss_mlp": 0.0115796, "balance_loss_clip": 1.18870199, "balance_loss_mlp": 1.08829415, "epoch": 0.012625503517104552, "flos": 34640078958720.0, "grad_norm": 1.6858049512605748, "language_loss": 0.85621172, "learning_rate": 3.371542727108104e-06, "loss": 0.88331997, "num_input_tokens_seen": 2259965, "step": 105, "time_per_iteration": 2.850281000137329 }, { "auxiliary_loss_clip": 0.01568852, "auxiliary_loss_mlp": 0.01167502, "balance_loss_clip": 1.19416428, "balance_loss_mlp": 1.1017462, "epoch": 0.012745746407743641, "flos": 17821891837440.0, "grad_norm": 2.4867770621283265, "language_loss": 0.89954305, "learning_rate": 3.3784095647770114e-06, "loss": 0.92690659, "num_input_tokens_seen": 2278610, "step": 106, "time_per_iteration": 2.7652459144592285 }, { "auxiliary_loss_clip": 0.01554624, "auxiliary_loss_mlp": 0.01153863, "balance_loss_clip": 1.1895225, "balance_loss_mlp": 1.08772588, "epoch": 0.012865989298382733, "flos": 20595057361920.0, "grad_norm": 2.4066828470745087, "language_loss": 0.88275623, "learning_rate": 3.3852119241449547e-06, "loss": 0.90984112, "num_input_tokens_seen": 2297730, "step": 107, "time_per_iteration": 2.769113540649414 }, { "auxiliary_loss_clip": 0.01562423, "auxiliary_loss_mlp": 0.01168674, "balance_loss_clip": 1.19060874, "balance_loss_mlp": 1.10520649, "epoch": 0.012986232189021825, "flos": 23948969978880.0, "grad_norm": 3.380644321668764, "language_loss": 0.96145844, "learning_rate": 3.3919510048344295e-06, "loss": 0.98876941, "num_input_tokens_seen": 2315740, "step": 108, "time_per_iteration": 2.769691228866577 }, { "auxiliary_loss_clip": 0.01547975, "auxiliary_loss_mlp": 0.01165904, "balance_loss_clip": 1.18603778, "balance_loss_mlp": 1.10143554, "epoch": 0.013106475079660914, "flos": 23725425686400.0, "grad_norm": 2.208106663262309, "language_loss": 0.86401439, "learning_rate": 3.3986279732976907e-06, "loss": 0.89115322, "num_input_tokens_seen": 2334215, "step": 109, "time_per_iteration": 2.680187463760376 }, { "auxiliary_loss_clip": 0.01552735, "auxiliary_loss_mlp": 0.01145086, "balance_loss_clip": 1.18612885, "balance_loss_mlp": 1.07909143, "epoch": 0.013226717970300006, "flos": 21102438925440.0, "grad_norm": 2.1918037383620605, "language_loss": 0.95083845, "learning_rate": 3.4052439640284983e-06, "loss": 0.9778167, "num_input_tokens_seen": 2353130, "step": 110, "time_per_iteration": 2.8247625827789307 }, { "auxiliary_loss_clip": 0.01542509, "auxiliary_loss_mlp": 0.01140815, "balance_loss_clip": 1.18925023, "balance_loss_mlp": 1.07782519, "epoch": 0.013346960860939098, "flos": 24863902231680.0, "grad_norm": 1.874904425641122, "language_loss": 0.81036353, "learning_rate": 3.4118000807190217e-06, "loss": 0.83719683, "num_input_tokens_seen": 2374010, "step": 111, "time_per_iteration": 2.851775646209717 }, { "auxiliary_loss_clip": 0.01546201, "auxiliary_loss_mlp": 0.01138918, "balance_loss_clip": 1.18296576, "balance_loss_mlp": 1.07459223, "epoch": 0.013467203751578187, "flos": 28181940140160.0, "grad_norm": 1.7319910215757495, "language_loss": 0.76028901, "learning_rate": 3.4182973973648723e-06, "loss": 0.78714025, "num_input_tokens_seen": 2395220, "step": 112, "time_per_iteration": 2.789478063583374 }, { "auxiliary_loss_clip": 0.01546776, "auxiliary_loss_mlp": 0.01166391, "balance_loss_clip": 1.18590581, "balance_loss_mlp": 1.10383046, "epoch": 0.013587446642217279, "flos": 18916233546240.0, "grad_norm": 2.467871167178391, "language_loss": 0.94975698, "learning_rate": 3.424736959321014e-06, "loss": 0.97688866, "num_input_tokens_seen": 2413025, "step": 113, "time_per_iteration": 2.9137063026428223 }, { "auxiliary_loss_clip": 0.01543202, "auxiliary_loss_mlp": 0.01166445, "balance_loss_clip": 1.18293905, "balance_loss_mlp": 1.0978756, "epoch": 0.01370768953285637, "flos": 23988615615360.0, "grad_norm": 1.9449194642536825, "language_loss": 0.8849768, "learning_rate": 3.431119784311155e-06, "loss": 0.91207325, "num_input_tokens_seen": 2432700, "step": 114, "time_per_iteration": 2.7932276725769043 }, { "auxiliary_loss_clip": 0.01529665, "auxiliary_loss_mlp": 0.01127681, "balance_loss_clip": 1.18348646, "balance_loss_mlp": 1.0653584, "epoch": 0.01382793242349546, "flos": 39202565512320.0, "grad_norm": 2.9151916237947404, "language_loss": 0.77740288, "learning_rate": 3.43744686339307e-06, "loss": 0.8039763, "num_input_tokens_seen": 2455020, "step": 115, "time_per_iteration": 3.0399532318115234 }, { "auxiliary_loss_clip": 0.01524615, "auxiliary_loss_mlp": 0.01112125, "balance_loss_clip": 1.16944194, "balance_loss_mlp": 1.05027914, "epoch": 0.013948175314134552, "flos": 41353506714240.0, "grad_norm": 2.2221798317535733, "language_loss": 0.9063307, "learning_rate": 3.44371916188212e-06, "loss": 0.93269813, "num_input_tokens_seen": 2475775, "step": 116, "time_per_iteration": 2.9230973720550537 }, { "auxiliary_loss_clip": 0.01518685, "auxiliary_loss_mlp": 0.01123182, "balance_loss_clip": 1.17078459, "balance_loss_mlp": 1.06033468, "epoch": 0.014068418204773643, "flos": 22453542028800.0, "grad_norm": 2.1297436538152708, "language_loss": 0.86174673, "learning_rate": 3.449937620235143e-06, "loss": 0.88816535, "num_input_tokens_seen": 2496370, "step": 117, "time_per_iteration": 2.8281915187835693 }, { "auxiliary_loss_clip": 0.01521535, "auxiliary_loss_mlp": 0.01142257, "balance_loss_clip": 1.17195356, "balance_loss_mlp": 1.0763582, "epoch": 0.014188661095412733, "flos": 23805147922560.0, "grad_norm": 1.6736343763204051, "language_loss": 0.89359123, "learning_rate": 3.456103154896722e-06, "loss": 0.92022908, "num_input_tokens_seen": 2517645, "step": 118, "time_per_iteration": 3.6903738975524902 }, { "auxiliary_loss_clip": 0.01518901, "auxiliary_loss_mlp": 0.01134923, "balance_loss_clip": 1.17114854, "balance_loss_mlp": 1.07198036, "epoch": 0.014308903986051825, "flos": 23660248458240.0, "grad_norm": 1.8271260721099412, "language_loss": 0.92286634, "learning_rate": 3.462216659109757e-06, "loss": 0.94940448, "num_input_tokens_seen": 2537825, "step": 119, "time_per_iteration": 3.881836414337158 }, { "auxiliary_loss_clip": 0.01532284, "auxiliary_loss_mlp": 0.01139787, "balance_loss_clip": 1.175892, "balance_loss_mlp": 1.07379293, "epoch": 0.014429146876690916, "flos": 20667991927680.0, "grad_norm": 2.3653499909377858, "language_loss": 0.85092688, "learning_rate": 3.4682790036921077e-06, "loss": 0.87764764, "num_input_tokens_seen": 2556485, "step": 120, "time_per_iteration": 3.655423879623413 }, { "auxiliary_loss_clip": 0.01501874, "auxiliary_loss_mlp": 0.01128238, "balance_loss_clip": 1.16644955, "balance_loss_mlp": 1.06491435, "epoch": 0.014549389767330006, "flos": 20229199384320.0, "grad_norm": 2.152121032837897, "language_loss": 0.83067465, "learning_rate": 3.4742910377810193e-06, "loss": 0.85697579, "num_input_tokens_seen": 2573945, "step": 121, "time_per_iteration": 2.743910789489746 }, { "auxiliary_loss_clip": 0.01507981, "auxiliary_loss_mlp": 0.01137549, "balance_loss_clip": 1.16626561, "balance_loss_mlp": 1.07811141, "epoch": 0.014669632657969098, "flos": 18004174381440.0, "grad_norm": 2.7054628310156428, "language_loss": 0.88700306, "learning_rate": 3.4802535895469042e-06, "loss": 0.91345835, "num_input_tokens_seen": 2592695, "step": 122, "time_per_iteration": 2.889767646789551 }, { "auxiliary_loss_clip": 0.01510802, "auxiliary_loss_mlp": 0.01129862, "balance_loss_clip": 1.16693377, "balance_loss_mlp": 1.06815886, "epoch": 0.01478987554860819, "flos": 22741796672640.0, "grad_norm": 2.00814534431366, "language_loss": 0.89529073, "learning_rate": 3.4861674668779934e-06, "loss": 0.92169738, "num_input_tokens_seen": 2610925, "step": 123, "time_per_iteration": 2.7469608783721924 }, { "auxiliary_loss_clip": 0.01502853, "auxiliary_loss_mlp": 0.01137479, "balance_loss_clip": 1.16340935, "balance_loss_mlp": 1.07673025, "epoch": 0.01491011843924728, "flos": 17198590106880.0, "grad_norm": 1.8995497207193748, "language_loss": 0.83841646, "learning_rate": 3.492033458037272e-06, "loss": 0.86481977, "num_input_tokens_seen": 2629495, "step": 124, "time_per_iteration": 2.7535390853881836 }, { "auxiliary_loss_clip": 0.01508275, "auxiliary_loss_mlp": 0.01144475, "balance_loss_clip": 1.16622639, "balance_loss_mlp": 1.0790534, "epoch": 0.01503036132988637, "flos": 17673867889920.0, "grad_norm": 2.5101171132884814, "language_loss": 0.86745024, "learning_rate": 3.497852332293018e-06, "loss": 0.8939777, "num_input_tokens_seen": 2645070, "step": 125, "time_per_iteration": 2.699939727783203 }, { "auxiliary_loss_clip": 0.01511362, "auxiliary_loss_mlp": 0.01134078, "balance_loss_clip": 1.16596746, "balance_loss_mlp": 1.0690372, "epoch": 0.015150604220525462, "flos": 18878239935360.0, "grad_norm": 1.9210640321124168, "language_loss": 0.96501672, "learning_rate": 3.5036248405242356e-06, "loss": 0.99147105, "num_input_tokens_seen": 2663825, "step": 126, "time_per_iteration": 2.797776460647583 }, { "auxiliary_loss_clip": 0.01501658, "auxiliary_loss_mlp": 0.01150579, "balance_loss_clip": 1.16144133, "balance_loss_mlp": 1.08625364, "epoch": 0.015270847111164552, "flos": 39420184060800.0, "grad_norm": 2.6355196871534448, "language_loss": 0.82576805, "learning_rate": 3.509351715802146e-06, "loss": 0.85229039, "num_input_tokens_seen": 2684710, "step": 127, "time_per_iteration": 2.8484017848968506 }, { "auxiliary_loss_clip": 0.01496339, "auxiliary_loss_mlp": 0.01125671, "balance_loss_clip": 1.15901339, "balance_loss_mlp": 1.06158352, "epoch": 0.015391090001803644, "flos": 43762466286720.0, "grad_norm": 2.8640486684562085, "language_loss": 0.78652871, "learning_rate": 3.5150336739488763e-06, "loss": 0.81274879, "num_input_tokens_seen": 2706995, "step": 128, "time_per_iteration": 3.009124994277954 }, { "auxiliary_loss_clip": 0.01479809, "auxiliary_loss_mlp": 0.01116475, "balance_loss_clip": 1.15709662, "balance_loss_mlp": 1.05458164, "epoch": 0.015511332892442733, "flos": 18916341287040.0, "grad_norm": 1.8314125631344453, "language_loss": 0.83956838, "learning_rate": 3.5206714140744143e-06, "loss": 0.86553121, "num_input_tokens_seen": 2727050, "step": 129, "time_per_iteration": 2.739480495452881 }, { "auxiliary_loss_clip": 0.01509956, "auxiliary_loss_mlp": 0.01142202, "balance_loss_clip": 1.16974247, "balance_loss_mlp": 1.08006966, "epoch": 0.015631575783081827, "flos": 24535283679360.0, "grad_norm": 3.062728820343016, "language_loss": 0.87376392, "learning_rate": 3.5262656190928208e-06, "loss": 0.90028548, "num_input_tokens_seen": 2745350, "step": 130, "time_per_iteration": 2.823554754257202 }, { "auxiliary_loss_clip": 0.01498774, "auxiliary_loss_mlp": 0.01235656, "balance_loss_clip": 1.2963165, "balance_loss_mlp": 1.19693661, "epoch": 0.015751818673720917, "flos": 62328536098560.0, "grad_norm": 1.0817388786066697, "language_loss": 0.71513999, "learning_rate": 3.5318169562186737e-06, "loss": 0.74248433, "num_input_tokens_seen": 2814195, "step": 131, "time_per_iteration": 3.4402055740356445 }, { "auxiliary_loss_clip": 0.01493793, "auxiliary_loss_mlp": 0.0113288, "balance_loss_clip": 1.16334367, "balance_loss_mlp": 1.07012844, "epoch": 0.015872061564360006, "flos": 23878549365120.0, "grad_norm": 1.726581134065642, "language_loss": 0.82230532, "learning_rate": 3.5373260774446292e-06, "loss": 0.84857202, "num_input_tokens_seen": 2834645, "step": 132, "time_per_iteration": 2.7537121772766113 }, { "auxiliary_loss_clip": 0.01479648, "auxiliary_loss_mlp": 0.0113668, "balance_loss_clip": 1.15044522, "balance_loss_mlp": 1.07249808, "epoch": 0.0159923044549991, "flos": 23367899664000.0, "grad_norm": 2.330667108169749, "language_loss": 0.90354574, "learning_rate": 3.542793620000961e-06, "loss": 0.92970896, "num_input_tokens_seen": 2854120, "step": 133, "time_per_iteration": 2.725517749786377 }, { "auxiliary_loss_clip": 0.014789, "auxiliary_loss_mlp": 0.01132772, "balance_loss_clip": 1.1554544, "balance_loss_mlp": 1.06982946, "epoch": 0.01611254734563819, "flos": 17858305249920.0, "grad_norm": 2.0922288210125224, "language_loss": 0.86834329, "learning_rate": 3.5482202067978894e-06, "loss": 0.89446002, "num_input_tokens_seen": 2871330, "step": 134, "time_per_iteration": 2.7707042694091797 }, { "auxiliary_loss_clip": 0.01483534, "auxiliary_loss_mlp": 0.01136584, "balance_loss_clip": 1.15167177, "balance_loss_mlp": 1.07230663, "epoch": 0.01623279023627728, "flos": 20954774113920.0, "grad_norm": 2.0833500269741623, "language_loss": 0.76073134, "learning_rate": 3.553606446851471e-06, "loss": 0.78693253, "num_input_tokens_seen": 2888070, "step": 135, "time_per_iteration": 2.78704571723938 }, { "auxiliary_loss_clip": 0.01469227, "auxiliary_loss_mlp": 0.01146209, "balance_loss_clip": 1.1468308, "balance_loss_mlp": 1.08398187, "epoch": 0.016353033126916373, "flos": 15742412743680.0, "grad_norm": 1.9458512773354673, "language_loss": 0.83321428, "learning_rate": 3.5589529356937613e-06, "loss": 0.85936856, "num_input_tokens_seen": 2906465, "step": 136, "time_per_iteration": 2.7701520919799805 }, { "auxiliary_loss_clip": 0.01479015, "auxiliary_loss_mlp": 0.01142777, "balance_loss_clip": 1.15030873, "balance_loss_mlp": 1.0803113, "epoch": 0.016473276017555463, "flos": 18807280617600.0, "grad_norm": 1.7054356203614904, "language_loss": 0.76962012, "learning_rate": 3.5642602557679627e-06, "loss": 0.79583806, "num_input_tokens_seen": 2924915, "step": 137, "time_per_iteration": 2.786118268966675 }, { "auxiliary_loss_clip": 0.01472849, "auxiliary_loss_mlp": 0.01153496, "balance_loss_clip": 1.1552372, "balance_loss_mlp": 1.0917933, "epoch": 0.016593518908194552, "flos": 24352641999360.0, "grad_norm": 2.148207048087294, "language_loss": 0.84025919, "learning_rate": 3.569528976809202e-06, "loss": 0.86652255, "num_input_tokens_seen": 2942130, "step": 138, "time_per_iteration": 2.787029504776001 }, { "auxiliary_loss_clip": 0.0147578, "auxiliary_loss_mlp": 0.01143076, "balance_loss_clip": 1.14528024, "balance_loss_mlp": 1.08218408, "epoch": 0.016713761798833646, "flos": 22346133384960.0, "grad_norm": 4.896396830566866, "language_loss": 0.89697391, "learning_rate": 3.5747596562115522e-06, "loss": 0.92316246, "num_input_tokens_seen": 2962745, "step": 139, "time_per_iteration": 2.8137307167053223 }, { "auxiliary_loss_clip": 0.01481123, "auxiliary_loss_mlp": 0.01164081, "balance_loss_clip": 1.14613032, "balance_loss_mlp": 1.09908819, "epoch": 0.016834004689472735, "flos": 17821820010240.0, "grad_norm": 2.7847474777297974, "language_loss": 0.90932059, "learning_rate": 3.5799528393819138e-06, "loss": 0.93577254, "num_input_tokens_seen": 2981825, "step": 140, "time_per_iteration": 2.798170328140259 }, { "auxiliary_loss_clip": 0.01465436, "auxiliary_loss_mlp": 0.01138263, "balance_loss_clip": 1.14709282, "balance_loss_mlp": 1.0791831, "epoch": 0.016954247580111825, "flos": 20519501103360.0, "grad_norm": 1.8966719880860237, "language_loss": 0.88158667, "learning_rate": 3.585109060081286e-06, "loss": 0.90762359, "num_input_tokens_seen": 3001625, "step": 141, "time_per_iteration": 2.785489320755005 }, { "auxiliary_loss_clip": 0.01475477, "auxiliary_loss_mlp": 0.01147219, "balance_loss_clip": 1.14484596, "balance_loss_mlp": 1.08427632, "epoch": 0.017074490470750915, "flos": 22088869200000.0, "grad_norm": 2.3911114820209853, "language_loss": 0.78549594, "learning_rate": 3.590228840753992e-06, "loss": 0.81172287, "num_input_tokens_seen": 3022055, "step": 142, "time_per_iteration": 2.8807334899902344 }, { "auxiliary_loss_clip": 0.01465362, "auxiliary_loss_mlp": 0.01162705, "balance_loss_clip": 1.14472723, "balance_loss_mlp": 1.09804595, "epoch": 0.01719473336139001, "flos": 15997270717440.0, "grad_norm": 3.094498058158236, "language_loss": 0.87231565, "learning_rate": 3.5953126928453423e-06, "loss": 0.89859629, "num_input_tokens_seen": 3039605, "step": 143, "time_per_iteration": 2.732213020324707 }, { "auxiliary_loss_clip": 0.0146856, "auxiliary_loss_mlp": 0.01154525, "balance_loss_clip": 1.14443898, "balance_loss_mlp": 1.09115362, "epoch": 0.017314976252029098, "flos": 22492038430080.0, "grad_norm": 1.8235856336179699, "language_loss": 0.80373418, "learning_rate": 3.600361117108239e-06, "loss": 0.82996505, "num_input_tokens_seen": 3059405, "step": 144, "time_per_iteration": 3.696141481399536 }, { "auxiliary_loss_clip": 0.01466398, "auxiliary_loss_mlp": 0.01152967, "balance_loss_clip": 1.14428878, "balance_loss_mlp": 1.09331512, "epoch": 0.017435219142668188, "flos": 22018053536640.0, "grad_norm": 1.9467710467514476, "language_loss": 0.96950072, "learning_rate": 3.6053746038991616e-06, "loss": 0.99569434, "num_input_tokens_seen": 3078490, "step": 145, "time_per_iteration": 2.8379273414611816 }, { "auxiliary_loss_clip": 0.01419618, "auxiliary_loss_mlp": 0.01213149, "balance_loss_clip": 1.2075547, "balance_loss_mlp": 1.1788168, "epoch": 0.01755546203330728, "flos": 72240526149120.0, "grad_norm": 1.0779611982176436, "language_loss": 0.58475542, "learning_rate": 3.6103536334639843e-06, "loss": 0.61108309, "num_input_tokens_seen": 3131755, "step": 146, "time_per_iteration": 5.162643671035767 }, { "auxiliary_loss_clip": 0.01454454, "auxiliary_loss_mlp": 0.01150318, "balance_loss_clip": 1.14168751, "balance_loss_mlp": 1.08923519, "epoch": 0.01767570492394637, "flos": 25337061112320.0, "grad_norm": 5.4151533667799985, "language_loss": 0.85380453, "learning_rate": 3.615298676214041e-06, "loss": 0.87985224, "num_input_tokens_seen": 3152035, "step": 147, "time_per_iteration": 3.642923593521118 }, { "auxiliary_loss_clip": 0.01457657, "auxiliary_loss_mlp": 0.01135776, "balance_loss_clip": 1.14014864, "balance_loss_mlp": 1.07345378, "epoch": 0.01779594781458546, "flos": 20449188230400.0, "grad_norm": 2.2615488445332854, "language_loss": 0.88880497, "learning_rate": 3.6202101929928317e-06, "loss": 0.91473937, "num_input_tokens_seen": 3170625, "step": 148, "time_per_iteration": 2.80865216255188 }, { "auxiliary_loss_clip": 0.01445135, "auxiliary_loss_mlp": 0.01121993, "balance_loss_clip": 1.1386143, "balance_loss_mlp": 1.06501067, "epoch": 0.017916190705224554, "flos": 16253601148800.0, "grad_norm": 1.948151504217171, "language_loss": 0.88445342, "learning_rate": 3.6250886353337413e-06, "loss": 0.91012466, "num_input_tokens_seen": 3188155, "step": 149, "time_per_iteration": 2.6919538974761963 }, { "auxiliary_loss_clip": 0.01453309, "auxiliary_loss_mlp": 0.01128159, "balance_loss_clip": 1.14082527, "balance_loss_mlp": 1.06693292, "epoch": 0.018036433595863644, "flos": 23330588411520.0, "grad_norm": 1.9638539193025075, "language_loss": 0.86652517, "learning_rate": 3.6299344457091488e-06, "loss": 0.89233983, "num_input_tokens_seen": 3209015, "step": 150, "time_per_iteration": 2.7229089736938477 }, { "auxiliary_loss_clip": 0.01449165, "auxiliary_loss_mlp": 0.01128899, "balance_loss_clip": 1.1355809, "balance_loss_mlp": 1.07039142, "epoch": 0.018156676486502734, "flos": 18588010043520.0, "grad_norm": 3.0997023960594454, "language_loss": 0.93678856, "learning_rate": 3.634748057771256e-06, "loss": 0.96256924, "num_input_tokens_seen": 3224955, "step": 151, "time_per_iteration": 2.674750804901123 }, { "auxiliary_loss_clip": 0.01449569, "auxiliary_loss_mlp": 0.01125556, "balance_loss_clip": 1.13637877, "balance_loss_mlp": 1.06485498, "epoch": 0.018276919377141827, "flos": 25448707560960.0, "grad_norm": 1.6424993730527226, "language_loss": 0.85577053, "learning_rate": 3.639529896584965e-06, "loss": 0.88152182, "num_input_tokens_seen": 3246330, "step": 152, "time_per_iteration": 2.7903478145599365 }, { "auxiliary_loss_clip": 0.01448008, "auxiliary_loss_mlp": 0.01132717, "balance_loss_clip": 1.13456678, "balance_loss_mlp": 1.07325554, "epoch": 0.018397162267780917, "flos": 20047311889920.0, "grad_norm": 3.4995716888662463, "language_loss": 0.88977289, "learning_rate": 3.6442803788531233e-06, "loss": 0.91558009, "num_input_tokens_seen": 3264290, "step": 153, "time_per_iteration": 2.719439744949341 }, { "auxiliary_loss_clip": 0.01450664, "auxiliary_loss_mlp": 0.01123528, "balance_loss_clip": 1.1362921, "balance_loss_mlp": 1.06540155, "epoch": 0.018517405158420007, "flos": 27565282425600.0, "grad_norm": 1.8868243548593358, "language_loss": 0.95867622, "learning_rate": 3.6489999131344357e-06, "loss": 0.98441815, "num_input_tokens_seen": 3287065, "step": 154, "time_per_iteration": 2.7784318923950195 }, { "auxiliary_loss_clip": 0.0145044, "auxiliary_loss_mlp": 0.01125351, "balance_loss_clip": 1.14115369, "balance_loss_mlp": 1.05954742, "epoch": 0.0186376480490591, "flos": 19354056422400.0, "grad_norm": 2.119887923141709, "language_loss": 0.90551376, "learning_rate": 3.653688900054313e-06, "loss": 0.93127161, "num_input_tokens_seen": 3305595, "step": 155, "time_per_iteration": 2.755037784576416 }, { "auxiliary_loss_clip": 0.01445012, "auxiliary_loss_mlp": 0.01129031, "balance_loss_clip": 1.13236141, "balance_loss_mlp": 1.07152402, "epoch": 0.01875789093969819, "flos": 26687840993280.0, "grad_norm": 2.206938791075143, "language_loss": 0.75772381, "learning_rate": 3.6583477325089526e-06, "loss": 0.78346419, "num_input_tokens_seen": 3326135, "step": 156, "time_per_iteration": 2.8949360847473145 }, { "auxiliary_loss_clip": 0.01439178, "auxiliary_loss_mlp": 0.01143631, "balance_loss_clip": 1.13236606, "balance_loss_mlp": 1.08526659, "epoch": 0.01887813383033728, "flos": 24353001135360.0, "grad_norm": 2.177181362313969, "language_loss": 1.03997231, "learning_rate": 3.6629767958628916e-06, "loss": 1.06580031, "num_input_tokens_seen": 3343510, "step": 157, "time_per_iteration": 2.7739217281341553 }, { "auxiliary_loss_clip": 0.01463937, "auxiliary_loss_mlp": 0.01150034, "balance_loss_clip": 1.14160359, "balance_loss_mlp": 1.09109712, "epoch": 0.018998376720976373, "flos": 14647532330880.0, "grad_norm": 3.1646134321880366, "language_loss": 0.85308337, "learning_rate": 3.667576468140291e-06, "loss": 0.87922305, "num_input_tokens_seen": 3361325, "step": 158, "time_per_iteration": 2.711282968521118 }, { "auxiliary_loss_clip": 0.01438581, "auxiliary_loss_mlp": 0.01120202, "balance_loss_clip": 1.12967849, "balance_loss_mlp": 1.06007266, "epoch": 0.019118619611615463, "flos": 29305261146240.0, "grad_norm": 2.4994427169125535, "language_loss": 0.89043432, "learning_rate": 3.672147120210184e-06, "loss": 0.91602218, "num_input_tokens_seen": 3377925, "step": 159, "time_per_iteration": 2.7503092288970947 }, { "auxiliary_loss_clip": 0.01439248, "auxiliary_loss_mlp": 0.01130974, "balance_loss_clip": 1.13419843, "balance_loss_mlp": 1.07256126, "epoch": 0.019238862502254553, "flos": 20886723797760.0, "grad_norm": 1.824881596742772, "language_loss": 0.86534923, "learning_rate": 3.6766891159659177e-06, "loss": 0.89105153, "num_input_tokens_seen": 3396335, "step": 160, "time_per_iteration": 2.7372491359710693 }, { "auxiliary_loss_clip": 0.01435456, "auxiliary_loss_mlp": 0.01123094, "balance_loss_clip": 1.12977469, "balance_loss_mlp": 1.06415725, "epoch": 0.019359105392893646, "flos": 21360672777600.0, "grad_norm": 2.9725898288613735, "language_loss": 0.87828839, "learning_rate": 3.6812028124990075e-06, "loss": 0.90387392, "num_input_tokens_seen": 3413605, "step": 161, "time_per_iteration": 2.7292912006378174 }, { "auxiliary_loss_clip": 0.01441025, "auxiliary_loss_mlp": 0.01128037, "balance_loss_clip": 1.13400745, "balance_loss_mlp": 1.06771684, "epoch": 0.019479348283532736, "flos": 16283729681280.0, "grad_norm": 3.190459733935789, "language_loss": 0.81235445, "learning_rate": 3.6856885602676016e-06, "loss": 0.83804512, "num_input_tokens_seen": 3429640, "step": 162, "time_per_iteration": 2.691810131072998 }, { "auxiliary_loss_clip": 0.01440971, "auxiliary_loss_mlp": 0.01123844, "balance_loss_clip": 1.1297574, "balance_loss_mlp": 1.06419206, "epoch": 0.019599591174171826, "flos": 22091239497600.0, "grad_norm": 2.551233931678815, "language_loss": 0.93899369, "learning_rate": 3.6901467032597733e-06, "loss": 0.96464193, "num_input_tokens_seen": 3448125, "step": 163, "time_per_iteration": 2.73280668258667 }, { "auxiliary_loss_clip": 0.01439928, "auxiliary_loss_mlp": 0.01133974, "balance_loss_clip": 1.13115191, "balance_loss_mlp": 1.07518005, "epoch": 0.01971983406481092, "flos": 19609668581760.0, "grad_norm": 2.041567923905246, "language_loss": 0.87255567, "learning_rate": 3.694577579151804e-06, "loss": 0.89829475, "num_input_tokens_seen": 3466535, "step": 164, "time_per_iteration": 2.769512414932251 }, { "auxiliary_loss_clip": 0.01436119, "auxiliary_loss_mlp": 0.01126431, "balance_loss_clip": 1.13215041, "balance_loss_mlp": 1.06396508, "epoch": 0.01984007695545001, "flos": 19099342103040.0, "grad_norm": 2.6820182487130717, "language_loss": 0.73443031, "learning_rate": 3.6989815194616703e-06, "loss": 0.76005572, "num_input_tokens_seen": 3483730, "step": 165, "time_per_iteration": 2.6797168254852295 }, { "auxiliary_loss_clip": 0.01442247, "auxiliary_loss_mlp": 0.01137131, "balance_loss_clip": 1.12531471, "balance_loss_mlp": 1.07418847, "epoch": 0.0199603198460891, "flos": 20848406964480.0, "grad_norm": 3.6146975989130294, "language_loss": 0.79666519, "learning_rate": 3.703358849697888e-06, "loss": 0.82245898, "num_input_tokens_seen": 3503640, "step": 166, "time_per_iteration": 2.7373454570770264 }, { "auxiliary_loss_clip": 0.01434116, "auxiliary_loss_mlp": 0.01139377, "balance_loss_clip": 1.1289525, "balance_loss_mlp": 1.07443202, "epoch": 0.020080562736728192, "flos": 21870747861120.0, "grad_norm": 1.9101661947356385, "language_loss": 0.82512927, "learning_rate": 3.7077098895038803e-06, "loss": 0.85086417, "num_input_tokens_seen": 3523010, "step": 167, "time_per_iteration": 2.633000373840332 }, { "auxiliary_loss_clip": 0.01428271, "auxiliary_loss_mlp": 0.01122277, "balance_loss_clip": 1.1272707, "balance_loss_mlp": 1.06581926, "epoch": 0.020200805627367282, "flos": 21688788539520.0, "grad_norm": 2.710181281865146, "language_loss": 0.96909308, "learning_rate": 3.712034952798045e-06, "loss": 0.99459857, "num_input_tokens_seen": 3541125, "step": 168, "time_per_iteration": 2.8180086612701416 }, { "auxiliary_loss_clip": 0.01432793, "auxiliary_loss_mlp": 0.01106511, "balance_loss_clip": 1.12545156, "balance_loss_mlp": 1.05081606, "epoch": 0.02032104851800637, "flos": 33543043729920.0, "grad_norm": 2.3292185318586083, "language_loss": 0.84502208, "learning_rate": 3.7163343479096656e-06, "loss": 0.87041515, "num_input_tokens_seen": 3562700, "step": 169, "time_per_iteration": 2.8158950805664062 }, { "auxiliary_loss_clip": 0.01422797, "auxiliary_loss_mlp": 0.01123727, "balance_loss_clip": 1.12777019, "balance_loss_mlp": 1.06872416, "epoch": 0.020441291408645465, "flos": 31686965274240.0, "grad_norm": 2.3952032215426606, "language_loss": 0.82769346, "learning_rate": 3.720608377710802e-06, "loss": 0.85315871, "num_input_tokens_seen": 3582790, "step": 170, "time_per_iteration": 2.826280117034912 }, { "auxiliary_loss_clip": 0.01432383, "auxiliary_loss_mlp": 0.01109026, "balance_loss_clip": 1.12534523, "balance_loss_mlp": 1.05104232, "epoch": 0.020561534299284555, "flos": 20886687884160.0, "grad_norm": 2.3770154431211585, "language_loss": 0.86496633, "learning_rate": 3.7248573397443277e-06, "loss": 0.89038038, "num_input_tokens_seen": 3601715, "step": 171, "time_per_iteration": 3.6985342502593994 }, { "auxiliary_loss_clip": 0.01416745, "auxiliary_loss_mlp": 0.01117794, "balance_loss_clip": 1.12642002, "balance_loss_mlp": 1.06024003, "epoch": 0.020681777189923645, "flos": 20996610480000.0, "grad_norm": 1.968456074058199, "language_loss": 0.97444814, "learning_rate": 3.729081526348224e-06, "loss": 0.99979353, "num_input_tokens_seen": 3620245, "step": 172, "time_per_iteration": 4.586490631103516 }, { "auxiliary_loss_clip": 0.014254, "auxiliary_loss_mlp": 0.01119537, "balance_loss_clip": 1.12768722, "balance_loss_mlp": 1.06350827, "epoch": 0.020802020080562738, "flos": 28257532312320.0, "grad_norm": 1.7154898232601583, "language_loss": 0.8485117, "learning_rate": 3.7332812247762777e-06, "loss": 0.87396109, "num_input_tokens_seen": 3641545, "step": 173, "time_per_iteration": 3.735161304473877 }, { "auxiliary_loss_clip": 0.01433118, "auxiliary_loss_mlp": 0.01142081, "balance_loss_clip": 1.12953186, "balance_loss_mlp": 1.08137918, "epoch": 0.020922262971201828, "flos": 19681274344320.0, "grad_norm": 2.838029410432031, "language_loss": 0.95376945, "learning_rate": 3.737456717315293e-06, "loss": 0.97952139, "num_input_tokens_seen": 3660510, "step": 174, "time_per_iteration": 2.703951120376587 }, { "auxiliary_loss_clip": 0.01424878, "auxiliary_loss_mlp": 0.01129464, "balance_loss_clip": 1.12995112, "balance_loss_mlp": 1.07257736, "epoch": 0.021042505861840918, "flos": 15666353694720.0, "grad_norm": 1.934908527565351, "language_loss": 0.90698099, "learning_rate": 3.7416082813989552e-06, "loss": 0.93252438, "num_input_tokens_seen": 3677505, "step": 175, "time_per_iteration": 2.666710376739502 }, { "auxiliary_loss_clip": 0.0142768, "auxiliary_loss_mlp": 0.01125929, "balance_loss_clip": 1.12346697, "balance_loss_mlp": 1.06959057, "epoch": 0.02116274875248001, "flos": 21142012734720.0, "grad_norm": 7.428030731974632, "language_loss": 0.89341116, "learning_rate": 3.745736189718439e-06, "loss": 0.91894722, "num_input_tokens_seen": 3696760, "step": 176, "time_per_iteration": 2.7681798934936523 }, { "auxiliary_loss_clip": 0.014175, "auxiliary_loss_mlp": 0.01120564, "balance_loss_clip": 1.12438297, "balance_loss_mlp": 1.06496429, "epoch": 0.0212829916431191, "flos": 24715770543360.0, "grad_norm": 2.8409153186369647, "language_loss": 0.72659451, "learning_rate": 3.749840710329894e-06, "loss": 0.75197518, "num_input_tokens_seen": 3717465, "step": 177, "time_per_iteration": 2.7880048751831055 }, { "auxiliary_loss_clip": 0.01426007, "auxiliary_loss_mlp": 0.01132113, "balance_loss_clip": 1.12694216, "balance_loss_mlp": 1.07699084, "epoch": 0.02140323453375819, "flos": 16645493508480.0, "grad_norm": 3.975676448028469, "language_loss": 0.9785682, "learning_rate": 3.7539221067588938e-06, "loss": 1.00414944, "num_input_tokens_seen": 3731440, "step": 178, "time_per_iteration": 2.627021074295044 }, { "auxiliary_loss_clip": 0.01420818, "auxiliary_loss_mlp": 0.01113477, "balance_loss_clip": 1.12233472, "balance_loss_mlp": 1.05234647, "epoch": 0.021523477424397284, "flos": 20299332689280.0, "grad_norm": 3.3962324124210217, "language_loss": 0.93677437, "learning_rate": 3.757980638101964e-06, "loss": 0.96211731, "num_input_tokens_seen": 3744935, "step": 179, "time_per_iteration": 2.692913055419922 }, { "auxiliary_loss_clip": 0.01426168, "auxiliary_loss_mlp": 0.01118814, "balance_loss_clip": 1.12735069, "balance_loss_mlp": 1.06342936, "epoch": 0.021643720315036374, "flos": 26104005331200.0, "grad_norm": 2.1789376488880654, "language_loss": 0.89403784, "learning_rate": 3.7620165591252806e-06, "loss": 0.9194876, "num_input_tokens_seen": 3763035, "step": 180, "time_per_iteration": 2.697840929031372 }, { "auxiliary_loss_clip": 0.01408248, "auxiliary_loss_mlp": 0.01112853, "balance_loss_clip": 1.12214875, "balance_loss_mlp": 1.05849373, "epoch": 0.021763963205675464, "flos": 24787663614720.0, "grad_norm": 1.9092320995456054, "language_loss": 0.9436177, "learning_rate": 3.766030120360636e-06, "loss": 0.96882868, "num_input_tokens_seen": 3782665, "step": 181, "time_per_iteration": 2.7423689365386963 }, { "auxiliary_loss_clip": 0.01409342, "auxiliary_loss_mlp": 0.0112004, "balance_loss_clip": 1.12093425, "balance_loss_mlp": 1.06277192, "epoch": 0.021884206096314557, "flos": 25813559957760.0, "grad_norm": 2.0878714690328213, "language_loss": 0.90313768, "learning_rate": 3.7700215681987578e-06, "loss": 0.92843157, "num_input_tokens_seen": 3802435, "step": 182, "time_per_iteration": 2.7355313301086426 }, { "auxiliary_loss_clip": 0.01409581, "auxiliary_loss_mlp": 0.01127667, "balance_loss_clip": 1.1182375, "balance_loss_mlp": 1.0692544, "epoch": 0.022004448986953647, "flos": 20082719721600.0, "grad_norm": 1.873902247121543, "language_loss": 0.8221271, "learning_rate": 3.7739911449800767e-06, "loss": 0.84749961, "num_input_tokens_seen": 3822490, "step": 183, "time_per_iteration": 2.7612643241882324 }, { "auxiliary_loss_clip": 0.01412725, "auxiliary_loss_mlp": 0.01121388, "balance_loss_clip": 1.11912727, "balance_loss_mlp": 1.06383336, "epoch": 0.022124691877592736, "flos": 20480609652480.0, "grad_norm": 2.2433747476047907, "language_loss": 0.8079825, "learning_rate": 3.7779390890830114e-06, "loss": 0.8333236, "num_input_tokens_seen": 3841140, "step": 184, "time_per_iteration": 2.71396803855896 }, { "auxiliary_loss_clip": 0.01407749, "auxiliary_loss_mlp": 0.01127196, "balance_loss_clip": 1.1176126, "balance_loss_mlp": 1.06892681, "epoch": 0.02224493476823183, "flos": 23586847015680.0, "grad_norm": 2.2824573746977252, "language_loss": 0.8587485, "learning_rate": 3.7818656350098723e-06, "loss": 0.88409793, "num_input_tokens_seen": 3862090, "step": 185, "time_per_iteration": 2.926093816757202 }, { "auxiliary_loss_clip": 0.014165, "auxiliary_loss_mlp": 0.01119344, "balance_loss_clip": 1.11888719, "balance_loss_mlp": 1.06262445, "epoch": 0.02236517765887092, "flos": 16909940413440.0, "grad_norm": 3.6690896487245657, "language_loss": 0.76778746, "learning_rate": 3.7857710134704447e-06, "loss": 0.7931459, "num_input_tokens_seen": 3881025, "step": 186, "time_per_iteration": 2.673619508743286 }, { "auxiliary_loss_clip": 0.01395066, "auxiliary_loss_mlp": 0.0112615, "balance_loss_clip": 1.11714458, "balance_loss_mlp": 1.0692153, "epoch": 0.02248542054951001, "flos": 43508182930560.0, "grad_norm": 2.1526834362344074, "language_loss": 0.79286981, "learning_rate": 3.7896554514633234e-06, "loss": 0.81808192, "num_input_tokens_seen": 3905310, "step": 187, "time_per_iteration": 2.8744282722473145 }, { "auxiliary_loss_clip": 0.01398343, "auxiliary_loss_mlp": 0.01114875, "balance_loss_clip": 1.11542726, "balance_loss_mlp": 1.06165981, "epoch": 0.022605663440149103, "flos": 23367648268800.0, "grad_norm": 2.0345818524321695, "language_loss": 0.84217811, "learning_rate": 3.7935191723550955e-06, "loss": 0.86731029, "num_input_tokens_seen": 3924265, "step": 188, "time_per_iteration": 2.771589517593384 }, { "auxiliary_loss_clip": 0.01404286, "auxiliary_loss_mlp": 0.01117557, "balance_loss_clip": 1.12098527, "balance_loss_mlp": 1.06133783, "epoch": 0.022725906330788193, "flos": 29019915504000.0, "grad_norm": 2.005348016902474, "language_loss": 0.88402086, "learning_rate": 3.797362395957408e-06, "loss": 0.90923929, "num_input_tokens_seen": 3944830, "step": 189, "time_per_iteration": 2.730041265487671 }, { "auxiliary_loss_clip": 0.01406714, "auxiliary_loss_mlp": 0.01114649, "balance_loss_clip": 1.118276, "balance_loss_mlp": 1.06040919, "epoch": 0.022846149221427282, "flos": 24496176746880.0, "grad_norm": 2.197040088945093, "language_loss": 0.78402781, "learning_rate": 3.8011853386020055e-06, "loss": 0.80924141, "num_input_tokens_seen": 3965735, "step": 190, "time_per_iteration": 2.761155843734741 }, { "auxiliary_loss_clip": 0.01402694, "auxiliary_loss_mlp": 0.01109625, "balance_loss_clip": 1.11350369, "balance_loss_mlp": 1.05869842, "epoch": 0.022966392112066376, "flos": 15523537219200.0, "grad_norm": 2.957340590342122, "language_loss": 0.89859891, "learning_rate": 3.804988213213804e-06, "loss": 0.92372209, "num_input_tokens_seen": 3983975, "step": 191, "time_per_iteration": 2.5981898307800293 }, { "auxiliary_loss_clip": 0.01339509, "auxiliary_loss_mlp": 0.01154329, "balance_loss_clip": 1.17330623, "balance_loss_mlp": 1.13086891, "epoch": 0.023086635002705466, "flos": 55650408433920.0, "grad_norm": 1.034508824366444, "language_loss": 0.63183385, "learning_rate": 3.808771229382049e-06, "loss": 0.6567722, "num_input_tokens_seen": 4043440, "step": 192, "time_per_iteration": 3.2255263328552246 }, { "auxiliary_loss_clip": 0.01399246, "auxiliary_loss_mlp": 0.01114467, "balance_loss_clip": 1.11588609, "balance_loss_mlp": 1.05858219, "epoch": 0.023206877893344555, "flos": 19313441118720.0, "grad_norm": 2.0783323847025743, "language_loss": 0.84411955, "learning_rate": 3.8125345934296324e-06, "loss": 0.86925668, "num_input_tokens_seen": 4061750, "step": 193, "time_per_iteration": 2.690062999725342 }, { "auxiliary_loss_clip": 0.01403718, "auxiliary_loss_mlp": 0.01124627, "balance_loss_clip": 1.11215949, "balance_loss_mlp": 1.06788313, "epoch": 0.02332712078398365, "flos": 23072965090560.0, "grad_norm": 2.8839757656018628, "language_loss": 0.88123608, "learning_rate": 3.81627850848061e-06, "loss": 0.90651953, "num_input_tokens_seen": 4082345, "step": 194, "time_per_iteration": 2.7380588054656982 }, { "auxiliary_loss_clip": 0.01397073, "auxiliary_loss_mlp": 0.01108619, "balance_loss_clip": 1.11445141, "balance_loss_mlp": 1.05475974, "epoch": 0.02344736367462274, "flos": 24425971614720.0, "grad_norm": 3.5295112200877257, "language_loss": 0.86246318, "learning_rate": 3.820003174525994e-06, "loss": 0.88752019, "num_input_tokens_seen": 4101770, "step": 195, "time_per_iteration": 2.7895748615264893 }, { "auxiliary_loss_clip": 0.014022, "auxiliary_loss_mlp": 0.01119163, "balance_loss_clip": 1.11836493, "balance_loss_mlp": 1.06146574, "epoch": 0.02356760656526183, "flos": 21579799697280.0, "grad_norm": 3.2576022981179786, "language_loss": 0.82793003, "learning_rate": 3.823708788487851e-06, "loss": 0.85314357, "num_input_tokens_seen": 4118770, "step": 196, "time_per_iteration": 2.7860825061798096 }, { "auxiliary_loss_clip": 0.01398072, "auxiliary_loss_mlp": 0.0110967, "balance_loss_clip": 1.11607003, "balance_loss_mlp": 1.05683601, "epoch": 0.02368784945590092, "flos": 25193598192000.0, "grad_norm": 2.1777284949511544, "language_loss": 0.84538603, "learning_rate": 3.827395544281781e-06, "loss": 0.87046337, "num_input_tokens_seen": 4141110, "step": 197, "time_per_iteration": 4.72819447517395 }, { "auxiliary_loss_clip": 0.01407598, "auxiliary_loss_mlp": 0.01121913, "balance_loss_clip": 1.11792171, "balance_loss_mlp": 1.06912673, "epoch": 0.02380809234654001, "flos": 27562481164800.0, "grad_norm": 1.8736760257845173, "language_loss": 0.78941298, "learning_rate": 3.831063632877802e-06, "loss": 0.81470811, "num_input_tokens_seen": 4161430, "step": 198, "time_per_iteration": 2.8219449520111084 }, { "auxiliary_loss_clip": 0.01392019, "auxiliary_loss_mlp": 0.01119946, "balance_loss_clip": 1.1165179, "balance_loss_mlp": 1.06496644, "epoch": 0.0239283352371791, "flos": 18259786540800.0, "grad_norm": 2.4787491003285087, "language_loss": 0.75898457, "learning_rate": 3.834713242359712e-06, "loss": 0.78410423, "num_input_tokens_seen": 4179260, "step": 199, "time_per_iteration": 3.669806718826294 }, { "auxiliary_loss_clip": 0.01396622, "auxiliary_loss_mlp": 0.01119305, "balance_loss_clip": 1.11617446, "balance_loss_mlp": 1.06341994, "epoch": 0.02404857812781819, "flos": 21395110942080.0, "grad_norm": 1.8180994860322914, "language_loss": 0.87273937, "learning_rate": 3.838344557982959e-06, "loss": 0.89789867, "num_input_tokens_seen": 4200640, "step": 200, "time_per_iteration": 2.802196979522705 }, { "auxiliary_loss_clip": 0.01397601, "auxiliary_loss_mlp": 0.01099498, "balance_loss_clip": 1.1097163, "balance_loss_mlp": 1.04814267, "epoch": 0.024168821018457284, "flos": 16654256426880.0, "grad_norm": 2.6403003692430818, "language_loss": 0.84950113, "learning_rate": 3.841957762231063e-06, "loss": 0.87447214, "num_input_tokens_seen": 4218170, "step": 201, "time_per_iteration": 2.7075273990631104 }, { "auxiliary_loss_clip": 0.01384205, "auxiliary_loss_mlp": 0.01099781, "balance_loss_clip": 1.10663056, "balance_loss_mlp": 1.04725778, "epoch": 0.024289063909096374, "flos": 22820872464000.0, "grad_norm": 2.1012361511567175, "language_loss": 0.87600875, "learning_rate": 3.8455530348706454e-06, "loss": 0.90084863, "num_input_tokens_seen": 4237770, "step": 202, "time_per_iteration": 2.7837843894958496 }, { "auxiliary_loss_clip": 0.01387644, "auxiliary_loss_mlp": 0.01103075, "balance_loss_clip": 1.11163676, "balance_loss_mlp": 1.05016994, "epoch": 0.024409306799735464, "flos": 17748598135680.0, "grad_norm": 1.8736171094158014, "language_loss": 0.77304125, "learning_rate": 3.849130553005099e-06, "loss": 0.79794842, "num_input_tokens_seen": 4255985, "step": 203, "time_per_iteration": 2.7886948585510254 }, { "auxiliary_loss_clip": 0.01394862, "auxiliary_loss_mlp": 0.0110522, "balance_loss_clip": 1.11144125, "balance_loss_mlp": 1.05052662, "epoch": 0.024529549690374557, "flos": 21616213109760.0, "grad_norm": 1.7325943114930018, "language_loss": 0.83552253, "learning_rate": 3.852690491126933e-06, "loss": 0.86052334, "num_input_tokens_seen": 4276035, "step": 204, "time_per_iteration": 2.662778854370117 }, { "auxiliary_loss_clip": 0.0139126, "auxiliary_loss_mlp": 0.01110395, "balance_loss_clip": 1.1085422, "balance_loss_mlp": 1.05818176, "epoch": 0.024649792581013647, "flos": 25551662918400.0, "grad_norm": 3.888815995854617, "language_loss": 0.91278601, "learning_rate": 3.856233021168845e-06, "loss": 0.93780255, "num_input_tokens_seen": 4295730, "step": 205, "time_per_iteration": 2.768275737762451 }, { "auxiliary_loss_clip": 0.01372252, "auxiliary_loss_mlp": 0.01123534, "balance_loss_clip": 1.10263181, "balance_loss_mlp": 1.07270324, "epoch": 0.024770035471652737, "flos": 34495574544000.0, "grad_norm": 2.1588786833516824, "language_loss": 0.91068918, "learning_rate": 3.859758312553544e-06, "loss": 0.93564701, "num_input_tokens_seen": 4317950, "step": 206, "time_per_iteration": 2.807624101638794 }, { "auxiliary_loss_clip": 0.01393033, "auxiliary_loss_mlp": 0.01112739, "balance_loss_clip": 1.11407447, "balance_loss_mlp": 1.0614078, "epoch": 0.02489027836229183, "flos": 21505428587520.0, "grad_norm": 2.2941506510161984, "language_loss": 0.91827607, "learning_rate": 3.8632665322423735e-06, "loss": 0.94333375, "num_input_tokens_seen": 4337605, "step": 207, "time_per_iteration": 2.810553789138794 }, { "auxiliary_loss_clip": 0.01380534, "auxiliary_loss_mlp": 0.01119401, "balance_loss_clip": 1.1094929, "balance_loss_mlp": 1.06573296, "epoch": 0.02501052125293092, "flos": 23219013790080.0, "grad_norm": 1.8259763602898254, "language_loss": 0.85986352, "learning_rate": 3.866757844782762e-06, "loss": 0.8848629, "num_input_tokens_seen": 4358110, "step": 208, "time_per_iteration": 2.8317456245422363 }, { "auxiliary_loss_clip": 0.01381065, "auxiliary_loss_mlp": 0.01114516, "balance_loss_clip": 1.11018026, "balance_loss_mlp": 1.0626359, "epoch": 0.02513076414357001, "flos": 26388920010240.0, "grad_norm": 2.3106908462982925, "language_loss": 0.91182595, "learning_rate": 3.870232412354527e-06, "loss": 0.93678176, "num_input_tokens_seen": 4374955, "step": 209, "time_per_iteration": 2.7541747093200684 }, { "auxiliary_loss_clip": 0.01385513, "auxiliary_loss_mlp": 0.01107379, "balance_loss_clip": 1.10793829, "balance_loss_mlp": 1.05452156, "epoch": 0.025251007034209103, "flos": 13590430047360.0, "grad_norm": 2.3131440907084992, "language_loss": 0.9254064, "learning_rate": 3.873690394815086e-06, "loss": 0.95033532, "num_input_tokens_seen": 4391535, "step": 210, "time_per_iteration": 2.773555040359497 }, { "auxiliary_loss_clip": 0.01381748, "auxiliary_loss_mlp": 0.01127058, "balance_loss_clip": 1.10456824, "balance_loss_mlp": 1.07338977, "epoch": 0.025371249924848193, "flos": 15049229103360.0, "grad_norm": 2.659124519830782, "language_loss": 0.91462952, "learning_rate": 3.877131949743587e-06, "loss": 0.93971753, "num_input_tokens_seen": 4408400, "step": 211, "time_per_iteration": 2.9739975929260254 }, { "auxiliary_loss_clip": 0.01377397, "auxiliary_loss_mlp": 0.01108742, "balance_loss_clip": 1.103899, "balance_loss_mlp": 1.05953264, "epoch": 0.025491492815487283, "flos": 25553853648000.0, "grad_norm": 3.023271393396335, "language_loss": 0.7795763, "learning_rate": 3.880557232483993e-06, "loss": 0.80443776, "num_input_tokens_seen": 4427840, "step": 212, "time_per_iteration": 2.847592353820801 }, { "auxiliary_loss_clip": 0.01376255, "auxiliary_loss_mlp": 0.01111078, "balance_loss_clip": 1.10382247, "balance_loss_mlp": 1.05769598, "epoch": 0.025611735706126376, "flos": 20630752502400.0, "grad_norm": 2.4156088394779154, "language_loss": 0.86865598, "learning_rate": 3.883966396187164e-06, "loss": 0.8935293, "num_input_tokens_seen": 4447110, "step": 213, "time_per_iteration": 2.7258505821228027 }, { "auxiliary_loss_clip": 0.01378424, "auxiliary_loss_mlp": 0.01103168, "balance_loss_clip": 1.10589445, "balance_loss_mlp": 1.05217028, "epoch": 0.025731978596765466, "flos": 19062282245760.0, "grad_norm": 1.9411934400594122, "language_loss": 0.90180725, "learning_rate": 3.887359591851937e-06, "loss": 0.92662311, "num_input_tokens_seen": 4464715, "step": 214, "time_per_iteration": 2.7642858028411865 }, { "auxiliary_loss_clip": 0.0137586, "auxiliary_loss_mlp": 0.01104971, "balance_loss_clip": 1.10806417, "balance_loss_mlp": 1.05282903, "epoch": 0.025852221487404556, "flos": 22163814927360.0, "grad_norm": 1.678094890722852, "language_loss": 0.9235177, "learning_rate": 3.890736968365265e-06, "loss": 0.94832599, "num_input_tokens_seen": 4485030, "step": 215, "time_per_iteration": 2.727710008621216 }, { "auxiliary_loss_clip": 0.01372564, "auxiliary_loss_mlp": 0.01085451, "balance_loss_clip": 1.10290515, "balance_loss_mlp": 1.03867316, "epoch": 0.02597246437804365, "flos": 26541971861760.0, "grad_norm": 1.9087461816788505, "language_loss": 0.84948409, "learning_rate": 3.894098672541412e-06, "loss": 0.87406427, "num_input_tokens_seen": 4505935, "step": 216, "time_per_iteration": 2.7877256870269775 }, { "auxiliary_loss_clip": 0.01374717, "auxiliary_loss_mlp": 0.01097737, "balance_loss_clip": 1.104774, "balance_loss_mlp": 1.04900467, "epoch": 0.02609270726868274, "flos": 32671671696000.0, "grad_norm": 3.224106469378558, "language_loss": 0.75345975, "learning_rate": 3.89744484916025e-06, "loss": 0.77818429, "num_input_tokens_seen": 4527045, "step": 217, "time_per_iteration": 2.7784478664398193 }, { "auxiliary_loss_clip": 0.01371662, "auxiliary_loss_mlp": 0.01105203, "balance_loss_clip": 1.10182297, "balance_loss_mlp": 1.05141616, "epoch": 0.02621295015932183, "flos": 26243553669120.0, "grad_norm": 1.8528604487814617, "language_loss": 0.87257081, "learning_rate": 3.900775641004673e-06, "loss": 0.89733946, "num_input_tokens_seen": 4546360, "step": 218, "time_per_iteration": 2.8026282787323 }, { "auxiliary_loss_clip": 0.01389135, "auxiliary_loss_mlp": 0.01110241, "balance_loss_clip": 1.10913444, "balance_loss_mlp": 1.06186604, "epoch": 0.026333193049960922, "flos": 42921402353280.0, "grad_norm": 7.768017570090544, "language_loss": 0.74310839, "learning_rate": 3.904091188897156e-06, "loss": 0.76810217, "num_input_tokens_seen": 4565495, "step": 219, "time_per_iteration": 2.907325506210327 }, { "auxiliary_loss_clip": 0.01374106, "auxiliary_loss_mlp": 0.01112003, "balance_loss_clip": 1.10749102, "balance_loss_mlp": 1.06076682, "epoch": 0.026453435940600012, "flos": 17963846386560.0, "grad_norm": 2.7121782034288318, "language_loss": 0.82109702, "learning_rate": 3.90739163173548e-06, "loss": 0.84595811, "num_input_tokens_seen": 4583330, "step": 220, "time_per_iteration": 2.7174806594848633 }, { "auxiliary_loss_clip": 0.01372735, "auxiliary_loss_mlp": 0.0110916, "balance_loss_clip": 1.10430074, "balance_loss_mlp": 1.05775666, "epoch": 0.026573678831239102, "flos": 18984319776000.0, "grad_norm": 2.386620601274439, "language_loss": 0.88399714, "learning_rate": 3.910677106527646e-06, "loss": 0.9088161, "num_input_tokens_seen": 4600520, "step": 221, "time_per_iteration": 2.71752667427063 }, { "auxiliary_loss_clip": 0.0136681, "auxiliary_loss_mlp": 0.0110697, "balance_loss_clip": 1.10011458, "balance_loss_mlp": 1.05652046, "epoch": 0.026693921721878195, "flos": 29241448634880.0, "grad_norm": 2.1430792884631797, "language_loss": 0.84066874, "learning_rate": 3.913947748426004e-06, "loss": 0.86540651, "num_input_tokens_seen": 4617340, "step": 222, "time_per_iteration": 2.7856829166412354 }, { "auxiliary_loss_clip": 0.01367607, "auxiliary_loss_mlp": 0.01110863, "balance_loss_clip": 1.10256362, "balance_loss_mlp": 1.05800605, "epoch": 0.026814164612517285, "flos": 14128083797760.0, "grad_norm": 2.701951932356017, "language_loss": 0.76328963, "learning_rate": 3.9172036907606136e-06, "loss": 0.78807431, "num_input_tokens_seen": 4630820, "step": 223, "time_per_iteration": 3.6921634674072266 }, { "auxiliary_loss_clip": 0.01373011, "auxiliary_loss_mlp": 0.01114937, "balance_loss_clip": 1.10484231, "balance_loss_mlp": 1.06248534, "epoch": 0.026934407503156375, "flos": 23511973115520.0, "grad_norm": 1.8978361918547557, "language_loss": 0.94718695, "learning_rate": 3.920445065071855e-06, "loss": 0.97206634, "num_input_tokens_seen": 4651985, "step": 224, "time_per_iteration": 2.7569668292999268 }, { "auxiliary_loss_clip": 0.01362648, "auxiliary_loss_mlp": 0.01099064, "balance_loss_clip": 1.10105371, "balance_loss_mlp": 1.04787517, "epoch": 0.027054650393795468, "flos": 28950356816640.0, "grad_norm": 2.423856193960826, "language_loss": 0.79885125, "learning_rate": 3.923672001142322e-06, "loss": 0.82346833, "num_input_tokens_seen": 4672295, "step": 225, "time_per_iteration": 3.711632251739502 }, { "auxiliary_loss_clip": 0.0136406, "auxiliary_loss_mlp": 0.01109124, "balance_loss_clip": 1.09834421, "balance_loss_mlp": 1.05927086, "epoch": 0.027174893284434558, "flos": 31431568596480.0, "grad_norm": 1.7494505073499678, "language_loss": 0.84266376, "learning_rate": 3.926884627027996e-06, "loss": 0.86739564, "num_input_tokens_seen": 4696065, "step": 226, "time_per_iteration": 3.792670965194702 }, { "auxiliary_loss_clip": 0.01365389, "auxiliary_loss_mlp": 0.01110119, "balance_loss_clip": 1.10069132, "balance_loss_mlp": 1.06138587, "epoch": 0.027295136175073648, "flos": 22054466949120.0, "grad_norm": 1.972783882727411, "language_loss": 0.77480567, "learning_rate": 3.930083069088744e-06, "loss": 0.79956079, "num_input_tokens_seen": 4716065, "step": 227, "time_per_iteration": 2.8280982971191406 }, { "auxiliary_loss_clip": 0.01321866, "auxiliary_loss_mlp": 0.01029049, "balance_loss_clip": 1.16509271, "balance_loss_mlp": 1.00835407, "epoch": 0.02741537906571274, "flos": 60800752972800.0, "grad_norm": 0.9955729298663645, "language_loss": 0.59368277, "learning_rate": 3.933267452018137e-06, "loss": 0.61719191, "num_input_tokens_seen": 4775860, "step": 228, "time_per_iteration": 3.267563819885254 }, { "auxiliary_loss_clip": 0.01362136, "auxiliary_loss_mlp": 0.01104295, "balance_loss_clip": 1.10203159, "balance_loss_mlp": 1.05646861, "epoch": 0.02753562195635183, "flos": 24606278910720.0, "grad_norm": 2.4409119347192694, "language_loss": 0.84207189, "learning_rate": 3.936437898872622e-06, "loss": 0.86673617, "num_input_tokens_seen": 4795835, "step": 229, "time_per_iteration": 2.798703908920288 }, { "auxiliary_loss_clip": 0.0136867, "auxiliary_loss_mlp": 0.01103737, "balance_loss_clip": 1.10292423, "balance_loss_mlp": 1.05524278, "epoch": 0.02765586484699092, "flos": 34094236907520.0, "grad_norm": 3.161211387759752, "language_loss": 0.79820228, "learning_rate": 3.9395945311000525e-06, "loss": 0.82292628, "num_input_tokens_seen": 4817460, "step": 230, "time_per_iteration": 2.8014519214630127 }, { "auxiliary_loss_clip": 0.01367671, "auxiliary_loss_mlp": 0.01106368, "balance_loss_clip": 1.10321474, "balance_loss_mlp": 1.05677724, "epoch": 0.027776107737630014, "flos": 14829922615680.0, "grad_norm": 4.605381311992704, "language_loss": 0.90724707, "learning_rate": 3.942737468567608e-06, "loss": 0.93198746, "num_input_tokens_seen": 4835475, "step": 231, "time_per_iteration": 2.6940255165100098 }, { "auxiliary_loss_clip": 0.01366716, "auxiliary_loss_mlp": 0.01101358, "balance_loss_clip": 1.10146439, "balance_loss_mlp": 1.05422306, "epoch": 0.027896350628269104, "flos": 47920347066240.0, "grad_norm": 2.4165492744702877, "language_loss": 0.86021292, "learning_rate": 3.9458668295891026e-06, "loss": 0.88489372, "num_input_tokens_seen": 4857760, "step": 232, "time_per_iteration": 2.9130940437316895 }, { "auxiliary_loss_clip": 0.01362006, "auxiliary_loss_mlp": 0.01113692, "balance_loss_clip": 1.09955883, "balance_loss_mlp": 1.06309986, "epoch": 0.028016593518908194, "flos": 21684550734720.0, "grad_norm": 2.222790973848939, "language_loss": 0.86790031, "learning_rate": 3.948982730951712e-06, "loss": 0.89265728, "num_input_tokens_seen": 4875855, "step": 233, "time_per_iteration": 2.756370782852173 }, { "auxiliary_loss_clip": 0.01364542, "auxiliary_loss_mlp": 0.01104063, "balance_loss_clip": 1.09906769, "balance_loss_mlp": 1.05699921, "epoch": 0.028136836409547287, "flos": 18439483305600.0, "grad_norm": 3.624768809050723, "language_loss": 0.8195979, "learning_rate": 3.9520852879421254e-06, "loss": 0.84428394, "num_input_tokens_seen": 4893200, "step": 234, "time_per_iteration": 2.817683696746826 }, { "auxiliary_loss_clip": 0.01353845, "auxiliary_loss_mlp": 0.01107682, "balance_loss_clip": 1.09705842, "balance_loss_mlp": 1.05790031, "epoch": 0.028257079300186377, "flos": 31576934937600.0, "grad_norm": 2.2339799484753855, "language_loss": 0.81657916, "learning_rate": 3.955174614372137e-06, "loss": 0.84119445, "num_input_tokens_seen": 4912965, "step": 235, "time_per_iteration": 2.7764627933502197 }, { "auxiliary_loss_clip": 0.01361391, "auxiliary_loss_mlp": 0.01096162, "balance_loss_clip": 1.09687626, "balance_loss_mlp": 1.04731047, "epoch": 0.028377322190825467, "flos": 23513337832320.0, "grad_norm": 2.1912384475471725, "language_loss": 0.84384006, "learning_rate": 3.9582508226037045e-06, "loss": 0.86841559, "num_input_tokens_seen": 4933105, "step": 236, "time_per_iteration": 2.760528326034546 }, { "auxiliary_loss_clip": 0.0136592, "auxiliary_loss_mlp": 0.01107622, "balance_loss_clip": 1.09811139, "balance_loss_mlp": 1.05557477, "epoch": 0.02849756508146456, "flos": 20479604071680.0, "grad_norm": 2.6595422585928814, "language_loss": 0.94130719, "learning_rate": 3.9613140235734636e-06, "loss": 0.96604258, "num_input_tokens_seen": 4950085, "step": 237, "time_per_iteration": 2.6882126331329346 }, { "auxiliary_loss_clip": 0.01363247, "auxiliary_loss_mlp": 0.01102001, "balance_loss_clip": 1.10069609, "balance_loss_mlp": 1.05264843, "epoch": 0.02861780797210365, "flos": 14283362292480.0, "grad_norm": 1.984925870003617, "language_loss": 0.80835307, "learning_rate": 3.96436432681674e-06, "loss": 0.83300561, "num_input_tokens_seen": 4968075, "step": 238, "time_per_iteration": 2.7287802696228027 }, { "auxiliary_loss_clip": 0.0136234, "auxiliary_loss_mlp": 0.01117147, "balance_loss_clip": 1.09809279, "balance_loss_mlp": 1.06669796, "epoch": 0.02873805086274274, "flos": 25808532053760.0, "grad_norm": 4.445766902701044, "language_loss": 0.89047998, "learning_rate": 3.967401840491044e-06, "loss": 0.91527486, "num_input_tokens_seen": 4987355, "step": 239, "time_per_iteration": 2.719059944152832 }, { "auxiliary_loss_clip": 0.0135081, "auxiliary_loss_mlp": 0.01103735, "balance_loss_clip": 1.09851742, "balance_loss_mlp": 1.05917442, "epoch": 0.028858293753381833, "flos": 17304238984320.0, "grad_norm": 2.6362393404482924, "language_loss": 0.87741947, "learning_rate": 3.97042667139909e-06, "loss": 0.9019649, "num_input_tokens_seen": 5004680, "step": 240, "time_per_iteration": 2.7700328826904297 }, { "auxiliary_loss_clip": 0.01357068, "auxiliary_loss_mlp": 0.01098258, "balance_loss_clip": 1.09701347, "balance_loss_mlp": 1.0490005, "epoch": 0.028978536644020923, "flos": 23038347358080.0, "grad_norm": 2.848658576338228, "language_loss": 0.87338483, "learning_rate": 3.973438925011327e-06, "loss": 0.89793813, "num_input_tokens_seen": 5022965, "step": 241, "time_per_iteration": 2.6698412895202637 }, { "auxiliary_loss_clip": 0.01349659, "auxiliary_loss_mlp": 0.011027, "balance_loss_clip": 1.09091818, "balance_loss_mlp": 1.05446815, "epoch": 0.029098779534660012, "flos": 28329712692480.0, "grad_norm": 2.6255401465844943, "language_loss": 0.91293526, "learning_rate": 3.976438705488002e-06, "loss": 0.93745887, "num_input_tokens_seen": 5042625, "step": 242, "time_per_iteration": 2.7481367588043213 }, { "auxiliary_loss_clip": 0.01356175, "auxiliary_loss_mlp": 0.0110624, "balance_loss_clip": 1.09948087, "balance_loss_mlp": 1.05853224, "epoch": 0.029219022425299106, "flos": 13881665520000.0, "grad_norm": 2.6748625746085932, "language_loss": 0.92652595, "learning_rate": 3.9794261157007744e-06, "loss": 0.95115018, "num_input_tokens_seen": 5060380, "step": 243, "time_per_iteration": 2.70497989654541 }, { "auxiliary_loss_clip": 0.01358834, "auxiliary_loss_mlp": 0.01106253, "balance_loss_clip": 1.09542394, "balance_loss_mlp": 1.05752063, "epoch": 0.029339265315938196, "flos": 19422501788160.0, "grad_norm": 2.10415580329221, "language_loss": 0.84982318, "learning_rate": 3.982401257253887e-06, "loss": 0.87447399, "num_input_tokens_seen": 5078720, "step": 244, "time_per_iteration": 2.673243761062622 }, { "auxiliary_loss_clip": 0.01353406, "auxiliary_loss_mlp": 0.01099133, "balance_loss_clip": 1.09610319, "balance_loss_mlp": 1.04973304, "epoch": 0.029459508206577285, "flos": 15669550005120.0, "grad_norm": 2.6674059885580728, "language_loss": 0.89509118, "learning_rate": 3.985364230504893e-06, "loss": 0.91961658, "num_input_tokens_seen": 5096605, "step": 245, "time_per_iteration": 2.734968662261963 }, { "auxiliary_loss_clip": 0.01351184, "auxiliary_loss_mlp": 0.011181, "balance_loss_clip": 1.09442067, "balance_loss_mlp": 1.06810343, "epoch": 0.02957975109721638, "flos": 28220975245440.0, "grad_norm": 2.0327232859888973, "language_loss": 0.84050059, "learning_rate": 3.988315134584976e-06, "loss": 0.86519349, "num_input_tokens_seen": 5116285, "step": 246, "time_per_iteration": 2.7613775730133057 }, { "auxiliary_loss_clip": 0.01359031, "auxiliary_loss_mlp": 0.01097757, "balance_loss_clip": 1.09800506, "balance_loss_mlp": 1.0504787, "epoch": 0.02969999398785547, "flos": 24315869450880.0, "grad_norm": 1.8265498906716755, "language_loss": 0.80471241, "learning_rate": 3.991254067418851e-06, "loss": 0.82928026, "num_input_tokens_seen": 5136825, "step": 247, "time_per_iteration": 2.7622106075286865 }, { "auxiliary_loss_clip": 0.01350658, "auxiliary_loss_mlp": 0.01098716, "balance_loss_clip": 1.0977509, "balance_loss_mlp": 1.05255842, "epoch": 0.02982023687849456, "flos": 35078584193280.0, "grad_norm": 2.163598183042214, "language_loss": 0.82907218, "learning_rate": 3.994181125744254e-06, "loss": 0.85356593, "num_input_tokens_seen": 5158630, "step": 248, "time_per_iteration": 2.7482242584228516 }, { "auxiliary_loss_clip": 0.01342907, "auxiliary_loss_mlp": 0.01100028, "balance_loss_clip": 1.09406066, "balance_loss_mlp": 1.05139041, "epoch": 0.02994047976913365, "flos": 26177155378560.0, "grad_norm": 1.824829928608993, "language_loss": 0.73775184, "learning_rate": 3.99709640513106e-06, "loss": 0.76218122, "num_input_tokens_seen": 5179510, "step": 249, "time_per_iteration": 2.759646415710449 }, { "auxiliary_loss_clip": 0.01352346, "auxiliary_loss_mlp": 0.01110454, "balance_loss_clip": 1.09338391, "balance_loss_mlp": 1.06296158, "epoch": 0.03006072265977274, "flos": 25625028447360.0, "grad_norm": 2.447142119573689, "language_loss": 0.85379368, "learning_rate": 4e-06, "loss": 0.87842166, "num_input_tokens_seen": 5199345, "step": 250, "time_per_iteration": 4.672216176986694 }, { "auxiliary_loss_clip": 0.01347697, "auxiliary_loss_mlp": 0.01106086, "balance_loss_clip": 1.09555984, "balance_loss_mlp": 1.05823541, "epoch": 0.03018096555041183, "flos": 22127078292480.0, "grad_norm": 2.5138060257000445, "language_loss": 0.88530582, "learning_rate": 3.999999848300794e-06, "loss": 0.90984368, "num_input_tokens_seen": 5218330, "step": 251, "time_per_iteration": 2.740802049636841 }, { "auxiliary_loss_clip": 0.01342772, "auxiliary_loss_mlp": 0.01089584, "balance_loss_clip": 1.0903244, "balance_loss_mlp": 1.04294944, "epoch": 0.030301208441050925, "flos": 30188197359360.0, "grad_norm": 1.9841101637443732, "language_loss": 0.89080429, "learning_rate": 3.999999393203203e-06, "loss": 0.91512781, "num_input_tokens_seen": 5240740, "step": 252, "time_per_iteration": 3.78584361076355 }, { "auxiliary_loss_clip": 0.01348209, "auxiliary_loss_mlp": 0.01105927, "balance_loss_clip": 1.09212363, "balance_loss_mlp": 1.05876827, "epoch": 0.030421451331690014, "flos": 23621392920960.0, "grad_norm": 2.3476098281451003, "language_loss": 0.85060263, "learning_rate": 3.999998634707293e-06, "loss": 0.875144, "num_input_tokens_seen": 5260290, "step": 253, "time_per_iteration": 2.8615915775299072 }, { "auxiliary_loss_clip": 0.01354258, "auxiliary_loss_mlp": 0.01120115, "balance_loss_clip": 1.09631968, "balance_loss_mlp": 1.06911719, "epoch": 0.030541694222329104, "flos": 27928446883200.0, "grad_norm": 2.3165816845318608, "language_loss": 0.96303892, "learning_rate": 3.999997572813182e-06, "loss": 0.9877826, "num_input_tokens_seen": 5278100, "step": 254, "time_per_iteration": 3.595822811126709 }, { "auxiliary_loss_clip": 0.01343915, "auxiliary_loss_mlp": 0.01091297, "balance_loss_clip": 1.09268332, "balance_loss_mlp": 1.04337454, "epoch": 0.030661937112968194, "flos": 18588441006720.0, "grad_norm": 1.9687548948233762, "language_loss": 0.87465733, "learning_rate": 3.999996207521028e-06, "loss": 0.89900947, "num_input_tokens_seen": 5296810, "step": 255, "time_per_iteration": 2.7191085815429688 }, { "auxiliary_loss_clip": 0.01352486, "auxiliary_loss_mlp": 0.0110086, "balance_loss_clip": 1.09292328, "balance_loss_mlp": 1.05610871, "epoch": 0.030782180003607287, "flos": 12969139478400.0, "grad_norm": 2.6807682419841994, "language_loss": 0.82029939, "learning_rate": 3.999994538831039e-06, "loss": 0.8448329, "num_input_tokens_seen": 5313395, "step": 256, "time_per_iteration": 2.660717010498047 }, { "auxiliary_loss_clip": 0.01347915, "auxiliary_loss_mlp": 0.01104174, "balance_loss_clip": 1.09141099, "balance_loss_mlp": 1.05527425, "epoch": 0.030902422894246377, "flos": 23335364920320.0, "grad_norm": 2.248497800774475, "language_loss": 0.85727662, "learning_rate": 3.99999256674347e-06, "loss": 0.88179755, "num_input_tokens_seen": 5333545, "step": 257, "time_per_iteration": 2.793316125869751 }, { "auxiliary_loss_clip": 0.0127106, "auxiliary_loss_mlp": 0.01025744, "balance_loss_clip": 1.12724423, "balance_loss_mlp": 1.0075289, "epoch": 0.031022665784885467, "flos": 55094151438720.0, "grad_norm": 1.0439352413141239, "language_loss": 0.53500766, "learning_rate": 3.999990291258618e-06, "loss": 0.55797571, "num_input_tokens_seen": 5392235, "step": 258, "time_per_iteration": 3.2857632637023926 }, { "auxiliary_loss_clip": 0.01341361, "auxiliary_loss_mlp": 0.0110475, "balance_loss_clip": 1.09176731, "balance_loss_mlp": 1.0582819, "epoch": 0.03114290867552456, "flos": 19317786664320.0, "grad_norm": 2.16605850439145, "language_loss": 0.86661518, "learning_rate": 3.999987712376829e-06, "loss": 0.89107633, "num_input_tokens_seen": 5410555, "step": 259, "time_per_iteration": 2.7026805877685547 }, { "auxiliary_loss_clip": 0.01347657, "auxiliary_loss_mlp": 0.01093918, "balance_loss_clip": 1.09567547, "balance_loss_mlp": 1.04780793, "epoch": 0.031263151566163654, "flos": 20959442881920.0, "grad_norm": 2.2257504287581944, "language_loss": 0.82140774, "learning_rate": 3.999984830098494e-06, "loss": 0.84582353, "num_input_tokens_seen": 5430135, "step": 260, "time_per_iteration": 2.7673466205596924 }, { "auxiliary_loss_clip": 0.01348045, "auxiliary_loss_mlp": 0.01102456, "balance_loss_clip": 1.09500623, "balance_loss_mlp": 1.05331802, "epoch": 0.03138339445680274, "flos": 14793006412800.0, "grad_norm": 7.377850718226571, "language_loss": 0.97714019, "learning_rate": 3.999981644424051e-06, "loss": 1.00164533, "num_input_tokens_seen": 5444935, "step": 261, "time_per_iteration": 2.6607930660247803 }, { "auxiliary_loss_clip": 0.013449, "auxiliary_loss_mlp": 0.01110486, "balance_loss_clip": 1.09124899, "balance_loss_mlp": 1.06194401, "epoch": 0.03150363734744183, "flos": 11655599022720.0, "grad_norm": 2.5790469884350196, "language_loss": 0.85941982, "learning_rate": 3.999978155353982e-06, "loss": 0.88397372, "num_input_tokens_seen": 5462080, "step": 262, "time_per_iteration": 2.6652793884277344 }, { "auxiliary_loss_clip": 0.01345965, "auxiliary_loss_mlp": 0.01099476, "balance_loss_clip": 1.08992624, "balance_loss_mlp": 1.05272222, "epoch": 0.03162388023808092, "flos": 33727732485120.0, "grad_norm": 2.492975797827368, "language_loss": 0.80182862, "learning_rate": 3.9999743628888186e-06, "loss": 0.8262831, "num_input_tokens_seen": 5483870, "step": 263, "time_per_iteration": 2.762723207473755 }, { "auxiliary_loss_clip": 0.01343928, "auxiliary_loss_mlp": 0.01107008, "balance_loss_clip": 1.09024668, "balance_loss_mlp": 1.0587759, "epoch": 0.03174412312872001, "flos": 20810952057600.0, "grad_norm": 4.618061861887177, "language_loss": 0.8954643, "learning_rate": 3.999970267029133e-06, "loss": 0.91997373, "num_input_tokens_seen": 5502830, "step": 264, "time_per_iteration": 2.7293283939361572 }, { "auxiliary_loss_clip": 0.01339172, "auxiliary_loss_mlp": 0.01107732, "balance_loss_clip": 1.0912739, "balance_loss_mlp": 1.06085896, "epoch": 0.0318643660193591, "flos": 23727939638400.0, "grad_norm": 1.7358577547975882, "language_loss": 0.80125403, "learning_rate": 3.999965867775548e-06, "loss": 0.82572305, "num_input_tokens_seen": 5523225, "step": 265, "time_per_iteration": 2.675804376602173 }, { "auxiliary_loss_clip": 0.01342584, "auxiliary_loss_mlp": 0.01096001, "balance_loss_clip": 1.09247565, "balance_loss_mlp": 1.05086827, "epoch": 0.0319846089099982, "flos": 13917863450880.0, "grad_norm": 2.8170667785101147, "language_loss": 0.86730266, "learning_rate": 3.9999611651287315e-06, "loss": 0.89168859, "num_input_tokens_seen": 5541380, "step": 266, "time_per_iteration": 2.7228593826293945 }, { "auxiliary_loss_clip": 0.01338255, "auxiliary_loss_mlp": 0.01097372, "balance_loss_clip": 1.09009242, "balance_loss_mlp": 1.05002213, "epoch": 0.03210485180063729, "flos": 14753253035520.0, "grad_norm": 2.348927809039779, "language_loss": 0.78426754, "learning_rate": 3.999956159089396e-06, "loss": 0.80862379, "num_input_tokens_seen": 5558830, "step": 267, "time_per_iteration": 2.6757309436798096 }, { "auxiliary_loss_clip": 0.01340038, "auxiliary_loss_mlp": 0.01108717, "balance_loss_clip": 1.09142196, "balance_loss_mlp": 1.0644424, "epoch": 0.03222509469127638, "flos": 28913153304960.0, "grad_norm": 2.205486005684234, "language_loss": 0.79708302, "learning_rate": 3.999950849658302e-06, "loss": 0.82157063, "num_input_tokens_seen": 5577750, "step": 268, "time_per_iteration": 2.7541277408599854 }, { "auxiliary_loss_clip": 0.01344089, "auxiliary_loss_mlp": 0.01094033, "balance_loss_clip": 1.0910393, "balance_loss_mlp": 1.04897213, "epoch": 0.03234533758191547, "flos": 16946389739520.0, "grad_norm": 2.296859908197639, "language_loss": 0.84171921, "learning_rate": 3.999945236836254e-06, "loss": 0.86610043, "num_input_tokens_seen": 5596715, "step": 269, "time_per_iteration": 2.7269325256347656 }, { "auxiliary_loss_clip": 0.01345569, "auxiliary_loss_mlp": 0.01103294, "balance_loss_clip": 1.09395802, "balance_loss_mlp": 1.05658829, "epoch": 0.03246558047255456, "flos": 18989096284800.0, "grad_norm": 2.705357710905537, "language_loss": 0.94490147, "learning_rate": 3.999939320624103e-06, "loss": 0.96939003, "num_input_tokens_seen": 5611865, "step": 270, "time_per_iteration": 2.6688575744628906 }, { "auxiliary_loss_clip": 0.01335115, "auxiliary_loss_mlp": 0.01090781, "balance_loss_clip": 1.09013581, "balance_loss_mlp": 1.04519582, "epoch": 0.03258582336319365, "flos": 23728334688000.0, "grad_norm": 1.9065637835941547, "language_loss": 0.89989871, "learning_rate": 3.999933101022749e-06, "loss": 0.92415774, "num_input_tokens_seen": 5632270, "step": 271, "time_per_iteration": 2.7716212272644043 }, { "auxiliary_loss_clip": 0.01335997, "auxiliary_loss_mlp": 0.01098343, "balance_loss_clip": 1.08926845, "balance_loss_mlp": 1.05270958, "epoch": 0.032706066253832745, "flos": 27670823562240.0, "grad_norm": 1.86086026748666, "language_loss": 0.86669028, "learning_rate": 3.999926578033132e-06, "loss": 0.89103365, "num_input_tokens_seen": 5652085, "step": 272, "time_per_iteration": 2.736888885498047 }, { "auxiliary_loss_clip": 0.01344885, "auxiliary_loss_mlp": 0.01101522, "balance_loss_clip": 1.088117, "balance_loss_mlp": 1.05505395, "epoch": 0.032826309144471835, "flos": 45624685968000.0, "grad_norm": 2.061414675442751, "language_loss": 0.6275804, "learning_rate": 3.999919751656244e-06, "loss": 0.65204453, "num_input_tokens_seen": 5678985, "step": 273, "time_per_iteration": 2.9892005920410156 }, { "auxiliary_loss_clip": 0.01338827, "auxiliary_loss_mlp": 0.0110671, "balance_loss_clip": 1.09067285, "balance_loss_mlp": 1.06088579, "epoch": 0.032946552035110925, "flos": 25812374808960.0, "grad_norm": 3.5090593485742247, "language_loss": 0.75726354, "learning_rate": 3.9999126218931195e-06, "loss": 0.78171885, "num_input_tokens_seen": 5697020, "step": 274, "time_per_iteration": 2.7594785690307617 }, { "auxiliary_loss_clip": 0.013464, "auxiliary_loss_mlp": 0.01103813, "balance_loss_clip": 1.09545279, "balance_loss_mlp": 1.05491328, "epoch": 0.033066794925750015, "flos": 15121984101120.0, "grad_norm": 2.45184507283567, "language_loss": 0.89657891, "learning_rate": 3.99990518874484e-06, "loss": 0.92108107, "num_input_tokens_seen": 5713460, "step": 275, "time_per_iteration": 3.8386480808258057 }, { "auxiliary_loss_clip": 0.01334613, "auxiliary_loss_mlp": 0.01110349, "balance_loss_clip": 1.09162891, "balance_loss_mlp": 1.06354773, "epoch": 0.033187037816389105, "flos": 22776593973120.0, "grad_norm": 3.5781109825309168, "language_loss": 0.925336, "learning_rate": 3.999897452212534e-06, "loss": 0.94978559, "num_input_tokens_seen": 5730790, "step": 276, "time_per_iteration": 2.695594310760498 }, { "auxiliary_loss_clip": 0.01336398, "auxiliary_loss_mlp": 0.01107495, "balance_loss_clip": 1.08840156, "balance_loss_mlp": 1.06088388, "epoch": 0.033307280707028195, "flos": 23331414424320.0, "grad_norm": 2.7560875152775837, "language_loss": 1.00249207, "learning_rate": 3.999889412297374e-06, "loss": 1.02693105, "num_input_tokens_seen": 5750215, "step": 277, "time_per_iteration": 3.5704004764556885 }, { "auxiliary_loss_clip": 0.01337552, "auxiliary_loss_mlp": 0.01089829, "balance_loss_clip": 1.09035659, "balance_loss_mlp": 1.04386163, "epoch": 0.03342752359766729, "flos": 28840290566400.0, "grad_norm": 2.211754721103613, "language_loss": 0.78913593, "learning_rate": 3.999881069000581e-06, "loss": 0.81340969, "num_input_tokens_seen": 5769945, "step": 278, "time_per_iteration": 3.6704769134521484 }, { "auxiliary_loss_clip": 0.01339877, "auxiliary_loss_mlp": 0.01105908, "balance_loss_clip": 1.08887744, "balance_loss_mlp": 1.05762815, "epoch": 0.03354776648830638, "flos": 19384544090880.0, "grad_norm": 2.4364007187050034, "language_loss": 0.86893392, "learning_rate": 3.99987242232342e-06, "loss": 0.89339179, "num_input_tokens_seen": 5784950, "step": 279, "time_per_iteration": 3.5608415603637695 }, { "auxiliary_loss_clip": 0.0133912, "auxiliary_loss_mlp": 0.01112448, "balance_loss_clip": 1.09389234, "balance_loss_mlp": 1.06490779, "epoch": 0.03366800937894547, "flos": 17858628472320.0, "grad_norm": 1.9095833007748004, "language_loss": 0.79619521, "learning_rate": 3.9998634722672026e-06, "loss": 0.8207109, "num_input_tokens_seen": 5805005, "step": 280, "time_per_iteration": 2.674992561340332 }, { "auxiliary_loss_clip": 0.01342859, "auxiliary_loss_mlp": 0.01125202, "balance_loss_clip": 1.09488702, "balance_loss_mlp": 1.07856691, "epoch": 0.03378825226958456, "flos": 35951033635200.0, "grad_norm": 3.1624762054772098, "language_loss": 0.78261983, "learning_rate": 3.999854218833286e-06, "loss": 0.80730045, "num_input_tokens_seen": 5825825, "step": 281, "time_per_iteration": 2.835635185241699 }, { "auxiliary_loss_clip": 0.01335169, "auxiliary_loss_mlp": 0.01096517, "balance_loss_clip": 1.09310031, "balance_loss_mlp": 1.05071723, "epoch": 0.03390849516022365, "flos": 25702488126720.0, "grad_norm": 1.9385770231559811, "language_loss": 0.81993437, "learning_rate": 3.999844662023075e-06, "loss": 0.84425128, "num_input_tokens_seen": 5845700, "step": 282, "time_per_iteration": 2.809685707092285 }, { "auxiliary_loss_clip": 0.01320643, "auxiliary_loss_mlp": 0.01106663, "balance_loss_clip": 1.08502841, "balance_loss_mlp": 1.05928969, "epoch": 0.03402873805086274, "flos": 21284505987840.0, "grad_norm": 2.118942842928707, "language_loss": 0.92258978, "learning_rate": 3.999834801838018e-06, "loss": 0.94686288, "num_input_tokens_seen": 5864680, "step": 283, "time_per_iteration": 2.718393564224243 }, { "auxiliary_loss_clip": 0.01329814, "auxiliary_loss_mlp": 0.01094114, "balance_loss_clip": 1.09009278, "balance_loss_mlp": 1.04991102, "epoch": 0.03414898094150183, "flos": 22710913954560.0, "grad_norm": 2.044779065998592, "language_loss": 0.74147409, "learning_rate": 3.9998246382796115e-06, "loss": 0.76571333, "num_input_tokens_seen": 5884260, "step": 284, "time_per_iteration": 2.704906463623047 }, { "auxiliary_loss_clip": 0.01338203, "auxiliary_loss_mlp": 0.01102349, "balance_loss_clip": 1.08960366, "balance_loss_mlp": 1.05671573, "epoch": 0.03426922383214093, "flos": 18879927874560.0, "grad_norm": 2.1234405515227737, "language_loss": 0.90716708, "learning_rate": 3.999814171349399e-06, "loss": 0.93157256, "num_input_tokens_seen": 5902120, "step": 285, "time_per_iteration": 2.6307480335235596 }, { "auxiliary_loss_clip": 0.01330672, "auxiliary_loss_mlp": 0.01112449, "balance_loss_clip": 1.0915072, "balance_loss_mlp": 1.06435943, "epoch": 0.03438946672278002, "flos": 34752012716160.0, "grad_norm": 2.1138335830133506, "language_loss": 0.73595762, "learning_rate": 3.9998034010489655e-06, "loss": 0.76038885, "num_input_tokens_seen": 5925810, "step": 286, "time_per_iteration": 2.829082727432251 }, { "auxiliary_loss_clip": 0.01333938, "auxiliary_loss_mlp": 0.01117785, "balance_loss_clip": 1.09278738, "balance_loss_mlp": 1.06924272, "epoch": 0.03450970961341911, "flos": 22164102236160.0, "grad_norm": 2.5515575870423284, "language_loss": 0.75984055, "learning_rate": 3.999792327379946e-06, "loss": 0.78435779, "num_input_tokens_seen": 5945185, "step": 287, "time_per_iteration": 2.7144062519073486 }, { "auxiliary_loss_clip": 0.01329529, "auxiliary_loss_mlp": 0.01103601, "balance_loss_clip": 1.09100354, "balance_loss_mlp": 1.05727589, "epoch": 0.034629952504058197, "flos": 21725740656000.0, "grad_norm": 2.3425225151758777, "language_loss": 0.96343613, "learning_rate": 3.999780950344021e-06, "loss": 0.98776746, "num_input_tokens_seen": 5963375, "step": 288, "time_per_iteration": 2.712407350540161 }, { "auxiliary_loss_clip": 0.01338425, "auxiliary_loss_mlp": 0.01102636, "balance_loss_clip": 1.09329331, "balance_loss_mlp": 1.05809975, "epoch": 0.034750195394697286, "flos": 20047994248320.0, "grad_norm": 1.8236313611974722, "language_loss": 0.82461804, "learning_rate": 3.999769269942916e-06, "loss": 0.84902871, "num_input_tokens_seen": 5983415, "step": 289, "time_per_iteration": 2.6426968574523926 }, { "auxiliary_loss_clip": 0.01323076, "auxiliary_loss_mlp": 0.01094789, "balance_loss_clip": 1.08722115, "balance_loss_mlp": 1.05232692, "epoch": 0.034870438285336376, "flos": 27965865876480.0, "grad_norm": 2.5522735815984503, "language_loss": 0.81017685, "learning_rate": 3.999757286178402e-06, "loss": 0.83435547, "num_input_tokens_seen": 6005850, "step": 290, "time_per_iteration": 2.707730293273926 }, { "auxiliary_loss_clip": 0.01330135, "auxiliary_loss_mlp": 0.01100071, "balance_loss_clip": 1.0890491, "balance_loss_mlp": 1.05458117, "epoch": 0.03499068117597547, "flos": 22017514832640.0, "grad_norm": 2.05106531267823, "language_loss": 0.90537167, "learning_rate": 3.999744999052299e-06, "loss": 0.92967379, "num_input_tokens_seen": 6027240, "step": 291, "time_per_iteration": 2.6756505966186523 }, { "auxiliary_loss_clip": 0.01246294, "auxiliary_loss_mlp": 0.01027937, "balance_loss_clip": 1.11463428, "balance_loss_mlp": 1.01067543, "epoch": 0.03511092406661456, "flos": 57242147725440.0, "grad_norm": 0.9652297684803145, "language_loss": 0.61220407, "learning_rate": 3.9997324085664675e-06, "loss": 0.63494635, "num_input_tokens_seen": 6087470, "step": 292, "time_per_iteration": 3.2112693786621094 }, { "auxiliary_loss_clip": 0.01333828, "auxiliary_loss_mlp": 0.01107046, "balance_loss_clip": 1.09307408, "balance_loss_mlp": 1.06064951, "epoch": 0.03523116695725365, "flos": 22928065626240.0, "grad_norm": 2.364606266898949, "language_loss": 0.91724944, "learning_rate": 3.999719514722821e-06, "loss": 0.94165814, "num_input_tokens_seen": 6107600, "step": 293, "time_per_iteration": 2.704019069671631 }, { "auxiliary_loss_clip": 0.01323031, "auxiliary_loss_mlp": 0.01110998, "balance_loss_clip": 1.08620775, "balance_loss_mlp": 1.06743932, "epoch": 0.03535140984789274, "flos": 36903241226880.0, "grad_norm": 2.2075696162558733, "language_loss": 0.747118, "learning_rate": 3.999706317523314e-06, "loss": 0.77145827, "num_input_tokens_seen": 6126160, "step": 294, "time_per_iteration": 2.831724166870117 }, { "auxiliary_loss_clip": 0.01323311, "auxiliary_loss_mlp": 0.01106321, "balance_loss_clip": 1.0893209, "balance_loss_mlp": 1.06388295, "epoch": 0.03547165273853183, "flos": 20449152316800.0, "grad_norm": 2.184465737318531, "language_loss": 0.85864323, "learning_rate": 3.999692816969948e-06, "loss": 0.88293958, "num_input_tokens_seen": 6145695, "step": 295, "time_per_iteration": 2.6892812252044678 }, { "auxiliary_loss_clip": 0.01242374, "auxiliary_loss_mlp": 0.01016377, "balance_loss_clip": 1.11301184, "balance_loss_mlp": 1.0003556, "epoch": 0.03559189562917092, "flos": 69850564871040.0, "grad_norm": 1.0027023299532813, "language_loss": 0.69445783, "learning_rate": 3.999679013064772e-06, "loss": 0.71704537, "num_input_tokens_seen": 6212440, "step": 296, "time_per_iteration": 3.2687320709228516 }, { "auxiliary_loss_clip": 0.01329743, "auxiliary_loss_mlp": 0.0109759, "balance_loss_clip": 1.08675528, "balance_loss_mlp": 1.05043077, "epoch": 0.03571213851981002, "flos": 21651944163840.0, "grad_norm": 3.285280915233992, "language_loss": 0.85589182, "learning_rate": 3.99966490580988e-06, "loss": 0.8801651, "num_input_tokens_seen": 6229800, "step": 297, "time_per_iteration": 2.7338767051696777 }, { "auxiliary_loss_clip": 0.01331122, "auxiliary_loss_mlp": 0.01093042, "balance_loss_clip": 1.09238172, "balance_loss_mlp": 1.04912484, "epoch": 0.03583238141044911, "flos": 43945610757120.0, "grad_norm": 3.4429555787746016, "language_loss": 0.65714765, "learning_rate": 3.999650495207411e-06, "loss": 0.68138927, "num_input_tokens_seen": 6255825, "step": 298, "time_per_iteration": 2.9346420764923096 }, { "auxiliary_loss_clip": 0.01324134, "auxiliary_loss_mlp": 0.01104888, "balance_loss_clip": 1.09167004, "balance_loss_mlp": 1.06233072, "epoch": 0.0359526243010882, "flos": 18910810592640.0, "grad_norm": 2.489332849006672, "language_loss": 0.90639758, "learning_rate": 3.999635781259553e-06, "loss": 0.93068773, "num_input_tokens_seen": 6271090, "step": 299, "time_per_iteration": 2.7465789318084717 }, { "auxiliary_loss_clip": 0.0123316, "auxiliary_loss_mlp": 0.01023413, "balance_loss_clip": 1.10577083, "balance_loss_mlp": 1.00691438, "epoch": 0.03607286719172729, "flos": 61668892782720.0, "grad_norm": 0.9368685687782168, "language_loss": 0.52337539, "learning_rate": 3.999620763968535e-06, "loss": 0.54594111, "num_input_tokens_seen": 6329965, "step": 300, "time_per_iteration": 3.0697274208068848 }, { "auxiliary_loss_clip": 0.01320703, "auxiliary_loss_mlp": 0.01101657, "balance_loss_clip": 1.09042418, "balance_loss_mlp": 1.0581218, "epoch": 0.03619311008236638, "flos": 27819062991360.0, "grad_norm": 1.9785706785644546, "language_loss": 0.86260074, "learning_rate": 3.999605443336638e-06, "loss": 0.88682431, "num_input_tokens_seen": 6352095, "step": 301, "time_per_iteration": 2.6851813793182373 }, { "auxiliary_loss_clip": 0.01333379, "auxiliary_loss_mlp": 0.01104429, "balance_loss_clip": 1.08807147, "balance_loss_mlp": 1.06051266, "epoch": 0.03631335297300547, "flos": 13621133197440.0, "grad_norm": 2.291546627603189, "language_loss": 0.89056027, "learning_rate": 3.999589819366185e-06, "loss": 0.91493833, "num_input_tokens_seen": 6365885, "step": 302, "time_per_iteration": 3.5745882987976074 }, { "auxiliary_loss_clip": 0.01327246, "auxiliary_loss_mlp": 0.01097971, "balance_loss_clip": 1.08720815, "balance_loss_mlp": 1.05252862, "epoch": 0.036433595863644565, "flos": 27631788456960.0, "grad_norm": 2.3837205960136103, "language_loss": 0.84953797, "learning_rate": 3.999573892059547e-06, "loss": 0.87379014, "num_input_tokens_seen": 6385015, "step": 303, "time_per_iteration": 3.766550064086914 }, { "auxiliary_loss_clip": 0.01341318, "auxiliary_loss_mlp": 0.01114495, "balance_loss_clip": 1.09071541, "balance_loss_mlp": 1.06869507, "epoch": 0.036553838754283655, "flos": 24572020314240.0, "grad_norm": 1.9655013306982319, "language_loss": 0.81115067, "learning_rate": 3.999557661419138e-06, "loss": 0.8357088, "num_input_tokens_seen": 6405165, "step": 304, "time_per_iteration": 3.644613742828369 }, { "auxiliary_loss_clip": 0.01332481, "auxiliary_loss_mlp": 0.0110835, "balance_loss_clip": 1.09270191, "balance_loss_mlp": 1.06543517, "epoch": 0.036674081644922744, "flos": 23404313076480.0, "grad_norm": 2.39783641003516, "language_loss": 0.81273007, "learning_rate": 3.9995411274474225e-06, "loss": 0.83713841, "num_input_tokens_seen": 6424445, "step": 305, "time_per_iteration": 2.7275772094726562 }, { "auxiliary_loss_clip": 0.0132194, "auxiliary_loss_mlp": 0.01112357, "balance_loss_clip": 1.08613467, "balance_loss_mlp": 1.06822586, "epoch": 0.036794324535561834, "flos": 27489690253440.0, "grad_norm": 2.2455645047893604, "language_loss": 0.81188524, "learning_rate": 3.999524290146908e-06, "loss": 0.83622825, "num_input_tokens_seen": 6444650, "step": 306, "time_per_iteration": 3.816707134246826 }, { "auxiliary_loss_clip": 0.01336076, "auxiliary_loss_mlp": 0.01105085, "balance_loss_clip": 1.09290767, "balance_loss_mlp": 1.06047678, "epoch": 0.036914567426200924, "flos": 19463476227840.0, "grad_norm": 2.2077476781229493, "language_loss": 0.9265548, "learning_rate": 3.9995071495201485e-06, "loss": 0.95096648, "num_input_tokens_seen": 6461755, "step": 307, "time_per_iteration": 2.657191753387451 }, { "auxiliary_loss_clip": 0.01324971, "auxiliary_loss_mlp": 0.01109147, "balance_loss_clip": 1.08773732, "balance_loss_mlp": 1.06551659, "epoch": 0.037034810316840014, "flos": 22309324922880.0, "grad_norm": 4.087274468760799, "language_loss": 0.97567999, "learning_rate": 3.999489705569744e-06, "loss": 1.0000211, "num_input_tokens_seen": 6479455, "step": 308, "time_per_iteration": 2.7460222244262695 }, { "auxiliary_loss_clip": 0.01320753, "auxiliary_loss_mlp": 0.01102956, "balance_loss_clip": 1.0844152, "balance_loss_mlp": 1.05875349, "epoch": 0.03715505320747911, "flos": 18588333265920.0, "grad_norm": 1.9613956934484185, "language_loss": 0.866943, "learning_rate": 3.999471958298341e-06, "loss": 0.89118016, "num_input_tokens_seen": 6498365, "step": 309, "time_per_iteration": 2.636584758758545 }, { "auxiliary_loss_clip": 0.01336075, "auxiliary_loss_mlp": 0.01105108, "balance_loss_clip": 1.0929985, "balance_loss_mlp": 1.05945063, "epoch": 0.0372752960981182, "flos": 35955343267200.0, "grad_norm": 2.013876863799778, "language_loss": 0.76092732, "learning_rate": 3.999453907708631e-06, "loss": 0.78533918, "num_input_tokens_seen": 6520770, "step": 310, "time_per_iteration": 2.8391661643981934 }, { "auxiliary_loss_clip": 0.01319337, "auxiliary_loss_mlp": 0.01085758, "balance_loss_clip": 1.08894587, "balance_loss_mlp": 1.04241383, "epoch": 0.03739553898875729, "flos": 20814040627200.0, "grad_norm": 1.7015688307054044, "language_loss": 0.81254613, "learning_rate": 3.999435553803353e-06, "loss": 0.83659708, "num_input_tokens_seen": 6540170, "step": 311, "time_per_iteration": 2.6342687606811523 }, { "auxiliary_loss_clip": 0.01320532, "auxiliary_loss_mlp": 0.01103106, "balance_loss_clip": 1.08434403, "balance_loss_mlp": 1.06102467, "epoch": 0.03751578187939638, "flos": 20264140339200.0, "grad_norm": 2.3847458057453905, "language_loss": 0.83442086, "learning_rate": 3.999416896585292e-06, "loss": 0.85865712, "num_input_tokens_seen": 6557200, "step": 312, "time_per_iteration": 2.679409980773926 }, { "auxiliary_loss_clip": 0.01325585, "auxiliary_loss_mlp": 0.01096079, "balance_loss_clip": 1.08740973, "balance_loss_mlp": 1.05144668, "epoch": 0.03763602477003547, "flos": 20668063754880.0, "grad_norm": 3.533129375689569, "language_loss": 0.85765219, "learning_rate": 3.9993979360572775e-06, "loss": 0.88186878, "num_input_tokens_seen": 6577340, "step": 313, "time_per_iteration": 2.7038965225219727 }, { "auxiliary_loss_clip": 0.01327937, "auxiliary_loss_mlp": 0.01106862, "balance_loss_clip": 1.08990288, "balance_loss_mlp": 1.06506717, "epoch": 0.03775626766067456, "flos": 16691352197760.0, "grad_norm": 3.3801624649217334, "language_loss": 0.82518792, "learning_rate": 3.999378672222185e-06, "loss": 0.84953588, "num_input_tokens_seen": 6595125, "step": 314, "time_per_iteration": 2.7910561561584473 }, { "auxiliary_loss_clip": 0.0131939, "auxiliary_loss_mlp": 0.01100899, "balance_loss_clip": 1.0845387, "balance_loss_mlp": 1.05888927, "epoch": 0.03787651055131366, "flos": 21141797253120.0, "grad_norm": 1.913445587975055, "language_loss": 0.82444096, "learning_rate": 3.9993591050829385e-06, "loss": 0.84864384, "num_input_tokens_seen": 6612990, "step": 315, "time_per_iteration": 2.7296245098114014 }, { "auxiliary_loss_clip": 0.01330154, "auxiliary_loss_mlp": 0.01087676, "balance_loss_clip": 1.09267282, "balance_loss_mlp": 1.04349744, "epoch": 0.037996753441952746, "flos": 22018089450240.0, "grad_norm": 1.9238809360274294, "language_loss": 0.79143518, "learning_rate": 3.999339234642506e-06, "loss": 0.81561351, "num_input_tokens_seen": 6632740, "step": 316, "time_per_iteration": 2.6748881340026855 }, { "auxiliary_loss_clip": 0.01319795, "auxiliary_loss_mlp": 0.01092619, "balance_loss_clip": 1.08650911, "balance_loss_mlp": 1.04982233, "epoch": 0.038116996332591836, "flos": 27709391790720.0, "grad_norm": 2.147686610263323, "language_loss": 0.83647239, "learning_rate": 3.9993190609038994e-06, "loss": 0.86059648, "num_input_tokens_seen": 6651505, "step": 317, "time_per_iteration": 2.7629787921905518 }, { "auxiliary_loss_clip": 0.01322296, "auxiliary_loss_mlp": 0.01117317, "balance_loss_clip": 1.08547962, "balance_loss_mlp": 1.0723753, "epoch": 0.038237239223230926, "flos": 21178067011200.0, "grad_norm": 2.3642665322534375, "language_loss": 0.83240986, "learning_rate": 3.999298583870182e-06, "loss": 0.85680604, "num_input_tokens_seen": 6671090, "step": 318, "time_per_iteration": 2.6470448970794678 }, { "auxiliary_loss_clip": 0.01319243, "auxiliary_loss_mlp": 0.01101525, "balance_loss_clip": 1.08523011, "balance_loss_mlp": 1.05722678, "epoch": 0.038357482113870016, "flos": 25556618995200.0, "grad_norm": 2.0778287423396073, "language_loss": 0.77450484, "learning_rate": 3.999277803544458e-06, "loss": 0.79871255, "num_input_tokens_seen": 6691245, "step": 319, "time_per_iteration": 2.731790065765381 }, { "auxiliary_loss_clip": 0.01215813, "auxiliary_loss_mlp": 0.01019108, "balance_loss_clip": 1.08708298, "balance_loss_mlp": 1.00499403, "epoch": 0.038477725004509106, "flos": 59227578034560.0, "grad_norm": 0.9459850045965338, "language_loss": 0.6244173, "learning_rate": 3.999256719929882e-06, "loss": 0.64676654, "num_input_tokens_seen": 6752520, "step": 320, "time_per_iteration": 3.1960835456848145 }, { "auxiliary_loss_clip": 0.01213513, "auxiliary_loss_mlp": 0.01017552, "balance_loss_clip": 1.08569431, "balance_loss_mlp": 1.00391412, "epoch": 0.0385979678951482, "flos": 67317676398720.0, "grad_norm": 1.214222075020542, "language_loss": 0.67132545, "learning_rate": 3.999235333029651e-06, "loss": 0.69363612, "num_input_tokens_seen": 6806460, "step": 321, "time_per_iteration": 3.1452386379241943 }, { "auxiliary_loss_clip": 0.01322098, "auxiliary_loss_mlp": 0.01095727, "balance_loss_clip": 1.08962131, "balance_loss_mlp": 1.05257344, "epoch": 0.03871821078578729, "flos": 22746752749440.0, "grad_norm": 4.869462198204938, "language_loss": 0.81939304, "learning_rate": 3.999213642847009e-06, "loss": 0.84357125, "num_input_tokens_seen": 6827045, "step": 322, "time_per_iteration": 2.7402751445770264 }, { "auxiliary_loss_clip": 0.01319978, "auxiliary_loss_mlp": 0.01106097, "balance_loss_clip": 1.08699012, "balance_loss_mlp": 1.06260967, "epoch": 0.03883845367642638, "flos": 26280613526400.0, "grad_norm": 1.9137920178390528, "language_loss": 0.90988553, "learning_rate": 3.999191649385247e-06, "loss": 0.93414623, "num_input_tokens_seen": 6848220, "step": 323, "time_per_iteration": 2.6725857257843018 }, { "auxiliary_loss_clip": 0.0120791, "auxiliary_loss_mlp": 0.01020325, "balance_loss_clip": 1.08628142, "balance_loss_mlp": 1.00678277, "epoch": 0.03895869656706547, "flos": 56962835568000.0, "grad_norm": 0.9810691525930006, "language_loss": 0.59815013, "learning_rate": 3.999169352647702e-06, "loss": 0.6204325, "num_input_tokens_seen": 6909400, "step": 324, "time_per_iteration": 3.124147415161133 }, { "auxiliary_loss_clip": 0.0132663, "auxiliary_loss_mlp": 0.01107932, "balance_loss_clip": 1.08966708, "balance_loss_mlp": 1.0619173, "epoch": 0.03907893945770456, "flos": 24863363527680.0, "grad_norm": 2.3509314572596733, "language_loss": 0.83179998, "learning_rate": 3.999146752637755e-06, "loss": 0.8561455, "num_input_tokens_seen": 6930445, "step": 325, "time_per_iteration": 2.6548023223876953 }, { "auxiliary_loss_clip": 0.01316803, "auxiliary_loss_mlp": 0.01097826, "balance_loss_clip": 1.08668637, "balance_loss_mlp": 1.05431461, "epoch": 0.03919918234834365, "flos": 18368595815040.0, "grad_norm": 2.396886961427794, "language_loss": 0.8972075, "learning_rate": 3.999123849358836e-06, "loss": 0.92135382, "num_input_tokens_seen": 6948110, "step": 326, "time_per_iteration": 2.7077150344848633 }, { "auxiliary_loss_clip": 0.01319379, "auxiliary_loss_mlp": 0.01106999, "balance_loss_clip": 1.08669472, "balance_loss_mlp": 1.06382108, "epoch": 0.03931942523898275, "flos": 25225414663680.0, "grad_norm": 2.0887836881522417, "language_loss": 0.74698067, "learning_rate": 3.999100642814418e-06, "loss": 0.77124441, "num_input_tokens_seen": 6968550, "step": 327, "time_per_iteration": 2.720128059387207 }, { "auxiliary_loss_clip": 0.01319682, "auxiliary_loss_mlp": 0.01091095, "balance_loss_clip": 1.08746421, "balance_loss_mlp": 1.04920506, "epoch": 0.03943966812962184, "flos": 23257905240960.0, "grad_norm": 2.323028434062064, "language_loss": 0.88552928, "learning_rate": 3.999077133008022e-06, "loss": 0.90963709, "num_input_tokens_seen": 6987135, "step": 328, "time_per_iteration": 3.732396125793457 }, { "auxiliary_loss_clip": 0.01320906, "auxiliary_loss_mlp": 0.01088516, "balance_loss_clip": 1.08935702, "balance_loss_mlp": 1.04731762, "epoch": 0.03955991102026093, "flos": 29168837291520.0, "grad_norm": 1.9691629390697332, "language_loss": 0.90617085, "learning_rate": 3.9990533199432145e-06, "loss": 0.93026507, "num_input_tokens_seen": 7008630, "step": 329, "time_per_iteration": 3.6982951164245605 }, { "auxiliary_loss_clip": 0.01318995, "auxiliary_loss_mlp": 0.011025, "balance_loss_clip": 1.08547592, "balance_loss_mlp": 1.0611347, "epoch": 0.03968015391090002, "flos": 17602441695360.0, "grad_norm": 2.6017956282269994, "language_loss": 0.75653023, "learning_rate": 3.999029203623608e-06, "loss": 0.78074515, "num_input_tokens_seen": 7026350, "step": 330, "time_per_iteration": 2.6170201301574707 }, { "auxiliary_loss_clip": 0.0131781, "auxiliary_loss_mlp": 0.01102144, "balance_loss_clip": 1.08884037, "balance_loss_mlp": 1.05839443, "epoch": 0.03980039680153911, "flos": 21799285752960.0, "grad_norm": 2.3122207949770512, "language_loss": 0.86976892, "learning_rate": 3.99900478405286e-06, "loss": 0.89396846, "num_input_tokens_seen": 7045660, "step": 331, "time_per_iteration": 3.552151918411255 }, { "auxiliary_loss_clip": 0.01321395, "auxiliary_loss_mlp": 0.01122778, "balance_loss_clip": 1.09190547, "balance_loss_mlp": 1.07914758, "epoch": 0.0399206396921782, "flos": 15195134148480.0, "grad_norm": 6.265230619388697, "language_loss": 0.82397175, "learning_rate": 3.998980061234676e-06, "loss": 0.84841347, "num_input_tokens_seen": 7063575, "step": 332, "time_per_iteration": 3.556950330734253 }, { "auxiliary_loss_clip": 0.0132056, "auxiliary_loss_mlp": 0.01084816, "balance_loss_clip": 1.08793271, "balance_loss_mlp": 1.04438043, "epoch": 0.040040882582817294, "flos": 14422910630400.0, "grad_norm": 2.674693994831123, "language_loss": 0.75891203, "learning_rate": 3.9989550351728055e-06, "loss": 0.78296578, "num_input_tokens_seen": 7080505, "step": 333, "time_per_iteration": 2.727982521057129 }, { "auxiliary_loss_clip": 0.01314253, "auxiliary_loss_mlp": 0.01089618, "balance_loss_clip": 1.08641529, "balance_loss_mlp": 1.04970646, "epoch": 0.040161125473456384, "flos": 19280906375040.0, "grad_norm": 2.547851520688933, "language_loss": 0.84593379, "learning_rate": 3.998929705871046e-06, "loss": 0.86997247, "num_input_tokens_seen": 7097860, "step": 334, "time_per_iteration": 2.756216287612915 }, { "auxiliary_loss_clip": 0.01315872, "auxiliary_loss_mlp": 0.01084316, "balance_loss_clip": 1.08907616, "balance_loss_mlp": 1.04161549, "epoch": 0.040281368364095474, "flos": 17821101738240.0, "grad_norm": 4.38300230960733, "language_loss": 0.89364743, "learning_rate": 3.99890407333324e-06, "loss": 0.91764927, "num_input_tokens_seen": 7116390, "step": 335, "time_per_iteration": 2.6398537158966064 }, { "auxiliary_loss_clip": 0.01311303, "auxiliary_loss_mlp": 0.01095233, "balance_loss_clip": 1.08394921, "balance_loss_mlp": 1.053653, "epoch": 0.040401611254734564, "flos": 19573757959680.0, "grad_norm": 1.7818608754626915, "language_loss": 0.87409854, "learning_rate": 3.998878137563275e-06, "loss": 0.89816391, "num_input_tokens_seen": 7135940, "step": 336, "time_per_iteration": 2.7438831329345703 }, { "auxiliary_loss_clip": 0.01317329, "auxiliary_loss_mlp": 0.01083267, "balance_loss_clip": 1.08437204, "balance_loss_mlp": 1.04309392, "epoch": 0.040521854145373654, "flos": 22054466949120.0, "grad_norm": 2.4412125244635203, "language_loss": 0.85094273, "learning_rate": 3.998851898565085e-06, "loss": 0.87494862, "num_input_tokens_seen": 7155745, "step": 337, "time_per_iteration": 2.714106559753418 }, { "auxiliary_loss_clip": 0.01318141, "auxiliary_loss_mlp": 0.01088761, "balance_loss_clip": 1.08928609, "balance_loss_mlp": 1.04694271, "epoch": 0.04064209703601274, "flos": 22674644196480.0, "grad_norm": 1.9241141704404954, "language_loss": 0.83034956, "learning_rate": 3.998825356342653e-06, "loss": 0.85441864, "num_input_tokens_seen": 7175920, "step": 338, "time_per_iteration": 2.6896908283233643 }, { "auxiliary_loss_clip": 0.0132071, "auxiliary_loss_mlp": 0.01092642, "balance_loss_clip": 1.08595872, "balance_loss_mlp": 1.05017984, "epoch": 0.04076233992665183, "flos": 38582172783360.0, "grad_norm": 2.4938703100931665, "language_loss": 0.72977853, "learning_rate": 3.998798510900003e-06, "loss": 0.75391209, "num_input_tokens_seen": 7198720, "step": 339, "time_per_iteration": 2.8561930656433105 }, { "auxiliary_loss_clip": 0.01313554, "auxiliary_loss_mlp": 0.01103845, "balance_loss_clip": 1.08545697, "balance_loss_mlp": 1.06371915, "epoch": 0.04088258281729093, "flos": 25885309374720.0, "grad_norm": 2.0769045882844335, "language_loss": 0.84095299, "learning_rate": 3.998771362241207e-06, "loss": 0.86512697, "num_input_tokens_seen": 7219125, "step": 340, "time_per_iteration": 2.6569769382476807 }, { "auxiliary_loss_clip": 0.01309083, "auxiliary_loss_mlp": 0.01091233, "balance_loss_clip": 1.08479381, "balance_loss_mlp": 1.0501771, "epoch": 0.04100282570793002, "flos": 19789832223360.0, "grad_norm": 1.8937932038584622, "language_loss": 0.88083375, "learning_rate": 3.998743910370385e-06, "loss": 0.90483689, "num_input_tokens_seen": 7237985, "step": 341, "time_per_iteration": 2.70196533203125 }, { "auxiliary_loss_clip": 0.01317764, "auxiliary_loss_mlp": 0.01103071, "balance_loss_clip": 1.09065473, "balance_loss_mlp": 1.06268287, "epoch": 0.04112306859856911, "flos": 22565152563840.0, "grad_norm": 2.2696681308009947, "language_loss": 0.73248076, "learning_rate": 3.998716155291702e-06, "loss": 0.75668907, "num_input_tokens_seen": 7255825, "step": 342, "time_per_iteration": 2.651750087738037 }, { "auxiliary_loss_clip": 0.01318044, "auxiliary_loss_mlp": 0.01088969, "balance_loss_clip": 1.09163952, "balance_loss_mlp": 1.04684019, "epoch": 0.0412433114892082, "flos": 25040654081280.0, "grad_norm": 1.9508852879918304, "language_loss": 0.90450883, "learning_rate": 3.998688097009366e-06, "loss": 0.92857897, "num_input_tokens_seen": 7276590, "step": 343, "time_per_iteration": 2.7318167686462402 }, { "auxiliary_loss_clip": 0.01312286, "auxiliary_loss_mlp": 0.01090317, "balance_loss_clip": 1.08764625, "balance_loss_mlp": 1.04964304, "epoch": 0.04136355437984729, "flos": 25191371548800.0, "grad_norm": 2.2722586264691764, "language_loss": 0.80300021, "learning_rate": 3.998659735527636e-06, "loss": 0.82702625, "num_input_tokens_seen": 7295680, "step": 344, "time_per_iteration": 2.651763677597046 }, { "auxiliary_loss_clip": 0.01308057, "auxiliary_loss_mlp": 0.01086041, "balance_loss_clip": 1.08515692, "balance_loss_mlp": 1.04419827, "epoch": 0.04148379727048638, "flos": 22966777509120.0, "grad_norm": 1.7207083537465695, "language_loss": 0.778018, "learning_rate": 3.998631070850813e-06, "loss": 0.80195898, "num_input_tokens_seen": 7316300, "step": 345, "time_per_iteration": 2.673168659210205 }, { "auxiliary_loss_clip": 0.01315818, "auxiliary_loss_mlp": 0.01095607, "balance_loss_clip": 1.08947015, "balance_loss_mlp": 1.05369318, "epoch": 0.041604040161125476, "flos": 14063481187200.0, "grad_norm": 2.3683594420837264, "language_loss": 0.83688843, "learning_rate": 3.9986021029832455e-06, "loss": 0.86100268, "num_input_tokens_seen": 7333615, "step": 346, "time_per_iteration": 2.6341872215270996 }, { "auxiliary_loss_clip": 0.01314073, "auxiliary_loss_mlp": 0.0108955, "balance_loss_clip": 1.08460069, "balance_loss_mlp": 1.04765975, "epoch": 0.041724283051764566, "flos": 12091877614080.0, "grad_norm": 2.503890228101889, "language_loss": 0.91438574, "learning_rate": 3.9985728319293285e-06, "loss": 0.93842196, "num_input_tokens_seen": 7347590, "step": 347, "time_per_iteration": 2.6277363300323486 }, { "auxiliary_loss_clip": 0.01319181, "auxiliary_loss_mlp": 0.01097992, "balance_loss_clip": 1.08639598, "balance_loss_mlp": 1.05502892, "epoch": 0.041844525942403656, "flos": 12385303816320.0, "grad_norm": 2.15990975538298, "language_loss": 0.8538453, "learning_rate": 3.998543257693501e-06, "loss": 0.87801701, "num_input_tokens_seen": 7364345, "step": 348, "time_per_iteration": 2.609990358352661 }, { "auxiliary_loss_clip": 0.01313406, "auxiliary_loss_mlp": 0.0109238, "balance_loss_clip": 1.08971381, "balance_loss_mlp": 1.05368459, "epoch": 0.041964768833042745, "flos": 23769345041280.0, "grad_norm": 2.3883247650743247, "language_loss": 0.8767122, "learning_rate": 3.998513380280251e-06, "loss": 0.90077007, "num_input_tokens_seen": 7384625, "step": 349, "time_per_iteration": 2.7940449714660645 }, { "auxiliary_loss_clip": 0.01316908, "auxiliary_loss_mlp": 0.01103816, "balance_loss_clip": 1.08559203, "balance_loss_mlp": 1.0623548, "epoch": 0.042085011723681835, "flos": 11875336473600.0, "grad_norm": 2.208066699031685, "language_loss": 0.95109475, "learning_rate": 3.99848319969411e-06, "loss": 0.97530198, "num_input_tokens_seen": 7402225, "step": 350, "time_per_iteration": 2.6454787254333496 }, { "auxiliary_loss_clip": 0.01322342, "auxiliary_loss_mlp": 0.01103323, "balance_loss_clip": 1.09233928, "balance_loss_mlp": 1.06207657, "epoch": 0.042205254614320925, "flos": 16873957964160.0, "grad_norm": 3.7857583090738705, "language_loss": 0.79069132, "learning_rate": 3.9984527159396564e-06, "loss": 0.81494796, "num_input_tokens_seen": 7420865, "step": 351, "time_per_iteration": 2.847536563873291 }, { "auxiliary_loss_clip": 0.01307829, "auxiliary_loss_mlp": 0.010784, "balance_loss_clip": 1.08386493, "balance_loss_mlp": 1.03956127, "epoch": 0.04232549750496002, "flos": 25118508810240.0, "grad_norm": 2.395201933108926, "language_loss": 0.84770155, "learning_rate": 3.9984219290215154e-06, "loss": 0.87156391, "num_input_tokens_seen": 7441040, "step": 352, "time_per_iteration": 2.7329342365264893 }, { "auxiliary_loss_clip": 0.01312295, "auxiliary_loss_mlp": 0.01101556, "balance_loss_clip": 1.09028804, "balance_loss_mlp": 1.06171644, "epoch": 0.04244574039559911, "flos": 26724541714560.0, "grad_norm": 1.6172709906741247, "language_loss": 0.89124119, "learning_rate": 3.998390838944356e-06, "loss": 0.91537964, "num_input_tokens_seen": 7462545, "step": 353, "time_per_iteration": 2.6955018043518066 }, { "auxiliary_loss_clip": 0.01313282, "auxiliary_loss_mlp": 0.01113479, "balance_loss_clip": 1.08871865, "balance_loss_mlp": 1.07275724, "epoch": 0.0425659832862382, "flos": 20923244951040.0, "grad_norm": 2.8142941647114035, "language_loss": 0.90009457, "learning_rate": 3.998359445712895e-06, "loss": 0.92436218, "num_input_tokens_seen": 7481650, "step": 354, "time_per_iteration": 3.5796165466308594 }, { "auxiliary_loss_clip": 0.01316341, "auxiliary_loss_mlp": 0.01083053, "balance_loss_clip": 1.08553052, "balance_loss_mlp": 1.04338002, "epoch": 0.04268622617687729, "flos": 23331127115520.0, "grad_norm": 2.2371136189698992, "language_loss": 0.80934775, "learning_rate": 3.9983277493318955e-06, "loss": 0.83334172, "num_input_tokens_seen": 7500945, "step": 355, "time_per_iteration": 2.675226926803589 }, { "auxiliary_loss_clip": 0.01312076, "auxiliary_loss_mlp": 0.01102703, "balance_loss_clip": 1.08830225, "balance_loss_mlp": 1.06212449, "epoch": 0.04280646906751638, "flos": 25994010908160.0, "grad_norm": 3.125817762533593, "language_loss": 0.81176496, "learning_rate": 3.998295749806165e-06, "loss": 0.8359127, "num_input_tokens_seen": 7522170, "step": 356, "time_per_iteration": 3.6725568771362305 }, { "auxiliary_loss_clip": 0.01311806, "auxiliary_loss_mlp": 0.01101727, "balance_loss_clip": 1.08509541, "balance_loss_mlp": 1.06098127, "epoch": 0.04292671195815547, "flos": 26906824258560.0, "grad_norm": 1.8542781785184197, "language_loss": 0.83398151, "learning_rate": 3.998263447140558e-06, "loss": 0.85811687, "num_input_tokens_seen": 7542370, "step": 357, "time_per_iteration": 2.761113405227661 }, { "auxiliary_loss_clip": 0.01311974, "auxiliary_loss_mlp": 0.01077454, "balance_loss_clip": 1.08297539, "balance_loss_mlp": 1.0386157, "epoch": 0.04304695484879457, "flos": 39457315745280.0, "grad_norm": 1.7307135255366257, "language_loss": 0.81608856, "learning_rate": 3.998230841339976e-06, "loss": 0.83998287, "num_input_tokens_seen": 7564380, "step": 358, "time_per_iteration": 3.757901906967163 }, { "auxiliary_loss_clip": 0.01312268, "auxiliary_loss_mlp": 0.0110177, "balance_loss_clip": 1.08721018, "balance_loss_mlp": 1.06126308, "epoch": 0.04316719773943366, "flos": 19646297475840.0, "grad_norm": 2.1991853936315504, "language_loss": 0.84879673, "learning_rate": 3.998197932409363e-06, "loss": 0.87293708, "num_input_tokens_seen": 7582390, "step": 359, "time_per_iteration": 3.637657403945923 }, { "auxiliary_loss_clip": 0.01302552, "auxiliary_loss_mlp": 0.01086596, "balance_loss_clip": 1.08561254, "balance_loss_mlp": 1.04754293, "epoch": 0.04328744063007275, "flos": 22452320966400.0, "grad_norm": 8.726709836955868, "language_loss": 0.86222017, "learning_rate": 3.9981647203537125e-06, "loss": 0.88611162, "num_input_tokens_seen": 7599890, "step": 360, "time_per_iteration": 2.711561918258667 }, { "auxiliary_loss_clip": 0.01313314, "auxiliary_loss_mlp": 0.01104949, "balance_loss_clip": 1.0866257, "balance_loss_mlp": 1.06513309, "epoch": 0.04340768352071184, "flos": 21283033530240.0, "grad_norm": 2.2558632164372043, "language_loss": 0.95539552, "learning_rate": 3.998131205178063e-06, "loss": 0.97957814, "num_input_tokens_seen": 7618360, "step": 361, "time_per_iteration": 2.6283562183380127 }, { "auxiliary_loss_clip": 0.01322207, "auxiliary_loss_mlp": 0.01106186, "balance_loss_clip": 1.09089148, "balance_loss_mlp": 1.06436789, "epoch": 0.04352792641135093, "flos": 11583705951360.0, "grad_norm": 3.3356999076078524, "language_loss": 0.76670825, "learning_rate": 3.998097386887498e-06, "loss": 0.7909922, "num_input_tokens_seen": 7635435, "step": 362, "time_per_iteration": 2.7445240020751953 }, { "auxiliary_loss_clip": 0.01304809, "auxiliary_loss_mlp": 0.01095446, "balance_loss_clip": 1.08747745, "balance_loss_mlp": 1.05341327, "epoch": 0.04364816930199002, "flos": 23623547736960.0, "grad_norm": 1.82965603830489, "language_loss": 0.84834301, "learning_rate": 3.998063265487148e-06, "loss": 0.87234557, "num_input_tokens_seen": 7656485, "step": 363, "time_per_iteration": 2.693246364593506 }, { "auxiliary_loss_clip": 0.0130939, "auxiliary_loss_mlp": 0.01093629, "balance_loss_clip": 1.08666062, "balance_loss_mlp": 1.05369341, "epoch": 0.043768412192629114, "flos": 14429734214400.0, "grad_norm": 1.8429798667348765, "language_loss": 0.80878961, "learning_rate": 3.99802884098219e-06, "loss": 0.83281982, "num_input_tokens_seen": 7674595, "step": 364, "time_per_iteration": 2.712219715118408 }, { "auxiliary_loss_clip": 0.01312531, "auxiliary_loss_mlp": 0.01096885, "balance_loss_clip": 1.08768308, "balance_loss_mlp": 1.05599666, "epoch": 0.043888655083268203, "flos": 26468893641600.0, "grad_norm": 3.6150918791441504, "language_loss": 0.82332838, "learning_rate": 3.997994113377845e-06, "loss": 0.8474226, "num_input_tokens_seen": 7693495, "step": 365, "time_per_iteration": 2.703249216079712 }, { "auxiliary_loss_clip": 0.01308284, "auxiliary_loss_mlp": 0.01093474, "balance_loss_clip": 1.08394349, "balance_loss_mlp": 1.05098772, "epoch": 0.04400889797390729, "flos": 27235263242880.0, "grad_norm": 2.1965877452963887, "language_loss": 0.83259612, "learning_rate": 3.9979590826793815e-06, "loss": 0.8566137, "num_input_tokens_seen": 7714685, "step": 366, "time_per_iteration": 2.6825194358825684 }, { "auxiliary_loss_clip": 0.01319775, "auxiliary_loss_mlp": 0.01094307, "balance_loss_clip": 1.08814466, "balance_loss_mlp": 1.05086684, "epoch": 0.04412914086454638, "flos": 20119528183680.0, "grad_norm": 1.9774845060979178, "language_loss": 0.81092268, "learning_rate": 3.997923748892113e-06, "loss": 0.83506358, "num_input_tokens_seen": 7734005, "step": 367, "time_per_iteration": 2.61942195892334 }, { "auxiliary_loss_clip": 0.01310013, "auxiliary_loss_mlp": 0.01109513, "balance_loss_clip": 1.08944857, "balance_loss_mlp": 1.06969666, "epoch": 0.04424938375518547, "flos": 22604618632320.0, "grad_norm": 1.7557476520976574, "language_loss": 0.88786644, "learning_rate": 3.9978881120214015e-06, "loss": 0.91206169, "num_input_tokens_seen": 7755525, "step": 368, "time_per_iteration": 2.702378749847412 }, { "auxiliary_loss_clip": 0.01304967, "auxiliary_loss_mlp": 0.01093607, "balance_loss_clip": 1.08506882, "balance_loss_mlp": 1.05500746, "epoch": 0.04436962664582456, "flos": 24132365844480.0, "grad_norm": 1.8394308495258875, "language_loss": 0.79119796, "learning_rate": 3.997852172072652e-06, "loss": 0.8151837, "num_input_tokens_seen": 7776740, "step": 369, "time_per_iteration": 2.6583220958709717 }, { "auxiliary_loss_clip": 0.0131032, "auxiliary_loss_mlp": 0.01098335, "balance_loss_clip": 1.08705115, "balance_loss_mlp": 1.05673099, "epoch": 0.04448986953646366, "flos": 18222906251520.0, "grad_norm": 2.4815115456174803, "language_loss": 0.88757759, "learning_rate": 3.9978159290513155e-06, "loss": 0.91166419, "num_input_tokens_seen": 7794820, "step": 370, "time_per_iteration": 2.6217124462127686 }, { "auxiliary_loss_clip": 0.01316166, "auxiliary_loss_mlp": 0.0109059, "balance_loss_clip": 1.08825493, "balance_loss_mlp": 1.05065513, "epoch": 0.04461011242710275, "flos": 30117920400000.0, "grad_norm": 1.8060650505553806, "language_loss": 0.80331486, "learning_rate": 3.997779382962892e-06, "loss": 0.82738245, "num_input_tokens_seen": 7817705, "step": 371, "time_per_iteration": 2.78894305229187 }, { "auxiliary_loss_clip": 0.01304709, "auxiliary_loss_mlp": 0.01096969, "balance_loss_clip": 1.08640504, "balance_loss_mlp": 1.0555315, "epoch": 0.04473035531774184, "flos": 29752529299200.0, "grad_norm": 2.9338752057186626, "language_loss": 0.73543382, "learning_rate": 3.997742533812924e-06, "loss": 0.75945061, "num_input_tokens_seen": 7840970, "step": 372, "time_per_iteration": 2.824434995651245 }, { "auxiliary_loss_clip": 0.01309727, "auxiliary_loss_mlp": 0.01096595, "balance_loss_clip": 1.08772564, "balance_loss_mlp": 1.05379879, "epoch": 0.04485059820838093, "flos": 13151565676800.0, "grad_norm": 2.677556011026103, "language_loss": 0.92257941, "learning_rate": 3.997705381607001e-06, "loss": 0.9466427, "num_input_tokens_seen": 7857785, "step": 373, "time_per_iteration": 2.769364356994629 }, { "auxiliary_loss_clip": 0.01195661, "auxiliary_loss_mlp": 0.01015652, "balance_loss_clip": 1.08090019, "balance_loss_mlp": 1.00315905, "epoch": 0.04497084109902002, "flos": 68094209548800.0, "grad_norm": 0.9799995981411267, "language_loss": 0.60247421, "learning_rate": 3.997667926350761e-06, "loss": 0.62458736, "num_input_tokens_seen": 7916115, "step": 374, "time_per_iteration": 3.140333890914917 }, { "auxiliary_loss_clip": 0.01191879, "auxiliary_loss_mlp": 0.01016877, "balance_loss_clip": 1.0784781, "balance_loss_mlp": 1.00419283, "epoch": 0.04509108398965911, "flos": 64342263346560.0, "grad_norm": 0.9024856948444803, "language_loss": 0.57811141, "learning_rate": 3.997630168049886e-06, "loss": 0.60019898, "num_input_tokens_seen": 7974480, "step": 375, "time_per_iteration": 3.2307121753692627 }, { "auxiliary_loss_clip": 0.01315598, "auxiliary_loss_mlp": 0.01095983, "balance_loss_clip": 1.09131312, "balance_loss_mlp": 1.0548799, "epoch": 0.045211326880298205, "flos": 22271115830400.0, "grad_norm": 1.8653768850409114, "language_loss": 0.7735731, "learning_rate": 3.997592106710101e-06, "loss": 0.79768896, "num_input_tokens_seen": 7993940, "step": 376, "time_per_iteration": 2.678194522857666 }, { "auxiliary_loss_clip": 0.01304559, "auxiliary_loss_mlp": 0.01084807, "balance_loss_clip": 1.08457661, "balance_loss_mlp": 1.04787564, "epoch": 0.045331569770937295, "flos": 32159441796480.0, "grad_norm": 2.2915107207844545, "language_loss": 0.65800583, "learning_rate": 3.997553742337182e-06, "loss": 0.68189955, "num_input_tokens_seen": 8013365, "step": 377, "time_per_iteration": 2.7338616847991943 }, { "auxiliary_loss_clip": 0.0130527, "auxiliary_loss_mlp": 0.0109263, "balance_loss_clip": 1.08271754, "balance_loss_mlp": 1.05381513, "epoch": 0.045451812661576385, "flos": 22163455791360.0, "grad_norm": 1.6622892031677712, "language_loss": 0.91144562, "learning_rate": 3.997515074936949e-06, "loss": 0.93542457, "num_input_tokens_seen": 8034240, "step": 378, "time_per_iteration": 2.7274999618530273 }, { "auxiliary_loss_clip": 0.01303858, "auxiliary_loss_mlp": 0.01093831, "balance_loss_clip": 1.08337414, "balance_loss_mlp": 1.05344248, "epoch": 0.045572055552215475, "flos": 16581968305920.0, "grad_norm": 2.201974307867392, "language_loss": 0.8698715, "learning_rate": 3.997476104515268e-06, "loss": 0.89384836, "num_input_tokens_seen": 8052430, "step": 379, "time_per_iteration": 2.6362874507904053 }, { "auxiliary_loss_clip": 0.01304033, "auxiliary_loss_mlp": 0.01107002, "balance_loss_clip": 1.08485949, "balance_loss_mlp": 1.06432533, "epoch": 0.045692298442854565, "flos": 17603375448960.0, "grad_norm": 1.9298543506138477, "language_loss": 0.77863699, "learning_rate": 3.9974368310780485e-06, "loss": 0.80274737, "num_input_tokens_seen": 8069605, "step": 380, "time_per_iteration": 2.696676731109619 }, { "auxiliary_loss_clip": 0.01319711, "auxiliary_loss_mlp": 0.01097253, "balance_loss_clip": 1.08900476, "balance_loss_mlp": 1.05364621, "epoch": 0.045812541333493655, "flos": 26761098781440.0, "grad_norm": 2.41210857250748, "language_loss": 0.74251992, "learning_rate": 3.997397254631251e-06, "loss": 0.76668954, "num_input_tokens_seen": 8090225, "step": 381, "time_per_iteration": 3.5775539875030518 }, { "auxiliary_loss_clip": 0.01178795, "auxiliary_loss_mlp": 0.01025946, "balance_loss_clip": 1.06820154, "balance_loss_mlp": 1.01407301, "epoch": 0.04593278422413275, "flos": 60250349894400.0, "grad_norm": 0.8201509027098824, "language_loss": 0.60077882, "learning_rate": 3.997357375180878e-06, "loss": 0.62282622, "num_input_tokens_seen": 8154505, "step": 382, "time_per_iteration": 4.181485652923584 }, { "auxiliary_loss_clip": 0.0130193, "auxiliary_loss_mlp": 0.01098382, "balance_loss_clip": 1.08219814, "balance_loss_mlp": 1.05804205, "epoch": 0.04605302711477184, "flos": 21799249839360.0, "grad_norm": 1.7748555292163164, "language_loss": 0.75218749, "learning_rate": 3.997317192732979e-06, "loss": 0.77619064, "num_input_tokens_seen": 8173285, "step": 383, "time_per_iteration": 2.8482019901275635 }, { "auxiliary_loss_clip": 0.01303905, "auxiliary_loss_mlp": 0.01103231, "balance_loss_clip": 1.08253694, "balance_loss_mlp": 1.06305766, "epoch": 0.04617327000541093, "flos": 19459705299840.0, "grad_norm": 1.7097045134910254, "language_loss": 0.82609701, "learning_rate": 3.99727670729365e-06, "loss": 0.85016835, "num_input_tokens_seen": 8191845, "step": 384, "time_per_iteration": 2.7003352642059326 }, { "auxiliary_loss_clip": 0.01307336, "auxiliary_loss_mlp": 0.01093579, "balance_loss_clip": 1.08899879, "balance_loss_mlp": 1.05364406, "epoch": 0.04629351289605002, "flos": 25411468135680.0, "grad_norm": 2.925873502256834, "language_loss": 0.78089905, "learning_rate": 3.997235918869033e-06, "loss": 0.80490816, "num_input_tokens_seen": 8212880, "step": 385, "time_per_iteration": 3.7523844242095947 }, { "auxiliary_loss_clip": 0.01306161, "auxiliary_loss_mlp": 0.01082558, "balance_loss_clip": 1.08783305, "balance_loss_mlp": 1.04503143, "epoch": 0.04641375578668911, "flos": 20558284813440.0, "grad_norm": 1.8668306879978886, "language_loss": 0.82737935, "learning_rate": 3.997194827465315e-06, "loss": 0.85126644, "num_input_tokens_seen": 8231475, "step": 386, "time_per_iteration": 3.6618812084198 }, { "auxiliary_loss_clip": 0.01302968, "auxiliary_loss_mlp": 0.01086711, "balance_loss_clip": 1.08508694, "balance_loss_mlp": 1.04544067, "epoch": 0.0465339986773282, "flos": 13188661447680.0, "grad_norm": 2.7278613469605055, "language_loss": 0.91162431, "learning_rate": 3.997153433088728e-06, "loss": 0.93552107, "num_input_tokens_seen": 8248600, "step": 387, "time_per_iteration": 2.8022024631500244 }, { "auxiliary_loss_clip": 0.01305288, "auxiliary_loss_mlp": 0.01098366, "balance_loss_clip": 1.08543897, "balance_loss_mlp": 1.05695248, "epoch": 0.0466542415679673, "flos": 25556547168000.0, "grad_norm": 2.8061386211287456, "language_loss": 0.81185871, "learning_rate": 3.997111735745554e-06, "loss": 0.8358953, "num_input_tokens_seen": 8271570, "step": 388, "time_per_iteration": 2.773101806640625 }, { "auxiliary_loss_clip": 0.01299175, "auxiliary_loss_mlp": 0.01079184, "balance_loss_clip": 1.08248234, "balance_loss_mlp": 1.0384146, "epoch": 0.04677448445860639, "flos": 22236749493120.0, "grad_norm": 3.005192350900017, "language_loss": 0.82350707, "learning_rate": 3.997069735442118e-06, "loss": 0.84729064, "num_input_tokens_seen": 8291265, "step": 389, "time_per_iteration": 2.7408499717712402 }, { "auxiliary_loss_clip": 0.01300894, "auxiliary_loss_mlp": 0.01094086, "balance_loss_clip": 1.08428979, "balance_loss_mlp": 1.05362618, "epoch": 0.04689472734924548, "flos": 28147825198080.0, "grad_norm": 1.465620261830187, "language_loss": 0.80219305, "learning_rate": 3.997027432184792e-06, "loss": 0.82614279, "num_input_tokens_seen": 8315925, "step": 390, "time_per_iteration": 2.841285228729248 }, { "auxiliary_loss_clip": 0.01305531, "auxiliary_loss_mlp": 0.01090049, "balance_loss_clip": 1.08555913, "balance_loss_mlp": 1.05285549, "epoch": 0.04701497023988457, "flos": 23148952312320.0, "grad_norm": 1.8827413240605784, "language_loss": 0.8947978, "learning_rate": 3.99698482597999e-06, "loss": 0.91875356, "num_input_tokens_seen": 8333605, "step": 391, "time_per_iteration": 2.8568427562713623 }, { "auxiliary_loss_clip": 0.01171009, "auxiliary_loss_mlp": 0.0101313, "balance_loss_clip": 1.06364608, "balance_loss_mlp": 1.00201976, "epoch": 0.04713521313052366, "flos": 64827668764800.0, "grad_norm": 0.8822116106105341, "language_loss": 0.63877535, "learning_rate": 3.99694191683418e-06, "loss": 0.66061676, "num_input_tokens_seen": 8394405, "step": 392, "time_per_iteration": 3.2960636615753174 }, { "auxiliary_loss_clip": 0.01311288, "auxiliary_loss_mlp": 0.01105868, "balance_loss_clip": 1.08617926, "balance_loss_mlp": 1.06292832, "epoch": 0.047255456021162746, "flos": 18771585477120.0, "grad_norm": 1.8645720467950988, "language_loss": 0.81632042, "learning_rate": 3.996898704753867e-06, "loss": 0.84049201, "num_input_tokens_seen": 8412355, "step": 393, "time_per_iteration": 2.909348964691162 }, { "auxiliary_loss_clip": 0.01300662, "auxiliary_loss_mlp": 0.01087034, "balance_loss_clip": 1.08271837, "balance_loss_mlp": 1.0505321, "epoch": 0.04737569891180184, "flos": 22053820504320.0, "grad_norm": 2.203871597831758, "language_loss": 0.87757665, "learning_rate": 3.996855189745609e-06, "loss": 0.90145361, "num_input_tokens_seen": 8431620, "step": 394, "time_per_iteration": 2.7278354167938232 }, { "auxiliary_loss_clip": 0.01298818, "auxiliary_loss_mlp": 0.01084458, "balance_loss_clip": 1.08172321, "balance_loss_mlp": 1.04347444, "epoch": 0.04749594180244093, "flos": 29057370410880.0, "grad_norm": 2.042883879148692, "language_loss": 0.92445219, "learning_rate": 3.996811371816007e-06, "loss": 0.94828498, "num_input_tokens_seen": 8454045, "step": 395, "time_per_iteration": 2.7881643772125244 }, { "auxiliary_loss_clip": 0.01300288, "auxiliary_loss_mlp": 0.01087657, "balance_loss_clip": 1.08607244, "balance_loss_mlp": 1.04879522, "epoch": 0.04761618469308002, "flos": 35112268172160.0, "grad_norm": 2.045089534579289, "language_loss": 0.7795496, "learning_rate": 3.996767250971707e-06, "loss": 0.80342901, "num_input_tokens_seen": 8476785, "step": 396, "time_per_iteration": 2.9611127376556396 }, { "auxiliary_loss_clip": 0.01309716, "auxiliary_loss_mlp": 0.01088071, "balance_loss_clip": 1.09001946, "balance_loss_mlp": 1.04598987, "epoch": 0.04773642758371911, "flos": 25630702796160.0, "grad_norm": 2.357399231772335, "language_loss": 0.87012374, "learning_rate": 3.996722827219403e-06, "loss": 0.89410162, "num_input_tokens_seen": 8498400, "step": 397, "time_per_iteration": 2.806002378463745 }, { "auxiliary_loss_clip": 0.01307699, "auxiliary_loss_mlp": 0.01093827, "balance_loss_clip": 1.08536673, "balance_loss_mlp": 1.05401158, "epoch": 0.0478566704743582, "flos": 20631506688000.0, "grad_norm": 3.891039227217653, "language_loss": 0.82522297, "learning_rate": 3.996678100565833e-06, "loss": 0.84923822, "num_input_tokens_seen": 8517455, "step": 398, "time_per_iteration": 2.6930606365203857 }, { "auxiliary_loss_clip": 0.01298459, "auxiliary_loss_mlp": 0.01094408, "balance_loss_clip": 1.08515358, "balance_loss_mlp": 1.05247045, "epoch": 0.04797691336499729, "flos": 18835721210880.0, "grad_norm": 2.9792177226417746, "language_loss": 0.88507676, "learning_rate": 3.996633071017783e-06, "loss": 0.9090054, "num_input_tokens_seen": 8534085, "step": 399, "time_per_iteration": 2.6187453269958496 }, { "auxiliary_loss_clip": 0.01295233, "auxiliary_loss_mlp": 0.01094638, "balance_loss_clip": 1.08337784, "balance_loss_mlp": 1.05501318, "epoch": 0.04809715625563638, "flos": 21099673578240.0, "grad_norm": 2.184889682170587, "language_loss": 0.81877935, "learning_rate": 3.996587738582084e-06, "loss": 0.84267795, "num_input_tokens_seen": 8550885, "step": 400, "time_per_iteration": 2.69822359085083 }, { "auxiliary_loss_clip": 0.01300818, "auxiliary_loss_mlp": 0.01082585, "balance_loss_clip": 1.08344245, "balance_loss_mlp": 1.04427147, "epoch": 0.04821739914627548, "flos": 23805650712960.0, "grad_norm": 2.6892302453997234, "language_loss": 0.8589617, "learning_rate": 3.9965421032656115e-06, "loss": 0.88279575, "num_input_tokens_seen": 8570815, "step": 401, "time_per_iteration": 2.704885959625244 }, { "auxiliary_loss_clip": 0.01301286, "auxiliary_loss_mlp": 0.01084039, "balance_loss_clip": 1.08069468, "balance_loss_mlp": 1.04567707, "epoch": 0.04833764203691457, "flos": 22200587475840.0, "grad_norm": 3.199112025757639, "language_loss": 0.94171917, "learning_rate": 3.99649616507529e-06, "loss": 0.96557242, "num_input_tokens_seen": 8589910, "step": 402, "time_per_iteration": 2.6935603618621826 }, { "auxiliary_loss_clip": 0.01166069, "auxiliary_loss_mlp": 0.0101608, "balance_loss_clip": 1.06193733, "balance_loss_mlp": 1.0052079, "epoch": 0.04845788492755366, "flos": 65904376896000.0, "grad_norm": 0.8874480384316052, "language_loss": 0.630822, "learning_rate": 3.996449924018088e-06, "loss": 0.6526435, "num_input_tokens_seen": 8650370, "step": 403, "time_per_iteration": 3.137922763824463 }, { "auxiliary_loss_clip": 0.01298373, "auxiliary_loss_mlp": 0.01089929, "balance_loss_clip": 1.0844419, "balance_loss_mlp": 1.05283093, "epoch": 0.04857812781819275, "flos": 19281301424640.0, "grad_norm": 2.093665437884269, "language_loss": 0.79194367, "learning_rate": 3.99640338010102e-06, "loss": 0.81582671, "num_input_tokens_seen": 8669475, "step": 404, "time_per_iteration": 2.7488653659820557 }, { "auxiliary_loss_clip": 0.01297309, "auxiliary_loss_mlp": 0.01071552, "balance_loss_clip": 1.08244085, "balance_loss_mlp": 1.03505063, "epoch": 0.04869837070883184, "flos": 24062376193920.0, "grad_norm": 1.9688717613799043, "language_loss": 0.78248876, "learning_rate": 3.996356533331146e-06, "loss": 0.80617738, "num_input_tokens_seen": 8691345, "step": 405, "time_per_iteration": 2.702929735183716 }, { "auxiliary_loss_clip": 0.01305867, "auxiliary_loss_mlp": 0.01085735, "balance_loss_clip": 1.08362412, "balance_loss_mlp": 1.0478977, "epoch": 0.04881861359947093, "flos": 25187169657600.0, "grad_norm": 2.6839259300575806, "language_loss": 0.61891556, "learning_rate": 3.996309383715573e-06, "loss": 0.64283156, "num_input_tokens_seen": 8710125, "step": 406, "time_per_iteration": 2.6591053009033203 }, { "auxiliary_loss_clip": 0.01294088, "auxiliary_loss_mlp": 0.01090331, "balance_loss_clip": 1.07821167, "balance_loss_mlp": 1.05115867, "epoch": 0.048938856490110025, "flos": 16362913213440.0, "grad_norm": 2.14706987588304, "language_loss": 0.73507738, "learning_rate": 3.996261931261454e-06, "loss": 0.75892162, "num_input_tokens_seen": 8728705, "step": 407, "time_per_iteration": 3.635981321334839 }, { "auxiliary_loss_clip": 0.01296965, "auxiliary_loss_mlp": 0.01078309, "balance_loss_clip": 1.08580399, "balance_loss_mlp": 1.04001915, "epoch": 0.049059099380749115, "flos": 29895094379520.0, "grad_norm": 1.7990263552660228, "language_loss": 0.86442363, "learning_rate": 3.996214175975987e-06, "loss": 0.88817632, "num_input_tokens_seen": 8749225, "step": 408, "time_per_iteration": 2.6922571659088135 }, { "auxiliary_loss_clip": 0.01299448, "auxiliary_loss_mlp": 0.01092101, "balance_loss_clip": 1.08375776, "balance_loss_mlp": 1.0512358, "epoch": 0.049179342271388204, "flos": 35918858027520.0, "grad_norm": 2.171715818950792, "language_loss": 0.79116559, "learning_rate": 3.996166117866417e-06, "loss": 0.81508106, "num_input_tokens_seen": 8771160, "step": 409, "time_per_iteration": 3.6025631427764893 }, { "auxiliary_loss_clip": 0.01293196, "auxiliary_loss_mlp": 0.01088119, "balance_loss_clip": 1.07889056, "balance_loss_mlp": 1.04873228, "epoch": 0.049299585162027294, "flos": 14611226659200.0, "grad_norm": 2.3261043063583657, "language_loss": 0.86788303, "learning_rate": 3.996117756940035e-06, "loss": 0.89169616, "num_input_tokens_seen": 8787845, "step": 410, "time_per_iteration": 2.6502537727355957 }, { "auxiliary_loss_clip": 0.01296015, "auxiliary_loss_mlp": 0.01086745, "balance_loss_clip": 1.08069074, "balance_loss_mlp": 1.04740632, "epoch": 0.049419828052666384, "flos": 19567939956480.0, "grad_norm": 2.0375700505995153, "language_loss": 0.9783113, "learning_rate": 3.996069093204175e-06, "loss": 1.00213885, "num_input_tokens_seen": 8803805, "step": 411, "time_per_iteration": 3.518192768096924 }, { "auxiliary_loss_clip": 0.01308305, "auxiliary_loss_mlp": 0.01098392, "balance_loss_clip": 1.09091091, "balance_loss_mlp": 1.05886245, "epoch": 0.049540070943305474, "flos": 13659916907520.0, "grad_norm": 2.7630790905775715, "language_loss": 0.88170612, "learning_rate": 3.996020126666221e-06, "loss": 0.90577316, "num_input_tokens_seen": 8820785, "step": 412, "time_per_iteration": 3.575892210006714 }, { "auxiliary_loss_clip": 0.0129764, "auxiliary_loss_mlp": 0.01080738, "balance_loss_clip": 1.08274603, "balance_loss_mlp": 1.04154253, "epoch": 0.04966031383394457, "flos": 21832035978240.0, "grad_norm": 2.452279567157135, "language_loss": 0.81708795, "learning_rate": 3.995970857333601e-06, "loss": 0.84087175, "num_input_tokens_seen": 8841195, "step": 413, "time_per_iteration": 2.6683664321899414 }, { "auxiliary_loss_clip": 0.01300012, "auxiliary_loss_mlp": 0.01081825, "balance_loss_clip": 1.08250046, "balance_loss_mlp": 1.0434401, "epoch": 0.04978055672458366, "flos": 28618793349120.0, "grad_norm": 1.8965669842402644, "language_loss": 0.79566628, "learning_rate": 3.995921285213789e-06, "loss": 0.81948459, "num_input_tokens_seen": 8861455, "step": 414, "time_per_iteration": 2.7289271354675293 }, { "auxiliary_loss_clip": 0.01293057, "auxiliary_loss_mlp": 0.01077469, "balance_loss_clip": 1.08301449, "balance_loss_mlp": 1.04223037, "epoch": 0.04990079961522275, "flos": 19828220883840.0, "grad_norm": 2.540525890509057, "language_loss": 0.81027675, "learning_rate": 3.995871410314305e-06, "loss": 0.83398199, "num_input_tokens_seen": 8880015, "step": 415, "time_per_iteration": 2.65982723236084 }, { "auxiliary_loss_clip": 0.01148293, "auxiliary_loss_mlp": 0.01011649, "balance_loss_clip": 1.05727863, "balance_loss_mlp": 1.00058675, "epoch": 0.05002104250586184, "flos": 62735045293440.0, "grad_norm": 0.9086194878121692, "language_loss": 0.59679592, "learning_rate": 3.995821232642714e-06, "loss": 0.61839539, "num_input_tokens_seen": 8938420, "step": 416, "time_per_iteration": 3.299260139465332 }, { "auxiliary_loss_clip": 0.01278605, "auxiliary_loss_mlp": 0.0109158, "balance_loss_clip": 1.08320951, "balance_loss_mlp": 1.05345702, "epoch": 0.05014128539650093, "flos": 27928518710400.0, "grad_norm": 2.0034632928589815, "language_loss": 0.82468975, "learning_rate": 3.995770752206629e-06, "loss": 0.84839165, "num_input_tokens_seen": 8959495, "step": 417, "time_per_iteration": 2.7812414169311523 }, { "auxiliary_loss_clip": 0.0129537, "auxiliary_loss_mlp": 0.01091705, "balance_loss_clip": 1.08313847, "balance_loss_mlp": 1.05393982, "epoch": 0.05026152828714002, "flos": 17705576620800.0, "grad_norm": 4.582199079126615, "language_loss": 0.96746075, "learning_rate": 3.995719969013709e-06, "loss": 0.99133146, "num_input_tokens_seen": 8976675, "step": 418, "time_per_iteration": 2.561115264892578 }, { "auxiliary_loss_clip": 0.0126444, "auxiliary_loss_mlp": 0.01087949, "balance_loss_clip": 1.07788384, "balance_loss_mlp": 1.04922962, "epoch": 0.05038177117777912, "flos": 19133277477120.0, "grad_norm": 3.307140221111704, "language_loss": 0.85634565, "learning_rate": 3.995668883071655e-06, "loss": 0.87986952, "num_input_tokens_seen": 8992900, "step": 419, "time_per_iteration": 2.6803226470947266 }, { "auxiliary_loss_clip": 0.01291511, "auxiliary_loss_mlp": 0.01094019, "balance_loss_clip": 1.08164215, "balance_loss_mlp": 1.05441797, "epoch": 0.050502014068418206, "flos": 20667704618880.0, "grad_norm": 2.312127449217452, "language_loss": 0.90609503, "learning_rate": 3.995617494388219e-06, "loss": 0.92995036, "num_input_tokens_seen": 9011020, "step": 420, "time_per_iteration": 2.514235258102417 }, { "auxiliary_loss_clip": 0.01259755, "auxiliary_loss_mlp": 0.0108473, "balance_loss_clip": 1.07636571, "balance_loss_mlp": 1.04853821, "epoch": 0.050622256959057296, "flos": 21361103740800.0, "grad_norm": 1.9255406687377796, "language_loss": 0.8048628, "learning_rate": 3.995565802971196e-06, "loss": 0.82830769, "num_input_tokens_seen": 9030995, "step": 421, "time_per_iteration": 2.6730918884277344 }, { "auxiliary_loss_clip": 0.01257203, "auxiliary_loss_mlp": 0.01082216, "balance_loss_clip": 1.07831228, "balance_loss_mlp": 1.04509449, "epoch": 0.050742499849696386, "flos": 27673588909440.0, "grad_norm": 1.9146823638531045, "language_loss": 0.67309386, "learning_rate": 3.995513808828427e-06, "loss": 0.69648802, "num_input_tokens_seen": 9053790, "step": 422, "time_per_iteration": 2.768442153930664 }, { "auxiliary_loss_clip": 0.01265355, "auxiliary_loss_mlp": 0.01080244, "balance_loss_clip": 1.07982349, "balance_loss_mlp": 1.04331326, "epoch": 0.050862742740335476, "flos": 19865999013120.0, "grad_norm": 1.9401932049508546, "language_loss": 0.76795006, "learning_rate": 3.9954615119678e-06, "loss": 0.79140604, "num_input_tokens_seen": 9072345, "step": 423, "time_per_iteration": 2.701798915863037 }, { "auxiliary_loss_clip": 0.0126398, "auxiliary_loss_mlp": 0.0108306, "balance_loss_clip": 1.0782845, "balance_loss_mlp": 1.0427916, "epoch": 0.050982985630974566, "flos": 22085098272000.0, "grad_norm": 2.1023563307645032, "language_loss": 0.80966818, "learning_rate": 3.995408912397248e-06, "loss": 0.83313853, "num_input_tokens_seen": 9090240, "step": 424, "time_per_iteration": 2.702662944793701 }, { "auxiliary_loss_clip": 0.0126711, "auxiliary_loss_mlp": 0.01094803, "balance_loss_clip": 1.08104348, "balance_loss_mlp": 1.05496311, "epoch": 0.05110322852161366, "flos": 20740962407040.0, "grad_norm": 2.2513747582942067, "language_loss": 0.93298197, "learning_rate": 3.99535601012475e-06, "loss": 0.95660114, "num_input_tokens_seen": 9105570, "step": 425, "time_per_iteration": 2.71905517578125 }, { "auxiliary_loss_clip": 0.01235935, "auxiliary_loss_mlp": 0.00785827, "balance_loss_clip": 1.07597125, "balance_loss_mlp": 1.0004338, "epoch": 0.05122347141225275, "flos": 28547295327360.0, "grad_norm": 1.684219201556884, "language_loss": 0.75463831, "learning_rate": 3.995302805158333e-06, "loss": 0.77485591, "num_input_tokens_seen": 9128225, "step": 426, "time_per_iteration": 2.87941575050354 }, { "auxiliary_loss_clip": 0.01249788, "auxiliary_loss_mlp": 0.01087539, "balance_loss_clip": 1.07431376, "balance_loss_mlp": 1.04905796, "epoch": 0.05134371430289184, "flos": 19722679747200.0, "grad_norm": 3.5816331460163733, "language_loss": 0.83368397, "learning_rate": 3.9952492975060665e-06, "loss": 0.85705721, "num_input_tokens_seen": 9148295, "step": 427, "time_per_iteration": 2.7963688373565674 }, { "auxiliary_loss_clip": 0.01271906, "auxiliary_loss_mlp": 0.01079255, "balance_loss_clip": 1.07887733, "balance_loss_mlp": 1.04323006, "epoch": 0.05146395719353093, "flos": 34458945649920.0, "grad_norm": 2.49173464163558, "language_loss": 0.85077894, "learning_rate": 3.995195487176067e-06, "loss": 0.87429059, "num_input_tokens_seen": 9168525, "step": 428, "time_per_iteration": 2.975557565689087 }, { "auxiliary_loss_clip": 0.0129635, "auxiliary_loss_mlp": 0.01095193, "balance_loss_clip": 1.08236921, "balance_loss_mlp": 1.05416119, "epoch": 0.05158420008417002, "flos": 21760286561280.0, "grad_norm": 1.8182217927902782, "language_loss": 0.85291433, "learning_rate": 3.995141374176499e-06, "loss": 0.87682974, "num_input_tokens_seen": 9186920, "step": 429, "time_per_iteration": 2.694963216781616 }, { "auxiliary_loss_clip": 0.01125839, "auxiliary_loss_mlp": 0.00765894, "balance_loss_clip": 1.05918157, "balance_loss_mlp": 1.00100732, "epoch": 0.05170444297480911, "flos": 72553956226560.0, "grad_norm": 0.8753617398226406, "language_loss": 0.63068354, "learning_rate": 3.995086958515572e-06, "loss": 0.64960086, "num_input_tokens_seen": 9244940, "step": 430, "time_per_iteration": 3.4043142795562744 }, { "auxiliary_loss_clip": 0.01157895, "auxiliary_loss_mlp": 0.00765567, "balance_loss_clip": 1.06002617, "balance_loss_mlp": 1.00106966, "epoch": 0.05182468586544821, "flos": 62416159326720.0, "grad_norm": 0.8609503039287535, "language_loss": 0.59912926, "learning_rate": 3.995032240201538e-06, "loss": 0.61836392, "num_input_tokens_seen": 9307335, "step": 431, "time_per_iteration": 3.4095733165740967 }, { "auxiliary_loss_clip": 0.01136733, "auxiliary_loss_mlp": 0.01014314, "balance_loss_clip": 1.05790114, "balance_loss_mlp": 1.00391853, "epoch": 0.0519449287560873, "flos": 41225989432320.0, "grad_norm": 0.9353942496842611, "language_loss": 0.63126272, "learning_rate": 3.9949772192427e-06, "loss": 0.6527732, "num_input_tokens_seen": 9353960, "step": 432, "time_per_iteration": 3.0669684410095215 }, { "auxiliary_loss_clip": 0.01260584, "auxiliary_loss_mlp": 0.01089736, "balance_loss_clip": 1.07735896, "balance_loss_mlp": 1.05192256, "epoch": 0.05206517164672639, "flos": 17494530261120.0, "grad_norm": 1.8489425357206986, "language_loss": 0.79681957, "learning_rate": 3.994921895647405e-06, "loss": 0.82032275, "num_input_tokens_seen": 9372130, "step": 433, "time_per_iteration": 4.394885063171387 }, { "auxiliary_loss_clip": 0.01148968, "auxiliary_loss_mlp": 0.0100838, "balance_loss_clip": 1.05179429, "balance_loss_mlp": 0.99836618, "epoch": 0.05218541453736548, "flos": 64002762973440.0, "grad_norm": 0.8385655663330898, "language_loss": 0.55330569, "learning_rate": 3.994866269424043e-06, "loss": 0.57487917, "num_input_tokens_seen": 9428500, "step": 434, "time_per_iteration": 3.298490524291992 }, { "auxiliary_loss_clip": 0.01206372, "auxiliary_loss_mlp": 0.01077514, "balance_loss_clip": 1.06563354, "balance_loss_mlp": 1.03881907, "epoch": 0.05230565742800457, "flos": 19317319787520.0, "grad_norm": 3.296862394169801, "language_loss": 0.78391552, "learning_rate": 3.9948103405810545e-06, "loss": 0.80675429, "num_input_tokens_seen": 9447450, "step": 435, "time_per_iteration": 4.02087926864624 }, { "auxiliary_loss_clip": 0.01205918, "auxiliary_loss_mlp": 0.01087291, "balance_loss_clip": 1.06031942, "balance_loss_mlp": 1.04864383, "epoch": 0.05242590031864366, "flos": 25298636538240.0, "grad_norm": 1.8182326049251747, "language_loss": 0.86063218, "learning_rate": 3.994754109126923e-06, "loss": 0.88356429, "num_input_tokens_seen": 9468945, "step": 436, "time_per_iteration": 2.9278321266174316 }, { "auxiliary_loss_clip": 0.01201288, "auxiliary_loss_mlp": 0.01089794, "balance_loss_clip": 1.07229936, "balance_loss_mlp": 1.0531249, "epoch": 0.052546143209282754, "flos": 26211629456640.0, "grad_norm": 1.7686485625923873, "language_loss": 0.93492651, "learning_rate": 3.994697575070181e-06, "loss": 0.95783728, "num_input_tokens_seen": 9488405, "step": 437, "time_per_iteration": 4.130101680755615 }, { "auxiliary_loss_clip": 0.01265676, "auxiliary_loss_mlp": 0.01086472, "balance_loss_clip": 1.08279228, "balance_loss_mlp": 1.04927897, "epoch": 0.052666386099921844, "flos": 22158140578560.0, "grad_norm": 1.8838595139369778, "language_loss": 0.91525388, "learning_rate": 3.994640738419402e-06, "loss": 0.93877536, "num_input_tokens_seen": 9507780, "step": 438, "time_per_iteration": 3.906161069869995 }, { "auxiliary_loss_clip": 0.01275543, "auxiliary_loss_mlp": 0.01076584, "balance_loss_clip": 1.08020616, "balance_loss_mlp": 1.04210854, "epoch": 0.052786628990560934, "flos": 23881817502720.0, "grad_norm": 2.0009803454111617, "language_loss": 0.80839235, "learning_rate": 3.9945835991832075e-06, "loss": 0.83191359, "num_input_tokens_seen": 9529665, "step": 439, "time_per_iteration": 3.260446548461914 }, { "auxiliary_loss_clip": 0.01295936, "auxiliary_loss_mlp": 0.01096948, "balance_loss_clip": 1.08574271, "balance_loss_mlp": 1.05982614, "epoch": 0.052906871881200024, "flos": 24605021934720.0, "grad_norm": 1.9874910159647272, "language_loss": 0.93036389, "learning_rate": 3.994526157370268e-06, "loss": 0.95429265, "num_input_tokens_seen": 9548280, "step": 440, "time_per_iteration": 2.788486957550049 }, { "auxiliary_loss_clip": 0.01121101, "auxiliary_loss_mlp": 0.01013978, "balance_loss_clip": 1.04159141, "balance_loss_mlp": 1.00496566, "epoch": 0.053027114771839114, "flos": 56461631143680.0, "grad_norm": 0.9119986016676522, "language_loss": 0.59292722, "learning_rate": 3.994468412989296e-06, "loss": 0.61427796, "num_input_tokens_seen": 9609690, "step": 441, "time_per_iteration": 3.488919973373413 }, { "auxiliary_loss_clip": 0.01213049, "auxiliary_loss_mlp": 0.01093518, "balance_loss_clip": 1.06358933, "balance_loss_mlp": 1.0555861, "epoch": 0.053147357662478203, "flos": 17311098481920.0, "grad_norm": 2.3842630154111117, "language_loss": 0.92579913, "learning_rate": 3.994410366049052e-06, "loss": 0.94886476, "num_input_tokens_seen": 9627550, "step": 442, "time_per_iteration": 2.8299694061279297 }, { "auxiliary_loss_clip": 0.01273928, "auxiliary_loss_mlp": 0.01101935, "balance_loss_clip": 1.08183002, "balance_loss_mlp": 1.06486106, "epoch": 0.0532676005531173, "flos": 17164977955200.0, "grad_norm": 2.2325379859801857, "language_loss": 0.82966256, "learning_rate": 3.994352016558341e-06, "loss": 0.85342121, "num_input_tokens_seen": 9644855, "step": 443, "time_per_iteration": 2.8384358882904053 }, { "auxiliary_loss_clip": 0.01270608, "auxiliary_loss_mlp": 0.01075784, "balance_loss_clip": 1.08193922, "balance_loss_mlp": 1.03923488, "epoch": 0.05338784344375639, "flos": 27819960831360.0, "grad_norm": 3.067401874432966, "language_loss": 0.74013847, "learning_rate": 3.994293364526014e-06, "loss": 0.76360238, "num_input_tokens_seen": 9665740, "step": 444, "time_per_iteration": 2.8927435874938965 }, { "auxiliary_loss_clip": 0.01239111, "auxiliary_loss_mlp": 0.01087631, "balance_loss_clip": 1.07502449, "balance_loss_mlp": 1.04993677, "epoch": 0.05350808633439548, "flos": 21507691144320.0, "grad_norm": 1.9351585933680457, "language_loss": 0.84971607, "learning_rate": 3.99423440996097e-06, "loss": 0.87298346, "num_input_tokens_seen": 9685280, "step": 445, "time_per_iteration": 2.8403701782226562 }, { "auxiliary_loss_clip": 0.01261321, "auxiliary_loss_mlp": 0.01084811, "balance_loss_clip": 1.07864833, "balance_loss_mlp": 1.0474751, "epoch": 0.05362832922503457, "flos": 20084299920000.0, "grad_norm": 2.818469795008216, "language_loss": 0.81644994, "learning_rate": 3.994175152872152e-06, "loss": 0.83991128, "num_input_tokens_seen": 9704365, "step": 446, "time_per_iteration": 2.923551559448242 }, { "auxiliary_loss_clip": 0.01272995, "auxiliary_loss_mlp": 0.01093146, "balance_loss_clip": 1.08067143, "balance_loss_mlp": 1.05809879, "epoch": 0.05374857211567366, "flos": 26137222433280.0, "grad_norm": 2.0039312782173995, "language_loss": 0.78516245, "learning_rate": 3.994115593268548e-06, "loss": 0.80882388, "num_input_tokens_seen": 9724145, "step": 447, "time_per_iteration": 2.8867621421813965 }, { "auxiliary_loss_clip": 0.01288309, "auxiliary_loss_mlp": 0.01090316, "balance_loss_clip": 1.0822705, "balance_loss_mlp": 1.05319405, "epoch": 0.05386881500631275, "flos": 27486817165440.0, "grad_norm": 2.535379246775649, "language_loss": 0.82419217, "learning_rate": 3.994055731159195e-06, "loss": 0.84797847, "num_input_tokens_seen": 9741615, "step": 448, "time_per_iteration": 2.8914549350738525 }, { "auxiliary_loss_clip": 0.01277768, "auxiliary_loss_mlp": 0.01102068, "balance_loss_clip": 1.08523667, "balance_loss_mlp": 1.06616199, "epoch": 0.053989057896951846, "flos": 23585087249280.0, "grad_norm": 1.9675416210187227, "language_loss": 0.86900663, "learning_rate": 3.993995566553172e-06, "loss": 0.89280498, "num_input_tokens_seen": 9760580, "step": 449, "time_per_iteration": 2.819828748703003 }, { "auxiliary_loss_clip": 0.01231989, "auxiliary_loss_mlp": 0.01074683, "balance_loss_clip": 1.0687964, "balance_loss_mlp": 1.03849077, "epoch": 0.054109300787590936, "flos": 25228862369280.0, "grad_norm": 1.5544446246586916, "language_loss": 0.77078038, "learning_rate": 3.993935099459607e-06, "loss": 0.79384714, "num_input_tokens_seen": 9782195, "step": 450, "time_per_iteration": 2.9051032066345215 }, { "auxiliary_loss_clip": 0.01277934, "auxiliary_loss_mlp": 0.0107568, "balance_loss_clip": 1.07668889, "balance_loss_mlp": 1.04113293, "epoch": 0.054229543678230026, "flos": 23841525421440.0, "grad_norm": 1.9288104959883288, "language_loss": 0.73938125, "learning_rate": 3.993874329887673e-06, "loss": 0.7629174, "num_input_tokens_seen": 9800850, "step": 451, "time_per_iteration": 2.793227195739746 }, { "auxiliary_loss_clip": 0.01271826, "auxiliary_loss_mlp": 0.01075003, "balance_loss_clip": 1.07881021, "balance_loss_mlp": 1.03878689, "epoch": 0.054349786568869116, "flos": 16320933192960.0, "grad_norm": 2.2996849690813557, "language_loss": 0.86366934, "learning_rate": 3.993813257846589e-06, "loss": 0.88713765, "num_input_tokens_seen": 9817605, "step": 452, "time_per_iteration": 2.7863056659698486 }, { "auxiliary_loss_clip": 0.01272181, "auxiliary_loss_mlp": 0.01079915, "balance_loss_clip": 1.08121467, "balance_loss_mlp": 1.04350829, "epoch": 0.054470029459508205, "flos": 18660729127680.0, "grad_norm": 2.4872557517984624, "language_loss": 0.93171042, "learning_rate": 3.993751883345619e-06, "loss": 0.95523131, "num_input_tokens_seen": 9835965, "step": 453, "time_per_iteration": 2.953312873840332 }, { "auxiliary_loss_clip": 0.01246289, "auxiliary_loss_mlp": 0.01087929, "balance_loss_clip": 1.07495022, "balance_loss_mlp": 1.05157018, "epoch": 0.054590272350147295, "flos": 17785298856960.0, "grad_norm": 3.1611791308722865, "language_loss": 0.87575543, "learning_rate": 3.993690206394073e-06, "loss": 0.89909762, "num_input_tokens_seen": 9852265, "step": 454, "time_per_iteration": 2.8711798191070557 }, { "auxiliary_loss_clip": 0.0126924, "auxiliary_loss_mlp": 0.01079719, "balance_loss_clip": 1.08166909, "balance_loss_mlp": 1.04405153, "epoch": 0.054710515240786385, "flos": 17785945301760.0, "grad_norm": 2.0944851716332358, "language_loss": 0.87470043, "learning_rate": 3.993628227001307e-06, "loss": 0.89819002, "num_input_tokens_seen": 9870465, "step": 455, "time_per_iteration": 2.7976603507995605 }, { "auxiliary_loss_clip": 0.01249005, "auxiliary_loss_mlp": 0.01077395, "balance_loss_clip": 1.07697558, "balance_loss_mlp": 1.04389763, "epoch": 0.05483075813142548, "flos": 48210900180480.0, "grad_norm": 1.8571710027884438, "language_loss": 0.71115232, "learning_rate": 3.993565945176726e-06, "loss": 0.73441631, "num_input_tokens_seen": 9891490, "step": 456, "time_per_iteration": 3.003697156906128 }, { "auxiliary_loss_clip": 0.01256525, "auxiliary_loss_mlp": 0.01092008, "balance_loss_clip": 1.08201921, "balance_loss_mlp": 1.05631685, "epoch": 0.05495100102206457, "flos": 19682244011520.0, "grad_norm": 4.188068421831037, "language_loss": 0.8384552, "learning_rate": 3.993503360929776e-06, "loss": 0.8619405, "num_input_tokens_seen": 9910375, "step": 457, "time_per_iteration": 2.9446377754211426 }, { "auxiliary_loss_clip": 0.01198318, "auxiliary_loss_mlp": 0.01084037, "balance_loss_clip": 1.071141, "balance_loss_mlp": 1.04743981, "epoch": 0.05507124391270366, "flos": 26360048453760.0, "grad_norm": 1.8246239800144517, "language_loss": 0.80948603, "learning_rate": 3.99344047426995e-06, "loss": 0.8323096, "num_input_tokens_seen": 9931635, "step": 458, "time_per_iteration": 3.181276559829712 }, { "auxiliary_loss_clip": 0.01234619, "auxiliary_loss_mlp": 0.01070365, "balance_loss_clip": 1.0751996, "balance_loss_mlp": 1.03414881, "epoch": 0.05519148680334275, "flos": 22601314581120.0, "grad_norm": 3.0784569991859, "language_loss": 0.93413067, "learning_rate": 3.993377285206789e-06, "loss": 0.9571805, "num_input_tokens_seen": 9951420, "step": 459, "time_per_iteration": 4.269329071044922 }, { "auxiliary_loss_clip": 0.01200302, "auxiliary_loss_mlp": 0.01089016, "balance_loss_clip": 1.06397069, "balance_loss_mlp": 1.05127406, "epoch": 0.05531172969398184, "flos": 40552519380480.0, "grad_norm": 1.868538137205958, "language_loss": 0.86337638, "learning_rate": 3.99331379374988e-06, "loss": 0.88626957, "num_input_tokens_seen": 9975025, "step": 460, "time_per_iteration": 3.0018229484558105 }, { "auxiliary_loss_clip": 0.01256992, "auxiliary_loss_mlp": 0.01077402, "balance_loss_clip": 1.0758183, "balance_loss_mlp": 1.0428555, "epoch": 0.05543197258462093, "flos": 23477894087040.0, "grad_norm": 2.014317830544952, "language_loss": 0.8003695, "learning_rate": 3.993249999908852e-06, "loss": 0.82371342, "num_input_tokens_seen": 9995175, "step": 461, "time_per_iteration": 4.3320722579956055 }, { "auxiliary_loss_clip": 0.01284528, "auxiliary_loss_mlp": 0.01081814, "balance_loss_clip": 1.07934165, "balance_loss_mlp": 1.04562247, "epoch": 0.05555221547526003, "flos": 18624603024000.0, "grad_norm": 2.311071553210613, "language_loss": 0.87092066, "learning_rate": 3.993185903693384e-06, "loss": 0.89458412, "num_input_tokens_seen": 10011975, "step": 462, "time_per_iteration": 2.870130777359009 }, { "auxiliary_loss_clip": 0.01259623, "auxiliary_loss_mlp": 0.01076077, "balance_loss_clip": 1.0771594, "balance_loss_mlp": 1.04019523, "epoch": 0.05567245836589912, "flos": 23587098410880.0, "grad_norm": 2.2044025845105852, "language_loss": 0.82258689, "learning_rate": 3.9931215051131995e-06, "loss": 0.84594393, "num_input_tokens_seen": 10032620, "step": 463, "time_per_iteration": 2.894141435623169 }, { "auxiliary_loss_clip": 0.01256901, "auxiliary_loss_mlp": 0.0107747, "balance_loss_clip": 1.07507205, "balance_loss_mlp": 1.04208887, "epoch": 0.05579270125653821, "flos": 27746667129600.0, "grad_norm": 1.575977671019802, "language_loss": 0.79983413, "learning_rate": 3.993056804178068e-06, "loss": 0.82317787, "num_input_tokens_seen": 10054165, "step": 464, "time_per_iteration": 5.0652992725372314 }, { "auxiliary_loss_clip": 0.01219826, "auxiliary_loss_mlp": 0.01086124, "balance_loss_clip": 1.07138491, "balance_loss_mlp": 1.05150557, "epoch": 0.0559129441471773, "flos": 27014161075200.0, "grad_norm": 2.02423277138266, "language_loss": 0.84137058, "learning_rate": 3.992991800897803e-06, "loss": 0.86443007, "num_input_tokens_seen": 10073970, "step": 465, "time_per_iteration": 2.9873123168945312 }, { "auxiliary_loss_clip": 0.01279556, "auxiliary_loss_mlp": 0.01078834, "balance_loss_clip": 1.07902801, "balance_loss_mlp": 1.04197454, "epoch": 0.05603318703781639, "flos": 15229787794560.0, "grad_norm": 2.592894433483289, "language_loss": 0.89627934, "learning_rate": 3.9929264952822665e-06, "loss": 0.91986322, "num_input_tokens_seen": 10091505, "step": 466, "time_per_iteration": 2.704319715499878 }, { "auxiliary_loss_clip": 0.01277124, "auxiliary_loss_mlp": 0.01085434, "balance_loss_clip": 1.08201623, "balance_loss_mlp": 1.0495522, "epoch": 0.05615342992845548, "flos": 22266482976000.0, "grad_norm": 2.5826791893012144, "language_loss": 0.88330984, "learning_rate": 3.992860887341366e-06, "loss": 0.90693545, "num_input_tokens_seen": 10109675, "step": 467, "time_per_iteration": 2.855611562728882 }, { "auxiliary_loss_clip": 0.01224051, "auxiliary_loss_mlp": 0.010856, "balance_loss_clip": 1.0736475, "balance_loss_mlp": 1.04874015, "epoch": 0.056273672819094574, "flos": 23584979508480.0, "grad_norm": 2.1420074981576986, "language_loss": 0.81310081, "learning_rate": 3.992794977085052e-06, "loss": 0.83619732, "num_input_tokens_seen": 10127675, "step": 468, "time_per_iteration": 2.872462272644043 }, { "auxiliary_loss_clip": 0.01244473, "auxiliary_loss_mlp": 0.0107887, "balance_loss_clip": 1.07822394, "balance_loss_mlp": 1.04377484, "epoch": 0.056393915709733664, "flos": 19858708552320.0, "grad_norm": 1.95165659826216, "language_loss": 0.84757853, "learning_rate": 3.992728764523326e-06, "loss": 0.87081188, "num_input_tokens_seen": 10146620, "step": 469, "time_per_iteration": 2.9112296104431152 }, { "auxiliary_loss_clip": 0.01256434, "auxiliary_loss_mlp": 0.0107827, "balance_loss_clip": 1.08285737, "balance_loss_mlp": 1.04231668, "epoch": 0.05651415860037275, "flos": 22163779013760.0, "grad_norm": 1.7575295719778994, "language_loss": 0.80896765, "learning_rate": 3.99266224966623e-06, "loss": 0.83231467, "num_input_tokens_seen": 10167535, "step": 470, "time_per_iteration": 2.91676664352417 }, { "auxiliary_loss_clip": 0.01221383, "auxiliary_loss_mlp": 0.01090098, "balance_loss_clip": 1.06256247, "balance_loss_mlp": 1.05307198, "epoch": 0.05663440149101184, "flos": 19463548055040.0, "grad_norm": 1.9321935228052927, "language_loss": 0.87958348, "learning_rate": 3.992595432523855e-06, "loss": 0.90269828, "num_input_tokens_seen": 10184825, "step": 471, "time_per_iteration": 2.831310510635376 }, { "auxiliary_loss_clip": 0.01225238, "auxiliary_loss_mlp": 0.01079026, "balance_loss_clip": 1.07564843, "balance_loss_mlp": 1.0448606, "epoch": 0.05675464438165093, "flos": 22670226823680.0, "grad_norm": 2.0386432062023774, "language_loss": 0.85955817, "learning_rate": 3.992528313106338e-06, "loss": 0.88260084, "num_input_tokens_seen": 10203025, "step": 472, "time_per_iteration": 2.8532004356384277 }, { "auxiliary_loss_clip": 0.01280704, "auxiliary_loss_mlp": 0.00783685, "balance_loss_clip": 1.08219278, "balance_loss_mlp": 1.00042331, "epoch": 0.05687488727229002, "flos": 16901177495040.0, "grad_norm": 3.8310554788104825, "language_loss": 0.82084084, "learning_rate": 3.9924608914238595e-06, "loss": 0.84148467, "num_input_tokens_seen": 10218020, "step": 473, "time_per_iteration": 2.7247250080108643 }, { "auxiliary_loss_clip": 0.01274583, "auxiliary_loss_mlp": 0.01075526, "balance_loss_clip": 1.08442247, "balance_loss_mlp": 1.04076481, "epoch": 0.05699513016292912, "flos": 29168980945920.0, "grad_norm": 2.476657638961026, "language_loss": 0.84144586, "learning_rate": 3.992393167486648e-06, "loss": 0.86494696, "num_input_tokens_seen": 10237170, "step": 474, "time_per_iteration": 2.898573398590088 }, { "auxiliary_loss_clip": 0.01292168, "auxiliary_loss_mlp": 0.01086453, "balance_loss_clip": 1.08597159, "balance_loss_mlp": 1.05183458, "epoch": 0.05711537305356821, "flos": 18916197632640.0, "grad_norm": 2.4239489050528715, "language_loss": 0.80574799, "learning_rate": 3.992325141304977e-06, "loss": 0.82953417, "num_input_tokens_seen": 10255125, "step": 475, "time_per_iteration": 2.927159309387207 }, { "auxiliary_loss_clip": 0.01207099, "auxiliary_loss_mlp": 0.01078026, "balance_loss_clip": 1.06370068, "balance_loss_mlp": 1.04278755, "epoch": 0.0572356159442073, "flos": 26758979879040.0, "grad_norm": 2.083713902656185, "language_loss": 0.86569417, "learning_rate": 3.992256812889166e-06, "loss": 0.88854539, "num_input_tokens_seen": 10271230, "step": 476, "time_per_iteration": 3.2147035598754883 }, { "auxiliary_loss_clip": 0.01285663, "auxiliary_loss_mlp": 0.01069365, "balance_loss_clip": 1.08540726, "balance_loss_mlp": 1.03837073, "epoch": 0.05735585883484639, "flos": 35116146840960.0, "grad_norm": 3.183891379772707, "language_loss": 0.76569283, "learning_rate": 3.992188182249582e-06, "loss": 0.7892431, "num_input_tokens_seen": 10293125, "step": 477, "time_per_iteration": 2.905318260192871 }, { "auxiliary_loss_clip": 0.0125125, "auxiliary_loss_mlp": 0.01077652, "balance_loss_clip": 1.07910752, "balance_loss_mlp": 1.04284251, "epoch": 0.05747610172548548, "flos": 18734381965440.0, "grad_norm": 2.148309289223227, "language_loss": 0.90696883, "learning_rate": 3.992119249396633e-06, "loss": 0.9302578, "num_input_tokens_seen": 10311810, "step": 478, "time_per_iteration": 2.7680702209472656 }, { "auxiliary_loss_clip": 0.01245311, "auxiliary_loss_mlp": 0.00784612, "balance_loss_clip": 1.07615995, "balance_loss_mlp": 1.00036478, "epoch": 0.05759634461612457, "flos": 27964752554880.0, "grad_norm": 1.8594399875622776, "language_loss": 0.8220582, "learning_rate": 3.992050014340778e-06, "loss": 0.84235746, "num_input_tokens_seen": 10332165, "step": 479, "time_per_iteration": 2.9107542037963867 }, { "auxiliary_loss_clip": 0.01133341, "auxiliary_loss_mlp": 0.01013423, "balance_loss_clip": 1.05674624, "balance_loss_mlp": 1.00445843, "epoch": 0.057716587506763666, "flos": 69292009405440.0, "grad_norm": 0.8260029031854124, "language_loss": 0.55034316, "learning_rate": 3.99198047709252e-06, "loss": 0.57181084, "num_input_tokens_seen": 10393685, "step": 480, "time_per_iteration": 3.5199432373046875 }, { "auxiliary_loss_clip": 0.01231181, "auxiliary_loss_mlp": 0.01073808, "balance_loss_clip": 1.07074404, "balance_loss_mlp": 1.03799796, "epoch": 0.057836830397402755, "flos": 25009196745600.0, "grad_norm": 3.581941150835553, "language_loss": 0.78473651, "learning_rate": 3.991910637662408e-06, "loss": 0.80778641, "num_input_tokens_seen": 10413975, "step": 481, "time_per_iteration": 2.875325918197632 }, { "auxiliary_loss_clip": 0.01278932, "auxiliary_loss_mlp": 0.01092715, "balance_loss_clip": 1.08245277, "balance_loss_mlp": 1.05950356, "epoch": 0.057957073288041845, "flos": 25593894334080.0, "grad_norm": 2.8828728716842984, "language_loss": 0.80748469, "learning_rate": 3.9918404960610355e-06, "loss": 0.8312012, "num_input_tokens_seen": 10433005, "step": 482, "time_per_iteration": 2.8739399909973145 }, { "auxiliary_loss_clip": 0.01270367, "auxiliary_loss_mlp": 0.01075224, "balance_loss_clip": 1.08010602, "balance_loss_mlp": 1.0399853, "epoch": 0.058077316178680935, "flos": 20777411733120.0, "grad_norm": 2.457682397314526, "language_loss": 0.77161527, "learning_rate": 3.991770052299043e-06, "loss": 0.79507124, "num_input_tokens_seen": 10451235, "step": 483, "time_per_iteration": 2.768482208251953 }, { "auxiliary_loss_clip": 0.01253804, "auxiliary_loss_mlp": 0.01080096, "balance_loss_clip": 1.07763064, "balance_loss_mlp": 1.04547727, "epoch": 0.058197559069320025, "flos": 18916484941440.0, "grad_norm": 2.901624001777671, "language_loss": 0.87618285, "learning_rate": 3.991699306387118e-06, "loss": 0.89952189, "num_input_tokens_seen": 10469705, "step": 484, "time_per_iteration": 2.8457632064819336 }, { "auxiliary_loss_clip": 0.01268974, "auxiliary_loss_mlp": 0.01080615, "balance_loss_clip": 1.07900715, "balance_loss_mlp": 1.04833341, "epoch": 0.058317801959959115, "flos": 24863327614080.0, "grad_norm": 1.823738905943609, "language_loss": 0.78234708, "learning_rate": 3.991628258335991e-06, "loss": 0.805843, "num_input_tokens_seen": 10491910, "step": 485, "time_per_iteration": 3.846111536026001 }, { "auxiliary_loss_clip": 0.01231216, "auxiliary_loss_mlp": 0.01086667, "balance_loss_clip": 1.07435274, "balance_loss_mlp": 1.05128574, "epoch": 0.05843804485059821, "flos": 23257977068160.0, "grad_norm": 3.6536323886720448, "language_loss": 0.8764683, "learning_rate": 3.991556908156442e-06, "loss": 0.89964712, "num_input_tokens_seen": 10508435, "step": 486, "time_per_iteration": 4.2062599658966064 }, { "auxiliary_loss_clip": 0.0125273, "auxiliary_loss_mlp": 0.01081764, "balance_loss_clip": 1.07533932, "balance_loss_mlp": 1.0483619, "epoch": 0.0585582877412373, "flos": 23150532510720.0, "grad_norm": 2.0371441722241794, "language_loss": 0.8767792, "learning_rate": 3.9914852558592914e-06, "loss": 0.90012413, "num_input_tokens_seen": 10529485, "step": 487, "time_per_iteration": 2.8985044956207275 }, { "auxiliary_loss_clip": 0.01268027, "auxiliary_loss_mlp": 0.01078805, "balance_loss_clip": 1.07991207, "balance_loss_mlp": 1.04189754, "epoch": 0.05867853063187639, "flos": 23506406507520.0, "grad_norm": 2.978150327088128, "language_loss": 0.80690199, "learning_rate": 3.991413301455413e-06, "loss": 0.83037031, "num_input_tokens_seen": 10545935, "step": 488, "time_per_iteration": 2.8319480419158936 }, { "auxiliary_loss_clip": 0.01216907, "auxiliary_loss_mlp": 0.01081005, "balance_loss_clip": 1.05786371, "balance_loss_mlp": 1.04693508, "epoch": 0.05879877352251548, "flos": 29495803818240.0, "grad_norm": 2.3180138964089805, "language_loss": 0.77986407, "learning_rate": 3.991341044955719e-06, "loss": 0.80284321, "num_input_tokens_seen": 10565690, "step": 489, "time_per_iteration": 3.889110565185547 }, { "auxiliary_loss_clip": 0.01267925, "auxiliary_loss_mlp": 0.00784621, "balance_loss_clip": 1.07935429, "balance_loss_mlp": 1.00038958, "epoch": 0.05891901641315457, "flos": 20157485880960.0, "grad_norm": 2.0092710997719747, "language_loss": 0.8148548, "learning_rate": 3.991268486371172e-06, "loss": 0.83538026, "num_input_tokens_seen": 10584245, "step": 490, "time_per_iteration": 3.621725559234619 }, { "auxiliary_loss_clip": 0.01252257, "auxiliary_loss_mlp": 0.01076639, "balance_loss_clip": 1.07361031, "balance_loss_mlp": 1.04202032, "epoch": 0.05903925930379366, "flos": 24644200694400.0, "grad_norm": 3.0655559313886127, "language_loss": 0.8789252, "learning_rate": 3.991195625712779e-06, "loss": 0.90221417, "num_input_tokens_seen": 10601210, "step": 491, "time_per_iteration": 2.9071059226989746 }, { "auxiliary_loss_clip": 0.0127796, "auxiliary_loss_mlp": 0.01068769, "balance_loss_clip": 1.08101666, "balance_loss_mlp": 1.03651154, "epoch": 0.05915950219443276, "flos": 21250391045760.0, "grad_norm": 2.77723392933014, "language_loss": 0.81492031, "learning_rate": 3.991122462991592e-06, "loss": 0.83838761, "num_input_tokens_seen": 10620730, "step": 492, "time_per_iteration": 2.7235307693481445 }, { "auxiliary_loss_clip": 0.0128529, "auxiliary_loss_mlp": 0.010766, "balance_loss_clip": 1.0811429, "balance_loss_mlp": 1.04365063, "epoch": 0.05927974508507185, "flos": 9902727319680.0, "grad_norm": 3.4319138681458843, "language_loss": 0.81674886, "learning_rate": 3.991048998218712e-06, "loss": 0.84036773, "num_input_tokens_seen": 10634035, "step": 493, "time_per_iteration": 2.739229917526245 }, { "auxiliary_loss_clip": 0.01264385, "auxiliary_loss_mlp": 0.01084762, "balance_loss_clip": 1.07782245, "balance_loss_mlp": 1.05183661, "epoch": 0.05939998797571094, "flos": 18259499232000.0, "grad_norm": 2.3310804918954253, "language_loss": 0.7668587, "learning_rate": 3.990975231405281e-06, "loss": 0.7903502, "num_input_tokens_seen": 10652485, "step": 494, "time_per_iteration": 2.7762863636016846 }, { "auxiliary_loss_clip": 0.0125824, "auxiliary_loss_mlp": 0.01085659, "balance_loss_clip": 1.07775164, "balance_loss_mlp": 1.05104065, "epoch": 0.05952023086635003, "flos": 28256598558720.0, "grad_norm": 1.983885815129201, "language_loss": 0.78501469, "learning_rate": 3.990901162562491e-06, "loss": 0.80845368, "num_input_tokens_seen": 10673175, "step": 495, "time_per_iteration": 2.819082498550415 }, { "auxiliary_loss_clip": 0.01222337, "auxiliary_loss_mlp": 0.00784491, "balance_loss_clip": 1.0680058, "balance_loss_mlp": 1.00038886, "epoch": 0.05964047375698912, "flos": 14902498045440.0, "grad_norm": 2.0120401558518584, "language_loss": 0.91008759, "learning_rate": 3.9908267917015765e-06, "loss": 0.93015587, "num_input_tokens_seen": 10691235, "step": 496, "time_per_iteration": 2.7880334854125977 }, { "auxiliary_loss_clip": 0.0123692, "auxiliary_loss_mlp": 0.01091037, "balance_loss_clip": 1.06275105, "balance_loss_mlp": 1.05527401, "epoch": 0.059760716647628206, "flos": 23185581206400.0, "grad_norm": 2.1687747516218243, "language_loss": 0.93197829, "learning_rate": 3.990752118833821e-06, "loss": 0.95525789, "num_input_tokens_seen": 10708675, "step": 497, "time_per_iteration": 2.759716272354126 }, { "auxiliary_loss_clip": 0.01287122, "auxiliary_loss_mlp": 0.01074632, "balance_loss_clip": 1.08403707, "balance_loss_mlp": 1.04013324, "epoch": 0.0598809595382673, "flos": 22746968231040.0, "grad_norm": 1.9666072323756494, "language_loss": 0.77608138, "learning_rate": 3.990677143970553e-06, "loss": 0.79969895, "num_input_tokens_seen": 10729485, "step": 498, "time_per_iteration": 2.8153655529022217 }, { "auxiliary_loss_clip": 0.01231858, "auxiliary_loss_mlp": 0.01085953, "balance_loss_clip": 1.07489681, "balance_loss_mlp": 1.05121505, "epoch": 0.06000120242890639, "flos": 22127221946880.0, "grad_norm": 2.237882790347282, "language_loss": 0.81293917, "learning_rate": 3.990601867123144e-06, "loss": 0.83611727, "num_input_tokens_seen": 10749210, "step": 499, "time_per_iteration": 2.7989001274108887 }, { "auxiliary_loss_clip": 0.01212334, "auxiliary_loss_mlp": 0.01078783, "balance_loss_clip": 1.07264304, "balance_loss_mlp": 1.04442728, "epoch": 0.06012144531954548, "flos": 19171773878400.0, "grad_norm": 2.5383590161545, "language_loss": 0.84753519, "learning_rate": 3.990526288303014e-06, "loss": 0.87044632, "num_input_tokens_seen": 10768000, "step": 500, "time_per_iteration": 2.900810956954956 }, { "auxiliary_loss_clip": 0.01241829, "auxiliary_loss_mlp": 0.00782221, "balance_loss_clip": 1.07700419, "balance_loss_mlp": 1.00042641, "epoch": 0.06024168821018457, "flos": 22783345729920.0, "grad_norm": 1.8874059808780104, "language_loss": 0.90758932, "learning_rate": 3.9904504075216295e-06, "loss": 0.92782986, "num_input_tokens_seen": 10788760, "step": 501, "time_per_iteration": 2.7933268547058105 }, { "auxiliary_loss_clip": 0.01233038, "auxiliary_loss_mlp": 0.01077535, "balance_loss_clip": 1.0692091, "balance_loss_mlp": 1.04305995, "epoch": 0.06036193110082366, "flos": 18770687637120.0, "grad_norm": 2.3542549799359747, "language_loss": 0.93891406, "learning_rate": 3.990374224790501e-06, "loss": 0.9620198, "num_input_tokens_seen": 10806965, "step": 502, "time_per_iteration": 2.8275609016418457 }, { "auxiliary_loss_clip": 0.0124919, "auxiliary_loss_mlp": 0.01084957, "balance_loss_clip": 1.07779026, "balance_loss_mlp": 1.05126834, "epoch": 0.06048217399146275, "flos": 17201570935680.0, "grad_norm": 2.630997853867219, "language_loss": 0.70700562, "learning_rate": 3.990297740121185e-06, "loss": 0.73034716, "num_input_tokens_seen": 10824900, "step": 503, "time_per_iteration": 2.7149994373321533 }, { "auxiliary_loss_clip": 0.01270046, "auxiliary_loss_mlp": 0.00783487, "balance_loss_clip": 1.07918215, "balance_loss_mlp": 1.00041676, "epoch": 0.06060241688210185, "flos": 24024131187840.0, "grad_norm": 1.7371566799422484, "language_loss": 0.77922434, "learning_rate": 3.990220953525284e-06, "loss": 0.79975975, "num_input_tokens_seen": 10842010, "step": 504, "time_per_iteration": 2.7923150062561035 }, { "auxiliary_loss_clip": 0.01218094, "auxiliary_loss_mlp": 0.01086369, "balance_loss_clip": 1.05815315, "balance_loss_mlp": 1.05148828, "epoch": 0.06072265977274094, "flos": 14611190745600.0, "grad_norm": 7.1222975397119805, "language_loss": 0.74272925, "learning_rate": 3.9901438650144465e-06, "loss": 0.76577383, "num_input_tokens_seen": 10858260, "step": 505, "time_per_iteration": 2.8663833141326904 }, { "auxiliary_loss_clip": 0.01233407, "auxiliary_loss_mlp": 0.01075537, "balance_loss_clip": 1.06046438, "balance_loss_mlp": 1.04277813, "epoch": 0.06084290266338003, "flos": 20558284813440.0, "grad_norm": 2.49222187690208, "language_loss": 0.91725159, "learning_rate": 3.990066474600367e-06, "loss": 0.94034106, "num_input_tokens_seen": 10876230, "step": 506, "time_per_iteration": 2.856203556060791 }, { "auxiliary_loss_clip": 0.01246353, "auxiliary_loss_mlp": 0.01077834, "balance_loss_clip": 1.07206392, "balance_loss_mlp": 1.04114163, "epoch": 0.06096314555401912, "flos": 22309217182080.0, "grad_norm": 1.9399865061374313, "language_loss": 0.68192202, "learning_rate": 3.989988782294786e-06, "loss": 0.7051639, "num_input_tokens_seen": 10896320, "step": 507, "time_per_iteration": 2.7849180698394775 }, { "auxiliary_loss_clip": 0.01204978, "auxiliary_loss_mlp": 0.01081368, "balance_loss_clip": 1.06552815, "balance_loss_mlp": 1.04438913, "epoch": 0.06108338844465821, "flos": 19131374056320.0, "grad_norm": 1.727981740197765, "language_loss": 0.95034158, "learning_rate": 3.989910788109489e-06, "loss": 0.97320503, "num_input_tokens_seen": 10912970, "step": 508, "time_per_iteration": 2.8502464294433594 }, { "auxiliary_loss_clip": 0.01228308, "auxiliary_loss_mlp": 0.01075832, "balance_loss_clip": 1.07013631, "balance_loss_mlp": 1.04219091, "epoch": 0.0612036313352973, "flos": 33584018169600.0, "grad_norm": 2.4926942646170103, "language_loss": 0.74536669, "learning_rate": 3.989832492056307e-06, "loss": 0.76840806, "num_input_tokens_seen": 10933995, "step": 509, "time_per_iteration": 2.96783709526062 }, { "auxiliary_loss_clip": 0.01259233, "auxiliary_loss_mlp": 0.01079115, "balance_loss_clip": 1.07581794, "balance_loss_mlp": 1.04440141, "epoch": 0.06132387422593639, "flos": 27490552179840.0, "grad_norm": 3.225904999581253, "language_loss": 0.80730444, "learning_rate": 3.989753894147119e-06, "loss": 0.83068788, "num_input_tokens_seen": 10954120, "step": 510, "time_per_iteration": 2.812401294708252 }, { "auxiliary_loss_clip": 0.01259187, "auxiliary_loss_mlp": 0.01093461, "balance_loss_clip": 1.08116496, "balance_loss_mlp": 1.05977201, "epoch": 0.061444117116575485, "flos": 25885057979520.0, "grad_norm": 1.8831989698203473, "language_loss": 0.79944867, "learning_rate": 3.989674994393846e-06, "loss": 0.82297516, "num_input_tokens_seen": 10973595, "step": 511, "time_per_iteration": 3.7527074813842773 }, { "auxiliary_loss_clip": 0.0125359, "auxiliary_loss_mlp": 0.0106585, "balance_loss_clip": 1.07615781, "balance_loss_mlp": 1.03189921, "epoch": 0.061564360007214575, "flos": 28512031150080.0, "grad_norm": 2.101252153173221, "language_loss": 0.93814963, "learning_rate": 3.98959579280846e-06, "loss": 0.96134406, "num_input_tokens_seen": 10991995, "step": 512, "time_per_iteration": 3.7558202743530273 }, { "auxiliary_loss_clip": 0.01196301, "auxiliary_loss_mlp": 0.01072552, "balance_loss_clip": 1.07390606, "balance_loss_mlp": 1.04046082, "epoch": 0.061684602897853665, "flos": 12094355652480.0, "grad_norm": 2.2502571046429467, "language_loss": 0.82918829, "learning_rate": 3.989516289402973e-06, "loss": 0.85187685, "num_input_tokens_seen": 11007625, "step": 513, "time_per_iteration": 2.7352631092071533 }, { "auxiliary_loss_clip": 0.01155711, "auxiliary_loss_mlp": 0.01074062, "balance_loss_clip": 1.05024409, "balance_loss_mlp": 1.0380131, "epoch": 0.061804845788492754, "flos": 19532639865600.0, "grad_norm": 2.1685992111251022, "language_loss": 0.80068356, "learning_rate": 3.989436484189447e-06, "loss": 0.82298124, "num_input_tokens_seen": 11025570, "step": 514, "time_per_iteration": 2.823718786239624 }, { "auxiliary_loss_clip": 0.01261104, "auxiliary_loss_mlp": 0.0108604, "balance_loss_clip": 1.07862639, "balance_loss_mlp": 1.05151749, "epoch": 0.061925088679131844, "flos": 15341111020800.0, "grad_norm": 2.530860338451024, "language_loss": 0.80460405, "learning_rate": 3.9893563771799885e-06, "loss": 0.82807553, "num_input_tokens_seen": 11042045, "step": 515, "time_per_iteration": 3.911238670349121 }, { "auxiliary_loss_clip": 0.01279918, "auxiliary_loss_mlp": 0.01074596, "balance_loss_clip": 1.08195066, "balance_loss_mlp": 1.04074097, "epoch": 0.062045331569770934, "flos": 25919927107200.0, "grad_norm": 2.0960356037797614, "language_loss": 0.86254334, "learning_rate": 3.989275968386749e-06, "loss": 0.88608849, "num_input_tokens_seen": 11059955, "step": 516, "time_per_iteration": 3.6263015270233154 }, { "auxiliary_loss_clip": 0.01228063, "auxiliary_loss_mlp": 0.01074511, "balance_loss_clip": 1.06903148, "balance_loss_mlp": 1.03870034, "epoch": 0.06216557446041003, "flos": 28110621686400.0, "grad_norm": 2.062472227263757, "language_loss": 0.76745355, "learning_rate": 3.989195257821926e-06, "loss": 0.7904793, "num_input_tokens_seen": 11078440, "step": 517, "time_per_iteration": 2.815434217453003 }, { "auxiliary_loss_clip": 0.01241743, "auxiliary_loss_mlp": 0.01077311, "balance_loss_clip": 1.07392061, "balance_loss_mlp": 1.04207242, "epoch": 0.06228581735104912, "flos": 23478181395840.0, "grad_norm": 3.0446991011203317, "language_loss": 0.84244549, "learning_rate": 3.989114245497765e-06, "loss": 0.86563599, "num_input_tokens_seen": 11098240, "step": 518, "time_per_iteration": 2.840923547744751 }, { "auxiliary_loss_clip": 0.01258181, "auxiliary_loss_mlp": 0.01072047, "balance_loss_clip": 1.07371294, "balance_loss_mlp": 1.03809667, "epoch": 0.06240606024168821, "flos": 15195205975680.0, "grad_norm": 2.506014658074946, "language_loss": 0.9499985, "learning_rate": 3.989032931426554e-06, "loss": 0.97330076, "num_input_tokens_seen": 11115395, "step": 519, "time_per_iteration": 2.6984007358551025 }, { "auxiliary_loss_clip": 0.01244243, "auxiliary_loss_mlp": 0.01076403, "balance_loss_clip": 1.07435894, "balance_loss_mlp": 1.04304826, "epoch": 0.06252630313232731, "flos": 20631829910400.0, "grad_norm": 2.0781404155070904, "language_loss": 0.8661896, "learning_rate": 3.9889513156206295e-06, "loss": 0.88939607, "num_input_tokens_seen": 11134835, "step": 520, "time_per_iteration": 2.715111494064331 }, { "auxiliary_loss_clip": 0.01232823, "auxiliary_loss_mlp": 0.01078673, "balance_loss_clip": 1.07437396, "balance_loss_mlp": 1.04417431, "epoch": 0.06264654602296639, "flos": 20778058177920.0, "grad_norm": 3.3044766878265444, "language_loss": 0.731592, "learning_rate": 3.988869398092371e-06, "loss": 0.75470698, "num_input_tokens_seen": 11154745, "step": 521, "time_per_iteration": 2.7612130641937256 }, { "auxiliary_loss_clip": 0.0124628, "auxiliary_loss_mlp": 0.01075655, "balance_loss_clip": 1.07560349, "balance_loss_mlp": 1.03965342, "epoch": 0.06276678891360549, "flos": 29605798241280.0, "grad_norm": 2.4246116631892027, "language_loss": 0.78349161, "learning_rate": 3.988787178854206e-06, "loss": 0.80671096, "num_input_tokens_seen": 11174280, "step": 522, "time_per_iteration": 2.7685344219207764 }, { "auxiliary_loss_clip": 0.01272986, "auxiliary_loss_mlp": 0.01070477, "balance_loss_clip": 1.07841551, "balance_loss_mlp": 1.03483379, "epoch": 0.06288703180424457, "flos": 22126288193280.0, "grad_norm": 2.5694762797967847, "language_loss": 0.87149137, "learning_rate": 3.988704657918608e-06, "loss": 0.89492601, "num_input_tokens_seen": 11193340, "step": 523, "time_per_iteration": 2.7366573810577393 }, { "auxiliary_loss_clip": 0.01261189, "auxiliary_loss_mlp": 0.01075965, "balance_loss_clip": 1.07935858, "balance_loss_mlp": 1.04394555, "epoch": 0.06300727469488367, "flos": 14976689587200.0, "grad_norm": 2.578370788757501, "language_loss": 0.79517388, "learning_rate": 3.988621835298094e-06, "loss": 0.81854546, "num_input_tokens_seen": 11210555, "step": 524, "time_per_iteration": 2.800938844680786 }, { "auxiliary_loss_clip": 0.01269817, "auxiliary_loss_mlp": 0.01073753, "balance_loss_clip": 1.07828474, "balance_loss_mlp": 1.04089916, "epoch": 0.06312751758552275, "flos": 24535391420160.0, "grad_norm": 2.10676467856321, "language_loss": 0.91679066, "learning_rate": 3.988538711005229e-06, "loss": 0.94022632, "num_input_tokens_seen": 11230010, "step": 525, "time_per_iteration": 2.914144277572632 }, { "auxiliary_loss_clip": 0.01251998, "auxiliary_loss_mlp": 0.01073022, "balance_loss_clip": 1.07698584, "balance_loss_mlp": 1.0405736, "epoch": 0.06324776047616185, "flos": 21507008785920.0, "grad_norm": 2.68955387112967, "language_loss": 0.8831709, "learning_rate": 3.988455285052622e-06, "loss": 0.90642112, "num_input_tokens_seen": 11246190, "step": 526, "time_per_iteration": 2.715452194213867 }, { "auxiliary_loss_clip": 0.01249419, "auxiliary_loss_mlp": 0.01071171, "balance_loss_clip": 1.07769096, "balance_loss_mlp": 1.03948545, "epoch": 0.06336800336680094, "flos": 21688034353920.0, "grad_norm": 2.500230902585409, "language_loss": 0.83848083, "learning_rate": 3.98837155745293e-06, "loss": 0.86168671, "num_input_tokens_seen": 11264230, "step": 527, "time_per_iteration": 2.7703654766082764 }, { "auxiliary_loss_clip": 0.0125702, "auxiliary_loss_mlp": 0.01068294, "balance_loss_clip": 1.07718444, "balance_loss_mlp": 1.0350821, "epoch": 0.06348824625744003, "flos": 19500895221120.0, "grad_norm": 2.196595611368093, "language_loss": 0.76116753, "learning_rate": 3.988287528218854e-06, "loss": 0.78442073, "num_input_tokens_seen": 11283015, "step": 528, "time_per_iteration": 2.7487258911132812 }, { "auxiliary_loss_clip": 0.01258171, "auxiliary_loss_mlp": 0.01080759, "balance_loss_clip": 1.08166695, "balance_loss_mlp": 1.04857218, "epoch": 0.06360848914807912, "flos": 15481233976320.0, "grad_norm": 2.295327090323836, "language_loss": 0.90643501, "learning_rate": 3.98820319736314e-06, "loss": 0.92982429, "num_input_tokens_seen": 11299630, "step": 529, "time_per_iteration": 2.7198243141174316 }, { "auxiliary_loss_clip": 0.01230232, "auxiliary_loss_mlp": 0.01068467, "balance_loss_clip": 1.07211685, "balance_loss_mlp": 1.03680539, "epoch": 0.0637287320387182, "flos": 20593369422720.0, "grad_norm": 2.0558707577749202, "language_loss": 0.85524201, "learning_rate": 3.988118564898582e-06, "loss": 0.87822902, "num_input_tokens_seen": 11319170, "step": 530, "time_per_iteration": 2.7065114974975586 }, { "auxiliary_loss_clip": 0.01226114, "auxiliary_loss_mlp": 0.00785298, "balance_loss_clip": 1.07842541, "balance_loss_mlp": 1.00029564, "epoch": 0.0638489749293573, "flos": 17412222245760.0, "grad_norm": 2.7570638494244157, "language_loss": 0.89368534, "learning_rate": 3.988033630838019e-06, "loss": 0.91379946, "num_input_tokens_seen": 11333210, "step": 531, "time_per_iteration": 2.6087331771850586 }, { "auxiliary_loss_clip": 0.0125593, "auxiliary_loss_mlp": 0.0108099, "balance_loss_clip": 1.08063829, "balance_loss_mlp": 1.0483501, "epoch": 0.0639692178199964, "flos": 23807661874560.0, "grad_norm": 3.634382670214176, "language_loss": 0.88217807, "learning_rate": 3.987948395194334e-06, "loss": 0.90554726, "num_input_tokens_seen": 11355590, "step": 532, "time_per_iteration": 2.7048158645629883 }, { "auxiliary_loss_clip": 0.01240329, "auxiliary_loss_mlp": 0.01078058, "balance_loss_clip": 1.07028091, "balance_loss_mlp": 1.04484677, "epoch": 0.06408946071063548, "flos": 18477225521280.0, "grad_norm": 2.3193519203217563, "language_loss": 0.769786, "learning_rate": 3.987862857980458e-06, "loss": 0.79296982, "num_input_tokens_seen": 11371535, "step": 533, "time_per_iteration": 2.7571356296539307 }, { "auxiliary_loss_clip": 0.01238847, "auxiliary_loss_mlp": 0.01078432, "balance_loss_clip": 1.07941771, "balance_loss_mlp": 1.04593563, "epoch": 0.06420970360127458, "flos": 27162220936320.0, "grad_norm": 2.5218461315169347, "language_loss": 0.76867282, "learning_rate": 3.987777019209368e-06, "loss": 0.79184568, "num_input_tokens_seen": 11392050, "step": 534, "time_per_iteration": 2.8943514823913574 }, { "auxiliary_loss_clip": 0.01276173, "auxiliary_loss_mlp": 0.01086389, "balance_loss_clip": 1.08347297, "balance_loss_mlp": 1.05210447, "epoch": 0.06432994649191366, "flos": 23659673840640.0, "grad_norm": 1.7680400639928973, "language_loss": 0.81202555, "learning_rate": 3.987690878894084e-06, "loss": 0.83565116, "num_input_tokens_seen": 11411765, "step": 535, "time_per_iteration": 2.793557643890381 }, { "auxiliary_loss_clip": 0.01245579, "auxiliary_loss_mlp": 0.01075722, "balance_loss_clip": 1.07555366, "balance_loss_mlp": 1.03957748, "epoch": 0.06445018938255276, "flos": 23403953940480.0, "grad_norm": 2.3581078862809006, "language_loss": 0.85086346, "learning_rate": 3.987604437047673e-06, "loss": 0.87407643, "num_input_tokens_seen": 11431565, "step": 536, "time_per_iteration": 3.731388807296753 }, { "auxiliary_loss_clip": 0.01255084, "auxiliary_loss_mlp": 0.01084098, "balance_loss_clip": 1.07860088, "balance_loss_mlp": 1.05091, "epoch": 0.06457043227319184, "flos": 19646692525440.0, "grad_norm": 2.1596039686680113, "language_loss": 0.7765094, "learning_rate": 3.987517693683251e-06, "loss": 0.79990125, "num_input_tokens_seen": 11450140, "step": 537, "time_per_iteration": 2.7380316257476807 }, { "auxiliary_loss_clip": 0.01231341, "auxiliary_loss_mlp": 0.0109206, "balance_loss_clip": 1.07118416, "balance_loss_mlp": 1.05994487, "epoch": 0.06469067516383094, "flos": 16978744915200.0, "grad_norm": 2.4674839093498644, "language_loss": 0.96190745, "learning_rate": 3.9874306488139745e-06, "loss": 0.98514152, "num_input_tokens_seen": 11465400, "step": 538, "time_per_iteration": 3.660281181335449 }, { "auxiliary_loss_clip": 0.01227753, "auxiliary_loss_mlp": 0.01083159, "balance_loss_clip": 1.07588029, "balance_loss_mlp": 1.04756308, "epoch": 0.06481091805447003, "flos": 23296401642240.0, "grad_norm": 2.251525065068862, "language_loss": 0.87793064, "learning_rate": 3.987343302453049e-06, "loss": 0.90103972, "num_input_tokens_seen": 11486675, "step": 539, "time_per_iteration": 2.775897264480591 }, { "auxiliary_loss_clip": 0.01241047, "auxiliary_loss_mlp": 0.01080967, "balance_loss_clip": 1.07722259, "balance_loss_mlp": 1.05061579, "epoch": 0.06493116094510912, "flos": 29172356824320.0, "grad_norm": 1.7327157172568888, "language_loss": 0.82516956, "learning_rate": 3.987255654613724e-06, "loss": 0.84838963, "num_input_tokens_seen": 11510440, "step": 540, "time_per_iteration": 2.857105255126953 }, { "auxiliary_loss_clip": 0.01223567, "auxiliary_loss_mlp": 0.01090206, "balance_loss_clip": 1.0698359, "balance_loss_mlp": 1.05759072, "epoch": 0.06505140383574821, "flos": 19865065259520.0, "grad_norm": 2.26609714240894, "language_loss": 0.7037611, "learning_rate": 3.987167705309296e-06, "loss": 0.72689891, "num_input_tokens_seen": 11529715, "step": 541, "time_per_iteration": 5.04986310005188 }, { "auxiliary_loss_clip": 0.01257321, "auxiliary_loss_mlp": 0.00782403, "balance_loss_clip": 1.07767689, "balance_loss_mlp": 1.00030863, "epoch": 0.0651716467263873, "flos": 17924703540480.0, "grad_norm": 2.2811914363970764, "language_loss": 0.95299864, "learning_rate": 3.987079454553108e-06, "loss": 0.97339594, "num_input_tokens_seen": 11547665, "step": 542, "time_per_iteration": 2.7574126720428467 }, { "auxiliary_loss_clip": 0.01227066, "auxiliary_loss_mlp": 0.01074118, "balance_loss_clip": 1.07729912, "balance_loss_mlp": 1.04038143, "epoch": 0.0652918896170264, "flos": 20842840356480.0, "grad_norm": 1.9495785885776415, "language_loss": 0.90875721, "learning_rate": 3.986990902358546e-06, "loss": 0.93176907, "num_input_tokens_seen": 11564605, "step": 543, "time_per_iteration": 2.7923481464385986 }, { "auxiliary_loss_clip": 0.01254451, "auxiliary_loss_mlp": 0.0106689, "balance_loss_clip": 1.07451952, "balance_loss_mlp": 1.03234279, "epoch": 0.06541213250766549, "flos": 21872507627520.0, "grad_norm": 2.791578854161934, "language_loss": 0.93322593, "learning_rate": 3.986902048739045e-06, "loss": 0.95643932, "num_input_tokens_seen": 11584550, "step": 544, "time_per_iteration": 2.6839637756347656 }, { "auxiliary_loss_clip": 0.01244264, "auxiliary_loss_mlp": 0.01085252, "balance_loss_clip": 1.0764724, "balance_loss_mlp": 1.05184996, "epoch": 0.06553237539830457, "flos": 23110743219840.0, "grad_norm": 2.9053438305109913, "language_loss": 0.80218858, "learning_rate": 3.986812893708082e-06, "loss": 0.82548374, "num_input_tokens_seen": 11600740, "step": 545, "time_per_iteration": 2.73701810836792 }, { "auxiliary_loss_clip": 0.01244121, "auxiliary_loss_mlp": 0.01075925, "balance_loss_clip": 1.07466674, "balance_loss_mlp": 1.04145002, "epoch": 0.06565261828894367, "flos": 17923769786880.0, "grad_norm": 2.0816731713332888, "language_loss": 0.81266296, "learning_rate": 3.9867234372791826e-06, "loss": 0.83586335, "num_input_tokens_seen": 11618695, "step": 546, "time_per_iteration": 2.7047529220581055 }, { "auxiliary_loss_clip": 0.01249889, "auxiliary_loss_mlp": 0.01072156, "balance_loss_clip": 1.07463193, "balance_loss_mlp": 1.03901553, "epoch": 0.06577286117958275, "flos": 22783058421120.0, "grad_norm": 1.5728430450810387, "language_loss": 0.87171912, "learning_rate": 3.986633679465918e-06, "loss": 0.8949396, "num_input_tokens_seen": 11638850, "step": 547, "time_per_iteration": 2.6650869846343994 }, { "auxiliary_loss_clip": 0.01217989, "auxiliary_loss_mlp": 0.01072397, "balance_loss_clip": 1.07200873, "balance_loss_mlp": 1.03928089, "epoch": 0.06589310407022185, "flos": 23696194993920.0, "grad_norm": 4.497480877385218, "language_loss": 0.80742085, "learning_rate": 3.986543620281904e-06, "loss": 0.83032477, "num_input_tokens_seen": 11658500, "step": 548, "time_per_iteration": 2.8981680870056152 }, { "auxiliary_loss_clip": 0.01219879, "auxiliary_loss_mlp": 0.01079444, "balance_loss_clip": 1.07046318, "balance_loss_mlp": 1.04763865, "epoch": 0.06601334696086093, "flos": 26864772410880.0, "grad_norm": 1.713625203780731, "language_loss": 0.91023505, "learning_rate": 3.986453259740802e-06, "loss": 0.93322831, "num_input_tokens_seen": 11676670, "step": 549, "time_per_iteration": 2.721559524536133 }, { "auxiliary_loss_clip": 0.01245818, "auxiliary_loss_mlp": 0.01080244, "balance_loss_clip": 1.07854176, "balance_loss_mlp": 1.04872489, "epoch": 0.06613358985150003, "flos": 12567694101120.0, "grad_norm": 2.767422038591748, "language_loss": 0.78836936, "learning_rate": 3.986362597856319e-06, "loss": 0.81163001, "num_input_tokens_seen": 11693170, "step": 550, "time_per_iteration": 2.681151866912842 }, { "auxiliary_loss_clip": 0.0123528, "auxiliary_loss_mlp": 0.00783163, "balance_loss_clip": 1.07398224, "balance_loss_mlp": 1.00020766, "epoch": 0.06625383274213913, "flos": 18332505624960.0, "grad_norm": 3.1801276602583846, "language_loss": 0.81801069, "learning_rate": 3.986271634642211e-06, "loss": 0.83819515, "num_input_tokens_seen": 11710150, "step": 551, "time_per_iteration": 2.717451333999634 }, { "auxiliary_loss_clip": 0.01270031, "auxiliary_loss_mlp": 0.01070804, "balance_loss_clip": 1.07998049, "balance_loss_mlp": 1.03961849, "epoch": 0.06637407563277821, "flos": 15375585098880.0, "grad_norm": 1.9715024427949608, "language_loss": 0.81375539, "learning_rate": 3.986180370112274e-06, "loss": 0.83716369, "num_input_tokens_seen": 11726670, "step": 552, "time_per_iteration": 2.5541460514068604 }, { "auxiliary_loss_clip": 0.01254346, "auxiliary_loss_mlp": 0.00783776, "balance_loss_clip": 1.0748992, "balance_loss_mlp": 1.00024891, "epoch": 0.0664943185234173, "flos": 24025244509440.0, "grad_norm": 1.7605685659100634, "language_loss": 0.74466121, "learning_rate": 3.986088804280354e-06, "loss": 0.76504242, "num_input_tokens_seen": 11746400, "step": 553, "time_per_iteration": 2.747239112854004 }, { "auxiliary_loss_clip": 0.01239564, "auxiliary_loss_mlp": 0.01077433, "balance_loss_clip": 1.07264328, "balance_loss_mlp": 1.04469764, "epoch": 0.06661456141405639, "flos": 20957503547520.0, "grad_norm": 2.603364261060684, "language_loss": 0.93755245, "learning_rate": 3.985996937160342e-06, "loss": 0.96072239, "num_input_tokens_seen": 11765590, "step": 554, "time_per_iteration": 2.7029316425323486 }, { "auxiliary_loss_clip": 0.01256577, "auxiliary_loss_mlp": 0.01085629, "balance_loss_clip": 1.0838443, "balance_loss_mlp": 1.05318046, "epoch": 0.06673480430469549, "flos": 52223953322880.0, "grad_norm": 2.018575214994967, "language_loss": 0.68968421, "learning_rate": 3.985904768766173e-06, "loss": 0.71310627, "num_input_tokens_seen": 11788365, "step": 555, "time_per_iteration": 2.9402427673339844 }, { "auxiliary_loss_clip": 0.01227483, "auxiliary_loss_mlp": 0.01078139, "balance_loss_clip": 1.07506609, "balance_loss_mlp": 1.04216123, "epoch": 0.06685504719533458, "flos": 16217079995520.0, "grad_norm": 2.555309497949713, "language_loss": 0.76288605, "learning_rate": 3.98581229911183e-06, "loss": 0.78594232, "num_input_tokens_seen": 11807285, "step": 556, "time_per_iteration": 2.6455485820770264 }, { "auxiliary_loss_clip": 0.01254399, "auxiliary_loss_mlp": 0.01074846, "balance_loss_clip": 1.07218301, "balance_loss_mlp": 1.0413245, "epoch": 0.06697529008597367, "flos": 22491535639680.0, "grad_norm": 1.9804515923202575, "language_loss": 0.91980839, "learning_rate": 3.985719528211341e-06, "loss": 0.94310087, "num_input_tokens_seen": 11826655, "step": 557, "time_per_iteration": 2.654383659362793 }, { "auxiliary_loss_clip": 0.01119199, "auxiliary_loss_mlp": 0.01016573, "balance_loss_clip": 1.04789042, "balance_loss_mlp": 1.0067029, "epoch": 0.06709553297661276, "flos": 62688216936960.0, "grad_norm": 0.8437003884598827, "language_loss": 0.63013637, "learning_rate": 3.985626456078777e-06, "loss": 0.65149409, "num_input_tokens_seen": 11891310, "step": 558, "time_per_iteration": 3.3199026584625244 }, { "auxiliary_loss_clip": 0.01229028, "auxiliary_loss_mlp": 0.01074305, "balance_loss_clip": 1.07471538, "balance_loss_mlp": 1.0398773, "epoch": 0.06721577586725185, "flos": 11216590997760.0, "grad_norm": 3.091368575056358, "language_loss": 0.85884804, "learning_rate": 3.985533082728259e-06, "loss": 0.88188136, "num_input_tokens_seen": 11906965, "step": 559, "time_per_iteration": 2.6294758319854736 }, { "auxiliary_loss_clip": 0.01270328, "auxiliary_loss_mlp": 0.01076623, "balance_loss_clip": 1.07651329, "balance_loss_mlp": 1.0425055, "epoch": 0.06733601875789094, "flos": 25922189664000.0, "grad_norm": 1.9312152246300198, "language_loss": 0.74864841, "learning_rate": 3.985439408173951e-06, "loss": 0.77211791, "num_input_tokens_seen": 11927190, "step": 560, "time_per_iteration": 2.6737725734710693 }, { "auxiliary_loss_clip": 0.01276191, "auxiliary_loss_mlp": 0.01088554, "balance_loss_clip": 1.07926154, "balance_loss_mlp": 1.05541372, "epoch": 0.06745626164853002, "flos": 20813645577600.0, "grad_norm": 2.5575211682645644, "language_loss": 0.70520324, "learning_rate": 3.9853454324300634e-06, "loss": 0.72885066, "num_input_tokens_seen": 11946400, "step": 561, "time_per_iteration": 2.596156358718872 }, { "auxiliary_loss_clip": 0.01197483, "auxiliary_loss_mlp": 0.01077607, "balance_loss_clip": 1.06842124, "balance_loss_mlp": 1.04496717, "epoch": 0.06757650453916912, "flos": 19829262378240.0, "grad_norm": 3.693785804831592, "language_loss": 0.78051519, "learning_rate": 3.985251155510852e-06, "loss": 0.80326611, "num_input_tokens_seen": 11965430, "step": 562, "time_per_iteration": 2.7968404293060303 }, { "auxiliary_loss_clip": 0.01215706, "auxiliary_loss_mlp": 0.01080508, "balance_loss_clip": 1.0742209, "balance_loss_mlp": 1.04646146, "epoch": 0.06769674742980822, "flos": 25739224761600.0, "grad_norm": 1.8447686628153959, "language_loss": 0.80373776, "learning_rate": 3.98515657743062e-06, "loss": 0.82669985, "num_input_tokens_seen": 11984895, "step": 563, "time_per_iteration": 3.704972743988037 }, { "auxiliary_loss_clip": 0.01236473, "auxiliary_loss_mlp": 0.01076759, "balance_loss_clip": 1.07140815, "balance_loss_mlp": 1.04235506, "epoch": 0.0678169903204473, "flos": 13074788355840.0, "grad_norm": 2.0156741624095824, "language_loss": 0.77768946, "learning_rate": 3.985061698203711e-06, "loss": 0.80082178, "num_input_tokens_seen": 12002010, "step": 564, "time_per_iteration": 3.5584115982055664 }, { "auxiliary_loss_clip": 0.01133554, "auxiliary_loss_mlp": 0.0100744, "balance_loss_clip": 1.0486604, "balance_loss_mlp": 0.99966735, "epoch": 0.0679372332110864, "flos": 70865830788480.0, "grad_norm": 0.8901066120490269, "language_loss": 0.63829362, "learning_rate": 3.984966517844523e-06, "loss": 0.65970355, "num_input_tokens_seen": 12057255, "step": 565, "time_per_iteration": 3.148895740509033 }, { "auxiliary_loss_clip": 0.01270099, "auxiliary_loss_mlp": 0.01068103, "balance_loss_clip": 1.07654369, "balance_loss_mlp": 1.03348422, "epoch": 0.06805747610172548, "flos": 28256418990720.0, "grad_norm": 2.218875279544528, "language_loss": 0.80563438, "learning_rate": 3.984871036367492e-06, "loss": 0.82901645, "num_input_tokens_seen": 12077280, "step": 566, "time_per_iteration": 2.6504900455474854 }, { "auxiliary_loss_clip": 0.01253483, "auxiliary_loss_mlp": 0.00782846, "balance_loss_clip": 1.08021641, "balance_loss_mlp": 1.00050426, "epoch": 0.06817771899236458, "flos": 20120533764480.0, "grad_norm": 2.342612004594897, "language_loss": 0.83132851, "learning_rate": 3.984775253787102e-06, "loss": 0.85169184, "num_input_tokens_seen": 12095570, "step": 567, "time_per_iteration": 4.590600967407227 }, { "auxiliary_loss_clip": 0.0125862, "auxiliary_loss_mlp": 0.01073138, "balance_loss_clip": 1.0770421, "balance_loss_mlp": 1.03880584, "epoch": 0.06829796188300366, "flos": 17930629284480.0, "grad_norm": 2.6854625923121485, "language_loss": 0.87830603, "learning_rate": 3.984679170117885e-06, "loss": 0.90162361, "num_input_tokens_seen": 12111775, "step": 568, "time_per_iteration": 2.610369920730591 }, { "auxiliary_loss_clip": 0.01253824, "auxiliary_loss_mlp": 0.01073787, "balance_loss_clip": 1.07600391, "balance_loss_mlp": 1.04021788, "epoch": 0.06841820477364276, "flos": 14501627285760.0, "grad_norm": 2.239593987980847, "language_loss": 0.78314024, "learning_rate": 3.984582785374415e-06, "loss": 0.80641639, "num_input_tokens_seen": 12129215, "step": 569, "time_per_iteration": 2.6130828857421875 }, { "auxiliary_loss_clip": 0.01243052, "auxiliary_loss_mlp": 0.00782871, "balance_loss_clip": 1.07814598, "balance_loss_mlp": 1.00022578, "epoch": 0.06853844766428185, "flos": 21938474954880.0, "grad_norm": 2.0098684519195924, "language_loss": 0.80443639, "learning_rate": 3.9844860995713155e-06, "loss": 0.82469565, "num_input_tokens_seen": 12148755, "step": 570, "time_per_iteration": 2.699143409729004 }, { "auxiliary_loss_clip": 0.01245561, "auxiliary_loss_mlp": 0.01085367, "balance_loss_clip": 1.07887197, "balance_loss_mlp": 1.05556464, "epoch": 0.06865869055492094, "flos": 16800628348800.0, "grad_norm": 2.6747105203452266, "language_loss": 0.82849509, "learning_rate": 3.9843891127232524e-06, "loss": 0.85180432, "num_input_tokens_seen": 12166290, "step": 571, "time_per_iteration": 2.5923032760620117 }, { "auxiliary_loss_clip": 0.01196606, "auxiliary_loss_mlp": 0.01088916, "balance_loss_clip": 1.06668341, "balance_loss_mlp": 1.05241442, "epoch": 0.06877893344556003, "flos": 19937281553280.0, "grad_norm": 2.611449458478092, "language_loss": 0.66858017, "learning_rate": 3.984291824844938e-06, "loss": 0.69143546, "num_input_tokens_seen": 12181385, "step": 572, "time_per_iteration": 2.726295232772827 }, { "auxiliary_loss_clip": 0.01269617, "auxiliary_loss_mlp": 0.01071161, "balance_loss_clip": 1.07644022, "balance_loss_mlp": 1.03930783, "epoch": 0.06889917633619912, "flos": 23039388852480.0, "grad_norm": 2.2626935996846345, "language_loss": 0.84638953, "learning_rate": 3.984194235951132e-06, "loss": 0.86979735, "num_input_tokens_seen": 12197530, "step": 573, "time_per_iteration": 2.5904476642608643 }, { "auxiliary_loss_clip": 0.01274705, "auxiliary_loss_mlp": 0.01088279, "balance_loss_clip": 1.08493233, "balance_loss_mlp": 1.05358934, "epoch": 0.06901941922683821, "flos": 20960556203520.0, "grad_norm": 2.3784206817782545, "language_loss": 0.84735215, "learning_rate": 3.9840963460566375e-06, "loss": 0.87098193, "num_input_tokens_seen": 12216310, "step": 574, "time_per_iteration": 2.6039528846740723 }, { "auxiliary_loss_clip": 0.01173643, "auxiliary_loss_mlp": 0.01076972, "balance_loss_clip": 1.06520998, "balance_loss_mlp": 1.04495203, "epoch": 0.06913966211747731, "flos": 24821850384000.0, "grad_norm": 1.6605139198480774, "language_loss": 0.89581239, "learning_rate": 3.983998155176305e-06, "loss": 0.91831857, "num_input_tokens_seen": 12236670, "step": 575, "time_per_iteration": 2.782470226287842 }, { "auxiliary_loss_clip": 0.01128448, "auxiliary_loss_mlp": 0.01010381, "balance_loss_clip": 1.04438972, "balance_loss_mlp": 1.00294209, "epoch": 0.06925990500811639, "flos": 58367446957440.0, "grad_norm": 0.9183623144762058, "language_loss": 0.57042444, "learning_rate": 3.9838996633250305e-06, "loss": 0.59181273, "num_input_tokens_seen": 12297185, "step": 576, "time_per_iteration": 3.1050302982330322 }, { "auxiliary_loss_clip": 0.01246455, "auxiliary_loss_mlp": 0.01081577, "balance_loss_clip": 1.07198131, "balance_loss_mlp": 1.04893756, "epoch": 0.06938014789875549, "flos": 12749940731520.0, "grad_norm": 2.3421544679990904, "language_loss": 0.88006097, "learning_rate": 3.983800870517753e-06, "loss": 0.90334129, "num_input_tokens_seen": 12313975, "step": 577, "time_per_iteration": 2.596203565597534 }, { "auxiliary_loss_clip": 0.01246499, "auxiliary_loss_mlp": 0.01073952, "balance_loss_clip": 1.07748556, "balance_loss_mlp": 1.04221845, "epoch": 0.06950039078939457, "flos": 22820226019200.0, "grad_norm": 4.116763161457316, "language_loss": 0.78128874, "learning_rate": 3.983701776769463e-06, "loss": 0.80449319, "num_input_tokens_seen": 12331385, "step": 578, "time_per_iteration": 2.6408040523529053 }, { "auxiliary_loss_clip": 0.01219859, "auxiliary_loss_mlp": 0.01074952, "balance_loss_clip": 1.06309104, "balance_loss_mlp": 1.0417645, "epoch": 0.06962063368003367, "flos": 21941348042880.0, "grad_norm": 1.8555318615677323, "language_loss": 0.85666305, "learning_rate": 3.9836023820951885e-06, "loss": 0.87961119, "num_input_tokens_seen": 12350600, "step": 579, "time_per_iteration": 2.619419574737549 }, { "auxiliary_loss_clip": 0.01212434, "auxiliary_loss_mlp": 0.01071902, "balance_loss_clip": 1.06796122, "balance_loss_mlp": 1.03845167, "epoch": 0.06974087657067275, "flos": 20706021452160.0, "grad_norm": 2.1013535770939753, "language_loss": 0.68742967, "learning_rate": 3.983502686510011e-06, "loss": 0.71027303, "num_input_tokens_seen": 12371430, "step": 580, "time_per_iteration": 2.6925718784332275 }, { "auxiliary_loss_clip": 0.01248156, "auxiliary_loss_mlp": 0.00782998, "balance_loss_clip": 1.07186019, "balance_loss_mlp": 1.00023115, "epoch": 0.06986111946131185, "flos": 22638230784000.0, "grad_norm": 1.8079817356818924, "language_loss": 0.73181015, "learning_rate": 3.9834026900290525e-06, "loss": 0.75212175, "num_input_tokens_seen": 12390825, "step": 581, "time_per_iteration": 2.6401798725128174 }, { "auxiliary_loss_clip": 0.01266616, "auxiliary_loss_mlp": 0.01078328, "balance_loss_clip": 1.08002293, "balance_loss_mlp": 1.04673767, "epoch": 0.06998136235195095, "flos": 26943453152640.0, "grad_norm": 1.9350692348391831, "language_loss": 1.00223351, "learning_rate": 3.983302392667482e-06, "loss": 1.02568293, "num_input_tokens_seen": 12411670, "step": 582, "time_per_iteration": 2.658559799194336 }, { "auxiliary_loss_clip": 0.01251415, "auxiliary_loss_mlp": 0.01068816, "balance_loss_clip": 1.08221483, "balance_loss_mlp": 1.03884625, "epoch": 0.07010160524259003, "flos": 22492505306880.0, "grad_norm": 2.1619522104229754, "language_loss": 0.93812084, "learning_rate": 3.983201794440517e-06, "loss": 0.96132314, "num_input_tokens_seen": 12431245, "step": 583, "time_per_iteration": 2.656583309173584 }, { "auxiliary_loss_clip": 0.01210951, "auxiliary_loss_mlp": 0.0107166, "balance_loss_clip": 1.0614934, "balance_loss_mlp": 1.03940165, "epoch": 0.07022184813322913, "flos": 18332541538560.0, "grad_norm": 2.1421671715911748, "language_loss": 0.67402911, "learning_rate": 3.9831008953634165e-06, "loss": 0.69685531, "num_input_tokens_seen": 12450535, "step": 584, "time_per_iteration": 2.6104514598846436 }, { "auxiliary_loss_clip": 0.01189445, "auxiliary_loss_mlp": 0.01076808, "balance_loss_clip": 1.06999648, "balance_loss_mlp": 1.04204667, "epoch": 0.07034209102386821, "flos": 24675550289280.0, "grad_norm": 2.3291720267207197, "language_loss": 0.81547654, "learning_rate": 3.9829996954514864e-06, "loss": 0.83813906, "num_input_tokens_seen": 12469675, "step": 585, "time_per_iteration": 2.734008550643921 }, { "auxiliary_loss_clip": 0.01235443, "auxiliary_loss_mlp": 0.01079139, "balance_loss_clip": 1.07257807, "balance_loss_mlp": 1.04351974, "epoch": 0.0704623339145073, "flos": 25995878415360.0, "grad_norm": 1.8342839918052727, "language_loss": 0.84080774, "learning_rate": 3.982898194720079e-06, "loss": 0.86395359, "num_input_tokens_seen": 12490405, "step": 586, "time_per_iteration": 2.6608774662017822 }, { "auxiliary_loss_clip": 0.01229254, "auxiliary_loss_mlp": 0.00783529, "balance_loss_clip": 1.07510197, "balance_loss_mlp": 1.00030148, "epoch": 0.0705825768051464, "flos": 25338318088320.0, "grad_norm": 2.743878760298746, "language_loss": 0.82612526, "learning_rate": 3.982796393184592e-06, "loss": 0.84625316, "num_input_tokens_seen": 12509485, "step": 587, "time_per_iteration": 2.688715934753418 }, { "auxiliary_loss_clip": 0.01124694, "auxiliary_loss_mlp": 0.01042613, "balance_loss_clip": 1.0455153, "balance_loss_mlp": 1.03317153, "epoch": 0.07070281969578548, "flos": 66047552507520.0, "grad_norm": 0.7968306451304824, "language_loss": 0.62676954, "learning_rate": 3.98269429086047e-06, "loss": 0.64844251, "num_input_tokens_seen": 12567325, "step": 588, "time_per_iteration": 3.100069999694824 }, { "auxiliary_loss_clip": 0.01223914, "auxiliary_loss_mlp": 0.0108811, "balance_loss_clip": 1.07398272, "balance_loss_mlp": 1.0508213, "epoch": 0.07082306258642458, "flos": 23653568528640.0, "grad_norm": 2.5572419060071105, "language_loss": 0.86594594, "learning_rate": 3.982591887763199e-06, "loss": 0.88906616, "num_input_tokens_seen": 12584785, "step": 589, "time_per_iteration": 3.6719205379486084 }, { "auxiliary_loss_clip": 0.01175582, "auxiliary_loss_mlp": 0.01084403, "balance_loss_clip": 1.0499599, "balance_loss_mlp": 1.04828238, "epoch": 0.07094330547706366, "flos": 13880049408000.0, "grad_norm": 2.1622432730809806, "language_loss": 0.82038432, "learning_rate": 3.982489183908316e-06, "loss": 0.8429842, "num_input_tokens_seen": 12601205, "step": 590, "time_per_iteration": 3.637917995452881 }, { "auxiliary_loss_clip": 0.01156379, "auxiliary_loss_mlp": 0.01090929, "balance_loss_clip": 1.06041276, "balance_loss_mlp": 1.06055498, "epoch": 0.07106354836770276, "flos": 24645098534400.0, "grad_norm": 2.80633266208322, "language_loss": 0.84736776, "learning_rate": 3.982386179311399e-06, "loss": 0.86984086, "num_input_tokens_seen": 12621725, "step": 591, "time_per_iteration": 2.7969257831573486 }, { "auxiliary_loss_clip": 0.01255696, "auxiliary_loss_mlp": 0.01083663, "balance_loss_clip": 1.07588124, "balance_loss_mlp": 1.0515244, "epoch": 0.07118379125834184, "flos": 16217223649920.0, "grad_norm": 2.1658617668008513, "language_loss": 0.87135017, "learning_rate": 3.982282873988075e-06, "loss": 0.89474374, "num_input_tokens_seen": 12639600, "step": 592, "time_per_iteration": 2.650547504425049 }, { "auxiliary_loss_clip": 0.01232342, "auxiliary_loss_mlp": 0.01073346, "balance_loss_clip": 1.07362187, "balance_loss_mlp": 1.04187441, "epoch": 0.07130403414898094, "flos": 19719986227200.0, "grad_norm": 1.8272643174082608, "language_loss": 0.87009197, "learning_rate": 3.982179267954016e-06, "loss": 0.89314884, "num_input_tokens_seen": 12660030, "step": 593, "time_per_iteration": 4.5410730838775635 }, { "auxiliary_loss_clip": 0.01262502, "auxiliary_loss_mlp": 0.01074399, "balance_loss_clip": 1.07683814, "balance_loss_mlp": 1.04228425, "epoch": 0.07142427703962004, "flos": 21871933009920.0, "grad_norm": 2.1620710011905486, "language_loss": 0.96003699, "learning_rate": 3.982075361224937e-06, "loss": 0.98340595, "num_input_tokens_seen": 12678395, "step": 594, "time_per_iteration": 2.7088301181793213 }, { "auxiliary_loss_clip": 0.01239478, "auxiliary_loss_mlp": 0.00782108, "balance_loss_clip": 1.07475793, "balance_loss_mlp": 1.00024009, "epoch": 0.07154451993025912, "flos": 18296595002880.0, "grad_norm": 1.8674933741496185, "language_loss": 0.88360691, "learning_rate": 3.981971153816602e-06, "loss": 0.90382272, "num_input_tokens_seen": 12696000, "step": 595, "time_per_iteration": 2.616363525390625 }, { "auxiliary_loss_clip": 0.01261105, "auxiliary_loss_mlp": 0.01076906, "balance_loss_clip": 1.07995915, "balance_loss_mlp": 1.04584002, "epoch": 0.07166476282089822, "flos": 22160690444160.0, "grad_norm": 1.8132347319159126, "language_loss": 0.96222258, "learning_rate": 3.981866645744819e-06, "loss": 0.98560268, "num_input_tokens_seen": 12716715, "step": 596, "time_per_iteration": 2.6268982887268066 }, { "auxiliary_loss_clip": 0.01269049, "auxiliary_loss_mlp": 0.00783206, "balance_loss_clip": 1.07899702, "balance_loss_mlp": 1.00018072, "epoch": 0.0717850057115373, "flos": 14136343925760.0, "grad_norm": 2.7074151099518478, "language_loss": 0.81372893, "learning_rate": 3.9817618370254416e-06, "loss": 0.83425146, "num_input_tokens_seen": 12733370, "step": 597, "time_per_iteration": 2.5799624919891357 }, { "auxiliary_loss_clip": 0.01265833, "auxiliary_loss_mlp": 0.01077078, "balance_loss_clip": 1.0782696, "balance_loss_mlp": 1.04686987, "epoch": 0.0719052486021764, "flos": 30917794412160.0, "grad_norm": 2.1688439382401894, "language_loss": 0.87027293, "learning_rate": 3.9816567276743684e-06, "loss": 0.89370203, "num_input_tokens_seen": 12753235, "step": 598, "time_per_iteration": 2.6710903644561768 }, { "auxiliary_loss_clip": 0.01228594, "auxiliary_loss_mlp": 0.01082206, "balance_loss_clip": 1.07419944, "balance_loss_mlp": 1.05133033, "epoch": 0.0720254914928155, "flos": 21287019939840.0, "grad_norm": 1.9585654798387995, "language_loss": 0.77254605, "learning_rate": 3.9815513177075466e-06, "loss": 0.79565406, "num_input_tokens_seen": 12772020, "step": 599, "time_per_iteration": 2.7050187587738037 }, { "auxiliary_loss_clip": 0.0123178, "auxiliary_loss_mlp": 0.0107174, "balance_loss_clip": 1.0732342, "balance_loss_mlp": 1.04270029, "epoch": 0.07214573438345458, "flos": 27819170732160.0, "grad_norm": 1.889126858034156, "language_loss": 0.70031691, "learning_rate": 3.9814456071409646e-06, "loss": 0.72335207, "num_input_tokens_seen": 12792555, "step": 600, "time_per_iteration": 2.6822781562805176 }, { "auxiliary_loss_clip": 0.01215471, "auxiliary_loss_mlp": 0.01082222, "balance_loss_clip": 1.06920075, "balance_loss_mlp": 1.04993987, "epoch": 0.07226597727409367, "flos": 25483576688640.0, "grad_norm": 2.53917626286285, "language_loss": 0.85488045, "learning_rate": 3.981339595990659e-06, "loss": 0.87785745, "num_input_tokens_seen": 12811085, "step": 601, "time_per_iteration": 2.7273218631744385 }, { "auxiliary_loss_clip": 0.01243754, "auxiliary_loss_mlp": 0.01065352, "balance_loss_clip": 1.07342911, "balance_loss_mlp": 1.03547847, "epoch": 0.07238622016473276, "flos": 23513840622720.0, "grad_norm": 2.373417245541459, "language_loss": 0.81338352, "learning_rate": 3.981233284272713e-06, "loss": 0.8364746, "num_input_tokens_seen": 12830830, "step": 602, "time_per_iteration": 2.6849427223205566 }, { "auxiliary_loss_clip": 0.01209903, "auxiliary_loss_mlp": 0.01082945, "balance_loss_clip": 1.07012248, "balance_loss_mlp": 1.05195069, "epoch": 0.07250646305537185, "flos": 25453519983360.0, "grad_norm": 1.731195562775122, "language_loss": 0.90122104, "learning_rate": 3.981126672003253e-06, "loss": 0.92414951, "num_input_tokens_seen": 12853505, "step": 603, "time_per_iteration": 2.8094615936279297 }, { "auxiliary_loss_clip": 0.01237227, "auxiliary_loss_mlp": 0.01066481, "balance_loss_clip": 1.0699656, "balance_loss_mlp": 1.03598726, "epoch": 0.07262670594601094, "flos": 27155038216320.0, "grad_norm": 2.2519397216685824, "language_loss": 0.78049892, "learning_rate": 3.981019759198451e-06, "loss": 0.803536, "num_input_tokens_seen": 12872455, "step": 604, "time_per_iteration": 2.692640781402588 }, { "auxiliary_loss_clip": 0.01246236, "auxiliary_loss_mlp": 0.01072024, "balance_loss_clip": 1.07921743, "balance_loss_mlp": 1.04222178, "epoch": 0.07274694883665003, "flos": 26651607148800.0, "grad_norm": 2.427029774187159, "language_loss": 0.84549373, "learning_rate": 3.980912545874528e-06, "loss": 0.8686763, "num_input_tokens_seen": 12892620, "step": 605, "time_per_iteration": 2.6637115478515625 }, { "auxiliary_loss_clip": 0.01238527, "auxiliary_loss_mlp": 0.00782615, "balance_loss_clip": 1.07358003, "balance_loss_mlp": 1.00026822, "epoch": 0.07286719172728913, "flos": 29862344154240.0, "grad_norm": 1.9543255444535959, "language_loss": 0.85534775, "learning_rate": 3.980805032047746e-06, "loss": 0.87555921, "num_input_tokens_seen": 12914090, "step": 606, "time_per_iteration": 2.699676990509033 }, { "auxiliary_loss_clip": 0.01232067, "auxiliary_loss_mlp": 0.01086607, "balance_loss_clip": 1.07324195, "balance_loss_mlp": 1.05554128, "epoch": 0.07298743461792821, "flos": 17382057799680.0, "grad_norm": 1.9060421744172287, "language_loss": 0.80872786, "learning_rate": 3.980697217734415e-06, "loss": 0.83191466, "num_input_tokens_seen": 12931830, "step": 607, "time_per_iteration": 2.577160120010376 }, { "auxiliary_loss_clip": 0.01212726, "auxiliary_loss_mlp": 0.00781668, "balance_loss_clip": 1.06973338, "balance_loss_mlp": 1.0002929, "epoch": 0.07310767750856731, "flos": 19498201701120.0, "grad_norm": 1.7592677920738198, "language_loss": 0.9160409, "learning_rate": 3.980589102950891e-06, "loss": 0.93598485, "num_input_tokens_seen": 12949995, "step": 608, "time_per_iteration": 2.6877033710479736 }, { "auxiliary_loss_clip": 0.01227575, "auxiliary_loss_mlp": 0.01072581, "balance_loss_clip": 1.07312584, "balance_loss_mlp": 1.0431838, "epoch": 0.07322792039920639, "flos": 29168693637120.0, "grad_norm": 2.2238996136747713, "language_loss": 0.76053387, "learning_rate": 3.9804806877135755e-06, "loss": 0.78353536, "num_input_tokens_seen": 12968040, "step": 609, "time_per_iteration": 2.7024056911468506 }, { "auxiliary_loss_clip": 0.01244571, "auxiliary_loss_mlp": 0.00782411, "balance_loss_clip": 1.07126486, "balance_loss_mlp": 1.00042701, "epoch": 0.07334816328984549, "flos": 23477822259840.0, "grad_norm": 2.048421921489141, "language_loss": 0.86177456, "learning_rate": 3.980371972038915e-06, "loss": 0.88204437, "num_input_tokens_seen": 12988530, "step": 610, "time_per_iteration": 2.648930311203003 }, { "auxiliary_loss_clip": 0.01264307, "auxiliary_loss_mlp": 0.01091614, "balance_loss_clip": 1.07991493, "balance_loss_mlp": 1.05625618, "epoch": 0.07346840618048459, "flos": 22962467877120.0, "grad_norm": 1.9108044042233636, "language_loss": 0.84344733, "learning_rate": 3.980262955943399e-06, "loss": 0.86700654, "num_input_tokens_seen": 13008195, "step": 611, "time_per_iteration": 2.5931477546691895 }, { "auxiliary_loss_clip": 0.01219909, "auxiliary_loss_mlp": 0.01076859, "balance_loss_clip": 1.07287693, "balance_loss_mlp": 1.04479206, "epoch": 0.07358864907112367, "flos": 17673903803520.0, "grad_norm": 2.571728649761884, "language_loss": 0.86639351, "learning_rate": 3.980153639443569e-06, "loss": 0.88936126, "num_input_tokens_seen": 13024180, "step": 612, "time_per_iteration": 2.6019811630249023 }, { "auxiliary_loss_clip": 0.01241401, "auxiliary_loss_mlp": 0.01085162, "balance_loss_clip": 1.0731622, "balance_loss_mlp": 1.05259442, "epoch": 0.07370889196176277, "flos": 24097029840000.0, "grad_norm": 3.3220208971588625, "language_loss": 0.79947543, "learning_rate": 3.980044022556005e-06, "loss": 0.82274115, "num_input_tokens_seen": 13043865, "step": 613, "time_per_iteration": 2.644455909729004 }, { "auxiliary_loss_clip": 0.01240274, "auxiliary_loss_mlp": 0.01066025, "balance_loss_clip": 1.07252491, "balance_loss_mlp": 1.03252685, "epoch": 0.07382913485240185, "flos": 25885919905920.0, "grad_norm": 2.4079822661035486, "language_loss": 0.73003507, "learning_rate": 3.9799341052973375e-06, "loss": 0.75309807, "num_input_tokens_seen": 13063700, "step": 614, "time_per_iteration": 2.766991376876831 }, { "auxiliary_loss_clip": 0.0122955, "auxiliary_loss_mlp": 0.01077159, "balance_loss_clip": 1.07372665, "balance_loss_mlp": 1.04382825, "epoch": 0.07394937774304094, "flos": 16873850223360.0, "grad_norm": 2.579321987287243, "language_loss": 0.75312984, "learning_rate": 3.979823887684241e-06, "loss": 0.77619696, "num_input_tokens_seen": 13082640, "step": 615, "time_per_iteration": 3.6755599975585938 }, { "auxiliary_loss_clip": 0.01258487, "auxiliary_loss_mlp": 0.01075035, "balance_loss_clip": 1.07737017, "balance_loss_mlp": 1.04623413, "epoch": 0.07406962063368003, "flos": 20703471586560.0, "grad_norm": 2.3986369652531487, "language_loss": 0.84758824, "learning_rate": 3.979713369733434e-06, "loss": 0.8709234, "num_input_tokens_seen": 13100505, "step": 616, "time_per_iteration": 3.537108898162842 }, { "auxiliary_loss_clip": 0.01225138, "auxiliary_loss_mlp": 0.01087844, "balance_loss_clip": 1.06142354, "balance_loss_mlp": 1.05470347, "epoch": 0.07418986352431912, "flos": 21430985650560.0, "grad_norm": 3.422805863429656, "language_loss": 0.85127139, "learning_rate": 3.979602551461683e-06, "loss": 0.87440121, "num_input_tokens_seen": 13121285, "step": 617, "time_per_iteration": 2.6597025394439697 }, { "auxiliary_loss_clip": 0.01229645, "auxiliary_loss_mlp": 0.01076071, "balance_loss_clip": 1.07517564, "balance_loss_mlp": 1.04531503, "epoch": 0.07431010641495822, "flos": 12021133777920.0, "grad_norm": 2.1637034433345406, "language_loss": 0.91665256, "learning_rate": 3.979491432885799e-06, "loss": 0.93970966, "num_input_tokens_seen": 13137550, "step": 618, "time_per_iteration": 2.6215660572052 }, { "auxiliary_loss_clip": 0.01193794, "auxiliary_loss_mlp": 0.00782306, "balance_loss_clip": 1.06854129, "balance_loss_mlp": 1.00036275, "epoch": 0.0744303493055973, "flos": 20957575374720.0, "grad_norm": 1.884515828311066, "language_loss": 0.83006859, "learning_rate": 3.97938001402264e-06, "loss": 0.84982955, "num_input_tokens_seen": 13156675, "step": 619, "time_per_iteration": 3.6307997703552246 }, { "auxiliary_loss_clip": 0.01204858, "auxiliary_loss_mlp": 0.01065576, "balance_loss_clip": 1.0708096, "balance_loss_mlp": 1.03481948, "epoch": 0.0745505921962364, "flos": 16253134272000.0, "grad_norm": 2.7925572416483027, "language_loss": 0.79594249, "learning_rate": 3.979268294889105e-06, "loss": 0.81864685, "num_input_tokens_seen": 13172225, "step": 620, "time_per_iteration": 3.5927107334136963 }, { "auxiliary_loss_clip": 0.0125614, "auxiliary_loss_mlp": 0.01071365, "balance_loss_clip": 1.07332301, "balance_loss_mlp": 1.0400126, "epoch": 0.07467083508687548, "flos": 50944635550080.0, "grad_norm": 1.8265451452495904, "language_loss": 0.74062443, "learning_rate": 3.979156275502143e-06, "loss": 0.76389956, "num_input_tokens_seen": 13195885, "step": 621, "time_per_iteration": 2.931152582168579 }, { "auxiliary_loss_clip": 0.01214799, "auxiliary_loss_mlp": 0.01072476, "balance_loss_clip": 1.07063794, "balance_loss_mlp": 1.04181564, "epoch": 0.07479107797751458, "flos": 17529686697600.0, "grad_norm": 2.9062724336914068, "language_loss": 0.91800731, "learning_rate": 3.979043955878749e-06, "loss": 0.94088006, "num_input_tokens_seen": 13213730, "step": 622, "time_per_iteration": 2.66097354888916 }, { "auxiliary_loss_clip": 0.01227036, "auxiliary_loss_mlp": 0.01074529, "balance_loss_clip": 1.07037067, "balance_loss_mlp": 1.04396367, "epoch": 0.07491132086815366, "flos": 23473943591040.0, "grad_norm": 1.9746900816189632, "language_loss": 0.83156216, "learning_rate": 3.978931336035959e-06, "loss": 0.8545779, "num_input_tokens_seen": 13232540, "step": 623, "time_per_iteration": 2.7058699131011963 }, { "auxiliary_loss_clip": 0.01244881, "auxiliary_loss_mlp": 0.01081171, "balance_loss_clip": 1.0752089, "balance_loss_mlp": 1.04905653, "epoch": 0.07503156375879276, "flos": 20157557708160.0, "grad_norm": 2.2211364172983386, "language_loss": 0.82434201, "learning_rate": 3.9788184159908595e-06, "loss": 0.84760255, "num_input_tokens_seen": 13249670, "step": 624, "time_per_iteration": 2.6168627738952637 }, { "auxiliary_loss_clip": 0.01213833, "auxiliary_loss_mlp": 0.01089438, "balance_loss_clip": 1.06967735, "balance_loss_mlp": 1.05787182, "epoch": 0.07515180664943186, "flos": 15115519653120.0, "grad_norm": 3.774065436891161, "language_loss": 0.82742202, "learning_rate": 3.97870519576058e-06, "loss": 0.85045481, "num_input_tokens_seen": 13266095, "step": 625, "time_per_iteration": 2.6296348571777344 }, { "auxiliary_loss_clip": 0.01212872, "auxiliary_loss_mlp": 0.00781965, "balance_loss_clip": 1.06933665, "balance_loss_mlp": 1.0004766, "epoch": 0.07527204954007094, "flos": 21287702298240.0, "grad_norm": 2.214692635243083, "language_loss": 0.80984807, "learning_rate": 3.978591675362295e-06, "loss": 0.82979643, "num_input_tokens_seen": 13284810, "step": 626, "time_per_iteration": 2.9362683296203613 }, { "auxiliary_loss_clip": 0.01191856, "auxiliary_loss_mlp": 0.01081439, "balance_loss_clip": 1.06824398, "balance_loss_mlp": 1.05057621, "epoch": 0.07539229243071004, "flos": 21324187537920.0, "grad_norm": 1.866696432951, "language_loss": 0.87622273, "learning_rate": 3.978477854813226e-06, "loss": 0.8989557, "num_input_tokens_seen": 13304150, "step": 627, "time_per_iteration": 2.7037222385406494 }, { "auxiliary_loss_clip": 0.01238126, "auxiliary_loss_mlp": 0.0106372, "balance_loss_clip": 1.07040834, "balance_loss_mlp": 1.03320277, "epoch": 0.07551253532134912, "flos": 13042540920960.0, "grad_norm": 2.0107457383604093, "language_loss": 0.82732153, "learning_rate": 3.97836373413064e-06, "loss": 0.85033995, "num_input_tokens_seen": 13322205, "step": 628, "time_per_iteration": 2.644395351409912 }, { "auxiliary_loss_clip": 0.0125616, "auxiliary_loss_mlp": 0.01072126, "balance_loss_clip": 1.07226467, "balance_loss_mlp": 1.04105961, "epoch": 0.07563277821198822, "flos": 19208761908480.0, "grad_norm": 1.7049822476191543, "language_loss": 0.74436021, "learning_rate": 3.978249313331848e-06, "loss": 0.76764303, "num_input_tokens_seen": 13340435, "step": 629, "time_per_iteration": 2.564662456512451 }, { "auxiliary_loss_clip": 0.01240065, "auxiliary_loss_mlp": 0.00781197, "balance_loss_clip": 1.06836867, "balance_loss_mlp": 1.00038004, "epoch": 0.07575302110262731, "flos": 19537200892800.0, "grad_norm": 4.030096852410422, "language_loss": 0.62242657, "learning_rate": 3.978134592434208e-06, "loss": 0.64263916, "num_input_tokens_seen": 13358185, "step": 630, "time_per_iteration": 2.581878185272217 }, { "auxiliary_loss_clip": 0.01072822, "auxiliary_loss_mlp": 0.01018457, "balance_loss_clip": 1.04292059, "balance_loss_mlp": 1.01035118, "epoch": 0.0758732639932664, "flos": 67961808017280.0, "grad_norm": 1.0270483331850935, "language_loss": 0.59403181, "learning_rate": 3.978019571455123e-06, "loss": 0.6149447, "num_input_tokens_seen": 13410130, "step": 631, "time_per_iteration": 3.2978808879852295 }, { "auxiliary_loss_clip": 0.0124824, "auxiliary_loss_mlp": 0.01067464, "balance_loss_clip": 1.07164359, "balance_loss_mlp": 1.03935432, "epoch": 0.07599350688390549, "flos": 18989204025600.0, "grad_norm": 2.420998092579993, "language_loss": 0.84005243, "learning_rate": 3.977904250412042e-06, "loss": 0.86320949, "num_input_tokens_seen": 13429085, "step": 632, "time_per_iteration": 2.5394816398620605 }, { "auxiliary_loss_clip": 0.01229202, "auxiliary_loss_mlp": 0.01080414, "balance_loss_clip": 1.07169139, "balance_loss_mlp": 1.0506835, "epoch": 0.07611374977454458, "flos": 21069006341760.0, "grad_norm": 2.3529876203336797, "language_loss": 0.8568092, "learning_rate": 3.97778862932246e-06, "loss": 0.8799054, "num_input_tokens_seen": 13446250, "step": 633, "time_per_iteration": 2.660824775695801 }, { "auxiliary_loss_clip": 0.01114459, "auxiliary_loss_mlp": 0.01070102, "balance_loss_clip": 1.04308712, "balance_loss_mlp": 1.03822517, "epoch": 0.07623399266518367, "flos": 18514536773760.0, "grad_norm": 2.1805693126451264, "language_loss": 0.94086969, "learning_rate": 3.9776727082039144e-06, "loss": 0.96271539, "num_input_tokens_seen": 13463220, "step": 634, "time_per_iteration": 3.0074682235717773 }, { "auxiliary_loss_clip": 0.01133483, "auxiliary_loss_mlp": 0.01007633, "balance_loss_clip": 1.04722238, "balance_loss_mlp": 1.00038552, "epoch": 0.07635423555582276, "flos": 44663036077440.0, "grad_norm": 0.8061752593789419, "language_loss": 0.55493802, "learning_rate": 3.977556487073991e-06, "loss": 0.5763492, "num_input_tokens_seen": 13517775, "step": 635, "time_per_iteration": 3.4351706504821777 }, { "auxiliary_loss_clip": 0.01204671, "auxiliary_loss_mlp": 0.01070837, "balance_loss_clip": 1.05778396, "balance_loss_mlp": 1.04012871, "epoch": 0.07647447844646185, "flos": 21761148487680.0, "grad_norm": 1.7305855532137715, "language_loss": 0.8146565, "learning_rate": 3.97743996595032e-06, "loss": 0.83741158, "num_input_tokens_seen": 13537815, "step": 636, "time_per_iteration": 2.7414448261260986 }, { "auxiliary_loss_clip": 0.0125411, "auxiliary_loss_mlp": 0.01067801, "balance_loss_clip": 1.07575953, "balance_loss_mlp": 1.0380944, "epoch": 0.07659472133710095, "flos": 23806799948160.0, "grad_norm": 1.634187062040188, "language_loss": 0.81817132, "learning_rate": 3.9773231448505804e-06, "loss": 0.84139049, "num_input_tokens_seen": 13559605, "step": 637, "time_per_iteration": 2.830885887145996 }, { "auxiliary_loss_clip": 0.01224908, "auxiliary_loss_mlp": 0.00782212, "balance_loss_clip": 1.07139504, "balance_loss_mlp": 1.0003736, "epoch": 0.07671496422774003, "flos": 21469984842240.0, "grad_norm": 2.6608448139160665, "language_loss": 0.7791934, "learning_rate": 3.977206023792491e-06, "loss": 0.79926455, "num_input_tokens_seen": 13579495, "step": 638, "time_per_iteration": 2.6601083278656006 }, { "auxiliary_loss_clip": 0.0123977, "auxiliary_loss_mlp": 0.01073841, "balance_loss_clip": 1.07542229, "balance_loss_mlp": 1.04330003, "epoch": 0.07683520711837913, "flos": 16980971558400.0, "grad_norm": 2.2648896473251106, "language_loss": 0.80956179, "learning_rate": 3.97708860279382e-06, "loss": 0.83269793, "num_input_tokens_seen": 13597605, "step": 639, "time_per_iteration": 2.697878837585449 }, { "auxiliary_loss_clip": 0.01205748, "auxiliary_loss_mlp": 0.0108062, "balance_loss_clip": 1.06706119, "balance_loss_mlp": 1.04798079, "epoch": 0.07695545000901821, "flos": 23476744851840.0, "grad_norm": 1.6041249736239236, "language_loss": 0.78145593, "learning_rate": 3.97697088187238e-06, "loss": 0.80431962, "num_input_tokens_seen": 13618120, "step": 640, "time_per_iteration": 2.722376585006714 }, { "auxiliary_loss_clip": 0.01214453, "auxiliary_loss_mlp": 0.01065578, "balance_loss_clip": 1.0681026, "balance_loss_mlp": 1.03646719, "epoch": 0.07707569289965731, "flos": 17634258167040.0, "grad_norm": 1.93730158706233, "language_loss": 0.91967547, "learning_rate": 3.976852861046029e-06, "loss": 0.9424758, "num_input_tokens_seen": 13634735, "step": 641, "time_per_iteration": 3.6694037914276123 }, { "auxiliary_loss_clip": 0.01192485, "auxiliary_loss_mlp": 0.01075766, "balance_loss_clip": 1.06819391, "balance_loss_mlp": 1.04670286, "epoch": 0.0771959357902964, "flos": 25775674087680.0, "grad_norm": 2.825237413291122, "language_loss": 0.80222857, "learning_rate": 3.97673454033267e-06, "loss": 0.82491112, "num_input_tokens_seen": 13656835, "step": 642, "time_per_iteration": 3.7682175636291504 }, { "auxiliary_loss_clip": 0.01224669, "auxiliary_loss_mlp": 0.01076891, "balance_loss_clip": 1.06795216, "balance_loss_mlp": 1.04551458, "epoch": 0.07731617868093549, "flos": 19828651847040.0, "grad_norm": 2.0249430849091996, "language_loss": 0.82464051, "learning_rate": 3.976615919750254e-06, "loss": 0.84765613, "num_input_tokens_seen": 13674535, "step": 643, "time_per_iteration": 2.6291379928588867 }, { "auxiliary_loss_clip": 0.01237391, "auxiliary_loss_mlp": 0.01085542, "balance_loss_clip": 1.07175219, "balance_loss_mlp": 1.0538559, "epoch": 0.07743642157157458, "flos": 21324654414720.0, "grad_norm": 1.8498980914492371, "language_loss": 0.86848944, "learning_rate": 3.976496999316775e-06, "loss": 0.89171875, "num_input_tokens_seen": 13693290, "step": 644, "time_per_iteration": 2.6294095516204834 }, { "auxiliary_loss_clip": 0.0122481, "auxiliary_loss_mlp": 0.01074569, "balance_loss_clip": 1.07435322, "balance_loss_mlp": 1.04529071, "epoch": 0.07755666446221367, "flos": 19969133938560.0, "grad_norm": 2.0012333454355526, "language_loss": 0.84136784, "learning_rate": 3.976377779050271e-06, "loss": 0.86436164, "num_input_tokens_seen": 13711420, "step": 645, "time_per_iteration": 4.542988061904907 }, { "auxiliary_loss_clip": 0.01229784, "auxiliary_loss_mlp": 0.01074779, "balance_loss_clip": 1.07116735, "balance_loss_mlp": 1.04454708, "epoch": 0.07767690735285276, "flos": 23623224514560.0, "grad_norm": 2.041512980575447, "language_loss": 0.84472525, "learning_rate": 3.976258258968831e-06, "loss": 0.86777085, "num_input_tokens_seen": 13729965, "step": 646, "time_per_iteration": 2.814870595932007 }, { "auxiliary_loss_clip": 0.01213216, "auxiliary_loss_mlp": 0.010861, "balance_loss_clip": 1.07311773, "balance_loss_mlp": 1.05629754, "epoch": 0.07779715024349185, "flos": 22236246702720.0, "grad_norm": 2.2227074013685857, "language_loss": 0.7419256, "learning_rate": 3.976138439090583e-06, "loss": 0.7649188, "num_input_tokens_seen": 13748045, "step": 647, "time_per_iteration": 2.678532361984253 }, { "auxiliary_loss_clip": 0.01212675, "auxiliary_loss_mlp": 0.010732, "balance_loss_clip": 1.06884027, "balance_loss_mlp": 1.04072738, "epoch": 0.07791739313413094, "flos": 20955097336320.0, "grad_norm": 2.2456869746323536, "language_loss": 0.84901851, "learning_rate": 3.976018319433706e-06, "loss": 0.87187725, "num_input_tokens_seen": 13765590, "step": 648, "time_per_iteration": 2.8095691204071045 }, { "auxiliary_loss_clip": 0.01240576, "auxiliary_loss_mlp": 0.01060355, "balance_loss_clip": 1.07548797, "balance_loss_mlp": 1.02914572, "epoch": 0.07803763602477004, "flos": 19312327797120.0, "grad_norm": 2.52914098910361, "language_loss": 0.9145171, "learning_rate": 3.9758979000164205e-06, "loss": 0.93752646, "num_input_tokens_seen": 13782410, "step": 649, "time_per_iteration": 2.6645548343658447 }, { "auxiliary_loss_clip": 0.01213756, "auxiliary_loss_mlp": 0.01074105, "balance_loss_clip": 1.06813002, "balance_loss_mlp": 1.04137015, "epoch": 0.07815787891540912, "flos": 22710806213760.0, "grad_norm": 1.7875161282841203, "language_loss": 0.71792841, "learning_rate": 3.975777180856995e-06, "loss": 0.74080706, "num_input_tokens_seen": 13801530, "step": 650, "time_per_iteration": 2.8453006744384766 }, { "auxiliary_loss_clip": 0.01258613, "auxiliary_loss_mlp": 0.01071783, "balance_loss_clip": 1.0727253, "balance_loss_mlp": 1.04157531, "epoch": 0.07827812180604822, "flos": 22711129436160.0, "grad_norm": 2.5795865194118517, "language_loss": 0.86281586, "learning_rate": 3.975656161973742e-06, "loss": 0.88611984, "num_input_tokens_seen": 13820615, "step": 651, "time_per_iteration": 2.5679028034210205 }, { "auxiliary_loss_clip": 0.01260004, "auxiliary_loss_mlp": 0.01072648, "balance_loss_clip": 1.07325816, "balance_loss_mlp": 1.04110551, "epoch": 0.0783983646966873, "flos": 21725597001600.0, "grad_norm": 4.03459007234146, "language_loss": 0.88735783, "learning_rate": 3.9755348433850194e-06, "loss": 0.91068435, "num_input_tokens_seen": 13835955, "step": 652, "time_per_iteration": 2.596437454223633 }, { "auxiliary_loss_clip": 0.01088689, "auxiliary_loss_mlp": 0.01015865, "balance_loss_clip": 1.03188586, "balance_loss_mlp": 1.00909352, "epoch": 0.0785186075873264, "flos": 60640877537280.0, "grad_norm": 0.9561483167774542, "language_loss": 0.63567495, "learning_rate": 3.975413225109232e-06, "loss": 0.65672052, "num_input_tokens_seen": 13896505, "step": 653, "time_per_iteration": 3.262683868408203 }, { "auxiliary_loss_clip": 0.01237028, "auxiliary_loss_mlp": 0.01074579, "balance_loss_clip": 1.07169688, "balance_loss_mlp": 1.0409621, "epoch": 0.0786388504779655, "flos": 23877902920320.0, "grad_norm": 3.4148677584235854, "language_loss": 0.93572533, "learning_rate": 3.975291307164829e-06, "loss": 0.95884138, "num_input_tokens_seen": 13915150, "step": 654, "time_per_iteration": 2.6288771629333496 }, { "auxiliary_loss_clip": 0.01196637, "auxiliary_loss_mlp": 0.01063722, "balance_loss_clip": 1.06613445, "balance_loss_mlp": 1.0330379, "epoch": 0.07875909336860458, "flos": 15158684822400.0, "grad_norm": 2.0466703235754644, "language_loss": 0.85460931, "learning_rate": 3.975169089570306e-06, "loss": 0.87721288, "num_input_tokens_seen": 13933525, "step": 655, "time_per_iteration": 2.675004005432129 }, { "auxiliary_loss_clip": 0.01221317, "auxiliary_loss_mlp": 0.01082784, "balance_loss_clip": 1.07003331, "balance_loss_mlp": 1.051337, "epoch": 0.07887933625924368, "flos": 22236857233920.0, "grad_norm": 1.8770808561140229, "language_loss": 0.91568995, "learning_rate": 3.975046572344202e-06, "loss": 0.93873096, "num_input_tokens_seen": 13949985, "step": 656, "time_per_iteration": 2.7375242710113525 }, { "auxiliary_loss_clip": 0.01203892, "auxiliary_loss_mlp": 0.01067638, "balance_loss_clip": 1.06498456, "balance_loss_mlp": 1.03619075, "epoch": 0.07899957914988276, "flos": 20777734955520.0, "grad_norm": 1.7109087252296418, "language_loss": 0.7122246, "learning_rate": 3.974923755505103e-06, "loss": 0.73493993, "num_input_tokens_seen": 13969215, "step": 657, "time_per_iteration": 2.6836516857147217 }, { "auxiliary_loss_clip": 0.01180994, "auxiliary_loss_mlp": 0.01070591, "balance_loss_clip": 1.05543208, "balance_loss_mlp": 1.04169416, "epoch": 0.07911982204052186, "flos": 23003047267200.0, "grad_norm": 1.7100418509154705, "language_loss": 0.91389513, "learning_rate": 3.974800639071641e-06, "loss": 0.93641096, "num_input_tokens_seen": 13989935, "step": 658, "time_per_iteration": 2.715358257293701 }, { "auxiliary_loss_clip": 0.01160742, "auxiliary_loss_mlp": 0.00783014, "balance_loss_clip": 1.05738688, "balance_loss_mlp": 1.00026381, "epoch": 0.07924006493116094, "flos": 23111389664640.0, "grad_norm": 2.024648078765436, "language_loss": 1.00618875, "learning_rate": 3.974677223062492e-06, "loss": 1.02562642, "num_input_tokens_seen": 14007150, "step": 659, "time_per_iteration": 2.727893114089966 }, { "auxiliary_loss_clip": 0.01223945, "auxiliary_loss_mlp": 0.01078065, "balance_loss_clip": 1.07303488, "balance_loss_mlp": 1.04947913, "epoch": 0.07936030782180004, "flos": 16472153450880.0, "grad_norm": 2.492067953363904, "language_loss": 0.7447226, "learning_rate": 3.974553507496378e-06, "loss": 0.76774269, "num_input_tokens_seen": 14025725, "step": 660, "time_per_iteration": 2.649143934249878 }, { "auxiliary_loss_clip": 0.01214047, "auxiliary_loss_mlp": 0.01079143, "balance_loss_clip": 1.07026458, "balance_loss_mlp": 1.04523993, "epoch": 0.07948055071243913, "flos": 23733290764800.0, "grad_norm": 2.38778100515406, "language_loss": 0.89277261, "learning_rate": 3.974429492392068e-06, "loss": 0.91570455, "num_input_tokens_seen": 14045750, "step": 661, "time_per_iteration": 2.7132174968719482 }, { "auxiliary_loss_clip": 0.01250192, "auxiliary_loss_mlp": 0.00781113, "balance_loss_clip": 1.07347393, "balance_loss_mlp": 1.00030184, "epoch": 0.07960079360307822, "flos": 19573326996480.0, "grad_norm": 1.8418466095743002, "language_loss": 0.91040468, "learning_rate": 3.974305177768373e-06, "loss": 0.93071777, "num_input_tokens_seen": 14063960, "step": 662, "time_per_iteration": 2.5813732147216797 }, { "auxiliary_loss_clip": 0.01199685, "auxiliary_loss_mlp": 0.01078867, "balance_loss_clip": 1.07021439, "balance_loss_mlp": 1.04339063, "epoch": 0.07972103649371731, "flos": 23513409659520.0, "grad_norm": 2.323075123639327, "language_loss": 0.8676036, "learning_rate": 3.974180563644152e-06, "loss": 0.89038914, "num_input_tokens_seen": 14082525, "step": 663, "time_per_iteration": 2.6824986934661865 }, { "auxiliary_loss_clip": 0.01223032, "auxiliary_loss_mlp": 0.01080727, "balance_loss_clip": 1.0689429, "balance_loss_mlp": 1.04701459, "epoch": 0.0798412793843564, "flos": 16726867770240.0, "grad_norm": 2.0709987265079954, "language_loss": 0.88980252, "learning_rate": 3.97405565003831e-06, "loss": 0.91284007, "num_input_tokens_seen": 14098610, "step": 664, "time_per_iteration": 2.7936949729919434 }, { "auxiliary_loss_clip": 0.01216345, "auxiliary_loss_mlp": 0.01079357, "balance_loss_clip": 1.06957817, "balance_loss_mlp": 1.0466696, "epoch": 0.07996152227499549, "flos": 18223337214720.0, "grad_norm": 2.0554954878701346, "language_loss": 0.78229892, "learning_rate": 3.973930436969794e-06, "loss": 0.80525601, "num_input_tokens_seen": 14117065, "step": 665, "time_per_iteration": 2.659942626953125 }, { "auxiliary_loss_clip": 0.01213324, "auxiliary_loss_mlp": 0.01096169, "balance_loss_clip": 1.06788886, "balance_loss_mlp": 1.06169343, "epoch": 0.08008176516563459, "flos": 20594877793920.0, "grad_norm": 1.8565353722274922, "language_loss": 0.85726047, "learning_rate": 3.973804924457602e-06, "loss": 0.88035536, "num_input_tokens_seen": 14135145, "step": 666, "time_per_iteration": 2.65956974029541 }, { "auxiliary_loss_clip": 0.01215486, "auxiliary_loss_mlp": 0.01082329, "balance_loss_clip": 1.07063127, "balance_loss_mlp": 1.04818761, "epoch": 0.08020200805627367, "flos": 31834306863360.0, "grad_norm": 1.8107527312570568, "language_loss": 0.85597336, "learning_rate": 3.973679112520771e-06, "loss": 0.87895155, "num_input_tokens_seen": 14156860, "step": 667, "time_per_iteration": 3.7367448806762695 }, { "auxiliary_loss_clip": 0.01194191, "auxiliary_loss_mlp": 0.01092788, "balance_loss_clip": 1.0638864, "balance_loss_mlp": 1.05988622, "epoch": 0.08032225094691277, "flos": 17783503176960.0, "grad_norm": 1.8390678332196035, "language_loss": 0.98936522, "learning_rate": 3.973553001178389e-06, "loss": 1.01223493, "num_input_tokens_seen": 14174365, "step": 668, "time_per_iteration": 2.633962631225586 }, { "auxiliary_loss_clip": 0.0120965, "auxiliary_loss_mlp": 0.0108166, "balance_loss_clip": 1.06971002, "balance_loss_mlp": 1.04701781, "epoch": 0.08044249383755185, "flos": 24061693835520.0, "grad_norm": 2.2051651404546697, "language_loss": 0.7572186, "learning_rate": 3.973426590449585e-06, "loss": 0.7801317, "num_input_tokens_seen": 14192320, "step": 669, "time_per_iteration": 3.7191689014434814 }, { "auxiliary_loss_clip": 0.01193818, "auxiliary_loss_mlp": 0.01080676, "balance_loss_clip": 1.06697273, "balance_loss_mlp": 1.05018258, "epoch": 0.08056273672819095, "flos": 18223624523520.0, "grad_norm": 1.7402426005070242, "language_loss": 0.75067216, "learning_rate": 3.9732998803535364e-06, "loss": 0.77341712, "num_input_tokens_seen": 14210380, "step": 670, "time_per_iteration": 2.7005908489227295 }, { "auxiliary_loss_clip": 0.01249073, "auxiliary_loss_mlp": 0.0108057, "balance_loss_clip": 1.07158518, "balance_loss_mlp": 1.05083942, "epoch": 0.08068297961883003, "flos": 19676856971520.0, "grad_norm": 2.1417662910849726, "language_loss": 0.85203743, "learning_rate": 3.973172870909465e-06, "loss": 0.87533391, "num_input_tokens_seen": 14225145, "step": 671, "time_per_iteration": 3.5807528495788574 }, { "auxiliary_loss_clip": 0.01226745, "auxiliary_loss_mlp": 0.01074117, "balance_loss_clip": 1.06939089, "balance_loss_mlp": 1.04114354, "epoch": 0.08080322250946913, "flos": 23148736830720.0, "grad_norm": 2.4763675067567523, "language_loss": 0.80882269, "learning_rate": 3.973045562136638e-06, "loss": 0.83183134, "num_input_tokens_seen": 14241960, "step": 672, "time_per_iteration": 2.9018139839172363 }, { "auxiliary_loss_clip": 0.0123574, "auxiliary_loss_mlp": 0.01074022, "balance_loss_clip": 1.06881785, "balance_loss_mlp": 1.04364717, "epoch": 0.08092346540010822, "flos": 21763626526080.0, "grad_norm": 2.0002198340399513, "language_loss": 0.91326153, "learning_rate": 3.972917954054368e-06, "loss": 0.93635917, "num_input_tokens_seen": 14260515, "step": 673, "time_per_iteration": 2.623504638671875 }, { "auxiliary_loss_clip": 0.01210463, "auxiliary_loss_mlp": 0.01084573, "balance_loss_clip": 1.06323838, "balance_loss_mlp": 1.05272079, "epoch": 0.08104370829074731, "flos": 21032485188480.0, "grad_norm": 2.3662534296081743, "language_loss": 0.82082158, "learning_rate": 3.972790046682013e-06, "loss": 0.84377193, "num_input_tokens_seen": 14279190, "step": 674, "time_per_iteration": 2.643184185028076 }, { "auxiliary_loss_clip": 0.01211592, "auxiliary_loss_mlp": 0.01073326, "balance_loss_clip": 1.06766844, "balance_loss_mlp": 1.04097307, "epoch": 0.0811639511813864, "flos": 20083186598400.0, "grad_norm": 1.6468609062915398, "language_loss": 0.79260266, "learning_rate": 3.972661840038977e-06, "loss": 0.81545186, "num_input_tokens_seen": 14299480, "step": 675, "time_per_iteration": 2.7665977478027344 }, { "auxiliary_loss_clip": 0.01245933, "auxiliary_loss_mlp": 0.01077721, "balance_loss_clip": 1.07641792, "balance_loss_mlp": 1.04765654, "epoch": 0.08128419407202549, "flos": 16836718538880.0, "grad_norm": 2.4146085086501348, "language_loss": 0.83749586, "learning_rate": 3.972533334144707e-06, "loss": 0.86073244, "num_input_tokens_seen": 14316405, "step": 676, "time_per_iteration": 2.6004011631011963 }, { "auxiliary_loss_clip": 0.01235609, "auxiliary_loss_mlp": 0.01064037, "balance_loss_clip": 1.06851995, "balance_loss_mlp": 1.03485417, "epoch": 0.08140443696266458, "flos": 23769273214080.0, "grad_norm": 1.9373445114622783, "language_loss": 0.78597939, "learning_rate": 3.972404529018699e-06, "loss": 0.80897588, "num_input_tokens_seen": 14336265, "step": 677, "time_per_iteration": 2.6501832008361816 }, { "auxiliary_loss_clip": 0.01208508, "auxiliary_loss_mlp": 0.01082782, "balance_loss_clip": 1.06132638, "balance_loss_mlp": 1.05224085, "epoch": 0.08152467985330367, "flos": 24390132819840.0, "grad_norm": 1.7527691214008547, "language_loss": 0.85524809, "learning_rate": 3.972275424680493e-06, "loss": 0.87816095, "num_input_tokens_seen": 14356375, "step": 678, "time_per_iteration": 2.6807782649993896 }, { "auxiliary_loss_clip": 0.01247163, "auxiliary_loss_mlp": 0.01069148, "balance_loss_clip": 1.07260966, "balance_loss_mlp": 1.0393219, "epoch": 0.08164492274394276, "flos": 19317750750720.0, "grad_norm": 1.918303594273687, "language_loss": 0.91848421, "learning_rate": 3.972146021149673e-06, "loss": 0.94164729, "num_input_tokens_seen": 14374650, "step": 679, "time_per_iteration": 2.5803592205047607 }, { "auxiliary_loss_clip": 0.0120681, "auxiliary_loss_mlp": 0.01061481, "balance_loss_clip": 1.06778646, "balance_loss_mlp": 1.03239393, "epoch": 0.08176516563458186, "flos": 14830461319680.0, "grad_norm": 2.3836241407968086, "language_loss": 0.78429699, "learning_rate": 3.972016318445868e-06, "loss": 0.80697995, "num_input_tokens_seen": 14392650, "step": 680, "time_per_iteration": 2.6527323722839355 }, { "auxiliary_loss_clip": 0.01231745, "auxiliary_loss_mlp": 0.01067864, "balance_loss_clip": 1.06829441, "balance_loss_mlp": 1.03768015, "epoch": 0.08188540852522094, "flos": 22602320161920.0, "grad_norm": 2.184449828553853, "language_loss": 0.92598158, "learning_rate": 3.971886316588757e-06, "loss": 0.94897771, "num_input_tokens_seen": 14413155, "step": 681, "time_per_iteration": 2.660037040710449 }, { "auxiliary_loss_clip": 0.01195541, "auxiliary_loss_mlp": 0.01080108, "balance_loss_clip": 1.068434, "balance_loss_mlp": 1.04963827, "epoch": 0.08200565141586004, "flos": 19463727623040.0, "grad_norm": 2.15797012792304, "language_loss": 0.72848129, "learning_rate": 3.9717560155980595e-06, "loss": 0.75123787, "num_input_tokens_seen": 14428805, "step": 682, "time_per_iteration": 2.6539146900177 }, { "auxiliary_loss_clip": 0.01236585, "auxiliary_loss_mlp": 0.01083551, "balance_loss_clip": 1.0702002, "balance_loss_mlp": 1.05331957, "epoch": 0.08212589430649912, "flos": 20594662312320.0, "grad_norm": 1.9510648756764182, "language_loss": 0.9215008, "learning_rate": 3.971625415493542e-06, "loss": 0.94470221, "num_input_tokens_seen": 14447125, "step": 683, "time_per_iteration": 2.6275389194488525 }, { "auxiliary_loss_clip": 0.01199845, "auxiliary_loss_mlp": 0.01071658, "balance_loss_clip": 1.06252789, "balance_loss_mlp": 1.04242754, "epoch": 0.08224613719713822, "flos": 25953611086080.0, "grad_norm": 1.9188226251255303, "language_loss": 0.87435722, "learning_rate": 3.971494516295017e-06, "loss": 0.89707226, "num_input_tokens_seen": 14466575, "step": 684, "time_per_iteration": 2.736025333404541 }, { "auxiliary_loss_clip": 0.01203495, "auxiliary_loss_mlp": 0.01080822, "balance_loss_clip": 1.06536579, "balance_loss_mlp": 1.04927897, "epoch": 0.08236638008777732, "flos": 23768734510080.0, "grad_norm": 1.9144286801329977, "language_loss": 0.85432363, "learning_rate": 3.971363318022341e-06, "loss": 0.87716681, "num_input_tokens_seen": 14487915, "step": 685, "time_per_iteration": 2.6857123374938965 }, { "auxiliary_loss_clip": 0.01214203, "auxiliary_loss_mlp": 0.01070323, "balance_loss_clip": 1.06284142, "balance_loss_mlp": 1.04016292, "epoch": 0.0824866229784164, "flos": 38799144887040.0, "grad_norm": 1.7472421975229386, "language_loss": 0.68499172, "learning_rate": 3.971231820695417e-06, "loss": 0.70783699, "num_input_tokens_seen": 14511530, "step": 686, "time_per_iteration": 2.796889305114746 }, { "auxiliary_loss_clip": 0.01219517, "auxiliary_loss_mlp": 0.01062141, "balance_loss_clip": 1.06407166, "balance_loss_mlp": 1.03378129, "epoch": 0.0826068658690555, "flos": 23107762391040.0, "grad_norm": 2.1224745594377303, "language_loss": 0.81584978, "learning_rate": 3.971100024334193e-06, "loss": 0.83866632, "num_input_tokens_seen": 14529050, "step": 687, "time_per_iteration": 2.648160219192505 }, { "auxiliary_loss_clip": 0.01181496, "auxiliary_loss_mlp": 0.01073254, "balance_loss_clip": 1.05804038, "balance_loss_mlp": 1.04335618, "epoch": 0.08272710875969458, "flos": 21136374299520.0, "grad_norm": 1.9619926710488, "language_loss": 0.8664884, "learning_rate": 3.970967928958663e-06, "loss": 0.88903588, "num_input_tokens_seen": 14546165, "step": 688, "time_per_iteration": 2.660062551498413 }, { "auxiliary_loss_clip": 0.01194073, "auxiliary_loss_mlp": 0.01087809, "balance_loss_clip": 1.06421542, "balance_loss_mlp": 1.05731547, "epoch": 0.08284735165033368, "flos": 19063000517760.0, "grad_norm": 1.689103512905966, "language_loss": 0.83521473, "learning_rate": 3.970835534588865e-06, "loss": 0.85803354, "num_input_tokens_seen": 14563660, "step": 689, "time_per_iteration": 2.692110300064087 }, { "auxiliary_loss_clip": 0.01220579, "auxiliary_loss_mlp": 0.01087861, "balance_loss_clip": 1.06902766, "balance_loss_mlp": 1.05758226, "epoch": 0.08296759454097276, "flos": 16727442387840.0, "grad_norm": 1.8623660356744867, "language_loss": 0.8559829, "learning_rate": 3.970702841244883e-06, "loss": 0.8790673, "num_input_tokens_seen": 14581980, "step": 690, "time_per_iteration": 2.6392009258270264 }, { "auxiliary_loss_clip": 0.01234497, "auxiliary_loss_mlp": 0.01075499, "balance_loss_clip": 1.06940544, "balance_loss_mlp": 1.0457685, "epoch": 0.08308783743161186, "flos": 18004928567040.0, "grad_norm": 1.789210746383429, "language_loss": 0.82512772, "learning_rate": 3.970569848946847e-06, "loss": 0.84822768, "num_input_tokens_seen": 14601795, "step": 691, "time_per_iteration": 2.585120439529419 }, { "auxiliary_loss_clip": 0.0121576, "auxiliary_loss_mlp": 0.01081173, "balance_loss_clip": 1.06742644, "balance_loss_mlp": 1.04903376, "epoch": 0.08320808032225095, "flos": 15079788599040.0, "grad_norm": 3.28112965470891, "language_loss": 0.82714701, "learning_rate": 3.970436557714932e-06, "loss": 0.85011637, "num_input_tokens_seen": 14618315, "step": 692, "time_per_iteration": 3.4788594245910645 }, { "auxiliary_loss_clip": 0.01214685, "auxiliary_loss_mlp": 0.01075031, "balance_loss_clip": 1.06370687, "balance_loss_mlp": 1.04413152, "epoch": 0.08332832321289003, "flos": 22383085501440.0, "grad_norm": 2.7863827585568104, "language_loss": 0.86592788, "learning_rate": 3.970302967569358e-06, "loss": 0.88882506, "num_input_tokens_seen": 14636905, "step": 693, "time_per_iteration": 2.6749444007873535 }, { "auxiliary_loss_clip": 0.01228851, "auxiliary_loss_mlp": 0.01087076, "balance_loss_clip": 1.07007074, "balance_loss_mlp": 1.05708337, "epoch": 0.08344856610352913, "flos": 24717386655360.0, "grad_norm": 2.1909198046691403, "language_loss": 0.68066275, "learning_rate": 3.9701690785303896e-06, "loss": 0.70382196, "num_input_tokens_seen": 14656100, "step": 694, "time_per_iteration": 3.6279349327087402 }, { "auxiliary_loss_clip": 0.0123201, "auxiliary_loss_mlp": 0.01080033, "balance_loss_clip": 1.06536698, "balance_loss_mlp": 1.04825163, "epoch": 0.08356880899416821, "flos": 25370206387200.0, "grad_norm": 2.101702580918017, "language_loss": 0.88056183, "learning_rate": 3.970034890618339e-06, "loss": 0.90368235, "num_input_tokens_seen": 14675790, "step": 695, "time_per_iteration": 2.7066879272460938 }, { "auxiliary_loss_clip": 0.01214441, "auxiliary_loss_mlp": 0.01084534, "balance_loss_clip": 1.06357551, "balance_loss_mlp": 1.05513668, "epoch": 0.08368905188480731, "flos": 24353072962560.0, "grad_norm": 3.7713202825821437, "language_loss": 0.88050973, "learning_rate": 3.969900403853562e-06, "loss": 0.90349948, "num_input_tokens_seen": 14694830, "step": 696, "time_per_iteration": 2.6401588916778564 }, { "auxiliary_loss_clip": 0.01245342, "auxiliary_loss_mlp": 0.01081222, "balance_loss_clip": 1.07141614, "balance_loss_mlp": 1.05113411, "epoch": 0.08380929477544641, "flos": 18037319656320.0, "grad_norm": 1.8228226468398891, "language_loss": 0.78183448, "learning_rate": 3.96976561825646e-06, "loss": 0.8051002, "num_input_tokens_seen": 14711920, "step": 697, "time_per_iteration": 3.6007800102233887 }, { "auxiliary_loss_clip": 0.01190417, "auxiliary_loss_mlp": 0.01072429, "balance_loss_clip": 1.0608443, "balance_loss_mlp": 1.04300809, "epoch": 0.08392953766608549, "flos": 26286287875200.0, "grad_norm": 3.4781595631215914, "language_loss": 0.87033832, "learning_rate": 3.969630533847479e-06, "loss": 0.89296681, "num_input_tokens_seen": 14730880, "step": 698, "time_per_iteration": 2.759432554244995 }, { "auxiliary_loss_clip": 0.01229916, "auxiliary_loss_mlp": 0.01074642, "balance_loss_clip": 1.06592405, "balance_loss_mlp": 1.04433882, "epoch": 0.08404978055672459, "flos": 22492146170880.0, "grad_norm": 2.088671537079344, "language_loss": 0.84435332, "learning_rate": 3.969495150647113e-06, "loss": 0.86739892, "num_input_tokens_seen": 14749050, "step": 699, "time_per_iteration": 2.630375862121582 }, { "auxiliary_loss_clip": 0.01200175, "auxiliary_loss_mlp": 0.01081101, "balance_loss_clip": 1.06761563, "balance_loss_mlp": 1.05154872, "epoch": 0.08417002344736367, "flos": 24826878288000.0, "grad_norm": 1.738622391460531, "language_loss": 0.76702321, "learning_rate": 3.969359468675899e-06, "loss": 0.78983593, "num_input_tokens_seen": 14769180, "step": 700, "time_per_iteration": 2.726810932159424 }, { "auxiliary_loss_clip": 0.01226475, "auxiliary_loss_mlp": 0.01060747, "balance_loss_clip": 1.06540537, "balance_loss_mlp": 1.03073037, "epoch": 0.08429026633800277, "flos": 16945922862720.0, "grad_norm": 1.9371494807131941, "language_loss": 0.89611042, "learning_rate": 3.969223487954418e-06, "loss": 0.91898268, "num_input_tokens_seen": 14786640, "step": 701, "time_per_iteration": 2.604614734649658 }, { "auxiliary_loss_clip": 0.01181301, "auxiliary_loss_mlp": 0.01079633, "balance_loss_clip": 1.05966115, "balance_loss_mlp": 1.05041456, "epoch": 0.08441050922864185, "flos": 23841920471040.0, "grad_norm": 2.1562770372673747, "language_loss": 0.83061701, "learning_rate": 3.969087208503301e-06, "loss": 0.85322636, "num_input_tokens_seen": 14806720, "step": 702, "time_per_iteration": 2.721618890762329 }, { "auxiliary_loss_clip": 0.01185127, "auxiliary_loss_mlp": 0.01070302, "balance_loss_clip": 1.06044865, "balance_loss_mlp": 1.03766239, "epoch": 0.08453075211928095, "flos": 25520205582720.0, "grad_norm": 2.6864159467219144, "language_loss": 0.84523803, "learning_rate": 3.968950630343219e-06, "loss": 0.86779237, "num_input_tokens_seen": 14823705, "step": 703, "time_per_iteration": 2.708346366882324 }, { "auxiliary_loss_clip": 0.01206461, "auxiliary_loss_mlp": 0.01076799, "balance_loss_clip": 1.05750132, "balance_loss_mlp": 1.04721129, "epoch": 0.08465099500992004, "flos": 19532496211200.0, "grad_norm": 1.9645290112278273, "language_loss": 0.93668908, "learning_rate": 3.968813753494892e-06, "loss": 0.95952159, "num_input_tokens_seen": 14841865, "step": 704, "time_per_iteration": 2.6370749473571777 }, { "auxiliary_loss_clip": 0.01183856, "auxiliary_loss_mlp": 0.00783492, "balance_loss_clip": 1.05790949, "balance_loss_mlp": 1.00061154, "epoch": 0.08477123790055913, "flos": 29351299403520.0, "grad_norm": 2.2090500271884466, "language_loss": 0.75543988, "learning_rate": 3.968676577979084e-06, "loss": 0.77511334, "num_input_tokens_seen": 14861415, "step": 705, "time_per_iteration": 2.7357397079467773 }, { "auxiliary_loss_clip": 0.01176992, "auxiliary_loss_mlp": 0.01067985, "balance_loss_clip": 1.05636764, "balance_loss_mlp": 1.03767025, "epoch": 0.08489148079119822, "flos": 18624495283200.0, "grad_norm": 2.018715592261399, "language_loss": 0.78096378, "learning_rate": 3.968539103816605e-06, "loss": 0.80341351, "num_input_tokens_seen": 14879215, "step": 706, "time_per_iteration": 2.7177438735961914 }, { "auxiliary_loss_clip": 0.01206782, "auxiliary_loss_mlp": 0.00781562, "balance_loss_clip": 1.06537616, "balance_loss_mlp": 1.00076962, "epoch": 0.0850117236818373, "flos": 23471393725440.0, "grad_norm": 1.8068586850022959, "language_loss": 0.89124691, "learning_rate": 3.9684013310283085e-06, "loss": 0.91113037, "num_input_tokens_seen": 14897900, "step": 707, "time_per_iteration": 2.8043863773345947 }, { "auxiliary_loss_clip": 0.01211336, "auxiliary_loss_mlp": 0.01068411, "balance_loss_clip": 1.06556976, "balance_loss_mlp": 1.03966951, "epoch": 0.0851319665724764, "flos": 40625058896640.0, "grad_norm": 2.1275133071094845, "language_loss": 0.63839686, "learning_rate": 3.9682632596350956e-06, "loss": 0.66119426, "num_input_tokens_seen": 14919065, "step": 708, "time_per_iteration": 2.8101563453674316 }, { "auxiliary_loss_clip": 0.01223131, "auxiliary_loss_mlp": 0.01075816, "balance_loss_clip": 1.06634128, "balance_loss_mlp": 1.04378474, "epoch": 0.0852522094631155, "flos": 15879554870400.0, "grad_norm": 2.441480493997263, "language_loss": 0.78535604, "learning_rate": 3.968124889657911e-06, "loss": 0.80834556, "num_input_tokens_seen": 14934165, "step": 709, "time_per_iteration": 2.595248222351074 }, { "auxiliary_loss_clip": 0.01176317, "auxiliary_loss_mlp": 0.01074317, "balance_loss_clip": 1.05667031, "balance_loss_mlp": 1.04475331, "epoch": 0.08537245235375458, "flos": 14567091822720.0, "grad_norm": 2.609838822055416, "language_loss": 0.90808308, "learning_rate": 3.967986221117746e-06, "loss": 0.93058938, "num_input_tokens_seen": 14950105, "step": 710, "time_per_iteration": 2.6688015460968018 }, { "auxiliary_loss_clip": 0.01164827, "auxiliary_loss_mlp": 0.01086049, "balance_loss_clip": 1.05698276, "balance_loss_mlp": 1.05488729, "epoch": 0.08549269524439368, "flos": 26468929555200.0, "grad_norm": 2.221968893195734, "language_loss": 0.86692297, "learning_rate": 3.967847254035635e-06, "loss": 0.88943172, "num_input_tokens_seen": 14969490, "step": 711, "time_per_iteration": 3.1032869815826416 }, { "auxiliary_loss_clip": 0.01210508, "auxiliary_loss_mlp": 0.01072754, "balance_loss_clip": 1.06632984, "balance_loss_mlp": 1.04142594, "epoch": 0.08561293813503276, "flos": 13590214565760.0, "grad_norm": 2.0512253269767275, "language_loss": 0.86146253, "learning_rate": 3.967707988432661e-06, "loss": 0.88429511, "num_input_tokens_seen": 14987195, "step": 712, "time_per_iteration": 3.111281156539917 }, { "auxiliary_loss_clip": 0.01240558, "auxiliary_loss_mlp": 0.01072299, "balance_loss_clip": 1.06566095, "balance_loss_mlp": 1.04232979, "epoch": 0.08573318102567186, "flos": 26943524979840.0, "grad_norm": 2.2918564629383886, "language_loss": 0.87927508, "learning_rate": 3.967568424329949e-06, "loss": 0.90240371, "num_input_tokens_seen": 15007620, "step": 713, "time_per_iteration": 2.7147786617279053 }, { "auxiliary_loss_clip": 0.01093005, "auxiliary_loss_mlp": 0.0101246, "balance_loss_clip": 1.03179002, "balance_loss_mlp": 1.00521207, "epoch": 0.08585342391631094, "flos": 67302739319040.0, "grad_norm": 0.8217856743182581, "language_loss": 0.55501187, "learning_rate": 3.967428561748671e-06, "loss": 0.57606649, "num_input_tokens_seen": 15075590, "step": 714, "time_per_iteration": 3.3831307888031006 }, { "auxiliary_loss_clip": 0.01175245, "auxiliary_loss_mlp": 0.0107229, "balance_loss_clip": 1.05293727, "balance_loss_mlp": 1.04077125, "epoch": 0.08597366680695004, "flos": 22456594684800.0, "grad_norm": 1.7485589144555862, "language_loss": 0.87499082, "learning_rate": 3.967288400710045e-06, "loss": 0.89746624, "num_input_tokens_seen": 15095055, "step": 715, "time_per_iteration": 2.748455047607422 }, { "auxiliary_loss_clip": 0.01195805, "auxiliary_loss_mlp": 0.01081999, "balance_loss_clip": 1.06427169, "balance_loss_mlp": 1.05204129, "epoch": 0.08609390969758914, "flos": 23550505430400.0, "grad_norm": 2.018705161567106, "language_loss": 0.88399202, "learning_rate": 3.9671479412353335e-06, "loss": 0.90677011, "num_input_tokens_seen": 15113520, "step": 716, "time_per_iteration": 2.691201686859131 }, { "auxiliary_loss_clip": 0.01223956, "auxiliary_loss_mlp": 0.01069469, "balance_loss_clip": 1.06165075, "balance_loss_mlp": 1.03948784, "epoch": 0.08621415258822822, "flos": 25885848078720.0, "grad_norm": 2.089878674337531, "language_loss": 0.74287003, "learning_rate": 3.967007183345843e-06, "loss": 0.76580429, "num_input_tokens_seen": 15133375, "step": 717, "time_per_iteration": 2.6655681133270264 }, { "auxiliary_loss_clip": 0.01219753, "auxiliary_loss_mlp": 0.01074238, "balance_loss_clip": 1.06327748, "balance_loss_mlp": 1.04386353, "epoch": 0.08633439547886732, "flos": 13589568120960.0, "grad_norm": 2.132919672166487, "language_loss": 0.89214468, "learning_rate": 3.966866127062927e-06, "loss": 0.9150846, "num_input_tokens_seen": 15150500, "step": 718, "time_per_iteration": 2.673187017440796 }, { "auxiliary_loss_clip": 0.010957, "auxiliary_loss_mlp": 0.01009378, "balance_loss_clip": 1.0314157, "balance_loss_mlp": 1.00251126, "epoch": 0.0864546383695064, "flos": 57767342434560.0, "grad_norm": 0.8678190874918289, "language_loss": 0.6274178, "learning_rate": 3.966724772407982e-06, "loss": 0.64846855, "num_input_tokens_seen": 15208015, "step": 719, "time_per_iteration": 4.112178564071655 }, { "auxiliary_loss_clip": 0.01184472, "auxiliary_loss_mlp": 0.01086695, "balance_loss_clip": 1.05795336, "balance_loss_mlp": 1.05629659, "epoch": 0.0865748812601455, "flos": 20046952753920.0, "grad_norm": 2.003311476652617, "language_loss": 0.8860836, "learning_rate": 3.966583119402454e-06, "loss": 0.9087953, "num_input_tokens_seen": 15224780, "step": 720, "time_per_iteration": 3.661938190460205 }, { "auxiliary_loss_clip": 0.0121537, "auxiliary_loss_mlp": 0.00782384, "balance_loss_clip": 1.05895567, "balance_loss_mlp": 1.00065184, "epoch": 0.08669512415078459, "flos": 35262446935680.0, "grad_norm": 1.8250371810133663, "language_loss": 0.82082736, "learning_rate": 3.9664411680678305e-06, "loss": 0.84080493, "num_input_tokens_seen": 15246535, "step": 721, "time_per_iteration": 2.7496016025543213 }, { "auxiliary_loss_clip": 0.01071367, "auxiliary_loss_mlp": 0.01010284, "balance_loss_clip": 1.02580905, "balance_loss_mlp": 1.00389469, "epoch": 0.08681536704142367, "flos": 65654870048640.0, "grad_norm": 0.8424467622554627, "language_loss": 0.61475205, "learning_rate": 3.966298918425644e-06, "loss": 0.63556862, "num_input_tokens_seen": 15304025, "step": 722, "time_per_iteration": 4.175750732421875 }, { "auxiliary_loss_clip": 0.01227297, "auxiliary_loss_mlp": 0.01084608, "balance_loss_clip": 1.06321979, "balance_loss_mlp": 1.05317271, "epoch": 0.08693560993206277, "flos": 34529940881280.0, "grad_norm": 2.979433307218087, "language_loss": 0.8283096, "learning_rate": 3.966156370497476e-06, "loss": 0.85142869, "num_input_tokens_seen": 15327635, "step": 723, "time_per_iteration": 3.94876766204834 }, { "auxiliary_loss_clip": 0.01225335, "auxiliary_loss_mlp": 0.01068619, "balance_loss_clip": 1.06288409, "balance_loss_mlp": 1.03762484, "epoch": 0.08705585282270185, "flos": 23149419189120.0, "grad_norm": 1.7144223611200013, "language_loss": 0.88757539, "learning_rate": 3.96601352430495e-06, "loss": 0.91051483, "num_input_tokens_seen": 15347405, "step": 724, "time_per_iteration": 2.7588322162628174 }, { "auxiliary_loss_clip": 0.01205297, "auxiliary_loss_mlp": 0.01069788, "balance_loss_clip": 1.06020904, "balance_loss_mlp": 1.04005694, "epoch": 0.08717609571334095, "flos": 29497599498240.0, "grad_norm": 1.6718500671751222, "language_loss": 0.83326107, "learning_rate": 3.965870379869735e-06, "loss": 0.85601187, "num_input_tokens_seen": 15369450, "step": 725, "time_per_iteration": 2.829657793045044 }, { "auxiliary_loss_clip": 0.01216036, "auxiliary_loss_mlp": 0.01062682, "balance_loss_clip": 1.05791616, "balance_loss_mlp": 1.034549, "epoch": 0.08729633860398003, "flos": 20667489137280.0, "grad_norm": 2.2412244706153097, "language_loss": 0.87275189, "learning_rate": 3.965726937213547e-06, "loss": 0.8955391, "num_input_tokens_seen": 15388085, "step": 726, "time_per_iteration": 2.74792742729187 }, { "auxiliary_loss_clip": 0.01218181, "auxiliary_loss_mlp": 0.0106971, "balance_loss_clip": 1.05834389, "balance_loss_mlp": 1.03645039, "epoch": 0.08741658149461913, "flos": 18369493655040.0, "grad_norm": 2.235652209421066, "language_loss": 0.8099218, "learning_rate": 3.965583196358144e-06, "loss": 0.83280075, "num_input_tokens_seen": 15407120, "step": 727, "time_per_iteration": 2.706151247024536 }, { "auxiliary_loss_clip": 0.01233296, "auxiliary_loss_mlp": 0.0107186, "balance_loss_clip": 1.05907834, "balance_loss_mlp": 1.04112744, "epoch": 0.08753682438525823, "flos": 18729677283840.0, "grad_norm": 24.407808499343975, "language_loss": 0.7438125, "learning_rate": 3.965439157325335e-06, "loss": 0.766864, "num_input_tokens_seen": 15424485, "step": 728, "time_per_iteration": 2.6683011054992676 }, { "auxiliary_loss_clip": 0.01200219, "auxiliary_loss_mlp": 0.01083937, "balance_loss_clip": 1.05601919, "balance_loss_mlp": 1.04927111, "epoch": 0.08765706727589731, "flos": 27776113303680.0, "grad_norm": 2.1252580769633074, "language_loss": 0.75990307, "learning_rate": 3.965294820136968e-06, "loss": 0.78274465, "num_input_tokens_seen": 15446285, "step": 729, "time_per_iteration": 2.7766571044921875 }, { "auxiliary_loss_clip": 0.01209489, "auxiliary_loss_mlp": 0.01085735, "balance_loss_clip": 1.06013584, "balance_loss_mlp": 1.05574191, "epoch": 0.08777731016653641, "flos": 24389127239040.0, "grad_norm": 2.060657394154863, "language_loss": 0.87045228, "learning_rate": 3.965150184814938e-06, "loss": 0.89340448, "num_input_tokens_seen": 15465770, "step": 730, "time_per_iteration": 2.6789042949676514 }, { "auxiliary_loss_clip": 0.01189091, "auxiliary_loss_mlp": 0.01082062, "balance_loss_clip": 1.05712867, "balance_loss_mlp": 1.04980397, "epoch": 0.08789755305717549, "flos": 21981855605760.0, "grad_norm": 2.2543050671022264, "language_loss": 0.76529777, "learning_rate": 3.965005251381189e-06, "loss": 0.78800941, "num_input_tokens_seen": 15483705, "step": 731, "time_per_iteration": 2.722379446029663 }, { "auxiliary_loss_clip": 0.01087908, "auxiliary_loss_mlp": 0.01014379, "balance_loss_clip": 1.02050602, "balance_loss_mlp": 1.00848985, "epoch": 0.08801779594781459, "flos": 58360120583040.0, "grad_norm": 0.8882538266775809, "language_loss": 0.6460638, "learning_rate": 3.964860019857705e-06, "loss": 0.66708672, "num_input_tokens_seen": 15548620, "step": 732, "time_per_iteration": 3.2176711559295654 }, { "auxiliary_loss_clip": 0.01235471, "auxiliary_loss_mlp": 0.01065726, "balance_loss_clip": 1.06507182, "balance_loss_mlp": 1.03679407, "epoch": 0.08813803883845367, "flos": 23294785530240.0, "grad_norm": 1.7366976998601724, "language_loss": 0.84190488, "learning_rate": 3.964714490266518e-06, "loss": 0.86491686, "num_input_tokens_seen": 15569265, "step": 733, "time_per_iteration": 2.584880828857422 }, { "auxiliary_loss_clip": 0.01080931, "auxiliary_loss_mlp": 0.01013624, "balance_loss_clip": 1.01810455, "balance_loss_mlp": 1.00799751, "epoch": 0.08825828172909277, "flos": 63424924882560.0, "grad_norm": 0.8993163459118441, "language_loss": 0.64596343, "learning_rate": 3.964568662629706e-06, "loss": 0.66690898, "num_input_tokens_seen": 15630570, "step": 734, "time_per_iteration": 3.120495080947876 }, { "auxiliary_loss_clip": 0.01214743, "auxiliary_loss_mlp": 0.01070299, "balance_loss_clip": 1.05610573, "balance_loss_mlp": 1.03870821, "epoch": 0.08837852461973186, "flos": 26720986268160.0, "grad_norm": 2.1255013790625297, "language_loss": 0.84540677, "learning_rate": 3.9644225369693895e-06, "loss": 0.86825716, "num_input_tokens_seen": 15650870, "step": 735, "time_per_iteration": 2.662184715270996 }, { "auxiliary_loss_clip": 0.01238505, "auxiliary_loss_mlp": 0.01088731, "balance_loss_clip": 1.06631041, "balance_loss_mlp": 1.05772424, "epoch": 0.08849876751037095, "flos": 27265427688960.0, "grad_norm": 1.9473094661129335, "language_loss": 0.86728144, "learning_rate": 3.964276113307735e-06, "loss": 0.89055377, "num_input_tokens_seen": 15670835, "step": 736, "time_per_iteration": 2.7168285846710205 }, { "auxiliary_loss_clip": 0.01196105, "auxiliary_loss_mlp": 0.01078201, "balance_loss_clip": 1.06248951, "balance_loss_mlp": 1.04506111, "epoch": 0.08861901040101004, "flos": 19828759587840.0, "grad_norm": 1.9169578345161855, "language_loss": 0.80930996, "learning_rate": 3.9641293916669574e-06, "loss": 0.83205307, "num_input_tokens_seen": 15689795, "step": 737, "time_per_iteration": 2.668978214263916 }, { "auxiliary_loss_clip": 0.01187692, "auxiliary_loss_mlp": 0.01086727, "balance_loss_clip": 1.05656242, "balance_loss_mlp": 1.05525577, "epoch": 0.08873925329164913, "flos": 23658704173440.0, "grad_norm": 1.6450698652072597, "language_loss": 0.8252883, "learning_rate": 3.9639823720693115e-06, "loss": 0.84803247, "num_input_tokens_seen": 15711650, "step": 738, "time_per_iteration": 2.7826459407806396 }, { "auxiliary_loss_clip": 0.0106225, "auxiliary_loss_mlp": 0.01013036, "balance_loss_clip": 1.02211487, "balance_loss_mlp": 1.0055021, "epoch": 0.08885949618228822, "flos": 71831541893760.0, "grad_norm": 0.8257128540633473, "language_loss": 0.5999428, "learning_rate": 3.963835054537102e-06, "loss": 0.62069571, "num_input_tokens_seen": 15780615, "step": 739, "time_per_iteration": 3.3618111610412598 }, { "auxiliary_loss_clip": 0.01203819, "auxiliary_loss_mlp": 0.0107207, "balance_loss_clip": 1.060027, "balance_loss_mlp": 1.04210079, "epoch": 0.08897973907292732, "flos": 22346169298560.0, "grad_norm": 2.508809149886962, "language_loss": 0.60526299, "learning_rate": 3.963687439092676e-06, "loss": 0.6280219, "num_input_tokens_seen": 15801300, "step": 740, "time_per_iteration": 2.6573398113250732 }, { "auxiliary_loss_clip": 0.01221151, "auxiliary_loss_mlp": 0.01072391, "balance_loss_clip": 1.06468451, "balance_loss_mlp": 1.04177809, "epoch": 0.0890999819635664, "flos": 21251827589760.0, "grad_norm": 1.8839225359756537, "language_loss": 0.80721051, "learning_rate": 3.963539525758427e-06, "loss": 0.83014584, "num_input_tokens_seen": 15820860, "step": 741, "time_per_iteration": 2.6600935459136963 }, { "auxiliary_loss_clip": 0.01213981, "auxiliary_loss_mlp": 0.01066256, "balance_loss_clip": 1.06461728, "balance_loss_mlp": 1.03776455, "epoch": 0.0892202248542055, "flos": 25370888745600.0, "grad_norm": 2.050679089776448, "language_loss": 0.67940676, "learning_rate": 3.9633913145567925e-06, "loss": 0.70220917, "num_input_tokens_seen": 15841350, "step": 742, "time_per_iteration": 2.7113778591156006 }, { "auxiliary_loss_clip": 0.01204083, "auxiliary_loss_mlp": 0.01066901, "balance_loss_clip": 1.06028664, "balance_loss_mlp": 1.03712225, "epoch": 0.08934046774484458, "flos": 24457895827200.0, "grad_norm": 1.853581957364249, "language_loss": 0.81552786, "learning_rate": 3.9632428055102575e-06, "loss": 0.8382377, "num_input_tokens_seen": 15861360, "step": 743, "time_per_iteration": 2.71909236907959 }, { "auxiliary_loss_clip": 0.01219708, "auxiliary_loss_mlp": 0.01075962, "balance_loss_clip": 1.06179285, "balance_loss_mlp": 1.04475331, "epoch": 0.08946071063548368, "flos": 35772773414400.0, "grad_norm": 2.1145579585707197, "language_loss": 0.6696161, "learning_rate": 3.9630939986413495e-06, "loss": 0.69257283, "num_input_tokens_seen": 15883160, "step": 744, "time_per_iteration": 2.766995429992676 }, { "auxiliary_loss_clip": 0.01172933, "auxiliary_loss_mlp": 0.01076896, "balance_loss_clip": 1.05433559, "balance_loss_mlp": 1.04613996, "epoch": 0.08958095352612276, "flos": 14356584167040.0, "grad_norm": 1.7859599758592288, "language_loss": 0.7801227, "learning_rate": 3.962944893972643e-06, "loss": 0.80262101, "num_input_tokens_seen": 15901610, "step": 745, "time_per_iteration": 3.6475396156311035 }, { "auxiliary_loss_clip": 0.01203235, "auxiliary_loss_mlp": 0.0106636, "balance_loss_clip": 1.05997753, "balance_loss_mlp": 1.03589058, "epoch": 0.08970119641676186, "flos": 17853277345920.0, "grad_norm": 2.68101233359301, "language_loss": 0.90762794, "learning_rate": 3.962795491526756e-06, "loss": 0.93032384, "num_input_tokens_seen": 15918770, "step": 746, "time_per_iteration": 3.640228033065796 }, { "auxiliary_loss_clip": 0.01244859, "auxiliary_loss_mlp": 0.01074278, "balance_loss_clip": 1.06849182, "balance_loss_mlp": 1.04379594, "epoch": 0.08982143930740095, "flos": 20811670329600.0, "grad_norm": 2.267408284651698, "language_loss": 0.89559251, "learning_rate": 3.962645791326354e-06, "loss": 0.9187839, "num_input_tokens_seen": 15938025, "step": 747, "time_per_iteration": 2.553499937057495 }, { "auxiliary_loss_clip": 0.01214821, "auxiliary_loss_mlp": 0.01066908, "balance_loss_clip": 1.06161451, "balance_loss_mlp": 1.0388701, "epoch": 0.08994168219804004, "flos": 24097712198400.0, "grad_norm": 2.5515796601450624, "language_loss": 0.83102882, "learning_rate": 3.962495793394146e-06, "loss": 0.85384607, "num_input_tokens_seen": 15957215, "step": 748, "time_per_iteration": 4.4951171875 }, { "auxiliary_loss_clip": 0.01097015, "auxiliary_loss_mlp": 0.01026031, "balance_loss_clip": 1.02278972, "balance_loss_mlp": 1.02028465, "epoch": 0.09006192508867913, "flos": 57188893812480.0, "grad_norm": 0.746924459700977, "language_loss": 0.61247182, "learning_rate": 3.9623454977528864e-06, "loss": 0.63370228, "num_input_tokens_seen": 16015870, "step": 749, "time_per_iteration": 3.0212416648864746 }, { "auxiliary_loss_clip": 0.01191667, "auxiliary_loss_mlp": 0.01077027, "balance_loss_clip": 1.05799603, "balance_loss_mlp": 1.04596114, "epoch": 0.09018216797931822, "flos": 20487505063680.0, "grad_norm": 1.9875327053362248, "language_loss": 0.84894407, "learning_rate": 3.962194904425375e-06, "loss": 0.87163103, "num_input_tokens_seen": 16036500, "step": 750, "time_per_iteration": 2.699367046356201 }, { "auxiliary_loss_clip": 0.0121974, "auxiliary_loss_mlp": 0.0106555, "balance_loss_clip": 1.06274211, "balance_loss_mlp": 1.03643882, "epoch": 0.09030241086995731, "flos": 22638123043200.0, "grad_norm": 2.162397948153344, "language_loss": 0.67930514, "learning_rate": 3.9620440134344566e-06, "loss": 0.70215809, "num_input_tokens_seen": 16054655, "step": 751, "time_per_iteration": 2.696286678314209 }, { "auxiliary_loss_clip": 0.01190224, "auxiliary_loss_mlp": 0.01067733, "balance_loss_clip": 1.05713892, "balance_loss_mlp": 1.03844321, "epoch": 0.09042265376059641, "flos": 21871502046720.0, "grad_norm": 2.886671580585938, "language_loss": 0.82349408, "learning_rate": 3.9618928248030215e-06, "loss": 0.84607363, "num_input_tokens_seen": 16074165, "step": 752, "time_per_iteration": 2.6782288551330566 }, { "auxiliary_loss_clip": 0.01219199, "auxiliary_loss_mlp": 0.0108074, "balance_loss_clip": 1.06301439, "balance_loss_mlp": 1.04931617, "epoch": 0.0905428966512355, "flos": 24316192673280.0, "grad_norm": 2.03584106778214, "language_loss": 0.82779604, "learning_rate": 3.961741338554005e-06, "loss": 0.85079539, "num_input_tokens_seen": 16092505, "step": 753, "time_per_iteration": 2.6316401958465576 }, { "auxiliary_loss_clip": 0.01217045, "auxiliary_loss_mlp": 0.01070357, "balance_loss_clip": 1.06188655, "balance_loss_mlp": 1.03962517, "epoch": 0.09066313954187459, "flos": 35845061535360.0, "grad_norm": 1.8220829495059774, "language_loss": 0.75889611, "learning_rate": 3.9615895547103865e-06, "loss": 0.78177011, "num_input_tokens_seen": 16116150, "step": 754, "time_per_iteration": 2.7519283294677734 }, { "auxiliary_loss_clip": 0.01200857, "auxiliary_loss_mlp": 0.01071566, "balance_loss_clip": 1.05819821, "balance_loss_mlp": 1.04111969, "epoch": 0.09078338243251367, "flos": 29168729550720.0, "grad_norm": 3.9194742156536453, "language_loss": 0.77857625, "learning_rate": 3.961437473295193e-06, "loss": 0.80130053, "num_input_tokens_seen": 16136295, "step": 755, "time_per_iteration": 2.70273494720459 }, { "auxiliary_loss_clip": 0.0115317, "auxiliary_loss_mlp": 0.01070556, "balance_loss_clip": 1.04847574, "balance_loss_mlp": 1.03965712, "epoch": 0.09090362532315277, "flos": 21907699977600.0, "grad_norm": 2.628436879299095, "language_loss": 0.72330976, "learning_rate": 3.961285094331495e-06, "loss": 0.74554706, "num_input_tokens_seen": 16154210, "step": 756, "time_per_iteration": 2.717642068862915 }, { "auxiliary_loss_clip": 0.01234846, "auxiliary_loss_mlp": 0.01079615, "balance_loss_clip": 1.06471252, "balance_loss_mlp": 1.0521971, "epoch": 0.09102386821379185, "flos": 27344503480320.0, "grad_norm": 1.8567483684443276, "language_loss": 0.85975802, "learning_rate": 3.961132417842406e-06, "loss": 0.88290262, "num_input_tokens_seen": 16173995, "step": 757, "time_per_iteration": 2.632869243621826 }, { "auxiliary_loss_clip": 0.01208289, "auxiliary_loss_mlp": 0.01078962, "balance_loss_clip": 1.06107974, "balance_loss_mlp": 1.04935026, "epoch": 0.09114411110443095, "flos": 20813501923200.0, "grad_norm": 2.9324180366469714, "language_loss": 0.75324392, "learning_rate": 3.960979443851089e-06, "loss": 0.77611637, "num_input_tokens_seen": 16191020, "step": 758, "time_per_iteration": 2.615554094314575 }, { "auxiliary_loss_clip": 0.0119846, "auxiliary_loss_mlp": 0.01080325, "balance_loss_clip": 1.05648971, "balance_loss_mlp": 1.04742289, "epoch": 0.09126435399507005, "flos": 26145949438080.0, "grad_norm": 1.7589671024094982, "language_loss": 0.78958821, "learning_rate": 3.96082617238075e-06, "loss": 0.81237602, "num_input_tokens_seen": 16213645, "step": 759, "time_per_iteration": 2.7124009132385254 }, { "auxiliary_loss_clip": 0.01197559, "auxiliary_loss_mlp": 0.01071204, "balance_loss_clip": 1.05800927, "balance_loss_mlp": 1.04279661, "epoch": 0.09138459688570913, "flos": 24388911757440.0, "grad_norm": 2.248579620909605, "language_loss": 0.79608309, "learning_rate": 3.960672603454639e-06, "loss": 0.81877065, "num_input_tokens_seen": 16233625, "step": 760, "time_per_iteration": 2.6837055683135986 }, { "auxiliary_loss_clip": 0.01208201, "auxiliary_loss_mlp": 0.01069593, "balance_loss_clip": 1.05620658, "balance_loss_mlp": 1.03805053, "epoch": 0.09150483977634823, "flos": 21032664756480.0, "grad_norm": 2.7472665429636303, "language_loss": 0.77207005, "learning_rate": 3.960518737096054e-06, "loss": 0.79484797, "num_input_tokens_seen": 16253255, "step": 761, "time_per_iteration": 2.6735429763793945 }, { "auxiliary_loss_clip": 0.01220655, "auxiliary_loss_mlp": 0.01076657, "balance_loss_clip": 1.0608474, "balance_loss_mlp": 1.04801106, "epoch": 0.09162508266698731, "flos": 22856998567680.0, "grad_norm": 2.350635759212103, "language_loss": 0.73130655, "learning_rate": 3.960364573328334e-06, "loss": 0.75427967, "num_input_tokens_seen": 16272580, "step": 762, "time_per_iteration": 2.606306314468384 }, { "auxiliary_loss_clip": 0.01196057, "auxiliary_loss_mlp": 0.0106335, "balance_loss_clip": 1.0592134, "balance_loss_mlp": 1.03338122, "epoch": 0.0917453255576264, "flos": 21724411852800.0, "grad_norm": 2.7342503170196006, "language_loss": 0.8885085, "learning_rate": 3.9602101121748675e-06, "loss": 0.91110259, "num_input_tokens_seen": 16293075, "step": 763, "time_per_iteration": 2.6816694736480713 }, { "auxiliary_loss_clip": 0.01204857, "auxiliary_loss_mlp": 0.01065647, "balance_loss_clip": 1.06137109, "balance_loss_mlp": 1.03462851, "epoch": 0.0918655684482655, "flos": 14609215497600.0, "grad_norm": 2.2134537395278273, "language_loss": 0.72695398, "learning_rate": 3.960055353659085e-06, "loss": 0.74965906, "num_input_tokens_seen": 16310185, "step": 764, "time_per_iteration": 2.592930316925049 }, { "auxiliary_loss_clip": 0.01190673, "auxiliary_loss_mlp": 0.01083304, "balance_loss_clip": 1.05679345, "balance_loss_mlp": 1.05264378, "epoch": 0.09198581133890459, "flos": 23435016226560.0, "grad_norm": 1.8279423477200205, "language_loss": 0.83708864, "learning_rate": 3.959900297804465e-06, "loss": 0.85982835, "num_input_tokens_seen": 16330355, "step": 765, "time_per_iteration": 2.6878185272216797 }, { "auxiliary_loss_clip": 0.01187817, "auxiliary_loss_mlp": 0.01068684, "balance_loss_clip": 1.05784094, "balance_loss_mlp": 1.03850079, "epoch": 0.09210605422954368, "flos": 16795887753600.0, "grad_norm": 2.1802934594355543, "language_loss": 0.77485144, "learning_rate": 3.9597449446345276e-06, "loss": 0.79741645, "num_input_tokens_seen": 16347600, "step": 766, "time_per_iteration": 2.5818557739257812 }, { "auxiliary_loss_clip": 0.01186585, "auxiliary_loss_mlp": 0.01073686, "balance_loss_clip": 1.0568769, "balance_loss_mlp": 1.04524291, "epoch": 0.09222629712018277, "flos": 22674249146880.0, "grad_norm": 2.2481984260684436, "language_loss": 0.83236861, "learning_rate": 3.95958929417284e-06, "loss": 0.85497129, "num_input_tokens_seen": 16365755, "step": 767, "time_per_iteration": 2.640455484390259 }, { "auxiliary_loss_clip": 0.01082439, "auxiliary_loss_mlp": 0.01007754, "balance_loss_clip": 1.0165422, "balance_loss_mlp": 1.00253296, "epoch": 0.09234654001082186, "flos": 69976756327680.0, "grad_norm": 0.7345919636196383, "language_loss": 0.58778811, "learning_rate": 3.9594333464430145e-06, "loss": 0.60869002, "num_input_tokens_seen": 16435245, "step": 768, "time_per_iteration": 3.3740603923797607 }, { "auxiliary_loss_clip": 0.01131043, "auxiliary_loss_mlp": 0.01072164, "balance_loss_clip": 1.04832172, "balance_loss_mlp": 1.04355407, "epoch": 0.09246678290146094, "flos": 20011437181440.0, "grad_norm": 1.9125549650243647, "language_loss": 0.88242972, "learning_rate": 3.959277101468709e-06, "loss": 0.90446174, "num_input_tokens_seen": 16454795, "step": 769, "time_per_iteration": 2.896559476852417 }, { "auxiliary_loss_clip": 0.01182969, "auxiliary_loss_mlp": 0.01073419, "balance_loss_clip": 1.05508924, "balance_loss_mlp": 1.04387927, "epoch": 0.09258702579210004, "flos": 17747448900480.0, "grad_norm": 2.1827388008324267, "language_loss": 0.78546441, "learning_rate": 3.959120559273624e-06, "loss": 0.80802822, "num_input_tokens_seen": 16472580, "step": 770, "time_per_iteration": 2.908935546875 }, { "auxiliary_loss_clip": 0.01185808, "auxiliary_loss_mlp": 0.01071938, "balance_loss_clip": 1.05722499, "balance_loss_mlp": 1.04320872, "epoch": 0.09270726868273914, "flos": 20886544229760.0, "grad_norm": 2.0136195298395476, "language_loss": 0.83661056, "learning_rate": 3.958963719881509e-06, "loss": 0.85918796, "num_input_tokens_seen": 16490670, "step": 771, "time_per_iteration": 3.5942177772521973 }, { "auxiliary_loss_clip": 0.01223999, "auxiliary_loss_mlp": 0.01068531, "balance_loss_clip": 1.06997836, "balance_loss_mlp": 1.04152989, "epoch": 0.09282751157337822, "flos": 17015697031680.0, "grad_norm": 1.8511774183589849, "language_loss": 0.93550175, "learning_rate": 3.958806583316154e-06, "loss": 0.95842707, "num_input_tokens_seen": 16508640, "step": 772, "time_per_iteration": 3.666017770767212 }, { "auxiliary_loss_clip": 0.01239231, "auxiliary_loss_mlp": 0.01072976, "balance_loss_clip": 1.06724143, "balance_loss_mlp": 1.04322112, "epoch": 0.09294775446401732, "flos": 32523647748480.0, "grad_norm": 2.587671902419299, "language_loss": 0.78704631, "learning_rate": 3.9586491496013985e-06, "loss": 0.81016839, "num_input_tokens_seen": 16531035, "step": 773, "time_per_iteration": 2.736044406890869 }, { "auxiliary_loss_clip": 0.01226768, "auxiliary_loss_mlp": 0.01073304, "balance_loss_clip": 1.06354046, "balance_loss_mlp": 1.04500353, "epoch": 0.0930679973546564, "flos": 18259750627200.0, "grad_norm": 2.037667722765316, "language_loss": 0.83170247, "learning_rate": 3.958491418761124e-06, "loss": 0.85470331, "num_input_tokens_seen": 16548605, "step": 774, "time_per_iteration": 3.5638341903686523 }, { "auxiliary_loss_clip": 0.01204127, "auxiliary_loss_mlp": 0.01069016, "balance_loss_clip": 1.05910504, "balance_loss_mlp": 1.03940439, "epoch": 0.0931882402452955, "flos": 21099745405440.0, "grad_norm": 7.978622252243272, "language_loss": 0.73050123, "learning_rate": 3.958333390819258e-06, "loss": 0.75323272, "num_input_tokens_seen": 16565535, "step": 775, "time_per_iteration": 2.619147539138794 }, { "auxiliary_loss_clip": 0.01232127, "auxiliary_loss_mlp": 0.01087248, "balance_loss_clip": 1.06481504, "balance_loss_mlp": 1.0565393, "epoch": 0.0933084831359346, "flos": 24207275658240.0, "grad_norm": 2.5878392062378857, "language_loss": 0.8006345, "learning_rate": 3.9581750657997754e-06, "loss": 0.82382828, "num_input_tokens_seen": 16584900, "step": 776, "time_per_iteration": 2.635972023010254 }, { "auxiliary_loss_clip": 0.01199015, "auxiliary_loss_mlp": 0.01064888, "balance_loss_clip": 1.05892813, "balance_loss_mlp": 1.03622973, "epoch": 0.09342872602657368, "flos": 25480272637440.0, "grad_norm": 2.558035147969105, "language_loss": 0.89532834, "learning_rate": 3.95801644372669e-06, "loss": 0.91796738, "num_input_tokens_seen": 16604805, "step": 777, "time_per_iteration": 2.6511693000793457 }, { "auxiliary_loss_clip": 0.0121122, "auxiliary_loss_mlp": 0.01064249, "balance_loss_clip": 1.05861545, "balance_loss_mlp": 1.03771269, "epoch": 0.09354896891721277, "flos": 23149060053120.0, "grad_norm": 2.1686249149874133, "language_loss": 0.84399194, "learning_rate": 3.957857524624068e-06, "loss": 0.8667466, "num_input_tokens_seen": 16623685, "step": 778, "time_per_iteration": 2.749464511871338 }, { "auxiliary_loss_clip": 0.01201402, "auxiliary_loss_mlp": 0.01073443, "balance_loss_clip": 1.05616367, "balance_loss_mlp": 1.04442799, "epoch": 0.09366921180785186, "flos": 24279563779200.0, "grad_norm": 1.618839753873244, "language_loss": 0.89438838, "learning_rate": 3.957698308516016e-06, "loss": 0.91713685, "num_input_tokens_seen": 16644985, "step": 779, "time_per_iteration": 2.682359218597412 }, { "auxiliary_loss_clip": 0.01216686, "auxiliary_loss_mlp": 0.0078162, "balance_loss_clip": 1.06606519, "balance_loss_mlp": 1.00070548, "epoch": 0.09378945469849095, "flos": 18730036419840.0, "grad_norm": 1.769755654475778, "language_loss": 0.82724267, "learning_rate": 3.957538795426688e-06, "loss": 0.84722573, "num_input_tokens_seen": 16662410, "step": 780, "time_per_iteration": 2.6397311687469482 }, { "auxiliary_loss_clip": 0.01201411, "auxiliary_loss_mlp": 0.01069915, "balance_loss_clip": 1.05742311, "balance_loss_mlp": 1.04153085, "epoch": 0.09390969758913004, "flos": 23218834222080.0, "grad_norm": 5.048235598631644, "language_loss": 0.76909697, "learning_rate": 3.9573789853802804e-06, "loss": 0.79181015, "num_input_tokens_seen": 16680885, "step": 781, "time_per_iteration": 2.621096134185791 }, { "auxiliary_loss_clip": 0.01204321, "auxiliary_loss_mlp": 0.00781738, "balance_loss_clip": 1.05989373, "balance_loss_mlp": 1.00071454, "epoch": 0.09402994047976913, "flos": 19646728439040.0, "grad_norm": 2.13707352591941, "language_loss": 0.74599457, "learning_rate": 3.957218878401037e-06, "loss": 0.76585519, "num_input_tokens_seen": 16699375, "step": 782, "time_per_iteration": 2.6439807415008545 }, { "auxiliary_loss_clip": 0.01234034, "auxiliary_loss_mlp": 0.01079428, "balance_loss_clip": 1.066921, "balance_loss_mlp": 1.04962587, "epoch": 0.09415018337040823, "flos": 29420463041280.0, "grad_norm": 2.047201479775014, "language_loss": 0.89364332, "learning_rate": 3.957058474513246e-06, "loss": 0.91677797, "num_input_tokens_seen": 16719230, "step": 783, "time_per_iteration": 2.6562957763671875 }, { "auxiliary_loss_clip": 0.01218951, "auxiliary_loss_mlp": 0.01070237, "balance_loss_clip": 1.06438494, "balance_loss_mlp": 1.04210377, "epoch": 0.09427042626104731, "flos": 24572092141440.0, "grad_norm": 1.745115763086254, "language_loss": 0.78734374, "learning_rate": 3.956897773741241e-06, "loss": 0.81023562, "num_input_tokens_seen": 16738220, "step": 784, "time_per_iteration": 2.6230456829071045 }, { "auxiliary_loss_clip": 0.01190278, "auxiliary_loss_mlp": 0.01057064, "balance_loss_clip": 1.05902696, "balance_loss_mlp": 1.0272615, "epoch": 0.09439066915168641, "flos": 26359581576960.0, "grad_norm": 2.3389586974332492, "language_loss": 0.7171579, "learning_rate": 3.956736776109398e-06, "loss": 0.7396313, "num_input_tokens_seen": 16759395, "step": 785, "time_per_iteration": 2.7130892276763916 }, { "auxiliary_loss_clip": 0.0120313, "auxiliary_loss_mlp": 0.00781485, "balance_loss_clip": 1.05884647, "balance_loss_mlp": 1.00082636, "epoch": 0.09451091204232549, "flos": 19427278296960.0, "grad_norm": 3.6290355469595506, "language_loss": 0.83908141, "learning_rate": 3.956575481642143e-06, "loss": 0.85892761, "num_input_tokens_seen": 16778285, "step": 786, "time_per_iteration": 2.603935956954956 }, { "auxiliary_loss_clip": 0.01170937, "auxiliary_loss_mlp": 0.01075206, "balance_loss_clip": 1.0561502, "balance_loss_mlp": 1.0454514, "epoch": 0.09463115493296459, "flos": 25368051571200.0, "grad_norm": 2.5386840641774735, "language_loss": 0.74962306, "learning_rate": 3.956413890363943e-06, "loss": 0.77208447, "num_input_tokens_seen": 16795265, "step": 787, "time_per_iteration": 2.703078508377075 }, { "auxiliary_loss_clip": 0.01215698, "auxiliary_loss_mlp": 0.01075189, "balance_loss_clip": 1.06041312, "balance_loss_mlp": 1.04723489, "epoch": 0.09475139782360369, "flos": 10123254869760.0, "grad_norm": 2.1758394350562544, "language_loss": 0.81650686, "learning_rate": 3.956252002299312e-06, "loss": 0.83941573, "num_input_tokens_seen": 16811165, "step": 788, "time_per_iteration": 2.5796761512756348 }, { "auxiliary_loss_clip": 0.01231107, "auxiliary_loss_mlp": 0.01064775, "balance_loss_clip": 1.06256914, "balance_loss_mlp": 1.03378022, "epoch": 0.09487164071424277, "flos": 17231088936960.0, "grad_norm": 1.9702973198191331, "language_loss": 0.90923607, "learning_rate": 3.956089817472807e-06, "loss": 0.93219483, "num_input_tokens_seen": 16828470, "step": 789, "time_per_iteration": 2.545781135559082 }, { "auxiliary_loss_clip": 0.01202881, "auxiliary_loss_mlp": 0.01062209, "balance_loss_clip": 1.06493556, "balance_loss_mlp": 1.03450453, "epoch": 0.09499188360488187, "flos": 30849564528000.0, "grad_norm": 2.0872259859192503, "language_loss": 0.85649657, "learning_rate": 3.955927335909032e-06, "loss": 0.87914753, "num_input_tokens_seen": 16851680, "step": 790, "time_per_iteration": 2.707141876220703 }, { "auxiliary_loss_clip": 0.01174958, "auxiliary_loss_mlp": 0.0106761, "balance_loss_clip": 1.0623045, "balance_loss_mlp": 1.03902388, "epoch": 0.09511212649552095, "flos": 29351694453120.0, "grad_norm": 2.0992060227422056, "language_loss": 0.75859505, "learning_rate": 3.955764557632634e-06, "loss": 0.78102076, "num_input_tokens_seen": 16871490, "step": 791, "time_per_iteration": 2.7791571617126465 }, { "auxiliary_loss_clip": 0.01193905, "auxiliary_loss_mlp": 0.01078764, "balance_loss_clip": 1.06007576, "balance_loss_mlp": 1.04934287, "epoch": 0.09523236938616005, "flos": 10378687461120.0, "grad_norm": 2.300846750634406, "language_loss": 0.94766986, "learning_rate": 3.955601482668309e-06, "loss": 0.97039658, "num_input_tokens_seen": 16889350, "step": 792, "time_per_iteration": 2.6162400245666504 }, { "auxiliary_loss_clip": 0.01165104, "auxiliary_loss_mlp": 0.01080775, "balance_loss_clip": 1.05094373, "balance_loss_mlp": 1.04584646, "epoch": 0.09535261227679913, "flos": 19061815368960.0, "grad_norm": 1.8109329232025997, "language_loss": 0.88759786, "learning_rate": 3.955438111040794e-06, "loss": 0.91005665, "num_input_tokens_seen": 16907625, "step": 793, "time_per_iteration": 2.673604965209961 }, { "auxiliary_loss_clip": 0.01164645, "auxiliary_loss_mlp": 0.01075275, "balance_loss_clip": 1.05171919, "balance_loss_mlp": 1.04580641, "epoch": 0.09547285516743823, "flos": 20922993555840.0, "grad_norm": 1.7749867184589676, "language_loss": 0.79939401, "learning_rate": 3.955274442774873e-06, "loss": 0.8217932, "num_input_tokens_seen": 16926205, "step": 794, "time_per_iteration": 2.7051773071289062 }, { "auxiliary_loss_clip": 0.01215911, "auxiliary_loss_mlp": 0.0106409, "balance_loss_clip": 1.06150353, "balance_loss_mlp": 1.03588486, "epoch": 0.09559309805807732, "flos": 30154405639680.0, "grad_norm": 2.034574078401785, "language_loss": 0.70805621, "learning_rate": 3.9551104778953725e-06, "loss": 0.73085618, "num_input_tokens_seen": 16946500, "step": 795, "time_per_iteration": 2.6672441959381104 }, { "auxiliary_loss_clip": 0.01195152, "auxiliary_loss_mlp": 0.01064912, "balance_loss_clip": 1.061028, "balance_loss_mlp": 1.03613496, "epoch": 0.0957133409487164, "flos": 21066743784960.0, "grad_norm": 1.7226373481126243, "language_loss": 0.84978735, "learning_rate": 3.954946216427167e-06, "loss": 0.87238806, "num_input_tokens_seen": 16966960, "step": 796, "time_per_iteration": 2.6888160705566406 }, { "auxiliary_loss_clip": 0.01054549, "auxiliary_loss_mlp": 0.01011434, "balance_loss_clip": 1.01801777, "balance_loss_mlp": 1.0053786, "epoch": 0.0958335838393555, "flos": 71297979315840.0, "grad_norm": 0.9734153464151858, "language_loss": 0.61605299, "learning_rate": 3.954781658395176e-06, "loss": 0.63671291, "num_input_tokens_seen": 17023215, "step": 797, "time_per_iteration": 4.136622428894043 }, { "auxiliary_loss_clip": 0.01210748, "auxiliary_loss_mlp": 0.01088855, "balance_loss_clip": 1.06240749, "balance_loss_mlp": 1.05728865, "epoch": 0.09595382672999458, "flos": 21872974504320.0, "grad_norm": 3.3043223308297747, "language_loss": 0.9240548, "learning_rate": 3.95461680382436e-06, "loss": 0.94705087, "num_input_tokens_seen": 17042140, "step": 798, "time_per_iteration": 3.677520275115967 }, { "auxiliary_loss_clip": 0.01217512, "auxiliary_loss_mlp": 0.01078373, "balance_loss_clip": 1.06292152, "balance_loss_mlp": 1.04836798, "epoch": 0.09607406962063368, "flos": 18695562341760.0, "grad_norm": 3.325126511029615, "language_loss": 0.86132073, "learning_rate": 3.9544516527397295e-06, "loss": 0.88427961, "num_input_tokens_seen": 17058490, "step": 799, "time_per_iteration": 2.6289727687835693 }, { "auxiliary_loss_clip": 0.01187265, "auxiliary_loss_mlp": 0.01071151, "balance_loss_clip": 1.05879796, "balance_loss_mlp": 1.04025197, "epoch": 0.09619431251127276, "flos": 22568456615040.0, "grad_norm": 1.6673768660504409, "language_loss": 0.8055073, "learning_rate": 3.954286205166338e-06, "loss": 0.82809144, "num_input_tokens_seen": 17079655, "step": 800, "time_per_iteration": 3.6535260677337646 }, { "auxiliary_loss_clip": 0.01227897, "auxiliary_loss_mlp": 0.01081118, "balance_loss_clip": 1.06731796, "balance_loss_mlp": 1.04876482, "epoch": 0.09631455540191186, "flos": 14246230608000.0, "grad_norm": 2.030719241868603, "language_loss": 0.83757293, "learning_rate": 3.954120461129282e-06, "loss": 0.86066306, "num_input_tokens_seen": 17097065, "step": 801, "time_per_iteration": 3.5094072818756104 }, { "auxiliary_loss_clip": 0.01236244, "auxiliary_loss_mlp": 0.01076117, "balance_loss_clip": 1.06816697, "balance_loss_mlp": 1.04652965, "epoch": 0.09643479829255096, "flos": 20740387789440.0, "grad_norm": 1.9284784404082507, "language_loss": 0.83690763, "learning_rate": 3.953954420653706e-06, "loss": 0.86003125, "num_input_tokens_seen": 17114090, "step": 802, "time_per_iteration": 2.5468177795410156 }, { "auxiliary_loss_clip": 0.01215777, "auxiliary_loss_mlp": 0.01069531, "balance_loss_clip": 1.06514049, "balance_loss_mlp": 1.04018188, "epoch": 0.09655504118319004, "flos": 24420476833920.0, "grad_norm": 2.0377369335873277, "language_loss": 0.88371861, "learning_rate": 3.953788083764798e-06, "loss": 0.90657169, "num_input_tokens_seen": 17133325, "step": 803, "time_per_iteration": 2.6808128356933594 }, { "auxiliary_loss_clip": 0.01173418, "auxiliary_loss_mlp": 0.01075454, "balance_loss_clip": 1.05878222, "balance_loss_mlp": 1.04453135, "epoch": 0.09667528407382914, "flos": 18441961344000.0, "grad_norm": 1.9107025486413138, "language_loss": 0.91793472, "learning_rate": 3.953621450487792e-06, "loss": 0.94042349, "num_input_tokens_seen": 17151945, "step": 804, "time_per_iteration": 2.6628847122192383 }, { "auxiliary_loss_clip": 0.01090379, "auxiliary_loss_mlp": 0.01005915, "balance_loss_clip": 1.01920247, "balance_loss_mlp": 1.00045514, "epoch": 0.09679552696446822, "flos": 70816455544320.0, "grad_norm": 1.244283477662388, "language_loss": 0.61221147, "learning_rate": 3.953454520847964e-06, "loss": 0.63317442, "num_input_tokens_seen": 17216790, "step": 805, "time_per_iteration": 3.361769199371338 }, { "auxiliary_loss_clip": 0.01198339, "auxiliary_loss_mlp": 0.01086799, "balance_loss_clip": 1.05812132, "balance_loss_mlp": 1.05380213, "epoch": 0.09691576985510732, "flos": 21945514020480.0, "grad_norm": 2.1533351726188856, "language_loss": 0.73703873, "learning_rate": 3.9532872948706395e-06, "loss": 0.75989008, "num_input_tokens_seen": 17236285, "step": 806, "time_per_iteration": 2.6696465015411377 }, { "auxiliary_loss_clip": 0.01204696, "auxiliary_loss_mlp": 0.01072934, "balance_loss_clip": 1.06300831, "balance_loss_mlp": 1.04332244, "epoch": 0.09703601274574641, "flos": 17965211103360.0, "grad_norm": 2.263587811471747, "language_loss": 0.82737553, "learning_rate": 3.9531197725811845e-06, "loss": 0.85015184, "num_input_tokens_seen": 17251670, "step": 807, "time_per_iteration": 2.584963083267212 }, { "auxiliary_loss_clip": 0.01228147, "auxiliary_loss_mlp": 0.01066819, "balance_loss_clip": 1.06692958, "balance_loss_mlp": 1.03775585, "epoch": 0.0971562556363855, "flos": 22162162901760.0, "grad_norm": 2.2899955245298607, "language_loss": 0.87925917, "learning_rate": 3.952951954005013e-06, "loss": 0.90220881, "num_input_tokens_seen": 17271355, "step": 808, "time_per_iteration": 2.600796937942505 }, { "auxiliary_loss_clip": 0.01194121, "auxiliary_loss_mlp": 0.01067037, "balance_loss_clip": 1.05717826, "balance_loss_mlp": 1.03842711, "epoch": 0.0972764985270246, "flos": 25848716394240.0, "grad_norm": 1.6402042147790514, "language_loss": 0.84429699, "learning_rate": 3.952783839167584e-06, "loss": 0.86690861, "num_input_tokens_seen": 17291400, "step": 809, "time_per_iteration": 2.6787831783294678 }, { "auxiliary_loss_clip": 0.01217518, "auxiliary_loss_mlp": 0.01063132, "balance_loss_clip": 1.0610801, "balance_loss_mlp": 1.03235221, "epoch": 0.09739674141766368, "flos": 20339373375360.0, "grad_norm": 2.698545855470976, "language_loss": 0.7435559, "learning_rate": 3.952615428094398e-06, "loss": 0.76636243, "num_input_tokens_seen": 17310920, "step": 810, "time_per_iteration": 2.6426944732666016 }, { "auxiliary_loss_clip": 0.01162003, "auxiliary_loss_mlp": 0.0108177, "balance_loss_clip": 1.0550096, "balance_loss_mlp": 1.04829597, "epoch": 0.09751698430830277, "flos": 15743059188480.0, "grad_norm": 1.6182713402585711, "language_loss": 0.73476499, "learning_rate": 3.952446720811004e-06, "loss": 0.75720274, "num_input_tokens_seen": 17329245, "step": 811, "time_per_iteration": 2.677402973175049 }, { "auxiliary_loss_clip": 0.01056774, "auxiliary_loss_mlp": 0.01013179, "balance_loss_clip": 1.0201453, "balance_loss_mlp": 1.00807667, "epoch": 0.09763722719894186, "flos": 63716806800000.0, "grad_norm": 0.8318705688963045, "language_loss": 0.63587213, "learning_rate": 3.952277717342995e-06, "loss": 0.65657163, "num_input_tokens_seen": 17395680, "step": 812, "time_per_iteration": 3.347153902053833 }, { "auxiliary_loss_clip": 0.01210038, "auxiliary_loss_mlp": 0.010725, "balance_loss_clip": 1.06323242, "balance_loss_mlp": 1.0394311, "epoch": 0.09775747008958095, "flos": 22090916275200.0, "grad_norm": 2.0788150632960614, "language_loss": 0.85540468, "learning_rate": 3.952108417716009e-06, "loss": 0.87823009, "num_input_tokens_seen": 17415135, "step": 813, "time_per_iteration": 2.6683382987976074 }, { "auxiliary_loss_clip": 0.01217765, "auxiliary_loss_mlp": 0.01086553, "balance_loss_clip": 1.06623554, "balance_loss_mlp": 1.05386531, "epoch": 0.09787771298022005, "flos": 21286050272640.0, "grad_norm": 1.8679247539378774, "language_loss": 0.8476429, "learning_rate": 3.951938821955727e-06, "loss": 0.87068605, "num_input_tokens_seen": 17434535, "step": 814, "time_per_iteration": 2.647611379623413 }, { "auxiliary_loss_clip": 0.01193877, "auxiliary_loss_mlp": 0.01068789, "balance_loss_clip": 1.05934632, "balance_loss_mlp": 1.03731751, "epoch": 0.09799795587085913, "flos": 22054574689920.0, "grad_norm": 1.9817908624716003, "language_loss": 0.76619482, "learning_rate": 3.9517689300878786e-06, "loss": 0.78882146, "num_input_tokens_seen": 17454270, "step": 815, "time_per_iteration": 2.6512367725372314 }, { "auxiliary_loss_clip": 0.01232493, "auxiliary_loss_mlp": 0.01074654, "balance_loss_clip": 1.06625378, "balance_loss_mlp": 1.04423201, "epoch": 0.09811819876149823, "flos": 22163743100160.0, "grad_norm": 1.8403551837461751, "language_loss": 0.78734207, "learning_rate": 3.951598742138236e-06, "loss": 0.81041354, "num_input_tokens_seen": 17472995, "step": 816, "time_per_iteration": 2.600351333618164 }, { "auxiliary_loss_clip": 0.01200889, "auxiliary_loss_mlp": 0.01069853, "balance_loss_clip": 1.05590558, "balance_loss_mlp": 1.03888202, "epoch": 0.09823844165213731, "flos": 22231111057920.0, "grad_norm": 2.1187209235671425, "language_loss": 0.80321538, "learning_rate": 3.951428258132615e-06, "loss": 0.82592285, "num_input_tokens_seen": 17491115, "step": 817, "time_per_iteration": 2.657989978790283 }, { "auxiliary_loss_clip": 0.01209503, "auxiliary_loss_mlp": 0.0106898, "balance_loss_clip": 1.06240392, "balance_loss_mlp": 1.03982091, "epoch": 0.09835868454277641, "flos": 22487728798080.0, "grad_norm": 2.0604247445401622, "language_loss": 0.84544957, "learning_rate": 3.951257478096879e-06, "loss": 0.86823434, "num_input_tokens_seen": 17509480, "step": 818, "time_per_iteration": 2.656342029571533 }, { "auxiliary_loss_clip": 0.01205508, "auxiliary_loss_mlp": 0.00782873, "balance_loss_clip": 1.063869, "balance_loss_mlp": 1.00072527, "epoch": 0.0984789274334155, "flos": 16362554077440.0, "grad_norm": 3.124780990893009, "language_loss": 0.6833508, "learning_rate": 3.951086402056936e-06, "loss": 0.70323455, "num_input_tokens_seen": 17524080, "step": 819, "time_per_iteration": 2.613417863845825 }, { "auxiliary_loss_clip": 0.01142985, "auxiliary_loss_mlp": 0.00782289, "balance_loss_clip": 1.05862522, "balance_loss_mlp": 1.00073218, "epoch": 0.09859917032405459, "flos": 24243545416320.0, "grad_norm": 1.6146389907373815, "language_loss": 0.83814228, "learning_rate": 3.950915030038735e-06, "loss": 0.85739499, "num_input_tokens_seen": 17543875, "step": 820, "time_per_iteration": 2.800478935241699 }, { "auxiliary_loss_clip": 0.01209503, "auxiliary_loss_mlp": 0.01059987, "balance_loss_clip": 1.06457686, "balance_loss_mlp": 1.03227115, "epoch": 0.09871941321469369, "flos": 17420195064960.0, "grad_norm": 2.6692313539404933, "language_loss": 0.83656323, "learning_rate": 3.9507433620682765e-06, "loss": 0.85925817, "num_input_tokens_seen": 17560810, "step": 821, "time_per_iteration": 2.6839771270751953 }, { "auxiliary_loss_clip": 0.0118346, "auxiliary_loss_mlp": 0.01085111, "balance_loss_clip": 1.05504584, "balance_loss_mlp": 1.05373478, "epoch": 0.09883965610533277, "flos": 28477341590400.0, "grad_norm": 1.690482338894628, "language_loss": 0.88334537, "learning_rate": 3.9505713981716e-06, "loss": 0.90603107, "num_input_tokens_seen": 17583640, "step": 822, "time_per_iteration": 2.772791624069214 }, { "auxiliary_loss_clip": 0.01196927, "auxiliary_loss_mlp": 0.0106305, "balance_loss_clip": 1.06436241, "balance_loss_mlp": 1.03432107, "epoch": 0.09895989899597187, "flos": 23693932437120.0, "grad_norm": 2.1617061212835424, "language_loss": 0.81099856, "learning_rate": 3.950399138374795e-06, "loss": 0.83359838, "num_input_tokens_seen": 17602720, "step": 823, "time_per_iteration": 3.5861852169036865 }, { "auxiliary_loss_clip": 0.01213802, "auxiliary_loss_mlp": 0.0107091, "balance_loss_clip": 1.06448078, "balance_loss_mlp": 1.04086912, "epoch": 0.09908014188661095, "flos": 24679608526080.0, "grad_norm": 1.9409010298942484, "language_loss": 0.74387527, "learning_rate": 3.95022658270399e-06, "loss": 0.76672244, "num_input_tokens_seen": 17623085, "step": 824, "time_per_iteration": 3.619746685028076 }, { "auxiliary_loss_clip": 0.01190856, "auxiliary_loss_mlp": 0.01066399, "balance_loss_clip": 1.06052089, "balance_loss_mlp": 1.03635788, "epoch": 0.09920038477725004, "flos": 14064307200000.0, "grad_norm": 1.80971949031073, "language_loss": 0.78333783, "learning_rate": 3.9500537311853635e-06, "loss": 0.80591035, "num_input_tokens_seen": 17641040, "step": 825, "time_per_iteration": 2.628176212310791 }, { "auxiliary_loss_clip": 0.01212614, "auxiliary_loss_mlp": 0.01076303, "balance_loss_clip": 1.0593307, "balance_loss_mlp": 1.04557085, "epoch": 0.09932062766788914, "flos": 13407070095360.0, "grad_norm": 2.0851569745881684, "language_loss": 0.83265102, "learning_rate": 3.949880583845136e-06, "loss": 0.85554022, "num_input_tokens_seen": 17659115, "step": 826, "time_per_iteration": 4.55352783203125 }, { "auxiliary_loss_clip": 0.01199847, "auxiliary_loss_mlp": 0.01058603, "balance_loss_clip": 1.06170273, "balance_loss_mlp": 1.02881241, "epoch": 0.09944087055852822, "flos": 19500751566720.0, "grad_norm": 1.6290809868606468, "language_loss": 0.80963284, "learning_rate": 3.949707140709575e-06, "loss": 0.83221734, "num_input_tokens_seen": 17678845, "step": 827, "time_per_iteration": 2.643625497817993 }, { "auxiliary_loss_clip": 0.01216013, "auxiliary_loss_mlp": 0.01073512, "balance_loss_clip": 1.06013978, "balance_loss_mlp": 1.04490185, "epoch": 0.09956111344916732, "flos": 17749100926080.0, "grad_norm": 2.00099013905247, "language_loss": 0.8320477, "learning_rate": 3.949533401804991e-06, "loss": 0.85494304, "num_input_tokens_seen": 17695750, "step": 828, "time_per_iteration": 2.5843069553375244 }, { "auxiliary_loss_clip": 0.01214069, "auxiliary_loss_mlp": 0.00782122, "balance_loss_clip": 1.06319809, "balance_loss_mlp": 1.00058532, "epoch": 0.0996813563398064, "flos": 17967581400960.0, "grad_norm": 2.016025314377972, "language_loss": 0.90878916, "learning_rate": 3.949359367157739e-06, "loss": 0.92875111, "num_input_tokens_seen": 17714445, "step": 829, "time_per_iteration": 2.6169180870056152 }, { "auxiliary_loss_clip": 0.01216583, "auxiliary_loss_mlp": 0.01074513, "balance_loss_clip": 1.06233525, "balance_loss_mlp": 1.04399562, "epoch": 0.0998015992304455, "flos": 17457039440640.0, "grad_norm": 2.0442830209913203, "language_loss": 0.75323755, "learning_rate": 3.949185036794222e-06, "loss": 0.7761485, "num_input_tokens_seen": 17732455, "step": 830, "time_per_iteration": 2.5877153873443604 }, { "auxiliary_loss_clip": 0.01222914, "auxiliary_loss_mlp": 0.01058154, "balance_loss_clip": 1.06356764, "balance_loss_mlp": 1.03116488, "epoch": 0.0999218421210846, "flos": 25888757080320.0, "grad_norm": 1.5961042499632292, "language_loss": 0.78992683, "learning_rate": 3.949010410740884e-06, "loss": 0.81273746, "num_input_tokens_seen": 17755280, "step": 831, "time_per_iteration": 2.629798412322998 }, { "auxiliary_loss_clip": 0.01187716, "auxiliary_loss_mlp": 0.00782406, "balance_loss_clip": 1.05679524, "balance_loss_mlp": 1.00051749, "epoch": 0.10004208501172368, "flos": 21215916967680.0, "grad_norm": 1.69477131991224, "language_loss": 0.86330765, "learning_rate": 3.948835489024216e-06, "loss": 0.88300884, "num_input_tokens_seen": 17775015, "step": 832, "time_per_iteration": 2.6404736042022705 }, { "auxiliary_loss_clip": 0.01211155, "auxiliary_loss_mlp": 0.01076405, "balance_loss_clip": 1.05831099, "balance_loss_mlp": 1.04766369, "epoch": 0.10016232790236278, "flos": 17348409734400.0, "grad_norm": 2.1347445978390134, "language_loss": 0.90375638, "learning_rate": 3.948660271670755e-06, "loss": 0.92663199, "num_input_tokens_seen": 17792165, "step": 833, "time_per_iteration": 2.793505907058716 }, { "auxiliary_loss_clip": 0.0119122, "auxiliary_loss_mlp": 0.01065188, "balance_loss_clip": 1.05813622, "balance_loss_mlp": 1.03543329, "epoch": 0.10028257079300186, "flos": 25666541591040.0, "grad_norm": 2.063999091355438, "language_loss": 0.84448808, "learning_rate": 3.948484758707079e-06, "loss": 0.86705214, "num_input_tokens_seen": 17811765, "step": 834, "time_per_iteration": 2.6643385887145996 }, { "auxiliary_loss_clip": 0.01171617, "auxiliary_loss_mlp": 0.01078201, "balance_loss_clip": 1.05610061, "balance_loss_mlp": 1.04720652, "epoch": 0.10040281368364096, "flos": 25156035544320.0, "grad_norm": 2.1784359259056103, "language_loss": 0.83733344, "learning_rate": 3.948308950159815e-06, "loss": 0.85983163, "num_input_tokens_seen": 17830445, "step": 835, "time_per_iteration": 2.678372621536255 }, { "auxiliary_loss_clip": 0.01172813, "auxiliary_loss_mlp": 0.01086123, "balance_loss_clip": 1.05191255, "balance_loss_mlp": 1.0531733, "epoch": 0.10052305657428004, "flos": 17603303621760.0, "grad_norm": 3.006839980882135, "language_loss": 0.75803292, "learning_rate": 3.9481328460556326e-06, "loss": 0.78062224, "num_input_tokens_seen": 17847665, "step": 836, "time_per_iteration": 2.6558873653411865 }, { "auxiliary_loss_clip": 0.01184339, "auxiliary_loss_mlp": 0.0106907, "balance_loss_clip": 1.05538046, "balance_loss_mlp": 1.0418067, "epoch": 0.10064329946491914, "flos": 18660154510080.0, "grad_norm": 2.0618598289331835, "language_loss": 0.89787406, "learning_rate": 3.9479564464212455e-06, "loss": 0.92040819, "num_input_tokens_seen": 17866825, "step": 837, "time_per_iteration": 2.6437761783599854 }, { "auxiliary_loss_clip": 0.0123314, "auxiliary_loss_mlp": 0.01074488, "balance_loss_clip": 1.06482983, "balance_loss_mlp": 1.04373217, "epoch": 0.10076354235555823, "flos": 17199056983680.0, "grad_norm": 3.0323083400935564, "language_loss": 0.76043665, "learning_rate": 3.947779751283414e-06, "loss": 0.78351295, "num_input_tokens_seen": 17883995, "step": 838, "time_per_iteration": 2.5203847885131836 }, { "auxiliary_loss_clip": 0.01211561, "auxiliary_loss_mlp": 0.00782312, "balance_loss_clip": 1.06842315, "balance_loss_mlp": 1.00057805, "epoch": 0.10088378524619732, "flos": 22962252395520.0, "grad_norm": 1.8006948958169384, "language_loss": 0.76120007, "learning_rate": 3.947602760668944e-06, "loss": 0.78113878, "num_input_tokens_seen": 17903785, "step": 839, "time_per_iteration": 2.588059186935425 }, { "auxiliary_loss_clip": 0.01207742, "auxiliary_loss_mlp": 0.01067363, "balance_loss_clip": 1.06105614, "balance_loss_mlp": 1.03770423, "epoch": 0.10100402813683641, "flos": 37885828746240.0, "grad_norm": 1.6899382018884892, "language_loss": 0.71369696, "learning_rate": 3.947425474604684e-06, "loss": 0.73644805, "num_input_tokens_seen": 17927720, "step": 840, "time_per_iteration": 2.7536063194274902 }, { "auxiliary_loss_clip": 0.01195011, "auxiliary_loss_mlp": 0.01058027, "balance_loss_clip": 1.06096458, "balance_loss_mlp": 1.02932107, "epoch": 0.1011242710274755, "flos": 21543458112000.0, "grad_norm": 2.8819625938255697, "language_loss": 0.92377251, "learning_rate": 3.947247893117528e-06, "loss": 0.94630289, "num_input_tokens_seen": 17946225, "step": 841, "time_per_iteration": 2.632704257965088 }, { "auxiliary_loss_clip": 0.01204091, "auxiliary_loss_mlp": 0.0107065, "balance_loss_clip": 1.05748045, "balance_loss_mlp": 1.04244554, "epoch": 0.10124451391811459, "flos": 13621456419840.0, "grad_norm": 2.750490826645327, "language_loss": 0.69842076, "learning_rate": 3.947070016234413e-06, "loss": 0.72116822, "num_input_tokens_seen": 17962015, "step": 842, "time_per_iteration": 2.579274892807007 }, { "auxiliary_loss_clip": 0.01206104, "auxiliary_loss_mlp": 0.0108129, "balance_loss_clip": 1.06169224, "balance_loss_mlp": 1.05030787, "epoch": 0.10136475680875369, "flos": 16649228522880.0, "grad_norm": 2.2879640622689608, "language_loss": 0.74691379, "learning_rate": 3.946891843982326e-06, "loss": 0.76978773, "num_input_tokens_seen": 17979680, "step": 843, "time_per_iteration": 2.629936456680298 }, { "auxiliary_loss_clip": 0.0121213, "auxiliary_loss_mlp": 0.01066947, "balance_loss_clip": 1.06307316, "balance_loss_mlp": 1.03673887, "epoch": 0.10148499969939277, "flos": 19461034103040.0, "grad_norm": 1.9716905582259, "language_loss": 0.74242884, "learning_rate": 3.9467133763882935e-06, "loss": 0.76521957, "num_input_tokens_seen": 17998145, "step": 844, "time_per_iteration": 2.6038050651550293 }, { "auxiliary_loss_clip": 0.01198913, "auxiliary_loss_mlp": 0.01075088, "balance_loss_clip": 1.06092858, "balance_loss_mlp": 1.04592991, "epoch": 0.10160524259003187, "flos": 21104988791040.0, "grad_norm": 11.128977944956578, "language_loss": 0.86146259, "learning_rate": 3.9465346134793905e-06, "loss": 0.8842026, "num_input_tokens_seen": 18017955, "step": 845, "time_per_iteration": 2.681483268737793 }, { "auxiliary_loss_clip": 0.01184935, "auxiliary_loss_mlp": 0.01064643, "balance_loss_clip": 1.06072128, "balance_loss_mlp": 1.03695059, "epoch": 0.10172548548067095, "flos": 17712687513600.0, "grad_norm": 1.8851039102859741, "language_loss": 0.79338825, "learning_rate": 3.9463555552827335e-06, "loss": 0.81588405, "num_input_tokens_seen": 18035125, "step": 846, "time_per_iteration": 2.6425931453704834 }, { "auxiliary_loss_clip": 0.01193545, "auxiliary_loss_mlp": 0.0106471, "balance_loss_clip": 1.05647409, "balance_loss_mlp": 1.03717291, "epoch": 0.10184572837131005, "flos": 21104845136640.0, "grad_norm": 2.235973914861761, "language_loss": 0.85922533, "learning_rate": 3.946176201825487e-06, "loss": 0.88180792, "num_input_tokens_seen": 18053160, "step": 847, "time_per_iteration": 2.587547779083252 }, { "auxiliary_loss_clip": 0.01198529, "auxiliary_loss_mlp": 0.01067057, "balance_loss_clip": 1.06258702, "balance_loss_mlp": 1.03677833, "epoch": 0.10196597126194913, "flos": 26067591918720.0, "grad_norm": 1.763831671748678, "language_loss": 0.83398318, "learning_rate": 3.9459965531348575e-06, "loss": 0.85663909, "num_input_tokens_seen": 18072815, "step": 848, "time_per_iteration": 2.6824378967285156 }, { "auxiliary_loss_clip": 0.01198877, "auxiliary_loss_mlp": 0.00781894, "balance_loss_clip": 1.06071353, "balance_loss_mlp": 1.00061381, "epoch": 0.10208621415258823, "flos": 29314634595840.0, "grad_norm": 2.1578810162243163, "language_loss": 0.85245073, "learning_rate": 3.945816609238098e-06, "loss": 0.87225842, "num_input_tokens_seen": 18092225, "step": 849, "time_per_iteration": 2.702897310256958 }, { "auxiliary_loss_clip": 0.01164276, "auxiliary_loss_mlp": 0.01076415, "balance_loss_clip": 1.05695248, "balance_loss_mlp": 1.04530096, "epoch": 0.10220645704322733, "flos": 23805794367360.0, "grad_norm": 2.0005130505991717, "language_loss": 0.85695469, "learning_rate": 3.945636370162507e-06, "loss": 0.87936163, "num_input_tokens_seen": 18112335, "step": 850, "time_per_iteration": 3.5477397441864014 }, { "auxiliary_loss_clip": 0.01210922, "auxiliary_loss_mlp": 0.0107579, "balance_loss_clip": 1.06243992, "balance_loss_mlp": 1.04843152, "epoch": 0.10232669993386641, "flos": 23218546913280.0, "grad_norm": 2.014750689292411, "language_loss": 0.79067022, "learning_rate": 3.945455835935425e-06, "loss": 0.8135373, "num_input_tokens_seen": 18131520, "step": 851, "time_per_iteration": 4.688019037246704 }, { "auxiliary_loss_clip": 0.01199464, "auxiliary_loss_mlp": 0.01073275, "balance_loss_clip": 1.06282461, "balance_loss_mlp": 1.04275775, "epoch": 0.1024469428245055, "flos": 22922929981440.0, "grad_norm": 2.5558049759097003, "language_loss": 0.75320399, "learning_rate": 3.94527500658424e-06, "loss": 0.77593142, "num_input_tokens_seen": 18149185, "step": 852, "time_per_iteration": 3.5496203899383545 }, { "auxiliary_loss_clip": 0.01173802, "auxiliary_loss_mlp": 0.01067881, "balance_loss_clip": 1.06203938, "balance_loss_mlp": 1.03919971, "epoch": 0.10256718571514459, "flos": 31359495957120.0, "grad_norm": 4.907529509123324, "language_loss": 0.81037349, "learning_rate": 3.945093882136382e-06, "loss": 0.83279032, "num_input_tokens_seen": 18172960, "step": 853, "time_per_iteration": 2.775959014892578 }, { "auxiliary_loss_clip": 0.01193707, "auxiliary_loss_mlp": 0.00780974, "balance_loss_clip": 1.06013072, "balance_loss_mlp": 1.00043499, "epoch": 0.10268742860578368, "flos": 23474877344640.0, "grad_norm": 1.8645718362451311, "language_loss": 0.84589297, "learning_rate": 3.944912462619329e-06, "loss": 0.86563975, "num_input_tokens_seen": 18191925, "step": 854, "time_per_iteration": 2.6701223850250244 }, { "auxiliary_loss_clip": 0.01204387, "auxiliary_loss_mlp": 0.01076572, "balance_loss_clip": 1.05922806, "balance_loss_mlp": 1.04669857, "epoch": 0.10280767149642277, "flos": 25520313323520.0, "grad_norm": 2.1174494832103634, "language_loss": 0.80664659, "learning_rate": 3.9447307480606025e-06, "loss": 0.82945615, "num_input_tokens_seen": 18212010, "step": 855, "time_per_iteration": 2.75919246673584 }, { "auxiliary_loss_clip": 0.01182099, "auxiliary_loss_mlp": 0.01076859, "balance_loss_clip": 1.05272031, "balance_loss_mlp": 1.04495859, "epoch": 0.10292791438706186, "flos": 17347691462400.0, "grad_norm": 5.0743530448537895, "language_loss": 0.90125632, "learning_rate": 3.944548738487767e-06, "loss": 0.92384601, "num_input_tokens_seen": 18229525, "step": 856, "time_per_iteration": 2.5876643657684326 }, { "auxiliary_loss_clip": 0.0123282, "auxiliary_loss_mlp": 0.0107358, "balance_loss_clip": 1.06521332, "balance_loss_mlp": 1.04568505, "epoch": 0.10304815727770096, "flos": 27052693390080.0, "grad_norm": 1.851004415160319, "language_loss": 0.9081949, "learning_rate": 3.944366433928434e-06, "loss": 0.93125886, "num_input_tokens_seen": 18249505, "step": 857, "time_per_iteration": 2.6025869846343994 }, { "auxiliary_loss_clip": 0.01183812, "auxiliary_loss_mlp": 0.01075814, "balance_loss_clip": 1.05548477, "balance_loss_mlp": 1.04613125, "epoch": 0.10316840016834004, "flos": 22782591544320.0, "grad_norm": 3.646874146861173, "language_loss": 0.83567011, "learning_rate": 3.9441838344102594e-06, "loss": 0.85826635, "num_input_tokens_seen": 18269230, "step": 858, "time_per_iteration": 2.803748846054077 }, { "auxiliary_loss_clip": 0.0120391, "auxiliary_loss_mlp": 0.01089814, "balance_loss_clip": 1.06262469, "balance_loss_mlp": 1.06065559, "epoch": 0.10328864305897914, "flos": 20704584908160.0, "grad_norm": 2.3707144389900754, "language_loss": 0.66730767, "learning_rate": 3.944000939960943e-06, "loss": 0.69024491, "num_input_tokens_seen": 18287955, "step": 859, "time_per_iteration": 2.766909599304199 }, { "auxiliary_loss_clip": 0.01211482, "auxiliary_loss_mlp": 0.01065611, "balance_loss_clip": 1.05964208, "balance_loss_mlp": 1.03909922, "epoch": 0.10340888594961822, "flos": 28478814048000.0, "grad_norm": 1.6973433861819043, "language_loss": 0.79933208, "learning_rate": 3.943817750608229e-06, "loss": 0.82210296, "num_input_tokens_seen": 18310505, "step": 860, "time_per_iteration": 2.70904278755188 }, { "auxiliary_loss_clip": 0.012115, "auxiliary_loss_mlp": 0.01067625, "balance_loss_clip": 1.0619421, "balance_loss_mlp": 1.03978968, "epoch": 0.10352912884025732, "flos": 13370333460480.0, "grad_norm": 2.1491207329326225, "language_loss": 0.82397914, "learning_rate": 3.943634266379908e-06, "loss": 0.84677047, "num_input_tokens_seen": 18327400, "step": 861, "time_per_iteration": 2.554621934890747 }, { "auxiliary_loss_clip": 0.01208428, "auxiliary_loss_mlp": 0.0106218, "balance_loss_clip": 1.05604136, "balance_loss_mlp": 1.03505993, "epoch": 0.10364937173089642, "flos": 25558558329600.0, "grad_norm": 1.811931212674972, "language_loss": 0.84752768, "learning_rate": 3.943450487303815e-06, "loss": 0.87023377, "num_input_tokens_seen": 18347895, "step": 862, "time_per_iteration": 2.6656415462493896 }, { "auxiliary_loss_clip": 0.0120418, "auxiliary_loss_mlp": 0.01062247, "balance_loss_clip": 1.05992424, "balance_loss_mlp": 1.0353775, "epoch": 0.1037696146215355, "flos": 21215486004480.0, "grad_norm": 1.9805317820656778, "language_loss": 0.85175145, "learning_rate": 3.943266413407827e-06, "loss": 0.87441576, "num_input_tokens_seen": 18367170, "step": 863, "time_per_iteration": 2.6031761169433594 }, { "auxiliary_loss_clip": 0.01207698, "auxiliary_loss_mlp": 0.01063288, "balance_loss_clip": 1.06059051, "balance_loss_mlp": 1.03586948, "epoch": 0.1038898575121746, "flos": 25807382818560.0, "grad_norm": 6.986404302832945, "language_loss": 0.85324168, "learning_rate": 3.94308204471987e-06, "loss": 0.87595153, "num_input_tokens_seen": 18386185, "step": 864, "time_per_iteration": 2.6412088871002197 }, { "auxiliary_loss_clip": 0.01171497, "auxiliary_loss_mlp": 0.01059214, "balance_loss_clip": 1.05170143, "balance_loss_mlp": 1.03160548, "epoch": 0.10401010040281368, "flos": 19062425900160.0, "grad_norm": 2.8900569491856443, "language_loss": 0.74640989, "learning_rate": 3.942897381267912e-06, "loss": 0.76871705, "num_input_tokens_seen": 18402550, "step": 865, "time_per_iteration": 2.601710557937622 }, { "auxiliary_loss_clip": 0.0121575, "auxiliary_loss_mlp": 0.01062709, "balance_loss_clip": 1.06033707, "balance_loss_mlp": 1.03395557, "epoch": 0.10413034329345278, "flos": 16355119962240.0, "grad_norm": 2.977209089717367, "language_loss": 0.66499412, "learning_rate": 3.942712423079965e-06, "loss": 0.68777871, "num_input_tokens_seen": 18418940, "step": 866, "time_per_iteration": 2.5773754119873047 }, { "auxiliary_loss_clip": 0.01155257, "auxiliary_loss_mlp": 0.01063405, "balance_loss_clip": 1.04856098, "balance_loss_mlp": 1.03705978, "epoch": 0.10425058618409186, "flos": 17236511890560.0, "grad_norm": 2.3572097455692442, "language_loss": 0.90077424, "learning_rate": 3.942527170184088e-06, "loss": 0.92296088, "num_input_tokens_seen": 18435560, "step": 867, "time_per_iteration": 2.6982345581054688 }, { "auxiliary_loss_clip": 0.01227549, "auxiliary_loss_mlp": 0.0106681, "balance_loss_clip": 1.06401181, "balance_loss_mlp": 1.03841388, "epoch": 0.10437082907473096, "flos": 17967365919360.0, "grad_norm": 2.244652062598689, "language_loss": 0.774275, "learning_rate": 3.942341622608385e-06, "loss": 0.79721856, "num_input_tokens_seen": 18452590, "step": 868, "time_per_iteration": 2.770578145980835 }, { "auxiliary_loss_clip": 0.01193214, "auxiliary_loss_mlp": 0.0106296, "balance_loss_clip": 1.0615567, "balance_loss_mlp": 1.03524411, "epoch": 0.10449107196537005, "flos": 36283315374720.0, "grad_norm": 1.8566051427097383, "language_loss": 0.78051811, "learning_rate": 3.942155780381001e-06, "loss": 0.80307984, "num_input_tokens_seen": 18476325, "step": 869, "time_per_iteration": 2.7588326930999756 }, { "auxiliary_loss_clip": 0.01194212, "auxiliary_loss_mlp": 0.01070186, "balance_loss_clip": 1.05741954, "balance_loss_mlp": 1.04241014, "epoch": 0.10461131485600914, "flos": 23802095266560.0, "grad_norm": 2.305995718616766, "language_loss": 0.75888366, "learning_rate": 3.94196964353013e-06, "loss": 0.7815277, "num_input_tokens_seen": 18495775, "step": 870, "time_per_iteration": 2.6481876373291016 }, { "auxiliary_loss_clip": 0.0118671, "auxiliary_loss_mlp": 0.00780216, "balance_loss_clip": 1.05690956, "balance_loss_mlp": 1.00053, "epoch": 0.10473155774664823, "flos": 18405476104320.0, "grad_norm": 2.302402248722254, "language_loss": 0.80704898, "learning_rate": 3.941783212084008e-06, "loss": 0.82671827, "num_input_tokens_seen": 18513530, "step": 871, "time_per_iteration": 2.6200716495513916 }, { "auxiliary_loss_clip": 0.01180375, "auxiliary_loss_mlp": 0.01067623, "balance_loss_clip": 1.06049097, "balance_loss_mlp": 1.04050255, "epoch": 0.10485180063728732, "flos": 25592637358080.0, "grad_norm": 2.49442552599763, "language_loss": 0.78794229, "learning_rate": 3.941596486070916e-06, "loss": 0.81042224, "num_input_tokens_seen": 18531575, "step": 872, "time_per_iteration": 2.657883405685425 }, { "auxiliary_loss_clip": 0.01160217, "auxiliary_loss_mlp": 0.01068865, "balance_loss_clip": 1.05390692, "balance_loss_mlp": 1.03920591, "epoch": 0.10497204352792641, "flos": 27088747666560.0, "grad_norm": 3.6512051708650892, "language_loss": 0.5895772, "learning_rate": 3.941409465519182e-06, "loss": 0.61186802, "num_input_tokens_seen": 18552100, "step": 873, "time_per_iteration": 2.6917848587036133 }, { "auxiliary_loss_clip": 0.01192464, "auxiliary_loss_mlp": 0.01067006, "balance_loss_clip": 1.05394554, "balance_loss_mlp": 1.03820467, "epoch": 0.10509228641856551, "flos": 32858479353600.0, "grad_norm": 1.6264302250976426, "language_loss": 0.85329843, "learning_rate": 3.941222150457176e-06, "loss": 0.87589312, "num_input_tokens_seen": 18575355, "step": 874, "time_per_iteration": 2.7298409938812256 }, { "auxiliary_loss_clip": 0.01210365, "auxiliary_loss_mlp": 0.01062154, "balance_loss_clip": 1.05997169, "balance_loss_mlp": 1.03707254, "epoch": 0.10521252930920459, "flos": 14319165173760.0, "grad_norm": 3.499329281829212, "language_loss": 0.71048748, "learning_rate": 3.941034540913311e-06, "loss": 0.73321271, "num_input_tokens_seen": 18592885, "step": 875, "time_per_iteration": 3.615849018096924 }, { "auxiliary_loss_clip": 0.012049, "auxiliary_loss_mlp": 0.00780524, "balance_loss_clip": 1.06028509, "balance_loss_mlp": 1.00055349, "epoch": 0.10533277219984369, "flos": 21687028773120.0, "grad_norm": 1.9278572686793445, "language_loss": 0.82399297, "learning_rate": 3.940846636916051e-06, "loss": 0.84384722, "num_input_tokens_seen": 18612920, "step": 876, "time_per_iteration": 3.5601837635040283 }, { "auxiliary_loss_clip": 0.01185227, "auxiliary_loss_mlp": 0.01068903, "balance_loss_clip": 1.05717468, "balance_loss_mlp": 1.04322577, "epoch": 0.10545301509048277, "flos": 22269787027200.0, "grad_norm": 1.9490665507925493, "language_loss": 0.86742997, "learning_rate": 3.940658438493899e-06, "loss": 0.88997138, "num_input_tokens_seen": 18630765, "step": 877, "time_per_iteration": 3.673255205154419 }, { "auxiliary_loss_clip": 0.01224858, "auxiliary_loss_mlp": 0.01057919, "balance_loss_clip": 1.05921698, "balance_loss_mlp": 1.0296191, "epoch": 0.10557325798112187, "flos": 22199725549440.0, "grad_norm": 2.3564205828803026, "language_loss": 0.76016831, "learning_rate": 3.940469945675405e-06, "loss": 0.78299618, "num_input_tokens_seen": 18649150, "step": 878, "time_per_iteration": 2.545016050338745 }, { "auxiliary_loss_clip": 0.01139232, "auxiliary_loss_mlp": 0.01058475, "balance_loss_clip": 1.05157781, "balance_loss_mlp": 1.03103268, "epoch": 0.10569350087176095, "flos": 25775889569280.0, "grad_norm": 1.9044970600044477, "language_loss": 0.91274059, "learning_rate": 3.940281158489163e-06, "loss": 0.93471766, "num_input_tokens_seen": 18668380, "step": 879, "time_per_iteration": 3.593959093093872 }, { "auxiliary_loss_clip": 0.01146325, "auxiliary_loss_mlp": 0.01063738, "balance_loss_clip": 1.05095625, "balance_loss_mlp": 1.03431702, "epoch": 0.10581374376240005, "flos": 17311385790720.0, "grad_norm": 1.7791844461254114, "language_loss": 0.82788718, "learning_rate": 3.940092076963812e-06, "loss": 0.8499878, "num_input_tokens_seen": 18685875, "step": 880, "time_per_iteration": 2.777127981185913 }, { "auxiliary_loss_clip": 0.01194067, "auxiliary_loss_mlp": 0.01064057, "balance_loss_clip": 1.05950999, "balance_loss_mlp": 1.03699636, "epoch": 0.10593398665303914, "flos": 34349454017280.0, "grad_norm": 2.7704217236982145, "language_loss": 0.78957111, "learning_rate": 3.9399027011280355e-06, "loss": 0.81215227, "num_input_tokens_seen": 18707970, "step": 881, "time_per_iteration": 2.7559187412261963 }, { "auxiliary_loss_clip": 0.01193787, "auxiliary_loss_mlp": 0.01066655, "balance_loss_clip": 1.06142962, "balance_loss_mlp": 1.03821158, "epoch": 0.10605422954367823, "flos": 23257977068160.0, "grad_norm": 2.1153594307340096, "language_loss": 0.77501965, "learning_rate": 3.939713031010561e-06, "loss": 0.79762411, "num_input_tokens_seen": 18726335, "step": 882, "time_per_iteration": 2.628025531768799 }, { "auxiliary_loss_clip": 0.01182544, "auxiliary_loss_mlp": 0.01066529, "balance_loss_clip": 1.05953455, "balance_loss_mlp": 1.03913498, "epoch": 0.10617447243431732, "flos": 22820118278400.0, "grad_norm": 2.0250377287151373, "language_loss": 0.78126633, "learning_rate": 3.939523066640163e-06, "loss": 0.80375707, "num_input_tokens_seen": 18745230, "step": 883, "time_per_iteration": 2.68100905418396 }, { "auxiliary_loss_clip": 0.01204692, "auxiliary_loss_mlp": 0.01058943, "balance_loss_clip": 1.05831456, "balance_loss_mlp": 1.03054738, "epoch": 0.10629471532495641, "flos": 24386577373440.0, "grad_norm": 1.7938230841024714, "language_loss": 0.80949819, "learning_rate": 3.939332808045657e-06, "loss": 0.83213449, "num_input_tokens_seen": 18764880, "step": 884, "time_per_iteration": 2.6917436122894287 }, { "auxiliary_loss_clip": 0.01182286, "auxiliary_loss_mlp": 0.01059932, "balance_loss_clip": 1.05743897, "balance_loss_mlp": 1.03301418, "epoch": 0.1064149582155955, "flos": 21105491581440.0, "grad_norm": 1.7517484491505015, "language_loss": 0.84635234, "learning_rate": 3.939142255255906e-06, "loss": 0.86877453, "num_input_tokens_seen": 18785765, "step": 885, "time_per_iteration": 2.803663492202759 }, { "auxiliary_loss_clip": 0.01205223, "auxiliary_loss_mlp": 0.01074392, "balance_loss_clip": 1.06032228, "balance_loss_mlp": 1.04708076, "epoch": 0.1065352011062346, "flos": 20702035042560.0, "grad_norm": 2.5366250477870445, "language_loss": 0.86921048, "learning_rate": 3.938951408299817e-06, "loss": 0.89200664, "num_input_tokens_seen": 18804605, "step": 886, "time_per_iteration": 2.6023566722869873 }, { "auxiliary_loss_clip": 0.01049152, "auxiliary_loss_mlp": 0.01049138, "balance_loss_clip": 1.02920747, "balance_loss_mlp": 1.04398811, "epoch": 0.10665544399687368, "flos": 62659632689280.0, "grad_norm": 0.8182409216303205, "language_loss": 0.54380381, "learning_rate": 3.938760267206342e-06, "loss": 0.56478673, "num_input_tokens_seen": 18866425, "step": 887, "time_per_iteration": 3.1614232063293457 }, { "auxiliary_loss_clip": 0.01223552, "auxiliary_loss_mlp": 0.01057655, "balance_loss_clip": 1.06434345, "balance_loss_mlp": 1.03062987, "epoch": 0.10677568688751278, "flos": 26140382830080.0, "grad_norm": 2.217371311552115, "language_loss": 0.78465998, "learning_rate": 3.938568832004475e-06, "loss": 0.80747199, "num_input_tokens_seen": 18885130, "step": 888, "time_per_iteration": 2.7087531089782715 }, { "auxiliary_loss_clip": 0.01179425, "auxiliary_loss_mlp": 0.01069939, "balance_loss_clip": 1.0544374, "balance_loss_mlp": 1.04006541, "epoch": 0.10689592977815186, "flos": 12786533712000.0, "grad_norm": 2.0313834208491506, "language_loss": 0.75565886, "learning_rate": 3.938377102723257e-06, "loss": 0.77815247, "num_input_tokens_seen": 18902265, "step": 889, "time_per_iteration": 2.6155548095703125 }, { "auxiliary_loss_clip": 0.01150288, "auxiliary_loss_mlp": 0.01075858, "balance_loss_clip": 1.05124378, "balance_loss_mlp": 1.04462552, "epoch": 0.10701617266879096, "flos": 22126683242880.0, "grad_norm": 1.910471280435211, "language_loss": 0.83414537, "learning_rate": 3.938185079391774e-06, "loss": 0.85640681, "num_input_tokens_seen": 18919310, "step": 890, "time_per_iteration": 2.957470178604126 }, { "auxiliary_loss_clip": 0.01219548, "auxiliary_loss_mlp": 0.01071983, "balance_loss_clip": 1.06281459, "balance_loss_mlp": 1.04319429, "epoch": 0.10713641555943004, "flos": 19745625559680.0, "grad_norm": 2.5657993951521383, "language_loss": 1.05988646, "learning_rate": 3.937992762039157e-06, "loss": 1.08280182, "num_input_tokens_seen": 18932635, "step": 891, "time_per_iteration": 2.6039936542510986 }, { "auxiliary_loss_clip": 0.01203183, "auxiliary_loss_mlp": 0.01066258, "balance_loss_clip": 1.05933201, "balance_loss_mlp": 1.03947139, "epoch": 0.10725665845006914, "flos": 23952992302080.0, "grad_norm": 1.8979305372345274, "language_loss": 0.80624449, "learning_rate": 3.937800150694577e-06, "loss": 0.82893896, "num_input_tokens_seen": 18953810, "step": 892, "time_per_iteration": 2.6440675258636475 }, { "auxiliary_loss_clip": 0.01172437, "auxiliary_loss_mlp": 0.01071736, "balance_loss_clip": 1.05982769, "balance_loss_mlp": 1.04189777, "epoch": 0.10737690134070824, "flos": 18551704371840.0, "grad_norm": 2.5039429775848334, "language_loss": 0.76049256, "learning_rate": 3.937607245387255e-06, "loss": 0.78293431, "num_input_tokens_seen": 18973175, "step": 893, "time_per_iteration": 2.71631121635437 }, { "auxiliary_loss_clip": 0.01195288, "auxiliary_loss_mlp": 0.01059121, "balance_loss_clip": 1.05636024, "balance_loss_mlp": 1.03215611, "epoch": 0.10749714423134732, "flos": 22707609903360.0, "grad_norm": 1.922320821757126, "language_loss": 0.72004032, "learning_rate": 3.937414046146455e-06, "loss": 0.74258441, "num_input_tokens_seen": 18991130, "step": 894, "time_per_iteration": 2.804553270339966 }, { "auxiliary_loss_clip": 0.0122009, "auxiliary_loss_mlp": 0.01068353, "balance_loss_clip": 1.06201041, "balance_loss_mlp": 1.04055345, "epoch": 0.10761738712198642, "flos": 21106066199040.0, "grad_norm": 2.5836413042811657, "language_loss": 0.75194037, "learning_rate": 3.9372205530014845e-06, "loss": 0.77482486, "num_input_tokens_seen": 19009610, "step": 895, "time_per_iteration": 2.5782082080841064 }, { "auxiliary_loss_clip": 0.01223405, "auxiliary_loss_mlp": 0.01071667, "balance_loss_clip": 1.05989337, "balance_loss_mlp": 1.0411495, "epoch": 0.1077376300126255, "flos": 23766723348480.0, "grad_norm": 2.368943289286189, "language_loss": 0.71535981, "learning_rate": 3.937026765981696e-06, "loss": 0.73831058, "num_input_tokens_seen": 19029680, "step": 896, "time_per_iteration": 2.639681100845337 }, { "auxiliary_loss_clip": 0.0118531, "auxiliary_loss_mlp": 0.01072611, "balance_loss_clip": 1.06145072, "balance_loss_mlp": 1.04357195, "epoch": 0.1078578729032646, "flos": 20919581763840.0, "grad_norm": 1.9895665226226456, "language_loss": 0.7921747, "learning_rate": 3.936832685116488e-06, "loss": 0.81475401, "num_input_tokens_seen": 19047775, "step": 897, "time_per_iteration": 2.676011323928833 }, { "auxiliary_loss_clip": 0.01222431, "auxiliary_loss_mlp": 0.01063749, "balance_loss_clip": 1.06134582, "balance_loss_mlp": 1.03528178, "epoch": 0.10797811579390369, "flos": 14829886702080.0, "grad_norm": 2.058375001576354, "language_loss": 0.90102744, "learning_rate": 3.936638310435301e-06, "loss": 0.92388922, "num_input_tokens_seen": 19065640, "step": 898, "time_per_iteration": 2.5478522777557373 }, { "auxiliary_loss_clip": 0.01209177, "auxiliary_loss_mlp": 0.01064707, "balance_loss_clip": 1.06136131, "balance_loss_mlp": 1.03827786, "epoch": 0.10809835868454278, "flos": 19536985411200.0, "grad_norm": 2.5128761709539114, "language_loss": 0.81345308, "learning_rate": 3.936443641967623e-06, "loss": 0.83619189, "num_input_tokens_seen": 19084470, "step": 899, "time_per_iteration": 2.7359910011291504 }, { "auxiliary_loss_clip": 0.01197724, "auxiliary_loss_mlp": 0.01063901, "balance_loss_clip": 1.06091642, "balance_loss_mlp": 1.03502822, "epoch": 0.10821860157518187, "flos": 18442320480000.0, "grad_norm": 2.027902746080746, "language_loss": 0.83190036, "learning_rate": 3.936248679742983e-06, "loss": 0.85451663, "num_input_tokens_seen": 19102965, "step": 900, "time_per_iteration": 2.7988877296447754 }, { "auxiliary_loss_clip": 0.01066979, "auxiliary_loss_mlp": 0.0101795, "balance_loss_clip": 1.03337622, "balance_loss_mlp": 1.01301432, "epoch": 0.10833884446582095, "flos": 49359468447360.0, "grad_norm": 1.169562031183622, "language_loss": 0.70189595, "learning_rate": 3.936053423790959e-06, "loss": 0.72274524, "num_input_tokens_seen": 19151285, "step": 901, "time_per_iteration": 3.0788156986236572 }, { "auxiliary_loss_clip": 0.01218721, "auxiliary_loss_mlp": 0.01064275, "balance_loss_clip": 1.06188369, "balance_loss_mlp": 1.03359032, "epoch": 0.10845908735646005, "flos": 20411912891520.0, "grad_norm": 1.6878446446879922, "language_loss": 0.77434456, "learning_rate": 3.935857874141168e-06, "loss": 0.79717451, "num_input_tokens_seen": 19170120, "step": 902, "time_per_iteration": 3.7500295639038086 }, { "auxiliary_loss_clip": 0.01188198, "auxiliary_loss_mlp": 0.01070096, "balance_loss_clip": 1.0611614, "balance_loss_mlp": 1.03924417, "epoch": 0.10857933024709913, "flos": 14027750133120.0, "grad_norm": 2.4211413264544297, "language_loss": 0.83539164, "learning_rate": 3.935662030823279e-06, "loss": 0.85797453, "num_input_tokens_seen": 19186305, "step": 903, "time_per_iteration": 3.6323800086975098 }, { "auxiliary_loss_clip": 0.012074, "auxiliary_loss_mlp": 0.01063809, "balance_loss_clip": 1.05783415, "balance_loss_mlp": 1.03493631, "epoch": 0.10869957313773823, "flos": 13369004657280.0, "grad_norm": 2.2314018649135177, "language_loss": 0.72334123, "learning_rate": 3.935465893866998e-06, "loss": 0.74605334, "num_input_tokens_seen": 19204530, "step": 904, "time_per_iteration": 3.4911983013153076 }, { "auxiliary_loss_clip": 0.01191246, "auxiliary_loss_mlp": 0.01067128, "balance_loss_clip": 1.0562855, "balance_loss_mlp": 1.03646719, "epoch": 0.10881981602837733, "flos": 25807095509760.0, "grad_norm": 1.9146451107078313, "language_loss": 0.80095541, "learning_rate": 3.935269463302079e-06, "loss": 0.82353914, "num_input_tokens_seen": 19222735, "step": 905, "time_per_iteration": 2.6624813079833984 }, { "auxiliary_loss_clip": 0.01219412, "auxiliary_loss_mlp": 0.0107492, "balance_loss_clip": 1.0642302, "balance_loss_mlp": 1.0472281, "epoch": 0.10894005891901641, "flos": 20777555387520.0, "grad_norm": 1.8429848517499667, "language_loss": 0.76408976, "learning_rate": 3.935072739158322e-06, "loss": 0.78703308, "num_input_tokens_seen": 19242445, "step": 906, "time_per_iteration": 2.618772268295288 }, { "auxiliary_loss_clip": 0.01193496, "auxiliary_loss_mlp": 0.01078882, "balance_loss_clip": 1.05649424, "balance_loss_mlp": 1.04908013, "epoch": 0.10906030180965551, "flos": 26649883296000.0, "grad_norm": 3.456629893517942, "language_loss": 0.79837477, "learning_rate": 3.934875721465569e-06, "loss": 0.82109857, "num_input_tokens_seen": 19262865, "step": 907, "time_per_iteration": 2.6918585300445557 }, { "auxiliary_loss_clip": 0.01189015, "auxiliary_loss_mlp": 0.01073598, "balance_loss_clip": 1.05414891, "balance_loss_mlp": 1.04360509, "epoch": 0.10918054470029459, "flos": 36534402420480.0, "grad_norm": 2.617113994188007, "language_loss": 0.71629953, "learning_rate": 3.9346784102537076e-06, "loss": 0.7389257, "num_input_tokens_seen": 19285000, "step": 908, "time_per_iteration": 2.7676055431365967 }, { "auxiliary_loss_clip": 0.01219273, "auxiliary_loss_mlp": 0.01071588, "balance_loss_clip": 1.05935383, "balance_loss_mlp": 1.04197681, "epoch": 0.10930078759093369, "flos": 21762549118080.0, "grad_norm": 1.9803368165561406, "language_loss": 0.78580326, "learning_rate": 3.934480805552669e-06, "loss": 0.80871189, "num_input_tokens_seen": 19306010, "step": 909, "time_per_iteration": 2.5914692878723145 }, { "auxiliary_loss_clip": 0.01215248, "auxiliary_loss_mlp": 0.00780873, "balance_loss_clip": 1.05917573, "balance_loss_mlp": 1.00054932, "epoch": 0.10942103048157277, "flos": 22601781457920.0, "grad_norm": 2.2277537348700047, "language_loss": 0.88102049, "learning_rate": 3.93428290739243e-06, "loss": 0.90098166, "num_input_tokens_seen": 19325380, "step": 910, "time_per_iteration": 2.573812246322632 }, { "auxiliary_loss_clip": 0.01191206, "auxiliary_loss_mlp": 0.01077868, "balance_loss_clip": 1.05635977, "balance_loss_mlp": 1.04996085, "epoch": 0.10954127337221187, "flos": 15045781397760.0, "grad_norm": 2.6198260087050316, "language_loss": 0.79805952, "learning_rate": 3.9340847158030125e-06, "loss": 0.82075024, "num_input_tokens_seen": 19338960, "step": 911, "time_per_iteration": 2.708207607269287 }, { "auxiliary_loss_clip": 0.01208764, "auxiliary_loss_mlp": 0.01071015, "balance_loss_clip": 1.05997717, "balance_loss_mlp": 1.04407406, "epoch": 0.10966151626285096, "flos": 21650974496640.0, "grad_norm": 2.2510804612453934, "language_loss": 0.75603604, "learning_rate": 3.9338862308144814e-06, "loss": 0.77883387, "num_input_tokens_seen": 19357780, "step": 912, "time_per_iteration": 2.6248106956481934 }, { "auxiliary_loss_clip": 0.01220761, "auxiliary_loss_mlp": 0.01080036, "balance_loss_clip": 1.06139457, "balance_loss_mlp": 1.05073476, "epoch": 0.10978175915349005, "flos": 20121359777280.0, "grad_norm": 2.0365891617687173, "language_loss": 0.8447994, "learning_rate": 3.933687452456946e-06, "loss": 0.86780733, "num_input_tokens_seen": 19377680, "step": 913, "time_per_iteration": 2.7134697437286377 }, { "auxiliary_loss_clip": 0.01177249, "auxiliary_loss_mlp": 0.01077697, "balance_loss_clip": 1.05402207, "balance_loss_mlp": 1.04876494, "epoch": 0.10990200204412914, "flos": 20412667077120.0, "grad_norm": 2.7649484008436302, "language_loss": 0.86740845, "learning_rate": 3.933488380760562e-06, "loss": 0.8899579, "num_input_tokens_seen": 19397040, "step": 914, "time_per_iteration": 2.665905475616455 }, { "auxiliary_loss_clip": 0.01218976, "auxiliary_loss_mlp": 0.0078085, "balance_loss_clip": 1.06049991, "balance_loss_mlp": 1.0005852, "epoch": 0.11002224493476823, "flos": 17530117660800.0, "grad_norm": 2.078849361502927, "language_loss": 0.87205863, "learning_rate": 3.9332890157555286e-06, "loss": 0.89205682, "num_input_tokens_seen": 19413975, "step": 915, "time_per_iteration": 2.5581910610198975 }, { "auxiliary_loss_clip": 0.0119762, "auxiliary_loss_mlp": 0.01073239, "balance_loss_clip": 1.05857182, "balance_loss_mlp": 1.04545128, "epoch": 0.11014248782540732, "flos": 12203093099520.0, "grad_norm": 1.9989975827340276, "language_loss": 0.76697934, "learning_rate": 3.933089357472088e-06, "loss": 0.78968787, "num_input_tokens_seen": 19432005, "step": 916, "time_per_iteration": 2.604646921157837 }, { "auxiliary_loss_clip": 0.0121875, "auxiliary_loss_mlp": 0.01065989, "balance_loss_clip": 1.06521511, "balance_loss_mlp": 1.0383085, "epoch": 0.11026273071604642, "flos": 22382977760640.0, "grad_norm": 1.8840350499196743, "language_loss": 0.86084831, "learning_rate": 3.932889405940529e-06, "loss": 0.88369566, "num_input_tokens_seen": 19450100, "step": 917, "time_per_iteration": 2.592221975326538 }, { "auxiliary_loss_clip": 0.01199601, "auxiliary_loss_mlp": 0.01075803, "balance_loss_clip": 1.06630182, "balance_loss_mlp": 1.04621577, "epoch": 0.1103829736066855, "flos": 19829046896640.0, "grad_norm": 2.1348716640157237, "language_loss": 0.80162466, "learning_rate": 3.932689161191184e-06, "loss": 0.82437873, "num_input_tokens_seen": 19467805, "step": 918, "time_per_iteration": 2.645174741744995 }, { "auxiliary_loss_clip": 0.01208889, "auxiliary_loss_mlp": 0.0106166, "balance_loss_clip": 1.0595572, "balance_loss_mlp": 1.03318095, "epoch": 0.1105032164973246, "flos": 22669616292480.0, "grad_norm": 2.115647674634378, "language_loss": 0.87734318, "learning_rate": 3.93248862325443e-06, "loss": 0.90004861, "num_input_tokens_seen": 19486710, "step": 919, "time_per_iteration": 2.667813539505005 }, { "auxiliary_loss_clip": 0.01092045, "auxiliary_loss_mlp": 0.01027706, "balance_loss_clip": 1.03450143, "balance_loss_mlp": 1.02155495, "epoch": 0.11062345938796368, "flos": 66483507876480.0, "grad_norm": 0.9404612044658004, "language_loss": 0.64537978, "learning_rate": 3.932287792160688e-06, "loss": 0.66657734, "num_input_tokens_seen": 19545170, "step": 920, "time_per_iteration": 3.118089199066162 }, { "auxiliary_loss_clip": 0.01211918, "auxiliary_loss_mlp": 0.01076216, "balance_loss_clip": 1.0596683, "balance_loss_mlp": 1.04698563, "epoch": 0.11074370227860278, "flos": 21907771804800.0, "grad_norm": 2.6322760966986682, "language_loss": 0.8017866, "learning_rate": 3.932086667940424e-06, "loss": 0.82466793, "num_input_tokens_seen": 19561875, "step": 921, "time_per_iteration": 2.6103193759918213 }, { "auxiliary_loss_clip": 0.01205901, "auxiliary_loss_mlp": 0.00781325, "balance_loss_clip": 1.06274259, "balance_loss_mlp": 1.00054526, "epoch": 0.11086394516924186, "flos": 28658115763200.0, "grad_norm": 1.9038697788290408, "language_loss": 0.81276214, "learning_rate": 3.93188525062415e-06, "loss": 0.83263439, "num_input_tokens_seen": 19582340, "step": 922, "time_per_iteration": 2.671635389328003 }, { "auxiliary_loss_clip": 0.01213698, "auxiliary_loss_mlp": 0.01078766, "balance_loss_clip": 1.06080651, "balance_loss_mlp": 1.0499177, "epoch": 0.11098418805988096, "flos": 24535247765760.0, "grad_norm": 1.8946833118884572, "language_loss": 0.86320472, "learning_rate": 3.931683540242418e-06, "loss": 0.88612938, "num_input_tokens_seen": 19603405, "step": 923, "time_per_iteration": 2.657536268234253 }, { "auxiliary_loss_clip": 0.01198391, "auxiliary_loss_mlp": 0.01063974, "balance_loss_clip": 1.05913019, "balance_loss_mlp": 1.0344578, "epoch": 0.11110443095052006, "flos": 22960384888320.0, "grad_norm": 11.367274340683938, "language_loss": 0.91070735, "learning_rate": 3.9314815368258295e-06, "loss": 0.93333095, "num_input_tokens_seen": 19619885, "step": 924, "time_per_iteration": 2.645890474319458 }, { "auxiliary_loss_clip": 0.01215136, "auxiliary_loss_mlp": 0.01084488, "balance_loss_clip": 1.06473768, "balance_loss_mlp": 1.05559134, "epoch": 0.11122467384115914, "flos": 18950025265920.0, "grad_norm": 1.6826876901053613, "language_loss": 0.78722703, "learning_rate": 3.9312792404050275e-06, "loss": 0.81022328, "num_input_tokens_seen": 19637940, "step": 925, "time_per_iteration": 2.596118211746216 }, { "auxiliary_loss_clip": 0.01220489, "auxiliary_loss_mlp": 0.0108177, "balance_loss_clip": 1.06696022, "balance_loss_mlp": 1.05485249, "epoch": 0.11134491673179824, "flos": 25082957324160.0, "grad_norm": 2.0645870393726247, "language_loss": 0.77089536, "learning_rate": 3.9310766510107e-06, "loss": 0.79391789, "num_input_tokens_seen": 19657115, "step": 926, "time_per_iteration": 2.7679519653320312 }, { "auxiliary_loss_clip": 0.01187438, "auxiliary_loss_mlp": 0.0107006, "balance_loss_clip": 1.06211066, "balance_loss_mlp": 1.04204559, "epoch": 0.11146515962243732, "flos": 24499121662080.0, "grad_norm": 1.972878483454614, "language_loss": 0.92293686, "learning_rate": 3.9308737686735806e-06, "loss": 0.94551182, "num_input_tokens_seen": 19677075, "step": 927, "time_per_iteration": 2.6862103939056396 }, { "auxiliary_loss_clip": 0.01216983, "auxiliary_loss_mlp": 0.01056458, "balance_loss_clip": 1.06201184, "balance_loss_mlp": 1.02808583, "epoch": 0.11158540251307641, "flos": 22343763087360.0, "grad_norm": 2.0520978310649904, "language_loss": 0.82775855, "learning_rate": 3.9306705934244455e-06, "loss": 0.85049295, "num_input_tokens_seen": 19697155, "step": 928, "time_per_iteration": 3.5734574794769287 }, { "auxiliary_loss_clip": 0.01181171, "auxiliary_loss_mlp": 0.01083661, "balance_loss_clip": 1.05958509, "balance_loss_mlp": 1.0549798, "epoch": 0.11170564540371551, "flos": 19902304684800.0, "grad_norm": 1.6351150438731343, "language_loss": 0.88054311, "learning_rate": 3.930467125294116e-06, "loss": 0.90319145, "num_input_tokens_seen": 19716705, "step": 929, "time_per_iteration": 3.6960463523864746 }, { "auxiliary_loss_clip": 0.01044292, "auxiliary_loss_mlp": 0.01038274, "balance_loss_clip": 1.0350368, "balance_loss_mlp": 1.03310013, "epoch": 0.1118258882943546, "flos": 64586239499520.0, "grad_norm": 0.9357603323007091, "language_loss": 0.60489643, "learning_rate": 3.930263364313458e-06, "loss": 0.62572205, "num_input_tokens_seen": 19767275, "step": 930, "time_per_iteration": 4.155316352844238 }, { "auxiliary_loss_clip": 0.01175105, "auxiliary_loss_mlp": 0.01066889, "balance_loss_clip": 1.05630851, "balance_loss_mlp": 1.03799272, "epoch": 0.11194613118499369, "flos": 17201965985280.0, "grad_norm": 1.9912002595427647, "language_loss": 0.82529569, "learning_rate": 3.930059310513384e-06, "loss": 0.84771562, "num_input_tokens_seen": 19786315, "step": 931, "time_per_iteration": 2.9850270748138428 }, { "auxiliary_loss_clip": 0.01170048, "auxiliary_loss_mlp": 0.00782478, "balance_loss_clip": 1.05969489, "balance_loss_mlp": 1.0004406, "epoch": 0.11206637407563277, "flos": 31863465728640.0, "grad_norm": 1.7339291340244312, "language_loss": 0.83877003, "learning_rate": 3.929854963924846e-06, "loss": 0.85829532, "num_input_tokens_seen": 19806580, "step": 932, "time_per_iteration": 2.76346755027771 }, { "auxiliary_loss_clip": 0.01172098, "auxiliary_loss_mlp": 0.01061547, "balance_loss_clip": 1.05605555, "balance_loss_mlp": 1.03360426, "epoch": 0.11218661696627187, "flos": 21945621761280.0, "grad_norm": 1.7394350891544506, "language_loss": 0.7729339, "learning_rate": 3.929650324578845e-06, "loss": 0.79527032, "num_input_tokens_seen": 19826045, "step": 933, "time_per_iteration": 2.6799674034118652 }, { "auxiliary_loss_clip": 0.01212133, "auxiliary_loss_mlp": 0.01071017, "balance_loss_clip": 1.06720221, "balance_loss_mlp": 1.04048717, "epoch": 0.11230685985691095, "flos": 25878198481920.0, "grad_norm": 2.4077910213537237, "language_loss": 0.81953084, "learning_rate": 3.929445392506423e-06, "loss": 0.84236228, "num_input_tokens_seen": 19843985, "step": 934, "time_per_iteration": 2.6763291358947754 }, { "auxiliary_loss_clip": 0.01211501, "auxiliary_loss_mlp": 0.01069367, "balance_loss_clip": 1.06934071, "balance_loss_mlp": 1.04113817, "epoch": 0.11242710274755005, "flos": 22231506107520.0, "grad_norm": 1.876683768342877, "language_loss": 0.7578131, "learning_rate": 3.92924016773867e-06, "loss": 0.78062183, "num_input_tokens_seen": 19860480, "step": 935, "time_per_iteration": 2.722085952758789 }, { "auxiliary_loss_clip": 0.01191882, "auxiliary_loss_mlp": 0.00780694, "balance_loss_clip": 1.05901814, "balance_loss_mlp": 1.00050366, "epoch": 0.11254734563818915, "flos": 17712184723200.0, "grad_norm": 2.370363584152138, "language_loss": 0.73137009, "learning_rate": 3.9290346503067175e-06, "loss": 0.75109577, "num_input_tokens_seen": 19877145, "step": 936, "time_per_iteration": 2.6329267024993896 }, { "auxiliary_loss_clip": 0.01206425, "auxiliary_loss_mlp": 0.0107217, "balance_loss_clip": 1.05846763, "balance_loss_mlp": 1.04421544, "epoch": 0.11266758852882823, "flos": 54930397334400.0, "grad_norm": 2.245381933947721, "language_loss": 0.78669977, "learning_rate": 3.9288288402417415e-06, "loss": 0.80948579, "num_input_tokens_seen": 19903405, "step": 937, "time_per_iteration": 2.9213597774505615 }, { "auxiliary_loss_clip": 0.01209915, "auxiliary_loss_mlp": 0.01068535, "balance_loss_clip": 1.06399131, "balance_loss_mlp": 1.03937602, "epoch": 0.11278783141946733, "flos": 18878132194560.0, "grad_norm": 2.1800697030418577, "language_loss": 0.70361316, "learning_rate": 3.928622737574964e-06, "loss": 0.72639769, "num_input_tokens_seen": 19918740, "step": 938, "time_per_iteration": 2.597761392593384 }, { "auxiliary_loss_clip": 0.01192315, "auxiliary_loss_mlp": 0.01074883, "balance_loss_clip": 1.05822206, "balance_loss_mlp": 1.04596281, "epoch": 0.11290807431010641, "flos": 26469252777600.0, "grad_norm": 1.9979997322701715, "language_loss": 0.91026962, "learning_rate": 3.928416342337652e-06, "loss": 0.93294162, "num_input_tokens_seen": 19938475, "step": 939, "time_per_iteration": 2.6746256351470947 }, { "auxiliary_loss_clip": 0.01187538, "auxiliary_loss_mlp": 0.01061432, "balance_loss_clip": 1.05715752, "balance_loss_mlp": 1.03366828, "epoch": 0.1130283172007455, "flos": 22710590732160.0, "grad_norm": 1.8178223055609006, "language_loss": 0.82744682, "learning_rate": 3.928209654561113e-06, "loss": 0.84993654, "num_input_tokens_seen": 19959310, "step": 940, "time_per_iteration": 2.6623780727386475 }, { "auxiliary_loss_clip": 0.01190054, "auxiliary_loss_mlp": 0.01065725, "balance_loss_clip": 1.06518257, "balance_loss_mlp": 1.03830647, "epoch": 0.1131485600913846, "flos": 23219911630080.0, "grad_norm": 1.9781367410660706, "language_loss": 0.80880296, "learning_rate": 3.928002674276703e-06, "loss": 0.8313607, "num_input_tokens_seen": 19978700, "step": 941, "time_per_iteration": 2.683007001876831 }, { "auxiliary_loss_clip": 0.01135818, "auxiliary_loss_mlp": 0.01064223, "balance_loss_clip": 1.04714012, "balance_loss_mlp": 1.03387213, "epoch": 0.11326880298202369, "flos": 14064271286400.0, "grad_norm": 2.203413960653347, "language_loss": 0.75387794, "learning_rate": 3.92779540151582e-06, "loss": 0.77587837, "num_input_tokens_seen": 19995785, "step": 942, "time_per_iteration": 2.653430223464966 }, { "auxiliary_loss_clip": 0.01194085, "auxiliary_loss_mlp": 0.01065058, "balance_loss_clip": 1.06341434, "balance_loss_mlp": 1.03601909, "epoch": 0.11338904587266278, "flos": 16325386479360.0, "grad_norm": 1.778974357595115, "language_loss": 0.85592842, "learning_rate": 3.927587836309907e-06, "loss": 0.87851989, "num_input_tokens_seen": 20013615, "step": 943, "time_per_iteration": 2.636833429336548 }, { "auxiliary_loss_clip": 0.01185454, "auxiliary_loss_mlp": 0.01072701, "balance_loss_clip": 1.05489671, "balance_loss_mlp": 1.0429939, "epoch": 0.11350928876330187, "flos": 24426258923520.0, "grad_norm": 1.808312871396008, "language_loss": 0.78187501, "learning_rate": 3.927379978690452e-06, "loss": 0.80445659, "num_input_tokens_seen": 20032880, "step": 944, "time_per_iteration": 2.6866025924682617 }, { "auxiliary_loss_clip": 0.01156368, "auxiliary_loss_mlp": 0.01081498, "balance_loss_clip": 1.04678726, "balance_loss_mlp": 1.05059934, "epoch": 0.11362953165394096, "flos": 24497074586880.0, "grad_norm": 3.696102093276137, "language_loss": 0.87500858, "learning_rate": 3.927171828688987e-06, "loss": 0.89738727, "num_input_tokens_seen": 20052405, "step": 945, "time_per_iteration": 2.7039635181427 }, { "auxiliary_loss_clip": 0.01218021, "auxiliary_loss_mlp": 0.01064145, "balance_loss_clip": 1.06329226, "balance_loss_mlp": 1.0367862, "epoch": 0.11374977454458005, "flos": 24060831909120.0, "grad_norm": 2.2992496336860038, "language_loss": 0.81897116, "learning_rate": 3.926963386337088e-06, "loss": 0.84179276, "num_input_tokens_seen": 20070635, "step": 946, "time_per_iteration": 2.605811834335327 }, { "auxiliary_loss_clip": 0.01224754, "auxiliary_loss_mlp": 0.01072877, "balance_loss_clip": 1.06386161, "balance_loss_mlp": 1.04401648, "epoch": 0.11387001743521914, "flos": 39457638967680.0, "grad_norm": 3.6506037458607206, "language_loss": 0.70374203, "learning_rate": 3.926754651666375e-06, "loss": 0.72671831, "num_input_tokens_seen": 20091195, "step": 947, "time_per_iteration": 2.726686954498291 }, { "auxiliary_loss_clip": 0.01180587, "auxiliary_loss_mlp": 0.0107035, "balance_loss_clip": 1.06082892, "balance_loss_mlp": 1.04107201, "epoch": 0.11399026032585824, "flos": 25082454533760.0, "grad_norm": 3.2925963281512827, "language_loss": 0.78321159, "learning_rate": 3.926545624708513e-06, "loss": 0.80572093, "num_input_tokens_seen": 20110435, "step": 948, "time_per_iteration": 2.6933794021606445 }, { "auxiliary_loss_clip": 0.01171937, "auxiliary_loss_mlp": 0.01074113, "balance_loss_clip": 1.05520153, "balance_loss_mlp": 1.04417968, "epoch": 0.11411050321649732, "flos": 17961835224960.0, "grad_norm": 2.0128464798898023, "language_loss": 0.85853088, "learning_rate": 3.926336305495213e-06, "loss": 0.88099134, "num_input_tokens_seen": 20128995, "step": 949, "time_per_iteration": 2.833742618560791 }, { "auxiliary_loss_clip": 0.01160851, "auxiliary_loss_mlp": 0.01082385, "balance_loss_clip": 1.05241358, "balance_loss_mlp": 1.05210614, "epoch": 0.11423074610713642, "flos": 22455409536000.0, "grad_norm": 2.4097540213733373, "language_loss": 0.88548863, "learning_rate": 3.926126694058226e-06, "loss": 0.90792096, "num_input_tokens_seen": 20148145, "step": 950, "time_per_iteration": 2.704787254333496 }, { "auxiliary_loss_clip": 0.01172072, "auxiliary_loss_mlp": 0.0106815, "balance_loss_clip": 1.06496906, "balance_loss_mlp": 1.04116058, "epoch": 0.1143509889977755, "flos": 19717687756800.0, "grad_norm": 1.700355984154104, "language_loss": 0.81831044, "learning_rate": 3.92591679042935e-06, "loss": 0.84071267, "num_input_tokens_seen": 20168035, "step": 951, "time_per_iteration": 2.68550705909729 }, { "auxiliary_loss_clip": 0.0120718, "auxiliary_loss_mlp": 0.01067719, "balance_loss_clip": 1.06540477, "balance_loss_mlp": 1.03965759, "epoch": 0.1144712318884146, "flos": 19822869757440.0, "grad_norm": 1.6385407970176071, "language_loss": 0.82247156, "learning_rate": 3.92570659464043e-06, "loss": 0.84522057, "num_input_tokens_seen": 20186095, "step": 952, "time_per_iteration": 2.5955162048339844 }, { "auxiliary_loss_clip": 0.0120472, "auxiliary_loss_mlp": 0.00781144, "balance_loss_clip": 1.06492722, "balance_loss_mlp": 1.00043559, "epoch": 0.1145914747790537, "flos": 14939198766720.0, "grad_norm": 1.8290641899527773, "language_loss": 0.79532081, "learning_rate": 3.925496106723349e-06, "loss": 0.81517947, "num_input_tokens_seen": 20203535, "step": 953, "time_per_iteration": 2.6007306575775146 }, { "auxiliary_loss_clip": 0.01203493, "auxiliary_loss_mlp": 0.01079462, "balance_loss_clip": 1.0610981, "balance_loss_mlp": 1.05103087, "epoch": 0.11471171766969278, "flos": 19865029345920.0, "grad_norm": 1.8332777402461398, "language_loss": 0.83673632, "learning_rate": 3.9252853267100405e-06, "loss": 0.85956585, "num_input_tokens_seen": 20222780, "step": 954, "time_per_iteration": 5.624022722244263 }, { "auxiliary_loss_clip": 0.01168494, "auxiliary_loss_mlp": 0.01069544, "balance_loss_clip": 1.05652571, "balance_loss_mlp": 1.04254293, "epoch": 0.11483196056033187, "flos": 22526476594560.0, "grad_norm": 2.3560913966968102, "language_loss": 0.83524978, "learning_rate": 3.9250742546324786e-06, "loss": 0.85763007, "num_input_tokens_seen": 20243015, "step": 955, "time_per_iteration": 2.713864326477051 }, { "auxiliary_loss_clip": 0.01188753, "auxiliary_loss_mlp": 0.01070371, "balance_loss_clip": 1.05800629, "balance_loss_mlp": 1.04055655, "epoch": 0.11495220345097096, "flos": 28220292887040.0, "grad_norm": 1.7312591017925245, "language_loss": 0.86695921, "learning_rate": 3.924862890522683e-06, "loss": 0.88955051, "num_input_tokens_seen": 20263025, "step": 956, "time_per_iteration": 3.6116316318511963 }, { "auxiliary_loss_clip": 0.01203191, "auxiliary_loss_mlp": 0.01079145, "balance_loss_clip": 1.05834055, "balance_loss_mlp": 1.04896116, "epoch": 0.11507244634161005, "flos": 17492267704320.0, "grad_norm": 2.3328731369316937, "language_loss": 0.86148614, "learning_rate": 3.9246512344127174e-06, "loss": 0.88430947, "num_input_tokens_seen": 20280685, "step": 957, "time_per_iteration": 2.5820813179016113 }, { "auxiliary_loss_clip": 0.01119265, "auxiliary_loss_mlp": 0.01066761, "balance_loss_clip": 1.04403639, "balance_loss_mlp": 1.03781676, "epoch": 0.11519268923224914, "flos": 22564937082240.0, "grad_norm": 1.9412256900405114, "language_loss": 0.81978393, "learning_rate": 3.9244392863346895e-06, "loss": 0.84164417, "num_input_tokens_seen": 20300090, "step": 958, "time_per_iteration": 2.7431914806365967 }, { "auxiliary_loss_clip": 0.0118928, "auxiliary_loss_mlp": 0.0106451, "balance_loss_clip": 1.06115699, "balance_loss_mlp": 1.03712714, "epoch": 0.11531293212288823, "flos": 16982839065600.0, "grad_norm": 1.890368934007395, "language_loss": 0.92161608, "learning_rate": 3.9242270463207524e-06, "loss": 0.94415396, "num_input_tokens_seen": 20318480, "step": 959, "time_per_iteration": 2.6363797187805176 }, { "auxiliary_loss_clip": 0.01149166, "auxiliary_loss_mlp": 0.01065293, "balance_loss_clip": 1.05246305, "balance_loss_mlp": 1.03677785, "epoch": 0.11543317501352733, "flos": 12422004537600.0, "grad_norm": 2.733604357305767, "language_loss": 0.84954941, "learning_rate": 3.924014514403102e-06, "loss": 0.87169397, "num_input_tokens_seen": 20334635, "step": 960, "time_per_iteration": 2.675682783126831 }, { "auxiliary_loss_clip": 0.01152971, "auxiliary_loss_mlp": 0.01073782, "balance_loss_clip": 1.05019689, "balance_loss_mlp": 1.04489756, "epoch": 0.11555341790416641, "flos": 19821648695040.0, "grad_norm": 2.089579194371641, "language_loss": 0.91114819, "learning_rate": 3.92380169061398e-06, "loss": 0.93341565, "num_input_tokens_seen": 20352415, "step": 961, "time_per_iteration": 2.7419662475585938 }, { "auxiliary_loss_clip": 0.01160153, "auxiliary_loss_mlp": 0.00781439, "balance_loss_clip": 1.04817319, "balance_loss_mlp": 1.0003705, "epoch": 0.11567366079480551, "flos": 25738865625600.0, "grad_norm": 1.9990968514599827, "language_loss": 0.83909881, "learning_rate": 3.9235885749856705e-06, "loss": 0.85851467, "num_input_tokens_seen": 20371095, "step": 962, "time_per_iteration": 2.7157750129699707 }, { "auxiliary_loss_clip": 0.01190674, "auxiliary_loss_mlp": 0.01066326, "balance_loss_clip": 1.05819464, "balance_loss_mlp": 1.0385983, "epoch": 0.1157939036854446, "flos": 18223301301120.0, "grad_norm": 1.8692676692121846, "language_loss": 0.8261475, "learning_rate": 3.9233751675505035e-06, "loss": 0.84871751, "num_input_tokens_seen": 20389805, "step": 963, "time_per_iteration": 2.6208109855651855 }, { "auxiliary_loss_clip": 0.01182411, "auxiliary_loss_mlp": 0.01083565, "balance_loss_clip": 1.05783033, "balance_loss_mlp": 1.05438304, "epoch": 0.11591414657608369, "flos": 23073755189760.0, "grad_norm": 1.8324948846651112, "language_loss": 0.84798336, "learning_rate": 3.923161468340853e-06, "loss": 0.87064314, "num_input_tokens_seen": 20409640, "step": 964, "time_per_iteration": 2.6601550579071045 }, { "auxiliary_loss_clip": 0.0114532, "auxiliary_loss_mlp": 0.01069343, "balance_loss_clip": 1.04824305, "balance_loss_mlp": 1.03729963, "epoch": 0.11603438946672277, "flos": 19461716461440.0, "grad_norm": 1.8285047463195743, "language_loss": 0.81792891, "learning_rate": 3.9229474773891374e-06, "loss": 0.84007561, "num_input_tokens_seen": 20428180, "step": 965, "time_per_iteration": 2.704730987548828 }, { "auxiliary_loss_clip": 0.01177281, "auxiliary_loss_mlp": 0.0106894, "balance_loss_clip": 1.04972625, "balance_loss_mlp": 1.03817248, "epoch": 0.11615463235736187, "flos": 26831986272000.0, "grad_norm": 4.112361234587045, "language_loss": 0.83715165, "learning_rate": 3.922733194727818e-06, "loss": 0.85961384, "num_input_tokens_seen": 20447975, "step": 966, "time_per_iteration": 2.719836473464966 }, { "auxiliary_loss_clip": 0.01210308, "auxiliary_loss_mlp": 0.01065456, "balance_loss_clip": 1.06231368, "balance_loss_mlp": 1.0361073, "epoch": 0.11627487524800097, "flos": 18580324533120.0, "grad_norm": 1.9454217888935939, "language_loss": 0.87446076, "learning_rate": 3.922518620389402e-06, "loss": 0.89721835, "num_input_tokens_seen": 20464840, "step": 967, "time_per_iteration": 2.612196445465088 }, { "auxiliary_loss_clip": 0.0109641, "auxiliary_loss_mlp": 0.01071354, "balance_loss_clip": 1.04317951, "balance_loss_mlp": 1.04264879, "epoch": 0.11639511813864005, "flos": 18150474476160.0, "grad_norm": 2.0091442144659006, "language_loss": 0.89633584, "learning_rate": 3.922303754406439e-06, "loss": 0.91801345, "num_input_tokens_seen": 20482680, "step": 968, "time_per_iteration": 2.807835817337036 }, { "auxiliary_loss_clip": 0.01148198, "auxiliary_loss_mlp": 0.01074558, "balance_loss_clip": 1.04950786, "balance_loss_mlp": 1.04234779, "epoch": 0.11651536102927915, "flos": 20922023888640.0, "grad_norm": 1.8181028905456582, "language_loss": 0.7932784, "learning_rate": 3.922088596811526e-06, "loss": 0.81550598, "num_input_tokens_seen": 20501810, "step": 969, "time_per_iteration": 2.892091751098633 }, { "auxiliary_loss_clip": 0.01189913, "auxiliary_loss_mlp": 0.010634, "balance_loss_clip": 1.05841708, "balance_loss_mlp": 1.03631604, "epoch": 0.11663560391991823, "flos": 16508602776960.0, "grad_norm": 2.2586956316220723, "language_loss": 0.8693136, "learning_rate": 3.9218731476373e-06, "loss": 0.89184678, "num_input_tokens_seen": 20517995, "step": 970, "time_per_iteration": 2.6146674156188965 }, { "auxiliary_loss_clip": 0.0120966, "auxiliary_loss_mlp": 0.010684, "balance_loss_clip": 1.06137455, "balance_loss_mlp": 1.03763223, "epoch": 0.11675584681055733, "flos": 19865029345920.0, "grad_norm": 2.170334993894076, "language_loss": 0.84405833, "learning_rate": 3.9216574069164455e-06, "loss": 0.86683893, "num_input_tokens_seen": 20536970, "step": 971, "time_per_iteration": 2.638002634048462 }, { "auxiliary_loss_clip": 0.01213057, "auxiliary_loss_mlp": 0.01073632, "balance_loss_clip": 1.05880177, "balance_loss_mlp": 1.04611838, "epoch": 0.11687608970119642, "flos": 21944364785280.0, "grad_norm": 1.52760258389758, "language_loss": 0.80056882, "learning_rate": 3.921441374681691e-06, "loss": 0.82343578, "num_input_tokens_seen": 20557030, "step": 972, "time_per_iteration": 2.5871999263763428 }, { "auxiliary_loss_clip": 0.01177258, "auxiliary_loss_mlp": 0.01073876, "balance_loss_clip": 1.05354977, "balance_loss_mlp": 1.0451107, "epoch": 0.1169963325918355, "flos": 24061155131520.0, "grad_norm": 1.9882075125143819, "language_loss": 0.65037477, "learning_rate": 3.921225050965808e-06, "loss": 0.67288607, "num_input_tokens_seen": 20576915, "step": 973, "time_per_iteration": 2.66719388961792 }, { "auxiliary_loss_clip": 0.01167515, "auxiliary_loss_mlp": 0.01058307, "balance_loss_clip": 1.05326521, "balance_loss_mlp": 1.03171098, "epoch": 0.1171165754824746, "flos": 23368151059200.0, "grad_norm": 1.9545343850863415, "language_loss": 0.750157, "learning_rate": 3.921008435801612e-06, "loss": 0.77241528, "num_input_tokens_seen": 20596000, "step": 974, "time_per_iteration": 2.7093605995178223 }, { "auxiliary_loss_clip": 0.01188029, "auxiliary_loss_mlp": 0.01070347, "balance_loss_clip": 1.05540967, "balance_loss_mlp": 1.04280972, "epoch": 0.11723681837311369, "flos": 18552243075840.0, "grad_norm": 2.5830223363138263, "language_loss": 0.75684118, "learning_rate": 3.920791529221963e-06, "loss": 0.77942497, "num_input_tokens_seen": 20614675, "step": 975, "time_per_iteration": 2.5736138820648193 }, { "auxiliary_loss_clip": 0.01183105, "auxiliary_loss_mlp": 0.00781057, "balance_loss_clip": 1.05316591, "balance_loss_mlp": 1.00052536, "epoch": 0.11735706126375278, "flos": 23550541344000.0, "grad_norm": 2.1214484604981205, "language_loss": 0.76406538, "learning_rate": 3.920574331259768e-06, "loss": 0.78370696, "num_input_tokens_seen": 20635875, "step": 976, "time_per_iteration": 2.6770410537719727 }, { "auxiliary_loss_clip": 0.01171697, "auxiliary_loss_mlp": 0.01061121, "balance_loss_clip": 1.05387843, "balance_loss_mlp": 1.03322589, "epoch": 0.11747730415439187, "flos": 22381541216640.0, "grad_norm": 2.2666028467136177, "language_loss": 0.79695386, "learning_rate": 3.9203568419479716e-06, "loss": 0.81928205, "num_input_tokens_seen": 20656430, "step": 977, "time_per_iteration": 2.6566684246063232 }, { "auxiliary_loss_clip": 0.01182278, "auxiliary_loss_mlp": 0.01059915, "balance_loss_clip": 1.05387509, "balance_loss_mlp": 1.03300977, "epoch": 0.11759754704503096, "flos": 22200731130240.0, "grad_norm": 2.128828454382315, "language_loss": 0.75477189, "learning_rate": 3.92013906131957e-06, "loss": 0.77719378, "num_input_tokens_seen": 20675360, "step": 978, "time_per_iteration": 2.6417348384857178 }, { "auxiliary_loss_clip": 0.01163918, "auxiliary_loss_mlp": 0.01058659, "balance_loss_clip": 1.05285788, "balance_loss_mlp": 1.03156233, "epoch": 0.11771778993567006, "flos": 22309755886080.0, "grad_norm": 2.222114850374815, "language_loss": 0.82316178, "learning_rate": 3.9199209894076e-06, "loss": 0.84538758, "num_input_tokens_seen": 20695675, "step": 979, "time_per_iteration": 2.674229860305786 }, { "auxiliary_loss_clip": 0.01218742, "auxiliary_loss_mlp": 0.01068634, "balance_loss_clip": 1.05869341, "balance_loss_mlp": 1.04127574, "epoch": 0.11783803282630914, "flos": 21288169175040.0, "grad_norm": 1.826940761656661, "language_loss": 0.90069914, "learning_rate": 3.919702626245142e-06, "loss": 0.9235729, "num_input_tokens_seen": 20715330, "step": 980, "time_per_iteration": 5.492232084274292 }, { "auxiliary_loss_clip": 0.01169322, "auxiliary_loss_mlp": 0.01067917, "balance_loss_clip": 1.05083728, "balance_loss_mlp": 1.03970027, "epoch": 0.11795827571694824, "flos": 25371535190400.0, "grad_norm": 2.540609467100874, "language_loss": 0.66258872, "learning_rate": 3.919483971865322e-06, "loss": 0.68496108, "num_input_tokens_seen": 20735325, "step": 981, "time_per_iteration": 2.662976026535034 }, { "auxiliary_loss_clip": 0.01185269, "auxiliary_loss_mlp": 0.01068069, "balance_loss_clip": 1.0583725, "balance_loss_mlp": 1.04204535, "epoch": 0.11807851860758732, "flos": 23622218933760.0, "grad_norm": 1.8493714531959087, "language_loss": 0.87858701, "learning_rate": 3.91926502630131e-06, "loss": 0.90112031, "num_input_tokens_seen": 20755940, "step": 982, "time_per_iteration": 3.5235555171966553 }, { "auxiliary_loss_clip": 0.01201309, "auxiliary_loss_mlp": 0.01065732, "balance_loss_clip": 1.06104374, "balance_loss_mlp": 1.03938723, "epoch": 0.11819876149822642, "flos": 24972496024320.0, "grad_norm": 2.5984971782850455, "language_loss": 0.72426617, "learning_rate": 3.91904578958632e-06, "loss": 0.74693662, "num_input_tokens_seen": 20775355, "step": 983, "time_per_iteration": 2.6608638763427734 }, { "auxiliary_loss_clip": 0.01217842, "auxiliary_loss_mlp": 0.01070431, "balance_loss_clip": 1.06156147, "balance_loss_mlp": 1.04008055, "epoch": 0.11831900438886551, "flos": 23003226835200.0, "grad_norm": 2.064163506707136, "language_loss": 0.84185654, "learning_rate": 3.918826261753608e-06, "loss": 0.8647393, "num_input_tokens_seen": 20794935, "step": 984, "time_per_iteration": 2.5669424533843994 }, { "auxiliary_loss_clip": 0.01184739, "auxiliary_loss_mlp": 0.01065671, "balance_loss_clip": 1.054672, "balance_loss_mlp": 1.03808618, "epoch": 0.1184392472795046, "flos": 27965147604480.0, "grad_norm": 4.3321969379352545, "language_loss": 0.71603203, "learning_rate": 3.918606442836478e-06, "loss": 0.73853612, "num_input_tokens_seen": 20817155, "step": 985, "time_per_iteration": 2.698859930038452 }, { "auxiliary_loss_clip": 0.01189606, "auxiliary_loss_mlp": 0.01066846, "balance_loss_clip": 1.05776119, "balance_loss_mlp": 1.03980947, "epoch": 0.1185594901701437, "flos": 19898497843200.0, "grad_norm": 1.9180630658072602, "language_loss": 0.77327859, "learning_rate": 3.918386332868277e-06, "loss": 0.79584312, "num_input_tokens_seen": 20835125, "step": 986, "time_per_iteration": 2.6038320064544678 }, { "auxiliary_loss_clip": 0.011873, "auxiliary_loss_mlp": 0.01075899, "balance_loss_clip": 1.05466282, "balance_loss_mlp": 1.04931498, "epoch": 0.11867973306078278, "flos": 18912354877440.0, "grad_norm": 1.8687008385765576, "language_loss": 0.94329369, "learning_rate": 3.918165931882394e-06, "loss": 0.96592575, "num_input_tokens_seen": 20853525, "step": 987, "time_per_iteration": 2.629750967025757 }, { "auxiliary_loss_clip": 0.01127617, "auxiliary_loss_mlp": 0.01065351, "balance_loss_clip": 1.04359627, "balance_loss_mlp": 1.03655005, "epoch": 0.11879997595142187, "flos": 16982803152000.0, "grad_norm": 2.406037606160826, "language_loss": 0.7602511, "learning_rate": 3.917945239912264e-06, "loss": 0.78218079, "num_input_tokens_seen": 20871000, "step": 988, "time_per_iteration": 2.7410998344421387 }, { "auxiliary_loss_clip": 0.01157797, "auxiliary_loss_mlp": 0.01064538, "balance_loss_clip": 1.05336452, "balance_loss_mlp": 1.0388962, "epoch": 0.11892021884206096, "flos": 17530369056000.0, "grad_norm": 2.1658703677769484, "language_loss": 0.75703728, "learning_rate": 3.917724256991367e-06, "loss": 0.77926064, "num_input_tokens_seen": 20889745, "step": 989, "time_per_iteration": 2.685608148574829 }, { "auxiliary_loss_clip": 0.01170481, "auxiliary_loss_mlp": 0.01060374, "balance_loss_clip": 1.05344188, "balance_loss_mlp": 1.03253841, "epoch": 0.11904046173270005, "flos": 30955895763840.0, "grad_norm": 2.0675455453486395, "language_loss": 0.81465375, "learning_rate": 3.9175029831532245e-06, "loss": 0.83696228, "num_input_tokens_seen": 20909260, "step": 990, "time_per_iteration": 2.703033685684204 }, { "auxiliary_loss_clip": 0.01169847, "auxiliary_loss_mlp": 0.01062015, "balance_loss_clip": 1.05385399, "balance_loss_mlp": 1.03633761, "epoch": 0.11916070462333915, "flos": 20157234485760.0, "grad_norm": 2.18007737332042, "language_loss": 0.88503987, "learning_rate": 3.917281418431404e-06, "loss": 0.90735847, "num_input_tokens_seen": 20928305, "step": 991, "time_per_iteration": 2.666175603866577 }, { "auxiliary_loss_clip": 0.01183424, "auxiliary_loss_mlp": 0.01073506, "balance_loss_clip": 1.05645156, "balance_loss_mlp": 1.04465699, "epoch": 0.11928094751397823, "flos": 23551115961600.0, "grad_norm": 2.517132462052433, "language_loss": 0.76886392, "learning_rate": 3.917059562859516e-06, "loss": 0.79143322, "num_input_tokens_seen": 20947630, "step": 992, "time_per_iteration": 2.661245346069336 }, { "auxiliary_loss_clip": 0.01174755, "auxiliary_loss_mlp": 0.01061279, "balance_loss_clip": 1.05852938, "balance_loss_mlp": 1.0350883, "epoch": 0.11940119040461733, "flos": 23908426502400.0, "grad_norm": 2.0926738705606187, "language_loss": 0.88938534, "learning_rate": 3.916837416471218e-06, "loss": 0.91174567, "num_input_tokens_seen": 20964250, "step": 993, "time_per_iteration": 2.661180257797241 }, { "auxiliary_loss_clip": 0.01189692, "auxiliary_loss_mlp": 0.01062314, "balance_loss_clip": 1.05400348, "balance_loss_mlp": 1.03563464, "epoch": 0.11952143329525641, "flos": 13844533835520.0, "grad_norm": 2.5026999171733726, "language_loss": 0.72362649, "learning_rate": 3.916614979300207e-06, "loss": 0.74614656, "num_input_tokens_seen": 20979095, "step": 994, "time_per_iteration": 2.586601495742798 }, { "auxiliary_loss_clip": 0.01143309, "auxiliary_loss_mlp": 0.01065845, "balance_loss_clip": 1.05046725, "balance_loss_mlp": 1.03873706, "epoch": 0.11964167618589551, "flos": 27015525792000.0, "grad_norm": 1.7810333052551992, "language_loss": 0.78598046, "learning_rate": 3.9163922513802274e-06, "loss": 0.80807203, "num_input_tokens_seen": 21001430, "step": 995, "time_per_iteration": 2.756226062774658 }, { "auxiliary_loss_clip": 0.01215852, "auxiliary_loss_mlp": 0.01065961, "balance_loss_clip": 1.05778503, "balance_loss_mlp": 1.03745794, "epoch": 0.1197619190765346, "flos": 12567622273920.0, "grad_norm": 3.120833702258897, "language_loss": 0.82076818, "learning_rate": 3.916169232745067e-06, "loss": 0.84358633, "num_input_tokens_seen": 21019105, "step": 996, "time_per_iteration": 2.5589752197265625 }, { "auxiliary_loss_clip": 0.01175977, "auxiliary_loss_mlp": 0.01063403, "balance_loss_clip": 1.05313468, "balance_loss_mlp": 1.03417265, "epoch": 0.11988216196717369, "flos": 16909437623040.0, "grad_norm": 2.4090404231081455, "language_loss": 0.92050582, "learning_rate": 3.915945923428559e-06, "loss": 0.94289958, "num_input_tokens_seen": 21035630, "step": 997, "time_per_iteration": 2.607238292694092 }, { "auxiliary_loss_clip": 0.01195136, "auxiliary_loss_mlp": 0.01054033, "balance_loss_clip": 1.05694234, "balance_loss_mlp": 1.02784252, "epoch": 0.12000240485781279, "flos": 16216577205120.0, "grad_norm": 1.9705556998777936, "language_loss": 0.83035243, "learning_rate": 3.915722323464577e-06, "loss": 0.85284406, "num_input_tokens_seen": 21054235, "step": 998, "time_per_iteration": 2.6176931858062744 }, { "auxiliary_loss_clip": 0.01199758, "auxiliary_loss_mlp": 0.01066986, "balance_loss_clip": 1.05802011, "balance_loss_mlp": 1.04085517, "epoch": 0.12012264774845187, "flos": 49344887525760.0, "grad_norm": 2.4276588077869814, "language_loss": 0.70365858, "learning_rate": 3.91549843288704e-06, "loss": 0.72632599, "num_input_tokens_seen": 21077915, "step": 999, "time_per_iteration": 2.8410511016845703 }, { "auxiliary_loss_clip": 0.01169326, "auxiliary_loss_mlp": 0.00780189, "balance_loss_clip": 1.05298507, "balance_loss_mlp": 1.00042403, "epoch": 0.12024289063909097, "flos": 26979435601920.0, "grad_norm": 1.965468284776807, "language_loss": 0.79175603, "learning_rate": 3.915274251729916e-06, "loss": 0.81125116, "num_input_tokens_seen": 21099205, "step": 1000, "time_per_iteration": 2.75384783744812 }, { "auxiliary_loss_clip": 0.011792, "auxiliary_loss_mlp": 0.01066411, "balance_loss_clip": 1.05816436, "balance_loss_mlp": 1.03884971, "epoch": 0.12036313352973005, "flos": 19537308633600.0, "grad_norm": 2.0357881922759375, "language_loss": 0.89891052, "learning_rate": 3.91504978002721e-06, "loss": 0.92136663, "num_input_tokens_seen": 21118260, "step": 1001, "time_per_iteration": 2.674133777618408 }, { "auxiliary_loss_clip": 0.01183495, "auxiliary_loss_mlp": 0.00779278, "balance_loss_clip": 1.05524457, "balance_loss_mlp": 1.00034678, "epoch": 0.12048337642036915, "flos": 17268256535040.0, "grad_norm": 2.460307939256106, "language_loss": 0.76043451, "learning_rate": 3.914825017812974e-06, "loss": 0.78006226, "num_input_tokens_seen": 21134910, "step": 1002, "time_per_iteration": 2.602243423461914 }, { "auxiliary_loss_clip": 0.01181736, "auxiliary_loss_mlp": 0.01069469, "balance_loss_clip": 1.05504394, "balance_loss_mlp": 1.04245567, "epoch": 0.12060361931100824, "flos": 22856962654080.0, "grad_norm": 2.278524924054961, "language_loss": 0.72158778, "learning_rate": 3.9145999651213065e-06, "loss": 0.74409974, "num_input_tokens_seen": 21154150, "step": 1003, "time_per_iteration": 2.628406047821045 }, { "auxiliary_loss_clip": 0.01198682, "auxiliary_loss_mlp": 0.01069489, "balance_loss_clip": 1.05828166, "balance_loss_mlp": 1.04204655, "epoch": 0.12072386220164733, "flos": 16726795943040.0, "grad_norm": 3.19567942201039, "language_loss": 0.88422489, "learning_rate": 3.9143746219863465e-06, "loss": 0.9069066, "num_input_tokens_seen": 21171255, "step": 1004, "time_per_iteration": 2.616264581680298 }, { "auxiliary_loss_clip": 0.0108162, "auxiliary_loss_mlp": 0.01016194, "balance_loss_clip": 1.03188431, "balance_loss_mlp": 1.01183057, "epoch": 0.12084410509228642, "flos": 55144176105600.0, "grad_norm": 0.9097680078985064, "language_loss": 0.64770484, "learning_rate": 3.914148988442278e-06, "loss": 0.66868299, "num_input_tokens_seen": 21227045, "step": 1005, "time_per_iteration": 3.179946184158325 }, { "auxiliary_loss_clip": 0.01171023, "auxiliary_loss_mlp": 0.01063927, "balance_loss_clip": 1.05306864, "balance_loss_mlp": 1.03567469, "epoch": 0.1209643479829255, "flos": 26760236855040.0, "grad_norm": 3.056408218186805, "language_loss": 0.9508009, "learning_rate": 3.91392306452333e-06, "loss": 0.97315043, "num_input_tokens_seen": 21244120, "step": 1006, "time_per_iteration": 4.651889801025391 }, { "auxiliary_loss_clip": 0.01217058, "auxiliary_loss_mlp": 0.01057923, "balance_loss_clip": 1.06130195, "balance_loss_mlp": 1.03005219, "epoch": 0.1210845908735646, "flos": 11035026725760.0, "grad_norm": 2.8596481196637686, "language_loss": 0.66487402, "learning_rate": 3.913696850263774e-06, "loss": 0.68762386, "num_input_tokens_seen": 21258485, "step": 1007, "time_per_iteration": 2.541266441345215 }, { "auxiliary_loss_clip": 0.01194178, "auxiliary_loss_mlp": 0.01066597, "balance_loss_clip": 1.0543108, "balance_loss_mlp": 1.03994203, "epoch": 0.1212048337642037, "flos": 20484631975680.0, "grad_norm": 2.3094224989057848, "language_loss": 0.79361719, "learning_rate": 3.913470345697929e-06, "loss": 0.81622493, "num_input_tokens_seen": 21277115, "step": 1008, "time_per_iteration": 3.5258734226226807 }, { "auxiliary_loss_clip": 0.01157742, "auxiliary_loss_mlp": 0.01077162, "balance_loss_clip": 1.05396914, "balance_loss_mlp": 1.04776537, "epoch": 0.12132507665484278, "flos": 22346061557760.0, "grad_norm": 2.3217569051896176, "language_loss": 0.85367751, "learning_rate": 3.913243550860153e-06, "loss": 0.87602657, "num_input_tokens_seen": 21294880, "step": 1009, "time_per_iteration": 2.769273042678833 }, { "auxiliary_loss_clip": 0.01209392, "auxiliary_loss_mlp": 0.01070207, "balance_loss_clip": 1.06649637, "balance_loss_mlp": 1.0427171, "epoch": 0.12144531954548188, "flos": 29314957818240.0, "grad_norm": 1.9786451013170796, "language_loss": 0.75897044, "learning_rate": 3.913016465784852e-06, "loss": 0.78176641, "num_input_tokens_seen": 21315555, "step": 1010, "time_per_iteration": 2.7016379833221436 }, { "auxiliary_loss_clip": 0.01153836, "auxiliary_loss_mlp": 0.01068859, "balance_loss_clip": 1.05083966, "balance_loss_mlp": 1.04324079, "epoch": 0.12156556243612096, "flos": 20485242506880.0, "grad_norm": 2.7510078552432926, "language_loss": 0.71877533, "learning_rate": 3.912789090506474e-06, "loss": 0.74100232, "num_input_tokens_seen": 21334815, "step": 1011, "time_per_iteration": 2.6763312816619873 }, { "auxiliary_loss_clip": 0.01174663, "auxiliary_loss_mlp": 0.01058049, "balance_loss_clip": 1.05127573, "balance_loss_mlp": 1.03145313, "epoch": 0.12168580532676006, "flos": 16472009796480.0, "grad_norm": 2.378765449163589, "language_loss": 0.72104609, "learning_rate": 3.9125614250595114e-06, "loss": 0.74337327, "num_input_tokens_seen": 21351025, "step": 1012, "time_per_iteration": 2.6856024265289307 }, { "auxiliary_loss_clip": 0.01200948, "auxiliary_loss_mlp": 0.01066795, "balance_loss_clip": 1.05717266, "balance_loss_mlp": 1.03930545, "epoch": 0.12180604821739914, "flos": 15341290588800.0, "grad_norm": 2.339172180434936, "language_loss": 0.89265776, "learning_rate": 3.912333469478502e-06, "loss": 0.91533518, "num_input_tokens_seen": 21368990, "step": 1013, "time_per_iteration": 2.5964760780334473 }, { "auxiliary_loss_clip": 0.01181882, "auxiliary_loss_mlp": 0.01063206, "balance_loss_clip": 1.05436492, "balance_loss_mlp": 1.03781402, "epoch": 0.12192629110803824, "flos": 19318038059520.0, "grad_norm": 5.863213391794093, "language_loss": 0.78064764, "learning_rate": 3.912105223798025e-06, "loss": 0.80309856, "num_input_tokens_seen": 21388410, "step": 1014, "time_per_iteration": 2.612252950668335 }, { "auxiliary_loss_clip": 0.01065738, "auxiliary_loss_mlp": 0.01004704, "balance_loss_clip": 1.0264442, "balance_loss_mlp": 1.00007868, "epoch": 0.12204653399867733, "flos": 47725354085760.0, "grad_norm": 0.9929458758471656, "language_loss": 0.67648923, "learning_rate": 3.9118766880527065e-06, "loss": 0.69719362, "num_input_tokens_seen": 21442845, "step": 1015, "time_per_iteration": 3.1184306144714355 }, { "auxiliary_loss_clip": 0.01144268, "auxiliary_loss_mlp": 0.01052201, "balance_loss_clip": 1.05130053, "balance_loss_mlp": 1.02579582, "epoch": 0.12216677688931642, "flos": 18221936584320.0, "grad_norm": 1.8260132045906852, "language_loss": 0.73641676, "learning_rate": 3.9116478622772145e-06, "loss": 0.75838143, "num_input_tokens_seen": 21461420, "step": 1016, "time_per_iteration": 2.6892178058624268 }, { "auxiliary_loss_clip": 0.01194153, "auxiliary_loss_mlp": 0.01058793, "balance_loss_clip": 1.05878425, "balance_loss_mlp": 1.03307986, "epoch": 0.12228701977995551, "flos": 27525636789120.0, "grad_norm": 1.7894725796429745, "language_loss": 0.87870622, "learning_rate": 3.911418746506261e-06, "loss": 0.9012357, "num_input_tokens_seen": 21481550, "step": 1017, "time_per_iteration": 2.6244211196899414 }, { "auxiliary_loss_clip": 0.01209682, "auxiliary_loss_mlp": 0.01057958, "balance_loss_clip": 1.06584156, "balance_loss_mlp": 1.0332576, "epoch": 0.1224072626705946, "flos": 21798136517760.0, "grad_norm": 1.896603710064752, "language_loss": 0.78592867, "learning_rate": 3.911189340774604e-06, "loss": 0.80860507, "num_input_tokens_seen": 21501680, "step": 1018, "time_per_iteration": 2.6060118675231934 }, { "auxiliary_loss_clip": 0.01190548, "auxiliary_loss_mlp": 0.01064248, "balance_loss_clip": 1.05585122, "balance_loss_mlp": 1.03812981, "epoch": 0.1225275055612337, "flos": 20703758895360.0, "grad_norm": 1.8950848848496722, "language_loss": 0.79284704, "learning_rate": 3.910959645117043e-06, "loss": 0.81539506, "num_input_tokens_seen": 21521015, "step": 1019, "time_per_iteration": 2.6123390197753906 }, { "auxiliary_loss_clip": 0.01073901, "auxiliary_loss_mlp": 0.00761771, "balance_loss_clip": 1.02625656, "balance_loss_mlp": 1.00060081, "epoch": 0.12264774845187278, "flos": 57745294462080.0, "grad_norm": 0.8157509356915835, "language_loss": 0.56722414, "learning_rate": 3.910729659568423e-06, "loss": 0.58558083, "num_input_tokens_seen": 21578200, "step": 1020, "time_per_iteration": 3.207901954650879 }, { "auxiliary_loss_clip": 0.01183141, "auxiliary_loss_mlp": 0.01069154, "balance_loss_clip": 1.05755949, "balance_loss_mlp": 1.04242718, "epoch": 0.12276799134251187, "flos": 26396282298240.0, "grad_norm": 2.4514892263882, "language_loss": 0.82525963, "learning_rate": 3.9104993841636344e-06, "loss": 0.84778255, "num_input_tokens_seen": 21598770, "step": 1021, "time_per_iteration": 2.7288479804992676 }, { "auxiliary_loss_clip": 0.01179681, "auxiliary_loss_mlp": 0.00778736, "balance_loss_clip": 1.06100595, "balance_loss_mlp": 1.00029039, "epoch": 0.12288823423315097, "flos": 21064193919360.0, "grad_norm": 1.7787705990954645, "language_loss": 0.80932999, "learning_rate": 3.910268818937608e-06, "loss": 0.82891417, "num_input_tokens_seen": 21616925, "step": 1022, "time_per_iteration": 2.685170888900757 }, { "auxiliary_loss_clip": 0.01153307, "auxiliary_loss_mlp": 0.01068849, "balance_loss_clip": 1.05181146, "balance_loss_mlp": 1.04311168, "epoch": 0.12300847712379005, "flos": 12312441077760.0, "grad_norm": 2.5585340312834592, "language_loss": 0.87364137, "learning_rate": 3.9100379639253196e-06, "loss": 0.89586294, "num_input_tokens_seen": 21633645, "step": 1023, "time_per_iteration": 2.667616844177246 }, { "auxiliary_loss_clip": 0.0118044, "auxiliary_loss_mlp": 0.0106799, "balance_loss_clip": 1.05281246, "balance_loss_mlp": 1.04165626, "epoch": 0.12312872001442915, "flos": 16762239688320.0, "grad_norm": 3.056063514311825, "language_loss": 0.86275691, "learning_rate": 3.909806819161791e-06, "loss": 0.88524121, "num_input_tokens_seen": 21649120, "step": 1024, "time_per_iteration": 2.638084888458252 }, { "auxiliary_loss_clip": 0.01181385, "auxiliary_loss_mlp": 0.01065923, "balance_loss_clip": 1.05709589, "balance_loss_mlp": 1.03895831, "epoch": 0.12324896290506823, "flos": 18404937400320.0, "grad_norm": 1.958173268416401, "language_loss": 0.86327291, "learning_rate": 3.909575384682086e-06, "loss": 0.885746, "num_input_tokens_seen": 21668000, "step": 1025, "time_per_iteration": 2.657597780227661 }, { "auxiliary_loss_clip": 0.01199664, "auxiliary_loss_mlp": 0.01066765, "balance_loss_clip": 1.05734515, "balance_loss_mlp": 1.04235077, "epoch": 0.12336920579570733, "flos": 18915407533440.0, "grad_norm": 1.7893236061926037, "language_loss": 0.69297218, "learning_rate": 3.9093436605213144e-06, "loss": 0.71563643, "num_input_tokens_seen": 21688500, "step": 1026, "time_per_iteration": 2.643791913986206 }, { "auxiliary_loss_clip": 0.01188425, "auxiliary_loss_mlp": 0.01068523, "balance_loss_clip": 1.05700803, "balance_loss_mlp": 1.04029441, "epoch": 0.12348944868634643, "flos": 23878369797120.0, "grad_norm": 2.2854914575284218, "language_loss": 0.79943252, "learning_rate": 3.909111646714627e-06, "loss": 0.82200199, "num_input_tokens_seen": 21709345, "step": 1027, "time_per_iteration": 2.6578478813171387 }, { "auxiliary_loss_clip": 0.01207917, "auxiliary_loss_mlp": 0.01069928, "balance_loss_clip": 1.05832577, "balance_loss_mlp": 1.04451239, "epoch": 0.12360969157698551, "flos": 19026084314880.0, "grad_norm": 2.027313663700958, "language_loss": 0.72316885, "learning_rate": 3.9088793432972206e-06, "loss": 0.7459473, "num_input_tokens_seen": 21728165, "step": 1028, "time_per_iteration": 2.5795111656188965 }, { "auxiliary_loss_clip": 0.01161529, "auxiliary_loss_mlp": 0.01073604, "balance_loss_clip": 1.05549574, "balance_loss_mlp": 1.04640007, "epoch": 0.1237299344676246, "flos": 13224607983360.0, "grad_norm": 2.160507022973656, "language_loss": 0.82007617, "learning_rate": 3.908646750304336e-06, "loss": 0.84242749, "num_input_tokens_seen": 21745850, "step": 1029, "time_per_iteration": 2.655229330062866 }, { "auxiliary_loss_clip": 0.01189487, "auxiliary_loss_mlp": 0.01071724, "balance_loss_clip": 1.05865383, "balance_loss_mlp": 1.04457974, "epoch": 0.12385017735826369, "flos": 20485673470080.0, "grad_norm": 1.699099313572867, "language_loss": 0.87306672, "learning_rate": 3.908413867771257e-06, "loss": 0.89567888, "num_input_tokens_seen": 21764760, "step": 1030, "time_per_iteration": 2.663412094116211 }, { "auxiliary_loss_clip": 0.01198674, "auxiliary_loss_mlp": 0.0105776, "balance_loss_clip": 1.06231833, "balance_loss_mlp": 1.03342927, "epoch": 0.12397042024890279, "flos": 17347835116800.0, "grad_norm": 1.880145620267783, "language_loss": 0.80586463, "learning_rate": 3.908180695733311e-06, "loss": 0.82842898, "num_input_tokens_seen": 21784250, "step": 1031, "time_per_iteration": 3.847587823867798 }, { "auxiliary_loss_clip": 0.01129016, "auxiliary_loss_mlp": 0.01073688, "balance_loss_clip": 1.04619813, "balance_loss_mlp": 1.04338527, "epoch": 0.12409066313954187, "flos": 20412343854720.0, "grad_norm": 1.8189771442093892, "language_loss": 0.82839406, "learning_rate": 3.907947234225871e-06, "loss": 0.85042113, "num_input_tokens_seen": 21803260, "step": 1032, "time_per_iteration": 3.8113789558410645 }, { "auxiliary_loss_clip": 0.01133444, "auxiliary_loss_mlp": 0.01068327, "balance_loss_clip": 1.04828835, "balance_loss_mlp": 1.04310262, "epoch": 0.12421090603018096, "flos": 20736688688640.0, "grad_norm": 1.9563930851856584, "language_loss": 0.87170172, "learning_rate": 3.907713483284352e-06, "loss": 0.89371943, "num_input_tokens_seen": 21822735, "step": 1033, "time_per_iteration": 2.753763437271118 }, { "auxiliary_loss_clip": 0.01112799, "auxiliary_loss_mlp": 0.01079851, "balance_loss_clip": 1.04511404, "balance_loss_mlp": 1.05118072, "epoch": 0.12433114892082006, "flos": 24498834353280.0, "grad_norm": 2.699080136478538, "language_loss": 0.97579551, "learning_rate": 3.907479442944216e-06, "loss": 0.99772191, "num_input_tokens_seen": 21841140, "step": 1034, "time_per_iteration": 2.7906887531280518 }, { "auxiliary_loss_clip": 0.01198598, "auxiliary_loss_mlp": 0.01056647, "balance_loss_clip": 1.06244111, "balance_loss_mlp": 1.03195858, "epoch": 0.12445139181145914, "flos": 19682315838720.0, "grad_norm": 2.6035682176386676, "language_loss": 0.9231233, "learning_rate": 3.907245113240963e-06, "loss": 0.94567573, "num_input_tokens_seen": 21859260, "step": 1035, "time_per_iteration": 3.469311475753784 }, { "auxiliary_loss_clip": 0.0116913, "auxiliary_loss_mlp": 0.01063908, "balance_loss_clip": 1.05250478, "balance_loss_mlp": 1.03716874, "epoch": 0.12457163470209824, "flos": 46423087522560.0, "grad_norm": 1.8040732515917428, "language_loss": 0.73620784, "learning_rate": 3.907010494210144e-06, "loss": 0.75853825, "num_input_tokens_seen": 21881920, "step": 1036, "time_per_iteration": 2.8857266902923584 }, { "auxiliary_loss_clip": 0.01201671, "auxiliary_loss_mlp": 0.01052615, "balance_loss_clip": 1.06027436, "balance_loss_mlp": 1.02584052, "epoch": 0.12469187759273732, "flos": 20376289578240.0, "grad_norm": 1.9986623297830837, "language_loss": 0.92187369, "learning_rate": 3.9067755858873495e-06, "loss": 0.94441652, "num_input_tokens_seen": 21898720, "step": 1037, "time_per_iteration": 2.632922649383545 }, { "auxiliary_loss_clip": 0.01052414, "auxiliary_loss_mlp": 0.01018333, "balance_loss_clip": 1.02003193, "balance_loss_mlp": 1.01439917, "epoch": 0.12481212048337642, "flos": 69224641447680.0, "grad_norm": 0.8606905982057597, "language_loss": 0.62806106, "learning_rate": 3.906540388308214e-06, "loss": 0.64876854, "num_input_tokens_seen": 21958305, "step": 1038, "time_per_iteration": 3.286875009536743 }, { "auxiliary_loss_clip": 0.01142326, "auxiliary_loss_mlp": 0.01062303, "balance_loss_clip": 1.05099344, "balance_loss_mlp": 1.03703094, "epoch": 0.12493236337401552, "flos": 18223696350720.0, "grad_norm": 1.872708516295082, "language_loss": 0.81245327, "learning_rate": 3.906304901508417e-06, "loss": 0.83449954, "num_input_tokens_seen": 21977205, "step": 1039, "time_per_iteration": 2.712392568588257 }, { "auxiliary_loss_clip": 0.01196934, "auxiliary_loss_mlp": 0.01060147, "balance_loss_clip": 1.05850387, "balance_loss_mlp": 1.03466034, "epoch": 0.12505260626465461, "flos": 30044375303040.0, "grad_norm": 2.19074798238723, "language_loss": 0.75982606, "learning_rate": 3.9060691255236835e-06, "loss": 0.78239685, "num_input_tokens_seen": 21997770, "step": 1040, "time_per_iteration": 2.69500732421875 }, { "auxiliary_loss_clip": 0.01190579, "auxiliary_loss_mlp": 0.01066129, "balance_loss_clip": 1.05328274, "balance_loss_mlp": 1.03909206, "epoch": 0.1251728491552937, "flos": 24433980347520.0, "grad_norm": 6.82873299976796, "language_loss": 0.80543458, "learning_rate": 3.905833060389778e-06, "loss": 0.82800174, "num_input_tokens_seen": 22021890, "step": 1041, "time_per_iteration": 2.8052871227264404 }, { "auxiliary_loss_clip": 0.01212593, "auxiliary_loss_mlp": 0.0077976, "balance_loss_clip": 1.06126451, "balance_loss_mlp": 1.00034738, "epoch": 0.12529309204593278, "flos": 27119809952640.0, "grad_norm": 2.2406726006911373, "language_loss": 0.78045768, "learning_rate": 3.905596706142513e-06, "loss": 0.80038118, "num_input_tokens_seen": 22043300, "step": 1042, "time_per_iteration": 2.60554575920105 }, { "auxiliary_loss_clip": 0.01162905, "auxiliary_loss_mlp": 0.01059858, "balance_loss_clip": 1.05286181, "balance_loss_mlp": 1.03384614, "epoch": 0.12541333493657186, "flos": 30774151923840.0, "grad_norm": 2.108070545660993, "language_loss": 0.85587776, "learning_rate": 3.9053600628177435e-06, "loss": 0.8781054, "num_input_tokens_seen": 22062910, "step": 1043, "time_per_iteration": 2.761809825897217 }, { "auxiliary_loss_clip": 0.01205322, "auxiliary_loss_mlp": 0.01054372, "balance_loss_clip": 1.05757546, "balance_loss_mlp": 1.02969599, "epoch": 0.12553357782721097, "flos": 23659566099840.0, "grad_norm": 2.9011313073524803, "language_loss": 0.84666717, "learning_rate": 3.905123130451367e-06, "loss": 0.86926407, "num_input_tokens_seen": 22084010, "step": 1044, "time_per_iteration": 2.587061882019043 }, { "auxiliary_loss_clip": 0.01212141, "auxiliary_loss_mlp": 0.01071354, "balance_loss_clip": 1.05954409, "balance_loss_mlp": 1.04485369, "epoch": 0.12565382071785006, "flos": 24863758577280.0, "grad_norm": 1.9088728499897325, "language_loss": 0.79363799, "learning_rate": 3.904885909079326e-06, "loss": 0.81647301, "num_input_tokens_seen": 22102795, "step": 1045, "time_per_iteration": 2.603365898132324 }, { "auxiliary_loss_clip": 0.01197194, "auxiliary_loss_mlp": 0.01058901, "balance_loss_clip": 1.0543865, "balance_loss_mlp": 1.03255522, "epoch": 0.12577406360848914, "flos": 21360780518400.0, "grad_norm": 2.9445866446621207, "language_loss": 0.77944237, "learning_rate": 3.904648398737607e-06, "loss": 0.80200332, "num_input_tokens_seen": 22121360, "step": 1046, "time_per_iteration": 2.5998291969299316 }, { "auxiliary_loss_clip": 0.01207988, "auxiliary_loss_mlp": 0.01056059, "balance_loss_clip": 1.05802524, "balance_loss_mlp": 1.03061986, "epoch": 0.12589430649912825, "flos": 36138056774400.0, "grad_norm": 2.041191924283953, "language_loss": 0.78411102, "learning_rate": 3.9044105994622406e-06, "loss": 0.80675149, "num_input_tokens_seen": 22142505, "step": 1047, "time_per_iteration": 2.6785523891448975 }, { "auxiliary_loss_clip": 0.0118124, "auxiliary_loss_mlp": 0.00780491, "balance_loss_clip": 1.0522629, "balance_loss_mlp": 1.00027525, "epoch": 0.12601454938976733, "flos": 25337671643520.0, "grad_norm": 3.699692858755388, "language_loss": 0.81469035, "learning_rate": 3.9041725112893005e-06, "loss": 0.83430767, "num_input_tokens_seen": 22163730, "step": 1048, "time_per_iteration": 2.66684627532959 }, { "auxiliary_loss_clip": 0.01167873, "auxiliary_loss_mlp": 0.0106073, "balance_loss_clip": 1.05638528, "balance_loss_mlp": 1.03412223, "epoch": 0.12613479228040642, "flos": 15560094286080.0, "grad_norm": 1.6999582120803083, "language_loss": 0.75060159, "learning_rate": 3.903934134254904e-06, "loss": 0.77288759, "num_input_tokens_seen": 22181520, "step": 1049, "time_per_iteration": 2.6632583141326904 }, { "auxiliary_loss_clip": 0.01201237, "auxiliary_loss_mlp": 0.01066952, "balance_loss_clip": 1.05729389, "balance_loss_mlp": 1.03960586, "epoch": 0.1262550351710455, "flos": 21470595373440.0, "grad_norm": 2.319304481980088, "language_loss": 0.85374963, "learning_rate": 3.903695468395213e-06, "loss": 0.87643158, "num_input_tokens_seen": 22199390, "step": 1050, "time_per_iteration": 2.5833041667938232 }, { "auxiliary_loss_clip": 0.01182017, "auxiliary_loss_mlp": 0.01076379, "balance_loss_clip": 1.05084741, "balance_loss_mlp": 1.05033147, "epoch": 0.1263752780616846, "flos": 31576719456000.0, "grad_norm": 2.1199326063606283, "language_loss": 0.55563104, "learning_rate": 3.903456513746434e-06, "loss": 0.578215, "num_input_tokens_seen": 22220365, "step": 1051, "time_per_iteration": 2.689216136932373 }, { "auxiliary_loss_clip": 0.01208166, "auxiliary_loss_mlp": 0.01062743, "balance_loss_clip": 1.0596894, "balance_loss_mlp": 1.03964067, "epoch": 0.1264955209523237, "flos": 28768217927040.0, "grad_norm": 1.9053054712966029, "language_loss": 0.87646067, "learning_rate": 3.903217270344815e-06, "loss": 0.89916974, "num_input_tokens_seen": 22240615, "step": 1052, "time_per_iteration": 2.6178855895996094 }, { "auxiliary_loss_clip": 0.0116433, "auxiliary_loss_mlp": 0.01069938, "balance_loss_clip": 1.05292821, "balance_loss_mlp": 1.04287779, "epoch": 0.12661576384296278, "flos": 29241125412480.0, "grad_norm": 1.7330490908164367, "language_loss": 0.82122159, "learning_rate": 3.902977738226648e-06, "loss": 0.84356433, "num_input_tokens_seen": 22261350, "step": 1053, "time_per_iteration": 2.7343146800994873 }, { "auxiliary_loss_clip": 0.01196742, "auxiliary_loss_mlp": 0.01064695, "balance_loss_clip": 1.05532646, "balance_loss_mlp": 1.0386951, "epoch": 0.12673600673360189, "flos": 20850346298880.0, "grad_norm": 1.8591989583377564, "language_loss": 0.91029227, "learning_rate": 3.902737917428273e-06, "loss": 0.93290657, "num_input_tokens_seen": 22279515, "step": 1054, "time_per_iteration": 2.5816357135772705 }, { "auxiliary_loss_clip": 0.0120927, "auxiliary_loss_mlp": 0.01074923, "balance_loss_clip": 1.05654645, "balance_loss_mlp": 1.04988885, "epoch": 0.12685624962424097, "flos": 25263695583360.0, "grad_norm": 1.7154905447241569, "language_loss": 0.83810723, "learning_rate": 3.902497807986068e-06, "loss": 0.86094916, "num_input_tokens_seen": 22299535, "step": 1055, "time_per_iteration": 2.6281228065490723 }, { "auxiliary_loss_clip": 0.01166879, "auxiliary_loss_mlp": 0.01061203, "balance_loss_clip": 1.04915166, "balance_loss_mlp": 1.03533459, "epoch": 0.12697649251488005, "flos": 27527109246720.0, "grad_norm": 1.8188560158700597, "language_loss": 0.83795428, "learning_rate": 3.902257409936458e-06, "loss": 0.8602351, "num_input_tokens_seen": 22320300, "step": 1056, "time_per_iteration": 2.7132787704467773 }, { "auxiliary_loss_clip": 0.01181232, "auxiliary_loss_mlp": 0.01055426, "balance_loss_clip": 1.05724001, "balance_loss_mlp": 1.03184676, "epoch": 0.12709673540551916, "flos": 21251863503360.0, "grad_norm": 3.309124239793286, "language_loss": 0.84183204, "learning_rate": 3.902016723315912e-06, "loss": 0.86419857, "num_input_tokens_seen": 22338240, "step": 1057, "time_per_iteration": 3.5392184257507324 }, { "auxiliary_loss_clip": 0.01191522, "auxiliary_loss_mlp": 0.01068696, "balance_loss_clip": 1.05460966, "balance_loss_mlp": 1.04294658, "epoch": 0.12721697829615825, "flos": 25337707557120.0, "grad_norm": 2.36016612690401, "language_loss": 0.69277883, "learning_rate": 3.901775748160941e-06, "loss": 0.71538103, "num_input_tokens_seen": 22357420, "step": 1058, "time_per_iteration": 3.5485146045684814 }, { "auxiliary_loss_clip": 0.01069107, "auxiliary_loss_mlp": 0.0101096, "balance_loss_clip": 1.02978086, "balance_loss_mlp": 1.00709748, "epoch": 0.12733722118679733, "flos": 61943287754880.0, "grad_norm": 0.7979377860886971, "language_loss": 0.60902166, "learning_rate": 3.901534484508101e-06, "loss": 0.62982225, "num_input_tokens_seen": 22420095, "step": 1059, "time_per_iteration": 4.243150949478149 }, { "auxiliary_loss_clip": 0.01168641, "auxiliary_loss_mlp": 0.01052148, "balance_loss_clip": 1.05086529, "balance_loss_mlp": 1.02737665, "epoch": 0.1274574640774364, "flos": 26976742081920.0, "grad_norm": 1.9885463937888617, "language_loss": 0.74802285, "learning_rate": 3.901292932393991e-06, "loss": 0.77023077, "num_input_tokens_seen": 22438975, "step": 1060, "time_per_iteration": 3.569756507873535 }, { "auxiliary_loss_clip": 0.01214992, "auxiliary_loss_mlp": 0.0106884, "balance_loss_clip": 1.06213176, "balance_loss_mlp": 1.04324615, "epoch": 0.12757770696807552, "flos": 22236318529920.0, "grad_norm": 2.6140348688319492, "language_loss": 0.85546309, "learning_rate": 3.9010510918552555e-06, "loss": 0.87830138, "num_input_tokens_seen": 22458050, "step": 1061, "time_per_iteration": 2.608165979385376 }, { "auxiliary_loss_clip": 0.01179073, "auxiliary_loss_mlp": 0.01066953, "balance_loss_clip": 1.05222619, "balance_loss_mlp": 1.03979707, "epoch": 0.1276979498587146, "flos": 28547905858560.0, "grad_norm": 3.133831467576004, "language_loss": 0.74768257, "learning_rate": 3.900808962928581e-06, "loss": 0.77014285, "num_input_tokens_seen": 22475665, "step": 1062, "time_per_iteration": 2.647315740585327 }, { "auxiliary_loss_clip": 0.01210034, "auxiliary_loss_mlp": 0.01075291, "balance_loss_clip": 1.06114244, "balance_loss_mlp": 1.04951763, "epoch": 0.1278181927493537, "flos": 17420338719360.0, "grad_norm": 2.7329305503730326, "language_loss": 0.89668262, "learning_rate": 3.900566545650698e-06, "loss": 0.91953588, "num_input_tokens_seen": 22493335, "step": 1063, "time_per_iteration": 2.56972074508667 }, { "auxiliary_loss_clip": 0.01196783, "auxiliary_loss_mlp": 0.01060623, "balance_loss_clip": 1.05932355, "balance_loss_mlp": 1.03418255, "epoch": 0.1279384356399928, "flos": 21138636856320.0, "grad_norm": 2.1647492909165877, "language_loss": 0.82086372, "learning_rate": 3.900323840058381e-06, "loss": 0.84343779, "num_input_tokens_seen": 22511045, "step": 1064, "time_per_iteration": 2.602241277694702 }, { "auxiliary_loss_clip": 0.01195605, "auxiliary_loss_mlp": 0.01067642, "balance_loss_clip": 1.05500603, "balance_loss_mlp": 1.04177403, "epoch": 0.12805867853063188, "flos": 26576733248640.0, "grad_norm": 2.0592927373217513, "language_loss": 0.81632191, "learning_rate": 3.900080846188449e-06, "loss": 0.83895433, "num_input_tokens_seen": 22529635, "step": 1065, "time_per_iteration": 2.6387388706207275 }, { "auxiliary_loss_clip": 0.01212214, "auxiliary_loss_mlp": 0.01064271, "balance_loss_clip": 1.05976963, "balance_loss_mlp": 1.03825915, "epoch": 0.12817892142127096, "flos": 16436206915200.0, "grad_norm": 1.8701545720416402, "language_loss": 0.81103361, "learning_rate": 3.8998375640777625e-06, "loss": 0.83379841, "num_input_tokens_seen": 22547505, "step": 1066, "time_per_iteration": 2.6451914310455322 }, { "auxiliary_loss_clip": 0.01064847, "auxiliary_loss_mlp": 0.01011115, "balance_loss_clip": 1.02957666, "balance_loss_mlp": 1.00668073, "epoch": 0.12829916431191005, "flos": 60757049099520.0, "grad_norm": 0.7093636148899468, "language_loss": 0.52674276, "learning_rate": 3.899593993763229e-06, "loss": 0.5475024, "num_input_tokens_seen": 22608465, "step": 1067, "time_per_iteration": 3.1338653564453125 }, { "auxiliary_loss_clip": 0.01161454, "auxiliary_loss_mlp": 0.01068187, "balance_loss_clip": 1.04891968, "balance_loss_mlp": 1.03901672, "epoch": 0.12841940720254916, "flos": 29786895636480.0, "grad_norm": 3.5688778079872456, "language_loss": 0.81223857, "learning_rate": 3.899350135281796e-06, "loss": 0.83453494, "num_input_tokens_seen": 22629465, "step": 1068, "time_per_iteration": 2.7286362648010254 }, { "auxiliary_loss_clip": 0.0116937, "auxiliary_loss_mlp": 0.01065852, "balance_loss_clip": 1.05364275, "balance_loss_mlp": 1.04124689, "epoch": 0.12853965009318824, "flos": 25951851319680.0, "grad_norm": 2.1527952222306697, "language_loss": 0.79769427, "learning_rate": 3.8991059886704585e-06, "loss": 0.82004654, "num_input_tokens_seen": 22648970, "step": 1069, "time_per_iteration": 2.67242693901062 }, { "auxiliary_loss_clip": 0.01163061, "auxiliary_loss_mlp": 0.01073895, "balance_loss_clip": 1.05587602, "balance_loss_mlp": 1.04720426, "epoch": 0.12865989298382732, "flos": 30846871008000.0, "grad_norm": 2.1984503934426254, "language_loss": 0.83141196, "learning_rate": 3.898861553966252e-06, "loss": 0.85378146, "num_input_tokens_seen": 22668620, "step": 1070, "time_per_iteration": 2.772435188293457 }, { "auxiliary_loss_clip": 0.0112366, "auxiliary_loss_mlp": 0.01070045, "balance_loss_clip": 1.04613507, "balance_loss_mlp": 1.04492795, "epoch": 0.12878013587446643, "flos": 25885776251520.0, "grad_norm": 1.6934422709675923, "language_loss": 0.88201618, "learning_rate": 3.898616831206257e-06, "loss": 0.90395325, "num_input_tokens_seen": 22689045, "step": 1071, "time_per_iteration": 2.8858578205108643 }, { "auxiliary_loss_clip": 0.01162691, "auxiliary_loss_mlp": 0.01063362, "balance_loss_clip": 1.04779911, "balance_loss_mlp": 1.0335716, "epoch": 0.12890037876510552, "flos": 23333138277120.0, "grad_norm": 1.978601182199743, "language_loss": 0.77066386, "learning_rate": 3.8983718204276e-06, "loss": 0.7929244, "num_input_tokens_seen": 22711265, "step": 1072, "time_per_iteration": 2.7580063343048096 }, { "auxiliary_loss_clip": 0.0117562, "auxiliary_loss_mlp": 0.01054581, "balance_loss_clip": 1.05296993, "balance_loss_mlp": 1.02876067, "epoch": 0.1290206216557446, "flos": 23587242065280.0, "grad_norm": 2.7388168669065722, "language_loss": 0.82674921, "learning_rate": 3.898126521667446e-06, "loss": 0.84905124, "num_input_tokens_seen": 22731420, "step": 1073, "time_per_iteration": 2.6687088012695312 }, { "auxiliary_loss_clip": 0.01198118, "auxiliary_loss_mlp": 0.01060187, "balance_loss_clip": 1.05729258, "balance_loss_mlp": 1.0335083, "epoch": 0.12914086454638368, "flos": 24170610850560.0, "grad_norm": 1.7808312871576935, "language_loss": 0.83152252, "learning_rate": 3.897880934963007e-06, "loss": 0.85410559, "num_input_tokens_seen": 22750970, "step": 1074, "time_per_iteration": 2.6466143131256104 }, { "auxiliary_loss_clip": 0.01174303, "auxiliary_loss_mlp": 0.01066709, "balance_loss_clip": 1.05163586, "balance_loss_mlp": 1.03844428, "epoch": 0.1292611074370228, "flos": 20267157081600.0, "grad_norm": 1.9969206945580527, "language_loss": 0.78438449, "learning_rate": 3.89763506035154e-06, "loss": 0.80679464, "num_input_tokens_seen": 22768820, "step": 1075, "time_per_iteration": 2.625507354736328 }, { "auxiliary_loss_clip": 0.01183916, "auxiliary_loss_mlp": 0.01067649, "balance_loss_clip": 1.05624199, "balance_loss_mlp": 1.04224575, "epoch": 0.12938135032766188, "flos": 27377684668800.0, "grad_norm": 1.7126051092741188, "language_loss": 0.81523967, "learning_rate": 3.897388897870343e-06, "loss": 0.83775532, "num_input_tokens_seen": 22789460, "step": 1076, "time_per_iteration": 2.679821491241455 }, { "auxiliary_loss_clip": 0.01191595, "auxiliary_loss_mlp": 0.01067815, "balance_loss_clip": 1.05291629, "balance_loss_mlp": 1.0415175, "epoch": 0.12950159321830096, "flos": 29277107861760.0, "grad_norm": 1.9028733380920233, "language_loss": 0.74978745, "learning_rate": 3.89714244755676e-06, "loss": 0.77238154, "num_input_tokens_seen": 22810820, "step": 1077, "time_per_iteration": 2.7201948165893555 }, { "auxiliary_loss_clip": 0.01137187, "auxiliary_loss_mlp": 0.01070278, "balance_loss_clip": 1.04734802, "balance_loss_mlp": 1.04243088, "epoch": 0.12962183610894007, "flos": 24534888629760.0, "grad_norm": 6.616585511386183, "language_loss": 0.86141831, "learning_rate": 3.896895709448175e-06, "loss": 0.88349295, "num_input_tokens_seen": 22830570, "step": 1078, "time_per_iteration": 2.731736183166504 }, { "auxiliary_loss_clip": 0.01134305, "auxiliary_loss_mlp": 0.01068635, "balance_loss_clip": 1.04760289, "balance_loss_mlp": 1.04090643, "epoch": 0.12974207899957915, "flos": 11215944552960.0, "grad_norm": 2.8816547096968343, "language_loss": 0.77003491, "learning_rate": 3.896648683582019e-06, "loss": 0.79206431, "num_input_tokens_seen": 22845905, "step": 1079, "time_per_iteration": 2.7923247814178467 }, { "auxiliary_loss_clip": 0.01158654, "auxiliary_loss_mlp": 0.01061446, "balance_loss_clip": 1.05567074, "balance_loss_mlp": 1.03613758, "epoch": 0.12986232189021824, "flos": 24717889445760.0, "grad_norm": 3.1282695638058966, "language_loss": 0.80873346, "learning_rate": 3.896401369995766e-06, "loss": 0.83093441, "num_input_tokens_seen": 22865710, "step": 1080, "time_per_iteration": 2.716341257095337 }, { "auxiliary_loss_clip": 0.01212369, "auxiliary_loss_mlp": 0.01065092, "balance_loss_clip": 1.06168175, "balance_loss_mlp": 1.04011774, "epoch": 0.12998256478085732, "flos": 23915357827200.0, "grad_norm": 2.7414469930175556, "language_loss": 0.7964595, "learning_rate": 3.896153768726932e-06, "loss": 0.81923413, "num_input_tokens_seen": 22886020, "step": 1081, "time_per_iteration": 2.6254491806030273 }, { "auxiliary_loss_clip": 0.01198574, "auxiliary_loss_mlp": 0.01065864, "balance_loss_clip": 1.06076956, "balance_loss_mlp": 1.03981686, "epoch": 0.13010280767149643, "flos": 18624207974400.0, "grad_norm": 5.378932142568289, "language_loss": 0.87735891, "learning_rate": 3.8959058798130806e-06, "loss": 0.90000331, "num_input_tokens_seen": 22903995, "step": 1082, "time_per_iteration": 2.5755693912506104 }, { "auxiliary_loss_clip": 0.01183531, "auxiliary_loss_mlp": 0.00781715, "balance_loss_clip": 1.05569923, "balance_loss_mlp": 1.00031567, "epoch": 0.1302230505621355, "flos": 22783992174720.0, "grad_norm": 1.7497450588587378, "language_loss": 0.74843955, "learning_rate": 3.895657703291814e-06, "loss": 0.76809198, "num_input_tokens_seen": 22924100, "step": 1083, "time_per_iteration": 4.61716365814209 }, { "auxiliary_loss_clip": 0.01193765, "auxiliary_loss_mlp": 0.01057861, "balance_loss_clip": 1.0562098, "balance_loss_mlp": 1.02915585, "epoch": 0.1303432934527746, "flos": 21323612920320.0, "grad_norm": 2.7629251137781705, "language_loss": 0.79366302, "learning_rate": 3.895409239200781e-06, "loss": 0.81617928, "num_input_tokens_seen": 22939985, "step": 1084, "time_per_iteration": 2.6529011726379395 }, { "auxiliary_loss_clip": 0.01192291, "auxiliary_loss_mlp": 0.01072506, "balance_loss_clip": 1.05749774, "balance_loss_mlp": 1.04309762, "epoch": 0.1304635363434137, "flos": 20922490765440.0, "grad_norm": 2.237970709302442, "language_loss": 0.91442311, "learning_rate": 3.895160487577673e-06, "loss": 0.93707114, "num_input_tokens_seen": 22957555, "step": 1085, "time_per_iteration": 3.5945193767547607 }, { "auxiliary_loss_clip": 0.01084028, "auxiliary_loss_mlp": 0.01016277, "balance_loss_clip": 1.03084075, "balance_loss_mlp": 1.01224732, "epoch": 0.1305837792340528, "flos": 63245659080960.0, "grad_norm": 0.7937814926831654, "language_loss": 0.60943866, "learning_rate": 3.894911448460226e-06, "loss": 0.63044173, "num_input_tokens_seen": 23016870, "step": 1086, "time_per_iteration": 3.0967984199523926 }, { "auxiliary_loss_clip": 0.01104615, "auxiliary_loss_mlp": 0.01063138, "balance_loss_clip": 1.04614735, "balance_loss_mlp": 1.03505278, "epoch": 0.13070402212469187, "flos": 26428852955520.0, "grad_norm": 5.174209527478737, "language_loss": 0.72601175, "learning_rate": 3.8946621218862195e-06, "loss": 0.74768925, "num_input_tokens_seen": 23037870, "step": 1087, "time_per_iteration": 3.835824489593506 }, { "auxiliary_loss_clip": 0.01162432, "auxiliary_loss_mlp": 0.01057628, "balance_loss_clip": 1.0555532, "balance_loss_mlp": 1.03254652, "epoch": 0.13082426501533098, "flos": 27673409341440.0, "grad_norm": 1.8382289263383464, "language_loss": 0.88782591, "learning_rate": 3.894412507893475e-06, "loss": 0.91002655, "num_input_tokens_seen": 23058150, "step": 1088, "time_per_iteration": 2.738330841064453 }, { "auxiliary_loss_clip": 0.01169478, "auxiliary_loss_mlp": 0.01058953, "balance_loss_clip": 1.05628181, "balance_loss_mlp": 1.03047359, "epoch": 0.13094450790597006, "flos": 24826770547200.0, "grad_norm": 2.1975489299745368, "language_loss": 0.72100359, "learning_rate": 3.894162606519859e-06, "loss": 0.74328792, "num_input_tokens_seen": 23077100, "step": 1089, "time_per_iteration": 2.7152140140533447 }, { "auxiliary_loss_clip": 0.01155982, "auxiliary_loss_mlp": 0.01068013, "balance_loss_clip": 1.05518317, "balance_loss_mlp": 1.04109585, "epoch": 0.13106475079660915, "flos": 19062605468160.0, "grad_norm": 2.20497746408112, "language_loss": 0.77159774, "learning_rate": 3.893912417803282e-06, "loss": 0.79383773, "num_input_tokens_seen": 23096815, "step": 1090, "time_per_iteration": 2.710526704788208 }, { "auxiliary_loss_clip": 0.01153817, "auxiliary_loss_mlp": 0.01062556, "balance_loss_clip": 1.05051541, "balance_loss_mlp": 1.03299236, "epoch": 0.13118499368724823, "flos": 28913189218560.0, "grad_norm": 2.750293334528363, "language_loss": 0.76958811, "learning_rate": 3.8936619417816975e-06, "loss": 0.79175186, "num_input_tokens_seen": 23117145, "step": 1091, "time_per_iteration": 2.760956048965454 }, { "auxiliary_loss_clip": 0.01174007, "auxiliary_loss_mlp": 0.01067272, "balance_loss_clip": 1.05999041, "balance_loss_mlp": 1.04095078, "epoch": 0.13130523657788734, "flos": 14283398206080.0, "grad_norm": 2.0181139038737896, "language_loss": 0.71651292, "learning_rate": 3.8934111784931015e-06, "loss": 0.7389257, "num_input_tokens_seen": 23134595, "step": 1092, "time_per_iteration": 2.677518606185913 }, { "auxiliary_loss_clip": 0.01069803, "auxiliary_loss_mlp": 0.01012054, "balance_loss_clip": 1.02709222, "balance_loss_mlp": 1.00859737, "epoch": 0.13142547946852642, "flos": 70174155519360.0, "grad_norm": 0.9074441056120945, "language_loss": 0.59095597, "learning_rate": 3.893160127975535e-06, "loss": 0.61177456, "num_input_tokens_seen": 23195285, "step": 1093, "time_per_iteration": 3.345648765563965 }, { "auxiliary_loss_clip": 0.01156486, "auxiliary_loss_mlp": 0.01065512, "balance_loss_clip": 1.05057633, "balance_loss_mlp": 1.04171824, "epoch": 0.1315457223591655, "flos": 45805998844800.0, "grad_norm": 2.7741715561618165, "language_loss": 0.81355995, "learning_rate": 3.8929087902670826e-06, "loss": 0.83577991, "num_input_tokens_seen": 23216915, "step": 1094, "time_per_iteration": 2.940073013305664 }, { "auxiliary_loss_clip": 0.01077468, "auxiliary_loss_mlp": 0.01011591, "balance_loss_clip": 1.02430475, "balance_loss_mlp": 1.00787187, "epoch": 0.13166596524980462, "flos": 62881165820160.0, "grad_norm": 0.9992681237899037, "language_loss": 0.60702866, "learning_rate": 3.8926571654058715e-06, "loss": 0.62791926, "num_input_tokens_seen": 23273560, "step": 1095, "time_per_iteration": 3.100740909576416 }, { "auxiliary_loss_clip": 0.01170853, "auxiliary_loss_mlp": 0.01069801, "balance_loss_clip": 1.05649114, "balance_loss_mlp": 1.04100013, "epoch": 0.1317862081404437, "flos": 23586523793280.0, "grad_norm": 2.7979059701030478, "language_loss": 0.769508, "learning_rate": 3.892405253430074e-06, "loss": 0.79191452, "num_input_tokens_seen": 23291080, "step": 1096, "time_per_iteration": 2.8155364990234375 }, { "auxiliary_loss_clip": 0.01185827, "auxiliary_loss_mlp": 0.00780931, "balance_loss_clip": 1.05646372, "balance_loss_mlp": 1.00022221, "epoch": 0.13190645103108278, "flos": 20260764460800.0, "grad_norm": 4.511386887998889, "language_loss": 0.82278323, "learning_rate": 3.892153054377904e-06, "loss": 0.84245086, "num_input_tokens_seen": 23308485, "step": 1097, "time_per_iteration": 2.6673145294189453 }, { "auxiliary_loss_clip": 0.01025642, "auxiliary_loss_mlp": 0.01004403, "balance_loss_clip": 1.02445769, "balance_loss_mlp": 1.00025415, "epoch": 0.13202669392172187, "flos": 53455440136320.0, "grad_norm": 0.9399263488233189, "language_loss": 0.59466374, "learning_rate": 3.891900568287619e-06, "loss": 0.61496419, "num_input_tokens_seen": 23360870, "step": 1098, "time_per_iteration": 3.221008777618408 }, { "auxiliary_loss_clip": 0.01175848, "auxiliary_loss_mlp": 0.01063287, "balance_loss_clip": 1.0560689, "balance_loss_mlp": 1.03384185, "epoch": 0.13214693681236098, "flos": 15851293845120.0, "grad_norm": 2.9873815177675365, "language_loss": 0.719598, "learning_rate": 3.891647795197523e-06, "loss": 0.74198937, "num_input_tokens_seen": 23376910, "step": 1099, "time_per_iteration": 2.6680519580841064 }, { "auxiliary_loss_clip": 0.01179081, "auxiliary_loss_mlp": 0.01058997, "balance_loss_clip": 1.05263686, "balance_loss_mlp": 1.0296241, "epoch": 0.13226717970300006, "flos": 19353840940800.0, "grad_norm": 2.1508380089435803, "language_loss": 0.68591249, "learning_rate": 3.8913947351459605e-06, "loss": 0.70829326, "num_input_tokens_seen": 23394450, "step": 1100, "time_per_iteration": 2.721877336502075 }, { "auxiliary_loss_clip": 0.01209422, "auxiliary_loss_mlp": 0.0106801, "balance_loss_clip": 1.06115866, "balance_loss_mlp": 1.04241562, "epoch": 0.13238742259363914, "flos": 20698084546560.0, "grad_norm": 1.8830202024954807, "language_loss": 0.67606848, "learning_rate": 3.89114138817132e-06, "loss": 0.69884276, "num_input_tokens_seen": 23411115, "step": 1101, "time_per_iteration": 2.5648889541625977 }, { "auxiliary_loss_clip": 0.01191654, "auxiliary_loss_mlp": 0.01058571, "balance_loss_clip": 1.05647373, "balance_loss_mlp": 1.03377533, "epoch": 0.13250766548427825, "flos": 21032449274880.0, "grad_norm": 1.9440334919199775, "language_loss": 0.8425805, "learning_rate": 3.890887754312035e-06, "loss": 0.86508274, "num_input_tokens_seen": 23429360, "step": 1102, "time_per_iteration": 2.635068416595459 }, { "auxiliary_loss_clip": 0.01176967, "auxiliary_loss_mlp": 0.0106418, "balance_loss_clip": 1.05495214, "balance_loss_mlp": 1.03726244, "epoch": 0.13262790837491734, "flos": 22637871648000.0, "grad_norm": 2.4882431293464253, "language_loss": 0.87598693, "learning_rate": 3.890633833606581e-06, "loss": 0.8983984, "num_input_tokens_seen": 23449050, "step": 1103, "time_per_iteration": 2.6298279762268066 }, { "auxiliary_loss_clip": 0.01194752, "auxiliary_loss_mlp": 0.01068917, "balance_loss_clip": 1.05883133, "balance_loss_mlp": 1.04226184, "epoch": 0.13274815126555642, "flos": 19683141851520.0, "grad_norm": 1.8703373991560008, "language_loss": 0.69861436, "learning_rate": 3.890379626093477e-06, "loss": 0.72125101, "num_input_tokens_seen": 23468800, "step": 1104, "time_per_iteration": 2.609865665435791 }, { "auxiliary_loss_clip": 0.01138641, "auxiliary_loss_mlp": 0.0106092, "balance_loss_clip": 1.04995966, "balance_loss_mlp": 1.03216696, "epoch": 0.1328683941561955, "flos": 21317687176320.0, "grad_norm": 5.497423715498681, "language_loss": 0.92011154, "learning_rate": 3.890125131811287e-06, "loss": 0.94210714, "num_input_tokens_seen": 23486850, "step": 1105, "time_per_iteration": 2.6438934803009033 }, { "auxiliary_loss_clip": 0.01167902, "auxiliary_loss_mlp": 0.01057216, "balance_loss_clip": 1.05114233, "balance_loss_mlp": 1.02986908, "epoch": 0.1329886370468346, "flos": 13699131580800.0, "grad_norm": 2.4652064065906347, "language_loss": 0.75351375, "learning_rate": 3.889870350798618e-06, "loss": 0.77576494, "num_input_tokens_seen": 23504195, "step": 1106, "time_per_iteration": 2.552549123764038 }, { "auxiliary_loss_clip": 0.01210588, "auxiliary_loss_mlp": 0.01058598, "balance_loss_clip": 1.06031966, "balance_loss_mlp": 1.03086948, "epoch": 0.1331088799374737, "flos": 21032413361280.0, "grad_norm": 1.6082720098178738, "language_loss": 0.78494269, "learning_rate": 3.889615283094119e-06, "loss": 0.80763453, "num_input_tokens_seen": 23523385, "step": 1107, "time_per_iteration": 2.5066378116607666 }, { "auxiliary_loss_clip": 0.0121463, "auxiliary_loss_mlp": 0.01048939, "balance_loss_clip": 1.05904877, "balance_loss_mlp": 1.02293992, "epoch": 0.13322912282811278, "flos": 18260432985600.0, "grad_norm": 2.1420770105481415, "language_loss": 0.84479237, "learning_rate": 3.889359928736485e-06, "loss": 0.86742806, "num_input_tokens_seen": 23541330, "step": 1108, "time_per_iteration": 3.523590087890625 }, { "auxiliary_loss_clip": 0.01173743, "auxiliary_loss_mlp": 0.00779723, "balance_loss_clip": 1.056723, "balance_loss_mlp": 1.00023937, "epoch": 0.1333493657187519, "flos": 24460876656000.0, "grad_norm": 3.500542326412218, "language_loss": 0.91294068, "learning_rate": 3.889104287764451e-06, "loss": 0.93247533, "num_input_tokens_seen": 23561705, "step": 1109, "time_per_iteration": 2.691878080368042 }, { "auxiliary_loss_clip": 0.01183943, "auxiliary_loss_mlp": 0.01060522, "balance_loss_clip": 1.0594852, "balance_loss_mlp": 1.03465366, "epoch": 0.13346960860939097, "flos": 22158930677760.0, "grad_norm": 2.3778768613951784, "language_loss": 0.90345836, "learning_rate": 3.888848360216798e-06, "loss": 0.92590296, "num_input_tokens_seen": 23579350, "step": 1110, "time_per_iteration": 3.58111310005188 }, { "auxiliary_loss_clip": 0.01072039, "auxiliary_loss_mlp": 0.01017138, "balance_loss_clip": 1.02721524, "balance_loss_mlp": 1.0139432, "epoch": 0.13358985150003005, "flos": 67931212608000.0, "grad_norm": 0.807175516516254, "language_loss": 0.56649077, "learning_rate": 3.888592146132351e-06, "loss": 0.58738244, "num_input_tokens_seen": 23640620, "step": 1111, "time_per_iteration": 4.350676774978638 }, { "auxiliary_loss_clip": 0.01193258, "auxiliary_loss_mlp": 0.0106713, "balance_loss_clip": 1.05816078, "balance_loss_mlp": 1.04150033, "epoch": 0.13371009439066917, "flos": 26834284742400.0, "grad_norm": 1.8427464684435884, "language_loss": 0.78383124, "learning_rate": 3.888335645549978e-06, "loss": 0.80643505, "num_input_tokens_seen": 23661040, "step": 1112, "time_per_iteration": 3.567841053009033 }, { "auxiliary_loss_clip": 0.01207547, "auxiliary_loss_mlp": 0.01058154, "balance_loss_clip": 1.05846262, "balance_loss_mlp": 1.03265488, "epoch": 0.13383033728130825, "flos": 26322844942080.0, "grad_norm": 2.6447712333074263, "language_loss": 0.81266749, "learning_rate": 3.888078858508588e-06, "loss": 0.83532447, "num_input_tokens_seen": 23680900, "step": 1113, "time_per_iteration": 2.62693190574646 }, { "auxiliary_loss_clip": 0.01178323, "auxiliary_loss_mlp": 0.01058894, "balance_loss_clip": 1.0565176, "balance_loss_mlp": 1.03232229, "epoch": 0.13395058017194733, "flos": 22563931501440.0, "grad_norm": 1.9288981798080787, "language_loss": 0.8442502, "learning_rate": 3.8878217850471365e-06, "loss": 0.86662233, "num_input_tokens_seen": 23700815, "step": 1114, "time_per_iteration": 2.68259596824646 }, { "auxiliary_loss_clip": 0.01217129, "auxiliary_loss_mlp": 0.01060246, "balance_loss_clip": 1.06354499, "balance_loss_mlp": 1.03429461, "epoch": 0.13407082306258641, "flos": 25810938264960.0, "grad_norm": 1.858541152428869, "language_loss": 0.74002647, "learning_rate": 3.887564425204621e-06, "loss": 0.76280022, "num_input_tokens_seen": 23722500, "step": 1115, "time_per_iteration": 2.6816279888153076 }, { "auxiliary_loss_clip": 0.01052518, "auxiliary_loss_mlp": 0.01009797, "balance_loss_clip": 1.02381468, "balance_loss_mlp": 1.00631618, "epoch": 0.13419106595322552, "flos": 68338365269760.0, "grad_norm": 0.8371187432765518, "language_loss": 0.54622573, "learning_rate": 3.887306779020083e-06, "loss": 0.56684887, "num_input_tokens_seen": 23777155, "step": 1116, "time_per_iteration": 3.3908329010009766 }, { "auxiliary_loss_clip": 0.0120023, "auxiliary_loss_mlp": 0.01057342, "balance_loss_clip": 1.0601902, "balance_loss_mlp": 1.03018594, "epoch": 0.1343113088438646, "flos": 20449080489600.0, "grad_norm": 2.196738531597585, "language_loss": 0.70446295, "learning_rate": 3.887048846532608e-06, "loss": 0.72703862, "num_input_tokens_seen": 23794130, "step": 1117, "time_per_iteration": 2.694723129272461 }, { "auxiliary_loss_clip": 0.01053984, "auxiliary_loss_mlp": 0.01010585, "balance_loss_clip": 1.02334726, "balance_loss_mlp": 1.00677073, "epoch": 0.1344315517345037, "flos": 67389784951680.0, "grad_norm": 0.7678527467200741, "language_loss": 0.58156747, "learning_rate": 3.8867906277813224e-06, "loss": 0.60221314, "num_input_tokens_seen": 23852285, "step": 1118, "time_per_iteration": 3.1314496994018555 }, { "auxiliary_loss_clip": 0.01197601, "auxiliary_loss_mlp": 0.00780102, "balance_loss_clip": 1.05674171, "balance_loss_mlp": 1.0003221, "epoch": 0.1345517946251428, "flos": 40734442788480.0, "grad_norm": 2.342520133294296, "language_loss": 0.73611009, "learning_rate": 3.886532122805399e-06, "loss": 0.75588715, "num_input_tokens_seen": 23874765, "step": 1119, "time_per_iteration": 2.779252052307129 }, { "auxiliary_loss_clip": 0.01123533, "auxiliary_loss_mlp": 0.01066358, "balance_loss_clip": 1.04587293, "balance_loss_mlp": 1.03855884, "epoch": 0.13467203751578188, "flos": 22816850140800.0, "grad_norm": 1.7839530595925794, "language_loss": 0.89961451, "learning_rate": 3.886273331644053e-06, "loss": 0.92151338, "num_input_tokens_seen": 23893635, "step": 1120, "time_per_iteration": 2.769963264465332 }, { "auxiliary_loss_clip": 0.01150424, "auxiliary_loss_mlp": 0.0106471, "balance_loss_clip": 1.05165613, "balance_loss_mlp": 1.03842402, "epoch": 0.13479228040642097, "flos": 17091576512640.0, "grad_norm": 2.110559182075929, "language_loss": 0.82331806, "learning_rate": 3.886014254336542e-06, "loss": 0.84546942, "num_input_tokens_seen": 23910110, "step": 1121, "time_per_iteration": 2.70742130279541 }, { "auxiliary_loss_clip": 0.01187155, "auxiliary_loss_mlp": 0.01066897, "balance_loss_clip": 1.05454957, "balance_loss_mlp": 1.0426259, "epoch": 0.13491252329706005, "flos": 23730525417600.0, "grad_norm": 1.824985859594642, "language_loss": 0.92570043, "learning_rate": 3.885754890922168e-06, "loss": 0.948241, "num_input_tokens_seen": 23930440, "step": 1122, "time_per_iteration": 2.646737575531006 }, { "auxiliary_loss_clip": 0.01119128, "auxiliary_loss_mlp": 0.01061875, "balance_loss_clip": 1.05071974, "balance_loss_mlp": 1.03399193, "epoch": 0.13503276618769916, "flos": 34127058960000.0, "grad_norm": 1.7877662349909504, "language_loss": 0.78353298, "learning_rate": 3.885495241440277e-06, "loss": 0.80534303, "num_input_tokens_seen": 23954535, "step": 1123, "time_per_iteration": 2.8389132022857666 }, { "auxiliary_loss_clip": 0.01207005, "auxiliary_loss_mlp": 0.01058004, "balance_loss_clip": 1.05740762, "balance_loss_mlp": 1.03353095, "epoch": 0.13515300907833824, "flos": 17712328377600.0, "grad_norm": 4.511858819424904, "language_loss": 0.74152148, "learning_rate": 3.885235305930257e-06, "loss": 0.7641716, "num_input_tokens_seen": 23972735, "step": 1124, "time_per_iteration": 2.562513828277588 }, { "auxiliary_loss_clip": 0.01157722, "auxiliary_loss_mlp": 0.0107307, "balance_loss_clip": 1.05381393, "balance_loss_mlp": 1.0462358, "epoch": 0.13527325196897733, "flos": 20260872201600.0, "grad_norm": 1.9539652280770425, "language_loss": 0.85583615, "learning_rate": 3.884975084431539e-06, "loss": 0.87814409, "num_input_tokens_seen": 23987685, "step": 1125, "time_per_iteration": 2.673269510269165 }, { "auxiliary_loss_clip": 0.0118564, "auxiliary_loss_mlp": 0.00782029, "balance_loss_clip": 1.05734587, "balance_loss_mlp": 1.00042033, "epoch": 0.13539349485961644, "flos": 18186492839040.0, "grad_norm": 2.3427579297664143, "language_loss": 0.91577351, "learning_rate": 3.8847145769836e-06, "loss": 0.93545026, "num_input_tokens_seen": 24004105, "step": 1126, "time_per_iteration": 2.605541944503784 }, { "auxiliary_loss_clip": 0.01209983, "auxiliary_loss_mlp": 0.01065989, "balance_loss_clip": 1.05992055, "balance_loss_mlp": 1.03961968, "epoch": 0.13551373775025552, "flos": 19317463441920.0, "grad_norm": 3.850894899809773, "language_loss": 0.65894562, "learning_rate": 3.884453783625959e-06, "loss": 0.68170536, "num_input_tokens_seen": 24021715, "step": 1127, "time_per_iteration": 2.5974950790405273 }, { "auxiliary_loss_clip": 0.01178892, "auxiliary_loss_mlp": 0.01067304, "balance_loss_clip": 1.05518007, "balance_loss_mlp": 1.04073215, "epoch": 0.1356339806408946, "flos": 20850813175680.0, "grad_norm": 2.3346726871568957, "language_loss": 0.8512311, "learning_rate": 3.884192704398176e-06, "loss": 0.87369299, "num_input_tokens_seen": 24038915, "step": 1128, "time_per_iteration": 2.653221845626831 }, { "auxiliary_loss_clip": 0.01191548, "auxiliary_loss_mlp": 0.01058222, "balance_loss_clip": 1.05518913, "balance_loss_mlp": 1.0321629, "epoch": 0.13575422353153369, "flos": 50476037696640.0, "grad_norm": 1.7999775473292112, "language_loss": 0.74712729, "learning_rate": 3.883931339339858e-06, "loss": 0.76962501, "num_input_tokens_seen": 24063300, "step": 1129, "time_per_iteration": 2.8512325286865234 }, { "auxiliary_loss_clip": 0.01203625, "auxiliary_loss_mlp": 0.01064491, "balance_loss_clip": 1.06062317, "balance_loss_mlp": 1.03411698, "epoch": 0.1358744664221728, "flos": 18150797698560.0, "grad_norm": 1.8389269882951929, "language_loss": 0.78726912, "learning_rate": 3.883669688490654e-06, "loss": 0.80995029, "num_input_tokens_seen": 24081070, "step": 1130, "time_per_iteration": 2.5789260864257812 }, { "auxiliary_loss_clip": 0.0116674, "auxiliary_loss_mlp": 0.00780059, "balance_loss_clip": 1.0526979, "balance_loss_mlp": 1.00043607, "epoch": 0.13599470931281188, "flos": 18442966924800.0, "grad_norm": 1.8859793427246874, "language_loss": 0.85600251, "learning_rate": 3.883407751890256e-06, "loss": 0.87547052, "num_input_tokens_seen": 24099675, "step": 1131, "time_per_iteration": 2.641022205352783 }, { "auxiliary_loss_clip": 0.01158789, "auxiliary_loss_mlp": 0.01058828, "balance_loss_clip": 1.05030537, "balance_loss_mlp": 1.03079021, "epoch": 0.13611495220345096, "flos": 26680766014080.0, "grad_norm": 1.6696415815351495, "language_loss": 0.86112624, "learning_rate": 3.8831455295783994e-06, "loss": 0.88330245, "num_input_tokens_seen": 24118925, "step": 1132, "time_per_iteration": 2.7592406272888184 }, { "auxiliary_loss_clip": 0.01172175, "auxiliary_loss_mlp": 0.01069015, "balance_loss_clip": 1.05324268, "balance_loss_mlp": 1.04086995, "epoch": 0.13623519509409007, "flos": 21686238673920.0, "grad_norm": 2.07378943003738, "language_loss": 0.74260694, "learning_rate": 3.882883021594864e-06, "loss": 0.76501882, "num_input_tokens_seen": 24137065, "step": 1133, "time_per_iteration": 2.679651975631714 }, { "auxiliary_loss_clip": 0.0115635, "auxiliary_loss_mlp": 0.01065824, "balance_loss_clip": 1.05367064, "balance_loss_mlp": 1.03910899, "epoch": 0.13635543798472916, "flos": 14830389492480.0, "grad_norm": 3.067695963196646, "language_loss": 0.86873263, "learning_rate": 3.8826202279794705e-06, "loss": 0.89095438, "num_input_tokens_seen": 24154125, "step": 1134, "time_per_iteration": 3.5913538932800293 }, { "auxiliary_loss_clip": 0.01211083, "auxiliary_loss_mlp": 0.01059539, "balance_loss_clip": 1.06353676, "balance_loss_mlp": 1.03463602, "epoch": 0.13647568087536824, "flos": 22890323410560.0, "grad_norm": 2.0947696047378694, "language_loss": 0.70121944, "learning_rate": 3.882357148772085e-06, "loss": 0.72392571, "num_input_tokens_seen": 24171550, "step": 1135, "time_per_iteration": 2.590254783630371 }, { "auxiliary_loss_clip": 0.01151091, "auxiliary_loss_mlp": 0.01067508, "balance_loss_clip": 1.05301607, "balance_loss_mlp": 1.04122281, "epoch": 0.13659592376600732, "flos": 19937927998080.0, "grad_norm": 7.71229213291166, "language_loss": 0.84253806, "learning_rate": 3.882093784012617e-06, "loss": 0.86472404, "num_input_tokens_seen": 24190190, "step": 1136, "time_per_iteration": 3.6506259441375732 }, { "auxiliary_loss_clip": 0.01175284, "auxiliary_loss_mlp": 0.01072296, "balance_loss_clip": 1.05635655, "balance_loss_mlp": 1.04548573, "epoch": 0.13671616665664643, "flos": 21428579439360.0, "grad_norm": 1.9646740437495036, "language_loss": 0.83940136, "learning_rate": 3.881830133741019e-06, "loss": 0.8618772, "num_input_tokens_seen": 24209055, "step": 1137, "time_per_iteration": 3.6871933937072754 }, { "auxiliary_loss_clip": 0.01163057, "auxiliary_loss_mlp": 0.01059998, "balance_loss_clip": 1.05750108, "balance_loss_mlp": 1.03490448, "epoch": 0.13683640954728551, "flos": 22778138257920.0, "grad_norm": 2.4373394267729034, "language_loss": 0.76309156, "learning_rate": 3.881566197997285e-06, "loss": 0.78532207, "num_input_tokens_seen": 24225490, "step": 1138, "time_per_iteration": 3.608381748199463 }, { "auxiliary_loss_clip": 0.01171411, "auxiliary_loss_mlp": 0.01057423, "balance_loss_clip": 1.05495858, "balance_loss_mlp": 1.03340268, "epoch": 0.1369566524379246, "flos": 21725884310400.0, "grad_norm": 1.7679120449620414, "language_loss": 0.74724501, "learning_rate": 3.881301976821456e-06, "loss": 0.76953334, "num_input_tokens_seen": 24245520, "step": 1139, "time_per_iteration": 2.688311815261841 }, { "auxiliary_loss_clip": 0.01187303, "auxiliary_loss_mlp": 0.01061889, "balance_loss_clip": 1.05585206, "balance_loss_mlp": 1.03677201, "epoch": 0.1370768953285637, "flos": 18624459369600.0, "grad_norm": 1.8482034163781433, "language_loss": 0.9094224, "learning_rate": 3.881037470253612e-06, "loss": 0.93191427, "num_input_tokens_seen": 24265035, "step": 1140, "time_per_iteration": 2.59378981590271 }, { "auxiliary_loss_clip": 0.01147357, "auxiliary_loss_mlp": 0.01061184, "balance_loss_clip": 1.04913604, "balance_loss_mlp": 1.03663921, "epoch": 0.1371971382192028, "flos": 14939521989120.0, "grad_norm": 2.8337391128020886, "language_loss": 0.7970072, "learning_rate": 3.88077267833388e-06, "loss": 0.81909257, "num_input_tokens_seen": 24281550, "step": 1141, "time_per_iteration": 2.6935527324676514 }, { "auxiliary_loss_clip": 0.01137556, "auxiliary_loss_mlp": 0.01067698, "balance_loss_clip": 1.04734969, "balance_loss_mlp": 1.04120946, "epoch": 0.13731738110984187, "flos": 19023785844480.0, "grad_norm": 2.4113855072753014, "language_loss": 0.8360616, "learning_rate": 3.880507601102427e-06, "loss": 0.85811406, "num_input_tokens_seen": 24299485, "step": 1142, "time_per_iteration": 2.791571617126465 }, { "auxiliary_loss_clip": 0.01202351, "auxiliary_loss_mlp": 0.01070767, "balance_loss_clip": 1.05904913, "balance_loss_mlp": 1.04454064, "epoch": 0.13743762400048098, "flos": 18187462506240.0, "grad_norm": 1.96408634551571, "language_loss": 0.82169676, "learning_rate": 3.880242238599467e-06, "loss": 0.84442794, "num_input_tokens_seen": 24316010, "step": 1143, "time_per_iteration": 2.526625871658325 }, { "auxiliary_loss_clip": 0.01197643, "auxiliary_loss_mlp": 0.01066031, "balance_loss_clip": 1.05689549, "balance_loss_mlp": 1.04136634, "epoch": 0.13755786689112007, "flos": 21031982398080.0, "grad_norm": 1.7135290681279134, "language_loss": 0.83360279, "learning_rate": 3.879976590865254e-06, "loss": 0.85623956, "num_input_tokens_seen": 24335465, "step": 1144, "time_per_iteration": 2.615137815475464 }, { "auxiliary_loss_clip": 0.01175989, "auxiliary_loss_mlp": 0.01061678, "balance_loss_clip": 1.05757546, "balance_loss_mlp": 1.0373714, "epoch": 0.13767810978175915, "flos": 21360636864000.0, "grad_norm": 2.258365863932846, "language_loss": 0.87461531, "learning_rate": 3.879710657940087e-06, "loss": 0.89699197, "num_input_tokens_seen": 24354415, "step": 1145, "time_per_iteration": 2.651132345199585 }, { "auxiliary_loss_clip": 0.01194367, "auxiliary_loss_mlp": 0.01068506, "balance_loss_clip": 1.05898547, "balance_loss_mlp": 1.04202986, "epoch": 0.13779835267239823, "flos": 30592084861440.0, "grad_norm": 1.9726060606905238, "language_loss": 0.70061731, "learning_rate": 3.879444439864308e-06, "loss": 0.72324598, "num_input_tokens_seen": 24373990, "step": 1146, "time_per_iteration": 2.661550283432007 }, { "auxiliary_loss_clip": 0.01185443, "auxiliary_loss_mlp": 0.00779397, "balance_loss_clip": 1.05302525, "balance_loss_mlp": 1.00045443, "epoch": 0.13791859556303734, "flos": 22669867687680.0, "grad_norm": 1.7560599175783822, "language_loss": 0.85898602, "learning_rate": 3.879177936678301e-06, "loss": 0.87863445, "num_input_tokens_seen": 24392995, "step": 1147, "time_per_iteration": 2.6380605697631836 }, { "auxiliary_loss_clip": 0.01179038, "auxiliary_loss_mlp": 0.01069412, "balance_loss_clip": 1.05310893, "balance_loss_mlp": 1.04325795, "epoch": 0.13803883845367643, "flos": 35224166016000.0, "grad_norm": 2.1010385101671702, "language_loss": 0.77298564, "learning_rate": 3.878911148422496e-06, "loss": 0.79547012, "num_input_tokens_seen": 24414470, "step": 1148, "time_per_iteration": 2.7691521644592285 }, { "auxiliary_loss_clip": 0.01188856, "auxiliary_loss_mlp": 0.01065865, "balance_loss_clip": 1.05419314, "balance_loss_mlp": 1.03889942, "epoch": 0.1381590813443155, "flos": 32014542332160.0, "grad_norm": 2.5211000910586647, "language_loss": 0.70231473, "learning_rate": 3.878644075137364e-06, "loss": 0.72486192, "num_input_tokens_seen": 24435120, "step": 1149, "time_per_iteration": 2.6904456615448 }, { "auxiliary_loss_clip": 0.01136579, "auxiliary_loss_mlp": 0.01071421, "balance_loss_clip": 1.04763341, "balance_loss_mlp": 1.04412174, "epoch": 0.13827932423495462, "flos": 17821855923840.0, "grad_norm": 1.9528484407444393, "language_loss": 0.7926333, "learning_rate": 3.878376716863418e-06, "loss": 0.81471324, "num_input_tokens_seen": 24451420, "step": 1150, "time_per_iteration": 2.6912851333618164 }, { "auxiliary_loss_clip": 0.0117226, "auxiliary_loss_mlp": 0.0106138, "balance_loss_clip": 1.05219305, "balance_loss_mlp": 1.03671539, "epoch": 0.1383995671255937, "flos": 19427098728960.0, "grad_norm": 1.9737613029688779, "language_loss": 0.71800733, "learning_rate": 3.878109073641219e-06, "loss": 0.74034375, "num_input_tokens_seen": 24470450, "step": 1151, "time_per_iteration": 2.6292128562927246 }, { "auxiliary_loss_clip": 0.01141771, "auxiliary_loss_mlp": 0.01078887, "balance_loss_clip": 1.04895687, "balance_loss_mlp": 1.05331624, "epoch": 0.13851981001623279, "flos": 28296603331200.0, "grad_norm": 2.012375581168715, "language_loss": 0.8140856, "learning_rate": 3.877841145511366e-06, "loss": 0.83629215, "num_input_tokens_seen": 24493190, "step": 1152, "time_per_iteration": 2.8013267517089844 }, { "auxiliary_loss_clip": 0.01188423, "auxiliary_loss_mlp": 0.01070094, "balance_loss_clip": 1.05416203, "balance_loss_mlp": 1.04494071, "epoch": 0.13864005290687187, "flos": 21213079793280.0, "grad_norm": 1.9961805976837443, "language_loss": 0.82540452, "learning_rate": 3.8775729325145035e-06, "loss": 0.84798974, "num_input_tokens_seen": 24512425, "step": 1153, "time_per_iteration": 2.658003807067871 }, { "auxiliary_loss_clip": 0.0103984, "auxiliary_loss_mlp": 0.01008287, "balance_loss_clip": 1.02087998, "balance_loss_mlp": 1.00425756, "epoch": 0.13876029579751098, "flos": 71653389413760.0, "grad_norm": 0.7912486770422896, "language_loss": 0.64784497, "learning_rate": 3.877304434691321e-06, "loss": 0.6683262, "num_input_tokens_seen": 24579275, "step": 1154, "time_per_iteration": 3.3830010890960693 }, { "auxiliary_loss_clip": 0.01160096, "auxiliary_loss_mlp": 0.01051431, "balance_loss_clip": 1.05470109, "balance_loss_mlp": 1.02669513, "epoch": 0.13888053868815006, "flos": 21941348042880.0, "grad_norm": 1.7454017834486792, "language_loss": 0.79324877, "learning_rate": 3.877035652082548e-06, "loss": 0.81536412, "num_input_tokens_seen": 24598720, "step": 1155, "time_per_iteration": 2.6939873695373535 }, { "auxiliary_loss_clip": 0.0116522, "auxiliary_loss_mlp": 0.01065203, "balance_loss_clip": 1.05290961, "balance_loss_mlp": 1.03795195, "epoch": 0.13900078157878915, "flos": 19608627087360.0, "grad_norm": 1.9416298763341622, "language_loss": 0.85454619, "learning_rate": 3.87676658472896e-06, "loss": 0.87685049, "num_input_tokens_seen": 24617530, "step": 1156, "time_per_iteration": 2.671668767929077 }, { "auxiliary_loss_clip": 0.01190072, "auxiliary_loss_mlp": 0.01061936, "balance_loss_clip": 1.05501604, "balance_loss_mlp": 1.03360021, "epoch": 0.13912102446942826, "flos": 22638051216000.0, "grad_norm": 1.7962323395532978, "language_loss": 0.85028827, "learning_rate": 3.876497232671372e-06, "loss": 0.87280834, "num_input_tokens_seen": 24637485, "step": 1157, "time_per_iteration": 2.6273353099823 }, { "auxiliary_loss_clip": 0.01147405, "auxiliary_loss_mlp": 0.01050052, "balance_loss_clip": 1.04803801, "balance_loss_mlp": 1.02613831, "epoch": 0.13924126736006734, "flos": 29643324975360.0, "grad_norm": 3.241702697345254, "language_loss": 0.83717597, "learning_rate": 3.876227595950647e-06, "loss": 0.85915053, "num_input_tokens_seen": 24656915, "step": 1158, "time_per_iteration": 2.7762410640716553 }, { "auxiliary_loss_clip": 0.0120369, "auxiliary_loss_mlp": 0.01070422, "balance_loss_clip": 1.05709553, "balance_loss_mlp": 1.04284894, "epoch": 0.13936151025070642, "flos": 27417653527680.0, "grad_norm": 1.733864107182793, "language_loss": 0.79107159, "learning_rate": 3.875957674607686e-06, "loss": 0.81381267, "num_input_tokens_seen": 24679190, "step": 1159, "time_per_iteration": 2.605970859527588 }, { "auxiliary_loss_clip": 0.01177669, "auxiliary_loss_mlp": 0.00781614, "balance_loss_clip": 1.04888582, "balance_loss_mlp": 1.00044513, "epoch": 0.1394817531413455, "flos": 16399326625920.0, "grad_norm": 1.8102359474157215, "language_loss": 0.88490939, "learning_rate": 3.8756874686834386e-06, "loss": 0.90450227, "num_input_tokens_seen": 24697405, "step": 1160, "time_per_iteration": 3.526500701904297 }, { "auxiliary_loss_clip": 0.01189753, "auxiliary_loss_mlp": 0.00780188, "balance_loss_clip": 1.05333769, "balance_loss_mlp": 1.00044918, "epoch": 0.13960199603198462, "flos": 30922319525760.0, "grad_norm": 1.6552986862188361, "language_loss": 0.80183268, "learning_rate": 3.875416978218893e-06, "loss": 0.82153213, "num_input_tokens_seen": 24720600, "step": 1161, "time_per_iteration": 3.6025338172912598 }, { "auxiliary_loss_clip": 0.01165642, "auxiliary_loss_mlp": 0.01063769, "balance_loss_clip": 1.04988968, "balance_loss_mlp": 1.035815, "epoch": 0.1397222389226237, "flos": 18113773754880.0, "grad_norm": 3.716267488728913, "language_loss": 0.82690918, "learning_rate": 3.8751462032550835e-06, "loss": 0.84920329, "num_input_tokens_seen": 24737605, "step": 1162, "time_per_iteration": 2.654625654220581 }, { "auxiliary_loss_clip": 0.01168271, "auxiliary_loss_mlp": 0.01062752, "balance_loss_clip": 1.05444944, "balance_loss_mlp": 1.03788495, "epoch": 0.13984248181326278, "flos": 16872772815360.0, "grad_norm": 2.0562305747239646, "language_loss": 0.83224189, "learning_rate": 3.874875143833085e-06, "loss": 0.85455215, "num_input_tokens_seen": 24755845, "step": 1163, "time_per_iteration": 3.6316041946411133 }, { "auxiliary_loss_clip": 0.01189243, "auxiliary_loss_mlp": 0.01064234, "balance_loss_clip": 1.05174446, "balance_loss_mlp": 1.03670907, "epoch": 0.1399627247039019, "flos": 54121401267840.0, "grad_norm": 1.9082321781130454, "language_loss": 0.68936729, "learning_rate": 3.874603799994019e-06, "loss": 0.71190214, "num_input_tokens_seen": 24779380, "step": 1164, "time_per_iteration": 3.837430238723755 }, { "auxiliary_loss_clip": 0.01151135, "auxiliary_loss_mlp": 0.01059877, "balance_loss_clip": 1.04971826, "balance_loss_mlp": 1.03605914, "epoch": 0.14008296759454097, "flos": 11765521618560.0, "grad_norm": 2.0172142843036753, "language_loss": 0.86614144, "learning_rate": 3.874332171779046e-06, "loss": 0.88825154, "num_input_tokens_seen": 24794260, "step": 1165, "time_per_iteration": 2.698054075241089 }, { "auxiliary_loss_clip": 0.01154337, "auxiliary_loss_mlp": 0.01063689, "balance_loss_clip": 1.05003333, "balance_loss_mlp": 1.03563881, "epoch": 0.14020321048518006, "flos": 22017514832640.0, "grad_norm": 1.6995395882270319, "language_loss": 0.75517154, "learning_rate": 3.874060259229373e-06, "loss": 0.77735186, "num_input_tokens_seen": 24815835, "step": 1166, "time_per_iteration": 2.701467752456665 }, { "auxiliary_loss_clip": 0.01196793, "auxiliary_loss_mlp": 0.01070411, "balance_loss_clip": 1.05792201, "balance_loss_mlp": 1.04383945, "epoch": 0.14032345337581917, "flos": 23404313076480.0, "grad_norm": 2.050748540025334, "language_loss": 0.93446696, "learning_rate": 3.873788062386249e-06, "loss": 0.95713902, "num_input_tokens_seen": 24834095, "step": 1167, "time_per_iteration": 2.638444423675537 }, { "auxiliary_loss_clip": 0.0116584, "auxiliary_loss_mlp": 0.01065962, "balance_loss_clip": 1.05440664, "balance_loss_mlp": 1.0390203, "epoch": 0.14044369626645825, "flos": 29645767100160.0, "grad_norm": 1.8923323454906773, "language_loss": 0.82211423, "learning_rate": 3.873515581290965e-06, "loss": 0.84443218, "num_input_tokens_seen": 24858900, "step": 1168, "time_per_iteration": 2.760422706604004 }, { "auxiliary_loss_clip": 0.01165307, "auxiliary_loss_mlp": 0.01063776, "balance_loss_clip": 1.05550814, "balance_loss_mlp": 1.03809834, "epoch": 0.14056393915709733, "flos": 18332972501760.0, "grad_norm": 2.652039031655153, "language_loss": 0.75726783, "learning_rate": 3.8732428159848575e-06, "loss": 0.77955866, "num_input_tokens_seen": 24877875, "step": 1169, "time_per_iteration": 2.659660816192627 }, { "auxiliary_loss_clip": 0.01189921, "auxiliary_loss_mlp": 0.01057371, "balance_loss_clip": 1.05963981, "balance_loss_mlp": 1.03268242, "epoch": 0.14068418204773642, "flos": 26687517770880.0, "grad_norm": 1.787839896417676, "language_loss": 0.78299302, "learning_rate": 3.872969766509304e-06, "loss": 0.80546594, "num_input_tokens_seen": 24898430, "step": 1170, "time_per_iteration": 2.6649672985076904 }, { "auxiliary_loss_clip": 0.01046422, "auxiliary_loss_mlp": 0.01010413, "balance_loss_clip": 1.02293885, "balance_loss_mlp": 1.00609779, "epoch": 0.14080442493837553, "flos": 65259314501760.0, "grad_norm": 0.7685946276316399, "language_loss": 0.55714738, "learning_rate": 3.872696432905726e-06, "loss": 0.57771564, "num_input_tokens_seen": 24959250, "step": 1171, "time_per_iteration": 3.2426490783691406 }, { "auxiliary_loss_clip": 0.01193839, "auxiliary_loss_mlp": 0.01065266, "balance_loss_clip": 1.05551076, "balance_loss_mlp": 1.0378716, "epoch": 0.1409246678290146, "flos": 25776715582080.0, "grad_norm": 3.950835230484891, "language_loss": 0.71879977, "learning_rate": 3.872422815215589e-06, "loss": 0.74139082, "num_input_tokens_seen": 24978330, "step": 1172, "time_per_iteration": 2.67096209526062 }, { "auxiliary_loss_clip": 0.01182625, "auxiliary_loss_mlp": 0.01078126, "balance_loss_clip": 1.050632, "balance_loss_mlp": 1.05075526, "epoch": 0.1410449107196537, "flos": 21868521217920.0, "grad_norm": 1.8936581322987558, "language_loss": 0.7420702, "learning_rate": 3.8721489134803994e-06, "loss": 0.7646777, "num_input_tokens_seen": 24997120, "step": 1173, "time_per_iteration": 2.6275291442871094 }, { "auxiliary_loss_clip": 0.01184751, "auxiliary_loss_mlp": 0.01060621, "balance_loss_clip": 1.05319762, "balance_loss_mlp": 1.03443027, "epoch": 0.1411651536102928, "flos": 16684133564160.0, "grad_norm": 2.5957691972381824, "language_loss": 0.72306871, "learning_rate": 3.871874727741707e-06, "loss": 0.7455225, "num_input_tokens_seen": 25014350, "step": 1174, "time_per_iteration": 2.5956501960754395 }, { "auxiliary_loss_clip": 0.01185573, "auxiliary_loss_mlp": 0.01066212, "balance_loss_clip": 1.05859256, "balance_loss_mlp": 1.04130936, "epoch": 0.1412853965009319, "flos": 20992264934400.0, "grad_norm": 3.8250220449750656, "language_loss": 0.96519446, "learning_rate": 3.871600258041108e-06, "loss": 0.98771232, "num_input_tokens_seen": 25033875, "step": 1175, "time_per_iteration": 2.654142379760742 }, { "auxiliary_loss_clip": 0.01174278, "auxiliary_loss_mlp": 0.01059775, "balance_loss_clip": 1.05559468, "balance_loss_mlp": 1.03302479, "epoch": 0.14140563939157097, "flos": 20335279224960.0, "grad_norm": 2.602490370020468, "language_loss": 0.85411263, "learning_rate": 3.871325504420238e-06, "loss": 0.87645322, "num_input_tokens_seen": 25052865, "step": 1176, "time_per_iteration": 2.647559881210327 }, { "auxiliary_loss_clip": 0.01201085, "auxiliary_loss_mlp": 0.01056864, "balance_loss_clip": 1.05625403, "balance_loss_mlp": 1.03234243, "epoch": 0.14152588228221005, "flos": 21068826773760.0, "grad_norm": 2.9294496098254896, "language_loss": 0.81969512, "learning_rate": 3.871050466920776e-06, "loss": 0.84227455, "num_input_tokens_seen": 25072770, "step": 1177, "time_per_iteration": 2.5532474517822266 }, { "auxiliary_loss_clip": 0.01150338, "auxiliary_loss_mlp": 0.01068323, "balance_loss_clip": 1.04978561, "balance_loss_mlp": 1.04165637, "epoch": 0.14164612517284916, "flos": 18223157646720.0, "grad_norm": 1.8697129095359057, "language_loss": 0.79544401, "learning_rate": 3.870775145584447e-06, "loss": 0.81763065, "num_input_tokens_seen": 25090550, "step": 1178, "time_per_iteration": 2.656280279159546 }, { "auxiliary_loss_clip": 0.01190852, "auxiliary_loss_mlp": 0.01069134, "balance_loss_clip": 1.056288, "balance_loss_mlp": 1.04045236, "epoch": 0.14176636806348825, "flos": 22744454279040.0, "grad_norm": 2.7623418942352886, "language_loss": 0.6470986, "learning_rate": 3.8704995404530145e-06, "loss": 0.66969848, "num_input_tokens_seen": 25106175, "step": 1179, "time_per_iteration": 2.647576093673706 }, { "auxiliary_loss_clip": 0.01196232, "auxiliary_loss_mlp": 0.01066041, "balance_loss_clip": 1.05501151, "balance_loss_mlp": 1.0413053, "epoch": 0.14188661095412733, "flos": 22091095843200.0, "grad_norm": 2.0934290108459868, "language_loss": 0.8499524, "learning_rate": 3.87022365156829e-06, "loss": 0.87257516, "num_input_tokens_seen": 25126890, "step": 1180, "time_per_iteration": 2.5947351455688477 }, { "auxiliary_loss_clip": 0.01117685, "auxiliary_loss_mlp": 0.0106247, "balance_loss_clip": 1.04575634, "balance_loss_mlp": 1.03488553, "epoch": 0.14200685384476644, "flos": 24352390604160.0, "grad_norm": 6.876082947398421, "language_loss": 0.8087014, "learning_rate": 3.869947478972123e-06, "loss": 0.83050293, "num_input_tokens_seen": 25147915, "step": 1181, "time_per_iteration": 2.78981351852417 }, { "auxiliary_loss_clip": 0.01179089, "auxiliary_loss_mlp": 0.01074358, "balance_loss_clip": 1.05161917, "balance_loss_mlp": 1.04457998, "epoch": 0.14212709673540552, "flos": 24022048199040.0, "grad_norm": 2.108572468870917, "language_loss": 0.81979442, "learning_rate": 3.869671022706412e-06, "loss": 0.84232891, "num_input_tokens_seen": 25166645, "step": 1182, "time_per_iteration": 2.6529974937438965 }, { "auxiliary_loss_clip": 0.01127232, "auxiliary_loss_mlp": 0.01076151, "balance_loss_clip": 1.04604757, "balance_loss_mlp": 1.04751658, "epoch": 0.1422473396260446, "flos": 26431797870720.0, "grad_norm": 6.739639919581089, "language_loss": 0.64996558, "learning_rate": 3.869394282813092e-06, "loss": 0.67199945, "num_input_tokens_seen": 25185845, "step": 1183, "time_per_iteration": 2.7582712173461914 }, { "auxiliary_loss_clip": 0.01167664, "auxiliary_loss_mlp": 0.01070989, "balance_loss_clip": 1.05042052, "balance_loss_mlp": 1.04407179, "epoch": 0.1423675825166837, "flos": 17055306754560.0, "grad_norm": 3.686202648559934, "language_loss": 0.88908595, "learning_rate": 3.869117259334147e-06, "loss": 0.91147244, "num_input_tokens_seen": 25203770, "step": 1184, "time_per_iteration": 2.6569478511810303 }, { "auxiliary_loss_clip": 0.01181011, "auxiliary_loss_mlp": 0.01070252, "balance_loss_clip": 1.05335152, "balance_loss_mlp": 1.04295301, "epoch": 0.1424878254073228, "flos": 17929480049280.0, "grad_norm": 2.1070265340310304, "language_loss": 0.81977415, "learning_rate": 3.868839952311599e-06, "loss": 0.84228677, "num_input_tokens_seen": 25221725, "step": 1185, "time_per_iteration": 2.607832193374634 }, { "auxiliary_loss_clip": 0.01170252, "auxiliary_loss_mlp": 0.01063691, "balance_loss_clip": 1.05351532, "balance_loss_mlp": 1.03890705, "epoch": 0.14260806829796188, "flos": 20303606407680.0, "grad_norm": 2.549089483399958, "language_loss": 0.80591685, "learning_rate": 3.868562361787516e-06, "loss": 0.82825625, "num_input_tokens_seen": 25240855, "step": 1186, "time_per_iteration": 3.5661909580230713 }, { "auxiliary_loss_clip": 0.01112111, "auxiliary_loss_mlp": 0.01070779, "balance_loss_clip": 1.04591322, "balance_loss_mlp": 1.04346859, "epoch": 0.14272831118860096, "flos": 23185724860800.0, "grad_norm": 2.3295441097869047, "language_loss": 0.6873709, "learning_rate": 3.868284487804009e-06, "loss": 0.70919979, "num_input_tokens_seen": 25260085, "step": 1187, "time_per_iteration": 3.7272391319274902 }, { "auxiliary_loss_clip": 0.01173866, "auxiliary_loss_mlp": 0.01061302, "balance_loss_clip": 1.05238545, "balance_loss_mlp": 1.03346705, "epoch": 0.14284855407924008, "flos": 27232210586880.0, "grad_norm": 1.8065373242102303, "language_loss": 0.78152418, "learning_rate": 3.86800633040323e-06, "loss": 0.80387586, "num_input_tokens_seen": 25280675, "step": 1188, "time_per_iteration": 2.7186365127563477 }, { "auxiliary_loss_clip": 0.01176155, "auxiliary_loss_mlp": 0.0078129, "balance_loss_clip": 1.05857897, "balance_loss_mlp": 1.00040865, "epoch": 0.14296879696987916, "flos": 28184202696960.0, "grad_norm": 3.7747062945047665, "language_loss": 0.78340423, "learning_rate": 3.867727889627376e-06, "loss": 0.80297869, "num_input_tokens_seen": 25300290, "step": 1189, "time_per_iteration": 3.654005289077759 }, { "auxiliary_loss_clip": 0.0115141, "auxiliary_loss_mlp": 0.0106698, "balance_loss_clip": 1.05198741, "balance_loss_mlp": 1.04192233, "epoch": 0.14308903986051824, "flos": 19390290266880.0, "grad_norm": 3.0313380490865085, "language_loss": 0.78450638, "learning_rate": 3.867449165518687e-06, "loss": 0.80669034, "num_input_tokens_seen": 25316760, "step": 1190, "time_per_iteration": 3.602362632751465 }, { "auxiliary_loss_clip": 0.01205834, "auxiliary_loss_mlp": 0.00780018, "balance_loss_clip": 1.05560064, "balance_loss_mlp": 1.00038171, "epoch": 0.14320928275115732, "flos": 17457506317440.0, "grad_norm": 1.9298955664931672, "language_loss": 0.71123427, "learning_rate": 3.867170158119444e-06, "loss": 0.73109281, "num_input_tokens_seen": 25335760, "step": 1191, "time_per_iteration": 2.561131238937378 }, { "auxiliary_loss_clip": 0.01208747, "auxiliary_loss_mlp": 0.01073879, "balance_loss_clip": 1.05766773, "balance_loss_mlp": 1.04621005, "epoch": 0.14332952564179643, "flos": 21466070259840.0, "grad_norm": 2.252267364919607, "language_loss": 0.75418139, "learning_rate": 3.866890867471972e-06, "loss": 0.77700764, "num_input_tokens_seen": 25354230, "step": 1192, "time_per_iteration": 2.593095541000366 }, { "auxiliary_loss_clip": 0.01171205, "auxiliary_loss_mlp": 0.01073659, "balance_loss_clip": 1.05243134, "balance_loss_mlp": 1.04553795, "epoch": 0.14344976853243552, "flos": 16396992241920.0, "grad_norm": 3.4216621403995817, "language_loss": 0.89301062, "learning_rate": 3.86661129361864e-06, "loss": 0.91545922, "num_input_tokens_seen": 25368720, "step": 1193, "time_per_iteration": 2.6217663288116455 }, { "auxiliary_loss_clip": 0.01175437, "auxiliary_loss_mlp": 0.01071128, "balance_loss_clip": 1.05395889, "balance_loss_mlp": 1.04473531, "epoch": 0.1435700114230746, "flos": 18916736336640.0, "grad_norm": 2.2595038670620298, "language_loss": 0.85885453, "learning_rate": 3.866331436601859e-06, "loss": 0.88132018, "num_input_tokens_seen": 25386715, "step": 1194, "time_per_iteration": 2.6178901195526123 }, { "auxiliary_loss_clip": 0.01205705, "auxiliary_loss_mlp": 0.0106016, "balance_loss_clip": 1.06044412, "balance_loss_mlp": 1.03394544, "epoch": 0.1436902543137137, "flos": 19755394058880.0, "grad_norm": 2.087491260412543, "language_loss": 0.73322451, "learning_rate": 3.866051296464083e-06, "loss": 0.75588322, "num_input_tokens_seen": 25405550, "step": 1195, "time_per_iteration": 2.561347484588623 }, { "auxiliary_loss_clip": 0.01205635, "auxiliary_loss_mlp": 0.00779569, "balance_loss_clip": 1.05682957, "balance_loss_mlp": 1.00034714, "epoch": 0.1438104972043528, "flos": 14684807669760.0, "grad_norm": 2.312506814092542, "language_loss": 0.85178751, "learning_rate": 3.86577087324781e-06, "loss": 0.87163949, "num_input_tokens_seen": 25422040, "step": 1196, "time_per_iteration": 2.5671579837799072 }, { "auxiliary_loss_clip": 0.01188694, "auxiliary_loss_mlp": 0.01072283, "balance_loss_clip": 1.06019115, "balance_loss_mlp": 1.04817891, "epoch": 0.14393074009499188, "flos": 17092330698240.0, "grad_norm": 3.248688097375566, "language_loss": 0.77622235, "learning_rate": 3.865490166995578e-06, "loss": 0.79883218, "num_input_tokens_seen": 25440270, "step": 1197, "time_per_iteration": 2.629399538040161 }, { "auxiliary_loss_clip": 0.01187358, "auxiliary_loss_mlp": 0.01051552, "balance_loss_clip": 1.05424297, "balance_loss_mlp": 1.02596986, "epoch": 0.144050982985631, "flos": 30476200608000.0, "grad_norm": 2.765007995711633, "language_loss": 0.84139931, "learning_rate": 3.86520917774997e-06, "loss": 0.86378837, "num_input_tokens_seen": 25459705, "step": 1198, "time_per_iteration": 2.720432758331299 }, { "auxiliary_loss_clip": 0.01183392, "auxiliary_loss_mlp": 0.01063031, "balance_loss_clip": 1.05606627, "balance_loss_mlp": 1.03721023, "epoch": 0.14417122587627007, "flos": 17858484817920.0, "grad_norm": 2.2830470248527086, "language_loss": 0.75117594, "learning_rate": 3.864927905553614e-06, "loss": 0.77364016, "num_input_tokens_seen": 25477615, "step": 1199, "time_per_iteration": 2.574265956878662 }, { "auxiliary_loss_clip": 0.011524, "auxiliary_loss_mlp": 0.01065696, "balance_loss_clip": 1.04880929, "balance_loss_mlp": 1.03777742, "epoch": 0.14429146876690915, "flos": 21613914639360.0, "grad_norm": 2.0887312812036614, "language_loss": 0.8879177, "learning_rate": 3.8646463504491765e-06, "loss": 0.91009867, "num_input_tokens_seen": 25497750, "step": 1200, "time_per_iteration": 2.648192882537842 }, { "auxiliary_loss_clip": 0.01188553, "auxiliary_loss_mlp": 0.01066729, "balance_loss_clip": 1.05603683, "balance_loss_mlp": 1.04012108, "epoch": 0.14441171165754824, "flos": 23258120722560.0, "grad_norm": 1.7388363129382816, "language_loss": 0.83013213, "learning_rate": 3.8643645124793705e-06, "loss": 0.85268486, "num_input_tokens_seen": 25516650, "step": 1201, "time_per_iteration": 2.6335270404815674 }, { "auxiliary_loss_clip": 0.01186854, "auxiliary_loss_mlp": 0.01051981, "balance_loss_clip": 1.05593562, "balance_loss_mlp": 1.02718544, "epoch": 0.14453195454818735, "flos": 42854213963520.0, "grad_norm": 1.7730868226212566, "language_loss": 0.74479985, "learning_rate": 3.8640823916869515e-06, "loss": 0.76718819, "num_input_tokens_seen": 25540960, "step": 1202, "time_per_iteration": 2.8270933628082275 }, { "auxiliary_loss_clip": 0.011987, "auxiliary_loss_mlp": 0.01056367, "balance_loss_clip": 1.05618763, "balance_loss_mlp": 1.03051078, "epoch": 0.14465219743882643, "flos": 27235873774080.0, "grad_norm": 4.206537179990803, "language_loss": 0.78400707, "learning_rate": 3.863799988114714e-06, "loss": 0.80655771, "num_input_tokens_seen": 25562990, "step": 1203, "time_per_iteration": 2.6544597148895264 }, { "auxiliary_loss_clip": 0.01203344, "auxiliary_loss_mlp": 0.01067488, "balance_loss_clip": 1.05523348, "balance_loss_mlp": 1.04139292, "epoch": 0.1447724403294655, "flos": 16690705752960.0, "grad_norm": 3.4702355998735785, "language_loss": 0.70048845, "learning_rate": 3.863517301805502e-06, "loss": 0.72319674, "num_input_tokens_seen": 25581380, "step": 1204, "time_per_iteration": 2.5227251052856445 }, { "auxiliary_loss_clip": 0.01161269, "auxiliary_loss_mlp": 0.01062316, "balance_loss_clip": 1.05589712, "balance_loss_mlp": 1.03752041, "epoch": 0.14489268322010462, "flos": 20073741321600.0, "grad_norm": 2.57226036751723, "language_loss": 0.96864074, "learning_rate": 3.863234332802196e-06, "loss": 0.99087656, "num_input_tokens_seen": 25593585, "step": 1205, "time_per_iteration": 2.682415723800659 }, { "auxiliary_loss_clip": 0.01171111, "auxiliary_loss_mlp": 0.01069459, "balance_loss_clip": 1.05429065, "balance_loss_mlp": 1.04338765, "epoch": 0.1450129261107437, "flos": 27125627955840.0, "grad_norm": 2.3436657733997572, "language_loss": 0.73702842, "learning_rate": 3.862951081147723e-06, "loss": 0.7594341, "num_input_tokens_seen": 25613750, "step": 1206, "time_per_iteration": 2.6759490966796875 }, { "auxiliary_loss_clip": 0.01189289, "auxiliary_loss_mlp": 0.01062886, "balance_loss_clip": 1.05954647, "balance_loss_mlp": 1.03962815, "epoch": 0.1451331690013828, "flos": 25702344472320.0, "grad_norm": 2.292038088130674, "language_loss": 0.78021741, "learning_rate": 3.862667546885053e-06, "loss": 0.8027392, "num_input_tokens_seen": 25632300, "step": 1207, "time_per_iteration": 2.6737418174743652 }, { "auxiliary_loss_clip": 0.01176479, "auxiliary_loss_mlp": 0.01062866, "balance_loss_clip": 1.05312347, "balance_loss_mlp": 1.03723574, "epoch": 0.14525341189202187, "flos": 25737393168000.0, "grad_norm": 2.813389070588496, "language_loss": 0.73372078, "learning_rate": 3.8623837300571965e-06, "loss": 0.75611424, "num_input_tokens_seen": 25651285, "step": 1208, "time_per_iteration": 2.6939537525177 }, { "auxiliary_loss_clip": 0.01198242, "auxiliary_loss_mlp": 0.01073832, "balance_loss_clip": 1.05482864, "balance_loss_mlp": 1.04807091, "epoch": 0.14537365478266098, "flos": 23073898844160.0, "grad_norm": 3.3217206507437242, "language_loss": 0.84031808, "learning_rate": 3.8620996307072085e-06, "loss": 0.86303878, "num_input_tokens_seen": 25671990, "step": 1209, "time_per_iteration": 2.6182379722595215 }, { "auxiliary_loss_clip": 0.01161615, "auxiliary_loss_mlp": 0.01067574, "balance_loss_clip": 1.05106831, "balance_loss_mlp": 1.04239714, "epoch": 0.14549389767330007, "flos": 20595021448320.0, "grad_norm": 2.0138277554607, "language_loss": 0.64264119, "learning_rate": 3.861815248878188e-06, "loss": 0.66493309, "num_input_tokens_seen": 25689475, "step": 1210, "time_per_iteration": 2.6485376358032227 }, { "auxiliary_loss_clip": 0.01168413, "auxiliary_loss_mlp": 0.01064734, "balance_loss_clip": 1.05397582, "balance_loss_mlp": 1.03913927, "epoch": 0.14561414056393915, "flos": 15121804533120.0, "grad_norm": 2.486666104038616, "language_loss": 0.79652119, "learning_rate": 3.861530584613274e-06, "loss": 0.81885266, "num_input_tokens_seen": 25707475, "step": 1211, "time_per_iteration": 2.6402430534362793 }, { "auxiliary_loss_clip": 0.01192728, "auxiliary_loss_mlp": 0.00779758, "balance_loss_clip": 1.05543888, "balance_loss_mlp": 1.00038445, "epoch": 0.14573438345457826, "flos": 19427493778560.0, "grad_norm": 2.24970411098846, "language_loss": 0.82323056, "learning_rate": 3.86124563795565e-06, "loss": 0.84295547, "num_input_tokens_seen": 25726290, "step": 1212, "time_per_iteration": 3.600208282470703 }, { "auxiliary_loss_clip": 0.01200883, "auxiliary_loss_mlp": 0.01065389, "balance_loss_clip": 1.05773365, "balance_loss_mlp": 1.0388763, "epoch": 0.14585462634521734, "flos": 24828422572800.0, "grad_norm": 1.755198290012894, "language_loss": 0.69942844, "learning_rate": 3.860960408948543e-06, "loss": 0.72209114, "num_input_tokens_seen": 25748040, "step": 1213, "time_per_iteration": 3.5544614791870117 }, { "auxiliary_loss_clip": 0.01178998, "auxiliary_loss_mlp": 0.01065587, "balance_loss_clip": 1.05786765, "balance_loss_mlp": 1.04224622, "epoch": 0.14597486923585642, "flos": 15448627405440.0, "grad_norm": 2.9467050204477805, "language_loss": 0.89292753, "learning_rate": 3.860674897635222e-06, "loss": 0.91537344, "num_input_tokens_seen": 25764525, "step": 1214, "time_per_iteration": 2.6088991165161133 }, { "auxiliary_loss_clip": 0.01190203, "auxiliary_loss_mlp": 0.0106791, "balance_loss_clip": 1.05965877, "balance_loss_mlp": 1.04242301, "epoch": 0.1460951121264955, "flos": 16655154266880.0, "grad_norm": 2.1247183655332598, "language_loss": 0.83529872, "learning_rate": 3.860389104058998e-06, "loss": 0.85787988, "num_input_tokens_seen": 25782755, "step": 1215, "time_per_iteration": 3.6307194232940674 }, { "auxiliary_loss_clip": 0.01169865, "auxiliary_loss_mlp": 0.01058177, "balance_loss_clip": 1.05096626, "balance_loss_mlp": 1.03378677, "epoch": 0.14621535501713462, "flos": 24863291700480.0, "grad_norm": 1.987300814272488, "language_loss": 0.72855455, "learning_rate": 3.860103028263227e-06, "loss": 0.75083494, "num_input_tokens_seen": 25805860, "step": 1216, "time_per_iteration": 3.622828960418701 }, { "auxiliary_loss_clip": 0.01140296, "auxiliary_loss_mlp": 0.01060797, "balance_loss_clip": 1.04677069, "balance_loss_mlp": 1.03368878, "epoch": 0.1463355979077737, "flos": 25228000442880.0, "grad_norm": 2.209726209727109, "language_loss": 0.70005846, "learning_rate": 3.859816670291304e-06, "loss": 0.72206932, "num_input_tokens_seen": 25824955, "step": 1217, "time_per_iteration": 2.7367706298828125 }, { "auxiliary_loss_clip": 0.01131979, "auxiliary_loss_mlp": 0.01074003, "balance_loss_clip": 1.05303872, "balance_loss_mlp": 1.04598892, "epoch": 0.14645584079841278, "flos": 22054143726720.0, "grad_norm": 2.938855412563536, "language_loss": 0.89818239, "learning_rate": 3.859530030186672e-06, "loss": 0.92024219, "num_input_tokens_seen": 25841965, "step": 1218, "time_per_iteration": 2.7084710597991943 }, { "auxiliary_loss_clip": 0.01183419, "auxiliary_loss_mlp": 0.0106244, "balance_loss_clip": 1.05957639, "balance_loss_mlp": 1.03645289, "epoch": 0.1465760836890519, "flos": 23623870959360.0, "grad_norm": 2.2837211471344907, "language_loss": 0.82161635, "learning_rate": 3.859243107992813e-06, "loss": 0.8440749, "num_input_tokens_seen": 25860770, "step": 1219, "time_per_iteration": 2.6902079582214355 }, { "auxiliary_loss_clip": 0.01165401, "auxiliary_loss_mlp": 0.01061744, "balance_loss_clip": 1.05004883, "balance_loss_mlp": 1.03424239, "epoch": 0.14669632657969098, "flos": 37407893356800.0, "grad_norm": 3.248265510723263, "language_loss": 0.77831912, "learning_rate": 3.858955903753252e-06, "loss": 0.80059057, "num_input_tokens_seen": 25879410, "step": 1220, "time_per_iteration": 2.830049753189087 }, { "auxiliary_loss_clip": 0.01187043, "auxiliary_loss_mlp": 0.01057951, "balance_loss_clip": 1.05523682, "balance_loss_mlp": 1.03235674, "epoch": 0.14681656947033006, "flos": 28365910623360.0, "grad_norm": 2.252531487585227, "language_loss": 0.83380628, "learning_rate": 3.858668417511559e-06, "loss": 0.85625625, "num_input_tokens_seen": 25902160, "step": 1221, "time_per_iteration": 2.6824398040771484 }, { "auxiliary_loss_clip": 0.0117627, "auxiliary_loss_mlp": 0.01057773, "balance_loss_clip": 1.05613208, "balance_loss_mlp": 1.03281116, "epoch": 0.14693681236096917, "flos": 18479488078080.0, "grad_norm": 2.1105007277902197, "language_loss": 0.76309514, "learning_rate": 3.8583806493113445e-06, "loss": 0.78543556, "num_input_tokens_seen": 25920505, "step": 1222, "time_per_iteration": 2.685659408569336 }, { "auxiliary_loss_clip": 0.01181873, "auxiliary_loss_mlp": 0.01061306, "balance_loss_clip": 1.05458164, "balance_loss_mlp": 1.03670096, "epoch": 0.14705705525160825, "flos": 20777806782720.0, "grad_norm": 2.508830845024616, "language_loss": 0.82573497, "learning_rate": 3.858092599196263e-06, "loss": 0.84816676, "num_input_tokens_seen": 25938460, "step": 1223, "time_per_iteration": 2.6458749771118164 }, { "auxiliary_loss_clip": 0.01186906, "auxiliary_loss_mlp": 0.01065386, "balance_loss_clip": 1.05308306, "balance_loss_mlp": 1.04128182, "epoch": 0.14717729814224734, "flos": 29932944336000.0, "grad_norm": 2.2119358016334556, "language_loss": 0.82287788, "learning_rate": 3.857804267210012e-06, "loss": 0.84540081, "num_input_tokens_seen": 25957760, "step": 1224, "time_per_iteration": 2.675017833709717 }, { "auxiliary_loss_clip": 0.01145219, "auxiliary_loss_mlp": 0.01059947, "balance_loss_clip": 1.04892993, "balance_loss_mlp": 1.03425741, "epoch": 0.14729754103288642, "flos": 20047491457920.0, "grad_norm": 4.928999701215636, "language_loss": 0.88056552, "learning_rate": 3.857515653396331e-06, "loss": 0.90261722, "num_input_tokens_seen": 25974970, "step": 1225, "time_per_iteration": 2.694938898086548 }, { "auxiliary_loss_clip": 0.01149335, "auxiliary_loss_mlp": 0.01055376, "balance_loss_clip": 1.0521481, "balance_loss_mlp": 1.03145027, "epoch": 0.14741778392352553, "flos": 19281516906240.0, "grad_norm": 2.245887799119421, "language_loss": 0.86925954, "learning_rate": 3.857226757799002e-06, "loss": 0.89130664, "num_input_tokens_seen": 25992525, "step": 1226, "time_per_iteration": 2.702197551727295 }, { "auxiliary_loss_clip": 0.0117212, "auxiliary_loss_mlp": 0.01062764, "balance_loss_clip": 1.05120087, "balance_loss_mlp": 1.03598976, "epoch": 0.1475380268141646, "flos": 25411108999680.0, "grad_norm": 3.1830000518154855, "language_loss": 0.74094325, "learning_rate": 3.85693758046185e-06, "loss": 0.76329207, "num_input_tokens_seen": 26010815, "step": 1227, "time_per_iteration": 2.67063307762146 }, { "auxiliary_loss_clip": 0.0120216, "auxiliary_loss_mlp": 0.01068913, "balance_loss_clip": 1.0597887, "balance_loss_mlp": 1.04508352, "epoch": 0.1476582697048037, "flos": 20847652778880.0, "grad_norm": 2.438547698858487, "language_loss": 0.82539463, "learning_rate": 3.8566481214287435e-06, "loss": 0.84810537, "num_input_tokens_seen": 26028935, "step": 1228, "time_per_iteration": 2.6008548736572266 }, { "auxiliary_loss_clip": 0.01146256, "auxiliary_loss_mlp": 0.01067708, "balance_loss_clip": 1.04816914, "balance_loss_mlp": 1.04177976, "epoch": 0.1477785125954428, "flos": 14028109269120.0, "grad_norm": 2.170158035443355, "language_loss": 0.90491796, "learning_rate": 3.8563583807435935e-06, "loss": 0.92705762, "num_input_tokens_seen": 26045080, "step": 1229, "time_per_iteration": 2.6613638401031494 }, { "auxiliary_loss_clip": 0.01188031, "auxiliary_loss_mlp": 0.00778988, "balance_loss_clip": 1.05170274, "balance_loss_mlp": 1.00022006, "epoch": 0.1478987554860819, "flos": 20516699842560.0, "grad_norm": 1.9477977148865966, "language_loss": 0.77101719, "learning_rate": 3.856068358450353e-06, "loss": 0.79068744, "num_input_tokens_seen": 26065030, "step": 1230, "time_per_iteration": 2.6445364952087402 }, { "auxiliary_loss_clip": 0.01164713, "auxiliary_loss_mlp": 0.01063297, "balance_loss_clip": 1.05439138, "balance_loss_mlp": 1.0381912, "epoch": 0.14801899837672097, "flos": 17857012360320.0, "grad_norm": 1.8629533959862927, "language_loss": 0.85995495, "learning_rate": 3.8557780545930186e-06, "loss": 0.88223505, "num_input_tokens_seen": 26083445, "step": 1231, "time_per_iteration": 2.5919957160949707 }, { "auxiliary_loss_clip": 0.01170957, "auxiliary_loss_mlp": 0.01055013, "balance_loss_clip": 1.05274415, "balance_loss_mlp": 1.0296576, "epoch": 0.14813924126736006, "flos": 20881408584960.0, "grad_norm": 2.085244829635608, "language_loss": 0.79360455, "learning_rate": 3.855487469215628e-06, "loss": 0.81586432, "num_input_tokens_seen": 26102375, "step": 1232, "time_per_iteration": 2.729323625564575 }, { "auxiliary_loss_clip": 0.01152964, "auxiliary_loss_mlp": 0.01051769, "balance_loss_clip": 1.04821396, "balance_loss_mlp": 1.02832043, "epoch": 0.14825948415799917, "flos": 37414070496000.0, "grad_norm": 2.6193782783800765, "language_loss": 0.72452044, "learning_rate": 3.855196602362264e-06, "loss": 0.74656773, "num_input_tokens_seen": 26125295, "step": 1233, "time_per_iteration": 2.8076839447021484 }, { "auxiliary_loss_clip": 0.01186019, "auxiliary_loss_mlp": 0.01060176, "balance_loss_clip": 1.05332136, "balance_loss_mlp": 1.03516626, "epoch": 0.14837972704863825, "flos": 22014641744640.0, "grad_norm": 2.013542110051269, "language_loss": 0.94162655, "learning_rate": 3.854905454077051e-06, "loss": 0.96408844, "num_input_tokens_seen": 26142905, "step": 1234, "time_per_iteration": 2.5891032218933105 }, { "auxiliary_loss_clip": 0.01112648, "auxiliary_loss_mlp": 0.01062747, "balance_loss_clip": 1.04382324, "balance_loss_mlp": 1.03597236, "epoch": 0.14849996993927733, "flos": 20996323171200.0, "grad_norm": 1.8619650608929295, "language_loss": 0.88515532, "learning_rate": 3.854614024404155e-06, "loss": 0.90690923, "num_input_tokens_seen": 26161215, "step": 1235, "time_per_iteration": 2.802779197692871 }, { "auxiliary_loss_clip": 0.01156916, "auxiliary_loss_mlp": 0.01060665, "balance_loss_clip": 1.05075812, "balance_loss_mlp": 1.03569078, "epoch": 0.14862021282991644, "flos": 20047994248320.0, "grad_norm": 2.324388260064651, "language_loss": 0.89263427, "learning_rate": 3.8543223133877865e-06, "loss": 0.91481006, "num_input_tokens_seen": 26179810, "step": 1236, "time_per_iteration": 2.8568055629730225 }, { "auxiliary_loss_clip": 0.01153003, "auxiliary_loss_mlp": 0.01078141, "balance_loss_clip": 1.04917347, "balance_loss_mlp": 1.04616928, "epoch": 0.14874045572055553, "flos": 22712027276160.0, "grad_norm": 1.8377537319680344, "language_loss": 0.8811084, "learning_rate": 3.854030321072198e-06, "loss": 0.90341985, "num_input_tokens_seen": 26199715, "step": 1237, "time_per_iteration": 3.7317676544189453 }, { "auxiliary_loss_clip": 0.01162979, "auxiliary_loss_mlp": 0.01056404, "balance_loss_clip": 1.05176175, "balance_loss_mlp": 1.03085709, "epoch": 0.1488606986111946, "flos": 25411288567680.0, "grad_norm": 2.2407439587233355, "language_loss": 0.73193848, "learning_rate": 3.853738047501682e-06, "loss": 0.75413227, "num_input_tokens_seen": 26220275, "step": 1238, "time_per_iteration": 2.748296022415161 }, { "auxiliary_loss_clip": 0.01190502, "auxiliary_loss_mlp": 0.01064355, "balance_loss_clip": 1.05717587, "balance_loss_mlp": 1.03940415, "epoch": 0.1489809415018337, "flos": 17018749687680.0, "grad_norm": 1.8040236671963905, "language_loss": 0.77484566, "learning_rate": 3.85344549272058e-06, "loss": 0.79739428, "num_input_tokens_seen": 26238255, "step": 1239, "time_per_iteration": 3.560166597366333 }, { "auxiliary_loss_clip": 0.01177816, "auxiliary_loss_mlp": 0.01064531, "balance_loss_clip": 1.05108213, "balance_loss_mlp": 1.03817368, "epoch": 0.1491011843924728, "flos": 33659394860160.0, "grad_norm": 1.9960482681175635, "language_loss": 0.82225412, "learning_rate": 3.853152656773269e-06, "loss": 0.84467757, "num_input_tokens_seen": 26259690, "step": 1240, "time_per_iteration": 2.7816247940063477 }, { "auxiliary_loss_clip": 0.01172827, "auxiliary_loss_mlp": 0.01061406, "balance_loss_clip": 1.05292189, "balance_loss_mlp": 1.03472638, "epoch": 0.14922142728311188, "flos": 21179000764800.0, "grad_norm": 1.7532549533985071, "language_loss": 0.84604156, "learning_rate": 3.852859539704174e-06, "loss": 0.86838388, "num_input_tokens_seen": 26278990, "step": 1241, "time_per_iteration": 3.6793510913848877 }, { "auxiliary_loss_clip": 0.01143213, "auxiliary_loss_mlp": 0.01060976, "balance_loss_clip": 1.0485431, "balance_loss_mlp": 1.03638339, "epoch": 0.14934167017375097, "flos": 29860548474240.0, "grad_norm": 2.344636598303213, "language_loss": 0.76318723, "learning_rate": 3.85256614155776e-06, "loss": 0.78522909, "num_input_tokens_seen": 26299120, "step": 1242, "time_per_iteration": 3.6449031829833984 }, { "auxiliary_loss_clip": 0.01183788, "auxiliary_loss_mlp": 0.01057837, "balance_loss_clip": 1.05215347, "balance_loss_mlp": 1.03071725, "epoch": 0.14946191306439008, "flos": 17019216564480.0, "grad_norm": 3.264965348110949, "language_loss": 0.74293488, "learning_rate": 3.852272462378535e-06, "loss": 0.76535118, "num_input_tokens_seen": 26316995, "step": 1243, "time_per_iteration": 2.59047269821167 }, { "auxiliary_loss_clip": 0.01175073, "auxiliary_loss_mlp": 0.01055917, "balance_loss_clip": 1.05366969, "balance_loss_mlp": 1.0304656, "epoch": 0.14958215595502916, "flos": 15669047214720.0, "grad_norm": 1.9513841915015502, "language_loss": 0.77851832, "learning_rate": 3.85197850221105e-06, "loss": 0.80082822, "num_input_tokens_seen": 26333295, "step": 1244, "time_per_iteration": 2.6455764770507812 }, { "auxiliary_loss_clip": 0.01179317, "auxiliary_loss_mlp": 0.01058369, "balance_loss_clip": 1.05383873, "balance_loss_mlp": 1.03371656, "epoch": 0.14970239884566824, "flos": 33108560818560.0, "grad_norm": 2.646399709371552, "language_loss": 0.75470793, "learning_rate": 3.851684261099899e-06, "loss": 0.77708483, "num_input_tokens_seen": 26355035, "step": 1245, "time_per_iteration": 2.69229793548584 }, { "auxiliary_loss_clip": 0.01163861, "auxiliary_loss_mlp": 0.01059459, "balance_loss_clip": 1.05000114, "balance_loss_mlp": 1.03196955, "epoch": 0.14982264173630733, "flos": 17821245392640.0, "grad_norm": 2.0562565939896635, "language_loss": 0.86657465, "learning_rate": 3.851389739089718e-06, "loss": 0.88880777, "num_input_tokens_seen": 26371655, "step": 1246, "time_per_iteration": 2.60489559173584 }, { "auxiliary_loss_clip": 0.011907, "auxiliary_loss_mlp": 0.0106404, "balance_loss_clip": 1.05663931, "balance_loss_mlp": 1.03659856, "epoch": 0.14994288462694644, "flos": 32409559175040.0, "grad_norm": 1.7761842491995325, "language_loss": 0.80795276, "learning_rate": 3.851094936225186e-06, "loss": 0.83050013, "num_input_tokens_seen": 26392540, "step": 1247, "time_per_iteration": 2.6981101036071777 }, { "auxiliary_loss_clip": 0.01164366, "auxiliary_loss_mlp": 0.0105678, "balance_loss_clip": 1.05340481, "balance_loss_mlp": 1.03169847, "epoch": 0.15006312751758552, "flos": 31794661226880.0, "grad_norm": 1.5599605009646145, "language_loss": 0.76631832, "learning_rate": 3.850799852551024e-06, "loss": 0.78852975, "num_input_tokens_seen": 26414960, "step": 1248, "time_per_iteration": 2.7228338718414307 }, { "auxiliary_loss_clip": 0.01176826, "auxiliary_loss_mlp": 0.01066586, "balance_loss_clip": 1.05087507, "balance_loss_mlp": 1.03901255, "epoch": 0.1501833704082246, "flos": 16618022582400.0, "grad_norm": 3.844455392203056, "language_loss": 0.85906327, "learning_rate": 3.850504488111995e-06, "loss": 0.88149738, "num_input_tokens_seen": 26431635, "step": 1249, "time_per_iteration": 2.564070463180542 }, { "auxiliary_loss_clip": 0.01163962, "auxiliary_loss_mlp": 0.01054685, "balance_loss_clip": 1.0517683, "balance_loss_mlp": 1.03084278, "epoch": 0.15030361329886371, "flos": 23471178243840.0, "grad_norm": 2.0051547684300535, "language_loss": 0.82549781, "learning_rate": 3.850208842952907e-06, "loss": 0.84768426, "num_input_tokens_seen": 26450440, "step": 1250, "time_per_iteration": 2.7051169872283936 }, { "auxiliary_loss_clip": 0.01155437, "auxiliary_loss_mlp": 0.01064583, "balance_loss_clip": 1.05061364, "balance_loss_mlp": 1.03664064, "epoch": 0.1504238561895028, "flos": 25629409906560.0, "grad_norm": 5.13541664694514, "language_loss": 0.79068357, "learning_rate": 3.849912917118608e-06, "loss": 0.81288373, "num_input_tokens_seen": 26471480, "step": 1251, "time_per_iteration": 2.737846851348877 }, { "auxiliary_loss_clip": 0.01073955, "auxiliary_loss_mlp": 0.01010135, "balance_loss_clip": 1.02785897, "balance_loss_mlp": 1.00469899, "epoch": 0.15054409908014188, "flos": 52095146129280.0, "grad_norm": 0.8746167468057447, "language_loss": 0.59302974, "learning_rate": 3.849616710653992e-06, "loss": 0.61387062, "num_input_tokens_seen": 26532950, "step": 1252, "time_per_iteration": 3.114351272583008 }, { "auxiliary_loss_clip": 0.01186094, "auxiliary_loss_mlp": 0.01055091, "balance_loss_clip": 1.05304301, "balance_loss_mlp": 1.02930617, "epoch": 0.150664341970781, "flos": 18880251096960.0, "grad_norm": 1.8094805363632447, "language_loss": 0.75033867, "learning_rate": 3.84932022360399e-06, "loss": 0.77275056, "num_input_tokens_seen": 26551615, "step": 1253, "time_per_iteration": 2.5741827487945557 }, { "auxiliary_loss_clip": 0.01170673, "auxiliary_loss_mlp": 0.01058398, "balance_loss_clip": 1.055439, "balance_loss_mlp": 1.0323869, "epoch": 0.15078458486142007, "flos": 22163240309760.0, "grad_norm": 2.544475027328459, "language_loss": 0.83999175, "learning_rate": 3.849023456013581e-06, "loss": 0.86228245, "num_input_tokens_seen": 26569175, "step": 1254, "time_per_iteration": 2.631845712661743 }, { "auxiliary_loss_clip": 0.01193027, "auxiliary_loss_mlp": 0.01057341, "balance_loss_clip": 1.05619776, "balance_loss_mlp": 1.03192544, "epoch": 0.15090482775205916, "flos": 26651894457600.0, "grad_norm": 5.067725616038178, "language_loss": 0.62160146, "learning_rate": 3.848726407927784e-06, "loss": 0.6441052, "num_input_tokens_seen": 26589560, "step": 1255, "time_per_iteration": 2.6562657356262207 }, { "auxiliary_loss_clip": 0.01172468, "auxiliary_loss_mlp": 0.01057538, "balance_loss_clip": 1.05416107, "balance_loss_mlp": 1.03190827, "epoch": 0.15102507064269824, "flos": 21798998444160.0, "grad_norm": 16.5384539375802, "language_loss": 0.86509925, "learning_rate": 3.84842907939166e-06, "loss": 0.88739932, "num_input_tokens_seen": 26608785, "step": 1256, "time_per_iteration": 2.6381452083587646 }, { "auxiliary_loss_clip": 0.01149505, "auxiliary_loss_mlp": 0.0106318, "balance_loss_clip": 1.05116236, "balance_loss_mlp": 1.03762186, "epoch": 0.15114531353333735, "flos": 22820908377600.0, "grad_norm": 3.7882530423420455, "language_loss": 0.71524179, "learning_rate": 3.8481314704503146e-06, "loss": 0.73736858, "num_input_tokens_seen": 26628615, "step": 1257, "time_per_iteration": 2.6881303787231445 }, { "auxiliary_loss_clip": 0.01186016, "auxiliary_loss_mlp": 0.0106907, "balance_loss_clip": 1.05682802, "balance_loss_mlp": 1.04277229, "epoch": 0.15126555642397643, "flos": 19682674974720.0, "grad_norm": 2.3400193036516743, "language_loss": 0.8809402, "learning_rate": 3.847833581148895e-06, "loss": 0.90349102, "num_input_tokens_seen": 26647525, "step": 1258, "time_per_iteration": 2.6248371601104736 }, { "auxiliary_loss_clip": 0.012011, "auxiliary_loss_mlp": 0.01073217, "balance_loss_clip": 1.05373096, "balance_loss_mlp": 1.04620409, "epoch": 0.15138579931461552, "flos": 28726022424960.0, "grad_norm": 3.666978105442713, "language_loss": 0.80803621, "learning_rate": 3.84753541153259e-06, "loss": 0.83077937, "num_input_tokens_seen": 26667095, "step": 1259, "time_per_iteration": 2.632862091064453 }, { "auxiliary_loss_clip": 0.01189435, "auxiliary_loss_mlp": 0.01069763, "balance_loss_clip": 1.05602789, "balance_loss_mlp": 1.04313147, "epoch": 0.15150604220525463, "flos": 22127006465280.0, "grad_norm": 1.5804227216756588, "language_loss": 0.83207566, "learning_rate": 3.847236961646633e-06, "loss": 0.85466766, "num_input_tokens_seen": 26686075, "step": 1260, "time_per_iteration": 2.6644651889801025 }, { "auxiliary_loss_clip": 0.01167328, "auxiliary_loss_mlp": 0.01071492, "balance_loss_clip": 1.05355155, "balance_loss_mlp": 1.04450345, "epoch": 0.1516262850958937, "flos": 12968708515200.0, "grad_norm": 3.237335495106537, "language_loss": 0.78140974, "learning_rate": 3.846938231536296e-06, "loss": 0.80379808, "num_input_tokens_seen": 26701695, "step": 1261, "time_per_iteration": 2.5568251609802246 }, { "auxiliary_loss_clip": 0.01191918, "auxiliary_loss_mlp": 0.01058865, "balance_loss_clip": 1.05876088, "balance_loss_mlp": 1.03279424, "epoch": 0.1517465279865328, "flos": 21797130936960.0, "grad_norm": 4.013836277213972, "language_loss": 0.80911577, "learning_rate": 3.8466392212468995e-06, "loss": 0.83162355, "num_input_tokens_seen": 26721885, "step": 1262, "time_per_iteration": 2.6090097427368164 }, { "auxiliary_loss_clip": 0.01054238, "auxiliary_loss_mlp": 0.01038394, "balance_loss_clip": 1.02411413, "balance_loss_mlp": 1.03407848, "epoch": 0.15186677087717187, "flos": 58174569901440.0, "grad_norm": 0.8252595693006505, "language_loss": 0.61918217, "learning_rate": 3.8463399308238e-06, "loss": 0.64010853, "num_input_tokens_seen": 26780990, "step": 1263, "time_per_iteration": 4.256841421127319 }, { "auxiliary_loss_clip": 0.01186842, "auxiliary_loss_mlp": 0.010633, "balance_loss_clip": 1.05616176, "balance_loss_mlp": 1.03759873, "epoch": 0.15198701376781099, "flos": 32669696448000.0, "grad_norm": 2.426785168835401, "language_loss": 0.63936508, "learning_rate": 3.846040360312402e-06, "loss": 0.66186649, "num_input_tokens_seen": 26804250, "step": 1264, "time_per_iteration": 2.780104160308838 }, { "auxiliary_loss_clip": 0.01202581, "auxiliary_loss_mlp": 0.01058578, "balance_loss_clip": 1.05634844, "balance_loss_mlp": 1.03446245, "epoch": 0.15210725665845007, "flos": 28402575431040.0, "grad_norm": 2.193903035030968, "language_loss": 0.81557071, "learning_rate": 3.8457405097581485e-06, "loss": 0.83818233, "num_input_tokens_seen": 26823240, "step": 1265, "time_per_iteration": 3.6994993686676025 }, { "auxiliary_loss_clip": 0.0114726, "auxiliary_loss_mlp": 0.0105932, "balance_loss_clip": 1.04867125, "balance_loss_mlp": 1.03239071, "epoch": 0.15222749954908915, "flos": 19938179393280.0, "grad_norm": 1.836067005398632, "language_loss": 0.77728826, "learning_rate": 3.8454403792065275e-06, "loss": 0.79935408, "num_input_tokens_seen": 26842060, "step": 1266, "time_per_iteration": 2.8715932369232178 }, { "auxiliary_loss_clip": 0.01147472, "auxiliary_loss_mlp": 0.01071607, "balance_loss_clip": 1.05147827, "balance_loss_mlp": 1.04565525, "epoch": 0.15234774243972826, "flos": 21324223451520.0, "grad_norm": 2.156469150706767, "language_loss": 0.85303032, "learning_rate": 3.845139968703068e-06, "loss": 0.87522113, "num_input_tokens_seen": 26859580, "step": 1267, "time_per_iteration": 4.214510440826416 }, { "auxiliary_loss_clip": 0.0114618, "auxiliary_loss_mlp": 0.01073485, "balance_loss_clip": 1.04858398, "balance_loss_mlp": 1.04461265, "epoch": 0.15246798533036734, "flos": 25957812977280.0, "grad_norm": 2.084447477928313, "language_loss": 0.82934874, "learning_rate": 3.844839278293342e-06, "loss": 0.85154539, "num_input_tokens_seen": 26880430, "step": 1268, "time_per_iteration": 3.8703324794769287 }, { "auxiliary_loss_clip": 0.01207196, "auxiliary_loss_mlp": 0.01065868, "balance_loss_clip": 1.06089747, "balance_loss_mlp": 1.04005909, "epoch": 0.15258822822100643, "flos": 25811907932160.0, "grad_norm": 3.137655611100154, "language_loss": 0.75802398, "learning_rate": 3.8445383080229654e-06, "loss": 0.78075469, "num_input_tokens_seen": 26896445, "step": 1269, "time_per_iteration": 2.6181893348693848 }, { "auxiliary_loss_clip": 0.01166027, "auxiliary_loss_mlp": 0.01066536, "balance_loss_clip": 1.05066729, "balance_loss_mlp": 1.04047668, "epoch": 0.1527084711116455, "flos": 25265455349760.0, "grad_norm": 2.423118790241961, "language_loss": 0.73749518, "learning_rate": 3.844237057937593e-06, "loss": 0.75982076, "num_input_tokens_seen": 26915450, "step": 1270, "time_per_iteration": 2.82317852973938 }, { "auxiliary_loss_clip": 0.01191032, "auxiliary_loss_mlp": 0.01073818, "balance_loss_clip": 1.05488908, "balance_loss_mlp": 1.04911745, "epoch": 0.15282871400228462, "flos": 29240227572480.0, "grad_norm": 2.847604914417251, "language_loss": 0.77679813, "learning_rate": 3.843935528082926e-06, "loss": 0.79944658, "num_input_tokens_seen": 26936475, "step": 1271, "time_per_iteration": 2.7209792137145996 }, { "auxiliary_loss_clip": 0.01188469, "auxiliary_loss_mlp": 0.0107162, "balance_loss_clip": 1.05508745, "balance_loss_mlp": 1.04538202, "epoch": 0.1529489568929237, "flos": 20882952869760.0, "grad_norm": 1.862948988405418, "language_loss": 0.85008782, "learning_rate": 3.843633718504704e-06, "loss": 0.87268877, "num_input_tokens_seen": 26954920, "step": 1272, "time_per_iteration": 2.656811237335205 }, { "auxiliary_loss_clip": 0.01166937, "auxiliary_loss_mlp": 0.01060714, "balance_loss_clip": 1.05477095, "balance_loss_mlp": 1.03546572, "epoch": 0.1530691997835628, "flos": 20083833043200.0, "grad_norm": 3.039033006440757, "language_loss": 0.89762139, "learning_rate": 3.843331629248715e-06, "loss": 0.91989791, "num_input_tokens_seen": 26972520, "step": 1273, "time_per_iteration": 2.830214738845825 }, { "auxiliary_loss_clip": 0.01199157, "auxiliary_loss_mlp": 0.01056726, "balance_loss_clip": 1.05639791, "balance_loss_mlp": 1.03138268, "epoch": 0.1531894426742019, "flos": 28759814144640.0, "grad_norm": 4.223234529900979, "language_loss": 0.76639873, "learning_rate": 3.843029260360782e-06, "loss": 0.78895754, "num_input_tokens_seen": 26990890, "step": 1274, "time_per_iteration": 2.733403444290161 }, { "auxiliary_loss_clip": 0.01184949, "auxiliary_loss_mlp": 0.0106511, "balance_loss_clip": 1.05499363, "balance_loss_mlp": 1.03936076, "epoch": 0.15330968556484098, "flos": 22236282616320.0, "grad_norm": 1.9350584758704046, "language_loss": 0.79334164, "learning_rate": 3.8427266118867755e-06, "loss": 0.81584227, "num_input_tokens_seen": 27010640, "step": 1275, "time_per_iteration": 2.7708418369293213 }, { "auxiliary_loss_clip": 0.01174179, "auxiliary_loss_mlp": 0.0106038, "balance_loss_clip": 1.0533061, "balance_loss_mlp": 1.03327167, "epoch": 0.15342992845548006, "flos": 27527504296320.0, "grad_norm": 2.4576106872543972, "language_loss": 0.83041918, "learning_rate": 3.842423683872608e-06, "loss": 0.85276473, "num_input_tokens_seen": 27031215, "step": 1276, "time_per_iteration": 2.774834632873535 }, { "auxiliary_loss_clip": 0.01186283, "auxiliary_loss_mlp": 0.01066322, "balance_loss_clip": 1.05349731, "balance_loss_mlp": 1.03890336, "epoch": 0.15355017134611917, "flos": 19609596754560.0, "grad_norm": 4.908548275213185, "language_loss": 0.77570248, "learning_rate": 3.842120476364232e-06, "loss": 0.7982285, "num_input_tokens_seen": 27049665, "step": 1277, "time_per_iteration": 2.7246921062469482 }, { "auxiliary_loss_clip": 0.01191148, "auxiliary_loss_mlp": 0.01063784, "balance_loss_clip": 1.05408645, "balance_loss_mlp": 1.03675938, "epoch": 0.15367041423675826, "flos": 18478590238080.0, "grad_norm": 2.5387898252961625, "language_loss": 0.83840537, "learning_rate": 3.841816989407644e-06, "loss": 0.86095464, "num_input_tokens_seen": 27065155, "step": 1278, "time_per_iteration": 2.726262092590332 }, { "auxiliary_loss_clip": 0.01154306, "auxiliary_loss_mlp": 0.0105993, "balance_loss_clip": 1.05040836, "balance_loss_mlp": 1.03536129, "epoch": 0.15379065712739734, "flos": 41427662342400.0, "grad_norm": 2.2540083533813418, "language_loss": 0.76609564, "learning_rate": 3.841513223048884e-06, "loss": 0.78823805, "num_input_tokens_seen": 27085840, "step": 1279, "time_per_iteration": 2.911322593688965 }, { "auxiliary_loss_clip": 0.01159504, "auxiliary_loss_mlp": 0.01069524, "balance_loss_clip": 1.05283844, "balance_loss_mlp": 1.04083049, "epoch": 0.15391090001803642, "flos": 22054215553920.0, "grad_norm": 2.3089005300958445, "language_loss": 0.78221536, "learning_rate": 3.841209177334031e-06, "loss": 0.80450559, "num_input_tokens_seen": 27104200, "step": 1280, "time_per_iteration": 2.8340537548065186 }, { "auxiliary_loss_clip": 0.01183465, "auxiliary_loss_mlp": 0.010692, "balance_loss_clip": 1.05213833, "balance_loss_mlp": 1.04373717, "epoch": 0.15403114290867553, "flos": 15450351258240.0, "grad_norm": 1.9145342784595143, "language_loss": 0.74671084, "learning_rate": 3.84090485230921e-06, "loss": 0.76923752, "num_input_tokens_seen": 27122440, "step": 1281, "time_per_iteration": 2.7304255962371826 }, { "auxiliary_loss_clip": 0.01200236, "auxiliary_loss_mlp": 0.01071245, "balance_loss_clip": 1.05734634, "balance_loss_mlp": 1.04431605, "epoch": 0.15415138579931462, "flos": 17929156826880.0, "grad_norm": 2.8782717924180505, "language_loss": 0.76349163, "learning_rate": 3.840600248020588e-06, "loss": 0.78620642, "num_input_tokens_seen": 27139380, "step": 1282, "time_per_iteration": 2.7272558212280273 }, { "auxiliary_loss_clip": 0.01179747, "auxiliary_loss_mlp": 0.01062298, "balance_loss_clip": 1.05400038, "balance_loss_mlp": 1.03368735, "epoch": 0.1542716286899537, "flos": 11429325296640.0, "grad_norm": 2.291089602492229, "language_loss": 0.8009094, "learning_rate": 3.840295364514371e-06, "loss": 0.82332981, "num_input_tokens_seen": 27156760, "step": 1283, "time_per_iteration": 2.715891122817993 }, { "auxiliary_loss_clip": 0.01174049, "auxiliary_loss_mlp": 0.01058235, "balance_loss_clip": 1.05225706, "balance_loss_mlp": 1.03259265, "epoch": 0.1543918715805928, "flos": 17420338719360.0, "grad_norm": 3.42364805087563, "language_loss": 0.78472483, "learning_rate": 3.83999020183681e-06, "loss": 0.80704767, "num_input_tokens_seen": 27175455, "step": 1284, "time_per_iteration": 2.750802993774414 }, { "auxiliary_loss_clip": 0.01130124, "auxiliary_loss_mlp": 0.01067711, "balance_loss_clip": 1.05029655, "balance_loss_mlp": 1.04040003, "epoch": 0.1545121144712319, "flos": 17786376264960.0, "grad_norm": 1.9135615165634527, "language_loss": 0.78070521, "learning_rate": 3.839684760034199e-06, "loss": 0.80268359, "num_input_tokens_seen": 27193660, "step": 1285, "time_per_iteration": 2.858452558517456 }, { "auxiliary_loss_clip": 0.01151786, "auxiliary_loss_mlp": 0.01066929, "balance_loss_clip": 1.05030215, "balance_loss_mlp": 1.03933167, "epoch": 0.15463235736187098, "flos": 28220185146240.0, "grad_norm": 3.241953091177597, "language_loss": 0.65695661, "learning_rate": 3.8393790391528716e-06, "loss": 0.67914379, "num_input_tokens_seen": 27214355, "step": 1286, "time_per_iteration": 2.8570964336395264 }, { "auxiliary_loss_clip": 0.01167656, "auxiliary_loss_mlp": 0.01059647, "balance_loss_clip": 1.05173719, "balance_loss_mlp": 1.03327823, "epoch": 0.15475260025251006, "flos": 22856890826880.0, "grad_norm": 2.4320172223224916, "language_loss": 0.89109319, "learning_rate": 3.8390730392392075e-06, "loss": 0.9133662, "num_input_tokens_seen": 27234335, "step": 1287, "time_per_iteration": 2.9188928604125977 }, { "auxiliary_loss_clip": 0.01200921, "auxiliary_loss_mlp": 0.01064175, "balance_loss_clip": 1.05962276, "balance_loss_mlp": 1.03982043, "epoch": 0.15487284314314917, "flos": 17602872658560.0, "grad_norm": 2.42278120053436, "language_loss": 0.79484642, "learning_rate": 3.838766760339626e-06, "loss": 0.81749737, "num_input_tokens_seen": 27252860, "step": 1288, "time_per_iteration": 2.652101755142212 }, { "auxiliary_loss_clip": 0.01134023, "auxiliary_loss_mlp": 0.01070151, "balance_loss_clip": 1.04589081, "balance_loss_mlp": 1.04295921, "epoch": 0.15499308603378825, "flos": 20082037363200.0, "grad_norm": 3.591646074838867, "language_loss": 0.79680067, "learning_rate": 3.838460202500587e-06, "loss": 0.81884241, "num_input_tokens_seen": 27268650, "step": 1289, "time_per_iteration": 3.7825748920440674 }, { "auxiliary_loss_clip": 0.01155818, "auxiliary_loss_mlp": 0.0106749, "balance_loss_clip": 1.05479074, "balance_loss_mlp": 1.04101396, "epoch": 0.15511332892442733, "flos": 15918051271680.0, "grad_norm": 2.8168483942007483, "language_loss": 0.74216014, "learning_rate": 3.838153365768599e-06, "loss": 0.76439321, "num_input_tokens_seen": 27285160, "step": 1290, "time_per_iteration": 2.742047071456909 }, { "auxiliary_loss_clip": 0.01160742, "auxiliary_loss_mlp": 0.01064967, "balance_loss_clip": 1.05550337, "balance_loss_mlp": 1.03914678, "epoch": 0.15523357181506645, "flos": 41282475569280.0, "grad_norm": 3.9172690308061107, "language_loss": 0.75161147, "learning_rate": 3.837846250190206e-06, "loss": 0.77386856, "num_input_tokens_seen": 27308025, "step": 1291, "time_per_iteration": 3.909602165222168 }, { "auxiliary_loss_clip": 0.01136289, "auxiliary_loss_mlp": 0.00780398, "balance_loss_clip": 1.04877949, "balance_loss_mlp": 1.0002594, "epoch": 0.15535381470570553, "flos": 18478769806080.0, "grad_norm": 4.505546887845464, "language_loss": 0.77073789, "learning_rate": 3.837538855811998e-06, "loss": 0.78990483, "num_input_tokens_seen": 27326200, "step": 1292, "time_per_iteration": 2.8413007259368896 }, { "auxiliary_loss_clip": 0.01172243, "auxiliary_loss_mlp": 0.01060856, "balance_loss_clip": 1.05147648, "balance_loss_mlp": 1.03468943, "epoch": 0.1554740575963446, "flos": 13918150759680.0, "grad_norm": 2.155211778248624, "language_loss": 0.70865113, "learning_rate": 3.837231182680606e-06, "loss": 0.73098212, "num_input_tokens_seen": 27344165, "step": 1293, "time_per_iteration": 4.556642293930054 }, { "auxiliary_loss_clip": 0.01189077, "auxiliary_loss_mlp": 0.01060482, "balance_loss_clip": 1.05686235, "balance_loss_mlp": 1.03663981, "epoch": 0.1555943004869837, "flos": 20847078161280.0, "grad_norm": 1.8144858197304592, "language_loss": 0.75726789, "learning_rate": 3.836923230842706e-06, "loss": 0.77976346, "num_input_tokens_seen": 27363280, "step": 1294, "time_per_iteration": 3.6489059925079346 }, { "auxiliary_loss_clip": 0.01145906, "auxiliary_loss_mlp": 0.0105625, "balance_loss_clip": 1.046525, "balance_loss_mlp": 1.03185987, "epoch": 0.1557145433776228, "flos": 22085888371200.0, "grad_norm": 2.496813123207141, "language_loss": 0.80827469, "learning_rate": 3.836615000345011e-06, "loss": 0.83029628, "num_input_tokens_seen": 27381460, "step": 1295, "time_per_iteration": 2.8166275024414062 }, { "auxiliary_loss_clip": 0.01195908, "auxiliary_loss_mlp": 0.01057563, "balance_loss_clip": 1.0558486, "balance_loss_mlp": 1.03417432, "epoch": 0.1558347862682619, "flos": 19791987039360.0, "grad_norm": 2.2266513228230447, "language_loss": 0.77753198, "learning_rate": 3.836306491234282e-06, "loss": 0.80006665, "num_input_tokens_seen": 27399310, "step": 1296, "time_per_iteration": 2.751103162765503 }, { "auxiliary_loss_clip": 0.01166146, "auxiliary_loss_mlp": 0.01062942, "balance_loss_clip": 1.05525517, "balance_loss_mlp": 1.03843284, "epoch": 0.15595502915890097, "flos": 17237086508160.0, "grad_norm": 2.0804585773532276, "language_loss": 0.7542758, "learning_rate": 3.835997703557317e-06, "loss": 0.77656668, "num_input_tokens_seen": 27416050, "step": 1297, "time_per_iteration": 2.794973611831665 }, { "auxiliary_loss_clip": 0.01136906, "auxiliary_loss_mlp": 0.01049666, "balance_loss_clip": 1.04363859, "balance_loss_mlp": 1.0252285, "epoch": 0.15607527204954008, "flos": 19719519350400.0, "grad_norm": 2.720248535629578, "language_loss": 0.80039537, "learning_rate": 3.83568863736096e-06, "loss": 0.82226115, "num_input_tokens_seen": 27434920, "step": 1298, "time_per_iteration": 2.816765785217285 }, { "auxiliary_loss_clip": 0.01158709, "auxiliary_loss_mlp": 0.01059822, "balance_loss_clip": 1.05028391, "balance_loss_mlp": 1.03456116, "epoch": 0.15619551494017916, "flos": 18515650095360.0, "grad_norm": 2.284945383847158, "language_loss": 0.89546478, "learning_rate": 3.8353792926920975e-06, "loss": 0.9176501, "num_input_tokens_seen": 27453570, "step": 1299, "time_per_iteration": 2.8205015659332275 }, { "auxiliary_loss_clip": 0.01193467, "auxiliary_loss_mlp": 0.0107292, "balance_loss_clip": 1.05866385, "balance_loss_mlp": 1.04757643, "epoch": 0.15631575783081825, "flos": 19902125116800.0, "grad_norm": 2.3660379479290277, "language_loss": 0.81694281, "learning_rate": 3.835069669597655e-06, "loss": 0.8396067, "num_input_tokens_seen": 27471960, "step": 1300, "time_per_iteration": 2.744175910949707 }, { "auxiliary_loss_clip": 0.01193403, "auxiliary_loss_mlp": 0.00779927, "balance_loss_clip": 1.05551529, "balance_loss_mlp": 1.00020671, "epoch": 0.15643600072145733, "flos": 20777663128320.0, "grad_norm": 2.3994382222204687, "language_loss": 0.80067897, "learning_rate": 3.834759768124603e-06, "loss": 0.82041222, "num_input_tokens_seen": 27490835, "step": 1301, "time_per_iteration": 2.771306037902832 }, { "auxiliary_loss_clip": 0.01161984, "auxiliary_loss_mlp": 0.01071759, "balance_loss_clip": 1.05410838, "balance_loss_mlp": 1.04417431, "epoch": 0.15655624361209644, "flos": 18546389159040.0, "grad_norm": 2.6313567029805673, "language_loss": 0.76250827, "learning_rate": 3.834449588319953e-06, "loss": 0.78484571, "num_input_tokens_seen": 27508870, "step": 1302, "time_per_iteration": 2.848095417022705 }, { "auxiliary_loss_clip": 0.01179443, "auxiliary_loss_mlp": 0.01065434, "balance_loss_clip": 1.05547118, "balance_loss_mlp": 1.04297471, "epoch": 0.15667648650273552, "flos": 25229544727680.0, "grad_norm": 1.910777961427634, "language_loss": 0.8519007, "learning_rate": 3.834139130230758e-06, "loss": 0.87434947, "num_input_tokens_seen": 27528175, "step": 1303, "time_per_iteration": 2.698967218399048 }, { "auxiliary_loss_clip": 0.01173893, "auxiliary_loss_mlp": 0.01058711, "balance_loss_clip": 1.04963183, "balance_loss_mlp": 1.03430891, "epoch": 0.1567967293933746, "flos": 24827093769600.0, "grad_norm": 1.6437983578724622, "language_loss": 0.80982858, "learning_rate": 3.833828393904117e-06, "loss": 0.83215457, "num_input_tokens_seen": 27548455, "step": 1304, "time_per_iteration": 2.812882661819458 }, { "auxiliary_loss_clip": 0.0113547, "auxiliary_loss_mlp": 0.01059465, "balance_loss_clip": 1.04747772, "balance_loss_mlp": 1.03334618, "epoch": 0.15691697228401372, "flos": 19164555244800.0, "grad_norm": 2.2185413998381396, "language_loss": 0.77293193, "learning_rate": 3.833517379387165e-06, "loss": 0.79488122, "num_input_tokens_seen": 27564910, "step": 1305, "time_per_iteration": 2.7891581058502197 }, { "auxiliary_loss_clip": 0.01185139, "auxiliary_loss_mlp": 0.01065231, "balance_loss_clip": 1.05366755, "balance_loss_mlp": 1.04172254, "epoch": 0.1570372151746528, "flos": 24790931752320.0, "grad_norm": 2.1548029011181966, "language_loss": 0.88653123, "learning_rate": 3.833206086727085e-06, "loss": 0.90903497, "num_input_tokens_seen": 27584260, "step": 1306, "time_per_iteration": 2.7263448238372803 }, { "auxiliary_loss_clip": 0.01160436, "auxiliary_loss_mlp": 0.0106109, "balance_loss_clip": 1.04852295, "balance_loss_mlp": 1.03764153, "epoch": 0.15715745806529188, "flos": 24863650836480.0, "grad_norm": 2.5177594642322045, "language_loss": 0.70328665, "learning_rate": 3.8328945159710994e-06, "loss": 0.72550189, "num_input_tokens_seen": 27604440, "step": 1307, "time_per_iteration": 2.853736400604248 }, { "auxiliary_loss_clip": 0.01190454, "auxiliary_loss_mlp": 0.00778063, "balance_loss_clip": 1.05526876, "balance_loss_mlp": 1.00016201, "epoch": 0.157277700955931, "flos": 21872148491520.0, "grad_norm": 2.0425155179850685, "language_loss": 0.88487023, "learning_rate": 3.832582667166473e-06, "loss": 0.90455544, "num_input_tokens_seen": 27624250, "step": 1308, "time_per_iteration": 2.6882388591766357 }, { "auxiliary_loss_clip": 0.0117375, "auxiliary_loss_mlp": 0.01068044, "balance_loss_clip": 1.0553143, "balance_loss_mlp": 1.04076934, "epoch": 0.15739794384657008, "flos": 24533344344960.0, "grad_norm": 2.0473752062459267, "language_loss": 0.81550384, "learning_rate": 3.8322705403605125e-06, "loss": 0.83792174, "num_input_tokens_seen": 27644595, "step": 1309, "time_per_iteration": 2.818680763244629 }, { "auxiliary_loss_clip": 0.0116403, "auxiliary_loss_mlp": 0.01054155, "balance_loss_clip": 1.05286241, "balance_loss_mlp": 1.03149319, "epoch": 0.15751818673720916, "flos": 17745329998080.0, "grad_norm": 4.166263350499551, "language_loss": 0.8077988, "learning_rate": 3.831958135600568e-06, "loss": 0.82998067, "num_input_tokens_seen": 27662145, "step": 1310, "time_per_iteration": 2.7459986209869385 }, { "auxiliary_loss_clip": 0.01187984, "auxiliary_loss_mlp": 0.01065148, "balance_loss_clip": 1.05771184, "balance_loss_mlp": 1.03968501, "epoch": 0.15763842962784824, "flos": 17858520731520.0, "grad_norm": 2.5274987727135954, "language_loss": 0.79762119, "learning_rate": 3.831645452934032e-06, "loss": 0.82015252, "num_input_tokens_seen": 27680575, "step": 1311, "time_per_iteration": 2.694713830947876 }, { "auxiliary_loss_clip": 0.01200827, "auxiliary_loss_mlp": 0.01062382, "balance_loss_clip": 1.06022692, "balance_loss_mlp": 1.03842115, "epoch": 0.15775867251848735, "flos": 26980908059520.0, "grad_norm": 2.5526134884350338, "language_loss": 0.79790413, "learning_rate": 3.831332492408336e-06, "loss": 0.82053626, "num_input_tokens_seen": 27701985, "step": 1312, "time_per_iteration": 2.754063367843628 }, { "auxiliary_loss_clip": 0.01162975, "auxiliary_loss_mlp": 0.01062828, "balance_loss_clip": 1.04916668, "balance_loss_mlp": 1.03816402, "epoch": 0.15787891540912644, "flos": 19240398812160.0, "grad_norm": 2.3250793642319527, "language_loss": 0.68955201, "learning_rate": 3.831019254070957e-06, "loss": 0.71180999, "num_input_tokens_seen": 27719770, "step": 1313, "time_per_iteration": 2.7610554695129395 }, { "auxiliary_loss_clip": 0.01144292, "auxiliary_loss_mlp": 0.01062419, "balance_loss_clip": 1.04892182, "balance_loss_mlp": 1.03935194, "epoch": 0.15799915829976552, "flos": 27271102037760.0, "grad_norm": 4.1124945807183275, "language_loss": 0.94960415, "learning_rate": 3.8307057379694135e-06, "loss": 0.97167128, "num_input_tokens_seen": 27739105, "step": 1314, "time_per_iteration": 2.878473997116089 }, { "auxiliary_loss_clip": 0.01197992, "auxiliary_loss_mlp": 0.01063538, "balance_loss_clip": 1.05412626, "balance_loss_mlp": 1.03769302, "epoch": 0.15811940119040463, "flos": 20405520270720.0, "grad_norm": 2.6339370233443904, "language_loss": 0.8235532, "learning_rate": 3.830391944151264e-06, "loss": 0.84616846, "num_input_tokens_seen": 27754985, "step": 1315, "time_per_iteration": 3.6162009239196777 }, { "auxiliary_loss_clip": 0.01174244, "auxiliary_loss_mlp": 0.0106316, "balance_loss_clip": 1.0505023, "balance_loss_mlp": 1.03891301, "epoch": 0.1582396440810437, "flos": 32599347661440.0, "grad_norm": 2.124472415458349, "language_loss": 0.67150676, "learning_rate": 3.830077872664114e-06, "loss": 0.69388074, "num_input_tokens_seen": 27776110, "step": 1316, "time_per_iteration": 2.848763942718506 }, { "auxiliary_loss_clip": 0.01135978, "auxiliary_loss_mlp": 0.01055632, "balance_loss_clip": 1.05063081, "balance_loss_mlp": 1.03243375, "epoch": 0.1583598869716828, "flos": 33800559310080.0, "grad_norm": 2.4057524790945406, "language_loss": 0.7338196, "learning_rate": 3.829763523555604e-06, "loss": 0.75573564, "num_input_tokens_seen": 27796510, "step": 1317, "time_per_iteration": 3.910292387008667 }, { "auxiliary_loss_clip": 0.01172129, "auxiliary_loss_mlp": 0.01056213, "balance_loss_clip": 1.05504084, "balance_loss_mlp": 1.03294384, "epoch": 0.15848012986232188, "flos": 24681332378880.0, "grad_norm": 2.667078085220616, "language_loss": 0.78238696, "learning_rate": 3.829448896873423e-06, "loss": 0.80467039, "num_input_tokens_seen": 27815610, "step": 1318, "time_per_iteration": 2.755354166030884 }, { "auxiliary_loss_clip": 0.01135906, "auxiliary_loss_mlp": 0.00779998, "balance_loss_clip": 1.05555379, "balance_loss_mlp": 1.0002346, "epoch": 0.158600372752961, "flos": 22602068766720.0, "grad_norm": 2.2565479348435815, "language_loss": 0.79115731, "learning_rate": 3.829133992665299e-06, "loss": 0.81031632, "num_input_tokens_seen": 27834735, "step": 1319, "time_per_iteration": 4.188889265060425 }, { "auxiliary_loss_clip": 0.01172465, "auxiliary_loss_mlp": 0.01062663, "balance_loss_clip": 1.05371404, "balance_loss_mlp": 1.03762925, "epoch": 0.15872061564360007, "flos": 27927944092800.0, "grad_norm": 2.933142095495075, "language_loss": 0.89133883, "learning_rate": 3.828818810979002e-06, "loss": 0.91369015, "num_input_tokens_seen": 27853065, "step": 1320, "time_per_iteration": 3.648561716079712 }, { "auxiliary_loss_clip": 0.01196835, "auxiliary_loss_mlp": 0.01057709, "balance_loss_clip": 1.05804622, "balance_loss_mlp": 1.03418911, "epoch": 0.15884085853423915, "flos": 23696805525120.0, "grad_norm": 2.008538109035829, "language_loss": 0.80496502, "learning_rate": 3.8285033518623454e-06, "loss": 0.82751048, "num_input_tokens_seen": 27873315, "step": 1321, "time_per_iteration": 2.750234842300415 }, { "auxiliary_loss_clip": 0.01188182, "auxiliary_loss_mlp": 0.01069523, "balance_loss_clip": 1.05627823, "balance_loss_mlp": 1.04468, "epoch": 0.15896110142487826, "flos": 23112359331840.0, "grad_norm": 2.8030560586894104, "language_loss": 0.81179255, "learning_rate": 3.8281876153631845e-06, "loss": 0.83436966, "num_input_tokens_seen": 27890070, "step": 1322, "time_per_iteration": 2.747929096221924 }, { "auxiliary_loss_clip": 0.01137464, "auxiliary_loss_mlp": 0.01065804, "balance_loss_clip": 1.04710913, "balance_loss_mlp": 1.04030538, "epoch": 0.15908134431551735, "flos": 14685238632960.0, "grad_norm": 2.421074709103821, "language_loss": 0.64459467, "learning_rate": 3.827871601529416e-06, "loss": 0.66662741, "num_input_tokens_seen": 27908590, "step": 1323, "time_per_iteration": 2.8186826705932617 }, { "auxiliary_loss_clip": 0.01147949, "auxiliary_loss_mlp": 0.01067882, "balance_loss_clip": 1.04911256, "balance_loss_mlp": 1.04316974, "epoch": 0.15920158720615643, "flos": 20193611984640.0, "grad_norm": 3.178709243299198, "language_loss": 0.80451375, "learning_rate": 3.827555310408979e-06, "loss": 0.82667202, "num_input_tokens_seen": 27927985, "step": 1324, "time_per_iteration": 2.7750391960144043 }, { "auxiliary_loss_clip": 0.01157347, "auxiliary_loss_mlp": 0.01055268, "balance_loss_clip": 1.05586028, "balance_loss_mlp": 1.03024626, "epoch": 0.1593218300967955, "flos": 24826626892800.0, "grad_norm": 1.685547214023791, "language_loss": 0.82875252, "learning_rate": 3.827238742049854e-06, "loss": 0.8508786, "num_input_tokens_seen": 27948280, "step": 1325, "time_per_iteration": 2.8277299404144287 }, { "auxiliary_loss_clip": 0.01193918, "auxiliary_loss_mlp": 0.01056231, "balance_loss_clip": 1.05300999, "balance_loss_mlp": 1.03236485, "epoch": 0.15944207298743462, "flos": 28328707111680.0, "grad_norm": 2.5989063311635725, "language_loss": 0.5170083, "learning_rate": 3.826921896500066e-06, "loss": 0.53950977, "num_input_tokens_seen": 27969565, "step": 1326, "time_per_iteration": 2.7349579334259033 }, { "auxiliary_loss_clip": 0.01164699, "auxiliary_loss_mlp": 0.01066961, "balance_loss_clip": 1.05535924, "balance_loss_mlp": 1.04182005, "epoch": 0.1595623158780737, "flos": 22964838174720.0, "grad_norm": 1.945645335170272, "language_loss": 0.78115952, "learning_rate": 3.826604773807678e-06, "loss": 0.8034761, "num_input_tokens_seen": 27987540, "step": 1327, "time_per_iteration": 2.7977945804595947 }, { "auxiliary_loss_clip": 0.01166834, "auxiliary_loss_mlp": 0.01056966, "balance_loss_clip": 1.048123, "balance_loss_mlp": 1.03181279, "epoch": 0.1596825587687128, "flos": 19710540950400.0, "grad_norm": 2.6001267414028466, "language_loss": 0.73311806, "learning_rate": 3.826287374020798e-06, "loss": 0.75535607, "num_input_tokens_seen": 28002345, "step": 1328, "time_per_iteration": 2.680349349975586 }, { "auxiliary_loss_clip": 0.01194433, "auxiliary_loss_mlp": 0.01062348, "balance_loss_clip": 1.05587792, "balance_loss_mlp": 1.03761208, "epoch": 0.1598028016593519, "flos": 22637727993600.0, "grad_norm": 2.401206922925597, "language_loss": 0.82015967, "learning_rate": 3.825969697187575e-06, "loss": 0.84272742, "num_input_tokens_seen": 28021675, "step": 1329, "time_per_iteration": 2.67071795463562 }, { "auxiliary_loss_clip": 0.01152401, "auxiliary_loss_mlp": 0.01067919, "balance_loss_clip": 1.04863524, "balance_loss_mlp": 1.0410254, "epoch": 0.15992304454999098, "flos": 20482908122880.0, "grad_norm": 2.590276943337933, "language_loss": 0.69347847, "learning_rate": 3.8256517433562015e-06, "loss": 0.71568167, "num_input_tokens_seen": 28039615, "step": 1330, "time_per_iteration": 2.7003345489501953 }, { "auxiliary_loss_clip": 0.01197707, "auxiliary_loss_mlp": 0.0105677, "balance_loss_clip": 1.05770028, "balance_loss_mlp": 1.03291678, "epoch": 0.16004328744063007, "flos": 17676094533120.0, "grad_norm": 2.489600976151873, "language_loss": 0.92299271, "learning_rate": 3.82533351257491e-06, "loss": 0.94553739, "num_input_tokens_seen": 28057565, "step": 1331, "time_per_iteration": 2.6124305725097656 }, { "auxiliary_loss_clip": 0.0118044, "auxiliary_loss_mlp": 0.01056261, "balance_loss_clip": 1.05511045, "balance_loss_mlp": 1.03127444, "epoch": 0.16016353033126918, "flos": 24098717779200.0, "grad_norm": 2.4375070016543483, "language_loss": 0.88778454, "learning_rate": 3.825015004891975e-06, "loss": 0.91015154, "num_input_tokens_seen": 28076305, "step": 1332, "time_per_iteration": 2.7236509323120117 }, { "auxiliary_loss_clip": 0.01174033, "auxiliary_loss_mlp": 0.01059086, "balance_loss_clip": 1.05163717, "balance_loss_mlp": 1.037485, "epoch": 0.16028377322190826, "flos": 27634841112960.0, "grad_norm": 1.7991586674738302, "language_loss": 0.75837421, "learning_rate": 3.824696220355716e-06, "loss": 0.78070539, "num_input_tokens_seen": 28097895, "step": 1333, "time_per_iteration": 2.7139461040496826 }, { "auxiliary_loss_clip": 0.01164665, "auxiliary_loss_mlp": 0.01066345, "balance_loss_clip": 1.04915094, "balance_loss_mlp": 1.04169273, "epoch": 0.16040401611254734, "flos": 20961202648320.0, "grad_norm": 1.9106514966365873, "language_loss": 0.78978467, "learning_rate": 3.824377159014491e-06, "loss": 0.81209469, "num_input_tokens_seen": 28118790, "step": 1334, "time_per_iteration": 2.694004535675049 }, { "auxiliary_loss_clip": 0.01181133, "auxiliary_loss_mlp": 0.01062133, "balance_loss_clip": 1.05476582, "balance_loss_mlp": 1.03688455, "epoch": 0.16052425900318643, "flos": 21247051080960.0, "grad_norm": 1.9046105372029565, "language_loss": 0.84895933, "learning_rate": 3.824057820916702e-06, "loss": 0.87139201, "num_input_tokens_seen": 28135995, "step": 1335, "time_per_iteration": 2.6592602729797363 }, { "auxiliary_loss_clip": 0.01175068, "auxiliary_loss_mlp": 0.01052985, "balance_loss_clip": 1.0535264, "balance_loss_mlp": 1.02671099, "epoch": 0.16064450189382554, "flos": 15524004096000.0, "grad_norm": 2.465729397187327, "language_loss": 0.71696264, "learning_rate": 3.8237382061107904e-06, "loss": 0.73924315, "num_input_tokens_seen": 28152715, "step": 1336, "time_per_iteration": 2.625462532043457 }, { "auxiliary_loss_clip": 0.01102685, "auxiliary_loss_mlp": 0.01063366, "balance_loss_clip": 1.04021096, "balance_loss_mlp": 1.03787947, "epoch": 0.16076474478446462, "flos": 21178497974400.0, "grad_norm": 2.7298889495899803, "language_loss": 0.78959978, "learning_rate": 3.823418314645243e-06, "loss": 0.81126022, "num_input_tokens_seen": 28171590, "step": 1337, "time_per_iteration": 2.829833507537842 }, { "auxiliary_loss_clip": 0.01125663, "auxiliary_loss_mlp": 0.0105938, "balance_loss_clip": 1.05131721, "balance_loss_mlp": 1.03503776, "epoch": 0.1608849876751037, "flos": 18366476912640.0, "grad_norm": 1.7863715588835547, "language_loss": 0.75139666, "learning_rate": 3.823098146568588e-06, "loss": 0.77324706, "num_input_tokens_seen": 28191295, "step": 1338, "time_per_iteration": 2.755760669708252 }, { "auxiliary_loss_clip": 0.01182732, "auxiliary_loss_mlp": 0.01056536, "balance_loss_clip": 1.05556142, "balance_loss_mlp": 1.03489983, "epoch": 0.1610052305657428, "flos": 29497024880640.0, "grad_norm": 2.9509919114415526, "language_loss": 0.71328592, "learning_rate": 3.822777701929394e-06, "loss": 0.73567861, "num_input_tokens_seen": 28213120, "step": 1339, "time_per_iteration": 2.69789719581604 }, { "auxiliary_loss_clip": 0.01170705, "auxiliary_loss_mlp": 0.01067314, "balance_loss_clip": 1.05141604, "balance_loss_mlp": 1.04080176, "epoch": 0.1611254734563819, "flos": 26797871329920.0, "grad_norm": 2.099778238999609, "language_loss": 0.73235416, "learning_rate": 3.8224569807762714e-06, "loss": 0.75473434, "num_input_tokens_seen": 28232440, "step": 1340, "time_per_iteration": 2.748634099960327 }, { "auxiliary_loss_clip": 0.01117831, "auxiliary_loss_mlp": 0.01060459, "balance_loss_clip": 1.04246199, "balance_loss_mlp": 1.03538918, "epoch": 0.16124571634702098, "flos": 22419570741120.0, "grad_norm": 1.9917486489858356, "language_loss": 0.76372945, "learning_rate": 3.822135983157873e-06, "loss": 0.78551245, "num_input_tokens_seen": 28251715, "step": 1341, "time_per_iteration": 3.773326873779297 }, { "auxiliary_loss_clip": 0.01190549, "auxiliary_loss_mlp": 0.00778261, "balance_loss_clip": 1.05483723, "balance_loss_mlp": 1.00025773, "epoch": 0.16136595923766006, "flos": 10999116103680.0, "grad_norm": 3.0051814971236808, "language_loss": 0.84316313, "learning_rate": 3.821814709122896e-06, "loss": 0.86285126, "num_input_tokens_seen": 28269765, "step": 1342, "time_per_iteration": 2.5756213665008545 }, { "auxiliary_loss_clip": 0.0116498, "auxiliary_loss_mlp": 0.01059623, "balance_loss_clip": 1.05213201, "balance_loss_mlp": 1.03653252, "epoch": 0.16148620212829917, "flos": 21214983214080.0, "grad_norm": 2.1928789054161975, "language_loss": 0.84555173, "learning_rate": 3.821493158720076e-06, "loss": 0.86779767, "num_input_tokens_seen": 28288870, "step": 1343, "time_per_iteration": 3.7264931201934814 }, { "auxiliary_loss_clip": 0.0115404, "auxiliary_loss_mlp": 0.01064062, "balance_loss_clip": 1.04680359, "balance_loss_mlp": 1.03709698, "epoch": 0.16160644501893826, "flos": 16758468760320.0, "grad_norm": 3.5388870825750613, "language_loss": 0.73839951, "learning_rate": 3.821171331998191e-06, "loss": 0.76058054, "num_input_tokens_seen": 28305400, "step": 1344, "time_per_iteration": 2.6544134616851807 }, { "auxiliary_loss_clip": 0.01049128, "auxiliary_loss_mlp": 0.01016789, "balance_loss_clip": 1.02068865, "balance_loss_mlp": 1.01287925, "epoch": 0.16172668790957734, "flos": 64444967308800.0, "grad_norm": 0.7097246029772608, "language_loss": 0.54408133, "learning_rate": 3.820849229006064e-06, "loss": 0.56474042, "num_input_tokens_seen": 28373150, "step": 1345, "time_per_iteration": 4.514969110488892 }, { "auxiliary_loss_clip": 0.01196974, "auxiliary_loss_mlp": 0.01061426, "balance_loss_clip": 1.05569935, "balance_loss_mlp": 1.03764367, "epoch": 0.16184693080021645, "flos": 23257689759360.0, "grad_norm": 2.18114949968756, "language_loss": 0.7051127, "learning_rate": 3.8205268497925564e-06, "loss": 0.72769672, "num_input_tokens_seen": 28393620, "step": 1346, "time_per_iteration": 3.5735652446746826 }, { "auxiliary_loss_clip": 0.01197322, "auxiliary_loss_mlp": 0.01069538, "balance_loss_clip": 1.05655706, "balance_loss_mlp": 1.04532683, "epoch": 0.16196717369085553, "flos": 17451113696640.0, "grad_norm": 4.592187762149748, "language_loss": 0.78771365, "learning_rate": 3.8202041944065725e-06, "loss": 0.81038225, "num_input_tokens_seen": 28409440, "step": 1347, "time_per_iteration": 2.5689642429351807 }, { "auxiliary_loss_clip": 0.01194111, "auxiliary_loss_mlp": 0.01058727, "balance_loss_clip": 1.05522358, "balance_loss_mlp": 1.03633964, "epoch": 0.16208741658149461, "flos": 23873377806720.0, "grad_norm": 2.993174229061157, "language_loss": 0.73927164, "learning_rate": 3.819881262897061e-06, "loss": 0.76180005, "num_input_tokens_seen": 28427575, "step": 1348, "time_per_iteration": 2.6675050258636475 }, { "auxiliary_loss_clip": 0.01163849, "auxiliary_loss_mlp": 0.01061767, "balance_loss_clip": 1.05664372, "balance_loss_mlp": 1.03542209, "epoch": 0.1622076594721337, "flos": 25884806584320.0, "grad_norm": 2.8017280072767523, "language_loss": 0.73783082, "learning_rate": 3.819558055313008e-06, "loss": 0.76008701, "num_input_tokens_seen": 28448260, "step": 1349, "time_per_iteration": 2.9078450202941895 }, { "auxiliary_loss_clip": 0.01185788, "auxiliary_loss_mlp": 0.01058589, "balance_loss_clip": 1.05519533, "balance_loss_mlp": 1.03528321, "epoch": 0.1623279023627728, "flos": 21539759011200.0, "grad_norm": 1.8282791065730422, "language_loss": 0.7715224, "learning_rate": 3.819234571703444e-06, "loss": 0.79396623, "num_input_tokens_seen": 28467085, "step": 1350, "time_per_iteration": 2.646367073059082 }, { "auxiliary_loss_clip": 0.01174116, "auxiliary_loss_mlp": 0.0105851, "balance_loss_clip": 1.05048096, "balance_loss_mlp": 1.03264201, "epoch": 0.1624481452534119, "flos": 22085421494400.0, "grad_norm": 6.565576179039308, "language_loss": 0.85618019, "learning_rate": 3.8189108121174435e-06, "loss": 0.87850648, "num_input_tokens_seen": 28486850, "step": 1351, "time_per_iteration": 2.631819486618042 }, { "auxiliary_loss_clip": 0.01154082, "auxiliary_loss_mlp": 0.01067897, "balance_loss_clip": 1.05554712, "balance_loss_mlp": 1.0434587, "epoch": 0.16256838814405097, "flos": 27087490690560.0, "grad_norm": 1.6712848065332968, "language_loss": 0.83500242, "learning_rate": 3.818586776604118e-06, "loss": 0.8572222, "num_input_tokens_seen": 28507490, "step": 1352, "time_per_iteration": 2.747307777404785 }, { "auxiliary_loss_clip": 0.01159629, "auxiliary_loss_mlp": 0.01067212, "balance_loss_clip": 1.04997396, "balance_loss_mlp": 1.04275048, "epoch": 0.16268863103469008, "flos": 20120354196480.0, "grad_norm": 1.9999919760755704, "language_loss": 0.61827552, "learning_rate": 3.818262465212625e-06, "loss": 0.64054394, "num_input_tokens_seen": 28527615, "step": 1353, "time_per_iteration": 2.6782400608062744 }, { "auxiliary_loss_clip": 0.01174711, "auxiliary_loss_mlp": 0.01073297, "balance_loss_clip": 1.05636644, "balance_loss_mlp": 1.04827476, "epoch": 0.16280887392532917, "flos": 18332792933760.0, "grad_norm": 2.042034345521885, "language_loss": 0.77083772, "learning_rate": 3.817937877992161e-06, "loss": 0.79331779, "num_input_tokens_seen": 28544910, "step": 1354, "time_per_iteration": 2.6542470455169678 }, { "auxiliary_loss_clip": 0.01151394, "auxiliary_loss_mlp": 0.00779576, "balance_loss_clip": 1.04805279, "balance_loss_mlp": 1.00030839, "epoch": 0.16292911681596825, "flos": 11874330892800.0, "grad_norm": 2.2715519869442766, "language_loss": 0.8520056, "learning_rate": 3.817613014991967e-06, "loss": 0.8713153, "num_input_tokens_seen": 28561050, "step": 1355, "time_per_iteration": 2.6822519302368164 }, { "auxiliary_loss_clip": 0.01143842, "auxiliary_loss_mlp": 0.01069487, "balance_loss_clip": 1.0479461, "balance_loss_mlp": 1.04240215, "epoch": 0.16304935970660733, "flos": 26103466627200.0, "grad_norm": 2.9632600976712165, "language_loss": 0.76464427, "learning_rate": 3.817287876261323e-06, "loss": 0.78677762, "num_input_tokens_seen": 28581385, "step": 1356, "time_per_iteration": 2.6889493465423584 }, { "auxiliary_loss_clip": 0.01166306, "auxiliary_loss_mlp": 0.01061909, "balance_loss_clip": 1.05307806, "balance_loss_mlp": 1.03742361, "epoch": 0.16316960259724644, "flos": 29351945848320.0, "grad_norm": 1.704245719748679, "language_loss": 0.79953313, "learning_rate": 3.816962461849553e-06, "loss": 0.82181525, "num_input_tokens_seen": 28603255, "step": 1357, "time_per_iteration": 2.6874239444732666 }, { "auxiliary_loss_clip": 0.01158777, "auxiliary_loss_mlp": 0.01062082, "balance_loss_clip": 1.05438399, "balance_loss_mlp": 1.03578436, "epoch": 0.16328984548788553, "flos": 20886759711360.0, "grad_norm": 2.280771403775675, "language_loss": 0.84762114, "learning_rate": 3.8166367718060235e-06, "loss": 0.86982971, "num_input_tokens_seen": 28623145, "step": 1358, "time_per_iteration": 2.6634678840637207 }, { "auxiliary_loss_clip": 0.01174671, "auxiliary_loss_mlp": 0.01050359, "balance_loss_clip": 1.05038619, "balance_loss_mlp": 1.02687526, "epoch": 0.1634100883785246, "flos": 18041090584320.0, "grad_norm": 2.64613971689605, "language_loss": 0.76263005, "learning_rate": 3.816310806180139e-06, "loss": 0.78488034, "num_input_tokens_seen": 28641555, "step": 1359, "time_per_iteration": 2.637747287750244 }, { "auxiliary_loss_clip": 0.01171202, "auxiliary_loss_mlp": 0.01066049, "balance_loss_clip": 1.05417073, "balance_loss_mlp": 1.0403837, "epoch": 0.16353033126916372, "flos": 24572128055040.0, "grad_norm": 1.9052284483352455, "language_loss": 0.80776727, "learning_rate": 3.81598456502135e-06, "loss": 0.83013982, "num_input_tokens_seen": 28661575, "step": 1360, "time_per_iteration": 2.68255877494812 }, { "auxiliary_loss_clip": 0.01165595, "auxiliary_loss_mlp": 0.0105916, "balance_loss_clip": 1.05225635, "balance_loss_mlp": 1.03357744, "epoch": 0.1636505741598028, "flos": 19892895321600.0, "grad_norm": 2.0147548246038096, "language_loss": 0.87025577, "learning_rate": 3.8156580483791455e-06, "loss": 0.89250332, "num_input_tokens_seen": 28676765, "step": 1361, "time_per_iteration": 2.624321699142456 }, { "auxiliary_loss_clip": 0.01194023, "auxiliary_loss_mlp": 0.01057918, "balance_loss_clip": 1.055933, "balance_loss_mlp": 1.03576922, "epoch": 0.16377081705044189, "flos": 28402611344640.0, "grad_norm": 2.2025775496848623, "language_loss": 0.77068406, "learning_rate": 3.815331256303059e-06, "loss": 0.79320347, "num_input_tokens_seen": 28696795, "step": 1362, "time_per_iteration": 2.6584434509277344 }, { "auxiliary_loss_clip": 0.01157247, "auxiliary_loss_mlp": 0.01068832, "balance_loss_clip": 1.05553985, "balance_loss_mlp": 1.04677784, "epoch": 0.163891059941081, "flos": 21908059113600.0, "grad_norm": 2.8694026272341, "language_loss": 0.77109563, "learning_rate": 3.815004188842665e-06, "loss": 0.79335642, "num_input_tokens_seen": 28714835, "step": 1363, "time_per_iteration": 2.7093474864959717 }, { "auxiliary_loss_clip": 0.01156233, "auxiliary_loss_mlp": 0.01064787, "balance_loss_clip": 1.04574299, "balance_loss_mlp": 1.03615308, "epoch": 0.16401130283172008, "flos": 26797619934720.0, "grad_norm": 1.5752595046562214, "language_loss": 0.79379702, "learning_rate": 3.814676846047578e-06, "loss": 0.81600726, "num_input_tokens_seen": 28735710, "step": 1364, "time_per_iteration": 2.6906509399414062 }, { "auxiliary_loss_clip": 0.01177343, "auxiliary_loss_mlp": 0.01065914, "balance_loss_clip": 1.05113816, "balance_loss_mlp": 1.04147601, "epoch": 0.16413154572235916, "flos": 32997417160320.0, "grad_norm": 1.9754581761079575, "language_loss": 0.69957781, "learning_rate": 3.8143492279674565e-06, "loss": 0.72201037, "num_input_tokens_seen": 28758405, "step": 1365, "time_per_iteration": 2.7025880813598633 }, { "auxiliary_loss_clip": 0.01056966, "auxiliary_loss_mlp": 0.0103123, "balance_loss_clip": 1.03151023, "balance_loss_mlp": 1.02724838, "epoch": 0.16425178861299825, "flos": 40113622074240.0, "grad_norm": 0.8425939015285457, "language_loss": 0.58386666, "learning_rate": 3.8140213346519997e-06, "loss": 0.60474861, "num_input_tokens_seen": 28809000, "step": 1366, "time_per_iteration": 3.010856866836548 }, { "auxiliary_loss_clip": 0.01143096, "auxiliary_loss_mlp": 0.01063605, "balance_loss_clip": 1.04868054, "balance_loss_mlp": 1.03798723, "epoch": 0.16437203150363736, "flos": 25447486498560.0, "grad_norm": 1.656083121129761, "language_loss": 0.76957721, "learning_rate": 3.813693166150948e-06, "loss": 0.79164422, "num_input_tokens_seen": 28829210, "step": 1367, "time_per_iteration": 3.694936513900757 }, { "auxiliary_loss_clip": 0.01147631, "auxiliary_loss_mlp": 0.01062485, "balance_loss_clip": 1.05091381, "balance_loss_mlp": 1.03665233, "epoch": 0.16449227439427644, "flos": 23476888506240.0, "grad_norm": 2.4583149211178963, "language_loss": 0.85595173, "learning_rate": 3.813364722514086e-06, "loss": 0.87805289, "num_input_tokens_seen": 28847545, "step": 1368, "time_per_iteration": 2.7374846935272217 }, { "auxiliary_loss_clip": 0.0117657, "auxiliary_loss_mlp": 0.01062907, "balance_loss_clip": 1.0518204, "balance_loss_mlp": 1.03893399, "epoch": 0.16461251728491552, "flos": 13545217802880.0, "grad_norm": 2.3431651203092096, "language_loss": 0.80204695, "learning_rate": 3.8130360037912368e-06, "loss": 0.82444173, "num_input_tokens_seen": 28863990, "step": 1369, "time_per_iteration": 3.4809553623199463 }, { "auxiliary_loss_clip": 0.01179416, "auxiliary_loss_mlp": 0.0106356, "balance_loss_clip": 1.05239987, "balance_loss_mlp": 1.03947949, "epoch": 0.16473276017555463, "flos": 23003298662400.0, "grad_norm": 2.130349787971804, "language_loss": 0.81965709, "learning_rate": 3.812707010032268e-06, "loss": 0.84208691, "num_input_tokens_seen": 28883045, "step": 1370, "time_per_iteration": 2.656616449356079 }, { "auxiliary_loss_clip": 0.01188984, "auxiliary_loss_mlp": 0.01066117, "balance_loss_clip": 1.05684924, "balance_loss_mlp": 1.04020119, "epoch": 0.16485300306619372, "flos": 24790680357120.0, "grad_norm": 1.9057050964840234, "language_loss": 0.79314661, "learning_rate": 3.8123777412870863e-06, "loss": 0.81569755, "num_input_tokens_seen": 28902545, "step": 1371, "time_per_iteration": 3.640247344970703 }, { "auxiliary_loss_clip": 0.01173003, "auxiliary_loss_mlp": 0.01063123, "balance_loss_clip": 1.05200779, "balance_loss_mlp": 1.03692031, "epoch": 0.1649732459568328, "flos": 21106497162240.0, "grad_norm": 2.419907852005554, "language_loss": 0.78625923, "learning_rate": 3.812048197605643e-06, "loss": 0.80862045, "num_input_tokens_seen": 28921440, "step": 1372, "time_per_iteration": 3.605276107788086 }, { "auxiliary_loss_clip": 0.01182309, "auxiliary_loss_mlp": 0.01061659, "balance_loss_clip": 1.05104661, "balance_loss_mlp": 1.0366962, "epoch": 0.16509348884747188, "flos": 20266726118400.0, "grad_norm": 6.225314346207793, "language_loss": 0.81106305, "learning_rate": 3.8117183790379277e-06, "loss": 0.83350277, "num_input_tokens_seen": 28939890, "step": 1373, "time_per_iteration": 2.643150568008423 }, { "auxiliary_loss_clip": 0.01193528, "auxiliary_loss_mlp": 0.01060172, "balance_loss_clip": 1.05301785, "balance_loss_mlp": 1.0367955, "epoch": 0.165213731738111, "flos": 11035493602560.0, "grad_norm": 2.796666796080538, "language_loss": 0.93672597, "learning_rate": 3.811388285633976e-06, "loss": 0.95926303, "num_input_tokens_seen": 28955875, "step": 1374, "time_per_iteration": 2.609790563583374 }, { "auxiliary_loss_clip": 0.01147771, "auxiliary_loss_mlp": 0.01056895, "balance_loss_clip": 1.04969084, "balance_loss_mlp": 1.033149, "epoch": 0.16533397462875007, "flos": 29972051268480.0, "grad_norm": 2.600164981647488, "language_loss": 0.62190771, "learning_rate": 3.811057917443861e-06, "loss": 0.6439544, "num_input_tokens_seen": 28975140, "step": 1375, "time_per_iteration": 2.8211162090301514 }, { "auxiliary_loss_clip": 0.0106928, "auxiliary_loss_mlp": 0.01030586, "balance_loss_clip": 1.02927864, "balance_loss_mlp": 1.02700996, "epoch": 0.16545421751938916, "flos": 65556763027200.0, "grad_norm": 0.8611878520125069, "language_loss": 0.68272138, "learning_rate": 3.8107272745177e-06, "loss": 0.70371997, "num_input_tokens_seen": 29047470, "step": 1376, "time_per_iteration": 3.362804651260376 }, { "auxiliary_loss_clip": 0.01156648, "auxiliary_loss_mlp": 0.01059318, "balance_loss_clip": 1.05237782, "balance_loss_mlp": 1.03446329, "epoch": 0.16557446041002827, "flos": 22492361652480.0, "grad_norm": 2.2259172678034087, "language_loss": 0.78459793, "learning_rate": 3.8103963569056513e-06, "loss": 0.80675757, "num_input_tokens_seen": 29066605, "step": 1377, "time_per_iteration": 2.7103371620178223 }, { "auxiliary_loss_clip": 0.01156265, "auxiliary_loss_mlp": 0.01058579, "balance_loss_clip": 1.04713893, "balance_loss_mlp": 1.0345583, "epoch": 0.16569470330066735, "flos": 24602723464320.0, "grad_norm": 1.7649452965589467, "language_loss": 0.87944221, "learning_rate": 3.8100651646579146e-06, "loss": 0.9015907, "num_input_tokens_seen": 29085815, "step": 1378, "time_per_iteration": 2.7012462615966797 }, { "auxiliary_loss_clip": 0.01156686, "auxiliary_loss_mlp": 0.01062023, "balance_loss_clip": 1.04839349, "balance_loss_mlp": 1.03715587, "epoch": 0.16581494619130643, "flos": 15006207588480.0, "grad_norm": 2.4186258979155393, "language_loss": 0.9235729, "learning_rate": 3.8097336978247317e-06, "loss": 0.94575995, "num_input_tokens_seen": 29102520, "step": 1379, "time_per_iteration": 2.605243682861328 }, { "auxiliary_loss_clip": 0.01154077, "auxiliary_loss_mlp": 0.0106321, "balance_loss_clip": 1.0495007, "balance_loss_mlp": 1.03734159, "epoch": 0.16593518908194552, "flos": 17420338719360.0, "grad_norm": 2.167721874553351, "language_loss": 0.88846087, "learning_rate": 3.8094019564563854e-06, "loss": 0.91063374, "num_input_tokens_seen": 29119450, "step": 1380, "time_per_iteration": 2.7091991901397705 }, { "auxiliary_loss_clip": 0.01196833, "auxiliary_loss_mlp": 0.00778804, "balance_loss_clip": 1.05611765, "balance_loss_mlp": 1.00031424, "epoch": 0.16605543197258463, "flos": 20412631163520.0, "grad_norm": 2.1702868621471567, "language_loss": 0.75244844, "learning_rate": 3.809069940603201e-06, "loss": 0.77220482, "num_input_tokens_seen": 29137405, "step": 1381, "time_per_iteration": 2.623821258544922 }, { "auxiliary_loss_clip": 0.01153488, "auxiliary_loss_mlp": 0.01061042, "balance_loss_clip": 1.05021644, "balance_loss_mlp": 1.03787971, "epoch": 0.1661756748632237, "flos": 14209745368320.0, "grad_norm": 3.107198550749946, "language_loss": 0.78341728, "learning_rate": 3.8087376503155452e-06, "loss": 0.80556256, "num_input_tokens_seen": 29154890, "step": 1382, "time_per_iteration": 2.6822972297668457 }, { "auxiliary_loss_clip": 0.01057312, "auxiliary_loss_mlp": 0.01011207, "balance_loss_clip": 1.02180982, "balance_loss_mlp": 1.00741577, "epoch": 0.1662959177538628, "flos": 66080877350400.0, "grad_norm": 0.9031358032406831, "language_loss": 0.56184751, "learning_rate": 3.808405085643826e-06, "loss": 0.5825327, "num_input_tokens_seen": 29219770, "step": 1383, "time_per_iteration": 3.325075626373291 }, { "auxiliary_loss_clip": 0.01189933, "auxiliary_loss_mlp": 0.00778466, "balance_loss_clip": 1.05435658, "balance_loss_mlp": 1.00042427, "epoch": 0.1664161606445019, "flos": 20740567357440.0, "grad_norm": 2.047984215510872, "language_loss": 0.88543737, "learning_rate": 3.8080722466384925e-06, "loss": 0.90512133, "num_input_tokens_seen": 29237620, "step": 1384, "time_per_iteration": 2.578726053237915 }, { "auxiliary_loss_clip": 0.0119989, "auxiliary_loss_mlp": 0.0106749, "balance_loss_clip": 1.05329633, "balance_loss_mlp": 1.04162169, "epoch": 0.166536403535141, "flos": 25260930236160.0, "grad_norm": 2.861435594140036, "language_loss": 0.70947778, "learning_rate": 3.8077391333500376e-06, "loss": 0.73215163, "num_input_tokens_seen": 29256760, "step": 1385, "time_per_iteration": 2.6511778831481934 }, { "auxiliary_loss_clip": 0.01164126, "auxiliary_loss_mlp": 0.0106987, "balance_loss_clip": 1.05231977, "balance_loss_mlp": 1.04681516, "epoch": 0.16665664642578007, "flos": 25447450584960.0, "grad_norm": 1.7708426056785607, "language_loss": 0.76666373, "learning_rate": 3.8074057458289934e-06, "loss": 0.78900373, "num_input_tokens_seen": 29277450, "step": 1386, "time_per_iteration": 2.7137081623077393 }, { "auxiliary_loss_clip": 0.01169341, "auxiliary_loss_mlp": 0.01066634, "balance_loss_clip": 1.05140269, "balance_loss_mlp": 1.0412066, "epoch": 0.16677688931641918, "flos": 22200767043840.0, "grad_norm": 2.1524601161430157, "language_loss": 0.82886624, "learning_rate": 3.807072084125934e-06, "loss": 0.85122597, "num_input_tokens_seen": 29299300, "step": 1387, "time_per_iteration": 2.6773643493652344 }, { "auxiliary_loss_clip": 0.01163076, "auxiliary_loss_mlp": 0.01067647, "balance_loss_clip": 1.05195343, "balance_loss_mlp": 1.04338765, "epoch": 0.16689713220705826, "flos": 16945958776320.0, "grad_norm": 2.891927430595892, "language_loss": 0.80499339, "learning_rate": 3.806738148291477e-06, "loss": 0.82730067, "num_input_tokens_seen": 29316125, "step": 1388, "time_per_iteration": 2.6396474838256836 }, { "auxiliary_loss_clip": 0.01126384, "auxiliary_loss_mlp": 0.01068463, "balance_loss_clip": 1.04278922, "balance_loss_mlp": 1.044204, "epoch": 0.16701737509769735, "flos": 36244423923840.0, "grad_norm": 1.9592031592887813, "language_loss": 0.71685958, "learning_rate": 3.8064039383762793e-06, "loss": 0.73880804, "num_input_tokens_seen": 29338490, "step": 1389, "time_per_iteration": 2.8679518699645996 }, { "auxiliary_loss_clip": 0.01179919, "auxiliary_loss_mlp": 0.01067696, "balance_loss_clip": 1.05643272, "balance_loss_mlp": 1.04230452, "epoch": 0.16713761798833643, "flos": 23258659426560.0, "grad_norm": 2.0392104407452596, "language_loss": 0.76790988, "learning_rate": 3.8060694544310396e-06, "loss": 0.79038596, "num_input_tokens_seen": 29357000, "step": 1390, "time_per_iteration": 2.6465837955474854 }, { "auxiliary_loss_clip": 0.01197054, "auxiliary_loss_mlp": 0.01056867, "balance_loss_clip": 1.05521464, "balance_loss_mlp": 1.03178537, "epoch": 0.16725786087897554, "flos": 25302515207040.0, "grad_norm": 2.3534534873613335, "language_loss": 0.78852016, "learning_rate": 3.8057346965065006e-06, "loss": 0.81105936, "num_input_tokens_seen": 29378230, "step": 1391, "time_per_iteration": 2.6120219230651855 }, { "auxiliary_loss_clip": 0.01162473, "auxiliary_loss_mlp": 0.01061622, "balance_loss_clip": 1.05096519, "balance_loss_mlp": 1.03812611, "epoch": 0.16737810376961462, "flos": 31831541516160.0, "grad_norm": 2.5345209651133658, "language_loss": 0.84566343, "learning_rate": 3.805399664653443e-06, "loss": 0.86790442, "num_input_tokens_seen": 29400370, "step": 1392, "time_per_iteration": 2.7660608291625977 }, { "auxiliary_loss_clip": 0.01195608, "auxiliary_loss_mlp": 0.01071055, "balance_loss_clip": 1.05574918, "balance_loss_mlp": 1.04529333, "epoch": 0.1674983466602537, "flos": 27961843553280.0, "grad_norm": 5.700089807378857, "language_loss": 0.7420696, "learning_rate": 3.805064358922692e-06, "loss": 0.76473624, "num_input_tokens_seen": 29418660, "step": 1393, "time_per_iteration": 3.60117244720459 }, { "auxiliary_loss_clip": 0.01188985, "auxiliary_loss_mlp": 0.01073615, "balance_loss_clip": 1.05639708, "balance_loss_mlp": 1.04786634, "epoch": 0.16761858955089282, "flos": 21762656858880.0, "grad_norm": 1.9107859367703595, "language_loss": 0.81134468, "learning_rate": 3.8047287793651136e-06, "loss": 0.83397067, "num_input_tokens_seen": 29440105, "step": 1394, "time_per_iteration": 2.6635918617248535 }, { "auxiliary_loss_clip": 0.01156737, "auxiliary_loss_mlp": 0.01063535, "balance_loss_clip": 1.05109525, "balance_loss_mlp": 1.03970551, "epoch": 0.1677388324415319, "flos": 23805507058560.0, "grad_norm": 2.1573235963187836, "language_loss": 0.88546181, "learning_rate": 3.8043929260316137e-06, "loss": 0.90766454, "num_input_tokens_seen": 29458260, "step": 1395, "time_per_iteration": 3.73319411277771 }, { "auxiliary_loss_clip": 0.01173189, "auxiliary_loss_mlp": 0.0106941, "balance_loss_clip": 1.05774403, "balance_loss_mlp": 1.0447216, "epoch": 0.16785907533217098, "flos": 20558859431040.0, "grad_norm": 2.309624988672776, "language_loss": 0.83626294, "learning_rate": 3.8040567989731417e-06, "loss": 0.85868895, "num_input_tokens_seen": 29476205, "step": 1396, "time_per_iteration": 2.669771194458008 }, { "auxiliary_loss_clip": 0.01174982, "auxiliary_loss_mlp": 0.01055008, "balance_loss_clip": 1.0531441, "balance_loss_mlp": 1.03145242, "epoch": 0.16797931822281006, "flos": 15669657745920.0, "grad_norm": 2.135015191324893, "language_loss": 0.80136311, "learning_rate": 3.8037203982406876e-06, "loss": 0.82366306, "num_input_tokens_seen": 29494370, "step": 1397, "time_per_iteration": 3.597008466720581 }, { "auxiliary_loss_clip": 0.01195606, "auxiliary_loss_mlp": 0.01054098, "balance_loss_clip": 1.05648673, "balance_loss_mlp": 1.02905178, "epoch": 0.16809956111344918, "flos": 16541101607040.0, "grad_norm": 1.8397297070143925, "language_loss": 0.7323637, "learning_rate": 3.8033837238852835e-06, "loss": 0.75486076, "num_input_tokens_seen": 29511070, "step": 1398, "time_per_iteration": 3.5052402019500732 }, { "auxiliary_loss_clip": 0.01154978, "auxiliary_loss_mlp": 0.01063167, "balance_loss_clip": 1.04996634, "balance_loss_mlp": 1.04027903, "epoch": 0.16821980400408826, "flos": 23258084808960.0, "grad_norm": 1.9128133194369599, "language_loss": 0.69639945, "learning_rate": 3.8030467759580017e-06, "loss": 0.71858096, "num_input_tokens_seen": 29531990, "step": 1399, "time_per_iteration": 2.679929733276367 }, { "auxiliary_loss_clip": 0.01179756, "auxiliary_loss_mlp": 0.01052015, "balance_loss_clip": 1.05167174, "balance_loss_mlp": 1.02816129, "epoch": 0.16834004689472734, "flos": 20774754126720.0, "grad_norm": 2.117711861063973, "language_loss": 0.86960727, "learning_rate": 3.802709554509958e-06, "loss": 0.89192498, "num_input_tokens_seen": 29549790, "step": 1400, "time_per_iteration": 2.643657922744751 }, { "auxiliary_loss_clip": 0.01162657, "auxiliary_loss_mlp": 0.01050407, "balance_loss_clip": 1.04696441, "balance_loss_mlp": 1.026613, "epoch": 0.16846028978536645, "flos": 26687302289280.0, "grad_norm": 2.980757109440259, "language_loss": 0.79317069, "learning_rate": 3.8023720595923083e-06, "loss": 0.81530136, "num_input_tokens_seen": 29569045, "step": 1401, "time_per_iteration": 2.6895265579223633 }, { "auxiliary_loss_clip": 0.01138281, "auxiliary_loss_mlp": 0.01056795, "balance_loss_clip": 1.04839587, "balance_loss_mlp": 1.03334665, "epoch": 0.16858053267600553, "flos": 18843298980480.0, "grad_norm": 2.9371970505271983, "language_loss": 0.875, "learning_rate": 3.80203429125625e-06, "loss": 0.89695084, "num_input_tokens_seen": 29587220, "step": 1402, "time_per_iteration": 2.7002058029174805 }, { "auxiliary_loss_clip": 0.01112708, "auxiliary_loss_mlp": 0.01051616, "balance_loss_clip": 1.04659915, "balance_loss_mlp": 1.02884698, "epoch": 0.16870077556664462, "flos": 27744548227200.0, "grad_norm": 2.60254987978506, "language_loss": 0.70287311, "learning_rate": 3.8016962495530225e-06, "loss": 0.72451639, "num_input_tokens_seen": 29606410, "step": 1403, "time_per_iteration": 2.8101871013641357 }, { "auxiliary_loss_clip": 0.01194386, "auxiliary_loss_mlp": 0.01055829, "balance_loss_clip": 1.05626941, "balance_loss_mlp": 1.03218973, "epoch": 0.1688210184572837, "flos": 13730768484480.0, "grad_norm": 2.379146737717254, "language_loss": 0.76177192, "learning_rate": 3.8013579345339063e-06, "loss": 0.78427404, "num_input_tokens_seen": 29621275, "step": 1404, "time_per_iteration": 2.6141457557678223 }, { "auxiliary_loss_clip": 0.01157104, "auxiliary_loss_mlp": 0.01055069, "balance_loss_clip": 1.05085278, "balance_loss_mlp": 1.03166831, "epoch": 0.1689412613479228, "flos": 26468785900800.0, "grad_norm": 2.629019900257882, "language_loss": 0.69131911, "learning_rate": 3.801019346250224e-06, "loss": 0.71344078, "num_input_tokens_seen": 29641420, "step": 1405, "time_per_iteration": 2.697331666946411 }, { "auxiliary_loss_clip": 0.01174826, "auxiliary_loss_mlp": 0.01060163, "balance_loss_clip": 1.05243993, "balance_loss_mlp": 1.0369885, "epoch": 0.1690615042385619, "flos": 21138852337920.0, "grad_norm": 2.8171914006392775, "language_loss": 0.83488792, "learning_rate": 3.8006804847533395e-06, "loss": 0.85723782, "num_input_tokens_seen": 29660935, "step": 1406, "time_per_iteration": 2.6665329933166504 }, { "auxiliary_loss_clip": 0.01192228, "auxiliary_loss_mlp": 0.01055035, "balance_loss_clip": 1.05310261, "balance_loss_mlp": 1.03099012, "epoch": 0.16918174712920098, "flos": 20849340718080.0, "grad_norm": 2.2496790741228523, "language_loss": 0.85529977, "learning_rate": 3.8003413500946556e-06, "loss": 0.87777239, "num_input_tokens_seen": 29681045, "step": 1407, "time_per_iteration": 2.6513755321502686 }, { "auxiliary_loss_clip": 0.01168854, "auxiliary_loss_mlp": 0.01066691, "balance_loss_clip": 1.0542419, "balance_loss_mlp": 1.04243183, "epoch": 0.1693019900198401, "flos": 16983270028800.0, "grad_norm": 3.0844183854442386, "language_loss": 0.83181983, "learning_rate": 3.8000019423256216e-06, "loss": 0.85417527, "num_input_tokens_seen": 29698810, "step": 1408, "time_per_iteration": 2.709416151046753 }, { "auxiliary_loss_clip": 0.01152399, "auxiliary_loss_mlp": 0.01052807, "balance_loss_clip": 1.05162454, "balance_loss_mlp": 1.02884626, "epoch": 0.16942223291047917, "flos": 26796901662720.0, "grad_norm": 1.8071046993182864, "language_loss": 0.88016242, "learning_rate": 3.7996622614977234e-06, "loss": 0.90221453, "num_input_tokens_seen": 29720000, "step": 1409, "time_per_iteration": 2.727043390274048 }, { "auxiliary_loss_clip": 0.01156242, "auxiliary_loss_mlp": 0.01055263, "balance_loss_clip": 1.05226862, "balance_loss_mlp": 1.03254175, "epoch": 0.16954247580111825, "flos": 18583700411520.0, "grad_norm": 1.858627990477496, "language_loss": 0.79060566, "learning_rate": 3.799322307662492e-06, "loss": 0.81272066, "num_input_tokens_seen": 29737820, "step": 1410, "time_per_iteration": 2.6674859523773193 }, { "auxiliary_loss_clip": 0.01142808, "auxiliary_loss_mlp": 0.01059341, "balance_loss_clip": 1.04577303, "balance_loss_mlp": 1.03517711, "epoch": 0.16966271869175734, "flos": 13983651210240.0, "grad_norm": 4.922776500758553, "language_loss": 0.83502734, "learning_rate": 3.798982080871496e-06, "loss": 0.85704887, "num_input_tokens_seen": 29752960, "step": 1411, "time_per_iteration": 2.737699031829834 }, { "auxiliary_loss_clip": 0.01192227, "auxiliary_loss_mlp": 0.01054286, "balance_loss_clip": 1.05429113, "balance_loss_mlp": 1.02970481, "epoch": 0.16978296158239645, "flos": 37487328284160.0, "grad_norm": 3.336882790462311, "language_loss": 0.67487448, "learning_rate": 3.798641581176349e-06, "loss": 0.69733959, "num_input_tokens_seen": 29775240, "step": 1412, "time_per_iteration": 2.7676103115081787 }, { "auxiliary_loss_clip": 0.01165621, "auxiliary_loss_mlp": 0.01064748, "balance_loss_clip": 1.05073893, "balance_loss_mlp": 1.03982139, "epoch": 0.16990320447303553, "flos": 28328958506880.0, "grad_norm": 2.2738451592705613, "language_loss": 0.74509776, "learning_rate": 3.7983008086287044e-06, "loss": 0.76740146, "num_input_tokens_seen": 29796560, "step": 1413, "time_per_iteration": 2.7362213134765625 }, { "auxiliary_loss_clip": 0.0116343, "auxiliary_loss_mlp": 0.01059775, "balance_loss_clip": 1.04884839, "balance_loss_mlp": 1.0357548, "epoch": 0.1700234473636746, "flos": 20188189031040.0, "grad_norm": 2.4478198384860588, "language_loss": 0.79359555, "learning_rate": 3.797959763280257e-06, "loss": 0.81582761, "num_input_tokens_seen": 29815245, "step": 1414, "time_per_iteration": 2.646134853363037 }, { "auxiliary_loss_clip": 0.01179481, "auxiliary_loss_mlp": 0.01055751, "balance_loss_clip": 1.05275309, "balance_loss_mlp": 1.03279161, "epoch": 0.17014369025431372, "flos": 24858658846080.0, "grad_norm": 2.1810361421176228, "language_loss": 0.78870189, "learning_rate": 3.797618445182743e-06, "loss": 0.81105423, "num_input_tokens_seen": 29836640, "step": 1415, "time_per_iteration": 2.666987895965576 }, { "auxiliary_loss_clip": 0.01132787, "auxiliary_loss_mlp": 0.01058389, "balance_loss_clip": 1.04649544, "balance_loss_mlp": 1.03364158, "epoch": 0.1702639331449528, "flos": 16467233287680.0, "grad_norm": 2.2376260480723382, "language_loss": 0.8544578, "learning_rate": 3.79727685438794e-06, "loss": 0.8763696, "num_input_tokens_seen": 29850830, "step": 1416, "time_per_iteration": 2.7131733894348145 }, { "auxiliary_loss_clip": 0.01075312, "auxiliary_loss_mlp": 0.01024834, "balance_loss_clip": 1.02909017, "balance_loss_mlp": 1.02173412, "epoch": 0.1703841760355919, "flos": 52508870979840.0, "grad_norm": 0.8393123105117848, "language_loss": 0.61582279, "learning_rate": 3.796934990947667e-06, "loss": 0.63682425, "num_input_tokens_seen": 29912515, "step": 1417, "time_per_iteration": 3.2315452098846436 }, { "auxiliary_loss_clip": 0.01073315, "auxiliary_loss_mlp": 0.01013285, "balance_loss_clip": 1.02712345, "balance_loss_mlp": 1.01020908, "epoch": 0.170504418926231, "flos": 49370637576960.0, "grad_norm": 0.8796890617294261, "language_loss": 0.62490475, "learning_rate": 3.7965928549137854e-06, "loss": 0.64577067, "num_input_tokens_seen": 29969330, "step": 1418, "time_per_iteration": 3.1437339782714844 }, { "auxiliary_loss_clip": 0.01157872, "auxiliary_loss_mlp": 0.01053803, "balance_loss_clip": 1.04708946, "balance_loss_mlp": 1.02994883, "epoch": 0.17062466181687008, "flos": 25849219184640.0, "grad_norm": 2.186306842671204, "language_loss": 0.77420694, "learning_rate": 3.7962504463381953e-06, "loss": 0.79632366, "num_input_tokens_seen": 29990820, "step": 1419, "time_per_iteration": 3.75167179107666 }, { "auxiliary_loss_clip": 0.01154628, "auxiliary_loss_mlp": 0.00780713, "balance_loss_clip": 1.05052757, "balance_loss_mlp": 1.00061131, "epoch": 0.17074490470750917, "flos": 20960412549120.0, "grad_norm": 1.8811535691142893, "language_loss": 0.78818119, "learning_rate": 3.7959077652728412e-06, "loss": 0.80753464, "num_input_tokens_seen": 30009275, "step": 1420, "time_per_iteration": 2.6983468532562256 }, { "auxiliary_loss_clip": 0.01164274, "auxiliary_loss_mlp": 0.01059268, "balance_loss_clip": 1.04876876, "balance_loss_mlp": 1.03728569, "epoch": 0.17086514759814825, "flos": 20959766104320.0, "grad_norm": 2.046181322526915, "language_loss": 0.77095497, "learning_rate": 3.795564811769707e-06, "loss": 0.79319036, "num_input_tokens_seen": 30027630, "step": 1421, "time_per_iteration": 3.6223082542419434 }, { "auxiliary_loss_clip": 0.01165813, "auxiliary_loss_mlp": 0.01062276, "balance_loss_clip": 1.05331945, "balance_loss_mlp": 1.0393405, "epoch": 0.17098539048878736, "flos": 28474073452800.0, "grad_norm": 3.086392677842688, "language_loss": 0.77780682, "learning_rate": 3.795221585880818e-06, "loss": 0.80008769, "num_input_tokens_seen": 30048310, "step": 1422, "time_per_iteration": 2.7379276752471924 }, { "auxiliary_loss_clip": 0.01152867, "auxiliary_loss_mlp": 0.01059783, "balance_loss_clip": 1.05231428, "balance_loss_mlp": 1.03826547, "epoch": 0.17110563337942644, "flos": 16290014561280.0, "grad_norm": 1.9680882556362604, "language_loss": 0.91490591, "learning_rate": 3.794878087658242e-06, "loss": 0.93703234, "num_input_tokens_seen": 30066080, "step": 1423, "time_per_iteration": 3.679666519165039 }, { "auxiliary_loss_clip": 0.01174089, "auxiliary_loss_mlp": 0.01060378, "balance_loss_clip": 1.04888797, "balance_loss_mlp": 1.03577352, "epoch": 0.17122587627006552, "flos": 29674207693440.0, "grad_norm": 1.9655367543356033, "language_loss": 0.78800726, "learning_rate": 3.7945343171540873e-06, "loss": 0.81035185, "num_input_tokens_seen": 30086955, "step": 1424, "time_per_iteration": 3.5072271823883057 }, { "auxiliary_loss_clip": 0.01190283, "auxiliary_loss_mlp": 0.01059319, "balance_loss_clip": 1.05201471, "balance_loss_mlp": 1.03694344, "epoch": 0.17134611916070464, "flos": 25338389915520.0, "grad_norm": 3.140127495741122, "language_loss": 0.78615928, "learning_rate": 3.7941902744205033e-06, "loss": 0.80865526, "num_input_tokens_seen": 30107990, "step": 1425, "time_per_iteration": 2.789961099624634 }, { "auxiliary_loss_clip": 0.01170954, "auxiliary_loss_mlp": 0.01066205, "balance_loss_clip": 1.05385923, "balance_loss_mlp": 1.04267359, "epoch": 0.17146636205134372, "flos": 13953845900160.0, "grad_norm": 2.185436343266904, "language_loss": 0.83609784, "learning_rate": 3.7938459595096817e-06, "loss": 0.85846937, "num_input_tokens_seen": 30126535, "step": 1426, "time_per_iteration": 2.6427695751190186 }, { "auxiliary_loss_clip": 0.01191259, "auxiliary_loss_mlp": 0.01071494, "balance_loss_clip": 1.05600381, "balance_loss_mlp": 1.0462451, "epoch": 0.1715866049419828, "flos": 23915214172800.0, "grad_norm": 1.9779132593538593, "language_loss": 0.86400497, "learning_rate": 3.7935013724738545e-06, "loss": 0.8866325, "num_input_tokens_seen": 30147035, "step": 1427, "time_per_iteration": 2.6568641662597656 }, { "auxiliary_loss_clip": 0.01172382, "auxiliary_loss_mlp": 0.01057716, "balance_loss_clip": 1.05255163, "balance_loss_mlp": 1.03691423, "epoch": 0.17170684783262188, "flos": 22709369669760.0, "grad_norm": 2.531031974990312, "language_loss": 0.780195, "learning_rate": 3.7931565133652945e-06, "loss": 0.80249602, "num_input_tokens_seen": 30167110, "step": 1428, "time_per_iteration": 2.6213772296905518 }, { "auxiliary_loss_clip": 0.01188586, "auxiliary_loss_mlp": 0.01062976, "balance_loss_clip": 1.05114484, "balance_loss_mlp": 1.04026675, "epoch": 0.171827090723261, "flos": 26613290315520.0, "grad_norm": 2.791435488328735, "language_loss": 0.68402779, "learning_rate": 3.792811382236317e-06, "loss": 0.70654339, "num_input_tokens_seen": 30185620, "step": 1429, "time_per_iteration": 2.6337008476257324 }, { "auxiliary_loss_clip": 0.01182772, "auxiliary_loss_mlp": 0.01065498, "balance_loss_clip": 1.05395377, "balance_loss_mlp": 1.04175138, "epoch": 0.17194733361390008, "flos": 28148507556480.0, "grad_norm": 2.6793125497167343, "language_loss": 0.78354043, "learning_rate": 3.792465979139279e-06, "loss": 0.80602312, "num_input_tokens_seen": 30208225, "step": 1430, "time_per_iteration": 2.658454656600952 }, { "auxiliary_loss_clip": 0.01049368, "auxiliary_loss_mlp": 0.01036102, "balance_loss_clip": 1.02134216, "balance_loss_mlp": 1.03290689, "epoch": 0.17206757650453916, "flos": 65530689753600.0, "grad_norm": 0.926488935525102, "language_loss": 0.65635604, "learning_rate": 3.792120304126576e-06, "loss": 0.67721075, "num_input_tokens_seen": 30271600, "step": 1431, "time_per_iteration": 3.282867193222046 }, { "auxiliary_loss_clip": 0.01117219, "auxiliary_loss_mlp": 0.01048568, "balance_loss_clip": 1.04571199, "balance_loss_mlp": 1.02553618, "epoch": 0.17218781939517827, "flos": 22273486128000.0, "grad_norm": 2.1505744374885345, "language_loss": 0.83904213, "learning_rate": 3.791774357250649e-06, "loss": 0.86069995, "num_input_tokens_seen": 30290430, "step": 1432, "time_per_iteration": 2.7828304767608643 }, { "auxiliary_loss_clip": 0.01161672, "auxiliary_loss_mlp": 0.0106668, "balance_loss_clip": 1.05100012, "balance_loss_mlp": 1.04252827, "epoch": 0.17230806228581735, "flos": 14137313592960.0, "grad_norm": 2.770733162766459, "language_loss": 0.79411447, "learning_rate": 3.7914281385639757e-06, "loss": 0.81639791, "num_input_tokens_seen": 30308305, "step": 1433, "time_per_iteration": 2.6248958110809326 }, { "auxiliary_loss_clip": 0.01175662, "auxiliary_loss_mlp": 0.0105813, "balance_loss_clip": 1.04944742, "balance_loss_mlp": 1.03483629, "epoch": 0.17242830517645644, "flos": 20704836303360.0, "grad_norm": 2.009998153632052, "language_loss": 0.79658592, "learning_rate": 3.7910816481190784e-06, "loss": 0.81892383, "num_input_tokens_seen": 30328120, "step": 1434, "time_per_iteration": 2.642784595489502 }, { "auxiliary_loss_clip": 0.01151706, "auxiliary_loss_mlp": 0.01050288, "balance_loss_clip": 1.0472362, "balance_loss_mlp": 1.02732801, "epoch": 0.17254854806709552, "flos": 30774582887040.0, "grad_norm": 1.9907774391502688, "language_loss": 0.74911547, "learning_rate": 3.7907348859685193e-06, "loss": 0.77113545, "num_input_tokens_seen": 30349825, "step": 1435, "time_per_iteration": 2.724797248840332 }, { "auxiliary_loss_clip": 0.0116921, "auxiliary_loss_mlp": 0.01053928, "balance_loss_clip": 1.05271769, "balance_loss_mlp": 1.03133821, "epoch": 0.17266879095773463, "flos": 26614726859520.0, "grad_norm": 3.0921472009899715, "language_loss": 0.80560797, "learning_rate": 3.790387852164902e-06, "loss": 0.82783937, "num_input_tokens_seen": 30370555, "step": 1436, "time_per_iteration": 2.6517608165740967 }, { "auxiliary_loss_clip": 0.01176997, "auxiliary_loss_mlp": 0.01055297, "balance_loss_clip": 1.05168629, "balance_loss_mlp": 1.03380358, "epoch": 0.1727890338483737, "flos": 20266295155200.0, "grad_norm": 2.0006697101431845, "language_loss": 0.76747239, "learning_rate": 3.7900405467608707e-06, "loss": 0.78979528, "num_input_tokens_seen": 30390100, "step": 1437, "time_per_iteration": 2.621541976928711 }, { "auxiliary_loss_clip": 0.01122456, "auxiliary_loss_mlp": 0.01061324, "balance_loss_clip": 1.04278529, "balance_loss_mlp": 1.03755403, "epoch": 0.1729092767390128, "flos": 18179812909440.0, "grad_norm": 3.913113796471254, "language_loss": 0.79095399, "learning_rate": 3.7896929698091114e-06, "loss": 0.81279182, "num_input_tokens_seen": 30402915, "step": 1438, "time_per_iteration": 2.6660382747650146 }, { "auxiliary_loss_clip": 0.01193918, "auxiliary_loss_mlp": 0.01056452, "balance_loss_clip": 1.05709302, "balance_loss_mlp": 1.03429067, "epoch": 0.1730295196296519, "flos": 26759518583040.0, "grad_norm": 2.8901085103652773, "language_loss": 0.68337888, "learning_rate": 3.7893451213623518e-06, "loss": 0.70588261, "num_input_tokens_seen": 30420145, "step": 1439, "time_per_iteration": 2.627845525741577 }, { "auxiliary_loss_clip": 0.01178312, "auxiliary_loss_mlp": 0.00778166, "balance_loss_clip": 1.05453432, "balance_loss_mlp": 1.00059581, "epoch": 0.173149762520291, "flos": 23842531002240.0, "grad_norm": 2.9613975413967957, "language_loss": 0.82297689, "learning_rate": 3.7889970014733606e-06, "loss": 0.84254169, "num_input_tokens_seen": 30439250, "step": 1440, "time_per_iteration": 2.7201809883117676 }, { "auxiliary_loss_clip": 0.0111736, "auxiliary_loss_mlp": 0.01059145, "balance_loss_clip": 1.04314137, "balance_loss_mlp": 1.03422999, "epoch": 0.17327000541093007, "flos": 23368186972800.0, "grad_norm": 1.9040221202250507, "language_loss": 0.77993298, "learning_rate": 3.7886486101949463e-06, "loss": 0.80169803, "num_input_tokens_seen": 30460430, "step": 1441, "time_per_iteration": 2.7315711975097656 }, { "auxiliary_loss_clip": 0.01118801, "auxiliary_loss_mlp": 0.01069, "balance_loss_clip": 1.0431813, "balance_loss_mlp": 1.04402566, "epoch": 0.17339024830156918, "flos": 18221290139520.0, "grad_norm": 2.4830399271842274, "language_loss": 0.88186753, "learning_rate": 3.7882999475799594e-06, "loss": 0.90374553, "num_input_tokens_seen": 30478465, "step": 1442, "time_per_iteration": 2.6842029094696045 }, { "auxiliary_loss_clip": 0.0112155, "auxiliary_loss_mlp": 0.01070649, "balance_loss_clip": 1.04638743, "balance_loss_mlp": 1.04538882, "epoch": 0.17351049119220827, "flos": 23332024955520.0, "grad_norm": 2.6441318957705, "language_loss": 0.81697738, "learning_rate": 3.787951013681293e-06, "loss": 0.83889937, "num_input_tokens_seen": 30496510, "step": 1443, "time_per_iteration": 2.763702392578125 }, { "auxiliary_loss_clip": 0.01178101, "auxiliary_loss_mlp": 0.01062594, "balance_loss_clip": 1.05421138, "balance_loss_mlp": 1.03871679, "epoch": 0.17363073408284735, "flos": 23803495896960.0, "grad_norm": 2.480546571902462, "language_loss": 0.77535582, "learning_rate": 3.787601808551879e-06, "loss": 0.79776275, "num_input_tokens_seen": 30516325, "step": 1444, "time_per_iteration": 2.632662773132324 }, { "auxiliary_loss_clip": 0.01158494, "auxiliary_loss_mlp": 0.01057509, "balance_loss_clip": 1.05400324, "balance_loss_mlp": 1.03413177, "epoch": 0.17375097697348643, "flos": 18515290959360.0, "grad_norm": 3.096246056426304, "language_loss": 0.83583653, "learning_rate": 3.7872523322446926e-06, "loss": 0.85799658, "num_input_tokens_seen": 30535210, "step": 1445, "time_per_iteration": 3.699528694152832 }, { "auxiliary_loss_clip": 0.0114537, "auxiliary_loss_mlp": 0.01054793, "balance_loss_clip": 1.04775131, "balance_loss_mlp": 1.0320363, "epoch": 0.17387121986412554, "flos": 38877897456000.0, "grad_norm": 2.9702606389013795, "language_loss": 0.60222405, "learning_rate": 3.7869025848127478e-06, "loss": 0.62422574, "num_input_tokens_seen": 30559405, "step": 1446, "time_per_iteration": 2.8478708267211914 }, { "auxiliary_loss_clip": 0.011841, "auxiliary_loss_mlp": 0.01061734, "balance_loss_clip": 1.05558097, "balance_loss_mlp": 1.03888202, "epoch": 0.17399146275476463, "flos": 20375714960640.0, "grad_norm": 3.781948424295819, "language_loss": 0.81189549, "learning_rate": 3.786552566309102e-06, "loss": 0.83435386, "num_input_tokens_seen": 30577615, "step": 1447, "time_per_iteration": 3.4947988986968994 }, { "auxiliary_loss_clip": 0.01164348, "auxiliary_loss_mlp": 0.0077807, "balance_loss_clip": 1.05569386, "balance_loss_mlp": 1.00055027, "epoch": 0.1741117056454037, "flos": 19164339763200.0, "grad_norm": 2.692921053739515, "language_loss": 0.86227143, "learning_rate": 3.7862022767868517e-06, "loss": 0.88169557, "num_input_tokens_seen": 30595205, "step": 1448, "time_per_iteration": 2.635561227798462 }, { "auxiliary_loss_clip": 0.01150659, "auxiliary_loss_mlp": 0.01057499, "balance_loss_clip": 1.0590384, "balance_loss_mlp": 1.03486109, "epoch": 0.17423194853604282, "flos": 25374300537600.0, "grad_norm": 3.104213479352558, "language_loss": 0.84061569, "learning_rate": 3.7858517162991367e-06, "loss": 0.86269724, "num_input_tokens_seen": 30615280, "step": 1449, "time_per_iteration": 3.710846185684204 }, { "auxiliary_loss_clip": 0.01155463, "auxiliary_loss_mlp": 0.01064515, "balance_loss_clip": 1.05139208, "balance_loss_mlp": 1.04154336, "epoch": 0.1743521914266819, "flos": 25191874339200.0, "grad_norm": 3.5985015082542033, "language_loss": 0.60847592, "learning_rate": 3.7855008848991363e-06, "loss": 0.63067573, "num_input_tokens_seen": 30633485, "step": 1450, "time_per_iteration": 3.623237133026123 }, { "auxiliary_loss_clip": 0.01161247, "auxiliary_loss_mlp": 0.01052037, "balance_loss_clip": 1.05476379, "balance_loss_mlp": 1.03003144, "epoch": 0.17447243431732098, "flos": 25666577504640.0, "grad_norm": 51.83221526257473, "language_loss": 0.77487534, "learning_rate": 3.7851497826400714e-06, "loss": 0.79700816, "num_input_tokens_seen": 30653625, "step": 1451, "time_per_iteration": 2.6876206398010254 }, { "auxiliary_loss_clip": 0.01195893, "auxiliary_loss_mlp": 0.0105755, "balance_loss_clip": 1.05835521, "balance_loss_mlp": 1.03395844, "epoch": 0.17459267720796007, "flos": 36281950657920.0, "grad_norm": 2.9308050037162032, "language_loss": 0.76081252, "learning_rate": 3.7847984095752034e-06, "loss": 0.78334695, "num_input_tokens_seen": 30677080, "step": 1452, "time_per_iteration": 2.759134531021118 }, { "auxiliary_loss_clip": 0.0119059, "auxiliary_loss_mlp": 0.01054354, "balance_loss_clip": 1.05579376, "balance_loss_mlp": 1.03145385, "epoch": 0.17471292009859918, "flos": 20011113959040.0, "grad_norm": 2.32760094437493, "language_loss": 0.80061787, "learning_rate": 3.784446765757836e-06, "loss": 0.82306731, "num_input_tokens_seen": 30695725, "step": 1453, "time_per_iteration": 2.612516164779663 }, { "auxiliary_loss_clip": 0.01135877, "auxiliary_loss_mlp": 0.0106421, "balance_loss_clip": 1.04899621, "balance_loss_mlp": 1.04069042, "epoch": 0.17483316298923826, "flos": 27819242559360.0, "grad_norm": 4.978562582199496, "language_loss": 0.78078181, "learning_rate": 3.7840948512413133e-06, "loss": 0.80278265, "num_input_tokens_seen": 30713310, "step": 1454, "time_per_iteration": 2.734691619873047 }, { "auxiliary_loss_clip": 0.01150099, "auxiliary_loss_mlp": 0.01052889, "balance_loss_clip": 1.05407619, "balance_loss_mlp": 1.02932143, "epoch": 0.17495340587987734, "flos": 44017934791680.0, "grad_norm": 2.078566030298865, "language_loss": 0.78818703, "learning_rate": 3.7837426660790196e-06, "loss": 0.8102169, "num_input_tokens_seen": 30734725, "step": 1455, "time_per_iteration": 2.927290201187134 }, { "auxiliary_loss_clip": 0.01195823, "auxiliary_loss_mlp": 0.01055946, "balance_loss_clip": 1.05848885, "balance_loss_mlp": 1.03264058, "epoch": 0.17507364877051645, "flos": 20885825957760.0, "grad_norm": 2.740387306234974, "language_loss": 0.8232162, "learning_rate": 3.783390210324382e-06, "loss": 0.84573388, "num_input_tokens_seen": 30754450, "step": 1456, "time_per_iteration": 2.566779851913452 }, { "auxiliary_loss_clip": 0.0115009, "auxiliary_loss_mlp": 0.01050788, "balance_loss_clip": 1.05096793, "balance_loss_mlp": 1.02773261, "epoch": 0.17519389166115554, "flos": 24717602136960.0, "grad_norm": 1.853507794526038, "language_loss": 0.7250433, "learning_rate": 3.7830374840308676e-06, "loss": 0.74705207, "num_input_tokens_seen": 30774605, "step": 1457, "time_per_iteration": 2.7094504833221436 }, { "auxiliary_loss_clip": 0.01180588, "auxiliary_loss_mlp": 0.01053204, "balance_loss_clip": 1.05548191, "balance_loss_mlp": 1.02961278, "epoch": 0.17531413455179462, "flos": 23798144770560.0, "grad_norm": 4.593711928311298, "language_loss": 0.82735914, "learning_rate": 3.7826844872519842e-06, "loss": 0.84969711, "num_input_tokens_seen": 30792460, "step": 1458, "time_per_iteration": 2.6197621822357178 }, { "auxiliary_loss_clip": 0.01166311, "auxiliary_loss_mlp": 0.01051683, "balance_loss_clip": 1.05782819, "balance_loss_mlp": 1.03092909, "epoch": 0.1754343774424337, "flos": 24572379450240.0, "grad_norm": 2.3021473346061563, "language_loss": 0.72857785, "learning_rate": 3.782331220041282e-06, "loss": 0.75075781, "num_input_tokens_seen": 30812525, "step": 1459, "time_per_iteration": 2.6977274417877197 }, { "auxiliary_loss_clip": 0.01158111, "auxiliary_loss_mlp": 0.01048254, "balance_loss_clip": 1.05110335, "balance_loss_mlp": 1.02642679, "epoch": 0.17555462033307281, "flos": 18114599767680.0, "grad_norm": 2.5174586148336453, "language_loss": 0.82681417, "learning_rate": 3.7819776824523504e-06, "loss": 0.84887779, "num_input_tokens_seen": 30830390, "step": 1460, "time_per_iteration": 2.6941444873809814 }, { "auxiliary_loss_clip": 0.01169828, "auxiliary_loss_mlp": 0.010585, "balance_loss_clip": 1.05276465, "balance_loss_mlp": 1.0350157, "epoch": 0.1756748632237119, "flos": 28366018364160.0, "grad_norm": 2.2792062267390634, "language_loss": 0.84051973, "learning_rate": 3.7816238745388213e-06, "loss": 0.86280298, "num_input_tokens_seen": 30849935, "step": 1461, "time_per_iteration": 2.682703971862793 }, { "auxiliary_loss_clip": 0.01166308, "auxiliary_loss_mlp": 0.01046242, "balance_loss_clip": 1.05097449, "balance_loss_mlp": 1.02476072, "epoch": 0.17579510611435098, "flos": 25732939881600.0, "grad_norm": 1.8852917538563267, "language_loss": 0.86862969, "learning_rate": 3.781269796354367e-06, "loss": 0.89075518, "num_input_tokens_seen": 30869555, "step": 1462, "time_per_iteration": 2.687375545501709 }, { "auxiliary_loss_clip": 0.01162105, "auxiliary_loss_mlp": 0.01052261, "balance_loss_clip": 1.05261552, "balance_loss_mlp": 1.02844334, "epoch": 0.1759153490049901, "flos": 18588081870720.0, "grad_norm": 2.2001657308277283, "language_loss": 0.86106831, "learning_rate": 3.7809154479527006e-06, "loss": 0.88321197, "num_input_tokens_seen": 30888760, "step": 1463, "time_per_iteration": 2.658848762512207 }, { "auxiliary_loss_clip": 0.01140682, "auxiliary_loss_mlp": 0.01050195, "balance_loss_clip": 1.05023789, "balance_loss_mlp": 1.02916622, "epoch": 0.17603559189562917, "flos": 18619323724800.0, "grad_norm": 2.503661138603156, "language_loss": 0.8462767, "learning_rate": 3.780560829387577e-06, "loss": 0.86818546, "num_input_tokens_seen": 30907260, "step": 1464, "time_per_iteration": 2.6410515308380127 }, { "auxiliary_loss_clip": 0.01066557, "auxiliary_loss_mlp": 0.01014602, "balance_loss_clip": 1.02181196, "balance_loss_mlp": 1.01145482, "epoch": 0.17615583478626826, "flos": 60530775373440.0, "grad_norm": 0.8549624519057059, "language_loss": 0.57936919, "learning_rate": 3.7802059407127915e-06, "loss": 0.6001808, "num_input_tokens_seen": 30965810, "step": 1465, "time_per_iteration": 3.177417516708374 }, { "auxiliary_loss_clip": 0.01154243, "auxiliary_loss_mlp": 0.01055184, "balance_loss_clip": 1.04962802, "balance_loss_mlp": 1.03192616, "epoch": 0.17627607767690734, "flos": 23616221362560.0, "grad_norm": 2.3673233514599934, "language_loss": 0.8624326, "learning_rate": 3.7798507819821797e-06, "loss": 0.88452679, "num_input_tokens_seen": 30982935, "step": 1466, "time_per_iteration": 2.6429426670074463 }, { "auxiliary_loss_clip": 0.01142065, "auxiliary_loss_mlp": 0.01053579, "balance_loss_clip": 1.04841745, "balance_loss_mlp": 1.03133488, "epoch": 0.17639632056754645, "flos": 17639070589440.0, "grad_norm": 9.290953483107467, "language_loss": 0.7893672, "learning_rate": 3.7794953532496197e-06, "loss": 0.81132364, "num_input_tokens_seen": 30998840, "step": 1467, "time_per_iteration": 2.6419379711151123 }, { "auxiliary_loss_clip": 0.01030427, "auxiliary_loss_mlp": 0.00758899, "balance_loss_clip": 1.02246523, "balance_loss_mlp": 1.00084639, "epoch": 0.17651656345818553, "flos": 57932604910080.0, "grad_norm": 0.8649705516708434, "language_loss": 0.57913101, "learning_rate": 3.7791396545690295e-06, "loss": 0.59702432, "num_input_tokens_seen": 31060075, "step": 1468, "time_per_iteration": 3.272218704223633 }, { "auxiliary_loss_clip": 0.01174895, "auxiliary_loss_mlp": 0.01056834, "balance_loss_clip": 1.05405545, "balance_loss_mlp": 1.03488731, "epoch": 0.17663680634882462, "flos": 22929502170240.0, "grad_norm": 3.1877015034066156, "language_loss": 0.80656612, "learning_rate": 3.7787836859943685e-06, "loss": 0.82888341, "num_input_tokens_seen": 31078800, "step": 1469, "time_per_iteration": 2.7175776958465576 }, { "auxiliary_loss_clip": 0.01171785, "auxiliary_loss_mlp": 0.01054784, "balance_loss_clip": 1.05093479, "balance_loss_mlp": 1.03224182, "epoch": 0.17675704923946373, "flos": 22637979388800.0, "grad_norm": 3.3768276966829034, "language_loss": 0.78534615, "learning_rate": 3.7784274475796363e-06, "loss": 0.80761176, "num_input_tokens_seen": 31097430, "step": 1470, "time_per_iteration": 3.686469316482544 }, { "auxiliary_loss_clip": 0.01149719, "auxiliary_loss_mlp": 0.01054494, "balance_loss_clip": 1.04862344, "balance_loss_mlp": 1.03151035, "epoch": 0.1768772921301028, "flos": 27126525795840.0, "grad_norm": 2.782324790355066, "language_loss": 0.75979614, "learning_rate": 3.7780709393788745e-06, "loss": 0.78183824, "num_input_tokens_seen": 31117905, "step": 1471, "time_per_iteration": 2.7299695014953613 }, { "auxiliary_loss_clip": 0.01188979, "auxiliary_loss_mlp": 0.01057579, "balance_loss_clip": 1.05547023, "balance_loss_mlp": 1.03558528, "epoch": 0.1769975350207419, "flos": 19172133014400.0, "grad_norm": 2.0359951614964182, "language_loss": 0.75519961, "learning_rate": 3.777714161446165e-06, "loss": 0.77766514, "num_input_tokens_seen": 31137610, "step": 1472, "time_per_iteration": 2.5890374183654785 }, { "auxiliary_loss_clip": 0.01176452, "auxiliary_loss_mlp": 0.0104765, "balance_loss_clip": 1.05522513, "balance_loss_mlp": 1.02585816, "epoch": 0.177117777911381, "flos": 36134932291200.0, "grad_norm": 2.749949084397578, "language_loss": 0.6932627, "learning_rate": 3.7773571138356304e-06, "loss": 0.71550369, "num_input_tokens_seen": 31157780, "step": 1473, "time_per_iteration": 3.7447195053100586 }, { "auxiliary_loss_clip": 0.01119094, "auxiliary_loss_mlp": 0.01056198, "balance_loss_clip": 1.04667246, "balance_loss_mlp": 1.03428733, "epoch": 0.17723802080202009, "flos": 22090593052800.0, "grad_norm": 2.2726271190444938, "language_loss": 0.88909519, "learning_rate": 3.776999796601435e-06, "loss": 0.91084808, "num_input_tokens_seen": 31176540, "step": 1474, "time_per_iteration": 3.7861435413360596 }, { "auxiliary_loss_clip": 0.01183792, "auxiliary_loss_mlp": 0.01045639, "balance_loss_clip": 1.05732989, "balance_loss_mlp": 1.02312005, "epoch": 0.17735826369265917, "flos": 30222671437440.0, "grad_norm": 2.0521264939857433, "language_loss": 0.72796595, "learning_rate": 3.776642209797783e-06, "loss": 0.75026023, "num_input_tokens_seen": 31198370, "step": 1475, "time_per_iteration": 2.706880807876587 }, { "auxiliary_loss_clip": 0.01169064, "auxiliary_loss_mlp": 0.01064139, "balance_loss_clip": 1.0503366, "balance_loss_mlp": 1.03909278, "epoch": 0.17747850658329825, "flos": 21397588980480.0, "grad_norm": 2.3625200930948975, "language_loss": 0.77959561, "learning_rate": 3.7762843534789205e-06, "loss": 0.80192763, "num_input_tokens_seen": 31217120, "step": 1476, "time_per_iteration": 3.530000925064087 }, { "auxiliary_loss_clip": 0.01167597, "auxiliary_loss_mlp": 0.01054371, "balance_loss_clip": 1.05198097, "balance_loss_mlp": 1.0307554, "epoch": 0.17759874947393736, "flos": 16983341856000.0, "grad_norm": 2.3421159512184033, "language_loss": 0.88368785, "learning_rate": 3.7759262276991343e-06, "loss": 0.90590757, "num_input_tokens_seen": 31234730, "step": 1477, "time_per_iteration": 2.629394769668579 }, { "auxiliary_loss_clip": 0.01167585, "auxiliary_loss_mlp": 0.01059317, "balance_loss_clip": 1.05261469, "balance_loss_mlp": 1.03716791, "epoch": 0.17771899236457644, "flos": 11546107390080.0, "grad_norm": 2.538222787288204, "language_loss": 0.80606103, "learning_rate": 3.7755678325127506e-06, "loss": 0.8283301, "num_input_tokens_seen": 31252410, "step": 1478, "time_per_iteration": 2.6227920055389404 }, { "auxiliary_loss_clip": 0.01132896, "auxiliary_loss_mlp": 0.01060419, "balance_loss_clip": 1.05100322, "balance_loss_mlp": 1.037817, "epoch": 0.17783923525521553, "flos": 18807747494400.0, "grad_norm": 2.3991966110561647, "language_loss": 0.75488544, "learning_rate": 3.7752091679741393e-06, "loss": 0.77681863, "num_input_tokens_seen": 31270200, "step": 1479, "time_per_iteration": 2.7101168632507324 }, { "auxiliary_loss_clip": 0.01175496, "auxiliary_loss_mlp": 0.01051056, "balance_loss_clip": 1.05205393, "balance_loss_mlp": 1.02723789, "epoch": 0.17795947814585464, "flos": 30408365773440.0, "grad_norm": 2.805816944371158, "language_loss": 0.7777003, "learning_rate": 3.774850234137708e-06, "loss": 0.7999658, "num_input_tokens_seen": 31287495, "step": 1480, "time_per_iteration": 2.7060599327087402 }, { "auxiliary_loss_clip": 0.01170491, "auxiliary_loss_mlp": 0.01058368, "balance_loss_clip": 1.05039215, "balance_loss_mlp": 1.03390646, "epoch": 0.17807972103649372, "flos": 24389055411840.0, "grad_norm": 2.1278258741846994, "language_loss": 0.82364964, "learning_rate": 3.7744910310579076e-06, "loss": 0.84593832, "num_input_tokens_seen": 31306420, "step": 1481, "time_per_iteration": 2.638068437576294 }, { "auxiliary_loss_clip": 0.0118574, "auxiliary_loss_mlp": 0.0105281, "balance_loss_clip": 1.05308843, "balance_loss_mlp": 1.03100622, "epoch": 0.1781999639271328, "flos": 20301559332480.0, "grad_norm": 2.2461815096121507, "language_loss": 0.84929788, "learning_rate": 3.774131558789229e-06, "loss": 0.87168336, "num_input_tokens_seen": 31325750, "step": 1482, "time_per_iteration": 2.676140785217285 }, { "auxiliary_loss_clip": 0.01189113, "auxiliary_loss_mlp": 0.007774, "balance_loss_clip": 1.054528, "balance_loss_mlp": 1.00094438, "epoch": 0.1783202068177719, "flos": 15924479806080.0, "grad_norm": 10.818802861227566, "language_loss": 0.69300783, "learning_rate": 3.773771817386203e-06, "loss": 0.71267295, "num_input_tokens_seen": 31343080, "step": 1483, "time_per_iteration": 2.6036412715911865 }, { "auxiliary_loss_clip": 0.01159199, "auxiliary_loss_mlp": 0.01058292, "balance_loss_clip": 1.05067492, "balance_loss_mlp": 1.03747845, "epoch": 0.178440449708411, "flos": 20631758083200.0, "grad_norm": 2.474938094133878, "language_loss": 0.79426229, "learning_rate": 3.773411806903403e-06, "loss": 0.81643724, "num_input_tokens_seen": 31362160, "step": 1484, "time_per_iteration": 2.6820459365844727 }, { "auxiliary_loss_clip": 0.01122886, "auxiliary_loss_mlp": 0.01058605, "balance_loss_clip": 1.04661417, "balance_loss_mlp": 1.03380942, "epoch": 0.17856069259905008, "flos": 21686059105920.0, "grad_norm": 2.564599007325603, "language_loss": 0.94946837, "learning_rate": 3.7730515273954415e-06, "loss": 0.9712832, "num_input_tokens_seen": 31380770, "step": 1485, "time_per_iteration": 2.8005409240722656 }, { "auxiliary_loss_clip": 0.01185918, "auxiliary_loss_mlp": 0.01054613, "balance_loss_clip": 1.05420637, "balance_loss_mlp": 1.03226161, "epoch": 0.17868093548968916, "flos": 26572962320640.0, "grad_norm": 2.1307609289005933, "language_loss": 0.85181606, "learning_rate": 3.772690978916973e-06, "loss": 0.87422132, "num_input_tokens_seen": 31400525, "step": 1486, "time_per_iteration": 2.6695895195007324 }, { "auxiliary_loss_clip": 0.01173112, "auxiliary_loss_mlp": 0.01052265, "balance_loss_clip": 1.05355442, "balance_loss_mlp": 1.03029525, "epoch": 0.17880117838032827, "flos": 18581006891520.0, "grad_norm": 3.074380680212497, "language_loss": 0.86806059, "learning_rate": 3.772330161522693e-06, "loss": 0.89031434, "num_input_tokens_seen": 31418435, "step": 1487, "time_per_iteration": 2.5930771827697754 }, { "auxiliary_loss_clip": 0.01159295, "auxiliary_loss_mlp": 0.01062289, "balance_loss_clip": 1.05360889, "balance_loss_mlp": 1.03977036, "epoch": 0.17892142127096736, "flos": 26541217676160.0, "grad_norm": 2.0656178935567318, "language_loss": 0.80039001, "learning_rate": 3.7719690752673365e-06, "loss": 0.82260579, "num_input_tokens_seen": 31439230, "step": 1488, "time_per_iteration": 2.699850082397461 }, { "auxiliary_loss_clip": 0.01155725, "auxiliary_loss_mlp": 0.01061988, "balance_loss_clip": 1.05568957, "balance_loss_mlp": 1.03802741, "epoch": 0.17904166416160644, "flos": 23872623621120.0, "grad_norm": 2.4433736902005356, "language_loss": 0.7845065, "learning_rate": 3.7716077202056796e-06, "loss": 0.80668366, "num_input_tokens_seen": 31457705, "step": 1489, "time_per_iteration": 2.7375707626342773 }, { "auxiliary_loss_clip": 0.01150092, "auxiliary_loss_mlp": 0.0105782, "balance_loss_clip": 1.05009484, "balance_loss_mlp": 1.03469324, "epoch": 0.17916190705224552, "flos": 19134426712320.0, "grad_norm": 3.1019224282052664, "language_loss": 0.93127882, "learning_rate": 3.7712460963925404e-06, "loss": 0.95335793, "num_input_tokens_seen": 31473645, "step": 1490, "time_per_iteration": 2.6823747158050537 }, { "auxiliary_loss_clip": 0.01153124, "auxiliary_loss_mlp": 0.01053778, "balance_loss_clip": 1.0511843, "balance_loss_mlp": 1.03115165, "epoch": 0.17928214994288463, "flos": 25152120961920.0, "grad_norm": 2.184752554860026, "language_loss": 0.75037169, "learning_rate": 3.7708842038827775e-06, "loss": 0.77244067, "num_input_tokens_seen": 31492605, "step": 1491, "time_per_iteration": 2.6640782356262207 }, { "auxiliary_loss_clip": 0.0117368, "auxiliary_loss_mlp": 0.01059224, "balance_loss_clip": 1.05108762, "balance_loss_mlp": 1.03562057, "epoch": 0.17940239283352372, "flos": 22384629786240.0, "grad_norm": 1.9459349920304962, "language_loss": 0.85572594, "learning_rate": 3.770522042731288e-06, "loss": 0.87805498, "num_input_tokens_seen": 31514500, "step": 1492, "time_per_iteration": 2.6590917110443115 }, { "auxiliary_loss_clip": 0.01128721, "auxiliary_loss_mlp": 0.01060382, "balance_loss_clip": 1.04885077, "balance_loss_mlp": 1.03588426, "epoch": 0.1795226357241628, "flos": 23178685795200.0, "grad_norm": 2.1804425388673, "language_loss": 0.87486374, "learning_rate": 3.7701596129930122e-06, "loss": 0.89675474, "num_input_tokens_seen": 31533225, "step": 1493, "time_per_iteration": 2.754901647567749 }, { "auxiliary_loss_clip": 0.01158238, "auxiliary_loss_mlp": 0.01052842, "balance_loss_clip": 1.0534898, "balance_loss_mlp": 1.02685392, "epoch": 0.1796428786148019, "flos": 22090413484800.0, "grad_norm": 2.882884729368331, "language_loss": 0.73631883, "learning_rate": 3.7697969147229315e-06, "loss": 0.75842959, "num_input_tokens_seen": 31551385, "step": 1494, "time_per_iteration": 2.708627939224243 }, { "auxiliary_loss_clip": 0.01170273, "auxiliary_loss_mlp": 0.01052999, "balance_loss_clip": 1.05075812, "balance_loss_mlp": 1.02993166, "epoch": 0.179763121505441, "flos": 21324618501120.0, "grad_norm": 2.0533537562262216, "language_loss": 0.85200924, "learning_rate": 3.7694339479760647e-06, "loss": 0.87424195, "num_input_tokens_seen": 31570415, "step": 1495, "time_per_iteration": 2.6190788745880127 }, { "auxiliary_loss_clip": 0.01052571, "auxiliary_loss_mlp": 0.01008288, "balance_loss_clip": 1.01863766, "balance_loss_mlp": 1.00478292, "epoch": 0.17988336439608008, "flos": 68161864815360.0, "grad_norm": 0.7776585199131112, "language_loss": 0.57248032, "learning_rate": 3.769070712807476e-06, "loss": 0.59308892, "num_input_tokens_seen": 31632445, "step": 1496, "time_per_iteration": 4.2774035930633545 }, { "auxiliary_loss_clip": 0.01114134, "auxiliary_loss_mlp": 0.01073718, "balance_loss_clip": 1.04804957, "balance_loss_mlp": 1.04649019, "epoch": 0.18000360728671919, "flos": 21945047143680.0, "grad_norm": 1.9695834608085132, "language_loss": 0.78731787, "learning_rate": 3.768707209272266e-06, "loss": 0.80919635, "num_input_tokens_seen": 31652575, "step": 1497, "time_per_iteration": 2.7238569259643555 }, { "auxiliary_loss_clip": 0.01155924, "auxiliary_loss_mlp": 0.01057434, "balance_loss_clip": 1.04764557, "balance_loss_mlp": 1.03321064, "epoch": 0.18012385017735827, "flos": 18986330937600.0, "grad_norm": 2.4217359734006734, "language_loss": 0.76427162, "learning_rate": 3.768343437425579e-06, "loss": 0.78640521, "num_input_tokens_seen": 31671145, "step": 1498, "time_per_iteration": 2.686633825302124 }, { "auxiliary_loss_clip": 0.01100354, "auxiliary_loss_mlp": 0.01066152, "balance_loss_clip": 1.04504001, "balance_loss_mlp": 1.04157066, "epoch": 0.18024409306799735, "flos": 19748103598080.0, "grad_norm": 2.75915341222621, "language_loss": 0.86276853, "learning_rate": 3.7679793973225987e-06, "loss": 0.88443363, "num_input_tokens_seen": 31686955, "step": 1499, "time_per_iteration": 3.6899991035461426 }, { "auxiliary_loss_clip": 0.01034573, "auxiliary_loss_mlp": 0.01006707, "balance_loss_clip": 1.02419221, "balance_loss_mlp": 1.00332153, "epoch": 0.18036433595863643, "flos": 67227183060480.0, "grad_norm": 0.8473210015905821, "language_loss": 0.61567867, "learning_rate": 3.767615089018549e-06, "loss": 0.63609147, "num_input_tokens_seen": 31749300, "step": 1500, "time_per_iteration": 4.280498504638672 }, { "auxiliary_loss_clip": 0.01155177, "auxiliary_loss_mlp": 0.01055768, "balance_loss_clip": 1.04953432, "balance_loss_mlp": 1.02960145, "epoch": 0.18048457884927555, "flos": 18181464935040.0, "grad_norm": 2.0942168884603087, "language_loss": 0.86182129, "learning_rate": 3.7672505125686966e-06, "loss": 0.88393074, "num_input_tokens_seen": 31765665, "step": 1501, "time_per_iteration": 2.620112657546997 }, { "auxiliary_loss_clip": 0.01135836, "auxiliary_loss_mlp": 0.01065169, "balance_loss_clip": 1.04618013, "balance_loss_mlp": 1.03924096, "epoch": 0.18060482173991463, "flos": 15813767111040.0, "grad_norm": 3.5628509343304584, "language_loss": 0.84438789, "learning_rate": 3.7668856680283455e-06, "loss": 0.86639792, "num_input_tokens_seen": 31782690, "step": 1502, "time_per_iteration": 3.604592800140381 }, { "auxiliary_loss_clip": 0.01168237, "auxiliary_loss_mlp": 0.0105716, "balance_loss_clip": 1.05267286, "balance_loss_mlp": 1.03332996, "epoch": 0.1807250646305537, "flos": 18587399512320.0, "grad_norm": 3.5101264466085658, "language_loss": 0.82076383, "learning_rate": 3.7665205554528437e-06, "loss": 0.84301782, "num_input_tokens_seen": 31802045, "step": 1503, "time_per_iteration": 2.636594295501709 }, { "auxiliary_loss_clip": 0.01165116, "auxiliary_loss_mlp": 0.01052434, "balance_loss_clip": 1.05249262, "balance_loss_mlp": 1.03045237, "epoch": 0.18084530752119282, "flos": 23149131880320.0, "grad_norm": 1.7316446403127497, "language_loss": 0.74273825, "learning_rate": 3.7661551748975782e-06, "loss": 0.7649138, "num_input_tokens_seen": 31820220, "step": 1504, "time_per_iteration": 2.684216022491455 }, { "auxiliary_loss_clip": 0.01056025, "auxiliary_loss_mlp": 0.01015992, "balance_loss_clip": 1.02093089, "balance_loss_mlp": 1.01227307, "epoch": 0.1809655504118319, "flos": 59803153568640.0, "grad_norm": 0.8147537539923697, "language_loss": 0.6050539, "learning_rate": 3.7657895264179772e-06, "loss": 0.62577415, "num_input_tokens_seen": 31876195, "step": 1505, "time_per_iteration": 3.201169013977051 }, { "auxiliary_loss_clip": 0.01149179, "auxiliary_loss_mlp": 0.01058426, "balance_loss_clip": 1.04778624, "balance_loss_mlp": 1.03473878, "epoch": 0.181085793302471, "flos": 44201941188480.0, "grad_norm": 1.8995360544886744, "language_loss": 0.7437433, "learning_rate": 3.765423610069509e-06, "loss": 0.76581931, "num_input_tokens_seen": 31901585, "step": 1506, "time_per_iteration": 2.8734219074249268 }, { "auxiliary_loss_clip": 0.01162246, "auxiliary_loss_mlp": 0.01055078, "balance_loss_clip": 1.05261874, "balance_loss_mlp": 1.03265512, "epoch": 0.18120603619311007, "flos": 34898384638080.0, "grad_norm": 2.83546197551986, "language_loss": 0.7222544, "learning_rate": 3.765057425907683e-06, "loss": 0.74442768, "num_input_tokens_seen": 31923045, "step": 1507, "time_per_iteration": 2.755704641342163 }, { "auxiliary_loss_clip": 0.01175863, "auxiliary_loss_mlp": 0.01057253, "balance_loss_clip": 1.05206275, "balance_loss_mlp": 1.03379273, "epoch": 0.18132627908374918, "flos": 21506757390720.0, "grad_norm": 2.055439446664661, "language_loss": 0.78265691, "learning_rate": 3.764690973988048e-06, "loss": 0.80498815, "num_input_tokens_seen": 31943385, "step": 1508, "time_per_iteration": 2.630709648132324 }, { "auxiliary_loss_clip": 0.01148476, "auxiliary_loss_mlp": 0.01058109, "balance_loss_clip": 1.04814255, "balance_loss_mlp": 1.03503036, "epoch": 0.18144652197438826, "flos": 29057693633280.0, "grad_norm": 1.9924329299095591, "language_loss": 0.73788601, "learning_rate": 3.7643242543661967e-06, "loss": 0.75995183, "num_input_tokens_seen": 31966045, "step": 1509, "time_per_iteration": 2.729297399520874 }, { "auxiliary_loss_clip": 0.01045758, "auxiliary_loss_mlp": 0.01008075, "balance_loss_clip": 1.0164994, "balance_loss_mlp": 1.00508332, "epoch": 0.18156676486502735, "flos": 68675064382080.0, "grad_norm": 0.8287238256864792, "language_loss": 0.60499012, "learning_rate": 3.7639572670977573e-06, "loss": 0.62552845, "num_input_tokens_seen": 32021540, "step": 1510, "time_per_iteration": 3.133840560913086 }, { "auxiliary_loss_clip": 0.01153172, "auxiliary_loss_mlp": 0.01056254, "balance_loss_clip": 1.0501281, "balance_loss_mlp": 1.03218544, "epoch": 0.18168700775566646, "flos": 26471515334400.0, "grad_norm": 1.7057249228446485, "language_loss": 0.76846403, "learning_rate": 3.7635900122384042e-06, "loss": 0.79055834, "num_input_tokens_seen": 32044535, "step": 1511, "time_per_iteration": 2.7216529846191406 }, { "auxiliary_loss_clip": 0.01161226, "auxiliary_loss_mlp": 0.01052409, "balance_loss_clip": 1.04928243, "balance_loss_mlp": 1.02882898, "epoch": 0.18180725064630554, "flos": 15005668884480.0, "grad_norm": 2.368998266579264, "language_loss": 0.86769366, "learning_rate": 3.7632224898438477e-06, "loss": 0.88982999, "num_input_tokens_seen": 32061010, "step": 1512, "time_per_iteration": 2.62905216217041 }, { "auxiliary_loss_clip": 0.01150128, "auxiliary_loss_mlp": 0.01059628, "balance_loss_clip": 1.04925919, "balance_loss_mlp": 1.0363704, "epoch": 0.18192749353694462, "flos": 19682387665920.0, "grad_norm": 1.664737312860949, "language_loss": 0.79377747, "learning_rate": 3.762854699969842e-06, "loss": 0.81587493, "num_input_tokens_seen": 32081520, "step": 1513, "time_per_iteration": 2.6612637042999268 }, { "auxiliary_loss_clip": 0.01171449, "auxiliary_loss_mlp": 0.01048785, "balance_loss_clip": 1.05322218, "balance_loss_mlp": 1.02505076, "epoch": 0.1820477364275837, "flos": 20702717400960.0, "grad_norm": 2.4697342283294015, "language_loss": 0.73047221, "learning_rate": 3.762486642672179e-06, "loss": 0.75267458, "num_input_tokens_seen": 32098460, "step": 1514, "time_per_iteration": 2.6237406730651855 }, { "auxiliary_loss_clip": 0.01166372, "auxiliary_loss_mlp": 0.01053623, "balance_loss_clip": 1.05381989, "balance_loss_mlp": 1.02789748, "epoch": 0.18216797931822282, "flos": 17128708197120.0, "grad_norm": 1.9849397568334193, "language_loss": 0.86900502, "learning_rate": 3.7621183180066946e-06, "loss": 0.89120495, "num_input_tokens_seen": 32116420, "step": 1515, "time_per_iteration": 2.6294777393341064 }, { "auxiliary_loss_clip": 0.01157659, "auxiliary_loss_mlp": 0.01057582, "balance_loss_clip": 1.04722548, "balance_loss_mlp": 1.0341574, "epoch": 0.1822882222088619, "flos": 29242561956480.0, "grad_norm": 1.9594673837951442, "language_loss": 0.73418415, "learning_rate": 3.7617497260292625e-06, "loss": 0.75633657, "num_input_tokens_seen": 32138475, "step": 1516, "time_per_iteration": 2.703848361968994 }, { "auxiliary_loss_clip": 0.01155543, "auxiliary_loss_mlp": 0.01058574, "balance_loss_clip": 1.0517875, "balance_loss_mlp": 1.03424311, "epoch": 0.18240846509950098, "flos": 17702739446400.0, "grad_norm": 3.243582281496762, "language_loss": 0.78330564, "learning_rate": 3.7613808667957967e-06, "loss": 0.8054468, "num_input_tokens_seen": 32151165, "step": 1517, "time_per_iteration": 2.598205089569092 }, { "auxiliary_loss_clip": 0.01159684, "auxiliary_loss_mlp": 0.0105322, "balance_loss_clip": 1.04994678, "balance_loss_mlp": 1.02976, "epoch": 0.1825287079901401, "flos": 14790025584000.0, "grad_norm": 3.8944067181773434, "language_loss": 0.90445244, "learning_rate": 3.7610117403622547e-06, "loss": 0.9265815, "num_input_tokens_seen": 32167725, "step": 1518, "time_per_iteration": 2.5752484798431396 }, { "auxiliary_loss_clip": 0.01138315, "auxiliary_loss_mlp": 0.0106825, "balance_loss_clip": 1.04578435, "balance_loss_mlp": 1.04327536, "epoch": 0.18264895088077918, "flos": 21946232292480.0, "grad_norm": 1.768436209419491, "language_loss": 0.89937174, "learning_rate": 3.7606423467846313e-06, "loss": 0.92143738, "num_input_tokens_seen": 32187330, "step": 1519, "time_per_iteration": 2.6797633171081543 }, { "auxiliary_loss_clip": 0.01158192, "auxiliary_loss_mlp": 0.01058658, "balance_loss_clip": 1.05341339, "balance_loss_mlp": 1.03658056, "epoch": 0.18276919377141826, "flos": 20886759711360.0, "grad_norm": 1.5067362155069464, "language_loss": 0.79560137, "learning_rate": 3.760272686118964e-06, "loss": 0.81776989, "num_input_tokens_seen": 32205550, "step": 1520, "time_per_iteration": 2.6484766006469727 }, { "auxiliary_loss_clip": 0.011618, "auxiliary_loss_mlp": 0.01063604, "balance_loss_clip": 1.04946446, "balance_loss_mlp": 1.03884375, "epoch": 0.18288943666205737, "flos": 21469877101440.0, "grad_norm": 2.30503240425681, "language_loss": 0.92659074, "learning_rate": 3.7599027584213297e-06, "loss": 0.94884479, "num_input_tokens_seen": 32224430, "step": 1521, "time_per_iteration": 2.655045986175537 }, { "auxiliary_loss_clip": 0.01178425, "auxiliary_loss_mlp": 0.01064853, "balance_loss_clip": 1.05190122, "balance_loss_mlp": 1.04110599, "epoch": 0.18300967955269645, "flos": 21539363961600.0, "grad_norm": 2.5663514219479926, "language_loss": 0.78152591, "learning_rate": 3.7595325637478465e-06, "loss": 0.80395865, "num_input_tokens_seen": 32242455, "step": 1522, "time_per_iteration": 3.594806432723999 }, { "auxiliary_loss_clip": 0.01150193, "auxiliary_loss_mlp": 0.01060324, "balance_loss_clip": 1.05190682, "balance_loss_mlp": 1.03632724, "epoch": 0.18312992244333554, "flos": 28876237102080.0, "grad_norm": 2.3146572965613244, "language_loss": 0.81901336, "learning_rate": 3.7591621021546723e-06, "loss": 0.84111851, "num_input_tokens_seen": 32264450, "step": 1523, "time_per_iteration": 2.6806223392486572 }, { "auxiliary_loss_clip": 0.01166812, "auxiliary_loss_mlp": 0.01077061, "balance_loss_clip": 1.04993844, "balance_loss_mlp": 1.05051267, "epoch": 0.18325016533397462, "flos": 20120102801280.0, "grad_norm": 1.9743902678649115, "language_loss": 0.81544268, "learning_rate": 3.7587913736980062e-06, "loss": 0.83788139, "num_input_tokens_seen": 32284090, "step": 1524, "time_per_iteration": 3.498072624206543 }, { "auxiliary_loss_clip": 0.01104611, "auxiliary_loss_mlp": 0.01061818, "balance_loss_clip": 1.04175282, "balance_loss_mlp": 1.03779721, "epoch": 0.18337040822461373, "flos": 23329187781120.0, "grad_norm": 2.1225352834096203, "language_loss": 0.84508628, "learning_rate": 3.7584203784340865e-06, "loss": 0.8667506, "num_input_tokens_seen": 32303260, "step": 1525, "time_per_iteration": 2.713181495666504 }, { "auxiliary_loss_clip": 0.0115629, "auxiliary_loss_mlp": 0.01058875, "balance_loss_clip": 1.04960227, "balance_loss_mlp": 1.03561723, "epoch": 0.1834906511152528, "flos": 25009555881600.0, "grad_norm": 2.603143587912044, "language_loss": 0.86046588, "learning_rate": 3.7580491164191938e-06, "loss": 0.88261753, "num_input_tokens_seen": 32321570, "step": 1526, "time_per_iteration": 2.6484179496765137 }, { "auxiliary_loss_clip": 0.01072736, "auxiliary_loss_mlp": 0.01014555, "balance_loss_clip": 1.0289464, "balance_loss_mlp": 1.01119363, "epoch": 0.1836108940058919, "flos": 67251493589760.0, "grad_norm": 0.7622184749490204, "language_loss": 0.61331737, "learning_rate": 3.757677587709648e-06, "loss": 0.6341902, "num_input_tokens_seen": 32384835, "step": 1527, "time_per_iteration": 4.303121328353882 }, { "auxiliary_loss_clip": 0.01147259, "auxiliary_loss_mlp": 0.01057405, "balance_loss_clip": 1.0509994, "balance_loss_mlp": 1.03274107, "epoch": 0.183731136896531, "flos": 25738721971200.0, "grad_norm": 2.0822502701994874, "language_loss": 0.75599569, "learning_rate": 3.7573057923618095e-06, "loss": 0.77804232, "num_input_tokens_seen": 32404930, "step": 1528, "time_per_iteration": 3.6901724338531494 }, { "auxiliary_loss_clip": 0.01130123, "auxiliary_loss_mlp": 0.01057734, "balance_loss_clip": 1.04304194, "balance_loss_mlp": 1.03322446, "epoch": 0.1838513797871701, "flos": 20449403712000.0, "grad_norm": 2.050865638881101, "language_loss": 0.74590969, "learning_rate": 3.7569337304320793e-06, "loss": 0.76778829, "num_input_tokens_seen": 32424515, "step": 1529, "time_per_iteration": 2.7328615188598633 }, { "auxiliary_loss_clip": 0.01055975, "auxiliary_loss_mlp": 0.01006372, "balance_loss_clip": 1.02367198, "balance_loss_mlp": 1.00293863, "epoch": 0.18397162267780917, "flos": 68565141786240.0, "grad_norm": 0.8451954941569445, "language_loss": 0.64466524, "learning_rate": 3.756561401976899e-06, "loss": 0.66528869, "num_input_tokens_seen": 32484220, "step": 1530, "time_per_iteration": 3.1045026779174805 }, { "auxiliary_loss_clip": 0.01188916, "auxiliary_loss_mlp": 0.01051236, "balance_loss_clip": 1.05540657, "balance_loss_mlp": 1.02787137, "epoch": 0.18409186556844825, "flos": 31941104976000.0, "grad_norm": 1.8470336662356908, "language_loss": 0.82232606, "learning_rate": 3.7561888070527514e-06, "loss": 0.84472758, "num_input_tokens_seen": 32506260, "step": 1531, "time_per_iteration": 2.665390729904175 }, { "auxiliary_loss_clip": 0.01130309, "auxiliary_loss_mlp": 0.00777952, "balance_loss_clip": 1.04575133, "balance_loss_mlp": 1.00116646, "epoch": 0.18421210845908736, "flos": 20120533764480.0, "grad_norm": 2.350174824412832, "language_loss": 0.80131787, "learning_rate": 3.7558159457161577e-06, "loss": 0.82040048, "num_input_tokens_seen": 32524225, "step": 1532, "time_per_iteration": 2.6585941314697266 }, { "auxiliary_loss_clip": 0.01165176, "auxiliary_loss_mlp": 0.00778058, "balance_loss_clip": 1.05260348, "balance_loss_mlp": 1.00105023, "epoch": 0.18433235134972645, "flos": 23110491824640.0, "grad_norm": 2.432007514062363, "language_loss": 0.77834314, "learning_rate": 3.755442818023681e-06, "loss": 0.79777557, "num_input_tokens_seen": 32543850, "step": 1533, "time_per_iteration": 2.636549711227417 }, { "auxiliary_loss_clip": 0.01145205, "auxiliary_loss_mlp": 0.0105405, "balance_loss_clip": 1.04642391, "balance_loss_mlp": 1.03172159, "epoch": 0.18445259424036553, "flos": 18291351617280.0, "grad_norm": 2.0807834927235263, "language_loss": 0.76213062, "learning_rate": 3.7550694240319246e-06, "loss": 0.78412318, "num_input_tokens_seen": 32561725, "step": 1534, "time_per_iteration": 2.648491144180298 }, { "auxiliary_loss_clip": 0.01173746, "auxiliary_loss_mlp": 0.01061706, "balance_loss_clip": 1.0511508, "balance_loss_mlp": 1.03738761, "epoch": 0.18457283713100464, "flos": 21324079797120.0, "grad_norm": 9.440376032719751, "language_loss": 0.76168358, "learning_rate": 3.7546957637975326e-06, "loss": 0.78403807, "num_input_tokens_seen": 32579135, "step": 1535, "time_per_iteration": 2.6079018115997314 }, { "auxiliary_loss_clip": 0.01107121, "auxiliary_loss_mlp": 0.01055069, "balance_loss_clip": 1.03913987, "balance_loss_mlp": 1.03054762, "epoch": 0.18469308002164372, "flos": 20375679047040.0, "grad_norm": 1.5938579121809953, "language_loss": 0.74355698, "learning_rate": 3.7543218373771873e-06, "loss": 0.76517886, "num_input_tokens_seen": 32598460, "step": 1536, "time_per_iteration": 2.7191410064697266 }, { "auxiliary_loss_clip": 0.01107884, "auxiliary_loss_mlp": 0.00780581, "balance_loss_clip": 1.04267323, "balance_loss_mlp": 1.00112319, "epoch": 0.1848133229122828, "flos": 26435892021120.0, "grad_norm": 1.4172123683871425, "language_loss": 0.78216767, "learning_rate": 3.753947644827615e-06, "loss": 0.80105233, "num_input_tokens_seen": 32621920, "step": 1537, "time_per_iteration": 2.756021738052368 }, { "auxiliary_loss_clip": 0.01052424, "auxiliary_loss_mlp": 0.01032177, "balance_loss_clip": 1.01985741, "balance_loss_mlp": 1.02891064, "epoch": 0.1849335658029219, "flos": 70547447612160.0, "grad_norm": 0.9525311278614025, "language_loss": 0.57232863, "learning_rate": 3.753573186205579e-06, "loss": 0.59317464, "num_input_tokens_seen": 32690040, "step": 1538, "time_per_iteration": 3.323310136795044 }, { "auxiliary_loss_clip": 0.01144495, "auxiliary_loss_mlp": 0.00777804, "balance_loss_clip": 1.04628706, "balance_loss_mlp": 1.0009799, "epoch": 0.185053808693561, "flos": 17384140788480.0, "grad_norm": 2.379441334761577, "language_loss": 0.78502047, "learning_rate": 3.753198461567885e-06, "loss": 0.80424345, "num_input_tokens_seen": 32707285, "step": 1539, "time_per_iteration": 2.6397125720977783 }, { "auxiliary_loss_clip": 0.01139059, "auxiliary_loss_mlp": 0.01062647, "balance_loss_clip": 1.05000103, "balance_loss_mlp": 1.04020023, "epoch": 0.18517405158420008, "flos": 28986159697920.0, "grad_norm": 1.89768502214466, "language_loss": 0.92056155, "learning_rate": 3.7528234709713783e-06, "loss": 0.94257867, "num_input_tokens_seen": 32730030, "step": 1540, "time_per_iteration": 2.7484335899353027 }, { "auxiliary_loss_clip": 0.01173259, "auxiliary_loss_mlp": 0.01057957, "balance_loss_clip": 1.05140936, "balance_loss_mlp": 1.03660631, "epoch": 0.18529429447483917, "flos": 26794962328320.0, "grad_norm": 2.289083853962301, "language_loss": 0.84231329, "learning_rate": 3.7524482144729447e-06, "loss": 0.86462545, "num_input_tokens_seen": 32749485, "step": 1541, "time_per_iteration": 2.6443734169006348 }, { "auxiliary_loss_clip": 0.01133132, "auxiliary_loss_mlp": 0.01066217, "balance_loss_clip": 1.04335904, "balance_loss_mlp": 1.0407058, "epoch": 0.18541453736547828, "flos": 13581595301760.0, "grad_norm": 3.096941520152919, "language_loss": 0.83188981, "learning_rate": 3.7520726921295106e-06, "loss": 0.85388327, "num_input_tokens_seen": 32766205, "step": 1542, "time_per_iteration": 2.6315226554870605 }, { "auxiliary_loss_clip": 0.01167295, "auxiliary_loss_mlp": 0.01055132, "balance_loss_clip": 1.04924798, "balance_loss_mlp": 1.03310251, "epoch": 0.18553478025611736, "flos": 24025424077440.0, "grad_norm": 1.9249829897132953, "language_loss": 0.7248587, "learning_rate": 3.751696903998042e-06, "loss": 0.74708301, "num_input_tokens_seen": 32784840, "step": 1543, "time_per_iteration": 2.665055513381958 }, { "auxiliary_loss_clip": 0.01164178, "auxiliary_loss_mlp": 0.01049558, "balance_loss_clip": 1.04899454, "balance_loss_mlp": 1.02706325, "epoch": 0.18565502314675644, "flos": 25885165720320.0, "grad_norm": 2.4644546529208173, "language_loss": 0.7057265, "learning_rate": 3.7513208501355456e-06, "loss": 0.72786385, "num_input_tokens_seen": 32805945, "step": 1544, "time_per_iteration": 2.632448196411133 }, { "auxiliary_loss_clip": 0.01156914, "auxiliary_loss_mlp": 0.01052152, "balance_loss_clip": 1.0518086, "balance_loss_mlp": 1.02988386, "epoch": 0.18577526603739553, "flos": 19610063631360.0, "grad_norm": 2.7003651031715723, "language_loss": 0.84136188, "learning_rate": 3.750944530599069e-06, "loss": 0.86345255, "num_input_tokens_seen": 32825515, "step": 1545, "time_per_iteration": 2.61381196975708 }, { "auxiliary_loss_clip": 0.01177073, "auxiliary_loss_mlp": 0.01047067, "balance_loss_clip": 1.05206573, "balance_loss_mlp": 1.0236783, "epoch": 0.18589550892803464, "flos": 18474891137280.0, "grad_norm": 3.123072826069401, "language_loss": 0.80769515, "learning_rate": 3.7505679454456992e-06, "loss": 0.8299365, "num_input_tokens_seen": 32842125, "step": 1546, "time_per_iteration": 2.6401383876800537 }, { "auxiliary_loss_clip": 0.01097859, "auxiliary_loss_mlp": 0.01069082, "balance_loss_clip": 1.04143834, "balance_loss_mlp": 1.04551411, "epoch": 0.18601575181867372, "flos": 23549966726400.0, "grad_norm": 2.9058917331597023, "language_loss": 0.70145512, "learning_rate": 3.750191094732564e-06, "loss": 0.7231245, "num_input_tokens_seen": 32862990, "step": 1547, "time_per_iteration": 2.866480588912964 }, { "auxiliary_loss_clip": 0.01097554, "auxiliary_loss_mlp": 0.00778643, "balance_loss_clip": 1.04074788, "balance_loss_mlp": 1.00090873, "epoch": 0.1861359947093128, "flos": 26360192108160.0, "grad_norm": 3.001603631426388, "language_loss": 0.75353932, "learning_rate": 3.7498139785168313e-06, "loss": 0.77230132, "num_input_tokens_seen": 32883595, "step": 1548, "time_per_iteration": 4.330397367477417 }, { "auxiliary_loss_clip": 0.01170418, "auxiliary_loss_mlp": 0.01046412, "balance_loss_clip": 1.05353165, "balance_loss_mlp": 1.02409601, "epoch": 0.1862562375999519, "flos": 23331198942720.0, "grad_norm": 2.275463162736257, "language_loss": 0.77242446, "learning_rate": 3.749436596855709e-06, "loss": 0.79459286, "num_input_tokens_seen": 32902895, "step": 1549, "time_per_iteration": 3.5967657566070557 }, { "auxiliary_loss_clip": 0.01171256, "auxiliary_loss_mlp": 0.01073129, "balance_loss_clip": 1.05353212, "balance_loss_mlp": 1.0482856, "epoch": 0.186376480490591, "flos": 16648222942080.0, "grad_norm": 2.2445580215987238, "language_loss": 0.90609622, "learning_rate": 3.749058949806446e-06, "loss": 0.92854011, "num_input_tokens_seen": 32919620, "step": 1550, "time_per_iteration": 2.605219602584839 }, { "auxiliary_loss_clip": 0.01171517, "auxiliary_loss_mlp": 0.0104864, "balance_loss_clip": 1.05108798, "balance_loss_mlp": 1.02554941, "epoch": 0.18649672338123008, "flos": 21468656039040.0, "grad_norm": 1.8422651502642036, "language_loss": 0.84469378, "learning_rate": 3.748681037426331e-06, "loss": 0.86689532, "num_input_tokens_seen": 32938830, "step": 1551, "time_per_iteration": 2.6058948040008545 }, { "auxiliary_loss_clip": 0.01183749, "auxiliary_loss_mlp": 0.01063163, "balance_loss_clip": 1.05440903, "balance_loss_mlp": 1.03895116, "epoch": 0.1866169662718692, "flos": 12312728386560.0, "grad_norm": 2.194517738871919, "language_loss": 0.91902649, "learning_rate": 3.7483028597726936e-06, "loss": 0.94149566, "num_input_tokens_seen": 32955600, "step": 1552, "time_per_iteration": 3.5738582611083984 }, { "auxiliary_loss_clip": 0.01150054, "auxiliary_loss_mlp": 0.01058698, "balance_loss_clip": 1.05044711, "balance_loss_mlp": 1.03353262, "epoch": 0.18673720916250827, "flos": 23581280407680.0, "grad_norm": 2.164780070937584, "language_loss": 0.62869889, "learning_rate": 3.7479244169029017e-06, "loss": 0.6507864, "num_input_tokens_seen": 32975390, "step": 1553, "time_per_iteration": 2.6931533813476562 }, { "auxiliary_loss_clip": 0.01173752, "auxiliary_loss_mlp": 0.01050974, "balance_loss_clip": 1.05005527, "balance_loss_mlp": 1.02750134, "epoch": 0.18685745205314735, "flos": 19718370115200.0, "grad_norm": 3.4931773367182077, "language_loss": 0.73700583, "learning_rate": 3.7475457088743658e-06, "loss": 0.75925303, "num_input_tokens_seen": 32992640, "step": 1554, "time_per_iteration": 2.57950496673584 }, { "auxiliary_loss_clip": 0.01150205, "auxiliary_loss_mlp": 0.01068244, "balance_loss_clip": 1.04876363, "balance_loss_mlp": 1.04152918, "epoch": 0.18697769494378644, "flos": 34204123589760.0, "grad_norm": 1.8582088492110695, "language_loss": 0.7472415, "learning_rate": 3.7471667357445348e-06, "loss": 0.76942599, "num_input_tokens_seen": 33012470, "step": 1555, "time_per_iteration": 3.637296199798584 }, { "auxiliary_loss_clip": 0.01121399, "auxiliary_loss_mlp": 0.01055874, "balance_loss_clip": 1.04533386, "balance_loss_mlp": 1.03274691, "epoch": 0.18709793783442555, "flos": 34241327101440.0, "grad_norm": 2.0485047322217302, "language_loss": 0.72217238, "learning_rate": 3.7467874975709e-06, "loss": 0.74394506, "num_input_tokens_seen": 33033275, "step": 1556, "time_per_iteration": 2.8386480808258057 }, { "auxiliary_loss_clip": 0.0117606, "auxiliary_loss_mlp": 0.01055415, "balance_loss_clip": 1.05426407, "balance_loss_mlp": 1.031919, "epoch": 0.18721818072506463, "flos": 40734550529280.0, "grad_norm": 2.2024980671900716, "language_loss": 0.78586483, "learning_rate": 3.7464079944109904e-06, "loss": 0.80817956, "num_input_tokens_seen": 33055135, "step": 1557, "time_per_iteration": 2.7661848068237305 }, { "auxiliary_loss_clip": 0.01140432, "auxiliary_loss_mlp": 0.01050417, "balance_loss_clip": 1.04665303, "balance_loss_mlp": 1.02816021, "epoch": 0.18733842361570371, "flos": 22157386392960.0, "grad_norm": 2.1059992950430027, "language_loss": 0.7768563, "learning_rate": 3.746028226322376e-06, "loss": 0.79876471, "num_input_tokens_seen": 33071015, "step": 1558, "time_per_iteration": 2.642841339111328 }, { "auxiliary_loss_clip": 0.01151973, "auxiliary_loss_mlp": 0.0105537, "balance_loss_clip": 1.04895937, "balance_loss_mlp": 1.03307772, "epoch": 0.18745866650634282, "flos": 18914940656640.0, "grad_norm": 2.108317704236277, "language_loss": 0.75391138, "learning_rate": 3.745648193362669e-06, "loss": 0.77598488, "num_input_tokens_seen": 33090370, "step": 1559, "time_per_iteration": 2.662475109100342 }, { "auxiliary_loss_clip": 0.01157557, "auxiliary_loss_mlp": 0.01048373, "balance_loss_clip": 1.04876423, "balance_loss_mlp": 1.02617621, "epoch": 0.1875789093969819, "flos": 19314626267520.0, "grad_norm": 2.4586406535050505, "language_loss": 0.72175562, "learning_rate": 3.745267895589518e-06, "loss": 0.74381495, "num_input_tokens_seen": 33108910, "step": 1560, "time_per_iteration": 2.6096179485321045 }, { "auxiliary_loss_clip": 0.01162181, "auxiliary_loss_mlp": 0.01051192, "balance_loss_clip": 1.04956555, "balance_loss_mlp": 1.02755284, "epoch": 0.187699152287621, "flos": 17018965169280.0, "grad_norm": 1.901505271046359, "language_loss": 0.82009357, "learning_rate": 3.7448873330606154e-06, "loss": 0.84222728, "num_input_tokens_seen": 33126680, "step": 1561, "time_per_iteration": 2.620602607727051 }, { "auxiliary_loss_clip": 0.01139861, "auxiliary_loss_mlp": 0.01054475, "balance_loss_clip": 1.04900599, "balance_loss_mlp": 1.03052545, "epoch": 0.18781939517826007, "flos": 22346384780160.0, "grad_norm": 2.2436722694826177, "language_loss": 0.87046409, "learning_rate": 3.7445065058336914e-06, "loss": 0.89240742, "num_input_tokens_seen": 33145550, "step": 1562, "time_per_iteration": 2.7081804275512695 }, { "auxiliary_loss_clip": 0.01115824, "auxiliary_loss_mlp": 0.0106472, "balance_loss_clip": 1.0437324, "balance_loss_mlp": 1.03873205, "epoch": 0.18793963806889918, "flos": 14611478054400.0, "grad_norm": 2.9668286442987286, "language_loss": 0.85944116, "learning_rate": 3.7441254139665176e-06, "loss": 0.88124663, "num_input_tokens_seen": 33161735, "step": 1563, "time_per_iteration": 2.6624369621276855 }, { "auxiliary_loss_clip": 0.01184213, "auxiliary_loss_mlp": 0.01054385, "balance_loss_clip": 1.05589223, "balance_loss_mlp": 1.0316751, "epoch": 0.18805988095953827, "flos": 17457075354240.0, "grad_norm": 2.1231549376810324, "language_loss": 0.82166749, "learning_rate": 3.743744057516905e-06, "loss": 0.84405345, "num_input_tokens_seen": 33179795, "step": 1564, "time_per_iteration": 2.571350336074829 }, { "auxiliary_loss_clip": 0.01131926, "auxiliary_loss_mlp": 0.01051333, "balance_loss_clip": 1.0477283, "balance_loss_mlp": 1.02852869, "epoch": 0.18818012385017735, "flos": 15043877976960.0, "grad_norm": 2.8680745023361247, "language_loss": 0.87418383, "learning_rate": 3.743362436542706e-06, "loss": 0.89601642, "num_input_tokens_seen": 33194485, "step": 1565, "time_per_iteration": 2.6825435161590576 }, { "auxiliary_loss_clip": 0.01183582, "auxiliary_loss_mlp": 0.01057586, "balance_loss_clip": 1.05209184, "balance_loss_mlp": 1.03344607, "epoch": 0.18830036674081646, "flos": 47551975136640.0, "grad_norm": 2.2982484001408254, "language_loss": 0.76699042, "learning_rate": 3.7429805511018115e-06, "loss": 0.78940213, "num_input_tokens_seen": 33216145, "step": 1566, "time_per_iteration": 2.7764623165130615 }, { "auxiliary_loss_clip": 0.01146289, "auxiliary_loss_mlp": 0.00778934, "balance_loss_clip": 1.05276704, "balance_loss_mlp": 1.00068796, "epoch": 0.18842060963145554, "flos": 30044626698240.0, "grad_norm": 1.976712378854774, "language_loss": 0.78301257, "learning_rate": 3.7425984012521524e-06, "loss": 0.80226481, "num_input_tokens_seen": 33236345, "step": 1567, "time_per_iteration": 2.746161699295044 }, { "auxiliary_loss_clip": 0.01038019, "auxiliary_loss_mlp": 0.00759337, "balance_loss_clip": 1.02035403, "balance_loss_mlp": 1.00102758, "epoch": 0.18854085252209463, "flos": 70318372625280.0, "grad_norm": 0.7414185660727787, "language_loss": 0.60467947, "learning_rate": 3.7422159870517025e-06, "loss": 0.62265313, "num_input_tokens_seen": 33301600, "step": 1568, "time_per_iteration": 3.282858371734619 }, { "auxiliary_loss_clip": 0.01152187, "auxiliary_loss_mlp": 0.01051898, "balance_loss_clip": 1.04712009, "balance_loss_mlp": 1.02992809, "epoch": 0.1886610954127337, "flos": 21289318410240.0, "grad_norm": 1.784863934389183, "language_loss": 0.78673851, "learning_rate": 3.7418333085584717e-06, "loss": 0.80877942, "num_input_tokens_seen": 33322785, "step": 1569, "time_per_iteration": 2.727992534637451 }, { "auxiliary_loss_clip": 0.01148759, "auxiliary_loss_mlp": 0.01053224, "balance_loss_clip": 1.05175209, "balance_loss_mlp": 1.02973962, "epoch": 0.18878133830337282, "flos": 17266819991040.0, "grad_norm": 2.1393010981616802, "language_loss": 0.90859002, "learning_rate": 3.7414503658305128e-06, "loss": 0.93060982, "num_input_tokens_seen": 33340020, "step": 1570, "time_per_iteration": 2.6753315925598145 }, { "auxiliary_loss_clip": 0.01148717, "auxiliary_loss_mlp": 0.01057714, "balance_loss_clip": 1.05061698, "balance_loss_mlp": 1.0338006, "epoch": 0.1889015811940119, "flos": 25775207210880.0, "grad_norm": 2.768061645785404, "language_loss": 0.77294379, "learning_rate": 3.7410671589259185e-06, "loss": 0.79500806, "num_input_tokens_seen": 33358620, "step": 1571, "time_per_iteration": 2.708730459213257 }, { "auxiliary_loss_clip": 0.01189923, "auxiliary_loss_mlp": 0.01058601, "balance_loss_clip": 1.05495226, "balance_loss_mlp": 1.03485489, "epoch": 0.18902182408465099, "flos": 21032197879680.0, "grad_norm": 2.6191565970522257, "language_loss": 0.79904437, "learning_rate": 3.7406836879028205e-06, "loss": 0.82152957, "num_input_tokens_seen": 33378845, "step": 1572, "time_per_iteration": 2.575251340866089 }, { "auxiliary_loss_clip": 0.01169244, "auxiliary_loss_mlp": 0.01054263, "balance_loss_clip": 1.05092216, "balance_loss_mlp": 1.03299618, "epoch": 0.1891420669752901, "flos": 22272121411200.0, "grad_norm": 4.279860485237004, "language_loss": 0.76747239, "learning_rate": 3.7402999528193907e-06, "loss": 0.78970748, "num_input_tokens_seen": 33398345, "step": 1573, "time_per_iteration": 2.699420213699341 }, { "auxiliary_loss_clip": 0.01133291, "auxiliary_loss_mlp": 0.00780796, "balance_loss_clip": 1.04741943, "balance_loss_mlp": 1.00056386, "epoch": 0.18926230986592918, "flos": 22017802141440.0, "grad_norm": 13.598801650056798, "language_loss": 0.85879481, "learning_rate": 3.739915953733842e-06, "loss": 0.87793565, "num_input_tokens_seen": 33416390, "step": 1574, "time_per_iteration": 3.647568464279175 }, { "auxiliary_loss_clip": 0.01182641, "auxiliary_loss_mlp": 0.01063974, "balance_loss_clip": 1.05182374, "balance_loss_mlp": 1.04019201, "epoch": 0.18938255275656826, "flos": 24462672336000.0, "grad_norm": 1.8831416036351183, "language_loss": 0.81766224, "learning_rate": 3.7395316907044264e-06, "loss": 0.84012842, "num_input_tokens_seen": 33437175, "step": 1575, "time_per_iteration": 3.5251624584198 }, { "auxiliary_loss_clip": 0.01173089, "auxiliary_loss_mlp": 0.01054337, "balance_loss_clip": 1.05181253, "balance_loss_mlp": 1.03059089, "epoch": 0.18950279564720737, "flos": 24427049022720.0, "grad_norm": 2.1871227835759233, "language_loss": 0.79755914, "learning_rate": 3.7391471637894364e-06, "loss": 0.8198334, "num_input_tokens_seen": 33459440, "step": 1576, "time_per_iteration": 2.6490020751953125 }, { "auxiliary_loss_clip": 0.01145343, "auxiliary_loss_mlp": 0.01054034, "balance_loss_clip": 1.04672194, "balance_loss_mlp": 1.02936983, "epoch": 0.18962303853784646, "flos": 19756291898880.0, "grad_norm": 1.883579915201479, "language_loss": 0.84585398, "learning_rate": 3.738762373047205e-06, "loss": 0.8678478, "num_input_tokens_seen": 33479360, "step": 1577, "time_per_iteration": 2.731520891189575 }, { "auxiliary_loss_clip": 0.01149983, "auxiliary_loss_mlp": 0.01055596, "balance_loss_clip": 1.05288506, "balance_loss_mlp": 1.03298259, "epoch": 0.18974328142848554, "flos": 21032054225280.0, "grad_norm": 1.6810580159729998, "language_loss": 0.83546644, "learning_rate": 3.738377318536103e-06, "loss": 0.85752225, "num_input_tokens_seen": 33499245, "step": 1578, "time_per_iteration": 3.6586949825286865 }, { "auxiliary_loss_clip": 0.01181237, "auxiliary_loss_mlp": 0.01055828, "balance_loss_clip": 1.0553273, "balance_loss_mlp": 1.03315425, "epoch": 0.18986352431912462, "flos": 12966122736000.0, "grad_norm": 14.441576269828037, "language_loss": 0.71429813, "learning_rate": 3.7379920003145447e-06, "loss": 0.73666883, "num_input_tokens_seen": 33513520, "step": 1579, "time_per_iteration": 2.5155861377716064 }, { "auxiliary_loss_clip": 0.01152113, "auxiliary_loss_mlp": 0.0105848, "balance_loss_clip": 1.0503819, "balance_loss_mlp": 1.03320742, "epoch": 0.18998376720976373, "flos": 23767908497280.0, "grad_norm": 2.5637832848635926, "language_loss": 0.836061, "learning_rate": 3.7376064184409817e-06, "loss": 0.85816687, "num_input_tokens_seen": 33533100, "step": 1580, "time_per_iteration": 3.566329002380371 }, { "auxiliary_loss_clip": 0.01162555, "auxiliary_loss_mlp": 0.01058511, "balance_loss_clip": 1.05454826, "balance_loss_mlp": 1.03433573, "epoch": 0.19010401010040281, "flos": 22966023323520.0, "grad_norm": 1.8349876810069583, "language_loss": 0.86979496, "learning_rate": 3.7372205729739063e-06, "loss": 0.89200568, "num_input_tokens_seen": 33554915, "step": 1581, "time_per_iteration": 2.681873083114624 }, { "auxiliary_loss_clip": 0.01176756, "auxiliary_loss_mlp": 0.01055637, "balance_loss_clip": 1.05628824, "balance_loss_mlp": 1.03217697, "epoch": 0.1902242529910419, "flos": 19135647774720.0, "grad_norm": 2.047955470212333, "language_loss": 0.71431863, "learning_rate": 3.7368344639718514e-06, "loss": 0.7366426, "num_input_tokens_seen": 33572850, "step": 1582, "time_per_iteration": 2.5725040435791016 }, { "auxiliary_loss_clip": 0.01176491, "auxiliary_loss_mlp": 0.01058755, "balance_loss_clip": 1.05426431, "balance_loss_mlp": 1.03447187, "epoch": 0.190344495881681, "flos": 25483935824640.0, "grad_norm": 1.6729982629583957, "language_loss": 0.80817562, "learning_rate": 3.7364480914933895e-06, "loss": 0.83052802, "num_input_tokens_seen": 33593090, "step": 1583, "time_per_iteration": 2.728869676589966 }, { "auxiliary_loss_clip": 0.01132129, "auxiliary_loss_mlp": 0.00779242, "balance_loss_clip": 1.05007493, "balance_loss_mlp": 1.00047541, "epoch": 0.1904647387723201, "flos": 26792843425920.0, "grad_norm": 2.365464067072429, "language_loss": 0.80509394, "learning_rate": 3.7360614555971325e-06, "loss": 0.82420766, "num_input_tokens_seen": 33612745, "step": 1584, "time_per_iteration": 2.7293283939361572 }, { "auxiliary_loss_clip": 0.01177658, "auxiliary_loss_mlp": 0.00777993, "balance_loss_clip": 1.05642796, "balance_loss_mlp": 1.00052631, "epoch": 0.19058498166295917, "flos": 23987753688960.0, "grad_norm": 2.034502418614161, "language_loss": 0.84995806, "learning_rate": 3.735674556341733e-06, "loss": 0.86951458, "num_input_tokens_seen": 33632360, "step": 1585, "time_per_iteration": 2.661513090133667 }, { "auxiliary_loss_clip": 0.01159215, "auxiliary_loss_mlp": 0.0105636, "balance_loss_clip": 1.05554557, "balance_loss_mlp": 1.03194618, "epoch": 0.19070522455359826, "flos": 28293299280000.0, "grad_norm": 2.5201599838738185, "language_loss": 0.82877576, "learning_rate": 3.7352873937858835e-06, "loss": 0.85093153, "num_input_tokens_seen": 33653895, "step": 1586, "time_per_iteration": 2.689398765563965 }, { "auxiliary_loss_clip": 0.01140054, "auxiliary_loss_mlp": 0.00778834, "balance_loss_clip": 1.04960179, "balance_loss_mlp": 1.00044203, "epoch": 0.19082546744423737, "flos": 25660220797440.0, "grad_norm": 2.0087531156261647, "language_loss": 0.71981174, "learning_rate": 3.734899967988316e-06, "loss": 0.73900068, "num_input_tokens_seen": 33672075, "step": 1587, "time_per_iteration": 2.7214906215667725 }, { "auxiliary_loss_clip": 0.01135156, "auxiliary_loss_mlp": 0.01049395, "balance_loss_clip": 1.04931808, "balance_loss_mlp": 1.02539802, "epoch": 0.19094571033487645, "flos": 19719483436800.0, "grad_norm": 2.102867644995441, "language_loss": 0.83891892, "learning_rate": 3.7345122790078026e-06, "loss": 0.8607645, "num_input_tokens_seen": 33689640, "step": 1588, "time_per_iteration": 2.6617157459259033 }, { "auxiliary_loss_clip": 0.01175213, "auxiliary_loss_mlp": 0.01058356, "balance_loss_clip": 1.05590677, "balance_loss_mlp": 1.03379846, "epoch": 0.19106595322551553, "flos": 21616320850560.0, "grad_norm": 2.815437665026172, "language_loss": 0.92737639, "learning_rate": 3.7341243269031556e-06, "loss": 0.94971204, "num_input_tokens_seen": 33708630, "step": 1589, "time_per_iteration": 2.639404296875 }, { "auxiliary_loss_clip": 0.01149943, "auxiliary_loss_mlp": 0.01059799, "balance_loss_clip": 1.04948783, "balance_loss_mlp": 1.0367676, "epoch": 0.19118619611615464, "flos": 29896890059520.0, "grad_norm": 1.6992373550604567, "language_loss": 0.77552235, "learning_rate": 3.7337361117332275e-06, "loss": 0.79761982, "num_input_tokens_seen": 33730370, "step": 1590, "time_per_iteration": 2.6913042068481445 }, { "auxiliary_loss_clip": 0.01147064, "auxiliary_loss_mlp": 0.01063953, "balance_loss_clip": 1.04935014, "balance_loss_mlp": 1.04113603, "epoch": 0.19130643900679373, "flos": 17273428093440.0, "grad_norm": 1.9212334909000057, "language_loss": 0.77164567, "learning_rate": 3.7333476335569087e-06, "loss": 0.79375583, "num_input_tokens_seen": 33748370, "step": 1591, "time_per_iteration": 2.6519665718078613 }, { "auxiliary_loss_clip": 0.01161955, "auxiliary_loss_mlp": 0.01058757, "balance_loss_clip": 1.05233765, "balance_loss_mlp": 1.03388953, "epoch": 0.1914266818974328, "flos": 24826339584000.0, "grad_norm": 6.214167523039351, "language_loss": 0.66553342, "learning_rate": 3.7329588924331325e-06, "loss": 0.68774056, "num_input_tokens_seen": 33769575, "step": 1592, "time_per_iteration": 2.6730403900146484 }, { "auxiliary_loss_clip": 0.0113884, "auxiliary_loss_mlp": 0.01057193, "balance_loss_clip": 1.04780221, "balance_loss_mlp": 1.03426933, "epoch": 0.1915469247880719, "flos": 18952467390720.0, "grad_norm": 1.751550269289747, "language_loss": 0.82632327, "learning_rate": 3.732569888420871e-06, "loss": 0.84828359, "num_input_tokens_seen": 33789110, "step": 1593, "time_per_iteration": 2.651013135910034 }, { "auxiliary_loss_clip": 0.01187046, "auxiliary_loss_mlp": 0.01068547, "balance_loss_clip": 1.0524013, "balance_loss_mlp": 1.04470468, "epoch": 0.191667167678711, "flos": 21032952065280.0, "grad_norm": 2.5793210384066447, "language_loss": 0.82636905, "learning_rate": 3.732180621579134e-06, "loss": 0.84892499, "num_input_tokens_seen": 33808325, "step": 1594, "time_per_iteration": 2.580994129180908 }, { "auxiliary_loss_clip": 0.01162138, "auxiliary_loss_mlp": 0.01075193, "balance_loss_clip": 1.05565357, "balance_loss_mlp": 1.05151844, "epoch": 0.1917874105693501, "flos": 34237663914240.0, "grad_norm": 1.914487971198367, "language_loss": 0.81362087, "learning_rate": 3.7317910919669745e-06, "loss": 0.83599412, "num_input_tokens_seen": 33829520, "step": 1595, "time_per_iteration": 2.774867534637451 }, { "auxiliary_loss_clip": 0.01172632, "auxiliary_loss_mlp": 0.01069144, "balance_loss_clip": 1.05601478, "balance_loss_mlp": 1.04383564, "epoch": 0.19190765345998917, "flos": 23550613171200.0, "grad_norm": 2.145012781873817, "language_loss": 0.76497424, "learning_rate": 3.7314012996434826e-06, "loss": 0.78739202, "num_input_tokens_seen": 33848250, "step": 1596, "time_per_iteration": 2.606679677963257 }, { "auxiliary_loss_clip": 0.01159846, "auxiliary_loss_mlp": 0.01058579, "balance_loss_clip": 1.05642509, "balance_loss_mlp": 1.03428435, "epoch": 0.19202789635062828, "flos": 19861330245120.0, "grad_norm": 2.1867975264086237, "language_loss": 0.81145149, "learning_rate": 3.7310112446677907e-06, "loss": 0.83363575, "num_input_tokens_seen": 33866160, "step": 1597, "time_per_iteration": 2.6088922023773193 }, { "auxiliary_loss_clip": 0.01190662, "auxiliary_loss_mlp": 0.0105707, "balance_loss_clip": 1.0560689, "balance_loss_mlp": 1.03196502, "epoch": 0.19214813924126736, "flos": 20922957642240.0, "grad_norm": 2.2238914124358025, "language_loss": 0.68974054, "learning_rate": 3.7306209270990695e-06, "loss": 0.71221787, "num_input_tokens_seen": 33884165, "step": 1598, "time_per_iteration": 2.5774126052856445 }, { "auxiliary_loss_clip": 0.01164765, "auxiliary_loss_mlp": 0.01056714, "balance_loss_clip": 1.05706501, "balance_loss_mlp": 1.0343504, "epoch": 0.19226838213190645, "flos": 26359725231360.0, "grad_norm": 2.185309585269273, "language_loss": 0.86753464, "learning_rate": 3.7302303469965292e-06, "loss": 0.88974947, "num_input_tokens_seen": 33903705, "step": 1599, "time_per_iteration": 2.6532442569732666 }, { "auxiliary_loss_clip": 0.01172871, "auxiliary_loss_mlp": 0.01055002, "balance_loss_clip": 1.05399394, "balance_loss_mlp": 1.03281736, "epoch": 0.19238862502254553, "flos": 20850525866880.0, "grad_norm": 1.8164462748139258, "language_loss": 0.70745623, "learning_rate": 3.7298395044194206e-06, "loss": 0.72973496, "num_input_tokens_seen": 33922515, "step": 1600, "time_per_iteration": 3.5727498531341553 }, { "auxiliary_loss_clip": 0.01186158, "auxiliary_loss_mlp": 0.01060721, "balance_loss_clip": 1.05710387, "balance_loss_mlp": 1.03721285, "epoch": 0.19250886791318464, "flos": 21726063878400.0, "grad_norm": 2.4626591433220675, "language_loss": 0.94317305, "learning_rate": 3.7294483994270356e-06, "loss": 0.9656418, "num_input_tokens_seen": 33940840, "step": 1601, "time_per_iteration": 2.5773563385009766 }, { "auxiliary_loss_clip": 0.01127162, "auxiliary_loss_mlp": 0.01056, "balance_loss_clip": 1.05030978, "balance_loss_mlp": 1.03211021, "epoch": 0.19262911080382372, "flos": 23367827836800.0, "grad_norm": 2.780833019584307, "language_loss": 0.77548486, "learning_rate": 3.7290570320787033e-06, "loss": 0.79731643, "num_input_tokens_seen": 33960420, "step": 1602, "time_per_iteration": 3.587289333343506 }, { "auxiliary_loss_clip": 0.01170068, "auxiliary_loss_mlp": 0.01052801, "balance_loss_clip": 1.05520988, "balance_loss_mlp": 1.02943611, "epoch": 0.1927493536944628, "flos": 21943502858880.0, "grad_norm": 1.9800544255354549, "language_loss": 0.70848888, "learning_rate": 3.728665402433793e-06, "loss": 0.73071754, "num_input_tokens_seen": 33978990, "step": 1603, "time_per_iteration": 2.6329503059387207 }, { "auxiliary_loss_clip": 0.0115698, "auxiliary_loss_mlp": 0.01062773, "balance_loss_clip": 1.05500746, "balance_loss_mlp": 1.04017079, "epoch": 0.19286959658510192, "flos": 16545590807040.0, "grad_norm": 2.7980369776758893, "language_loss": 0.86598182, "learning_rate": 3.7282735105517164e-06, "loss": 0.88817936, "num_input_tokens_seen": 33997115, "step": 1604, "time_per_iteration": 3.5839004516601562 }, { "auxiliary_loss_clip": 0.01139865, "auxiliary_loss_mlp": 0.01057867, "balance_loss_clip": 1.04882097, "balance_loss_mlp": 1.03406096, "epoch": 0.192989839475741, "flos": 21616967295360.0, "grad_norm": 2.442623631771635, "language_loss": 0.67642546, "learning_rate": 3.727881356491922e-06, "loss": 0.69840276, "num_input_tokens_seen": 34015525, "step": 1605, "time_per_iteration": 2.712379217147827 }, { "auxiliary_loss_clip": 0.01185438, "auxiliary_loss_mlp": 0.01058233, "balance_loss_clip": 1.05935752, "balance_loss_mlp": 1.03598809, "epoch": 0.19311008236638008, "flos": 19281516906240.0, "grad_norm": 2.2505700945737432, "language_loss": 0.75976056, "learning_rate": 3.7274889403139002e-06, "loss": 0.78219724, "num_input_tokens_seen": 34033150, "step": 1606, "time_per_iteration": 3.4361610412597656 }, { "auxiliary_loss_clip": 0.0113012, "auxiliary_loss_mlp": 0.0105025, "balance_loss_clip": 1.04924786, "balance_loss_mlp": 1.02795768, "epoch": 0.1932303252570192, "flos": 28652369587200.0, "grad_norm": 2.4507644893771587, "language_loss": 0.78353822, "learning_rate": 3.727096262077179e-06, "loss": 0.8053419, "num_input_tokens_seen": 34052145, "step": 1607, "time_per_iteration": 2.749694585800171 }, { "auxiliary_loss_clip": 0.01173238, "auxiliary_loss_mlp": 0.01071994, "balance_loss_clip": 1.05376899, "balance_loss_mlp": 1.04892719, "epoch": 0.19335056814765827, "flos": 18368990864640.0, "grad_norm": 1.8469013113051478, "language_loss": 0.85311127, "learning_rate": 3.7267033218413285e-06, "loss": 0.87556368, "num_input_tokens_seen": 34069940, "step": 1608, "time_per_iteration": 2.5791127681732178 }, { "auxiliary_loss_clip": 0.01120352, "auxiliary_loss_mlp": 0.010603, "balance_loss_clip": 1.04386163, "balance_loss_mlp": 1.03573108, "epoch": 0.19347081103829736, "flos": 13260877741440.0, "grad_norm": 2.298092073588989, "language_loss": 0.8101005, "learning_rate": 3.726310119665957e-06, "loss": 0.83190703, "num_input_tokens_seen": 34086275, "step": 1609, "time_per_iteration": 2.7129292488098145 }, { "auxiliary_loss_clip": 0.01172739, "auxiliary_loss_mlp": 0.01054124, "balance_loss_clip": 1.05456233, "balance_loss_mlp": 1.03133082, "epoch": 0.19359105392893644, "flos": 20300122788480.0, "grad_norm": 2.1528805929832995, "language_loss": 0.85210359, "learning_rate": 3.725916655610713e-06, "loss": 0.87437224, "num_input_tokens_seen": 34105605, "step": 1610, "time_per_iteration": 2.6142303943634033 }, { "auxiliary_loss_clip": 0.01151444, "auxiliary_loss_mlp": 0.01073274, "balance_loss_clip": 1.05084944, "balance_loss_mlp": 1.04649937, "epoch": 0.19371129681957555, "flos": 20484596062080.0, "grad_norm": 3.613103139282613, "language_loss": 0.7547009, "learning_rate": 3.725522929735284e-06, "loss": 0.77694809, "num_input_tokens_seen": 34122540, "step": 1611, "time_per_iteration": 2.6542856693267822 }, { "auxiliary_loss_clip": 0.0116313, "auxiliary_loss_mlp": 0.01058743, "balance_loss_clip": 1.05182362, "balance_loss_mlp": 1.03610492, "epoch": 0.19383153971021463, "flos": 30445497457920.0, "grad_norm": 2.5424491258260344, "language_loss": 0.74606764, "learning_rate": 3.725128942099399e-06, "loss": 0.76828635, "num_input_tokens_seen": 34142940, "step": 1612, "time_per_iteration": 2.7095448970794678 }, { "auxiliary_loss_clip": 0.01149604, "auxiliary_loss_mlp": 0.01049726, "balance_loss_clip": 1.04963577, "balance_loss_mlp": 1.02518094, "epoch": 0.19395178260085372, "flos": 24569937325440.0, "grad_norm": 1.9976796473102465, "language_loss": 0.7999568, "learning_rate": 3.7247346927628245e-06, "loss": 0.82195014, "num_input_tokens_seen": 34162875, "step": 1613, "time_per_iteration": 2.6871681213378906 }, { "auxiliary_loss_clip": 0.01163218, "auxiliary_loss_mlp": 0.00778681, "balance_loss_clip": 1.05610728, "balance_loss_mlp": 1.0003562, "epoch": 0.19407202549149283, "flos": 28950608211840.0, "grad_norm": 2.881778137039176, "language_loss": 0.79572129, "learning_rate": 3.7243401817853694e-06, "loss": 0.81514031, "num_input_tokens_seen": 34183565, "step": 1614, "time_per_iteration": 2.737656354904175 }, { "auxiliary_loss_clip": 0.01166, "auxiliary_loss_mlp": 0.01055092, "balance_loss_clip": 1.05366397, "balance_loss_mlp": 1.03238297, "epoch": 0.1941922683821319, "flos": 18004497603840.0, "grad_norm": 3.070262133293884, "language_loss": 0.71767956, "learning_rate": 3.723945409226879e-06, "loss": 0.73989046, "num_input_tokens_seen": 34202055, "step": 1615, "time_per_iteration": 2.6261074542999268 }, { "auxiliary_loss_clip": 0.01169257, "auxiliary_loss_mlp": 0.01055917, "balance_loss_clip": 1.05153942, "balance_loss_mlp": 1.03193164, "epoch": 0.194312511272771, "flos": 9720337034880.0, "grad_norm": 2.81420291141271, "language_loss": 0.80142361, "learning_rate": 3.723550375147241e-06, "loss": 0.82367533, "num_input_tokens_seen": 34216830, "step": 1616, "time_per_iteration": 2.705319881439209 }, { "auxiliary_loss_clip": 0.01134137, "auxiliary_loss_mlp": 0.01064379, "balance_loss_clip": 1.0480144, "balance_loss_mlp": 1.04002488, "epoch": 0.19443275416341008, "flos": 27016208150400.0, "grad_norm": 1.8600905474126137, "language_loss": 0.79869974, "learning_rate": 3.7231550796063816e-06, "loss": 0.82068491, "num_input_tokens_seen": 34236840, "step": 1617, "time_per_iteration": 2.7157185077667236 }, { "auxiliary_loss_clip": 0.01168499, "auxiliary_loss_mlp": 0.01064535, "balance_loss_clip": 1.05570281, "balance_loss_mlp": 1.04027557, "epoch": 0.1945529970540492, "flos": 15846625077120.0, "grad_norm": 1.9327983871378405, "language_loss": 0.64964384, "learning_rate": 3.722759522664266e-06, "loss": 0.67197418, "num_input_tokens_seen": 34254140, "step": 1618, "time_per_iteration": 2.642329692840576 }, { "auxiliary_loss_clip": 0.01133869, "auxiliary_loss_mlp": 0.01056091, "balance_loss_clip": 1.04969049, "balance_loss_mlp": 1.03272653, "epoch": 0.19467323994468827, "flos": 19314985403520.0, "grad_norm": 2.4339368062399065, "language_loss": 0.81819844, "learning_rate": 3.7223637043809016e-06, "loss": 0.84009796, "num_input_tokens_seen": 34273120, "step": 1619, "time_per_iteration": 2.811950922012329 }, { "auxiliary_loss_clip": 0.01151223, "auxiliary_loss_mlp": 0.01057711, "balance_loss_clip": 1.05327535, "balance_loss_mlp": 1.03650331, "epoch": 0.19479348283532735, "flos": 24133227770880.0, "grad_norm": 2.0310307701024146, "language_loss": 0.8616904, "learning_rate": 3.7219676248163322e-06, "loss": 0.8837797, "num_input_tokens_seen": 34290285, "step": 1620, "time_per_iteration": 2.6871323585510254 }, { "auxiliary_loss_clip": 0.01173744, "auxiliary_loss_mlp": 0.01061964, "balance_loss_clip": 1.0552901, "balance_loss_mlp": 1.03901577, "epoch": 0.19491372572596646, "flos": 25775638174080.0, "grad_norm": 1.8617884372207203, "language_loss": 0.93419778, "learning_rate": 3.721571284030643e-06, "loss": 0.95655489, "num_input_tokens_seen": 34310095, "step": 1621, "time_per_iteration": 2.65907621383667 }, { "auxiliary_loss_clip": 0.01173622, "auxiliary_loss_mlp": 0.0105841, "balance_loss_clip": 1.05413008, "balance_loss_mlp": 1.03484249, "epoch": 0.19503396861660555, "flos": 19645220067840.0, "grad_norm": 3.083077201104829, "language_loss": 0.78837442, "learning_rate": 3.7211746820839587e-06, "loss": 0.81069469, "num_input_tokens_seen": 34327190, "step": 1622, "time_per_iteration": 2.5826663970947266 }, { "auxiliary_loss_clip": 0.01085519, "auxiliary_loss_mlp": 0.01050923, "balance_loss_clip": 1.04054213, "balance_loss_mlp": 1.02740312, "epoch": 0.19515421150724463, "flos": 21033023892480.0, "grad_norm": 1.799645945616957, "language_loss": 0.80879205, "learning_rate": 3.7207778190364437e-06, "loss": 0.83015645, "num_input_tokens_seen": 34345615, "step": 1623, "time_per_iteration": 2.9786171913146973 }, { "auxiliary_loss_clip": 0.01101429, "auxiliary_loss_mlp": 0.01054432, "balance_loss_clip": 1.04469311, "balance_loss_mlp": 1.03154349, "epoch": 0.1952744543978837, "flos": 32961255143040.0, "grad_norm": 2.343863715123315, "language_loss": 0.73914504, "learning_rate": 3.720380694948302e-06, "loss": 0.76070362, "num_input_tokens_seen": 34368500, "step": 1624, "time_per_iteration": 3.1363863945007324 }, { "auxiliary_loss_clip": 0.01040805, "auxiliary_loss_mlp": 0.01006408, "balance_loss_clip": 1.02521563, "balance_loss_mlp": 1.00242639, "epoch": 0.19539469728852282, "flos": 64044312030720.0, "grad_norm": 1.0341271158907763, "language_loss": 0.71139872, "learning_rate": 3.719983309879777e-06, "loss": 0.73187089, "num_input_tokens_seen": 34428280, "step": 1625, "time_per_iteration": 3.273986339569092 }, { "auxiliary_loss_clip": 0.01131403, "auxiliary_loss_mlp": 0.01052422, "balance_loss_clip": 1.04711962, "balance_loss_mlp": 1.03043962, "epoch": 0.1955149401791619, "flos": 13370908078080.0, "grad_norm": 1.7807929353823067, "language_loss": 0.77541661, "learning_rate": 3.719585663891151e-06, "loss": 0.7972548, "num_input_tokens_seen": 34445815, "step": 1626, "time_per_iteration": 3.5636279582977295 }, { "auxiliary_loss_clip": 0.01125146, "auxiliary_loss_mlp": 0.01057164, "balance_loss_clip": 1.04926896, "balance_loss_mlp": 1.03311956, "epoch": 0.195635183069801, "flos": 18728887184640.0, "grad_norm": 2.3111721947348056, "language_loss": 0.7860024, "learning_rate": 3.719187757042747e-06, "loss": 0.80782551, "num_input_tokens_seen": 34463635, "step": 1627, "time_per_iteration": 2.7131459712982178 }, { "auxiliary_loss_clip": 0.01054269, "auxiliary_loss_mlp": 0.01012893, "balance_loss_clip": 1.02366114, "balance_loss_mlp": 1.00929248, "epoch": 0.1957554259604401, "flos": 69313952615040.0, "grad_norm": 0.7256089115453837, "language_loss": 0.54974174, "learning_rate": 3.7187895893949275e-06, "loss": 0.57041335, "num_input_tokens_seen": 34530105, "step": 1628, "time_per_iteration": 4.274688720703125 }, { "auxiliary_loss_clip": 0.01117072, "auxiliary_loss_mlp": 0.01064504, "balance_loss_clip": 1.04390848, "balance_loss_mlp": 1.03881502, "epoch": 0.19587566885107918, "flos": 21069257736960.0, "grad_norm": 2.3952710385922122, "language_loss": 0.76213318, "learning_rate": 3.7183911610080937e-06, "loss": 0.78394896, "num_input_tokens_seen": 34546970, "step": 1629, "time_per_iteration": 2.7484734058380127 }, { "auxiliary_loss_clip": 0.01144689, "auxiliary_loss_mlp": 0.01056398, "balance_loss_clip": 1.04940593, "balance_loss_mlp": 1.03212726, "epoch": 0.19599591174171827, "flos": 22194661731840.0, "grad_norm": 2.624525677426571, "language_loss": 0.74918091, "learning_rate": 3.7179924719426872e-06, "loss": 0.77119178, "num_input_tokens_seen": 34564865, "step": 1630, "time_per_iteration": 3.7282164096832275 }, { "auxiliary_loss_clip": 0.01175085, "auxiliary_loss_mlp": 0.01070265, "balance_loss_clip": 1.05344057, "balance_loss_mlp": 1.04624414, "epoch": 0.19611615463235738, "flos": 23768375374080.0, "grad_norm": 4.438198135943882, "language_loss": 0.75973356, "learning_rate": 3.7175935222591885e-06, "loss": 0.78218699, "num_input_tokens_seen": 34584165, "step": 1631, "time_per_iteration": 2.634505271911621 }, { "auxiliary_loss_clip": 0.01165781, "auxiliary_loss_mlp": 0.01072868, "balance_loss_clip": 1.05705452, "balance_loss_mlp": 1.04934835, "epoch": 0.19623639752299646, "flos": 28618218731520.0, "grad_norm": 1.9350860165383263, "language_loss": 0.74474156, "learning_rate": 3.717194312018118e-06, "loss": 0.76712805, "num_input_tokens_seen": 34603150, "step": 1632, "time_per_iteration": 3.588071584701538 }, { "auxiliary_loss_clip": 0.01175111, "auxiliary_loss_mlp": 0.01052985, "balance_loss_clip": 1.05387998, "balance_loss_mlp": 1.02959561, "epoch": 0.19635664041363554, "flos": 21032700670080.0, "grad_norm": 2.1713623393858934, "language_loss": 0.76230663, "learning_rate": 3.716794841280036e-06, "loss": 0.78458756, "num_input_tokens_seen": 34621855, "step": 1633, "time_per_iteration": 2.5698840618133545 }, { "auxiliary_loss_clip": 0.01170968, "auxiliary_loss_mlp": 0.01054402, "balance_loss_clip": 1.04982483, "balance_loss_mlp": 1.03225267, "epoch": 0.19647688330427462, "flos": 18879748306560.0, "grad_norm": 3.1110669973356937, "language_loss": 0.77586448, "learning_rate": 3.7163951101055407e-06, "loss": 0.79811823, "num_input_tokens_seen": 34639915, "step": 1634, "time_per_iteration": 2.601206064224243 }, { "auxiliary_loss_clip": 0.01155449, "auxiliary_loss_mlp": 0.01068637, "balance_loss_clip": 1.05204904, "balance_loss_mlp": 1.04270875, "epoch": 0.19659712619491373, "flos": 24242503921920.0, "grad_norm": 1.784818975229322, "language_loss": 0.78737223, "learning_rate": 3.715995118555273e-06, "loss": 0.80961311, "num_input_tokens_seen": 34659890, "step": 1635, "time_per_iteration": 2.661525249481201 }, { "auxiliary_loss_clip": 0.01126014, "auxiliary_loss_mlp": 0.01055357, "balance_loss_clip": 1.04809856, "balance_loss_mlp": 1.02792692, "epoch": 0.19671736908555282, "flos": 24717422568960.0, "grad_norm": 2.2702438408598016, "language_loss": 0.85950583, "learning_rate": 3.71559486668991e-06, "loss": 0.88131952, "num_input_tokens_seen": 34678750, "step": 1636, "time_per_iteration": 2.6959407329559326 }, { "auxiliary_loss_clip": 0.01173025, "auxiliary_loss_mlp": 0.00776926, "balance_loss_clip": 1.05298352, "balance_loss_mlp": 1.00053525, "epoch": 0.1968376119761919, "flos": 23842279607040.0, "grad_norm": 1.7170726449989382, "language_loss": 0.77182007, "learning_rate": 3.715194354570169e-06, "loss": 0.79131961, "num_input_tokens_seen": 34698755, "step": 1637, "time_per_iteration": 2.6772732734680176 }, { "auxiliary_loss_clip": 0.01166243, "auxiliary_loss_mlp": 0.01054822, "balance_loss_clip": 1.05243516, "balance_loss_mlp": 1.03179061, "epoch": 0.196957854866831, "flos": 18113917409280.0, "grad_norm": 7.71216447290658, "language_loss": 0.8338086, "learning_rate": 3.714793582256809e-06, "loss": 0.8560192, "num_input_tokens_seen": 34715820, "step": 1638, "time_per_iteration": 2.6007375717163086 }, { "auxiliary_loss_clip": 0.01181387, "auxiliary_loss_mlp": 0.01054668, "balance_loss_clip": 1.0545727, "balance_loss_mlp": 1.03393793, "epoch": 0.1970780977574701, "flos": 21653129312640.0, "grad_norm": 2.2982323599156786, "language_loss": 0.84613556, "learning_rate": 3.7143925498106253e-06, "loss": 0.86849612, "num_input_tokens_seen": 34734360, "step": 1639, "time_per_iteration": 2.624453544616699 }, { "auxiliary_loss_clip": 0.01153404, "auxiliary_loss_mlp": 0.01056478, "balance_loss_clip": 1.04668379, "balance_loss_mlp": 1.03339887, "epoch": 0.19719834064810918, "flos": 20811813984000.0, "grad_norm": 2.086272048448583, "language_loss": 0.79298741, "learning_rate": 3.7139912572924558e-06, "loss": 0.81508625, "num_input_tokens_seen": 34753390, "step": 1640, "time_per_iteration": 2.6704587936401367 }, { "auxiliary_loss_clip": 0.0116256, "auxiliary_loss_mlp": 0.01052714, "balance_loss_clip": 1.04613662, "balance_loss_mlp": 1.0294919, "epoch": 0.19731858353874826, "flos": 23434800744960.0, "grad_norm": 3.2941109318382664, "language_loss": 0.80081975, "learning_rate": 3.7135897047631744e-06, "loss": 0.82297254, "num_input_tokens_seen": 34771275, "step": 1641, "time_per_iteration": 2.6176624298095703 }, { "auxiliary_loss_clip": 0.01160278, "auxiliary_loss_mlp": 0.01055592, "balance_loss_clip": 1.05304956, "balance_loss_mlp": 1.03389561, "epoch": 0.19743882642938737, "flos": 23988184652160.0, "grad_norm": 2.388446601418865, "language_loss": 0.76064193, "learning_rate": 3.713187892283698e-06, "loss": 0.78280061, "num_input_tokens_seen": 34790885, "step": 1642, "time_per_iteration": 2.723628520965576 }, { "auxiliary_loss_clip": 0.01127115, "auxiliary_loss_mlp": 0.01054529, "balance_loss_clip": 1.04419065, "balance_loss_mlp": 1.03124762, "epoch": 0.19755906932002645, "flos": 15004340081280.0, "grad_norm": 2.652860520489604, "language_loss": 0.87269831, "learning_rate": 3.71278581991498e-06, "loss": 0.89451474, "num_input_tokens_seen": 34806745, "step": 1643, "time_per_iteration": 2.771780490875244 }, { "auxiliary_loss_clip": 0.01153091, "auxiliary_loss_mlp": 0.00778329, "balance_loss_clip": 1.0576483, "balance_loss_mlp": 1.00059748, "epoch": 0.19767931221066554, "flos": 19494466686720.0, "grad_norm": 2.3409465495668713, "language_loss": 0.79166448, "learning_rate": 3.712383487718015e-06, "loss": 0.81097865, "num_input_tokens_seen": 34824985, "step": 1644, "time_per_iteration": 2.723259210586548 }, { "auxiliary_loss_clip": 0.01111086, "auxiliary_loss_mlp": 0.0106078, "balance_loss_clip": 1.04823709, "balance_loss_mlp": 1.03849983, "epoch": 0.19779955510130465, "flos": 25737895958400.0, "grad_norm": 1.8562914272773021, "language_loss": 0.86538506, "learning_rate": 3.7119808957538365e-06, "loss": 0.8871038, "num_input_tokens_seen": 34843980, "step": 1645, "time_per_iteration": 2.8740577697753906 }, { "auxiliary_loss_clip": 0.01148503, "auxiliary_loss_mlp": 0.01053601, "balance_loss_clip": 1.04760981, "balance_loss_mlp": 1.03142834, "epoch": 0.19791979799194373, "flos": 20777699041920.0, "grad_norm": 14.297445133542794, "language_loss": 0.80347836, "learning_rate": 3.711578044083517e-06, "loss": 0.82549936, "num_input_tokens_seen": 34860780, "step": 1646, "time_per_iteration": 2.75075101852417 }, { "auxiliary_loss_clip": 0.01153428, "auxiliary_loss_mlp": 0.01055224, "balance_loss_clip": 1.04911911, "balance_loss_mlp": 1.03144228, "epoch": 0.1980400408825828, "flos": 25589010084480.0, "grad_norm": 2.7822236107721956, "language_loss": 0.74343473, "learning_rate": 3.7111749327681698e-06, "loss": 0.76552129, "num_input_tokens_seen": 34880815, "step": 1647, "time_per_iteration": 2.8491194248199463 }, { "auxiliary_loss_clip": 0.01175343, "auxiliary_loss_mlp": 0.01057437, "balance_loss_clip": 1.05659962, "balance_loss_mlp": 1.03512073, "epoch": 0.1981602837732219, "flos": 23513840622720.0, "grad_norm": 2.0898322792403365, "language_loss": 0.86183578, "learning_rate": 3.7107715618689455e-06, "loss": 0.8841635, "num_input_tokens_seen": 34899790, "step": 1648, "time_per_iteration": 2.7644691467285156 }, { "auxiliary_loss_clip": 0.01169501, "auxiliary_loss_mlp": 0.01055934, "balance_loss_clip": 1.05584097, "balance_loss_mlp": 1.03409457, "epoch": 0.198280526663861, "flos": 23185365724800.0, "grad_norm": 1.4783062792186015, "language_loss": 0.83447587, "learning_rate": 3.710367931447035e-06, "loss": 0.85673022, "num_input_tokens_seen": 34921570, "step": 1649, "time_per_iteration": 2.7186684608459473 }, { "auxiliary_loss_clip": 0.01178231, "auxiliary_loss_mlp": 0.01062755, "balance_loss_clip": 1.05449963, "balance_loss_mlp": 1.0397594, "epoch": 0.1984007695545001, "flos": 21689470897920.0, "grad_norm": 2.247120749458769, "language_loss": 0.86737382, "learning_rate": 3.70996404156367e-06, "loss": 0.88978374, "num_input_tokens_seen": 34941205, "step": 1650, "time_per_iteration": 2.7252988815307617 }, { "auxiliary_loss_clip": 0.01124652, "auxiliary_loss_mlp": 0.01055593, "balance_loss_clip": 1.05058432, "balance_loss_mlp": 1.03189445, "epoch": 0.19852101244513917, "flos": 36064008887040.0, "grad_norm": 2.1429555735905246, "language_loss": 0.72731662, "learning_rate": 3.7095598922801187e-06, "loss": 0.74911904, "num_input_tokens_seen": 34963280, "step": 1651, "time_per_iteration": 3.9212143421173096 }, { "auxiliary_loss_clip": 0.01185708, "auxiliary_loss_mlp": 0.01052948, "balance_loss_clip": 1.0547688, "balance_loss_mlp": 1.03101325, "epoch": 0.19864125533577828, "flos": 23105894883840.0, "grad_norm": 2.462827962884229, "language_loss": 0.76229537, "learning_rate": 3.7091554836576914e-06, "loss": 0.78468192, "num_input_tokens_seen": 34979955, "step": 1652, "time_per_iteration": 2.7195510864257812 }, { "auxiliary_loss_clip": 0.01169717, "auxiliary_loss_mlp": 0.00777903, "balance_loss_clip": 1.05728221, "balance_loss_mlp": 1.0004003, "epoch": 0.19876149822641737, "flos": 24608505553920.0, "grad_norm": 2.3224050910679854, "language_loss": 0.82871544, "learning_rate": 3.708750815757736e-06, "loss": 0.84819162, "num_input_tokens_seen": 35000725, "step": 1653, "time_per_iteration": 2.8353428840637207 }, { "auxiliary_loss_clip": 0.01175033, "auxiliary_loss_mlp": 0.0107036, "balance_loss_clip": 1.05579829, "balance_loss_mlp": 1.04595828, "epoch": 0.19888174111705645, "flos": 32196645308160.0, "grad_norm": 2.5327705515931416, "language_loss": 0.73025942, "learning_rate": 3.7083458886416407e-06, "loss": 0.75271338, "num_input_tokens_seen": 35019920, "step": 1654, "time_per_iteration": 3.7228963375091553 }, { "auxiliary_loss_clip": 0.01122087, "auxiliary_loss_mlp": 0.01049208, "balance_loss_clip": 1.04944062, "balance_loss_mlp": 1.02742839, "epoch": 0.19900198400769553, "flos": 24608469640320.0, "grad_norm": 2.4398130072472926, "language_loss": 0.88155067, "learning_rate": 3.707940702370832e-06, "loss": 0.90326357, "num_input_tokens_seen": 35040765, "step": 1655, "time_per_iteration": 4.327443838119507 }, { "auxiliary_loss_clip": 0.01063934, "auxiliary_loss_mlp": 0.01003964, "balance_loss_clip": 1.02617693, "balance_loss_mlp": 1.00121009, "epoch": 0.19912222689833464, "flos": 67915805673600.0, "grad_norm": 0.7613219124214097, "language_loss": 0.58191884, "learning_rate": 3.707535257006777e-06, "loss": 0.60259783, "num_input_tokens_seen": 35106390, "step": 1656, "time_per_iteration": 3.3817102909088135 }, { "auxiliary_loss_clip": 0.01158506, "auxiliary_loss_mlp": 0.0104788, "balance_loss_clip": 1.05389321, "balance_loss_mlp": 1.02480078, "epoch": 0.19924246978897373, "flos": 15742340916480.0, "grad_norm": 2.369301844520752, "language_loss": 0.88486362, "learning_rate": 3.707129552610981e-06, "loss": 0.90692747, "num_input_tokens_seen": 35125040, "step": 1657, "time_per_iteration": 2.7660396099090576 }, { "auxiliary_loss_clip": 0.01151589, "auxiliary_loss_mlp": 0.01057858, "balance_loss_clip": 1.05477142, "balance_loss_mlp": 1.03470778, "epoch": 0.1993627126796128, "flos": 17566566986880.0, "grad_norm": 2.0392979335714907, "language_loss": 0.73686147, "learning_rate": 3.70672358924499e-06, "loss": 0.75895596, "num_input_tokens_seen": 35144280, "step": 1658, "time_per_iteration": 3.6835644245147705 }, { "auxiliary_loss_clip": 0.01146346, "auxiliary_loss_mlp": 0.01059801, "balance_loss_clip": 1.05445266, "balance_loss_mlp": 1.03793788, "epoch": 0.19948295557025192, "flos": 40843826680320.0, "grad_norm": 1.9890459201521218, "language_loss": 0.78306013, "learning_rate": 3.706317366970386e-06, "loss": 0.8051216, "num_input_tokens_seen": 35165280, "step": 1659, "time_per_iteration": 2.958122968673706 }, { "auxiliary_loss_clip": 0.01187532, "auxiliary_loss_mlp": 0.00778439, "balance_loss_clip": 1.0540309, "balance_loss_mlp": 1.00049257, "epoch": 0.199603198460891, "flos": 25082418620160.0, "grad_norm": 2.451284413460492, "language_loss": 0.83603972, "learning_rate": 3.705910885848795e-06, "loss": 0.85569942, "num_input_tokens_seen": 35183655, "step": 1660, "time_per_iteration": 2.829531669616699 }, { "auxiliary_loss_clip": 0.01170907, "auxiliary_loss_mlp": 0.01050568, "balance_loss_clip": 1.05371594, "balance_loss_mlp": 1.02876449, "epoch": 0.19972344135153008, "flos": 20084120352000.0, "grad_norm": 3.7234505999738783, "language_loss": 0.84930623, "learning_rate": 3.705504145941879e-06, "loss": 0.87152088, "num_input_tokens_seen": 35201825, "step": 1661, "time_per_iteration": 2.947319984436035 }, { "auxiliary_loss_clip": 0.0118745, "auxiliary_loss_mlp": 0.01060551, "balance_loss_clip": 1.05645859, "balance_loss_mlp": 1.03798425, "epoch": 0.1998436842421692, "flos": 23727472761600.0, "grad_norm": 2.429834048059368, "language_loss": 0.78435862, "learning_rate": 3.7050971473113403e-06, "loss": 0.80683863, "num_input_tokens_seen": 35221600, "step": 1662, "time_per_iteration": 2.758579969406128 }, { "auxiliary_loss_clip": 0.01166718, "auxiliary_loss_mlp": 0.00776492, "balance_loss_clip": 1.05076349, "balance_loss_mlp": 1.00045013, "epoch": 0.19996392713280828, "flos": 36102361633920.0, "grad_norm": 6.655403041316262, "language_loss": 0.79553258, "learning_rate": 3.7046898900189196e-06, "loss": 0.81496465, "num_input_tokens_seen": 35245935, "step": 1663, "time_per_iteration": 3.0302913188934326 }, { "auxiliary_loss_clip": 0.0115651, "auxiliary_loss_mlp": 0.01058572, "balance_loss_clip": 1.05726123, "balance_loss_mlp": 1.03502858, "epoch": 0.20008417002344736, "flos": 23657662679040.0, "grad_norm": 1.6552206772024893, "language_loss": 0.83183396, "learning_rate": 3.704282374126398e-06, "loss": 0.85398483, "num_input_tokens_seen": 35265615, "step": 1664, "time_per_iteration": 2.851407289505005 }, { "auxiliary_loss_clip": 0.01144657, "auxiliary_loss_mlp": 0.01056414, "balance_loss_clip": 1.05097842, "balance_loss_mlp": 1.03343022, "epoch": 0.20020441291408644, "flos": 21872076664320.0, "grad_norm": 1.884368460892727, "language_loss": 0.87585407, "learning_rate": 3.7038745996955954e-06, "loss": 0.89786482, "num_input_tokens_seen": 35284960, "step": 1665, "time_per_iteration": 2.9108879566192627 }, { "auxiliary_loss_clip": 0.01153447, "auxiliary_loss_mlp": 0.01066415, "balance_loss_clip": 1.05291605, "balance_loss_mlp": 1.04384851, "epoch": 0.20032465580472555, "flos": 23179691376000.0, "grad_norm": 6.5509934549337645, "language_loss": 0.72464168, "learning_rate": 3.703466566788371e-06, "loss": 0.7468403, "num_input_tokens_seen": 35304090, "step": 1666, "time_per_iteration": 2.7996561527252197 }, { "auxiliary_loss_clip": 0.01146506, "auxiliary_loss_mlp": 0.01057641, "balance_loss_clip": 1.05118477, "balance_loss_mlp": 1.03084278, "epoch": 0.20044489869536464, "flos": 23873521461120.0, "grad_norm": 2.0389639752025133, "language_loss": 0.74312526, "learning_rate": 3.703058275466622e-06, "loss": 0.76516676, "num_input_tokens_seen": 35323325, "step": 1667, "time_per_iteration": 2.822925090789795 }, { "auxiliary_loss_clip": 0.01159356, "auxiliary_loss_mlp": 0.01048283, "balance_loss_clip": 1.05599058, "balance_loss_mlp": 1.0262655, "epoch": 0.20056514158600372, "flos": 21945226711680.0, "grad_norm": 3.1019195235891317, "language_loss": 0.77843487, "learning_rate": 3.7026497257922877e-06, "loss": 0.80051124, "num_input_tokens_seen": 35343635, "step": 1668, "time_per_iteration": 2.937772512435913 }, { "auxiliary_loss_clip": 0.01128964, "auxiliary_loss_mlp": 0.01061127, "balance_loss_clip": 1.04805255, "balance_loss_mlp": 1.03696358, "epoch": 0.20068538447664283, "flos": 23879159896320.0, "grad_norm": 1.9495970329610404, "language_loss": 0.8542515, "learning_rate": 3.7022409178273436e-06, "loss": 0.8761524, "num_input_tokens_seen": 35364615, "step": 1669, "time_per_iteration": 2.9193758964538574 }, { "auxiliary_loss_clip": 0.01170969, "auxiliary_loss_mlp": 0.01046618, "balance_loss_clip": 1.05634558, "balance_loss_mlp": 1.02486193, "epoch": 0.2008056273672819, "flos": 18442823270400.0, "grad_norm": 3.345156431517875, "language_loss": 0.78552085, "learning_rate": 3.7018318516338054e-06, "loss": 0.8076967, "num_input_tokens_seen": 35383775, "step": 1670, "time_per_iteration": 2.652799367904663 }, { "auxiliary_loss_clip": 0.01174648, "auxiliary_loss_mlp": 0.01051865, "balance_loss_clip": 1.05512571, "balance_loss_mlp": 1.03118217, "epoch": 0.200925870257921, "flos": 23659530186240.0, "grad_norm": 3.523514573702675, "language_loss": 0.81365657, "learning_rate": 3.7014225272737284e-06, "loss": 0.83592165, "num_input_tokens_seen": 35403000, "step": 1671, "time_per_iteration": 2.7693941593170166 }, { "auxiliary_loss_clip": 0.0116468, "auxiliary_loss_mlp": 0.01051192, "balance_loss_clip": 1.05338037, "balance_loss_mlp": 1.02758825, "epoch": 0.20104611314856008, "flos": 16217115909120.0, "grad_norm": 2.7015641315414696, "language_loss": 0.74278879, "learning_rate": 3.701012944809207e-06, "loss": 0.76494747, "num_input_tokens_seen": 35420115, "step": 1672, "time_per_iteration": 2.8058295249938965 }, { "auxiliary_loss_clip": 0.01157396, "auxiliary_loss_mlp": 0.00778368, "balance_loss_clip": 1.05227089, "balance_loss_mlp": 1.00041163, "epoch": 0.2011663560391992, "flos": 21397373498880.0, "grad_norm": 2.8706845450822356, "language_loss": 0.79193258, "learning_rate": 3.700603104302374e-06, "loss": 0.81129026, "num_input_tokens_seen": 35439925, "step": 1673, "time_per_iteration": 2.7953696250915527 }, { "auxiliary_loss_clip": 0.01025404, "auxiliary_loss_mlp": 0.01010117, "balance_loss_clip": 1.02220821, "balance_loss_mlp": 1.00710154, "epoch": 0.20128659892983827, "flos": 62229459409920.0, "grad_norm": 0.9206350931588502, "language_loss": 0.55983973, "learning_rate": 3.7001930058154027e-06, "loss": 0.58019495, "num_input_tokens_seen": 35504885, "step": 1674, "time_per_iteration": 3.404399871826172 }, { "auxiliary_loss_clip": 0.0114684, "auxiliary_loss_mlp": 0.01058839, "balance_loss_clip": 1.04973412, "balance_loss_mlp": 1.03505683, "epoch": 0.20140684182047736, "flos": 28438737448320.0, "grad_norm": 2.7555616481426113, "language_loss": 0.794505, "learning_rate": 3.6997826494105037e-06, "loss": 0.81656182, "num_input_tokens_seen": 35525330, "step": 1675, "time_per_iteration": 2.870330572128296 }, { "auxiliary_loss_clip": 0.01158998, "auxiliary_loss_mlp": 0.01046487, "balance_loss_clip": 1.05029964, "balance_loss_mlp": 1.02587605, "epoch": 0.20152708471111647, "flos": 28074064619520.0, "grad_norm": 2.095815726668956, "language_loss": 0.6918267, "learning_rate": 3.6993720351499286e-06, "loss": 0.71388149, "num_input_tokens_seen": 35546455, "step": 1676, "time_per_iteration": 3.7360458374023438 }, { "auxiliary_loss_clip": 0.01154125, "auxiliary_loss_mlp": 0.01057799, "balance_loss_clip": 1.05552208, "balance_loss_mlp": 1.03561449, "epoch": 0.20164732760175555, "flos": 23549751244800.0, "grad_norm": 2.1731195312809386, "language_loss": 0.77063841, "learning_rate": 3.6989611630959666e-06, "loss": 0.79275763, "num_input_tokens_seen": 35565010, "step": 1677, "time_per_iteration": 2.777365207672119 }, { "auxiliary_loss_clip": 0.01051924, "auxiliary_loss_mlp": 0.01004267, "balance_loss_clip": 1.01486731, "balance_loss_mlp": 1.00083399, "epoch": 0.20176757049239463, "flos": 71100616037760.0, "grad_norm": 0.6744364205328103, "language_loss": 0.58255374, "learning_rate": 3.6985500333109474e-06, "loss": 0.60311568, "num_input_tokens_seen": 35633340, "step": 1678, "time_per_iteration": 3.3359336853027344 }, { "auxiliary_loss_clip": 0.01136845, "auxiliary_loss_mlp": 0.01051027, "balance_loss_clip": 1.04888296, "balance_loss_mlp": 1.02706647, "epoch": 0.20188781338303372, "flos": 21430159637760.0, "grad_norm": 3.8687352266531145, "language_loss": 0.76366305, "learning_rate": 3.6981386458572385e-06, "loss": 0.78554177, "num_input_tokens_seen": 35651315, "step": 1679, "time_per_iteration": 3.7909176349639893 }, { "auxiliary_loss_clip": 0.01139094, "auxiliary_loss_mlp": 0.01056902, "balance_loss_clip": 1.04750788, "balance_loss_mlp": 1.03243995, "epoch": 0.20200805627367283, "flos": 11546215130880.0, "grad_norm": 3.1440654577045866, "language_loss": 0.76526582, "learning_rate": 3.6977270007972468e-06, "loss": 0.78722572, "num_input_tokens_seen": 35668850, "step": 1680, "time_per_iteration": 2.7061665058135986 }, { "auxiliary_loss_clip": 0.01159263, "auxiliary_loss_mlp": 0.01058387, "balance_loss_clip": 1.05122256, "balance_loss_mlp": 1.03585649, "epoch": 0.2021282991643119, "flos": 28545391906560.0, "grad_norm": 2.6626802259921893, "language_loss": 0.72527117, "learning_rate": 3.6973150981934196e-06, "loss": 0.74744761, "num_input_tokens_seen": 35690080, "step": 1681, "time_per_iteration": 3.875412702560425 }, { "auxiliary_loss_clip": 0.01194588, "auxiliary_loss_mlp": 0.01051958, "balance_loss_clip": 1.05907321, "balance_loss_mlp": 1.02964211, "epoch": 0.202248542054951, "flos": 17923446564480.0, "grad_norm": 2.6363443451246242, "language_loss": 0.83633441, "learning_rate": 3.6969029381082415e-06, "loss": 0.85879982, "num_input_tokens_seen": 35706075, "step": 1682, "time_per_iteration": 2.653913974761963 }, { "auxiliary_loss_clip": 0.01148021, "auxiliary_loss_mlp": 0.01058089, "balance_loss_clip": 1.04941976, "balance_loss_mlp": 1.03709602, "epoch": 0.2023687849455901, "flos": 19864634296320.0, "grad_norm": 1.8547731336902573, "language_loss": 0.79400098, "learning_rate": 3.696490520604237e-06, "loss": 0.81606209, "num_input_tokens_seen": 35724765, "step": 1683, "time_per_iteration": 3.5967023372650146 }, { "auxiliary_loss_clip": 0.01163277, "auxiliary_loss_mlp": 0.01052712, "balance_loss_clip": 1.05077767, "balance_loss_mlp": 1.03244615, "epoch": 0.20248902783622919, "flos": 22564721600640.0, "grad_norm": 1.7412943226555069, "language_loss": 0.81082618, "learning_rate": 3.696077845743968e-06, "loss": 0.83298606, "num_input_tokens_seen": 35744355, "step": 1684, "time_per_iteration": 2.6275463104248047 }, { "auxiliary_loss_clip": 0.01185042, "auxiliary_loss_mlp": 0.0105723, "balance_loss_clip": 1.05070436, "balance_loss_mlp": 1.03318548, "epoch": 0.20260927072686827, "flos": 22709728805760.0, "grad_norm": 3.6580274623651645, "language_loss": 0.72891593, "learning_rate": 3.69566491359004e-06, "loss": 0.7513386, "num_input_tokens_seen": 35761000, "step": 1685, "time_per_iteration": 2.5084187984466553 }, { "auxiliary_loss_clip": 0.01155945, "auxiliary_loss_mlp": 0.01067263, "balance_loss_clip": 1.05098796, "balance_loss_mlp": 1.04325473, "epoch": 0.20272951361750738, "flos": 51023998650240.0, "grad_norm": 1.8205552332321275, "language_loss": 0.69318759, "learning_rate": 3.695251724205092e-06, "loss": 0.71541965, "num_input_tokens_seen": 35785360, "step": 1686, "time_per_iteration": 2.8225820064544678 }, { "auxiliary_loss_clip": 0.01180945, "auxiliary_loss_mlp": 0.01054849, "balance_loss_clip": 1.05139399, "balance_loss_mlp": 1.03311741, "epoch": 0.20284975650814646, "flos": 26578133879040.0, "grad_norm": 1.6368942878935506, "language_loss": 0.86378467, "learning_rate": 3.6948382776518054e-06, "loss": 0.88614267, "num_input_tokens_seen": 35806065, "step": 1687, "time_per_iteration": 2.5494003295898438 }, { "auxiliary_loss_clip": 0.0115199, "auxiliary_loss_mlp": 0.01059325, "balance_loss_clip": 1.04968119, "balance_loss_mlp": 1.03638911, "epoch": 0.20296999939878554, "flos": 16034222833920.0, "grad_norm": 2.09781423384483, "language_loss": 0.79245424, "learning_rate": 3.6944245739929e-06, "loss": 0.81456733, "num_input_tokens_seen": 35822225, "step": 1688, "time_per_iteration": 2.721987724304199 }, { "auxiliary_loss_clip": 0.01173143, "auxiliary_loss_mlp": 0.01055335, "balance_loss_clip": 1.05496264, "balance_loss_mlp": 1.03123116, "epoch": 0.20309024228942463, "flos": 19203374868480.0, "grad_norm": 2.8603240907638896, "language_loss": 0.71967846, "learning_rate": 3.6940106132911332e-06, "loss": 0.74196321, "num_input_tokens_seen": 35839410, "step": 1689, "time_per_iteration": 2.5926876068115234 }, { "auxiliary_loss_clip": 0.0117417, "auxiliary_loss_mlp": 0.01055796, "balance_loss_clip": 1.05470145, "balance_loss_mlp": 1.03331304, "epoch": 0.20321048518006374, "flos": 22821087945600.0, "grad_norm": 1.9787729460788512, "language_loss": 0.88721204, "learning_rate": 3.6935963956093037e-06, "loss": 0.90951163, "num_input_tokens_seen": 35859495, "step": 1690, "time_per_iteration": 2.6834216117858887 }, { "auxiliary_loss_clip": 0.01159272, "auxiliary_loss_mlp": 0.01051726, "balance_loss_clip": 1.05085957, "balance_loss_mlp": 1.02971983, "epoch": 0.20333072807070282, "flos": 19096397187840.0, "grad_norm": 1.7996748414739394, "language_loss": 0.6883902, "learning_rate": 3.6931819210102474e-06, "loss": 0.71050018, "num_input_tokens_seen": 35878890, "step": 1691, "time_per_iteration": 2.6689767837524414 }, { "auxiliary_loss_clip": 0.01183596, "auxiliary_loss_mlp": 0.01062179, "balance_loss_clip": 1.05296087, "balance_loss_mlp": 1.03945792, "epoch": 0.2034509709613419, "flos": 18180962144640.0, "grad_norm": 1.9884382229016373, "language_loss": 0.84607607, "learning_rate": 3.6927671895568402e-06, "loss": 0.86853385, "num_input_tokens_seen": 35897950, "step": 1692, "time_per_iteration": 2.5699946880340576 }, { "auxiliary_loss_clip": 0.01181882, "auxiliary_loss_mlp": 0.01050358, "balance_loss_clip": 1.05394816, "balance_loss_mlp": 1.02670681, "epoch": 0.20357121385198101, "flos": 22923899648640.0, "grad_norm": 2.7146059332266717, "language_loss": 0.86649096, "learning_rate": 3.692352201311996e-06, "loss": 0.88881338, "num_input_tokens_seen": 35916800, "step": 1693, "time_per_iteration": 2.628746271133423 }, { "auxiliary_loss_clip": 0.01138714, "auxiliary_loss_mlp": 0.01054537, "balance_loss_clip": 1.04737353, "balance_loss_mlp": 1.03147006, "epoch": 0.2036914567426201, "flos": 20922131629440.0, "grad_norm": 1.9760821294979187, "language_loss": 0.76933193, "learning_rate": 3.6919369563386687e-06, "loss": 0.79126453, "num_input_tokens_seen": 35936600, "step": 1694, "time_per_iteration": 2.713921546936035 }, { "auxiliary_loss_clip": 0.01154127, "auxiliary_loss_mlp": 0.01052424, "balance_loss_clip": 1.05258274, "balance_loss_mlp": 1.02899969, "epoch": 0.20381169963325918, "flos": 15519155760000.0, "grad_norm": 3.117533805022156, "language_loss": 0.79226327, "learning_rate": 3.69152145469985e-06, "loss": 0.81432879, "num_input_tokens_seen": 35953645, "step": 1695, "time_per_iteration": 2.66874098777771 }, { "auxiliary_loss_clip": 0.01140242, "auxiliary_loss_mlp": 0.01069848, "balance_loss_clip": 1.05192471, "balance_loss_mlp": 1.04560041, "epoch": 0.20393194252389826, "flos": 28833143760000.0, "grad_norm": 1.9404071242928436, "language_loss": 0.82166058, "learning_rate": 3.691105696458572e-06, "loss": 0.8437615, "num_input_tokens_seen": 35970940, "step": 1696, "time_per_iteration": 2.871143341064453 }, { "auxiliary_loss_clip": 0.01183735, "auxiliary_loss_mlp": 0.01054948, "balance_loss_clip": 1.05646777, "balance_loss_mlp": 1.03335953, "epoch": 0.20405218541453737, "flos": 22488554810880.0, "grad_norm": 2.988118365682765, "language_loss": 0.68068194, "learning_rate": 3.690689681677904e-06, "loss": 0.70306873, "num_input_tokens_seen": 35989410, "step": 1697, "time_per_iteration": 2.6922624111175537 }, { "auxiliary_loss_clip": 0.01158658, "auxiliary_loss_mlp": 0.01059114, "balance_loss_clip": 1.05208158, "balance_loss_mlp": 1.03472352, "epoch": 0.20417242830517646, "flos": 25374408278400.0, "grad_norm": 1.7020333849882368, "language_loss": 0.88385135, "learning_rate": 3.690273410420956e-06, "loss": 0.90602911, "num_input_tokens_seen": 36009175, "step": 1698, "time_per_iteration": 2.725149393081665 }, { "auxiliary_loss_clip": 0.01168415, "auxiliary_loss_mlp": 0.01051133, "balance_loss_clip": 1.05214036, "balance_loss_mlp": 1.0286026, "epoch": 0.20429267119581554, "flos": 14793078240000.0, "grad_norm": 2.905698821302587, "language_loss": 0.7726413, "learning_rate": 3.689856882750875e-06, "loss": 0.79483676, "num_input_tokens_seen": 36024375, "step": 1699, "time_per_iteration": 2.7074644565582275 }, { "auxiliary_loss_clip": 0.01164391, "auxiliary_loss_mlp": 0.01051343, "balance_loss_clip": 1.05270362, "balance_loss_mlp": 1.0300045, "epoch": 0.20441291408645465, "flos": 17781851151360.0, "grad_norm": 1.729428497053028, "language_loss": 0.78706956, "learning_rate": 3.6894400987308486e-06, "loss": 0.80922693, "num_input_tokens_seen": 36041895, "step": 1700, "time_per_iteration": 2.610522747039795 }, { "auxiliary_loss_clip": 0.01176308, "auxiliary_loss_mlp": 0.01058413, "balance_loss_clip": 1.05546224, "balance_loss_mlp": 1.03508413, "epoch": 0.20453315697709373, "flos": 16435668211200.0, "grad_norm": 3.765032344697856, "language_loss": 0.84995371, "learning_rate": 3.6890230584241024e-06, "loss": 0.87230092, "num_input_tokens_seen": 36058825, "step": 1701, "time_per_iteration": 2.7057533264160156 }, { "auxiliary_loss_clip": 0.01064858, "auxiliary_loss_mlp": 0.01007578, "balance_loss_clip": 1.01737988, "balance_loss_mlp": 1.00388229, "epoch": 0.20465339986773282, "flos": 66713085653760.0, "grad_norm": 1.0622104231444924, "language_loss": 0.66320527, "learning_rate": 3.6886057618939016e-06, "loss": 0.68392956, "num_input_tokens_seen": 36121645, "step": 1702, "time_per_iteration": 3.262810230255127 }, { "auxiliary_loss_clip": 0.01132132, "auxiliary_loss_mlp": 0.01066495, "balance_loss_clip": 1.04645705, "balance_loss_mlp": 1.04117548, "epoch": 0.2047736427583719, "flos": 41974114924800.0, "grad_norm": 2.2648748587340504, "language_loss": 0.69313669, "learning_rate": 3.6881882092035492e-06, "loss": 0.715123, "num_input_tokens_seen": 36143030, "step": 1703, "time_per_iteration": 4.221449613571167 }, { "auxiliary_loss_clip": 0.01044233, "auxiliary_loss_mlp": 0.00758869, "balance_loss_clip": 1.02259851, "balance_loss_mlp": 1.00096822, "epoch": 0.204893885649011, "flos": 69940878641280.0, "grad_norm": 0.9249552035518188, "language_loss": 0.61185312, "learning_rate": 3.6877704004163873e-06, "loss": 0.62988412, "num_input_tokens_seen": 36203435, "step": 1704, "time_per_iteration": 3.5022592544555664 }, { "auxiliary_loss_clip": 0.01184706, "auxiliary_loss_mlp": 0.01048451, "balance_loss_clip": 1.05473673, "balance_loss_mlp": 1.02798271, "epoch": 0.2050141285396501, "flos": 22200012858240.0, "grad_norm": 1.9088406364705253, "language_loss": 0.77750057, "learning_rate": 3.6873523355957984e-06, "loss": 0.79983217, "num_input_tokens_seen": 36222435, "step": 1705, "time_per_iteration": 3.7131857872009277 }, { "auxiliary_loss_clip": 0.01062571, "auxiliary_loss_mlp": 0.01004037, "balance_loss_clip": 1.01543319, "balance_loss_mlp": 1.00060356, "epoch": 0.20513437143028918, "flos": 46283721730560.0, "grad_norm": 0.9899998963209093, "language_loss": 0.6408959, "learning_rate": 3.686934014805201e-06, "loss": 0.66156203, "num_input_tokens_seen": 36273065, "step": 1706, "time_per_iteration": 3.1148922443389893 }, { "auxiliary_loss_clip": 0.0117286, "auxiliary_loss_mlp": 0.01068334, "balance_loss_clip": 1.05381525, "balance_loss_mlp": 1.04511237, "epoch": 0.20525461432092829, "flos": 21904324099200.0, "grad_norm": 2.068731018702998, "language_loss": 0.81042987, "learning_rate": 3.6865154381080552e-06, "loss": 0.83284187, "num_input_tokens_seen": 36293750, "step": 1707, "time_per_iteration": 3.6483209133148193 }, { "auxiliary_loss_clip": 0.01112835, "auxiliary_loss_mlp": 0.01058978, "balance_loss_clip": 1.04712689, "balance_loss_mlp": 1.03607845, "epoch": 0.20537485721156737, "flos": 21214264942080.0, "grad_norm": 2.0771040278543382, "language_loss": 0.82768941, "learning_rate": 3.6860966055678585e-06, "loss": 0.84940755, "num_input_tokens_seen": 36310105, "step": 1708, "time_per_iteration": 2.859302043914795 }, { "auxiliary_loss_clip": 0.0117124, "auxiliary_loss_mlp": 0.01064821, "balance_loss_clip": 1.05224574, "balance_loss_mlp": 1.04190874, "epoch": 0.20549510010220645, "flos": 20191205773440.0, "grad_norm": 2.0738378023343715, "language_loss": 0.8634876, "learning_rate": 3.685677517248147e-06, "loss": 0.88584816, "num_input_tokens_seen": 36328995, "step": 1709, "time_per_iteration": 2.6827802658081055 }, { "auxiliary_loss_clip": 0.01159296, "auxiliary_loss_mlp": 0.00777098, "balance_loss_clip": 1.05358946, "balance_loss_mlp": 1.00060403, "epoch": 0.20561534299284553, "flos": 17016702612480.0, "grad_norm": 2.1169735607690843, "language_loss": 0.80528086, "learning_rate": 3.6852581732124967e-06, "loss": 0.8246448, "num_input_tokens_seen": 36346340, "step": 1710, "time_per_iteration": 3.61580491065979 }, { "auxiliary_loss_clip": 0.01174031, "auxiliary_loss_mlp": 0.01056989, "balance_loss_clip": 1.05377281, "balance_loss_mlp": 1.03226471, "epoch": 0.20573558588348465, "flos": 22890467064960.0, "grad_norm": 3.5258000956823006, "language_loss": 0.76442677, "learning_rate": 3.6848385735245213e-06, "loss": 0.78673697, "num_input_tokens_seen": 36365430, "step": 1711, "time_per_iteration": 2.7673349380493164 }, { "auxiliary_loss_clip": 0.01152645, "auxiliary_loss_mlp": 0.01058989, "balance_loss_clip": 1.04803467, "balance_loss_mlp": 1.03680396, "epoch": 0.20585582877412373, "flos": 24643123286400.0, "grad_norm": 1.9224924892275002, "language_loss": 0.86454451, "learning_rate": 3.6844187182478734e-06, "loss": 0.88666093, "num_input_tokens_seen": 36386285, "step": 1712, "time_per_iteration": 2.7109572887420654 }, { "auxiliary_loss_clip": 0.01149789, "auxiliary_loss_mlp": 0.01063234, "balance_loss_clip": 1.04906464, "balance_loss_mlp": 1.03885579, "epoch": 0.2059760716647628, "flos": 24206952435840.0, "grad_norm": 2.5470909083465414, "language_loss": 0.74813503, "learning_rate": 3.683998607446246e-06, "loss": 0.77026528, "num_input_tokens_seen": 36404935, "step": 1713, "time_per_iteration": 2.8549821376800537 }, { "auxiliary_loss_clip": 0.01168603, "auxiliary_loss_mlp": 0.01052011, "balance_loss_clip": 1.0524404, "balance_loss_mlp": 1.03051734, "epoch": 0.20609631455540192, "flos": 20229522606720.0, "grad_norm": 1.814072689744323, "language_loss": 0.75151485, "learning_rate": 3.6835782411833686e-06, "loss": 0.77372098, "num_input_tokens_seen": 36424455, "step": 1714, "time_per_iteration": 2.7295987606048584 }, { "auxiliary_loss_clip": 0.01134715, "auxiliary_loss_mlp": 0.01060168, "balance_loss_clip": 1.05088305, "balance_loss_mlp": 1.0372442, "epoch": 0.206216557446041, "flos": 19864957518720.0, "grad_norm": 1.8075658396663359, "language_loss": 0.74122822, "learning_rate": 3.68315761952301e-06, "loss": 0.7631771, "num_input_tokens_seen": 36441685, "step": 1715, "time_per_iteration": 2.794447898864746 }, { "auxiliary_loss_clip": 0.01181822, "auxiliary_loss_mlp": 0.01060573, "balance_loss_clip": 1.05388856, "balance_loss_mlp": 1.03888845, "epoch": 0.2063368003366801, "flos": 24096311568000.0, "grad_norm": 2.0110726376736103, "language_loss": 0.82950151, "learning_rate": 3.6827367425289797e-06, "loss": 0.85192543, "num_input_tokens_seen": 36461460, "step": 1716, "time_per_iteration": 2.787893295288086 }, { "auxiliary_loss_clip": 0.01158703, "auxiliary_loss_mlp": 0.01056328, "balance_loss_clip": 1.05188084, "balance_loss_mlp": 1.03396392, "epoch": 0.2064570432273192, "flos": 20340163474560.0, "grad_norm": 2.710598363660335, "language_loss": 0.72381973, "learning_rate": 3.6823156102651225e-06, "loss": 0.74597001, "num_input_tokens_seen": 36479615, "step": 1717, "time_per_iteration": 2.746532440185547 }, { "auxiliary_loss_clip": 0.01110718, "auxiliary_loss_mlp": 0.01047989, "balance_loss_clip": 1.0491724, "balance_loss_mlp": 1.02491021, "epoch": 0.20657728611795828, "flos": 20520363029760.0, "grad_norm": 2.22114871228431, "language_loss": 0.71000087, "learning_rate": 3.6818942227953257e-06, "loss": 0.73158801, "num_input_tokens_seen": 36500160, "step": 1718, "time_per_iteration": 2.8600785732269287 }, { "auxiliary_loss_clip": 0.0115039, "auxiliary_loss_mlp": 0.01058855, "balance_loss_clip": 1.05317068, "balance_loss_mlp": 1.0372299, "epoch": 0.20669752900859736, "flos": 21799285752960.0, "grad_norm": 2.294517816262675, "language_loss": 0.68824112, "learning_rate": 3.681472580183512e-06, "loss": 0.71033359, "num_input_tokens_seen": 36518810, "step": 1719, "time_per_iteration": 2.8545804023742676 }, { "auxiliary_loss_clip": 0.01165784, "auxiliary_loss_mlp": 0.01055558, "balance_loss_clip": 1.0513041, "balance_loss_mlp": 1.03233624, "epoch": 0.20681777189923645, "flos": 15122020014720.0, "grad_norm": 2.3001331713351663, "language_loss": 0.86575937, "learning_rate": 3.6810506824936455e-06, "loss": 0.88797277, "num_input_tokens_seen": 36536890, "step": 1720, "time_per_iteration": 2.6937239170074463 }, { "auxiliary_loss_clip": 0.01048373, "auxiliary_loss_mlp": 0.01014286, "balance_loss_clip": 1.01993954, "balance_loss_mlp": 1.01123381, "epoch": 0.20693801478987556, "flos": 56481021509760.0, "grad_norm": 1.0530533088184775, "language_loss": 0.62574363, "learning_rate": 3.680628529789726e-06, "loss": 0.64637029, "num_input_tokens_seen": 36589300, "step": 1721, "time_per_iteration": 3.0992255210876465 }, { "auxiliary_loss_clip": 0.01191376, "auxiliary_loss_mlp": 0.01051355, "balance_loss_clip": 1.05610836, "balance_loss_mlp": 1.02794182, "epoch": 0.20705825768051464, "flos": 21614201948160.0, "grad_norm": 3.325689392346665, "language_loss": 0.86552715, "learning_rate": 3.680206122135796e-06, "loss": 0.88795447, "num_input_tokens_seen": 36609905, "step": 1722, "time_per_iteration": 2.6617398262023926 }, { "auxiliary_loss_clip": 0.01137928, "auxiliary_loss_mlp": 0.01051161, "balance_loss_clip": 1.05124295, "balance_loss_mlp": 1.0282129, "epoch": 0.20717850057115372, "flos": 25848895962240.0, "grad_norm": 2.0753819805035145, "language_loss": 0.78518081, "learning_rate": 3.6797834595959323e-06, "loss": 0.80707175, "num_input_tokens_seen": 36629805, "step": 1723, "time_per_iteration": 2.7303788661956787 }, { "auxiliary_loss_clip": 0.01120812, "auxiliary_loss_mlp": 0.01054939, "balance_loss_clip": 1.0449965, "balance_loss_mlp": 1.03040624, "epoch": 0.20729874346179283, "flos": 29130807767040.0, "grad_norm": 2.627906217814018, "language_loss": 0.77608037, "learning_rate": 3.679360542234254e-06, "loss": 0.79783785, "num_input_tokens_seen": 36649150, "step": 1724, "time_per_iteration": 2.79882550239563 }, { "auxiliary_loss_clip": 0.01144049, "auxiliary_loss_mlp": 0.00777181, "balance_loss_clip": 1.0460577, "balance_loss_mlp": 1.00044668, "epoch": 0.20741898635243192, "flos": 29023363209600.0, "grad_norm": 1.9907408543617564, "language_loss": 0.72173136, "learning_rate": 3.678937370114916e-06, "loss": 0.74094367, "num_input_tokens_seen": 36668955, "step": 1725, "time_per_iteration": 2.7226693630218506 }, { "auxiliary_loss_clip": 0.01152117, "auxiliary_loss_mlp": 0.01050698, "balance_loss_clip": 1.05163205, "balance_loss_mlp": 1.02877545, "epoch": 0.207539229243071, "flos": 15559447841280.0, "grad_norm": 2.0264858786941646, "language_loss": 0.78962457, "learning_rate": 3.678513943302114e-06, "loss": 0.81165278, "num_input_tokens_seen": 36685730, "step": 1726, "time_per_iteration": 2.6852853298187256 }, { "auxiliary_loss_clip": 0.01180252, "auxiliary_loss_mlp": 0.01045263, "balance_loss_clip": 1.05275059, "balance_loss_mlp": 1.02406752, "epoch": 0.20765947213371008, "flos": 20521081301760.0, "grad_norm": 2.335188584968219, "language_loss": 0.85122859, "learning_rate": 3.678090261860082e-06, "loss": 0.87348378, "num_input_tokens_seen": 36705460, "step": 1727, "time_per_iteration": 2.685629367828369 }, { "auxiliary_loss_clip": 0.01143594, "auxiliary_loss_mlp": 0.01061617, "balance_loss_clip": 1.04807401, "balance_loss_mlp": 1.03893161, "epoch": 0.2077797150243492, "flos": 19354415558400.0, "grad_norm": 1.8862940160219372, "language_loss": 0.77822143, "learning_rate": 3.6776663258530906e-06, "loss": 0.80027354, "num_input_tokens_seen": 36724110, "step": 1728, "time_per_iteration": 2.7293291091918945 }, { "auxiliary_loss_clip": 0.01169589, "auxiliary_loss_mlp": 0.0104873, "balance_loss_clip": 1.05316067, "balance_loss_mlp": 1.02714157, "epoch": 0.20789995791498828, "flos": 21829952989440.0, "grad_norm": 1.8690804512752983, "language_loss": 0.71308243, "learning_rate": 3.6772421353454516e-06, "loss": 0.73526561, "num_input_tokens_seen": 36742705, "step": 1729, "time_per_iteration": 3.7551093101501465 }, { "auxiliary_loss_clip": 0.01169734, "auxiliary_loss_mlp": 0.01052332, "balance_loss_clip": 1.05526304, "balance_loss_mlp": 1.02890801, "epoch": 0.20802020080562736, "flos": 23148844571520.0, "grad_norm": 1.9456685981310842, "language_loss": 0.88163733, "learning_rate": 3.6768176904015153e-06, "loss": 0.90385807, "num_input_tokens_seen": 36762510, "step": 1730, "time_per_iteration": 2.7115838527679443 }, { "auxiliary_loss_clip": 0.01171301, "auxiliary_loss_mlp": 0.01053286, "balance_loss_clip": 1.0519309, "balance_loss_mlp": 1.03114879, "epoch": 0.20814044369626647, "flos": 23072677781760.0, "grad_norm": 2.1057577965908574, "language_loss": 0.60042, "learning_rate": 3.6763929910856674e-06, "loss": 0.62266582, "num_input_tokens_seen": 36780960, "step": 1731, "time_per_iteration": 3.665271282196045 }, { "auxiliary_loss_clip": 0.01171035, "auxiliary_loss_mlp": 0.01061078, "balance_loss_clip": 1.05587745, "balance_loss_mlp": 1.03933406, "epoch": 0.20826068658690555, "flos": 19608016556160.0, "grad_norm": 2.7895337306266645, "language_loss": 0.77629715, "learning_rate": 3.6759680374623365e-06, "loss": 0.79861832, "num_input_tokens_seen": 36798875, "step": 1732, "time_per_iteration": 2.8060014247894287 }, { "auxiliary_loss_clip": 0.01183037, "auxiliary_loss_mlp": 0.0105588, "balance_loss_clip": 1.05601203, "balance_loss_mlp": 1.03349268, "epoch": 0.20838092947754464, "flos": 25374049142400.0, "grad_norm": 2.4376519014233238, "language_loss": 0.75458622, "learning_rate": 3.675542829595986e-06, "loss": 0.77697533, "num_input_tokens_seen": 36818540, "step": 1733, "time_per_iteration": 3.696361780166626 }, { "auxiliary_loss_clip": 0.01159913, "auxiliary_loss_mlp": 0.0106515, "balance_loss_clip": 1.05285048, "balance_loss_mlp": 1.0430485, "epoch": 0.20850117236818372, "flos": 24061729749120.0, "grad_norm": 1.7770820122591935, "language_loss": 0.79350507, "learning_rate": 3.6751173675511213e-06, "loss": 0.81575572, "num_input_tokens_seen": 36840585, "step": 1734, "time_per_iteration": 2.734830141067505 }, { "auxiliary_loss_clip": 0.01149929, "auxiliary_loss_mlp": 0.01062493, "balance_loss_clip": 1.0472002, "balance_loss_mlp": 1.04022491, "epoch": 0.20862141525882283, "flos": 20077799558400.0, "grad_norm": 2.3716747708950483, "language_loss": 0.87617749, "learning_rate": 3.674691651392283e-06, "loss": 0.89830172, "num_input_tokens_seen": 36858255, "step": 1735, "time_per_iteration": 2.7096400260925293 }, { "auxiliary_loss_clip": 0.01164923, "auxiliary_loss_mlp": 0.01060558, "balance_loss_clip": 1.05415213, "balance_loss_mlp": 1.03857577, "epoch": 0.2087416581494619, "flos": 39015183237120.0, "grad_norm": 4.152248248984742, "language_loss": 0.75571847, "learning_rate": 3.674265681184053e-06, "loss": 0.77797329, "num_input_tokens_seen": 36881515, "step": 1736, "time_per_iteration": 3.6876180171966553 }, { "auxiliary_loss_clip": 0.01158988, "auxiliary_loss_mlp": 0.01055194, "balance_loss_clip": 1.05137777, "balance_loss_mlp": 1.03258014, "epoch": 0.208861901040101, "flos": 26101994169600.0, "grad_norm": 1.8078245881192825, "language_loss": 0.86668426, "learning_rate": 3.6738394569910504e-06, "loss": 0.88882607, "num_input_tokens_seen": 36902055, "step": 1737, "time_per_iteration": 2.7750699520111084 }, { "auxiliary_loss_clip": 0.01174545, "auxiliary_loss_mlp": 0.01057614, "balance_loss_clip": 1.06182623, "balance_loss_mlp": 1.03453553, "epoch": 0.2089821439307401, "flos": 28398732675840.0, "grad_norm": 2.763023023556795, "language_loss": 0.82683992, "learning_rate": 3.6734129788779333e-06, "loss": 0.84916151, "num_input_tokens_seen": 36921230, "step": 1738, "time_per_iteration": 2.6936395168304443 }, { "auxiliary_loss_clip": 0.01143444, "auxiliary_loss_mlp": 0.01057955, "balance_loss_clip": 1.05365443, "balance_loss_mlp": 1.03562713, "epoch": 0.2091023868213792, "flos": 21069616872960.0, "grad_norm": 2.141343239562756, "language_loss": 0.90393549, "learning_rate": 3.6729862469093976e-06, "loss": 0.92594945, "num_input_tokens_seen": 36940325, "step": 1739, "time_per_iteration": 2.7665352821350098 }, { "auxiliary_loss_clip": 0.01146324, "auxiliary_loss_mlp": 0.01056196, "balance_loss_clip": 1.05113149, "balance_loss_mlp": 1.03485727, "epoch": 0.20922262971201827, "flos": 22455481363200.0, "grad_norm": 2.386788781586381, "language_loss": 0.8247171, "learning_rate": 3.6725592611501782e-06, "loss": 0.84674227, "num_input_tokens_seen": 36959000, "step": 1740, "time_per_iteration": 2.802743673324585 }, { "auxiliary_loss_clip": 0.01169001, "auxiliary_loss_mlp": 0.01061444, "balance_loss_clip": 1.05133593, "balance_loss_mlp": 1.03885388, "epoch": 0.20934287260265738, "flos": 27852244179840.0, "grad_norm": 1.796808801725008, "language_loss": 0.76397133, "learning_rate": 3.6721320216650496e-06, "loss": 0.7862758, "num_input_tokens_seen": 36979615, "step": 1741, "time_per_iteration": 2.7296643257141113 }, { "auxiliary_loss_clip": 0.01161378, "auxiliary_loss_mlp": 0.01059876, "balance_loss_clip": 1.05414474, "balance_loss_mlp": 1.03784621, "epoch": 0.20946311549329646, "flos": 16435309075200.0, "grad_norm": 2.10801400373551, "language_loss": 0.83810842, "learning_rate": 3.6717045285188215e-06, "loss": 0.86032093, "num_input_tokens_seen": 36997310, "step": 1742, "time_per_iteration": 2.7499818801879883 }, { "auxiliary_loss_clip": 0.01119821, "auxiliary_loss_mlp": 0.01061625, "balance_loss_clip": 1.04573178, "balance_loss_mlp": 1.03772318, "epoch": 0.20958335838393555, "flos": 22492720788480.0, "grad_norm": 1.971592888114381, "language_loss": 0.86841011, "learning_rate": 3.671276781776346e-06, "loss": 0.89022458, "num_input_tokens_seen": 37015965, "step": 1743, "time_per_iteration": 2.8228442668914795 }, { "auxiliary_loss_clip": 0.01147477, "auxiliary_loss_mlp": 0.01059154, "balance_loss_clip": 1.04835045, "balance_loss_mlp": 1.03676677, "epoch": 0.20970360127457463, "flos": 25224768218880.0, "grad_norm": 2.2566995862567487, "language_loss": 0.67330611, "learning_rate": 3.6708487815025128e-06, "loss": 0.6953724, "num_input_tokens_seen": 37036545, "step": 1744, "time_per_iteration": 2.7910776138305664 }, { "auxiliary_loss_clip": 0.01148822, "auxiliary_loss_mlp": 0.01060932, "balance_loss_clip": 1.05183935, "balance_loss_mlp": 1.03890181, "epoch": 0.20982384416521374, "flos": 18479164855680.0, "grad_norm": 2.753260361038552, "language_loss": 0.74324584, "learning_rate": 3.6704205277622463e-06, "loss": 0.76534331, "num_input_tokens_seen": 37054985, "step": 1745, "time_per_iteration": 2.8122401237487793 }, { "auxiliary_loss_clip": 0.01157389, "auxiliary_loss_mlp": 0.0105135, "balance_loss_clip": 1.04985976, "balance_loss_mlp": 1.03013027, "epoch": 0.20994408705585282, "flos": 25373546352000.0, "grad_norm": 2.152928206179106, "language_loss": 0.80108976, "learning_rate": 3.6699920206205146e-06, "loss": 0.8231771, "num_input_tokens_seen": 37075725, "step": 1746, "time_per_iteration": 2.8465728759765625 }, { "auxiliary_loss_clip": 0.01172951, "auxiliary_loss_mlp": 0.01062211, "balance_loss_clip": 1.05247378, "balance_loss_mlp": 1.03972816, "epoch": 0.2100643299464919, "flos": 21320955313920.0, "grad_norm": 1.794371793015787, "language_loss": 0.81907374, "learning_rate": 3.669563260142321e-06, "loss": 0.84142536, "num_input_tokens_seen": 37094615, "step": 1747, "time_per_iteration": 2.6062939167022705 }, { "auxiliary_loss_clip": 0.01156246, "auxiliary_loss_mlp": 0.01052232, "balance_loss_clip": 1.05254781, "balance_loss_mlp": 1.02879548, "epoch": 0.21018457283713102, "flos": 19354379644800.0, "grad_norm": 2.567243664065309, "language_loss": 0.84617472, "learning_rate": 3.6691342463927083e-06, "loss": 0.86825949, "num_input_tokens_seen": 37113610, "step": 1748, "time_per_iteration": 2.73982310295105 }, { "auxiliary_loss_clip": 0.0114606, "auxiliary_loss_mlp": 0.01062145, "balance_loss_clip": 1.05105138, "balance_loss_mlp": 1.04021025, "epoch": 0.2103048157277701, "flos": 28330035914880.0, "grad_norm": 1.9437773276880481, "language_loss": 0.81563753, "learning_rate": 3.668704979436758e-06, "loss": 0.83771956, "num_input_tokens_seen": 37133705, "step": 1749, "time_per_iteration": 2.749112844467163 }, { "auxiliary_loss_clip": 0.01149732, "auxiliary_loss_mlp": 0.01065952, "balance_loss_clip": 1.04855788, "balance_loss_mlp": 1.04115617, "epoch": 0.21042505861840918, "flos": 17457290835840.0, "grad_norm": 2.4081940786435503, "language_loss": 0.78821266, "learning_rate": 3.668275459339588e-06, "loss": 0.81036943, "num_input_tokens_seen": 37152185, "step": 1750, "time_per_iteration": 2.697995901107788 }, { "auxiliary_loss_clip": 0.01185306, "auxiliary_loss_mlp": 0.01061104, "balance_loss_clip": 1.05698884, "balance_loss_mlp": 1.0377152, "epoch": 0.21054530150904827, "flos": 14209817195520.0, "grad_norm": 4.256055334664717, "language_loss": 0.80029392, "learning_rate": 3.667845686166358e-06, "loss": 0.82275808, "num_input_tokens_seen": 37169110, "step": 1751, "time_per_iteration": 2.596005439758301 }, { "auxiliary_loss_clip": 0.01123054, "auxiliary_loss_mlp": 0.01058669, "balance_loss_clip": 1.04431868, "balance_loss_mlp": 1.03332531, "epoch": 0.21066554439968738, "flos": 18618210403200.0, "grad_norm": 1.7152235652691912, "language_loss": 0.85840762, "learning_rate": 3.6674156599822634e-06, "loss": 0.88022482, "num_input_tokens_seen": 37184905, "step": 1752, "time_per_iteration": 2.668760299682617 }, { "auxiliary_loss_clip": 0.01129785, "auxiliary_loss_mlp": 0.01063436, "balance_loss_clip": 1.04571283, "balance_loss_mlp": 1.03983247, "epoch": 0.21078578729032646, "flos": 23658883741440.0, "grad_norm": 2.265656445828761, "language_loss": 0.81550819, "learning_rate": 3.666985380852539e-06, "loss": 0.83744043, "num_input_tokens_seen": 37203910, "step": 1753, "time_per_iteration": 2.739177942276001 }, { "auxiliary_loss_clip": 0.01162514, "auxiliary_loss_mlp": 0.01056693, "balance_loss_clip": 1.05210114, "balance_loss_mlp": 1.03248215, "epoch": 0.21090603018096554, "flos": 29346379240320.0, "grad_norm": 3.206531958365361, "language_loss": 0.74553055, "learning_rate": 3.6665548488424576e-06, "loss": 0.76772261, "num_input_tokens_seen": 37222670, "step": 1754, "time_per_iteration": 3.770080089569092 }, { "auxiliary_loss_clip": 0.01187741, "auxiliary_loss_mlp": 0.0106006, "balance_loss_clip": 1.05815256, "balance_loss_mlp": 1.03553843, "epoch": 0.21102627307160465, "flos": 23261245205760.0, "grad_norm": 2.0430819752318934, "language_loss": 0.88097858, "learning_rate": 3.6661240640173307e-06, "loss": 0.90345663, "num_input_tokens_seen": 37244140, "step": 1755, "time_per_iteration": 2.660417318344116 }, { "auxiliary_loss_clip": 0.01036091, "auxiliary_loss_mlp": 0.01000781, "balance_loss_clip": 1.01890445, "balance_loss_mlp": 0.99768144, "epoch": 0.21114651596224374, "flos": 54633454577280.0, "grad_norm": 1.0593180999309737, "language_loss": 0.57967514, "learning_rate": 3.6656930264425085e-06, "loss": 0.60004383, "num_input_tokens_seen": 37308185, "step": 1756, "time_per_iteration": 3.3043811321258545 }, { "auxiliary_loss_clip": 0.01184021, "auxiliary_loss_mlp": 0.0105291, "balance_loss_clip": 1.05573511, "balance_loss_mlp": 1.02990294, "epoch": 0.21126675885288282, "flos": 21543314457600.0, "grad_norm": 2.4076236712669092, "language_loss": 0.75748479, "learning_rate": 3.665261736183378e-06, "loss": 0.77985406, "num_input_tokens_seen": 37328220, "step": 1757, "time_per_iteration": 3.5970118045806885 }, { "auxiliary_loss_clip": 0.01146454, "auxiliary_loss_mlp": 0.01053929, "balance_loss_clip": 1.05461812, "balance_loss_mlp": 1.02949071, "epoch": 0.2113870017435219, "flos": 10961876678400.0, "grad_norm": 3.1544738826898095, "language_loss": 0.88771135, "learning_rate": 3.664830193305366e-06, "loss": 0.90971518, "num_input_tokens_seen": 37345995, "step": 1758, "time_per_iteration": 2.712824821472168 }, { "auxiliary_loss_clip": 0.0113467, "auxiliary_loss_mlp": 0.01055905, "balance_loss_clip": 1.04457045, "balance_loss_mlp": 1.03057289, "epoch": 0.211507244634161, "flos": 16653825463680.0, "grad_norm": 3.0362814227666277, "language_loss": 0.77023113, "learning_rate": 3.6643983978739373e-06, "loss": 0.79213691, "num_input_tokens_seen": 37362610, "step": 1759, "time_per_iteration": 3.6303036212921143 }, { "auxiliary_loss_clip": 0.0114955, "auxiliary_loss_mlp": 0.01062464, "balance_loss_clip": 1.05117035, "balance_loss_mlp": 1.03758478, "epoch": 0.2116274875248001, "flos": 20954091755520.0, "grad_norm": 3.4834103000806853, "language_loss": 0.82076305, "learning_rate": 3.663966349954596e-06, "loss": 0.84288323, "num_input_tokens_seen": 37382790, "step": 1760, "time_per_iteration": 2.757870674133301 }, { "auxiliary_loss_clip": 0.01054658, "auxiliary_loss_mlp": 0.0100605, "balance_loss_clip": 1.01756132, "balance_loss_mlp": 1.00273597, "epoch": 0.21174773041543918, "flos": 68196949424640.0, "grad_norm": 0.7857924544232523, "language_loss": 0.59635669, "learning_rate": 3.6635340496128816e-06, "loss": 0.61696374, "num_input_tokens_seen": 37439720, "step": 1761, "time_per_iteration": 3.1830527782440186 }, { "auxiliary_loss_clip": 0.01125802, "auxiliary_loss_mlp": 0.01060303, "balance_loss_clip": 1.0471499, "balance_loss_mlp": 1.03708136, "epoch": 0.2118679733060783, "flos": 20668315150080.0, "grad_norm": 1.9271867520728285, "language_loss": 0.93096387, "learning_rate": 3.6631014969143747e-06, "loss": 0.95282495, "num_input_tokens_seen": 37459410, "step": 1762, "time_per_iteration": 3.6293818950653076 }, { "auxiliary_loss_clip": 0.01167847, "auxiliary_loss_mlp": 0.01059541, "balance_loss_clip": 1.05345833, "balance_loss_mlp": 1.03562808, "epoch": 0.21198821619671737, "flos": 23223431162880.0, "grad_norm": 1.8733012849786839, "language_loss": 0.88640141, "learning_rate": 3.662668691924693e-06, "loss": 0.90867531, "num_input_tokens_seen": 37480460, "step": 1763, "time_per_iteration": 2.722928285598755 }, { "auxiliary_loss_clip": 0.01140171, "auxiliary_loss_mlp": 0.01056829, "balance_loss_clip": 1.04855788, "balance_loss_mlp": 1.03152108, "epoch": 0.21210845908735645, "flos": 24498547044480.0, "grad_norm": 2.278010680249471, "language_loss": 0.71430838, "learning_rate": 3.6622356347094927e-06, "loss": 0.73627841, "num_input_tokens_seen": 37502025, "step": 1764, "time_per_iteration": 2.7840240001678467 }, { "auxiliary_loss_clip": 0.01138644, "auxiliary_loss_mlp": 0.01061792, "balance_loss_clip": 1.04646993, "balance_loss_mlp": 1.03574467, "epoch": 0.21222870197799554, "flos": 27089789160960.0, "grad_norm": 2.1605216399435645, "language_loss": 0.78759712, "learning_rate": 3.6618023253344684e-06, "loss": 0.80960149, "num_input_tokens_seen": 37520885, "step": 1765, "time_per_iteration": 2.8382906913757324 }, { "auxiliary_loss_clip": 0.01167969, "auxiliary_loss_mlp": 0.01063042, "balance_loss_clip": 1.0503211, "balance_loss_mlp": 1.03914022, "epoch": 0.21234894486863465, "flos": 16873850223360.0, "grad_norm": 1.7288738381981348, "language_loss": 0.83512759, "learning_rate": 3.6613687638653527e-06, "loss": 0.85743767, "num_input_tokens_seen": 37539055, "step": 1766, "time_per_iteration": 2.739403486251831 }, { "auxiliary_loss_clip": 0.0115533, "auxiliary_loss_mlp": 0.01052725, "balance_loss_clip": 1.05192018, "balance_loss_mlp": 1.02906215, "epoch": 0.21246918775927373, "flos": 23474949171840.0, "grad_norm": 2.405802553576027, "language_loss": 0.77783674, "learning_rate": 3.660934950367916e-06, "loss": 0.79991734, "num_input_tokens_seen": 37558300, "step": 1767, "time_per_iteration": 2.720916271209717 }, { "auxiliary_loss_clip": 0.01171466, "auxiliary_loss_mlp": 0.01055377, "balance_loss_clip": 1.05319881, "balance_loss_mlp": 1.02992594, "epoch": 0.21258943064991281, "flos": 22382295402240.0, "grad_norm": 1.8443708984553542, "language_loss": 0.83411717, "learning_rate": 3.660500884907968e-06, "loss": 0.85638559, "num_input_tokens_seen": 37579040, "step": 1768, "time_per_iteration": 2.898369312286377 }, { "auxiliary_loss_clip": 0.01020653, "auxiliary_loss_mlp": 0.01023168, "balance_loss_clip": 1.01336586, "balance_loss_mlp": 1.0197829, "epoch": 0.21270967354055192, "flos": 59440168679040.0, "grad_norm": 0.826186078354373, "language_loss": 0.60037327, "learning_rate": 3.660066567551356e-06, "loss": 0.62081146, "num_input_tokens_seen": 37639185, "step": 1769, "time_per_iteration": 3.2134459018707275 }, { "auxiliary_loss_clip": 0.01168663, "auxiliary_loss_mlp": 0.00776779, "balance_loss_clip": 1.05424333, "balance_loss_mlp": 1.00069416, "epoch": 0.212829916431191, "flos": 21544032729600.0, "grad_norm": 3.1388908312580135, "language_loss": 0.84181768, "learning_rate": 3.6596319983639657e-06, "loss": 0.8612721, "num_input_tokens_seen": 37657765, "step": 1770, "time_per_iteration": 2.6933891773223877 }, { "auxiliary_loss_clip": 0.01150423, "auxiliary_loss_mlp": 0.00778121, "balance_loss_clip": 1.05459809, "balance_loss_mlp": 1.00076151, "epoch": 0.2129501593218301, "flos": 28987739896320.0, "grad_norm": 1.666059044813124, "language_loss": 0.85962343, "learning_rate": 3.6591971774117214e-06, "loss": 0.87890887, "num_input_tokens_seen": 37680740, "step": 1771, "time_per_iteration": 2.7717981338500977 }, { "auxiliary_loss_clip": 0.01174693, "auxiliary_loss_mlp": 0.01061068, "balance_loss_clip": 1.05529046, "balance_loss_mlp": 1.03890705, "epoch": 0.2130704022124692, "flos": 18806993308800.0, "grad_norm": 2.3909388472142936, "language_loss": 0.80427194, "learning_rate": 3.6587621047605833e-06, "loss": 0.82662952, "num_input_tokens_seen": 37697910, "step": 1772, "time_per_iteration": 2.6137068271636963 }, { "auxiliary_loss_clip": 0.01161649, "auxiliary_loss_mlp": 0.0105375, "balance_loss_clip": 1.04951143, "balance_loss_mlp": 1.02989578, "epoch": 0.21319064510310828, "flos": 13918150759680.0, "grad_norm": 2.209573577945875, "language_loss": 0.86682534, "learning_rate": 3.6583267804765542e-06, "loss": 0.88897932, "num_input_tokens_seen": 37712245, "step": 1773, "time_per_iteration": 2.6269845962524414 }, { "auxiliary_loss_clip": 0.01166183, "auxiliary_loss_mlp": 0.01062627, "balance_loss_clip": 1.05077147, "balance_loss_mlp": 1.03860629, "epoch": 0.21331088799374737, "flos": 20959694277120.0, "grad_norm": 2.2986141007299605, "language_loss": 0.85822278, "learning_rate": 3.6578912046256702e-06, "loss": 0.88051093, "num_input_tokens_seen": 37730765, "step": 1774, "time_per_iteration": 2.6591989994049072 }, { "auxiliary_loss_clip": 0.01134248, "auxiliary_loss_mlp": 0.01058744, "balance_loss_clip": 1.04402435, "balance_loss_mlp": 1.0334959, "epoch": 0.21343113088438645, "flos": 18624638937600.0, "grad_norm": 2.2740827352003916, "language_loss": 0.76257217, "learning_rate": 3.6574553772740083e-06, "loss": 0.78450209, "num_input_tokens_seen": 37748695, "step": 1775, "time_per_iteration": 2.7374110221862793 }, { "auxiliary_loss_clip": 0.01047527, "auxiliary_loss_mlp": 0.01004461, "balance_loss_clip": 1.01600015, "balance_loss_mlp": 1.00136113, "epoch": 0.21355137377502556, "flos": 67413128791680.0, "grad_norm": 1.070033807231055, "language_loss": 0.61894876, "learning_rate": 3.657019298487684e-06, "loss": 0.63946873, "num_input_tokens_seen": 37813705, "step": 1776, "time_per_iteration": 3.258190155029297 }, { "auxiliary_loss_clip": 0.01168483, "auxiliary_loss_mlp": 0.00777896, "balance_loss_clip": 1.04949629, "balance_loss_mlp": 1.00064325, "epoch": 0.21367161666566464, "flos": 34532095697280.0, "grad_norm": 1.677324887663463, "language_loss": 0.83376467, "learning_rate": 3.6565829683328495e-06, "loss": 0.85322839, "num_input_tokens_seen": 37836330, "step": 1777, "time_per_iteration": 2.81402587890625 }, { "auxiliary_loss_clip": 0.01163572, "auxiliary_loss_mlp": 0.01055495, "balance_loss_clip": 1.05361819, "balance_loss_mlp": 1.03179622, "epoch": 0.21379185955630373, "flos": 18989347680000.0, "grad_norm": 7.002028010814159, "language_loss": 0.85867155, "learning_rate": 3.6561463868756965e-06, "loss": 0.88086218, "num_input_tokens_seen": 37855030, "step": 1778, "time_per_iteration": 2.630042314529419 }, { "auxiliary_loss_clip": 0.01166058, "auxiliary_loss_mlp": 0.01053511, "balance_loss_clip": 1.05070055, "balance_loss_mlp": 1.02968109, "epoch": 0.21391210244694284, "flos": 28218497207040.0, "grad_norm": 1.6002552085362767, "language_loss": 0.78168607, "learning_rate": 3.655709554182452e-06, "loss": 0.80388176, "num_input_tokens_seen": 37875370, "step": 1779, "time_per_iteration": 2.736976146697998 }, { "auxiliary_loss_clip": 0.01171236, "auxiliary_loss_mlp": 0.01059402, "balance_loss_clip": 1.05078936, "balance_loss_mlp": 1.03571558, "epoch": 0.21403234533758192, "flos": 17455064192640.0, "grad_norm": 1.7070832096155122, "language_loss": 0.84279275, "learning_rate": 3.6552724703193855e-06, "loss": 0.86509913, "num_input_tokens_seen": 37892560, "step": 1780, "time_per_iteration": 3.713794469833374 }, { "auxiliary_loss_clip": 0.01014531, "auxiliary_loss_mlp": 0.01003535, "balance_loss_clip": 1.01062679, "balance_loss_mlp": 0.99936259, "epoch": 0.214152588228221, "flos": 51637606686720.0, "grad_norm": 0.7862127879370976, "language_loss": 0.55935669, "learning_rate": 3.654835135352801e-06, "loss": 0.57953727, "num_input_tokens_seen": 37947370, "step": 1781, "time_per_iteration": 3.1759538650512695 }, { "auxiliary_loss_clip": 0.01123142, "auxiliary_loss_mlp": 0.01064687, "balance_loss_clip": 1.04179263, "balance_loss_mlp": 1.04040384, "epoch": 0.21427283111886009, "flos": 19496154625920.0, "grad_norm": 2.056941370397785, "language_loss": 0.87549084, "learning_rate": 3.654397549349043e-06, "loss": 0.89736915, "num_input_tokens_seen": 37964745, "step": 1782, "time_per_iteration": 2.764613389968872 }, { "auxiliary_loss_clip": 0.01151865, "auxiliary_loss_mlp": 0.01051469, "balance_loss_clip": 1.05018079, "balance_loss_mlp": 1.02821088, "epoch": 0.2143930740094992, "flos": 20084802710400.0, "grad_norm": 2.2777617712473948, "language_loss": 0.75858009, "learning_rate": 3.653959712374491e-06, "loss": 0.78061342, "num_input_tokens_seen": 37982850, "step": 1783, "time_per_iteration": 2.680487871170044 }, { "auxiliary_loss_clip": 0.01135001, "auxiliary_loss_mlp": 0.01047964, "balance_loss_clip": 1.0497272, "balance_loss_mlp": 1.02626824, "epoch": 0.21451331690013828, "flos": 21798603394560.0, "grad_norm": 1.6873652810043505, "language_loss": 0.82748103, "learning_rate": 3.6535216244955663e-06, "loss": 0.8493107, "num_input_tokens_seen": 38002745, "step": 1784, "time_per_iteration": 3.6610987186431885 }, { "auxiliary_loss_clip": 0.0115226, "auxiliary_loss_mlp": 0.0106233, "balance_loss_clip": 1.04875433, "balance_loss_mlp": 1.03845286, "epoch": 0.21463355979077736, "flos": 32853882412800.0, "grad_norm": 2.0489349266507624, "language_loss": 0.71108997, "learning_rate": 3.653083285778726e-06, "loss": 0.73323584, "num_input_tokens_seen": 38024115, "step": 1785, "time_per_iteration": 2.868215322494507 }, { "auxiliary_loss_clip": 0.01173159, "auxiliary_loss_mlp": 0.0105906, "balance_loss_clip": 1.05260444, "balance_loss_mlp": 1.03590941, "epoch": 0.21475380268141647, "flos": 21543817248000.0, "grad_norm": 2.5151462543628282, "language_loss": 0.81274939, "learning_rate": 3.6526446962904653e-06, "loss": 0.83507162, "num_input_tokens_seen": 38042830, "step": 1786, "time_per_iteration": 3.6278293132781982 }, { "auxiliary_loss_clip": 0.01162527, "auxiliary_loss_mlp": 0.01053487, "balance_loss_clip": 1.05299211, "balance_loss_mlp": 1.03207695, "epoch": 0.21487404557205556, "flos": 32159082660480.0, "grad_norm": 1.5590312044415178, "language_loss": 0.7431637, "learning_rate": 3.652205856097318e-06, "loss": 0.76532382, "num_input_tokens_seen": 38066015, "step": 1787, "time_per_iteration": 2.7933030128479004 }, { "auxiliary_loss_clip": 0.01161906, "auxiliary_loss_mlp": 0.00778109, "balance_loss_clip": 1.05271757, "balance_loss_mlp": 1.00064182, "epoch": 0.21499428846269464, "flos": 12673091583360.0, "grad_norm": 2.892663971846416, "language_loss": 0.78955686, "learning_rate": 3.651766765265856e-06, "loss": 0.80895704, "num_input_tokens_seen": 38083025, "step": 1788, "time_per_iteration": 3.6007916927337646 }, { "auxiliary_loss_clip": 0.01152429, "auxiliary_loss_mlp": 0.01057557, "balance_loss_clip": 1.05127478, "balance_loss_mlp": 1.03463268, "epoch": 0.21511453135333372, "flos": 23471573293440.0, "grad_norm": 3.3371036241167267, "language_loss": 0.81527936, "learning_rate": 3.65132742386269e-06, "loss": 0.83737922, "num_input_tokens_seen": 38098245, "step": 1789, "time_per_iteration": 2.706037998199463 }, { "auxiliary_loss_clip": 0.01181569, "auxiliary_loss_mlp": 0.01064043, "balance_loss_clip": 1.05293, "balance_loss_mlp": 1.04181039, "epoch": 0.21523477424397283, "flos": 26943560893440.0, "grad_norm": 1.7450994630397125, "language_loss": 0.85016686, "learning_rate": 3.6508878319544656e-06, "loss": 0.87262297, "num_input_tokens_seen": 38118460, "step": 1790, "time_per_iteration": 2.746562957763672 }, { "auxiliary_loss_clip": 0.01143008, "auxiliary_loss_mlp": 0.01055593, "balance_loss_clip": 1.04755664, "balance_loss_mlp": 1.03390908, "epoch": 0.21535501713461191, "flos": 18916161719040.0, "grad_norm": 2.6755267974452646, "language_loss": 0.81424129, "learning_rate": 3.65044798960787e-06, "loss": 0.8362273, "num_input_tokens_seen": 38136800, "step": 1791, "time_per_iteration": 2.743711233139038 }, { "auxiliary_loss_clip": 0.01131996, "auxiliary_loss_mlp": 0.01055802, "balance_loss_clip": 1.04802942, "balance_loss_mlp": 1.03407037, "epoch": 0.215475260025251, "flos": 17895113712000.0, "grad_norm": 2.723702797449953, "language_loss": 0.78621101, "learning_rate": 3.650007896889627e-06, "loss": 0.80808902, "num_input_tokens_seen": 38155380, "step": 1792, "time_per_iteration": 2.7266664505004883 }, { "auxiliary_loss_clip": 0.01180638, "auxiliary_loss_mlp": 0.01060016, "balance_loss_clip": 1.05617261, "balance_loss_mlp": 1.03693748, "epoch": 0.2155955029158901, "flos": 16654292340480.0, "grad_norm": 2.341621556205827, "language_loss": 0.80629373, "learning_rate": 3.6495675538664974e-06, "loss": 0.8287003, "num_input_tokens_seen": 38174395, "step": 1793, "time_per_iteration": 2.552616596221924 }, { "auxiliary_loss_clip": 0.01154023, "auxiliary_loss_mlp": 0.01061825, "balance_loss_clip": 1.04828894, "balance_loss_mlp": 1.03789926, "epoch": 0.2157157458065292, "flos": 23621213352960.0, "grad_norm": 2.006298934832928, "language_loss": 0.82553667, "learning_rate": 3.649126960605282e-06, "loss": 0.84769511, "num_input_tokens_seen": 38195380, "step": 1794, "time_per_iteration": 2.7283265590667725 }, { "auxiliary_loss_clip": 0.01155926, "auxiliary_loss_mlp": 0.0106768, "balance_loss_clip": 1.05235028, "balance_loss_mlp": 1.04451728, "epoch": 0.21583598869716827, "flos": 22127078292480.0, "grad_norm": 3.741741027490424, "language_loss": 0.83362907, "learning_rate": 3.6486861171728174e-06, "loss": 0.85586512, "num_input_tokens_seen": 38213775, "step": 1795, "time_per_iteration": 2.6821322441101074 }, { "auxiliary_loss_clip": 0.01142458, "auxiliary_loss_mlp": 0.01056474, "balance_loss_clip": 1.04707861, "balance_loss_mlp": 1.0322988, "epoch": 0.21595623158780738, "flos": 23441229279360.0, "grad_norm": 1.7266121202275722, "language_loss": 0.78376222, "learning_rate": 3.6482450236359803e-06, "loss": 0.80575156, "num_input_tokens_seen": 38235630, "step": 1796, "time_per_iteration": 2.7625222206115723 }, { "auxiliary_loss_clip": 0.01163853, "auxiliary_loss_mlp": 0.01046635, "balance_loss_clip": 1.05088198, "balance_loss_mlp": 1.02449799, "epoch": 0.21607647447844647, "flos": 26906501036160.0, "grad_norm": 3.4793819591683106, "language_loss": 0.77939129, "learning_rate": 3.647803680061683e-06, "loss": 0.80149615, "num_input_tokens_seen": 38256045, "step": 1797, "time_per_iteration": 2.765634298324585 }, { "auxiliary_loss_clip": 0.01158059, "auxiliary_loss_mlp": 0.01062598, "balance_loss_clip": 1.05107927, "balance_loss_mlp": 1.04027009, "epoch": 0.21619671736908555, "flos": 14495378319360.0, "grad_norm": 2.5428179271051157, "language_loss": 0.7479068, "learning_rate": 3.6473620865168776e-06, "loss": 0.77011341, "num_input_tokens_seen": 38272915, "step": 1798, "time_per_iteration": 2.7877719402313232 }, { "auxiliary_loss_clip": 0.01155176, "auxiliary_loss_mlp": 0.01065476, "balance_loss_clip": 1.05321062, "balance_loss_mlp": 1.04392338, "epoch": 0.21631696025972463, "flos": 17931096161280.0, "grad_norm": 7.3681704614657635, "language_loss": 0.81721807, "learning_rate": 3.646920243068554e-06, "loss": 0.83942449, "num_input_tokens_seen": 38290810, "step": 1799, "time_per_iteration": 2.7264575958251953 }, { "auxiliary_loss_clip": 0.01143885, "auxiliary_loss_mlp": 0.01052169, "balance_loss_clip": 1.05097234, "balance_loss_mlp": 1.02746892, "epoch": 0.21643720315036374, "flos": 24462385027200.0, "grad_norm": 2.6488816140217395, "language_loss": 0.74546361, "learning_rate": 3.6464781497837384e-06, "loss": 0.76742417, "num_input_tokens_seen": 38312785, "step": 1800, "time_per_iteration": 2.8095216751098633 }, { "auxiliary_loss_clip": 0.01157599, "auxiliary_loss_mlp": 0.01054178, "balance_loss_clip": 1.0477879, "balance_loss_mlp": 1.03225493, "epoch": 0.21655744604100283, "flos": 28474432588800.0, "grad_norm": 1.7600972125701049, "language_loss": 0.72484624, "learning_rate": 3.6460358067294965e-06, "loss": 0.7469641, "num_input_tokens_seen": 38334015, "step": 1801, "time_per_iteration": 2.809067487716675 }, { "auxiliary_loss_clip": 0.01184413, "auxiliary_loss_mlp": 0.01063166, "balance_loss_clip": 1.05105877, "balance_loss_mlp": 1.03719068, "epoch": 0.2166776889316419, "flos": 20152960767360.0, "grad_norm": 2.2108576459443134, "language_loss": 0.77855092, "learning_rate": 3.645593213972932e-06, "loss": 0.8010267, "num_input_tokens_seen": 38352920, "step": 1802, "time_per_iteration": 2.679575204849243 }, { "auxiliary_loss_clip": 0.01162323, "auxiliary_loss_mlp": 0.01062225, "balance_loss_clip": 1.050071, "balance_loss_mlp": 1.03860962, "epoch": 0.21679793182228102, "flos": 15193482122880.0, "grad_norm": 2.5816353565433667, "language_loss": 0.79820877, "learning_rate": 3.6451503715811852e-06, "loss": 0.82045424, "num_input_tokens_seen": 38371230, "step": 1803, "time_per_iteration": 2.7964344024658203 }, { "auxiliary_loss_clip": 0.01154638, "auxiliary_loss_mlp": 0.01072166, "balance_loss_clip": 1.05379426, "balance_loss_mlp": 1.05019546, "epoch": 0.2169181747129201, "flos": 17384464010880.0, "grad_norm": 2.0154417511527893, "language_loss": 0.79820716, "learning_rate": 3.6447072796214345e-06, "loss": 0.82047522, "num_input_tokens_seen": 38389795, "step": 1804, "time_per_iteration": 2.7008767127990723 }, { "auxiliary_loss_clip": 0.01019715, "auxiliary_loss_mlp": 0.01009213, "balance_loss_clip": 1.01292813, "balance_loss_mlp": 1.00506496, "epoch": 0.21703841760355919, "flos": 58760955429120.0, "grad_norm": 0.939318546317084, "language_loss": 0.63187385, "learning_rate": 3.644263938160898e-06, "loss": 0.65216315, "num_input_tokens_seen": 38445760, "step": 1805, "time_per_iteration": 3.2090518474578857 }, { "auxiliary_loss_clip": 0.01140258, "auxiliary_loss_mlp": 0.01053441, "balance_loss_clip": 1.05070066, "balance_loss_mlp": 1.02933645, "epoch": 0.21715866049419827, "flos": 22418457419520.0, "grad_norm": 1.9212916266225517, "language_loss": 0.7203145, "learning_rate": 3.6438203472668293e-06, "loss": 0.7422514, "num_input_tokens_seen": 38465405, "step": 1806, "time_per_iteration": 3.820133686065674 }, { "auxiliary_loss_clip": 0.0115796, "auxiliary_loss_mlp": 0.01049775, "balance_loss_clip": 1.0510087, "balance_loss_mlp": 1.02811444, "epoch": 0.21727890338483738, "flos": 17237732952960.0, "grad_norm": 2.232533686604721, "language_loss": 0.82049465, "learning_rate": 3.6433765070065206e-06, "loss": 0.84257197, "num_input_tokens_seen": 38483195, "step": 1807, "time_per_iteration": 2.651764154434204 }, { "auxiliary_loss_clip": 0.01183187, "auxiliary_loss_mlp": 0.01054414, "balance_loss_clip": 1.05315542, "balance_loss_mlp": 1.0308938, "epoch": 0.21739914627547646, "flos": 13434792416640.0, "grad_norm": 2.4966025289859655, "language_loss": 0.87333441, "learning_rate": 3.6429324174473025e-06, "loss": 0.89571041, "num_input_tokens_seen": 38496735, "step": 1808, "time_per_iteration": 2.7117857933044434 }, { "auxiliary_loss_clip": 0.01170408, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.0535121, "balance_loss_mlp": 1.02997828, "epoch": 0.21751938916611555, "flos": 20959514709120.0, "grad_norm": 3.0544148510638855, "language_loss": 0.84702241, "learning_rate": 3.6424880786565425e-06, "loss": 0.86924469, "num_input_tokens_seen": 38512880, "step": 1809, "time_per_iteration": 2.6924827098846436 }, { "auxiliary_loss_clip": 0.01132311, "auxiliary_loss_mlp": 0.01053157, "balance_loss_clip": 1.05264091, "balance_loss_mlp": 1.02931523, "epoch": 0.21763963205675466, "flos": 27599936071680.0, "grad_norm": 2.476366040899027, "language_loss": 0.7981869, "learning_rate": 3.6420434907016482e-06, "loss": 0.8200416, "num_input_tokens_seen": 38532570, "step": 1810, "time_per_iteration": 3.7164628505706787 }, { "auxiliary_loss_clip": 0.01169218, "auxiliary_loss_mlp": 0.01048955, "balance_loss_clip": 1.05547857, "balance_loss_mlp": 1.02513719, "epoch": 0.21775987494739374, "flos": 21430411032960.0, "grad_norm": 1.7719307393837593, "language_loss": 0.81475425, "learning_rate": 3.6415986536500606e-06, "loss": 0.836936, "num_input_tokens_seen": 38550900, "step": 1811, "time_per_iteration": 3.66654372215271 }, { "auxiliary_loss_clip": 0.01126132, "auxiliary_loss_mlp": 0.01053328, "balance_loss_clip": 1.05276954, "balance_loss_mlp": 1.03152502, "epoch": 0.21788011783803282, "flos": 18332972501760.0, "grad_norm": 1.7342365956347527, "language_loss": 0.80400229, "learning_rate": 3.641153567569263e-06, "loss": 0.8257969, "num_input_tokens_seen": 38569215, "step": 1812, "time_per_iteration": 2.7463722229003906 }, { "auxiliary_loss_clip": 0.01163558, "auxiliary_loss_mlp": 0.01047378, "balance_loss_clip": 1.0513947, "balance_loss_mlp": 1.02558684, "epoch": 0.2180003607286719, "flos": 30262748037120.0, "grad_norm": 2.106475808128097, "language_loss": 0.95621902, "learning_rate": 3.640708232526774e-06, "loss": 0.97832841, "num_input_tokens_seen": 38587870, "step": 1813, "time_per_iteration": 2.730234384536743 }, { "auxiliary_loss_clip": 0.01111718, "auxiliary_loss_mlp": 0.01061453, "balance_loss_clip": 1.04214859, "balance_loss_mlp": 1.03863668, "epoch": 0.21812060361931102, "flos": 25480272637440.0, "grad_norm": 3.3705246897453387, "language_loss": 0.78648615, "learning_rate": 3.6402626485901504e-06, "loss": 0.80821788, "num_input_tokens_seen": 38606965, "step": 1814, "time_per_iteration": 3.6813976764678955 }, { "auxiliary_loss_clip": 0.01165802, "auxiliary_loss_mlp": 0.01059455, "balance_loss_clip": 1.05791855, "balance_loss_mlp": 1.03785467, "epoch": 0.2182408465099501, "flos": 21908166854400.0, "grad_norm": 2.0339959556898255, "language_loss": 0.78164935, "learning_rate": 3.639816815826988e-06, "loss": 0.80390191, "num_input_tokens_seen": 38626290, "step": 1815, "time_per_iteration": 2.669687509536743 }, { "auxiliary_loss_clip": 0.01154896, "auxiliary_loss_mlp": 0.01056261, "balance_loss_clip": 1.05396044, "balance_loss_mlp": 1.03429103, "epoch": 0.21836108940058918, "flos": 23657339456640.0, "grad_norm": 1.8568422750172244, "language_loss": 0.77812958, "learning_rate": 3.6393707343049176e-06, "loss": 0.80024111, "num_input_tokens_seen": 38646620, "step": 1816, "time_per_iteration": 2.783433198928833 }, { "auxiliary_loss_clip": 0.01171444, "auxiliary_loss_mlp": 0.01054358, "balance_loss_clip": 1.05406868, "balance_loss_mlp": 1.03287625, "epoch": 0.2184813322912283, "flos": 24681009156480.0, "grad_norm": 2.5641077591945427, "language_loss": 0.7350477, "learning_rate": 3.6389244040916104e-06, "loss": 0.75730574, "num_input_tokens_seen": 38665695, "step": 1817, "time_per_iteration": 2.6766858100891113 }, { "auxiliary_loss_clip": 0.01148249, "auxiliary_loss_mlp": 0.0077941, "balance_loss_clip": 1.05170548, "balance_loss_mlp": 1.00052643, "epoch": 0.21860157518186737, "flos": 26574650259840.0, "grad_norm": 2.4971646053991674, "language_loss": 0.79506183, "learning_rate": 3.6384778252547747e-06, "loss": 0.81433839, "num_input_tokens_seen": 38681575, "step": 1818, "time_per_iteration": 2.773413896560669 }, { "auxiliary_loss_clip": 0.01153797, "auxiliary_loss_mlp": 0.00776516, "balance_loss_clip": 1.05473137, "balance_loss_mlp": 1.00055957, "epoch": 0.21872181807250646, "flos": 20886292834560.0, "grad_norm": 2.3908443111320046, "language_loss": 0.77912635, "learning_rate": 3.638030997862155e-06, "loss": 0.79842943, "num_input_tokens_seen": 38700510, "step": 1819, "time_per_iteration": 2.715000629425049 }, { "auxiliary_loss_clip": 0.0103729, "auxiliary_loss_mlp": 0.0100252, "balance_loss_clip": 1.0129503, "balance_loss_mlp": 0.99949247, "epoch": 0.21884206096314554, "flos": 61209452897280.0, "grad_norm": 0.7600128632905143, "language_loss": 0.59407729, "learning_rate": 3.6375839219815356e-06, "loss": 0.61447537, "num_input_tokens_seen": 38758310, "step": 1820, "time_per_iteration": 3.329406499862671 }, { "auxiliary_loss_clip": 0.01185197, "auxiliary_loss_mlp": 0.01060959, "balance_loss_clip": 1.0580461, "balance_loss_mlp": 1.0382731, "epoch": 0.21896230385378465, "flos": 23473835850240.0, "grad_norm": 2.3564345131230238, "language_loss": 0.82607627, "learning_rate": 3.6371365976807375e-06, "loss": 0.8485378, "num_input_tokens_seen": 38778705, "step": 1821, "time_per_iteration": 2.789947986602783 }, { "auxiliary_loss_clip": 0.01124317, "auxiliary_loss_mlp": 0.01056993, "balance_loss_clip": 1.05105114, "balance_loss_mlp": 1.03480864, "epoch": 0.21908254674442373, "flos": 25081915829760.0, "grad_norm": 1.7229690674289408, "language_loss": 0.83573079, "learning_rate": 3.6366890250276185e-06, "loss": 0.85754395, "num_input_tokens_seen": 38799660, "step": 1822, "time_per_iteration": 2.9035165309906006 }, { "auxiliary_loss_clip": 0.01181391, "auxiliary_loss_mlp": 0.01059418, "balance_loss_clip": 1.05542946, "balance_loss_mlp": 1.03617167, "epoch": 0.21920278963506282, "flos": 23513768795520.0, "grad_norm": 2.2094800036667444, "language_loss": 0.8983674, "learning_rate": 3.6362412040900764e-06, "loss": 0.92077553, "num_input_tokens_seen": 38819450, "step": 1823, "time_per_iteration": 2.661170721054077 }, { "auxiliary_loss_clip": 0.01171213, "auxiliary_loss_mlp": 0.01061654, "balance_loss_clip": 1.05519998, "balance_loss_mlp": 1.03905153, "epoch": 0.21932303252570193, "flos": 29242238734080.0, "grad_norm": 2.0446120812739768, "language_loss": 0.80915904, "learning_rate": 3.635793134936044e-06, "loss": 0.83148777, "num_input_tokens_seen": 38840460, "step": 1824, "time_per_iteration": 2.7183144092559814 }, { "auxiliary_loss_clip": 0.01168212, "auxiliary_loss_mlp": 0.01055661, "balance_loss_clip": 1.0559988, "balance_loss_mlp": 1.03401303, "epoch": 0.219443275416341, "flos": 20806857907200.0, "grad_norm": 2.013249968012601, "language_loss": 0.73319352, "learning_rate": 3.635344817633494e-06, "loss": 0.75543225, "num_input_tokens_seen": 38859775, "step": 1825, "time_per_iteration": 2.757438898086548 }, { "auxiliary_loss_clip": 0.01162768, "auxiliary_loss_mlp": 0.01054177, "balance_loss_clip": 1.05277181, "balance_loss_mlp": 1.03214753, "epoch": 0.2195635183069801, "flos": 14501555458560.0, "grad_norm": 2.1874638982431422, "language_loss": 0.75459987, "learning_rate": 3.634896252250436e-06, "loss": 0.77676928, "num_input_tokens_seen": 38876540, "step": 1826, "time_per_iteration": 2.6466429233551025 }, { "auxiliary_loss_clip": 0.01185225, "auxiliary_loss_mlp": 0.01060378, "balance_loss_clip": 1.05609453, "balance_loss_mlp": 1.03958786, "epoch": 0.2196837611976192, "flos": 24243473589120.0, "grad_norm": 2.7481255138146836, "language_loss": 0.82414752, "learning_rate": 3.6344474388549157e-06, "loss": 0.84660351, "num_input_tokens_seen": 38896195, "step": 1827, "time_per_iteration": 2.6836729049682617 }, { "auxiliary_loss_clip": 0.01172892, "auxiliary_loss_mlp": 0.0105529, "balance_loss_clip": 1.05626917, "balance_loss_mlp": 1.03277171, "epoch": 0.2198040040882583, "flos": 18074523168000.0, "grad_norm": 2.880871439528963, "language_loss": 0.8034209, "learning_rate": 3.6339983775150183e-06, "loss": 0.82570267, "num_input_tokens_seen": 38912755, "step": 1828, "time_per_iteration": 2.6219305992126465 }, { "auxiliary_loss_clip": 0.01166382, "auxiliary_loss_mlp": 0.01051349, "balance_loss_clip": 1.05310106, "balance_loss_mlp": 1.02955759, "epoch": 0.21992424697889737, "flos": 17784185535360.0, "grad_norm": 3.592550338155343, "language_loss": 0.84360385, "learning_rate": 3.6335490682988664e-06, "loss": 0.86578113, "num_input_tokens_seen": 38928365, "step": 1829, "time_per_iteration": 2.6344237327575684 }, { "auxiliary_loss_clip": 0.01110361, "auxiliary_loss_mlp": 0.01068561, "balance_loss_clip": 1.04679501, "balance_loss_mlp": 1.04484999, "epoch": 0.22004448986953645, "flos": 17638495971840.0, "grad_norm": 2.135863310452264, "language_loss": 0.82744324, "learning_rate": 3.63309951127462e-06, "loss": 0.84923238, "num_input_tokens_seen": 38945275, "step": 1830, "time_per_iteration": 2.775933027267456 }, { "auxiliary_loss_clip": 0.01150392, "auxiliary_loss_mlp": 0.01068628, "balance_loss_clip": 1.05445695, "balance_loss_mlp": 1.04398751, "epoch": 0.22016473276017556, "flos": 22275533203200.0, "grad_norm": 1.7553995049235271, "language_loss": 0.75073433, "learning_rate": 3.6326497065104757e-06, "loss": 0.77292454, "num_input_tokens_seen": 38965740, "step": 1831, "time_per_iteration": 2.776885509490967 }, { "auxiliary_loss_clip": 0.01173947, "auxiliary_loss_mlp": 0.01056373, "balance_loss_clip": 1.05682921, "balance_loss_mlp": 1.03293633, "epoch": 0.22028497565081465, "flos": 25556259859200.0, "grad_norm": 2.2098647769701536, "language_loss": 0.77975821, "learning_rate": 3.6321996540746697e-06, "loss": 0.80206144, "num_input_tokens_seen": 38984815, "step": 1832, "time_per_iteration": 3.7053680419921875 }, { "auxiliary_loss_clip": 0.01143178, "auxiliary_loss_mlp": 0.01059627, "balance_loss_clip": 1.05277336, "balance_loss_mlp": 1.03684568, "epoch": 0.22040521854145373, "flos": 36247332925440.0, "grad_norm": 1.9132151003162006, "language_loss": 0.80491453, "learning_rate": 3.6317493540354733e-06, "loss": 0.82694256, "num_input_tokens_seen": 39008230, "step": 1833, "time_per_iteration": 2.807051420211792 }, { "auxiliary_loss_clip": 0.01163018, "auxiliary_loss_mlp": 0.01054735, "balance_loss_clip": 1.05079484, "balance_loss_mlp": 1.03108418, "epoch": 0.22052546143209284, "flos": 11838420270720.0, "grad_norm": 1.9306603758592402, "language_loss": 0.76720273, "learning_rate": 3.6312988064611976e-06, "loss": 0.78938031, "num_input_tokens_seen": 39026540, "step": 1834, "time_per_iteration": 2.735323905944824 }, { "auxiliary_loss_clip": 0.01142645, "auxiliary_loss_mlp": 0.01056552, "balance_loss_clip": 1.04537129, "balance_loss_mlp": 1.03527296, "epoch": 0.22064570432273192, "flos": 24209250906240.0, "grad_norm": 2.3105956215628645, "language_loss": 0.81772363, "learning_rate": 3.6308480114201896e-06, "loss": 0.8397156, "num_input_tokens_seen": 39048460, "step": 1835, "time_per_iteration": 2.7998697757720947 }, { "auxiliary_loss_clip": 0.01179993, "auxiliary_loss_mlp": 0.0106816, "balance_loss_clip": 1.05568421, "balance_loss_mlp": 1.04593921, "epoch": 0.220765947213371, "flos": 17931347556480.0, "grad_norm": 1.996800068792829, "language_loss": 0.76248121, "learning_rate": 3.630396968980835e-06, "loss": 0.78496265, "num_input_tokens_seen": 39066335, "step": 1836, "time_per_iteration": 3.532201051712036 }, { "auxiliary_loss_clip": 0.01158561, "auxiliary_loss_mlp": 0.01069119, "balance_loss_clip": 1.05488598, "balance_loss_mlp": 1.04491937, "epoch": 0.2208861901040101, "flos": 26757040544640.0, "grad_norm": 2.6099698702432903, "language_loss": 0.8353712, "learning_rate": 3.6299456792115575e-06, "loss": 0.85764802, "num_input_tokens_seen": 39087590, "step": 1837, "time_per_iteration": 2.7608232498168945 }, { "auxiliary_loss_clip": 0.01081529, "auxiliary_loss_mlp": 0.01058244, "balance_loss_clip": 1.04014671, "balance_loss_mlp": 1.03447402, "epoch": 0.2210064329946492, "flos": 17817977255040.0, "grad_norm": 1.9551726858637462, "language_loss": 0.80780089, "learning_rate": 3.629494142180815e-06, "loss": 0.8291986, "num_input_tokens_seen": 39106335, "step": 1838, "time_per_iteration": 3.760425329208374 }, { "auxiliary_loss_clip": 0.01180901, "auxiliary_loss_mlp": 0.01051705, "balance_loss_clip": 1.05594325, "balance_loss_mlp": 1.03109407, "epoch": 0.22112667588528828, "flos": 17967401832960.0, "grad_norm": 2.65883930145656, "language_loss": 0.85433125, "learning_rate": 3.6290423579571075e-06, "loss": 0.87665737, "num_input_tokens_seen": 39122875, "step": 1839, "time_per_iteration": 2.555922746658325 }, { "auxiliary_loss_clip": 0.0116874, "auxiliary_loss_mlp": 0.01064581, "balance_loss_clip": 1.05635023, "balance_loss_mlp": 1.03992867, "epoch": 0.22124691877592736, "flos": 18369206346240.0, "grad_norm": 2.555115242572095, "language_loss": 0.80404514, "learning_rate": 3.6285903266089694e-06, "loss": 0.82637835, "num_input_tokens_seen": 39142150, "step": 1840, "time_per_iteration": 2.6504313945770264 }, { "auxiliary_loss_clip": 0.0115706, "auxiliary_loss_mlp": 0.01052963, "balance_loss_clip": 1.0504334, "balance_loss_mlp": 1.02997947, "epoch": 0.22136716166656648, "flos": 20813286441600.0, "grad_norm": 2.5137566568526832, "language_loss": 0.77693045, "learning_rate": 3.628138048204974e-06, "loss": 0.79903066, "num_input_tokens_seen": 39162835, "step": 1841, "time_per_iteration": 3.568305730819702 }, { "auxiliary_loss_clip": 0.01124239, "auxiliary_loss_mlp": 0.0106582, "balance_loss_clip": 1.04975843, "balance_loss_mlp": 1.04057121, "epoch": 0.22148740455720556, "flos": 17675699483520.0, "grad_norm": 2.205039170701075, "language_loss": 0.76152956, "learning_rate": 3.6276855228137304e-06, "loss": 0.7834301, "num_input_tokens_seen": 39181040, "step": 1842, "time_per_iteration": 2.824549674987793 }, { "auxiliary_loss_clip": 0.01180071, "auxiliary_loss_mlp": 0.00777767, "balance_loss_clip": 1.0547471, "balance_loss_mlp": 1.00045049, "epoch": 0.22160764744784464, "flos": 21726710323200.0, "grad_norm": 2.5770958638434407, "language_loss": 0.81966722, "learning_rate": 3.6272327505038874e-06, "loss": 0.83924562, "num_input_tokens_seen": 39197505, "step": 1843, "time_per_iteration": 2.6101062297821045 }, { "auxiliary_loss_clip": 0.01132636, "auxiliary_loss_mlp": 0.01047843, "balance_loss_clip": 1.04816508, "balance_loss_mlp": 1.02551544, "epoch": 0.22172789033848372, "flos": 23764712186880.0, "grad_norm": 2.0637059024085134, "language_loss": 0.78513265, "learning_rate": 3.626779731344131e-06, "loss": 0.80693746, "num_input_tokens_seen": 39217295, "step": 1844, "time_per_iteration": 2.7788076400756836 }, { "auxiliary_loss_clip": 0.01175884, "auxiliary_loss_mlp": 0.01049296, "balance_loss_clip": 1.05444241, "balance_loss_mlp": 1.02821934, "epoch": 0.22184813322912283, "flos": 16982300361600.0, "grad_norm": 2.1311714638399843, "language_loss": 0.85475266, "learning_rate": 3.6263264654031814e-06, "loss": 0.8770045, "num_input_tokens_seen": 39234195, "step": 1845, "time_per_iteration": 2.605635643005371 }, { "auxiliary_loss_clip": 0.01035744, "auxiliary_loss_mlp": 0.01010971, "balance_loss_clip": 1.01912904, "balance_loss_mlp": 1.00797868, "epoch": 0.22196837611976192, "flos": 61823740314240.0, "grad_norm": 0.7026206206644569, "language_loss": 0.59226793, "learning_rate": 3.6258729527498008e-06, "loss": 0.61273509, "num_input_tokens_seen": 39295040, "step": 1846, "time_per_iteration": 3.2163708209991455 }, { "auxiliary_loss_clip": 0.0115954, "auxiliary_loss_mlp": 0.0104792, "balance_loss_clip": 1.05287743, "balance_loss_mlp": 1.02630711, "epoch": 0.222088619010401, "flos": 25558019625600.0, "grad_norm": 3.4422399109051325, "language_loss": 0.65072596, "learning_rate": 3.6254191934527854e-06, "loss": 0.6728006, "num_input_tokens_seen": 39314395, "step": 1847, "time_per_iteration": 2.782402992248535 }, { "auxiliary_loss_clip": 0.01135817, "auxiliary_loss_mlp": 0.0105655, "balance_loss_clip": 1.05203402, "balance_loss_mlp": 1.03292251, "epoch": 0.2222088619010401, "flos": 19318612677120.0, "grad_norm": 3.2335464591377674, "language_loss": 0.64808011, "learning_rate": 3.6249651875809715e-06, "loss": 0.67000377, "num_input_tokens_seen": 39334275, "step": 1848, "time_per_iteration": 2.7704508304595947 }, { "auxiliary_loss_clip": 0.01149658, "auxiliary_loss_mlp": 0.0106388, "balance_loss_clip": 1.05585945, "balance_loss_mlp": 1.04127812, "epoch": 0.2223291047916792, "flos": 19099342103040.0, "grad_norm": 2.0349981253263567, "language_loss": 0.89053446, "learning_rate": 3.62451093520323e-06, "loss": 0.9126699, "num_input_tokens_seen": 39352180, "step": 1849, "time_per_iteration": 2.7901504039764404 }, { "auxiliary_loss_clip": 0.0111914, "auxiliary_loss_mlp": 0.01044707, "balance_loss_clip": 1.04552114, "balance_loss_mlp": 1.02221191, "epoch": 0.22244934768231828, "flos": 20850418126080.0, "grad_norm": 1.8923860823566514, "language_loss": 0.90701175, "learning_rate": 3.6240564363884714e-06, "loss": 0.9286502, "num_input_tokens_seen": 39372125, "step": 1850, "time_per_iteration": 2.780700922012329 }, { "auxiliary_loss_clip": 0.01169132, "auxiliary_loss_mlp": 0.01059014, "balance_loss_clip": 1.05123639, "balance_loss_mlp": 1.03471947, "epoch": 0.2225695905729574, "flos": 15632921111040.0, "grad_norm": 2.19957193180753, "language_loss": 0.70764947, "learning_rate": 3.623601691205643e-06, "loss": 0.729931, "num_input_tokens_seen": 39391200, "step": 1851, "time_per_iteration": 2.689734935760498 }, { "auxiliary_loss_clip": 0.01164269, "auxiliary_loss_mlp": 0.01057262, "balance_loss_clip": 1.05372787, "balance_loss_mlp": 1.03588796, "epoch": 0.22268983346359647, "flos": 25373582265600.0, "grad_norm": 2.4517982238299236, "language_loss": 0.81722713, "learning_rate": 3.623146699723729e-06, "loss": 0.83944249, "num_input_tokens_seen": 39410660, "step": 1852, "time_per_iteration": 2.6810150146484375 }, { "auxiliary_loss_clip": 0.01156028, "auxiliary_loss_mlp": 0.01052343, "balance_loss_clip": 1.0550971, "balance_loss_mlp": 1.03121972, "epoch": 0.22281007635423555, "flos": 13261452359040.0, "grad_norm": 1.8214690198171581, "language_loss": 0.77589041, "learning_rate": 3.6226914620117507e-06, "loss": 0.79797411, "num_input_tokens_seen": 39429280, "step": 1853, "time_per_iteration": 2.763570785522461 }, { "auxiliary_loss_clip": 0.01134223, "auxiliary_loss_mlp": 0.01050067, "balance_loss_clip": 1.04499793, "balance_loss_mlp": 1.02912199, "epoch": 0.22293031924487464, "flos": 15340536403200.0, "grad_norm": 2.0583811945938555, "language_loss": 0.80906618, "learning_rate": 3.622235978138768e-06, "loss": 0.83090901, "num_input_tokens_seen": 39446905, "step": 1854, "time_per_iteration": 2.7101290225982666 }, { "auxiliary_loss_clip": 0.01164463, "auxiliary_loss_mlp": 0.01042623, "balance_loss_clip": 1.054178, "balance_loss_mlp": 1.02141583, "epoch": 0.22305056213551375, "flos": 22564649773440.0, "grad_norm": 1.8448418024287034, "language_loss": 0.81113428, "learning_rate": 3.621780248173877e-06, "loss": 0.8332051, "num_input_tokens_seen": 39465105, "step": 1855, "time_per_iteration": 2.6975464820861816 }, { "auxiliary_loss_clip": 0.01061678, "auxiliary_loss_mlp": 0.01002287, "balance_loss_clip": 1.02287531, "balance_loss_mlp": 0.99955696, "epoch": 0.22317080502615283, "flos": 64880419887360.0, "grad_norm": 0.8250207143352085, "language_loss": 0.61057729, "learning_rate": 3.6213242721862125e-06, "loss": 0.63121694, "num_input_tokens_seen": 39523560, "step": 1856, "time_per_iteration": 3.2140891551971436 }, { "auxiliary_loss_clip": 0.01144771, "auxiliary_loss_mlp": 0.01050353, "balance_loss_clip": 1.05445743, "balance_loss_mlp": 1.02855015, "epoch": 0.2232910479167919, "flos": 25775997310080.0, "grad_norm": 1.6262698829503914, "language_loss": 0.75313109, "learning_rate": 3.620868050244945e-06, "loss": 0.77508235, "num_input_tokens_seen": 39544040, "step": 1857, "time_per_iteration": 2.7444541454315186 }, { "auxiliary_loss_clip": 0.01149118, "auxiliary_loss_mlp": 0.01054691, "balance_loss_clip": 1.0503726, "balance_loss_mlp": 1.03144503, "epoch": 0.22341129080743102, "flos": 23251799928960.0, "grad_norm": 2.3880040975569443, "language_loss": 0.77761459, "learning_rate": 3.6204115824192817e-06, "loss": 0.79965264, "num_input_tokens_seen": 39561515, "step": 1858, "time_per_iteration": 3.7639384269714355 }, { "auxiliary_loss_clip": 0.01140055, "auxiliary_loss_mlp": 0.01062508, "balance_loss_clip": 1.0448705, "balance_loss_mlp": 1.03789103, "epoch": 0.2235315336980701, "flos": 21214552250880.0, "grad_norm": 3.137768938465784, "language_loss": 0.76826274, "learning_rate": 3.619954868778471e-06, "loss": 0.79028845, "num_input_tokens_seen": 39578210, "step": 1859, "time_per_iteration": 2.731337547302246 }, { "auxiliary_loss_clip": 0.01151173, "auxiliary_loss_mlp": 0.01070004, "balance_loss_clip": 1.05028343, "balance_loss_mlp": 1.04905939, "epoch": 0.2236517765887092, "flos": 19901945548800.0, "grad_norm": 1.8895574565562947, "language_loss": 0.82558119, "learning_rate": 3.6194979093917944e-06, "loss": 0.84779298, "num_input_tokens_seen": 39597625, "step": 1860, "time_per_iteration": 2.7266769409179688 }, { "auxiliary_loss_clip": 0.01145852, "auxiliary_loss_mlp": 0.01061953, "balance_loss_clip": 1.0508728, "balance_loss_mlp": 1.04013813, "epoch": 0.22377201947934827, "flos": 23214847812480.0, "grad_norm": 1.9674141688465456, "language_loss": 0.8686949, "learning_rate": 3.6190407043285724e-06, "loss": 0.89077294, "num_input_tokens_seen": 39615360, "step": 1861, "time_per_iteration": 3.598414897918701 }, { "auxiliary_loss_clip": 0.01187352, "auxiliary_loss_mlp": 0.01057957, "balance_loss_clip": 1.05824733, "balance_loss_mlp": 1.03534329, "epoch": 0.22389226236998738, "flos": 26794244056320.0, "grad_norm": 1.835050294869004, "language_loss": 0.75949407, "learning_rate": 3.618583253658163e-06, "loss": 0.78194714, "num_input_tokens_seen": 39635460, "step": 1862, "time_per_iteration": 2.6864125728607178 }, { "auxiliary_loss_clip": 0.01125165, "auxiliary_loss_mlp": 0.00777072, "balance_loss_clip": 1.05194056, "balance_loss_mlp": 1.00046706, "epoch": 0.22401250526062647, "flos": 24170359455360.0, "grad_norm": 1.876018945131147, "language_loss": 0.86509871, "learning_rate": 3.618125557449961e-06, "loss": 0.88412106, "num_input_tokens_seen": 39653515, "step": 1863, "time_per_iteration": 3.6997289657592773 }, { "auxiliary_loss_clip": 0.01163321, "auxiliary_loss_mlp": 0.01060902, "balance_loss_clip": 1.0557487, "balance_loss_mlp": 1.03877664, "epoch": 0.22413274815126555, "flos": 16759761649920.0, "grad_norm": 2.0212246185505562, "language_loss": 0.8306461, "learning_rate": 3.6176676157733983e-06, "loss": 0.85288829, "num_input_tokens_seen": 39668525, "step": 1864, "time_per_iteration": 2.6887450218200684 }, { "auxiliary_loss_clip": 0.01133475, "auxiliary_loss_mlp": 0.01080772, "balance_loss_clip": 1.04779458, "balance_loss_mlp": 1.05790818, "epoch": 0.22425299104190466, "flos": 21360205900800.0, "grad_norm": 2.3902841824588603, "language_loss": 0.76226664, "learning_rate": 3.6172094286979443e-06, "loss": 0.78440911, "num_input_tokens_seen": 39685895, "step": 1865, "time_per_iteration": 2.7742316722869873 }, { "auxiliary_loss_clip": 0.01145145, "auxiliary_loss_mlp": 0.01050668, "balance_loss_clip": 1.04601979, "balance_loss_mlp": 1.02960336, "epoch": 0.22437323393254374, "flos": 32165547108480.0, "grad_norm": 1.438396914339012, "language_loss": 0.81320012, "learning_rate": 3.6167509962931064e-06, "loss": 0.83515829, "num_input_tokens_seen": 39711595, "step": 1866, "time_per_iteration": 3.8310999870300293 }, { "auxiliary_loss_clip": 0.0113914, "auxiliary_loss_mlp": 0.01055233, "balance_loss_clip": 1.05352104, "balance_loss_mlp": 1.03305984, "epoch": 0.22449347682318282, "flos": 18002809664640.0, "grad_norm": 2.7482729440434666, "language_loss": 0.77226269, "learning_rate": 3.6162923186284276e-06, "loss": 0.7942065, "num_input_tokens_seen": 39727555, "step": 1867, "time_per_iteration": 2.754499912261963 }, { "auxiliary_loss_clip": 0.01148878, "auxiliary_loss_mlp": 0.01063137, "balance_loss_clip": 1.04876804, "balance_loss_mlp": 1.04072571, "epoch": 0.2246137197138219, "flos": 18697286194560.0, "grad_norm": 1.97154751395132, "language_loss": 0.85821664, "learning_rate": 3.6158333957734888e-06, "loss": 0.88033688, "num_input_tokens_seen": 39746145, "step": 1868, "time_per_iteration": 2.6760425567626953 }, { "auxiliary_loss_clip": 0.01141051, "auxiliary_loss_mlp": 0.01073427, "balance_loss_clip": 1.04959822, "balance_loss_mlp": 1.05318522, "epoch": 0.22473396260446102, "flos": 15590653781760.0, "grad_norm": 2.0488873287261056, "language_loss": 0.82720208, "learning_rate": 3.6153742277979088e-06, "loss": 0.84934694, "num_input_tokens_seen": 39763575, "step": 1869, "time_per_iteration": 2.6692943572998047 }, { "auxiliary_loss_clip": 0.01152492, "auxiliary_loss_mlp": 0.01055042, "balance_loss_clip": 1.05076778, "balance_loss_mlp": 1.03614724, "epoch": 0.2248542054951001, "flos": 14465501182080.0, "grad_norm": 2.1753130969398997, "language_loss": 0.78221607, "learning_rate": 3.6149148147713434e-06, "loss": 0.80429143, "num_input_tokens_seen": 39781810, "step": 1870, "time_per_iteration": 2.8114171028137207 }, { "auxiliary_loss_clip": 0.01171377, "auxiliary_loss_mlp": 0.01053977, "balance_loss_clip": 1.05700505, "balance_loss_mlp": 1.03282928, "epoch": 0.22497444838573918, "flos": 19243882431360.0, "grad_norm": 1.9322815932489612, "language_loss": 0.86042041, "learning_rate": 3.614455156763484e-06, "loss": 0.88267398, "num_input_tokens_seen": 39800115, "step": 1871, "time_per_iteration": 2.7310831546783447 }, { "auxiliary_loss_clip": 0.01118661, "auxiliary_loss_mlp": 0.01056994, "balance_loss_clip": 1.04444802, "balance_loss_mlp": 1.03324747, "epoch": 0.2250946912763783, "flos": 16910299549440.0, "grad_norm": 2.1374720456899565, "language_loss": 0.71158999, "learning_rate": 3.613995253844061e-06, "loss": 0.73334652, "num_input_tokens_seen": 39817795, "step": 1872, "time_per_iteration": 2.66080379486084 }, { "auxiliary_loss_clip": 0.01157073, "auxiliary_loss_mlp": 0.01054173, "balance_loss_clip": 1.0515393, "balance_loss_mlp": 1.03231001, "epoch": 0.22521493416701738, "flos": 24681368292480.0, "grad_norm": 1.9756865971932118, "language_loss": 0.80859816, "learning_rate": 3.6135351060828414e-06, "loss": 0.83071065, "num_input_tokens_seen": 39838270, "step": 1873, "time_per_iteration": 2.7427024841308594 }, { "auxiliary_loss_clip": 0.01179795, "auxiliary_loss_mlp": 0.01055164, "balance_loss_clip": 1.05212235, "balance_loss_mlp": 1.03364706, "epoch": 0.22533517705765646, "flos": 17821963664640.0, "grad_norm": 2.882675964342703, "language_loss": 0.69787097, "learning_rate": 3.6130747135496285e-06, "loss": 0.72022057, "num_input_tokens_seen": 39857270, "step": 1874, "time_per_iteration": 2.626936674118042 }, { "auxiliary_loss_clip": 0.01174189, "auxiliary_loss_mlp": 0.01044461, "balance_loss_clip": 1.05251753, "balance_loss_mlp": 1.02284884, "epoch": 0.22545541994829554, "flos": 33691390899840.0, "grad_norm": 2.178002665763151, "language_loss": 0.66099411, "learning_rate": 3.6126140763142646e-06, "loss": 0.68318057, "num_input_tokens_seen": 39882300, "step": 1875, "time_per_iteration": 2.8078067302703857 }, { "auxiliary_loss_clip": 0.01181193, "auxiliary_loss_mlp": 0.01049192, "balance_loss_clip": 1.056795, "balance_loss_mlp": 1.0277462, "epoch": 0.22557566283893465, "flos": 19171594310400.0, "grad_norm": 2.5928563284632733, "language_loss": 0.86556453, "learning_rate": 3.6121531944466275e-06, "loss": 0.88786834, "num_input_tokens_seen": 39899625, "step": 1876, "time_per_iteration": 2.63960599899292 }, { "auxiliary_loss_clip": 0.01161659, "auxiliary_loss_mlp": 0.0104644, "balance_loss_clip": 1.0534308, "balance_loss_mlp": 1.02547097, "epoch": 0.22569590572957374, "flos": 20773281669120.0, "grad_norm": 2.3477968924528065, "language_loss": 0.78281349, "learning_rate": 3.611692068016633e-06, "loss": 0.80489445, "num_input_tokens_seen": 39915955, "step": 1877, "time_per_iteration": 2.6914901733398438 }, { "auxiliary_loss_clip": 0.01130072, "auxiliary_loss_mlp": 0.01066688, "balance_loss_clip": 1.04444647, "balance_loss_mlp": 1.04306066, "epoch": 0.22581614862021282, "flos": 18442715529600.0, "grad_norm": 3.2481337691051007, "language_loss": 0.74970388, "learning_rate": 3.611230697094233e-06, "loss": 0.77167153, "num_input_tokens_seen": 39932655, "step": 1878, "time_per_iteration": 2.7313458919525146 }, { "auxiliary_loss_clip": 0.01150246, "auxiliary_loss_mlp": 0.01049992, "balance_loss_clip": 1.05018806, "balance_loss_mlp": 1.02886796, "epoch": 0.22593639151085193, "flos": 20048389297920.0, "grad_norm": 2.9139282141306193, "language_loss": 0.87339985, "learning_rate": 3.6107690817494173e-06, "loss": 0.89540219, "num_input_tokens_seen": 39952875, "step": 1879, "time_per_iteration": 2.683846950531006 }, { "auxiliary_loss_clip": 0.01117445, "auxiliary_loss_mlp": 0.01055378, "balance_loss_clip": 1.04465842, "balance_loss_mlp": 1.03395605, "epoch": 0.226056634401491, "flos": 13115116350720.0, "grad_norm": 3.3132766460299163, "language_loss": 0.70722026, "learning_rate": 3.6103072220522117e-06, "loss": 0.72894847, "num_input_tokens_seen": 39968405, "step": 1880, "time_per_iteration": 2.7539069652557373 }, { "auxiliary_loss_clip": 0.01139786, "auxiliary_loss_mlp": 0.01055623, "balance_loss_clip": 1.04964221, "balance_loss_mlp": 1.03477359, "epoch": 0.2261768772921301, "flos": 18988378012800.0, "grad_norm": 1.771193686862478, "language_loss": 0.9187932, "learning_rate": 3.609845118072682e-06, "loss": 0.94074726, "num_input_tokens_seen": 39987075, "step": 1881, "time_per_iteration": 2.730936288833618 }, { "auxiliary_loss_clip": 0.01166352, "auxiliary_loss_mlp": 0.00776991, "balance_loss_clip": 1.05166435, "balance_loss_mlp": 1.00031066, "epoch": 0.2262971201827692, "flos": 19974054101760.0, "grad_norm": 1.8964901767034616, "language_loss": 0.80332136, "learning_rate": 3.6093827698809276e-06, "loss": 0.82275474, "num_input_tokens_seen": 40006175, "step": 1882, "time_per_iteration": 2.7790415287017822 }, { "auxiliary_loss_clip": 0.01157419, "auxiliary_loss_mlp": 0.01044047, "balance_loss_clip": 1.04882443, "balance_loss_mlp": 1.02461576, "epoch": 0.2264173630734083, "flos": 16654543735680.0, "grad_norm": 2.2677583042388965, "language_loss": 0.84838271, "learning_rate": 3.6089201775470864e-06, "loss": 0.87039739, "num_input_tokens_seen": 40021630, "step": 1883, "time_per_iteration": 2.8273191452026367 }, { "auxiliary_loss_clip": 0.01124614, "auxiliary_loss_mlp": 0.01052239, "balance_loss_clip": 1.04550505, "balance_loss_mlp": 1.03019691, "epoch": 0.22653760596404737, "flos": 24389809597440.0, "grad_norm": 1.484379427864254, "language_loss": 0.77764165, "learning_rate": 3.6084573411413334e-06, "loss": 0.79941022, "num_input_tokens_seen": 40041025, "step": 1884, "time_per_iteration": 2.7643773555755615 }, { "auxiliary_loss_clip": 0.01136997, "auxiliary_loss_mlp": 0.01051781, "balance_loss_clip": 1.04928231, "balance_loss_mlp": 1.02898812, "epoch": 0.22665784885468646, "flos": 18332541538560.0, "grad_norm": 9.578532068468718, "language_loss": 0.81241238, "learning_rate": 3.607994260733881e-06, "loss": 0.83430016, "num_input_tokens_seen": 40060265, "step": 1885, "time_per_iteration": 3.7284834384918213 }, { "auxiliary_loss_clip": 0.01149114, "auxiliary_loss_mlp": 0.01044478, "balance_loss_clip": 1.04780626, "balance_loss_mlp": 1.02515399, "epoch": 0.22677809174532557, "flos": 24058102475520.0, "grad_norm": 1.714260945934096, "language_loss": 0.74589211, "learning_rate": 3.6075309363949776e-06, "loss": 0.76782799, "num_input_tokens_seen": 40079435, "step": 1886, "time_per_iteration": 2.6956136226654053 }, { "auxiliary_loss_clip": 0.0117505, "auxiliary_loss_mlp": 0.01055277, "balance_loss_clip": 1.05269933, "balance_loss_mlp": 1.03440297, "epoch": 0.22689833463596465, "flos": 20374242503040.0, "grad_norm": 2.5639535545198244, "language_loss": 0.81285864, "learning_rate": 3.6070673681949094e-06, "loss": 0.83516192, "num_input_tokens_seen": 40097800, "step": 1887, "time_per_iteration": 3.739868640899658 }, { "auxiliary_loss_clip": 0.0115281, "auxiliary_loss_mlp": 0.00775525, "balance_loss_clip": 1.05267859, "balance_loss_mlp": 1.00021124, "epoch": 0.22701857752660373, "flos": 30120398438400.0, "grad_norm": 1.9055108644055325, "language_loss": 0.80871129, "learning_rate": 3.606603556203999e-06, "loss": 0.8279947, "num_input_tokens_seen": 40122745, "step": 1888, "time_per_iteration": 2.7765889167785645 }, { "auxiliary_loss_clip": 0.01159714, "auxiliary_loss_mlp": 0.0105262, "balance_loss_clip": 1.048769, "balance_loss_mlp": 1.03259254, "epoch": 0.22713882041724284, "flos": 22492182084480.0, "grad_norm": 1.8025153475032882, "language_loss": 0.83162415, "learning_rate": 3.6061395004926066e-06, "loss": 0.85374755, "num_input_tokens_seen": 40141680, "step": 1889, "time_per_iteration": 2.744050979614258 }, { "auxiliary_loss_clip": 0.0117565, "auxiliary_loss_mlp": 0.01042792, "balance_loss_clip": 1.05128932, "balance_loss_mlp": 1.0216918, "epoch": 0.22725906330788193, "flos": 20521548178560.0, "grad_norm": 2.1332219017356477, "language_loss": 0.84512228, "learning_rate": 3.605675201131129e-06, "loss": 0.86730671, "num_input_tokens_seen": 40160140, "step": 1890, "time_per_iteration": 3.627912998199463 }, { "auxiliary_loss_clip": 0.01163324, "auxiliary_loss_mlp": 0.01056438, "balance_loss_clip": 1.05195546, "balance_loss_mlp": 1.03653002, "epoch": 0.227379306198521, "flos": 18989922297600.0, "grad_norm": 2.2210726080680416, "language_loss": 0.79819089, "learning_rate": 3.60521065819e-06, "loss": 0.8203885, "num_input_tokens_seen": 40177450, "step": 1891, "time_per_iteration": 2.6461496353149414 }, { "auxiliary_loss_clip": 0.01155261, "auxiliary_loss_mlp": 0.01058421, "balance_loss_clip": 1.05137885, "balance_loss_mlp": 1.03687978, "epoch": 0.2274995490891601, "flos": 21798351999360.0, "grad_norm": 2.5156932908916287, "language_loss": 0.87724644, "learning_rate": 3.60474587173969e-06, "loss": 0.89938331, "num_input_tokens_seen": 40195935, "step": 1892, "time_per_iteration": 2.75927996635437 }, { "auxiliary_loss_clip": 0.01160417, "auxiliary_loss_mlp": 0.01048803, "balance_loss_clip": 1.05257928, "balance_loss_mlp": 1.0280602, "epoch": 0.2276197919797992, "flos": 19058654972160.0, "grad_norm": 2.1354061805184656, "language_loss": 0.84043086, "learning_rate": 3.6042808418507084e-06, "loss": 0.86252308, "num_input_tokens_seen": 40213620, "step": 1893, "time_per_iteration": 3.5795745849609375 }, { "auxiliary_loss_clip": 0.01164566, "auxiliary_loss_mlp": 0.01049533, "balance_loss_clip": 1.05294204, "balance_loss_mlp": 1.02796817, "epoch": 0.22774003487043828, "flos": 18806777827200.0, "grad_norm": 5.644814114047152, "language_loss": 0.77513969, "learning_rate": 3.6038155685935976e-06, "loss": 0.79728067, "num_input_tokens_seen": 40230190, "step": 1894, "time_per_iteration": 2.6380717754364014 }, { "auxiliary_loss_clip": 0.01159833, "auxiliary_loss_mlp": 0.01054536, "balance_loss_clip": 1.05026913, "balance_loss_mlp": 1.032184, "epoch": 0.22786027776107737, "flos": 23002544476800.0, "grad_norm": 2.4465416205980226, "language_loss": 0.71048808, "learning_rate": 3.6033500520389404e-06, "loss": 0.7326318, "num_input_tokens_seen": 40246860, "step": 1895, "time_per_iteration": 2.76361346244812 }, { "auxiliary_loss_clip": 0.01029579, "auxiliary_loss_mlp": 0.01055804, "balance_loss_clip": 1.01999974, "balance_loss_mlp": 1.05265713, "epoch": 0.22798052065171648, "flos": 66706872600960.0, "grad_norm": 0.8091493149037018, "language_loss": 0.64753854, "learning_rate": 3.6028842922573553e-06, "loss": 0.66839242, "num_input_tokens_seen": 40311005, "step": 1896, "time_per_iteration": 3.3808093070983887 }, { "auxiliary_loss_clip": 0.01035159, "auxiliary_loss_mlp": 0.00757561, "balance_loss_clip": 1.01702583, "balance_loss_mlp": 1.00000894, "epoch": 0.22810076354235556, "flos": 62080896758400.0, "grad_norm": 0.8487597839944422, "language_loss": 0.628268, "learning_rate": 3.602418289319497e-06, "loss": 0.64619523, "num_input_tokens_seen": 40369560, "step": 1897, "time_per_iteration": 3.253931999206543 }, { "auxiliary_loss_clip": 0.01123612, "auxiliary_loss_mlp": 0.01055106, "balance_loss_clip": 1.0481708, "balance_loss_mlp": 1.03301668, "epoch": 0.22822100643299464, "flos": 23876358635520.0, "grad_norm": 1.770013574593687, "language_loss": 0.73162282, "learning_rate": 3.601952043296059e-06, "loss": 0.75340998, "num_input_tokens_seen": 40389555, "step": 1898, "time_per_iteration": 2.7416341304779053 }, { "auxiliary_loss_clip": 0.01150954, "auxiliary_loss_mlp": 0.01048593, "balance_loss_clip": 1.04813504, "balance_loss_mlp": 1.02743316, "epoch": 0.22834124932363373, "flos": 20991331180800.0, "grad_norm": 2.7468315671303984, "language_loss": 0.80723649, "learning_rate": 3.6014855542577696e-06, "loss": 0.82923198, "num_input_tokens_seen": 40406765, "step": 1899, "time_per_iteration": 2.7482151985168457 }, { "auxiliary_loss_clip": 0.01148856, "auxiliary_loss_mlp": 0.01051558, "balance_loss_clip": 1.05131435, "balance_loss_mlp": 1.03052938, "epoch": 0.22846149221427284, "flos": 24901572620160.0, "grad_norm": 1.698032823783943, "language_loss": 0.8435384, "learning_rate": 3.6010188222753943e-06, "loss": 0.86554253, "num_input_tokens_seen": 40427535, "step": 1900, "time_per_iteration": 2.7316718101501465 }, { "auxiliary_loss_clip": 0.01042243, "auxiliary_loss_mlp": 0.01002846, "balance_loss_clip": 1.01641858, "balance_loss_mlp": 1.00002027, "epoch": 0.22858173510491192, "flos": 56132294319360.0, "grad_norm": 0.9040647688029072, "language_loss": 0.64098948, "learning_rate": 3.6005518474197372e-06, "loss": 0.66144037, "num_input_tokens_seen": 40479580, "step": 1901, "time_per_iteration": 3.1399574279785156 }, { "auxiliary_loss_clip": 0.01163731, "auxiliary_loss_mlp": 0.0105799, "balance_loss_clip": 1.05357563, "balance_loss_mlp": 1.03579271, "epoch": 0.228701977995551, "flos": 24170826332160.0, "grad_norm": 1.9164633589676663, "language_loss": 0.78221023, "learning_rate": 3.6000846297616373e-06, "loss": 0.80442739, "num_input_tokens_seen": 40497880, "step": 1902, "time_per_iteration": 2.683562755584717 }, { "auxiliary_loss_clip": 0.01179511, "auxiliary_loss_mlp": 0.01067626, "balance_loss_clip": 1.05244517, "balance_loss_mlp": 1.04650199, "epoch": 0.22882222088619011, "flos": 21387892308480.0, "grad_norm": 2.577973025271209, "language_loss": 0.72512364, "learning_rate": 3.5996171693719717e-06, "loss": 0.74759501, "num_input_tokens_seen": 40513975, "step": 1903, "time_per_iteration": 2.6212289333343506 }, { "auxiliary_loss_clip": 0.01053577, "auxiliary_loss_mlp": 0.01003946, "balance_loss_clip": 1.01597667, "balance_loss_mlp": 1.0012995, "epoch": 0.2289424637768292, "flos": 64589615377920.0, "grad_norm": 0.8360780533813227, "language_loss": 0.64797777, "learning_rate": 3.5991494663216528e-06, "loss": 0.66855305, "num_input_tokens_seen": 40576960, "step": 1904, "time_per_iteration": 3.276611566543579 }, { "auxiliary_loss_clip": 0.01181698, "auxiliary_loss_mlp": 0.01059412, "balance_loss_clip": 1.05716002, "balance_loss_mlp": 1.03697705, "epoch": 0.22906270666746828, "flos": 22163419877760.0, "grad_norm": 1.8559696216666288, "language_loss": 0.87706411, "learning_rate": 3.5986815206816314e-06, "loss": 0.89947528, "num_input_tokens_seen": 40595780, "step": 1905, "time_per_iteration": 2.7054295539855957 }, { "auxiliary_loss_clip": 0.01178091, "auxiliary_loss_mlp": 0.01051685, "balance_loss_clip": 1.05224001, "balance_loss_mlp": 1.03009605, "epoch": 0.2291829495581074, "flos": 25772334122880.0, "grad_norm": 2.007726977198954, "language_loss": 0.74718821, "learning_rate": 3.598213332522895e-06, "loss": 0.76948601, "num_input_tokens_seen": 40615810, "step": 1906, "time_per_iteration": 2.6670448780059814 }, { "auxiliary_loss_clip": 0.01158457, "auxiliary_loss_mlp": 0.01056695, "balance_loss_clip": 1.04955089, "balance_loss_mlp": 1.03510618, "epoch": 0.22930319244874647, "flos": 31172760126720.0, "grad_norm": 1.950526711698663, "language_loss": 0.77707827, "learning_rate": 3.597744901916466e-06, "loss": 0.79922986, "num_input_tokens_seen": 40637095, "step": 1907, "time_per_iteration": 2.7488908767700195 }, { "auxiliary_loss_clip": 0.01178627, "auxiliary_loss_mlp": 0.01053895, "balance_loss_clip": 1.05060315, "balance_loss_mlp": 1.03070855, "epoch": 0.22942343533938556, "flos": 23254098399360.0, "grad_norm": 1.9949035414598453, "language_loss": 0.77341354, "learning_rate": 3.5972762289334058e-06, "loss": 0.7957387, "num_input_tokens_seen": 40656725, "step": 1908, "time_per_iteration": 2.6812357902526855 }, { "auxiliary_loss_clip": 0.01106923, "auxiliary_loss_mlp": 0.0105122, "balance_loss_clip": 1.04424155, "balance_loss_mlp": 1.02791452, "epoch": 0.22954367823002464, "flos": 14610903436800.0, "grad_norm": 1.9314844777628117, "language_loss": 0.8513087, "learning_rate": 3.5968073136448116e-06, "loss": 0.87289011, "num_input_tokens_seen": 40674745, "step": 1909, "time_per_iteration": 2.728581666946411 }, { "auxiliary_loss_clip": 0.01167327, "auxiliary_loss_mlp": 0.01054714, "balance_loss_clip": 1.05105984, "balance_loss_mlp": 1.03264797, "epoch": 0.22966392112066375, "flos": 16763604405120.0, "grad_norm": 1.7413670082477315, "language_loss": 0.91121191, "learning_rate": 3.596338156121818e-06, "loss": 0.93343228, "num_input_tokens_seen": 40693630, "step": 1910, "time_per_iteration": 2.6225221157073975 }, { "auxiliary_loss_clip": 0.01048285, "auxiliary_loss_mlp": 0.01006821, "balance_loss_clip": 1.02117097, "balance_loss_mlp": 1.00411534, "epoch": 0.22978416401130283, "flos": 67474247783040.0, "grad_norm": 0.7405315455408643, "language_loss": 0.59324145, "learning_rate": 3.595868756435595e-06, "loss": 0.61379254, "num_input_tokens_seen": 40761310, "step": 1911, "time_per_iteration": 4.369907855987549 }, { "auxiliary_loss_clip": 0.01145686, "auxiliary_loss_mlp": 0.01060762, "balance_loss_clip": 1.05511701, "balance_loss_mlp": 1.03869653, "epoch": 0.22990440690194192, "flos": 19865137086720.0, "grad_norm": 2.843750261136824, "language_loss": 0.80324411, "learning_rate": 3.5953991146573504e-06, "loss": 0.82530862, "num_input_tokens_seen": 40779955, "step": 1912, "time_per_iteration": 3.6997697353363037 }, { "auxiliary_loss_clip": 0.01163054, "auxiliary_loss_mlp": 0.01053741, "balance_loss_clip": 1.04738379, "balance_loss_mlp": 1.02964842, "epoch": 0.23002464979258103, "flos": 13289246507520.0, "grad_norm": 2.5678601283924514, "language_loss": 0.83492422, "learning_rate": 3.5949292308583294e-06, "loss": 0.85709214, "num_input_tokens_seen": 40793200, "step": 1913, "time_per_iteration": 2.601879835128784 }, { "auxiliary_loss_clip": 0.0117513, "auxiliary_loss_mlp": 0.01059147, "balance_loss_clip": 1.05236089, "balance_loss_mlp": 1.03511405, "epoch": 0.2301448926832201, "flos": 22163779013760.0, "grad_norm": 2.681820096602953, "language_loss": 0.81052774, "learning_rate": 3.594459105109811e-06, "loss": 0.83287048, "num_input_tokens_seen": 40812380, "step": 1914, "time_per_iteration": 2.625591516494751 }, { "auxiliary_loss_clip": 0.01167212, "auxiliary_loss_mlp": 0.01056297, "balance_loss_clip": 1.054057, "balance_loss_mlp": 1.03506613, "epoch": 0.2302651355738592, "flos": 20704477167360.0, "grad_norm": 1.8400630319699118, "language_loss": 0.81578922, "learning_rate": 3.593988737483115e-06, "loss": 0.83802426, "num_input_tokens_seen": 40832320, "step": 1915, "time_per_iteration": 3.5502021312713623 }, { "auxiliary_loss_clip": 0.01153514, "auxiliary_loss_mlp": 0.01054008, "balance_loss_clip": 1.05379105, "balance_loss_mlp": 1.03116798, "epoch": 0.23038537846449827, "flos": 18588943797120.0, "grad_norm": 2.0768040367473746, "language_loss": 0.78128946, "learning_rate": 3.5935181280495947e-06, "loss": 0.80336469, "num_input_tokens_seen": 40850900, "step": 1916, "time_per_iteration": 2.732121229171753 }, { "auxiliary_loss_clip": 0.01039038, "auxiliary_loss_mlp": 0.0100214, "balance_loss_clip": 1.01719427, "balance_loss_mlp": 0.99940979, "epoch": 0.23050562135513739, "flos": 64224260190720.0, "grad_norm": 0.8076295084253783, "language_loss": 0.54311579, "learning_rate": 3.5930472768806412e-06, "loss": 0.56352758, "num_input_tokens_seen": 40909570, "step": 1917, "time_per_iteration": 3.233189821243286 }, { "auxiliary_loss_clip": 0.01177427, "auxiliary_loss_mlp": 0.01063009, "balance_loss_clip": 1.05497384, "balance_loss_mlp": 1.04080009, "epoch": 0.23062586424577647, "flos": 17313396952320.0, "grad_norm": 1.7825087187935797, "language_loss": 0.76898205, "learning_rate": 3.5925761840476826e-06, "loss": 0.79138637, "num_input_tokens_seen": 40928180, "step": 1918, "time_per_iteration": 2.6181399822235107 }, { "auxiliary_loss_clip": 0.01146471, "auxiliary_loss_mlp": 0.01054681, "balance_loss_clip": 1.05228567, "balance_loss_mlp": 1.03361619, "epoch": 0.23074610713641555, "flos": 27855979194240.0, "grad_norm": 1.8107384531157642, "language_loss": 0.81325758, "learning_rate": 3.592104849622183e-06, "loss": 0.83526915, "num_input_tokens_seen": 40950435, "step": 1919, "time_per_iteration": 3.663696527481079 }, { "auxiliary_loss_clip": 0.01118654, "auxiliary_loss_mlp": 0.01055997, "balance_loss_clip": 1.04645681, "balance_loss_mlp": 1.03332376, "epoch": 0.23086635002705466, "flos": 28841798937600.0, "grad_norm": 1.6014217555890295, "language_loss": 0.73229575, "learning_rate": 3.591633273675644e-06, "loss": 0.75404227, "num_input_tokens_seen": 40972670, "step": 1920, "time_per_iteration": 2.7341561317443848 }, { "auxiliary_loss_clip": 0.0102969, "auxiliary_loss_mlp": 0.01016206, "balance_loss_clip": 1.02665901, "balance_loss_mlp": 1.01336884, "epoch": 0.23098659291769374, "flos": 62923681566720.0, "grad_norm": 0.915093702362415, "language_loss": 0.58207595, "learning_rate": 3.591161456279602e-06, "loss": 0.60253495, "num_input_tokens_seen": 41018215, "step": 1921, "time_per_iteration": 3.135228395462036 }, { "auxiliary_loss_clip": 0.01156036, "auxiliary_loss_mlp": 0.01051165, "balance_loss_clip": 1.050524, "balance_loss_mlp": 1.028193, "epoch": 0.23110683580833283, "flos": 23476816679040.0, "grad_norm": 6.222605475301165, "language_loss": 0.8032347, "learning_rate": 3.590689397505633e-06, "loss": 0.82530665, "num_input_tokens_seen": 41039125, "step": 1922, "time_per_iteration": 2.7015011310577393 }, { "auxiliary_loss_clip": 0.01173959, "auxiliary_loss_mlp": 0.01052384, "balance_loss_clip": 1.05324268, "balance_loss_mlp": 1.031654, "epoch": 0.2312270786989719, "flos": 27271066124160.0, "grad_norm": 2.5412780434125195, "language_loss": 0.86772621, "learning_rate": 3.590217097425347e-06, "loss": 0.88998961, "num_input_tokens_seen": 41059025, "step": 1923, "time_per_iteration": 2.668872833251953 }, { "auxiliary_loss_clip": 0.01175455, "auxiliary_loss_mlp": 0.01053797, "balance_loss_clip": 1.05247188, "balance_loss_mlp": 1.03140998, "epoch": 0.23134732158961102, "flos": 13261344618240.0, "grad_norm": 2.1813406654481975, "language_loss": 0.71392775, "learning_rate": 3.589744556110391e-06, "loss": 0.73622024, "num_input_tokens_seen": 41077015, "step": 1924, "time_per_iteration": 2.62738299369812 }, { "auxiliary_loss_clip": 0.0114299, "auxiliary_loss_mlp": 0.01051607, "balance_loss_clip": 1.04522347, "balance_loss_mlp": 1.02940965, "epoch": 0.2314675644802501, "flos": 36977648250240.0, "grad_norm": 1.5510409145246757, "language_loss": 0.84432101, "learning_rate": 3.58927177363245e-06, "loss": 0.86626697, "num_input_tokens_seen": 41099840, "step": 1925, "time_per_iteration": 2.7775425910949707 }, { "auxiliary_loss_clip": 0.01130467, "auxiliary_loss_mlp": 0.01054518, "balance_loss_clip": 1.04618132, "balance_loss_mlp": 1.0318923, "epoch": 0.2315878073708892, "flos": 23842207779840.0, "grad_norm": 2.895895902976756, "language_loss": 0.72610158, "learning_rate": 3.5887987500632447e-06, "loss": 0.74795145, "num_input_tokens_seen": 41117845, "step": 1926, "time_per_iteration": 2.7536489963531494 }, { "auxiliary_loss_clip": 0.01143296, "auxiliary_loss_mlp": 0.01050248, "balance_loss_clip": 1.05218148, "balance_loss_mlp": 1.02850437, "epoch": 0.2317080502615283, "flos": 23039424766080.0, "grad_norm": 2.485126285061805, "language_loss": 0.84053707, "learning_rate": 3.5883254854745325e-06, "loss": 0.86247253, "num_input_tokens_seen": 41136235, "step": 1927, "time_per_iteration": 2.6948812007904053 }, { "auxiliary_loss_clip": 0.01165491, "auxiliary_loss_mlp": 0.01057517, "balance_loss_clip": 1.04813874, "balance_loss_mlp": 1.03493881, "epoch": 0.23182829315216738, "flos": 11254656435840.0, "grad_norm": 2.1145154511559423, "language_loss": 0.74952173, "learning_rate": 3.587851979938107e-06, "loss": 0.77175188, "num_input_tokens_seen": 41153125, "step": 1928, "time_per_iteration": 2.641390085220337 }, { "auxiliary_loss_clip": 0.01161855, "auxiliary_loss_mlp": 0.0105978, "balance_loss_clip": 1.05009162, "balance_loss_mlp": 1.03649807, "epoch": 0.23194853604280646, "flos": 19828939155840.0, "grad_norm": 1.9334753459034646, "language_loss": 0.77802801, "learning_rate": 3.5873782335257985e-06, "loss": 0.80024445, "num_input_tokens_seen": 41171290, "step": 1929, "time_per_iteration": 2.7181646823883057 }, { "auxiliary_loss_clip": 0.01133363, "auxiliary_loss_mlp": 0.01057483, "balance_loss_clip": 1.04937744, "balance_loss_mlp": 1.03387976, "epoch": 0.23206877893344555, "flos": 15305020830720.0, "grad_norm": 3.0172017103777455, "language_loss": 0.78274965, "learning_rate": 3.5869042463094744e-06, "loss": 0.80465811, "num_input_tokens_seen": 41189005, "step": 1930, "time_per_iteration": 2.7031404972076416 }, { "auxiliary_loss_clip": 0.01104072, "auxiliary_loss_mlp": 0.01060969, "balance_loss_clip": 1.0422436, "balance_loss_mlp": 1.03593469, "epoch": 0.23218902182408466, "flos": 22711488572160.0, "grad_norm": 2.037152675464311, "language_loss": 0.76695633, "learning_rate": 3.586430018361038e-06, "loss": 0.78860676, "num_input_tokens_seen": 41208775, "step": 1931, "time_per_iteration": 2.7200329303741455 }, { "auxiliary_loss_clip": 0.01135606, "auxiliary_loss_mlp": 0.01059701, "balance_loss_clip": 1.04570055, "balance_loss_mlp": 1.03603768, "epoch": 0.23230926471472374, "flos": 22710734386560.0, "grad_norm": 2.9159781346758185, "language_loss": 0.76313877, "learning_rate": 3.5859555497524283e-06, "loss": 0.78509188, "num_input_tokens_seen": 41226010, "step": 1932, "time_per_iteration": 2.7097456455230713 }, { "auxiliary_loss_clip": 0.01161054, "auxiliary_loss_mlp": 0.01055083, "balance_loss_clip": 1.05336499, "balance_loss_mlp": 1.03473401, "epoch": 0.23242950760536282, "flos": 20375499479040.0, "grad_norm": 2.15359133906217, "language_loss": 0.92175859, "learning_rate": 3.5854808405556237e-06, "loss": 0.9439199, "num_input_tokens_seen": 41245245, "step": 1933, "time_per_iteration": 2.6993401050567627 }, { "auxiliary_loss_clip": 0.011354, "auxiliary_loss_mlp": 0.01045535, "balance_loss_clip": 1.04724336, "balance_loss_mlp": 1.02526939, "epoch": 0.23254975049600193, "flos": 16908324301440.0, "grad_norm": 2.523745294754789, "language_loss": 0.75210804, "learning_rate": 3.5850058908426355e-06, "loss": 0.77391732, "num_input_tokens_seen": 41263795, "step": 1934, "time_per_iteration": 2.6964879035949707 }, { "auxiliary_loss_clip": 0.01148248, "auxiliary_loss_mlp": 0.01049548, "balance_loss_clip": 1.04546738, "balance_loss_mlp": 1.02870989, "epoch": 0.23266999338664102, "flos": 23294821443840.0, "grad_norm": 1.8237361740803522, "language_loss": 0.85796821, "learning_rate": 3.584530700685514e-06, "loss": 0.87994617, "num_input_tokens_seen": 41284055, "step": 1935, "time_per_iteration": 2.789303779602051 }, { "auxiliary_loss_clip": 0.01143805, "auxiliary_loss_mlp": 0.01047329, "balance_loss_clip": 1.04955971, "balance_loss_mlp": 1.02514458, "epoch": 0.2327902362772801, "flos": 19569987031680.0, "grad_norm": 2.146442606133093, "language_loss": 0.8903302, "learning_rate": 3.5840552701563448e-06, "loss": 0.91224158, "num_input_tokens_seen": 41300255, "step": 1936, "time_per_iteration": 3.6581857204437256 }, { "auxiliary_loss_clip": 0.01169944, "auxiliary_loss_mlp": 0.0106103, "balance_loss_clip": 1.04985023, "balance_loss_mlp": 1.03913081, "epoch": 0.2329104791679192, "flos": 16727514215040.0, "grad_norm": 2.1626905604698834, "language_loss": 0.81907964, "learning_rate": 3.5835795993272513e-06, "loss": 0.84138936, "num_input_tokens_seen": 41318540, "step": 1937, "time_per_iteration": 2.6788320541381836 }, { "auxiliary_loss_clip": 0.0108451, "auxiliary_loss_mlp": 0.01069866, "balance_loss_clip": 1.04397929, "balance_loss_mlp": 1.04538035, "epoch": 0.2330307220585583, "flos": 22163743100160.0, "grad_norm": 1.9832421899956771, "language_loss": 0.71169233, "learning_rate": 3.583103688270391e-06, "loss": 0.73323607, "num_input_tokens_seen": 41338320, "step": 1938, "time_per_iteration": 2.9971187114715576 }, { "auxiliary_loss_clip": 0.01133588, "auxiliary_loss_mlp": 0.01065597, "balance_loss_clip": 1.04621542, "balance_loss_mlp": 1.04254234, "epoch": 0.23315096494919738, "flos": 19317319787520.0, "grad_norm": 2.2205072506102037, "language_loss": 0.89183462, "learning_rate": 3.58262753705796e-06, "loss": 0.91382647, "num_input_tokens_seen": 41353210, "step": 1939, "time_per_iteration": 3.80549693107605 }, { "auxiliary_loss_clip": 0.01047869, "auxiliary_loss_mlp": 0.01012595, "balance_loss_clip": 1.02220523, "balance_loss_mlp": 1.01002026, "epoch": 0.23327120783983646, "flos": 53031048946560.0, "grad_norm": 0.7610323854760187, "language_loss": 0.55504519, "learning_rate": 3.5821511457621902e-06, "loss": 0.5756498, "num_input_tokens_seen": 41410510, "step": 1940, "time_per_iteration": 3.2591426372528076 }, { "auxiliary_loss_clip": 0.01145321, "auxiliary_loss_mlp": 0.01065694, "balance_loss_clip": 1.04863882, "balance_loss_mlp": 1.03998065, "epoch": 0.23339145073047557, "flos": 17126984344320.0, "grad_norm": 4.142661114050779, "language_loss": 0.81312072, "learning_rate": 3.5816745144553497e-06, "loss": 0.83523083, "num_input_tokens_seen": 41425830, "step": 1941, "time_per_iteration": 3.5890634059906006 }, { "auxiliary_loss_clip": 0.01122758, "auxiliary_loss_mlp": 0.01045365, "balance_loss_clip": 1.0490489, "balance_loss_mlp": 1.02337074, "epoch": 0.23351169362111465, "flos": 13078918419840.0, "grad_norm": 2.4925649572944573, "language_loss": 0.75721431, "learning_rate": 3.5811976432097424e-06, "loss": 0.77889562, "num_input_tokens_seen": 41443500, "step": 1942, "time_per_iteration": 2.74035382270813 }, { "auxiliary_loss_clip": 0.0116734, "auxiliary_loss_mlp": 0.00777264, "balance_loss_clip": 1.05567455, "balance_loss_mlp": 1.00021195, "epoch": 0.23363193651175373, "flos": 15851257931520.0, "grad_norm": 2.690870493925202, "language_loss": 0.85088098, "learning_rate": 3.58072053209771e-06, "loss": 0.870327, "num_input_tokens_seen": 41460055, "step": 1943, "time_per_iteration": 2.6555328369140625 }, { "auxiliary_loss_clip": 0.01143351, "auxiliary_loss_mlp": 0.01066858, "balance_loss_clip": 1.04778576, "balance_loss_mlp": 1.04324222, "epoch": 0.23375217940239285, "flos": 21025769345280.0, "grad_norm": 2.239515532314005, "language_loss": 0.7924304, "learning_rate": 3.5802431811916296e-06, "loss": 0.81453252, "num_input_tokens_seen": 41476665, "step": 1944, "time_per_iteration": 3.7195417881011963 }, { "auxiliary_loss_clip": 0.01148967, "auxiliary_loss_mlp": 0.010499, "balance_loss_clip": 1.05026162, "balance_loss_mlp": 1.02956283, "epoch": 0.23387242229303193, "flos": 20594698225920.0, "grad_norm": 1.7464017004788273, "language_loss": 0.81010747, "learning_rate": 3.579765590563916e-06, "loss": 0.83209616, "num_input_tokens_seen": 41496065, "step": 1945, "time_per_iteration": 2.738394260406494 }, { "auxiliary_loss_clip": 0.01157173, "auxiliary_loss_mlp": 0.01055165, "balance_loss_clip": 1.05361927, "balance_loss_mlp": 1.03252769, "epoch": 0.233992665183671, "flos": 24279491952000.0, "grad_norm": 2.0113236166415507, "language_loss": 0.81710666, "learning_rate": 3.579287760287017e-06, "loss": 0.83923006, "num_input_tokens_seen": 41516815, "step": 1946, "time_per_iteration": 2.7667226791381836 }, { "auxiliary_loss_clip": 0.01163551, "auxiliary_loss_mlp": 0.01049732, "balance_loss_clip": 1.05432761, "balance_loss_mlp": 1.02972913, "epoch": 0.2341129080743101, "flos": 30154621121280.0, "grad_norm": 1.6711634335682677, "language_loss": 0.73292243, "learning_rate": 3.578809690433421e-06, "loss": 0.75505531, "num_input_tokens_seen": 41538525, "step": 1947, "time_per_iteration": 2.7711942195892334 }, { "auxiliary_loss_clip": 0.01184241, "auxiliary_loss_mlp": 0.01065204, "balance_loss_clip": 1.0566721, "balance_loss_mlp": 1.04229152, "epoch": 0.2342331509649492, "flos": 22784135829120.0, "grad_norm": 2.564637218595273, "language_loss": 0.816055, "learning_rate": 3.578331381075651e-06, "loss": 0.83854949, "num_input_tokens_seen": 41559025, "step": 1948, "time_per_iteration": 2.755326271057129 }, { "auxiliary_loss_clip": 0.01164075, "auxiliary_loss_mlp": 0.01070056, "balance_loss_clip": 1.05088651, "balance_loss_mlp": 1.04847932, "epoch": 0.2343533938555883, "flos": 23623152687360.0, "grad_norm": 2.1103797153227286, "language_loss": 0.69767791, "learning_rate": 3.5778528322862646e-06, "loss": 0.72001922, "num_input_tokens_seen": 41577845, "step": 1949, "time_per_iteration": 2.719179630279541 }, { "auxiliary_loss_clip": 0.01165004, "auxiliary_loss_mlp": 0.01061228, "balance_loss_clip": 1.05286527, "balance_loss_mlp": 1.04047418, "epoch": 0.23447363674622737, "flos": 24570332375040.0, "grad_norm": 2.2567968672690464, "language_loss": 0.86825323, "learning_rate": 3.5773740441378585e-06, "loss": 0.89051545, "num_input_tokens_seen": 41598600, "step": 1950, "time_per_iteration": 2.710190773010254 }, { "auxiliary_loss_clip": 0.01159849, "auxiliary_loss_mlp": 0.01054922, "balance_loss_clip": 1.05017948, "balance_loss_mlp": 1.03375041, "epoch": 0.23459387963686648, "flos": 53140322119680.0, "grad_norm": 1.8767512012044127, "language_loss": 0.73941791, "learning_rate": 3.5768950167030633e-06, "loss": 0.76156569, "num_input_tokens_seen": 41623300, "step": 1951, "time_per_iteration": 2.9717164039611816 }, { "auxiliary_loss_clip": 0.01134324, "auxiliary_loss_mlp": 0.01064404, "balance_loss_clip": 1.04463315, "balance_loss_mlp": 1.04055023, "epoch": 0.23471412252750556, "flos": 23951412103680.0, "grad_norm": 2.1446311608000377, "language_loss": 0.78709233, "learning_rate": 3.576415750054548e-06, "loss": 0.80907965, "num_input_tokens_seen": 41643420, "step": 1952, "time_per_iteration": 2.6839306354522705 }, { "auxiliary_loss_clip": 0.0113812, "auxiliary_loss_mlp": 0.01071879, "balance_loss_clip": 1.04776621, "balance_loss_mlp": 1.0484786, "epoch": 0.23483436541814465, "flos": 15706573948800.0, "grad_norm": 5.26646009208368, "language_loss": 0.85942918, "learning_rate": 3.5759362442650172e-06, "loss": 0.88152921, "num_input_tokens_seen": 41660170, "step": 1953, "time_per_iteration": 2.5869226455688477 }, { "auxiliary_loss_clip": 0.01159009, "auxiliary_loss_mlp": 0.01058051, "balance_loss_clip": 1.0548892, "balance_loss_mlp": 1.03771353, "epoch": 0.23495460830878373, "flos": 24936262179840.0, "grad_norm": 1.9503864833995848, "language_loss": 0.85288459, "learning_rate": 3.5754564994072113e-06, "loss": 0.87505519, "num_input_tokens_seen": 41679010, "step": 1954, "time_per_iteration": 2.7084872722625732 }, { "auxiliary_loss_clip": 0.01147377, "auxiliary_loss_mlp": 0.01056018, "balance_loss_clip": 1.05035019, "balance_loss_mlp": 1.03411877, "epoch": 0.23507485119942284, "flos": 30482665056000.0, "grad_norm": 2.427824828537219, "language_loss": 0.60215461, "learning_rate": 3.5749765155539067e-06, "loss": 0.62418854, "num_input_tokens_seen": 41699495, "step": 1955, "time_per_iteration": 2.7516367435455322 }, { "auxiliary_loss_clip": 0.01139242, "auxiliary_loss_mlp": 0.01057862, "balance_loss_clip": 1.04711604, "balance_loss_mlp": 1.03613055, "epoch": 0.23519509409006192, "flos": 18329129746560.0, "grad_norm": 2.188563537926338, "language_loss": 0.92216146, "learning_rate": 3.574496292777917e-06, "loss": 0.94413257, "num_input_tokens_seen": 41717705, "step": 1956, "time_per_iteration": 2.7184090614318848 }, { "auxiliary_loss_clip": 0.01159436, "auxiliary_loss_mlp": 0.01058457, "balance_loss_clip": 1.05147386, "balance_loss_mlp": 1.03631973, "epoch": 0.235315336980701, "flos": 29643217234560.0, "grad_norm": 1.9765430197599303, "language_loss": 0.71706915, "learning_rate": 3.574015831152092e-06, "loss": 0.7392481, "num_input_tokens_seen": 41738120, "step": 1957, "time_per_iteration": 2.8796355724334717 }, { "auxiliary_loss_clip": 0.01136825, "auxiliary_loss_mlp": 0.01066673, "balance_loss_clip": 1.04780424, "balance_loss_mlp": 1.04684842, "epoch": 0.23543557987134012, "flos": 18551704371840.0, "grad_norm": 2.252270879307696, "language_loss": 0.83447218, "learning_rate": 3.573535130749316e-06, "loss": 0.85650718, "num_input_tokens_seen": 41756070, "step": 1958, "time_per_iteration": 2.6199512481689453 }, { "auxiliary_loss_clip": 0.01138303, "auxiliary_loss_mlp": 0.01066, "balance_loss_clip": 1.04764163, "balance_loss_mlp": 1.04263496, "epoch": 0.2355558227619792, "flos": 24679033908480.0, "grad_norm": 1.9063181596856635, "language_loss": 0.74055874, "learning_rate": 3.5730541916425127e-06, "loss": 0.76260173, "num_input_tokens_seen": 41777550, "step": 1959, "time_per_iteration": 2.7287979125976562 }, { "auxiliary_loss_clip": 0.0113704, "auxiliary_loss_mlp": 0.01056095, "balance_loss_clip": 1.04739213, "balance_loss_mlp": 1.03468549, "epoch": 0.23567606565261828, "flos": 21944795748480.0, "grad_norm": 2.3210105347792673, "language_loss": 0.86342061, "learning_rate": 3.572573013904639e-06, "loss": 0.8853519, "num_input_tokens_seen": 41797460, "step": 1960, "time_per_iteration": 2.75598406791687 }, { "auxiliary_loss_clip": 0.01175388, "auxiliary_loss_mlp": 0.01050881, "balance_loss_clip": 1.05331564, "balance_loss_mlp": 1.03048444, "epoch": 0.2357963085432574, "flos": 13589352639360.0, "grad_norm": 2.652162490594105, "language_loss": 0.91885835, "learning_rate": 3.572091597608689e-06, "loss": 0.94112098, "num_input_tokens_seen": 41815585, "step": 1961, "time_per_iteration": 2.630704164505005 }, { "auxiliary_loss_clip": 0.01159282, "auxiliary_loss_mlp": 0.0106002, "balance_loss_clip": 1.05210412, "balance_loss_mlp": 1.03713143, "epoch": 0.23591655143389648, "flos": 22088689632000.0, "grad_norm": 2.3836448654543645, "language_loss": 0.73730171, "learning_rate": 3.571609942827694e-06, "loss": 0.75949472, "num_input_tokens_seen": 41834700, "step": 1962, "time_per_iteration": 3.6529312133789062 }, { "auxiliary_loss_clip": 0.01148429, "auxiliary_loss_mlp": 0.01057647, "balance_loss_clip": 1.05022192, "balance_loss_mlp": 1.03603482, "epoch": 0.23603679432453556, "flos": 17017349057280.0, "grad_norm": 2.6492938848415104, "language_loss": 0.88626444, "learning_rate": 3.57112804963472e-06, "loss": 0.9083252, "num_input_tokens_seen": 41852915, "step": 1963, "time_per_iteration": 2.6902735233306885 }, { "auxiliary_loss_clip": 0.01129062, "auxiliary_loss_mlp": 0.01057496, "balance_loss_clip": 1.0470767, "balance_loss_mlp": 1.03569281, "epoch": 0.23615703721517464, "flos": 19171307001600.0, "grad_norm": 2.4322198611597994, "language_loss": 0.76745176, "learning_rate": 3.57064591810287e-06, "loss": 0.78931737, "num_input_tokens_seen": 41870415, "step": 1964, "time_per_iteration": 2.7307868003845215 }, { "auxiliary_loss_clip": 0.0117263, "auxiliary_loss_mlp": 0.00776273, "balance_loss_clip": 1.05261493, "balance_loss_mlp": 1.00037563, "epoch": 0.23627728010581375, "flos": 19098803399040.0, "grad_norm": 3.1106034819207187, "language_loss": 0.80427837, "learning_rate": 3.570163548305284e-06, "loss": 0.82376742, "num_input_tokens_seen": 41889345, "step": 1965, "time_per_iteration": 3.5426220893859863 }, { "auxiliary_loss_clip": 0.01153526, "auxiliary_loss_mlp": 0.01052671, "balance_loss_clip": 1.05267537, "balance_loss_mlp": 1.02966332, "epoch": 0.23639752299645284, "flos": 14282213057280.0, "grad_norm": 3.300204536264118, "language_loss": 0.69757009, "learning_rate": 3.569680940315135e-06, "loss": 0.71963203, "num_input_tokens_seen": 41905745, "step": 1966, "time_per_iteration": 3.748124122619629 }, { "auxiliary_loss_clip": 0.0114757, "auxiliary_loss_mlp": 0.0104805, "balance_loss_clip": 1.05239868, "balance_loss_mlp": 1.02575803, "epoch": 0.23651776588709192, "flos": 22893411980160.0, "grad_norm": 1.9756534342348262, "language_loss": 0.81766915, "learning_rate": 3.5691980942056356e-06, "loss": 0.83962536, "num_input_tokens_seen": 41925115, "step": 1967, "time_per_iteration": 2.747218370437622 }, { "auxiliary_loss_clip": 0.01165513, "auxiliary_loss_mlp": 0.01051246, "balance_loss_clip": 1.05238092, "balance_loss_mlp": 1.02904928, "epoch": 0.23663800877773103, "flos": 18624531196800.0, "grad_norm": 1.890325754233803, "language_loss": 0.79465318, "learning_rate": 3.5687150100500332e-06, "loss": 0.81682074, "num_input_tokens_seen": 41944815, "step": 1968, "time_per_iteration": 2.6396546363830566 }, { "auxiliary_loss_clip": 0.01166469, "auxiliary_loss_mlp": 0.01059547, "balance_loss_clip": 1.05325544, "balance_loss_mlp": 1.03769612, "epoch": 0.2367582516683701, "flos": 25555828896000.0, "grad_norm": 1.6648966270032575, "language_loss": 0.74625295, "learning_rate": 3.568231687921611e-06, "loss": 0.76851314, "num_input_tokens_seen": 41964990, "step": 1969, "time_per_iteration": 2.7280876636505127 }, { "auxiliary_loss_clip": 0.01173706, "auxiliary_loss_mlp": 0.01044253, "balance_loss_clip": 1.05380654, "balance_loss_mlp": 1.0235945, "epoch": 0.2368784945590092, "flos": 23295072839040.0, "grad_norm": 1.624865852928342, "language_loss": 0.80485463, "learning_rate": 3.5677481278936883e-06, "loss": 0.82703424, "num_input_tokens_seen": 41984570, "step": 1970, "time_per_iteration": 3.4838216304779053 }, { "auxiliary_loss_clip": 0.01050494, "auxiliary_loss_mlp": 0.01018666, "balance_loss_clip": 1.02820015, "balance_loss_mlp": 1.0160439, "epoch": 0.23699873744964828, "flos": 69859291875840.0, "grad_norm": 0.8297548854569072, "language_loss": 0.57761216, "learning_rate": 3.5672643300396214e-06, "loss": 0.59830379, "num_input_tokens_seen": 42053715, "step": 1971, "time_per_iteration": 3.307037830352783 }, { "auxiliary_loss_clip": 0.0113647, "auxiliary_loss_mlp": 0.01046065, "balance_loss_clip": 1.05032241, "balance_loss_mlp": 1.02496457, "epoch": 0.2371189803402874, "flos": 21835052720640.0, "grad_norm": 2.3708959320429237, "language_loss": 0.67604291, "learning_rate": 3.566780294432802e-06, "loss": 0.69786823, "num_input_tokens_seen": 42070890, "step": 1972, "time_per_iteration": 2.7679061889648438 }, { "auxiliary_loss_clip": 0.01180788, "auxiliary_loss_mlp": 0.0105545, "balance_loss_clip": 1.0567627, "balance_loss_mlp": 1.03256154, "epoch": 0.23723922323092647, "flos": 21908490076800.0, "grad_norm": 2.2776777575477998, "language_loss": 0.74204165, "learning_rate": 3.566296021146657e-06, "loss": 0.764404, "num_input_tokens_seen": 42090270, "step": 1973, "time_per_iteration": 2.64375376701355 }, { "auxiliary_loss_clip": 0.01175997, "auxiliary_loss_mlp": 0.01053784, "balance_loss_clip": 1.05344272, "balance_loss_mlp": 1.03227818, "epoch": 0.23735946612156555, "flos": 32708803380480.0, "grad_norm": 2.773734016441468, "language_loss": 0.73375106, "learning_rate": 3.565811510254652e-06, "loss": 0.7560488, "num_input_tokens_seen": 42111150, "step": 1974, "time_per_iteration": 2.7511112689971924 }, { "auxiliary_loss_clip": 0.01053039, "auxiliary_loss_mlp": 0.01005443, "balance_loss_clip": 1.02441454, "balance_loss_mlp": 1.0026536, "epoch": 0.23747970901220466, "flos": 70546944821760.0, "grad_norm": 0.8376584908521133, "language_loss": 0.58251309, "learning_rate": 3.5653267618302845e-06, "loss": 0.60309792, "num_input_tokens_seen": 42178730, "step": 1975, "time_per_iteration": 3.2953765392303467 }, { "auxiliary_loss_clip": 0.011761, "auxiliary_loss_mlp": 0.01045872, "balance_loss_clip": 1.05255771, "balance_loss_mlp": 1.02317476, "epoch": 0.23759995190284375, "flos": 20849807594880.0, "grad_norm": 1.8273760218682429, "language_loss": 0.85789853, "learning_rate": 3.564841775947093e-06, "loss": 0.88011825, "num_input_tokens_seen": 42199620, "step": 1976, "time_per_iteration": 2.6491196155548096 }, { "auxiliary_loss_clip": 0.01130119, "auxiliary_loss_mlp": 0.01058512, "balance_loss_clip": 1.04676878, "balance_loss_mlp": 1.03714955, "epoch": 0.23772019479348283, "flos": 32921645420160.0, "grad_norm": 2.0121081598615467, "language_loss": 0.76195979, "learning_rate": 3.5643565526786475e-06, "loss": 0.78384614, "num_input_tokens_seen": 42219560, "step": 1977, "time_per_iteration": 2.8479769229888916 }, { "auxiliary_loss_clip": 0.01176821, "auxiliary_loss_mlp": 0.01056426, "balance_loss_clip": 1.05444789, "balance_loss_mlp": 1.03536189, "epoch": 0.2378404376841219, "flos": 32342765834880.0, "grad_norm": 1.6427205089732955, "language_loss": 0.77331185, "learning_rate": 3.5638710920985574e-06, "loss": 0.79564434, "num_input_tokens_seen": 42241020, "step": 1978, "time_per_iteration": 2.7045114040374756 }, { "auxiliary_loss_clip": 0.01172957, "auxiliary_loss_mlp": 0.00778095, "balance_loss_clip": 1.05470932, "balance_loss_mlp": 1.00032616, "epoch": 0.23796068057476102, "flos": 22997624313600.0, "grad_norm": 2.0025234321096996, "language_loss": 0.82207322, "learning_rate": 3.5633853942804655e-06, "loss": 0.84158367, "num_input_tokens_seen": 42259345, "step": 1979, "time_per_iteration": 2.65731143951416 }, { "auxiliary_loss_clip": 0.01138298, "auxiliary_loss_mlp": 0.01062543, "balance_loss_clip": 1.04828429, "balance_loss_mlp": 1.03980947, "epoch": 0.2380809234654001, "flos": 13480938414720.0, "grad_norm": 6.671278585105459, "language_loss": 0.76547682, "learning_rate": 3.5628994592980527e-06, "loss": 0.78748524, "num_input_tokens_seen": 42277250, "step": 1980, "time_per_iteration": 2.6740329265594482 }, { "auxiliary_loss_clip": 0.0117463, "auxiliary_loss_mlp": 0.01056651, "balance_loss_clip": 1.0515902, "balance_loss_mlp": 1.03460944, "epoch": 0.2382011663560392, "flos": 16871803148160.0, "grad_norm": 1.7334994684609657, "language_loss": 0.7052083, "learning_rate": 3.562413287225034e-06, "loss": 0.72752112, "num_input_tokens_seen": 42295360, "step": 1981, "time_per_iteration": 2.6206746101379395 }, { "auxiliary_loss_clip": 0.0115999, "auxiliary_loss_mlp": 0.01052236, "balance_loss_clip": 1.05425119, "balance_loss_mlp": 1.03023005, "epoch": 0.2383214092466783, "flos": 18441135331200.0, "grad_norm": 2.535420803057595, "language_loss": 0.8932122, "learning_rate": 3.5619268781351623e-06, "loss": 0.9153344, "num_input_tokens_seen": 42313430, "step": 1982, "time_per_iteration": 2.5918619632720947 }, { "auxiliary_loss_clip": 0.01144756, "auxiliary_loss_mlp": 0.01046233, "balance_loss_clip": 1.05053329, "balance_loss_mlp": 1.02601457, "epoch": 0.23844165213731738, "flos": 19755717281280.0, "grad_norm": 2.0387588965970473, "language_loss": 0.76635492, "learning_rate": 3.5614402321022256e-06, "loss": 0.78826481, "num_input_tokens_seen": 42331260, "step": 1983, "time_per_iteration": 2.722748279571533 }, { "auxiliary_loss_clip": 0.01113164, "auxiliary_loss_mlp": 0.01067921, "balance_loss_clip": 1.04430115, "balance_loss_mlp": 1.04486585, "epoch": 0.23856189502795647, "flos": 23367360960000.0, "grad_norm": 1.8123323095397528, "language_loss": 0.87154728, "learning_rate": 3.5609533492000463e-06, "loss": 0.89335811, "num_input_tokens_seen": 42350150, "step": 1984, "time_per_iteration": 2.822575330734253 }, { "auxiliary_loss_clip": 0.01144508, "auxiliary_loss_mlp": 0.01051584, "balance_loss_clip": 1.05145931, "balance_loss_mlp": 1.03006697, "epoch": 0.23868213791859555, "flos": 23475056912640.0, "grad_norm": 2.845113049860167, "language_loss": 0.78882587, "learning_rate": 3.560466229502485e-06, "loss": 0.81078684, "num_input_tokens_seen": 42369495, "step": 1985, "time_per_iteration": 2.7371206283569336 }, { "auxiliary_loss_clip": 0.01150969, "auxiliary_loss_mlp": 0.00776399, "balance_loss_clip": 1.05528545, "balance_loss_mlp": 1.00029826, "epoch": 0.23880238080923466, "flos": 16617340224000.0, "grad_norm": 2.0958425382545967, "language_loss": 0.8981325, "learning_rate": 3.5599788730834384e-06, "loss": 0.9174062, "num_input_tokens_seen": 42387455, "step": 1986, "time_per_iteration": 2.646135091781616 }, { "auxiliary_loss_clip": 0.01168302, "auxiliary_loss_mlp": 0.01049864, "balance_loss_clip": 1.05294251, "balance_loss_mlp": 1.02885962, "epoch": 0.23892262369987374, "flos": 17348409734400.0, "grad_norm": 2.881343141504884, "language_loss": 0.78832746, "learning_rate": 3.559491280016836e-06, "loss": 0.81050909, "num_input_tokens_seen": 42405400, "step": 1987, "time_per_iteration": 2.6986615657806396 }, { "auxiliary_loss_clip": 0.01154333, "auxiliary_loss_mlp": 0.01052692, "balance_loss_clip": 1.05152667, "balance_loss_mlp": 1.02941012, "epoch": 0.23904286659051283, "flos": 22309899540480.0, "grad_norm": 1.6893997295858534, "language_loss": 0.70927936, "learning_rate": 3.5590034503766465e-06, "loss": 0.73134965, "num_input_tokens_seen": 42425065, "step": 1988, "time_per_iteration": 3.649585485458374 }, { "auxiliary_loss_clip": 0.01175153, "auxiliary_loss_mlp": 0.01058171, "balance_loss_clip": 1.05300605, "balance_loss_mlp": 1.03679657, "epoch": 0.23916310948115194, "flos": 21178246579200.0, "grad_norm": 2.5428003709177736, "language_loss": 0.80608881, "learning_rate": 3.558515384236874e-06, "loss": 0.82842195, "num_input_tokens_seen": 42442495, "step": 1989, "time_per_iteration": 2.6608388423919678 }, { "auxiliary_loss_clip": 0.01126421, "auxiliary_loss_mlp": 0.00776809, "balance_loss_clip": 1.04851675, "balance_loss_mlp": 1.00032663, "epoch": 0.23928335237179102, "flos": 14137349506560.0, "grad_norm": 1.9135338839378018, "language_loss": 0.83668453, "learning_rate": 3.558027081671556e-06, "loss": 0.85571682, "num_input_tokens_seen": 42459480, "step": 1990, "time_per_iteration": 2.7351081371307373 }, { "auxiliary_loss_clip": 0.01163488, "auxiliary_loss_mlp": 0.01057648, "balance_loss_clip": 1.05027306, "balance_loss_mlp": 1.03561842, "epoch": 0.2394035952624301, "flos": 23769596436480.0, "grad_norm": 2.349891592055473, "language_loss": 0.68675709, "learning_rate": 3.557538542754769e-06, "loss": 0.70896846, "num_input_tokens_seen": 42479175, "step": 1991, "time_per_iteration": 3.601482391357422 }, { "auxiliary_loss_clip": 0.01180238, "auxiliary_loss_mlp": 0.01052876, "balance_loss_clip": 1.05725479, "balance_loss_mlp": 1.03152525, "epoch": 0.2395238381530692, "flos": 24206198250240.0, "grad_norm": 1.9926687962166993, "language_loss": 0.66622877, "learning_rate": 3.557049767560623e-06, "loss": 0.68855989, "num_input_tokens_seen": 42498090, "step": 1992, "time_per_iteration": 3.5973174571990967 }, { "auxiliary_loss_clip": 0.01128193, "auxiliary_loss_mlp": 0.01057626, "balance_loss_clip": 1.05110419, "balance_loss_mlp": 1.03602576, "epoch": 0.2396440810437083, "flos": 25295763450240.0, "grad_norm": 2.3188620745226753, "language_loss": 0.85511887, "learning_rate": 3.5565607561632655e-06, "loss": 0.87697703, "num_input_tokens_seen": 42516930, "step": 1993, "time_per_iteration": 2.7938153743743896 }, { "auxiliary_loss_clip": 0.01147344, "auxiliary_loss_mlp": 0.010583, "balance_loss_clip": 1.05115044, "balance_loss_mlp": 1.03662777, "epoch": 0.23976432393434738, "flos": 28543093436160.0, "grad_norm": 4.971085820069137, "language_loss": 0.79720283, "learning_rate": 3.5560715086368787e-06, "loss": 0.81925923, "num_input_tokens_seen": 42534800, "step": 1994, "time_per_iteration": 2.69219970703125 }, { "auxiliary_loss_clip": 0.01143452, "auxiliary_loss_mlp": 0.01050104, "balance_loss_clip": 1.05123281, "balance_loss_mlp": 1.02933764, "epoch": 0.23988456682498646, "flos": 19494358945920.0, "grad_norm": 4.030672152700953, "language_loss": 0.82189435, "learning_rate": 3.5555820250556816e-06, "loss": 0.84382987, "num_input_tokens_seen": 42552000, "step": 1995, "time_per_iteration": 2.763026714324951 }, { "auxiliary_loss_clip": 0.01151232, "auxiliary_loss_mlp": 0.01056312, "balance_loss_clip": 1.05134487, "balance_loss_mlp": 1.0341872, "epoch": 0.24000480971562557, "flos": 20266331068800.0, "grad_norm": 2.614505891695993, "language_loss": 0.696055, "learning_rate": 3.5550923054939278e-06, "loss": 0.71813047, "num_input_tokens_seen": 42571455, "step": 1996, "time_per_iteration": 3.530829668045044 }, { "auxiliary_loss_clip": 0.01116322, "auxiliary_loss_mlp": 0.01049017, "balance_loss_clip": 1.04825222, "balance_loss_mlp": 1.02641463, "epoch": 0.24012505260626466, "flos": 25443176866560.0, "grad_norm": 1.8838102543878221, "language_loss": 0.74440259, "learning_rate": 3.5546023500259083e-06, "loss": 0.766056, "num_input_tokens_seen": 42592550, "step": 1997, "time_per_iteration": 2.824154853820801 }, { "auxiliary_loss_clip": 0.01125797, "auxiliary_loss_mlp": 0.01057884, "balance_loss_clip": 1.04706001, "balance_loss_mlp": 1.03394651, "epoch": 0.24024529549690374, "flos": 15553342529280.0, "grad_norm": 2.2806350322098763, "language_loss": 0.80657101, "learning_rate": 3.5541121587259477e-06, "loss": 0.82840788, "num_input_tokens_seen": 42610385, "step": 1998, "time_per_iteration": 2.7200238704681396 }, { "auxiliary_loss_clip": 0.01051931, "auxiliary_loss_mlp": 0.01006347, "balance_loss_clip": 1.02168143, "balance_loss_mlp": 1.00335455, "epoch": 0.24036553838754285, "flos": 57122351867520.0, "grad_norm": 0.8406524383091368, "language_loss": 0.5791688, "learning_rate": 3.553621731668408e-06, "loss": 0.59975159, "num_input_tokens_seen": 42673595, "step": 1999, "time_per_iteration": 3.181140184402466 }, { "auxiliary_loss_clip": 0.01156255, "auxiliary_loss_mlp": 0.01061946, "balance_loss_clip": 1.05021513, "balance_loss_mlp": 1.03807998, "epoch": 0.24048578127818193, "flos": 24969946158720.0, "grad_norm": 1.9493851671321158, "language_loss": 0.83438641, "learning_rate": 3.553131068927688e-06, "loss": 0.85656846, "num_input_tokens_seen": 42692000, "step": 2000, "time_per_iteration": 2.6760480403900146 }, { "auxiliary_loss_clip": 0.01135711, "auxiliary_loss_mlp": 0.01054345, "balance_loss_clip": 1.04987311, "balance_loss_mlp": 1.03230309, "epoch": 0.24060602416882101, "flos": 23330947547520.0, "grad_norm": 1.633809767363065, "language_loss": 0.80191141, "learning_rate": 3.552640170578219e-06, "loss": 0.82381189, "num_input_tokens_seen": 42712250, "step": 2001, "time_per_iteration": 2.6963276863098145 }, { "auxiliary_loss_clip": 0.01147874, "auxiliary_loss_mlp": 0.01048668, "balance_loss_clip": 1.05068159, "balance_loss_mlp": 1.02920103, "epoch": 0.2407262670594601, "flos": 14173260128640.0, "grad_norm": 2.4334490689563673, "language_loss": 0.77548367, "learning_rate": 3.5521490366944703e-06, "loss": 0.79744911, "num_input_tokens_seen": 42729900, "step": 2002, "time_per_iteration": 2.683614492416382 }, { "auxiliary_loss_clip": 0.01138596, "auxiliary_loss_mlp": 0.01057478, "balance_loss_clip": 1.05170739, "balance_loss_mlp": 1.03515053, "epoch": 0.2408465099500992, "flos": 13663113217920.0, "grad_norm": 2.1456938031744053, "language_loss": 0.79838216, "learning_rate": 3.5516576673509474e-06, "loss": 0.8203429, "num_input_tokens_seen": 42747900, "step": 2003, "time_per_iteration": 2.705913782119751 }, { "auxiliary_loss_clip": 0.01176093, "auxiliary_loss_mlp": 0.01056536, "balance_loss_clip": 1.05261588, "balance_loss_mlp": 1.03549588, "epoch": 0.2409667528407383, "flos": 31248029076480.0, "grad_norm": 1.9944361466190739, "language_loss": 0.86621416, "learning_rate": 3.5511660626221896e-06, "loss": 0.88854051, "num_input_tokens_seen": 42768540, "step": 2004, "time_per_iteration": 2.6791133880615234 }, { "auxiliary_loss_clip": 0.01149331, "auxiliary_loss_mlp": 0.00776494, "balance_loss_clip": 1.05196977, "balance_loss_mlp": 1.00033355, "epoch": 0.24108699573137737, "flos": 22199941031040.0, "grad_norm": 4.34774533359128, "language_loss": 0.89009917, "learning_rate": 3.5506742225827744e-06, "loss": 0.90935743, "num_input_tokens_seen": 42785395, "step": 2005, "time_per_iteration": 2.7339234352111816 }, { "auxiliary_loss_clip": 0.01136496, "auxiliary_loss_mlp": 0.01061611, "balance_loss_clip": 1.0489428, "balance_loss_mlp": 1.04054618, "epoch": 0.24120723862201648, "flos": 26103035664000.0, "grad_norm": 2.9859703321978777, "language_loss": 0.90570372, "learning_rate": 3.5501821473073116e-06, "loss": 0.92768484, "num_input_tokens_seen": 42801980, "step": 2006, "time_per_iteration": 2.7872354984283447 }, { "auxiliary_loss_clip": 0.01132062, "auxiliary_loss_mlp": 0.01056295, "balance_loss_clip": 1.05154836, "balance_loss_mlp": 1.03432477, "epoch": 0.24132748151265557, "flos": 18624926246400.0, "grad_norm": 2.0818105177025275, "language_loss": 0.87052608, "learning_rate": 3.54968983687045e-06, "loss": 0.89240968, "num_input_tokens_seen": 42818850, "step": 2007, "time_per_iteration": 2.715653657913208 }, { "auxiliary_loss_clip": 0.0114931, "auxiliary_loss_mlp": 0.01053647, "balance_loss_clip": 1.05210245, "balance_loss_mlp": 1.03190362, "epoch": 0.24144772440329465, "flos": 15267673664640.0, "grad_norm": 2.5363318426812493, "language_loss": 0.89310735, "learning_rate": 3.549197291346872e-06, "loss": 0.91513693, "num_input_tokens_seen": 42835375, "step": 2008, "time_per_iteration": 2.7107138633728027 }, { "auxiliary_loss_clip": 0.01163436, "auxiliary_loss_mlp": 0.01055758, "balance_loss_clip": 1.05283737, "balance_loss_mlp": 1.03275108, "epoch": 0.24156796729393373, "flos": 24024274842240.0, "grad_norm": 2.044524796843551, "language_loss": 0.78837818, "learning_rate": 3.548704510811297e-06, "loss": 0.81057012, "num_input_tokens_seen": 42854570, "step": 2009, "time_per_iteration": 2.6737959384918213 }, { "auxiliary_loss_clip": 0.01125705, "auxiliary_loss_mlp": 0.01048297, "balance_loss_clip": 1.0465641, "balance_loss_mlp": 1.02692246, "epoch": 0.24168821018457284, "flos": 26286790665600.0, "grad_norm": 2.818760651518668, "language_loss": 0.75085723, "learning_rate": 3.5482114953384787e-06, "loss": 0.77259731, "num_input_tokens_seen": 42873800, "step": 2010, "time_per_iteration": 2.7786407470703125 }, { "auxiliary_loss_clip": 0.01163153, "auxiliary_loss_mlp": 0.01060302, "balance_loss_clip": 1.05158424, "balance_loss_mlp": 1.03868961, "epoch": 0.24180845307521193, "flos": 18223193560320.0, "grad_norm": 2.422616155641694, "language_loss": 0.83948123, "learning_rate": 3.5477182450032077e-06, "loss": 0.86171579, "num_input_tokens_seen": 42892400, "step": 2011, "time_per_iteration": 2.6416945457458496 }, { "auxiliary_loss_clip": 0.01161132, "auxiliary_loss_mlp": 0.01051327, "balance_loss_clip": 1.05252767, "balance_loss_mlp": 1.02952385, "epoch": 0.241928695965851, "flos": 20449260057600.0, "grad_norm": 2.1325450437163953, "language_loss": 0.83055806, "learning_rate": 3.5472247598803097e-06, "loss": 0.85268277, "num_input_tokens_seen": 42911745, "step": 2012, "time_per_iteration": 2.621216058731079 }, { "auxiliary_loss_clip": 0.01176826, "auxiliary_loss_mlp": 0.01055811, "balance_loss_clip": 1.05319548, "balance_loss_mlp": 1.03279161, "epoch": 0.24204893885649012, "flos": 25556475340800.0, "grad_norm": 2.726002211092776, "language_loss": 0.85342675, "learning_rate": 3.546731040044645e-06, "loss": 0.87575316, "num_input_tokens_seen": 42926915, "step": 2013, "time_per_iteration": 2.6698076725006104 }, { "auxiliary_loss_clip": 0.0118, "auxiliary_loss_mlp": 0.01049902, "balance_loss_clip": 1.05558872, "balance_loss_mlp": 1.02919531, "epoch": 0.2421691817471292, "flos": 30660207004800.0, "grad_norm": 1.916372817870556, "language_loss": 0.75403488, "learning_rate": 3.546237085571112e-06, "loss": 0.77633393, "num_input_tokens_seen": 42945350, "step": 2014, "time_per_iteration": 3.6405670642852783 }, { "auxiliary_loss_clip": 0.01170592, "auxiliary_loss_mlp": 0.01056698, "balance_loss_clip": 1.05765367, "balance_loss_mlp": 1.03407216, "epoch": 0.24228942463776829, "flos": 21945011230080.0, "grad_norm": 2.438248966702853, "language_loss": 0.72475809, "learning_rate": 3.5457428965346425e-06, "loss": 0.74703097, "num_input_tokens_seen": 42964290, "step": 2015, "time_per_iteration": 2.651616334915161 }, { "auxiliary_loss_clip": 0.01108188, "auxiliary_loss_mlp": 0.01054305, "balance_loss_clip": 1.04752564, "balance_loss_mlp": 1.03108335, "epoch": 0.2424096675284074, "flos": 33984493879680.0, "grad_norm": 1.6468834359223836, "language_loss": 0.74715674, "learning_rate": 3.545248473010205e-06, "loss": 0.76878172, "num_input_tokens_seen": 42987095, "step": 2016, "time_per_iteration": 2.9568467140197754 }, { "auxiliary_loss_clip": 0.01183514, "auxiliary_loss_mlp": 0.00778455, "balance_loss_clip": 1.05696368, "balance_loss_mlp": 1.00027823, "epoch": 0.24252991041904648, "flos": 21653416621440.0, "grad_norm": 2.146377732384159, "language_loss": 0.87587327, "learning_rate": 3.544753815072802e-06, "loss": 0.89549303, "num_input_tokens_seen": 43005750, "step": 2017, "time_per_iteration": 3.6602840423583984 }, { "auxiliary_loss_clip": 0.01087439, "auxiliary_loss_mlp": 0.01056332, "balance_loss_clip": 1.04128599, "balance_loss_mlp": 1.03527939, "epoch": 0.24265015330968556, "flos": 21870065502720.0, "grad_norm": 2.122605042947144, "language_loss": 0.88485491, "learning_rate": 3.544258922797474e-06, "loss": 0.90629268, "num_input_tokens_seen": 43023870, "step": 2018, "time_per_iteration": 3.8918561935424805 }, { "auxiliary_loss_clip": 0.01173353, "auxiliary_loss_mlp": 0.01053379, "balance_loss_clip": 1.05330348, "balance_loss_mlp": 1.03273201, "epoch": 0.24277039620032465, "flos": 25628260671360.0, "grad_norm": 1.6053886828241333, "language_loss": 0.78211641, "learning_rate": 3.543763796259295e-06, "loss": 0.80438375, "num_input_tokens_seen": 43043825, "step": 2019, "time_per_iteration": 3.175381898880005 }, { "auxiliary_loss_clip": 0.01166581, "auxiliary_loss_mlp": 0.01054921, "balance_loss_clip": 1.05271006, "balance_loss_mlp": 1.03240228, "epoch": 0.24289063909096376, "flos": 26286575184000.0, "grad_norm": 1.811503197406541, "language_loss": 0.90882617, "learning_rate": 3.5432684355333754e-06, "loss": 0.93104124, "num_input_tokens_seen": 43062480, "step": 2020, "time_per_iteration": 2.7045650482177734 }, { "auxiliary_loss_clip": 0.01159918, "auxiliary_loss_mlp": 0.01052922, "balance_loss_clip": 1.05164075, "balance_loss_mlp": 1.031703, "epoch": 0.24301088198160284, "flos": 25075056332160.0, "grad_norm": 1.8756192153793352, "language_loss": 0.76185578, "learning_rate": 3.5427728406948613e-06, "loss": 0.78398418, "num_input_tokens_seen": 43081595, "step": 2021, "time_per_iteration": 2.6901862621307373 }, { "auxiliary_loss_clip": 0.01046502, "auxiliary_loss_mlp": 0.0100286, "balance_loss_clip": 1.01866961, "balance_loss_mlp": 1.0003562, "epoch": 0.24313112487224192, "flos": 69900948673920.0, "grad_norm": 0.7527901706703513, "language_loss": 0.57887244, "learning_rate": 3.542277011818934e-06, "loss": 0.59936607, "num_input_tokens_seen": 43145430, "step": 2022, "time_per_iteration": 4.262171506881714 }, { "auxiliary_loss_clip": 0.01147056, "auxiliary_loss_mlp": 0.01052432, "balance_loss_clip": 1.05154169, "balance_loss_mlp": 1.03207099, "epoch": 0.24325136776288103, "flos": 40662334235520.0, "grad_norm": 2.1703684276475292, "language_loss": 0.7461158, "learning_rate": 3.5417809489808104e-06, "loss": 0.76811075, "num_input_tokens_seen": 43167040, "step": 2023, "time_per_iteration": 2.7969343662261963 }, { "auxiliary_loss_clip": 0.01163035, "auxiliary_loss_mlp": 0.01049917, "balance_loss_clip": 1.053756, "balance_loss_mlp": 1.0299139, "epoch": 0.24337161065352012, "flos": 25046400257280.0, "grad_norm": 2.1234136012949327, "language_loss": 0.72515082, "learning_rate": 3.5412846522557422e-06, "loss": 0.74728036, "num_input_tokens_seen": 43187930, "step": 2024, "time_per_iteration": 2.7184462547302246 }, { "auxiliary_loss_clip": 0.0117781, "auxiliary_loss_mlp": 0.01051558, "balance_loss_clip": 1.05613542, "balance_loss_mlp": 1.02995718, "epoch": 0.2434918535441592, "flos": 18661160090880.0, "grad_norm": 2.21898867961393, "language_loss": 0.74454415, "learning_rate": 3.540788121719018e-06, "loss": 0.76683778, "num_input_tokens_seen": 43206350, "step": 2025, "time_per_iteration": 2.5737922191619873 }, { "auxiliary_loss_clip": 0.01130497, "auxiliary_loss_mlp": 0.01053808, "balance_loss_clip": 1.0521524, "balance_loss_mlp": 1.03229117, "epoch": 0.24361209643479828, "flos": 23915142345600.0, "grad_norm": 2.4953240761017383, "language_loss": 0.82068443, "learning_rate": 3.5402913574459604e-06, "loss": 0.84252751, "num_input_tokens_seen": 43226255, "step": 2026, "time_per_iteration": 2.7539751529693604 }, { "auxiliary_loss_clip": 0.0110364, "auxiliary_loss_mlp": 0.0105452, "balance_loss_clip": 1.04194617, "balance_loss_mlp": 1.0328238, "epoch": 0.2437323393254374, "flos": 28657505232000.0, "grad_norm": 1.6548513282553152, "language_loss": 0.86025244, "learning_rate": 3.5397943595119297e-06, "loss": 0.88183403, "num_input_tokens_seen": 43247675, "step": 2027, "time_per_iteration": 2.820204496383667 }, { "auxiliary_loss_clip": 0.01146897, "auxiliary_loss_mlp": 0.01045439, "balance_loss_clip": 1.05216599, "balance_loss_mlp": 1.02448201, "epoch": 0.24385258221607647, "flos": 23550325862400.0, "grad_norm": 3.505228333093844, "language_loss": 0.77452612, "learning_rate": 3.5392971279923177e-06, "loss": 0.79644954, "num_input_tokens_seen": 43265895, "step": 2028, "time_per_iteration": 2.755645990371704 }, { "auxiliary_loss_clip": 0.01131639, "auxiliary_loss_mlp": 0.01059846, "balance_loss_clip": 1.04579973, "balance_loss_mlp": 1.03528929, "epoch": 0.24397282510671556, "flos": 25336091445120.0, "grad_norm": 2.6183351737920186, "language_loss": 0.83544874, "learning_rate": 3.5387996629625557e-06, "loss": 0.85736358, "num_input_tokens_seen": 43283485, "step": 2029, "time_per_iteration": 2.796565055847168 }, { "auxiliary_loss_clip": 0.01066266, "auxiliary_loss_mlp": 0.01001307, "balance_loss_clip": 1.02032888, "balance_loss_mlp": 0.99881595, "epoch": 0.24409306799735467, "flos": 65187421430400.0, "grad_norm": 0.794921885461313, "language_loss": 0.54948699, "learning_rate": 3.5383019644981083e-06, "loss": 0.57016277, "num_input_tokens_seen": 43347180, "step": 2030, "time_per_iteration": 3.22299861907959 }, { "auxiliary_loss_clip": 0.01150174, "auxiliary_loss_mlp": 0.01052128, "balance_loss_clip": 1.05181837, "balance_loss_mlp": 1.03065825, "epoch": 0.24421331088799375, "flos": 19537093152000.0, "grad_norm": 3.4255882112786176, "language_loss": 0.72875404, "learning_rate": 3.5378040326744763e-06, "loss": 0.75077707, "num_input_tokens_seen": 43366665, "step": 2031, "time_per_iteration": 2.696402072906494 }, { "auxiliary_loss_clip": 0.0113922, "auxiliary_loss_mlp": 0.01055092, "balance_loss_clip": 1.05250657, "balance_loss_mlp": 1.03597116, "epoch": 0.24433355377863283, "flos": 21068575378560.0, "grad_norm": 2.506319232179889, "language_loss": 0.85319084, "learning_rate": 3.5373058675671946e-06, "loss": 0.87513393, "num_input_tokens_seen": 43384670, "step": 2032, "time_per_iteration": 2.711683750152588 }, { "auxiliary_loss_clip": 0.01118238, "auxiliary_loss_mlp": 0.01050604, "balance_loss_clip": 1.0464735, "balance_loss_mlp": 1.02956367, "epoch": 0.24445379666927192, "flos": 22637189289600.0, "grad_norm": 2.3868884617540136, "language_loss": 0.72114193, "learning_rate": 3.536807469251836e-06, "loss": 0.74283028, "num_input_tokens_seen": 43403825, "step": 2033, "time_per_iteration": 2.762706995010376 }, { "auxiliary_loss_clip": 0.01141986, "auxiliary_loss_mlp": 0.01053515, "balance_loss_clip": 1.0493598, "balance_loss_mlp": 1.03117561, "epoch": 0.24457403955991103, "flos": 21251612108160.0, "grad_norm": 2.8954624836066616, "language_loss": 0.82370436, "learning_rate": 3.5363088378040055e-06, "loss": 0.84565938, "num_input_tokens_seen": 43422715, "step": 2034, "time_per_iteration": 2.6855342388153076 }, { "auxiliary_loss_clip": 0.01063256, "auxiliary_loss_mlp": 0.00757055, "balance_loss_clip": 1.01734567, "balance_loss_mlp": 1.00007439, "epoch": 0.2446942824505501, "flos": 66997820764800.0, "grad_norm": 0.7588276988503536, "language_loss": 0.64324069, "learning_rate": 3.5358099732993463e-06, "loss": 0.66144383, "num_input_tokens_seen": 43481825, "step": 2035, "time_per_iteration": 3.1483941078186035 }, { "auxiliary_loss_clip": 0.01158458, "auxiliary_loss_mlp": 0.01046217, "balance_loss_clip": 1.05385244, "balance_loss_mlp": 1.02566552, "epoch": 0.2448145253411892, "flos": 20411122792320.0, "grad_norm": 2.450140125982478, "language_loss": 0.89862967, "learning_rate": 3.535310875813535e-06, "loss": 0.92067635, "num_input_tokens_seen": 43500220, "step": 2036, "time_per_iteration": 2.668073892593384 }, { "auxiliary_loss_clip": 0.01164466, "auxiliary_loss_mlp": 0.01064106, "balance_loss_clip": 1.05190754, "balance_loss_mlp": 1.04239798, "epoch": 0.2449347682318283, "flos": 28804739080320.0, "grad_norm": 2.247111806933798, "language_loss": 0.81663322, "learning_rate": 3.5348115454222843e-06, "loss": 0.83891892, "num_input_tokens_seen": 43522805, "step": 2037, "time_per_iteration": 2.7360265254974365 }, { "auxiliary_loss_clip": 0.01143837, "auxiliary_loss_mlp": 0.01048978, "balance_loss_clip": 1.04631042, "balance_loss_mlp": 1.02797306, "epoch": 0.2450550111224674, "flos": 22528990546560.0, "grad_norm": 1.7224654841592177, "language_loss": 0.8592509, "learning_rate": 3.5343119822013425e-06, "loss": 0.88117898, "num_input_tokens_seen": 43541915, "step": 2038, "time_per_iteration": 2.6944987773895264 }, { "auxiliary_loss_clip": 0.01176257, "auxiliary_loss_mlp": 0.01060047, "balance_loss_clip": 1.05740583, "balance_loss_mlp": 1.03807676, "epoch": 0.24517525401310647, "flos": 21759137326080.0, "grad_norm": 2.367085148350003, "language_loss": 0.77682412, "learning_rate": 3.533812186226493e-06, "loss": 0.79918718, "num_input_tokens_seen": 43562625, "step": 2039, "time_per_iteration": 2.6480960845947266 }, { "auxiliary_loss_clip": 0.01173009, "auxiliary_loss_mlp": 0.01049952, "balance_loss_clip": 1.05337012, "balance_loss_mlp": 1.02889943, "epoch": 0.24529549690374555, "flos": 25043311687680.0, "grad_norm": 1.9207838876925087, "language_loss": 0.761783, "learning_rate": 3.5333121575735545e-06, "loss": 0.78401256, "num_input_tokens_seen": 43582265, "step": 2040, "time_per_iteration": 3.5997562408447266 }, { "auxiliary_loss_clip": 0.01146982, "auxiliary_loss_mlp": 0.01051516, "balance_loss_clip": 1.05076838, "balance_loss_mlp": 1.03148937, "epoch": 0.24541573979438466, "flos": 32123638915200.0, "grad_norm": 1.9365175430742383, "language_loss": 0.75820661, "learning_rate": 3.532811896318381e-06, "loss": 0.7801916, "num_input_tokens_seen": 43604335, "step": 2041, "time_per_iteration": 2.726325273513794 }, { "auxiliary_loss_clip": 0.01141923, "auxiliary_loss_mlp": 0.01052261, "balance_loss_clip": 1.04986823, "balance_loss_mlp": 1.03126812, "epoch": 0.24553598268502375, "flos": 31357556622720.0, "grad_norm": 2.4815193654918164, "language_loss": 0.81875789, "learning_rate": 3.5323114025368615e-06, "loss": 0.84069973, "num_input_tokens_seen": 43619400, "step": 2042, "time_per_iteration": 2.749985456466675 }, { "auxiliary_loss_clip": 0.01154182, "auxiliary_loss_mlp": 0.01044213, "balance_loss_clip": 1.04525805, "balance_loss_mlp": 1.02270734, "epoch": 0.24565622557566283, "flos": 14027462824320.0, "grad_norm": 2.5110210260284536, "language_loss": 0.82376403, "learning_rate": 3.53181067630492e-06, "loss": 0.84574795, "num_input_tokens_seen": 43636870, "step": 2043, "time_per_iteration": 3.623706579208374 }, { "auxiliary_loss_clip": 0.0114093, "auxiliary_loss_mlp": 0.01052325, "balance_loss_clip": 1.04792356, "balance_loss_mlp": 1.03093851, "epoch": 0.24577646846630194, "flos": 16581465515520.0, "grad_norm": 1.8972230325070463, "language_loss": 0.75901973, "learning_rate": 3.5313097176985175e-06, "loss": 0.78095227, "num_input_tokens_seen": 43655180, "step": 2044, "time_per_iteration": 3.591600179672241 }, { "auxiliary_loss_clip": 0.01164511, "auxiliary_loss_mlp": 0.01057508, "balance_loss_clip": 1.052037, "balance_loss_mlp": 1.03708744, "epoch": 0.24589671135694102, "flos": 18807424272000.0, "grad_norm": 1.9956017541008566, "language_loss": 0.8121525, "learning_rate": 3.5308085267936482e-06, "loss": 0.83437264, "num_input_tokens_seen": 43672895, "step": 2045, "time_per_iteration": 2.6593239307403564 }, { "auxiliary_loss_clip": 0.01104645, "auxiliary_loss_mlp": 0.00775257, "balance_loss_clip": 1.04442346, "balance_loss_mlp": 1.00024784, "epoch": 0.2460169542475801, "flos": 19938538529280.0, "grad_norm": 2.1244594687833196, "language_loss": 0.90131658, "learning_rate": 3.530307103666342e-06, "loss": 0.92011559, "num_input_tokens_seen": 43691975, "step": 2046, "time_per_iteration": 2.753635883331299 }, { "auxiliary_loss_clip": 0.0114089, "auxiliary_loss_mlp": 0.01048487, "balance_loss_clip": 1.050107, "balance_loss_mlp": 1.02681434, "epoch": 0.24613719713821922, "flos": 24171221381760.0, "grad_norm": 7.150377906576094, "language_loss": 0.80182523, "learning_rate": 3.5298054483926658e-06, "loss": 0.82371897, "num_input_tokens_seen": 43712670, "step": 2047, "time_per_iteration": 2.75534725189209 }, { "auxiliary_loss_clip": 0.01170866, "auxiliary_loss_mlp": 0.01063614, "balance_loss_clip": 1.05533946, "balance_loss_mlp": 1.04237056, "epoch": 0.2462574400288583, "flos": 30221055325440.0, "grad_norm": 3.7935953294959686, "language_loss": 0.83130872, "learning_rate": 3.5293035610487187e-06, "loss": 0.85365343, "num_input_tokens_seen": 43732035, "step": 2048, "time_per_iteration": 3.810844659805298 }, { "auxiliary_loss_clip": 0.01044288, "auxiliary_loss_mlp": 0.01023309, "balance_loss_clip": 1.02501404, "balance_loss_mlp": 1.01987612, "epoch": 0.24637768291949738, "flos": 68943030819840.0, "grad_norm": 0.7274245620389014, "language_loss": 0.61988437, "learning_rate": 3.5288014417106374e-06, "loss": 0.64056039, "num_input_tokens_seen": 43798055, "step": 2049, "time_per_iteration": 3.285994291305542 }, { "auxiliary_loss_clip": 0.01135557, "auxiliary_loss_mlp": 0.01052266, "balance_loss_clip": 1.04969931, "balance_loss_mlp": 1.02894878, "epoch": 0.24649792581013646, "flos": 34383999922560.0, "grad_norm": 2.245211512370124, "language_loss": 0.75794101, "learning_rate": 3.528299090454593e-06, "loss": 0.77981925, "num_input_tokens_seen": 43818590, "step": 2050, "time_per_iteration": 2.85707688331604 }, { "auxiliary_loss_clip": 0.01164779, "auxiliary_loss_mlp": 0.01057675, "balance_loss_clip": 1.05085802, "balance_loss_mlp": 1.03664684, "epoch": 0.24661816870077558, "flos": 19680448331520.0, "grad_norm": 2.36536243141075, "language_loss": 0.82903904, "learning_rate": 3.527796507356792e-06, "loss": 0.85126358, "num_input_tokens_seen": 43832480, "step": 2051, "time_per_iteration": 2.6785848140716553 }, { "auxiliary_loss_clip": 0.01164807, "auxiliary_loss_mlp": 0.0105298, "balance_loss_clip": 1.04894614, "balance_loss_mlp": 1.0317843, "epoch": 0.24673841159141466, "flos": 20002279213440.0, "grad_norm": 25.55192958166991, "language_loss": 0.89702499, "learning_rate": 3.527293692493475e-06, "loss": 0.91920292, "num_input_tokens_seen": 43848345, "step": 2052, "time_per_iteration": 2.8157060146331787 }, { "auxiliary_loss_clip": 0.01168443, "auxiliary_loss_mlp": 0.01050048, "balance_loss_clip": 1.05143416, "balance_loss_mlp": 1.02665877, "epoch": 0.24685865448205374, "flos": 21646593037440.0, "grad_norm": 10.249885886924153, "language_loss": 0.73631448, "learning_rate": 3.52679064594092e-06, "loss": 0.75849938, "num_input_tokens_seen": 43865685, "step": 2053, "time_per_iteration": 2.8630869388580322 }, { "auxiliary_loss_clip": 0.01106199, "auxiliary_loss_mlp": 0.01057902, "balance_loss_clip": 1.03829992, "balance_loss_mlp": 1.03468013, "epoch": 0.24697889737269285, "flos": 17960470508160.0, "grad_norm": 2.920815472698764, "language_loss": 0.74767768, "learning_rate": 3.5262873677754375e-06, "loss": 0.7693187, "num_input_tokens_seen": 43883690, "step": 2054, "time_per_iteration": 2.754624605178833 }, { "auxiliary_loss_clip": 0.01173136, "auxiliary_loss_mlp": 0.01045717, "balance_loss_clip": 1.05209148, "balance_loss_mlp": 1.0252012, "epoch": 0.24709914026333193, "flos": 27344611221120.0, "grad_norm": 1.6780453889529199, "language_loss": 0.80588615, "learning_rate": 3.5257838580733745e-06, "loss": 0.82807469, "num_input_tokens_seen": 43903295, "step": 2055, "time_per_iteration": 2.6886513233184814 }, { "auxiliary_loss_clip": 0.01163796, "auxiliary_loss_mlp": 0.01049092, "balance_loss_clip": 1.05064058, "balance_loss_mlp": 1.02829015, "epoch": 0.24721938315397102, "flos": 19275519335040.0, "grad_norm": 2.519949079427585, "language_loss": 0.87752938, "learning_rate": 3.5252801169111138e-06, "loss": 0.89965832, "num_input_tokens_seen": 43920960, "step": 2056, "time_per_iteration": 2.6011595726013184 }, { "auxiliary_loss_clip": 0.01146651, "auxiliary_loss_mlp": 0.01053654, "balance_loss_clip": 1.04997134, "balance_loss_mlp": 1.03306651, "epoch": 0.2473396260446101, "flos": 23185796688000.0, "grad_norm": 1.9116697501285236, "language_loss": 0.79713988, "learning_rate": 3.524776144365072e-06, "loss": 0.81914288, "num_input_tokens_seen": 43939415, "step": 2057, "time_per_iteration": 2.7071375846862793 }, { "auxiliary_loss_clip": 0.01142461, "auxiliary_loss_mlp": 0.01056572, "balance_loss_clip": 1.05237973, "balance_loss_mlp": 1.03427982, "epoch": 0.2474598689352492, "flos": 21142443697920.0, "grad_norm": 1.6292756047550678, "language_loss": 0.79332948, "learning_rate": 3.5242719405117016e-06, "loss": 0.8153199, "num_input_tokens_seen": 43959220, "step": 2058, "time_per_iteration": 2.764310359954834 }, { "auxiliary_loss_clip": 0.01151013, "auxiliary_loss_mlp": 0.00776413, "balance_loss_clip": 1.05286908, "balance_loss_mlp": 1.00026989, "epoch": 0.2475801118258883, "flos": 21648352803840.0, "grad_norm": 5.111034760181446, "language_loss": 0.75075239, "learning_rate": 3.5237675054274893e-06, "loss": 0.77002668, "num_input_tokens_seen": 43978420, "step": 2059, "time_per_iteration": 2.7441186904907227 }, { "auxiliary_loss_clip": 0.01160733, "auxiliary_loss_mlp": 0.01051439, "balance_loss_clip": 1.05085778, "balance_loss_mlp": 1.02931333, "epoch": 0.24770035471652738, "flos": 22674500542080.0, "grad_norm": 1.689264830370085, "language_loss": 0.80109656, "learning_rate": 3.5232628391889584e-06, "loss": 0.82321823, "num_input_tokens_seen": 43996710, "step": 2060, "time_per_iteration": 2.70131516456604 }, { "auxiliary_loss_clip": 0.01120724, "auxiliary_loss_mlp": 0.01050316, "balance_loss_clip": 1.04875147, "balance_loss_mlp": 1.02908456, "epoch": 0.2478205976071665, "flos": 22163814927360.0, "grad_norm": 2.216269251465702, "language_loss": 0.64520556, "learning_rate": 3.522757941872666e-06, "loss": 0.66691589, "num_input_tokens_seen": 44014865, "step": 2061, "time_per_iteration": 2.81394624710083 }, { "auxiliary_loss_clip": 0.01176645, "auxiliary_loss_mlp": 0.00776418, "balance_loss_clip": 1.05370522, "balance_loss_mlp": 1.00033689, "epoch": 0.24794084049780557, "flos": 24973106555520.0, "grad_norm": 1.7847001643292757, "language_loss": 0.82666075, "learning_rate": 3.5222528135552042e-06, "loss": 0.84619141, "num_input_tokens_seen": 44036325, "step": 2062, "time_per_iteration": 2.680983066558838 }, { "auxiliary_loss_clip": 0.01164627, "auxiliary_loss_mlp": 0.0104811, "balance_loss_clip": 1.05607319, "balance_loss_mlp": 1.0259608, "epoch": 0.24806108338844465, "flos": 18296379521280.0, "grad_norm": 1.7681560780972538, "language_loss": 0.80118501, "learning_rate": 3.521747454313201e-06, "loss": 0.8233124, "num_input_tokens_seen": 44055005, "step": 2063, "time_per_iteration": 2.6160664558410645 }, { "auxiliary_loss_clip": 0.0112979, "auxiliary_loss_mlp": 0.01051864, "balance_loss_clip": 1.04569983, "balance_loss_mlp": 1.02942872, "epoch": 0.24818132627908374, "flos": 19282163351040.0, "grad_norm": 1.973431285933332, "language_loss": 0.66787565, "learning_rate": 3.521241864223319e-06, "loss": 0.68969214, "num_input_tokens_seen": 44073965, "step": 2064, "time_per_iteration": 2.7411344051361084 }, { "auxiliary_loss_clip": 0.01048758, "auxiliary_loss_mlp": 0.01004434, "balance_loss_clip": 1.02318013, "balance_loss_mlp": 1.00123906, "epoch": 0.24830156916972285, "flos": 70285837881600.0, "grad_norm": 0.7892392217139583, "language_loss": 0.62020004, "learning_rate": 3.5207360433622552e-06, "loss": 0.64073193, "num_input_tokens_seen": 44135965, "step": 2065, "time_per_iteration": 3.213700294494629 }, { "auxiliary_loss_clip": 0.01144091, "auxiliary_loss_mlp": 0.01062084, "balance_loss_clip": 1.05109954, "balance_loss_mlp": 1.04026818, "epoch": 0.24842181206036193, "flos": 40409128287360.0, "grad_norm": 1.828595968108555, "language_loss": 0.75039184, "learning_rate": 3.5202299918067437e-06, "loss": 0.77245367, "num_input_tokens_seen": 44159560, "step": 2066, "time_per_iteration": 3.849308729171753 }, { "auxiliary_loss_clip": 0.01160123, "auxiliary_loss_mlp": 0.01048369, "balance_loss_clip": 1.05209899, "balance_loss_mlp": 1.02780581, "epoch": 0.248542054951001, "flos": 20082432412800.0, "grad_norm": 2.456545962781772, "language_loss": 0.69559121, "learning_rate": 3.519723709633551e-06, "loss": 0.71767616, "num_input_tokens_seen": 44178320, "step": 2067, "time_per_iteration": 2.7393133640289307 }, { "auxiliary_loss_clip": 0.01145009, "auxiliary_loss_mlp": 0.01059679, "balance_loss_clip": 1.05211341, "balance_loss_mlp": 1.03655195, "epoch": 0.24866229784164012, "flos": 23513948363520.0, "grad_norm": 2.0815079500737124, "language_loss": 0.83524382, "learning_rate": 3.519217196919479e-06, "loss": 0.85729069, "num_input_tokens_seen": 44197305, "step": 2068, "time_per_iteration": 2.728058099746704 }, { "auxiliary_loss_clip": 0.01155203, "auxiliary_loss_mlp": 0.01060689, "balance_loss_clip": 1.05494761, "balance_loss_mlp": 1.03981543, "epoch": 0.2487825407322792, "flos": 19865101173120.0, "grad_norm": 1.9120316311687198, "language_loss": 0.73111314, "learning_rate": 3.518710453741367e-06, "loss": 0.75327206, "num_input_tokens_seen": 44216505, "step": 2069, "time_per_iteration": 3.6078834533691406 }, { "auxiliary_loss_clip": 0.01145426, "auxiliary_loss_mlp": 0.00777499, "balance_loss_clip": 1.05112219, "balance_loss_mlp": 1.00020409, "epoch": 0.2489027836229183, "flos": 22017622573440.0, "grad_norm": 2.4061095395558536, "language_loss": 0.68127394, "learning_rate": 3.518203480176086e-06, "loss": 0.70050311, "num_input_tokens_seen": 44235435, "step": 2070, "time_per_iteration": 3.7703821659088135 }, { "auxiliary_loss_clip": 0.01104322, "auxiliary_loss_mlp": 0.01061003, "balance_loss_clip": 1.04583335, "balance_loss_mlp": 1.03797197, "epoch": 0.2490230265135574, "flos": 23294354567040.0, "grad_norm": 1.7309455331823969, "language_loss": 0.80649668, "learning_rate": 3.517696276300545e-06, "loss": 0.82814991, "num_input_tokens_seen": 44256975, "step": 2071, "time_per_iteration": 2.837250232696533 }, { "auxiliary_loss_clip": 0.01168188, "auxiliary_loss_mlp": 0.01062529, "balance_loss_clip": 1.05864787, "balance_loss_mlp": 1.04043937, "epoch": 0.24914326940419648, "flos": 19826784339840.0, "grad_norm": 2.943917994346266, "language_loss": 0.69536954, "learning_rate": 3.517188842191685e-06, "loss": 0.71767676, "num_input_tokens_seen": 44275125, "step": 2072, "time_per_iteration": 2.711573839187622 }, { "auxiliary_loss_clip": 0.01165288, "auxiliary_loss_mlp": 0.01053472, "balance_loss_clip": 1.05268407, "balance_loss_mlp": 1.03003573, "epoch": 0.24926351229483557, "flos": 20229271211520.0, "grad_norm": 1.981043961978065, "language_loss": 0.73542368, "learning_rate": 3.5166811779264837e-06, "loss": 0.75761127, "num_input_tokens_seen": 44295445, "step": 2073, "time_per_iteration": 2.6903302669525146 }, { "auxiliary_loss_clip": 0.01179124, "auxiliary_loss_mlp": 0.01057521, "balance_loss_clip": 1.0540626, "balance_loss_mlp": 1.03440642, "epoch": 0.24938375518547465, "flos": 23294570048640.0, "grad_norm": 1.9976620244342622, "language_loss": 0.77858031, "learning_rate": 3.5161732835819545e-06, "loss": 0.80094671, "num_input_tokens_seen": 44314755, "step": 2074, "time_per_iteration": 3.5609095096588135 }, { "auxiliary_loss_clip": 0.01178801, "auxiliary_loss_mlp": 0.01059873, "balance_loss_clip": 1.05750299, "balance_loss_mlp": 1.03836751, "epoch": 0.24950399807611376, "flos": 17311673099520.0, "grad_norm": 2.2029030077743452, "language_loss": 0.83205259, "learning_rate": 3.515665159235143e-06, "loss": 0.85443938, "num_input_tokens_seen": 44333640, "step": 2075, "time_per_iteration": 2.6588823795318604 }, { "auxiliary_loss_clip": 0.01145696, "auxiliary_loss_mlp": 0.01049648, "balance_loss_clip": 1.04950631, "balance_loss_mlp": 1.02747488, "epoch": 0.24962424096675284, "flos": 19024863252480.0, "grad_norm": 1.6655229153528615, "language_loss": 0.75458801, "learning_rate": 3.5151568049631318e-06, "loss": 0.77654147, "num_input_tokens_seen": 44352355, "step": 2076, "time_per_iteration": 2.686830520629883 }, { "auxiliary_loss_clip": 0.01181565, "auxiliary_loss_mlp": 0.01055592, "balance_loss_clip": 1.05582738, "balance_loss_mlp": 1.033324, "epoch": 0.24974448385739192, "flos": 33398790710400.0, "grad_norm": 2.377634206093788, "language_loss": 0.80285841, "learning_rate": 3.5146482208430385e-06, "loss": 0.82523, "num_input_tokens_seen": 44374185, "step": 2077, "time_per_iteration": 2.726048707962036 }, { "auxiliary_loss_clip": 0.01100735, "auxiliary_loss_mlp": 0.01064403, "balance_loss_clip": 1.04125905, "balance_loss_mlp": 1.03835607, "epoch": 0.24986472674803104, "flos": 30007279532160.0, "grad_norm": 6.757871766481418, "language_loss": 0.67772794, "learning_rate": 3.514139406952014e-06, "loss": 0.69937932, "num_input_tokens_seen": 44396210, "step": 2078, "time_per_iteration": 2.809861421585083 }, { "auxiliary_loss_clip": 0.01165634, "auxiliary_loss_mlp": 0.01050564, "balance_loss_clip": 1.05568993, "balance_loss_mlp": 1.02908301, "epoch": 0.24998496963867012, "flos": 26613074833920.0, "grad_norm": 2.201300720012228, "language_loss": 0.83211958, "learning_rate": 3.5136303633672454e-06, "loss": 0.85428149, "num_input_tokens_seen": 44416340, "step": 2079, "time_per_iteration": 2.7373898029327393 }, { "auxiliary_loss_clip": 0.01148116, "auxiliary_loss_mlp": 0.00777648, "balance_loss_clip": 1.05371308, "balance_loss_mlp": 1.00019038, "epoch": 0.25010521252930923, "flos": 23553989049600.0, "grad_norm": 2.666438518672318, "language_loss": 0.74923962, "learning_rate": 3.5131210901659544e-06, "loss": 0.76849723, "num_input_tokens_seen": 44438095, "step": 2080, "time_per_iteration": 2.7324633598327637 }, { "auxiliary_loss_clip": 0.01133565, "auxiliary_loss_mlp": 0.01056662, "balance_loss_clip": 1.04821801, "balance_loss_mlp": 1.03392851, "epoch": 0.2502254554199483, "flos": 23441193365760.0, "grad_norm": 6.491670276408095, "language_loss": 0.82537758, "learning_rate": 3.5126115874253967e-06, "loss": 0.84727985, "num_input_tokens_seen": 44457650, "step": 2081, "time_per_iteration": 2.778879165649414 }, { "auxiliary_loss_clip": 0.0113885, "auxiliary_loss_mlp": 0.01056255, "balance_loss_clip": 1.05123973, "balance_loss_mlp": 1.03390384, "epoch": 0.2503456983105874, "flos": 28761681651840.0, "grad_norm": 1.9211003722954332, "language_loss": 0.81189585, "learning_rate": 3.5121018552228644e-06, "loss": 0.83384693, "num_input_tokens_seen": 44476155, "step": 2082, "time_per_iteration": 2.8594067096710205 }, { "auxiliary_loss_clip": 0.01140629, "auxiliary_loss_mlp": 0.01056683, "balance_loss_clip": 1.04887748, "balance_loss_mlp": 1.03355658, "epoch": 0.2504659412012265, "flos": 18770256673920.0, "grad_norm": 2.4218768129218575, "language_loss": 0.76605058, "learning_rate": 3.5115918936356827e-06, "loss": 0.78802377, "num_input_tokens_seen": 44492910, "step": 2083, "time_per_iteration": 2.8260679244995117 }, { "auxiliary_loss_clip": 0.01116407, "auxiliary_loss_mlp": 0.01058272, "balance_loss_clip": 1.04554796, "balance_loss_mlp": 1.03702879, "epoch": 0.25058618409186556, "flos": 16873383346560.0, "grad_norm": 2.098660447718772, "language_loss": 0.7897737, "learning_rate": 3.5110817027412123e-06, "loss": 0.81152052, "num_input_tokens_seen": 44512000, "step": 2084, "time_per_iteration": 2.727658987045288 }, { "auxiliary_loss_clip": 0.0113023, "auxiliary_loss_mlp": 0.01052891, "balance_loss_clip": 1.0457747, "balance_loss_mlp": 1.0293349, "epoch": 0.25070642698250467, "flos": 24425540651520.0, "grad_norm": 2.230454462894678, "language_loss": 0.6942842, "learning_rate": 3.5105712826168493e-06, "loss": 0.71611536, "num_input_tokens_seen": 44531650, "step": 2085, "time_per_iteration": 2.74692702293396 }, { "auxiliary_loss_clip": 0.01163518, "auxiliary_loss_mlp": 0.00776486, "balance_loss_clip": 1.05223334, "balance_loss_mlp": 1.00015855, "epoch": 0.2508266698731437, "flos": 20260944028800.0, "grad_norm": 3.7912043228602257, "language_loss": 0.71173263, "learning_rate": 3.5100606333400235e-06, "loss": 0.73113275, "num_input_tokens_seen": 44548785, "step": 2086, "time_per_iteration": 2.691012144088745 }, { "auxiliary_loss_clip": 0.01168451, "auxiliary_loss_mlp": 0.0106469, "balance_loss_clip": 1.0537827, "balance_loss_mlp": 1.04022884, "epoch": 0.25094691276378284, "flos": 19245318975360.0, "grad_norm": 2.0618921788769757, "language_loss": 0.77384764, "learning_rate": 3.5095497549882006e-06, "loss": 0.79617906, "num_input_tokens_seen": 44567230, "step": 2087, "time_per_iteration": 2.688614845275879 }, { "auxiliary_loss_clip": 0.01169337, "auxiliary_loss_mlp": 0.01055696, "balance_loss_clip": 1.05737281, "balance_loss_mlp": 1.03414309, "epoch": 0.25106715565442195, "flos": 26943237671040.0, "grad_norm": 2.2133303447412542, "language_loss": 0.72615898, "learning_rate": 3.50903864763888e-06, "loss": 0.74840927, "num_input_tokens_seen": 44588020, "step": 2088, "time_per_iteration": 2.7923390865325928 }, { "auxiliary_loss_clip": 0.01168598, "auxiliary_loss_mlp": 0.01058597, "balance_loss_clip": 1.05282974, "balance_loss_mlp": 1.0348146, "epoch": 0.251187398545061, "flos": 48359570572800.0, "grad_norm": 3.103326712316903, "language_loss": 0.76315528, "learning_rate": 3.5085273113695965e-06, "loss": 0.78542721, "num_input_tokens_seen": 44612590, "step": 2089, "time_per_iteration": 2.895125150680542 }, { "auxiliary_loss_clip": 0.01181476, "auxiliary_loss_mlp": 0.01055718, "balance_loss_clip": 1.05613279, "balance_loss_mlp": 1.03262711, "epoch": 0.2513076414357001, "flos": 27016100409600.0, "grad_norm": 2.3305223250607816, "language_loss": 0.78534847, "learning_rate": 3.508015746257919e-06, "loss": 0.80772042, "num_input_tokens_seen": 44631630, "step": 2090, "time_per_iteration": 2.7238988876342773 }, { "auxiliary_loss_clip": 0.01141645, "auxiliary_loss_mlp": 0.01052896, "balance_loss_clip": 1.05154455, "balance_loss_mlp": 1.03071165, "epoch": 0.2514278843263392, "flos": 19463619882240.0, "grad_norm": 2.126316254247374, "language_loss": 0.83201504, "learning_rate": 3.5075039523814518e-06, "loss": 0.85396051, "num_input_tokens_seen": 44650820, "step": 2091, "time_per_iteration": 2.7669246196746826 }, { "auxiliary_loss_clip": 0.01168997, "auxiliary_loss_mlp": 0.01060977, "balance_loss_clip": 1.05342829, "balance_loss_mlp": 1.03888774, "epoch": 0.2515481272169783, "flos": 16866092885760.0, "grad_norm": 2.544708494625553, "language_loss": 0.81867963, "learning_rate": 3.506991929817834e-06, "loss": 0.84097934, "num_input_tokens_seen": 44667540, "step": 2092, "time_per_iteration": 3.634234666824341 }, { "auxiliary_loss_clip": 0.01176745, "auxiliary_loss_mlp": 0.01060365, "balance_loss_clip": 1.05626321, "balance_loss_mlp": 1.03826332, "epoch": 0.2516683701076174, "flos": 23732464752000.0, "grad_norm": 1.945344634461846, "language_loss": 0.83162475, "learning_rate": 3.506479678644738e-06, "loss": 0.85399586, "num_input_tokens_seen": 44687935, "step": 2093, "time_per_iteration": 2.6898019313812256 }, { "auxiliary_loss_clip": 0.01116905, "auxiliary_loss_mlp": 0.01060511, "balance_loss_clip": 1.04541802, "balance_loss_mlp": 1.03689551, "epoch": 0.2517886129982565, "flos": 27635954434560.0, "grad_norm": 3.047536469215388, "language_loss": 0.74102855, "learning_rate": 3.505967198939873e-06, "loss": 0.76280272, "num_input_tokens_seen": 44704975, "step": 2094, "time_per_iteration": 2.7939133644104004 }, { "auxiliary_loss_clip": 0.01149986, "auxiliary_loss_mlp": 0.01047446, "balance_loss_clip": 1.04960012, "balance_loss_mlp": 1.02516556, "epoch": 0.25190885588889556, "flos": 38104596529920.0, "grad_norm": 2.0208678565210634, "language_loss": 0.78374606, "learning_rate": 3.5054544907809813e-06, "loss": 0.80572039, "num_input_tokens_seen": 44725475, "step": 2095, "time_per_iteration": 3.8447439670562744 }, { "auxiliary_loss_clip": 0.01154657, "auxiliary_loss_mlp": 0.00778362, "balance_loss_clip": 1.05487001, "balance_loss_mlp": 1.00015903, "epoch": 0.25202909877953467, "flos": 22269894768000.0, "grad_norm": 4.268122055203341, "language_loss": 0.80060303, "learning_rate": 3.50494155424584e-06, "loss": 0.81993318, "num_input_tokens_seen": 44744380, "step": 2096, "time_per_iteration": 3.7593133449554443 }, { "auxiliary_loss_clip": 0.01168668, "auxiliary_loss_mlp": 0.01060071, "balance_loss_clip": 1.05328608, "balance_loss_mlp": 1.0360384, "epoch": 0.2521493416701738, "flos": 21761759018880.0, "grad_norm": 1.587661526943013, "language_loss": 0.83105695, "learning_rate": 3.504428389412262e-06, "loss": 0.85334438, "num_input_tokens_seen": 44765190, "step": 2097, "time_per_iteration": 2.663766622543335 }, { "auxiliary_loss_clip": 0.01161866, "auxiliary_loss_mlp": 0.0105954, "balance_loss_clip": 1.05243158, "balance_loss_mlp": 1.03768873, "epoch": 0.25226958456081283, "flos": 27746738956800.0, "grad_norm": 2.2084555757190985, "language_loss": 0.72897637, "learning_rate": 3.5039149963580927e-06, "loss": 0.75119042, "num_input_tokens_seen": 44785210, "step": 2098, "time_per_iteration": 2.7121224403381348 }, { "auxiliary_loss_clip": 0.01145071, "auxiliary_loss_mlp": 0.01047671, "balance_loss_clip": 1.05127501, "balance_loss_mlp": 1.02595139, "epoch": 0.25238982745145194, "flos": 30732171903360.0, "grad_norm": 2.0908623966666844, "language_loss": 0.703192, "learning_rate": 3.503401375161215e-06, "loss": 0.72511935, "num_input_tokens_seen": 44804955, "step": 2099, "time_per_iteration": 2.7728562355041504 }, { "auxiliary_loss_clip": 0.01175721, "auxiliary_loss_mlp": 0.01048644, "balance_loss_clip": 1.05546927, "balance_loss_mlp": 1.02719843, "epoch": 0.252510070342091, "flos": 20266331068800.0, "grad_norm": 1.609801620733544, "language_loss": 0.83677065, "learning_rate": 3.502887525899544e-06, "loss": 0.85901433, "num_input_tokens_seen": 44823935, "step": 2100, "time_per_iteration": 3.579871416091919 }, { "auxiliary_loss_clip": 0.01150035, "auxiliary_loss_mlp": 0.01059801, "balance_loss_clip": 1.05094767, "balance_loss_mlp": 1.03687716, "epoch": 0.2526303132327301, "flos": 22747399194240.0, "grad_norm": 1.7534962366390316, "language_loss": 0.83059251, "learning_rate": 3.50237344865103e-06, "loss": 0.85269094, "num_input_tokens_seen": 44844935, "step": 2101, "time_per_iteration": 2.826176643371582 }, { "auxiliary_loss_clip": 0.01183073, "auxiliary_loss_mlp": 0.0105439, "balance_loss_clip": 1.05648887, "balance_loss_mlp": 1.03277755, "epoch": 0.2527505561233692, "flos": 30263466309120.0, "grad_norm": 1.9886922293676232, "language_loss": 0.76573008, "learning_rate": 3.501859143493658e-06, "loss": 0.78810465, "num_input_tokens_seen": 44865565, "step": 2102, "time_per_iteration": 2.7739083766937256 }, { "auxiliary_loss_clip": 0.01074511, "auxiliary_loss_mlp": 0.01010225, "balance_loss_clip": 1.03082037, "balance_loss_mlp": 1.0076623, "epoch": 0.2528707990140083, "flos": 58492917164160.0, "grad_norm": 0.9238195379715086, "language_loss": 0.60532117, "learning_rate": 3.5013446105054488e-06, "loss": 0.62616849, "num_input_tokens_seen": 44918485, "step": 2103, "time_per_iteration": 3.027036666870117 }, { "auxiliary_loss_clip": 0.0112386, "auxiliary_loss_mlp": 0.01055897, "balance_loss_clip": 1.04871583, "balance_loss_mlp": 1.03380775, "epoch": 0.2529910419046474, "flos": 24645134448000.0, "grad_norm": 1.816090404379078, "language_loss": 0.74659085, "learning_rate": 3.5008298497644555e-06, "loss": 0.76838839, "num_input_tokens_seen": 44937530, "step": 2104, "time_per_iteration": 2.8396389484405518 }, { "auxiliary_loss_clip": 0.0113962, "auxiliary_loss_mlp": 0.01055814, "balance_loss_clip": 1.04985929, "balance_loss_mlp": 1.0341177, "epoch": 0.2531112847952865, "flos": 23842135952640.0, "grad_norm": 2.0046121962631247, "language_loss": 0.88049692, "learning_rate": 3.500314861348767e-06, "loss": 0.90245122, "num_input_tokens_seen": 44958165, "step": 2105, "time_per_iteration": 2.834134578704834 }, { "auxiliary_loss_clip": 0.0113065, "auxiliary_loss_mlp": 0.01072432, "balance_loss_clip": 1.05110145, "balance_loss_mlp": 1.04813766, "epoch": 0.25323152768592555, "flos": 16143822207360.0, "grad_norm": 2.2337641139035647, "language_loss": 0.7742939, "learning_rate": 3.499799645336507e-06, "loss": 0.79632473, "num_input_tokens_seen": 44975060, "step": 2106, "time_per_iteration": 2.837430238723755 }, { "auxiliary_loss_clip": 0.01165017, "auxiliary_loss_mlp": 0.01045412, "balance_loss_clip": 1.05458236, "balance_loss_mlp": 1.02483642, "epoch": 0.25335177057656466, "flos": 28405161210240.0, "grad_norm": 1.4568378874302486, "language_loss": 0.86946464, "learning_rate": 3.4992842018058336e-06, "loss": 0.89156884, "num_input_tokens_seen": 44997960, "step": 2107, "time_per_iteration": 2.8235208988189697 }, { "auxiliary_loss_clip": 0.01147262, "auxiliary_loss_mlp": 0.01062267, "balance_loss_clip": 1.05089438, "balance_loss_mlp": 1.03986716, "epoch": 0.25347201346720377, "flos": 18799666934400.0, "grad_norm": 2.3831416985133167, "language_loss": 0.88665497, "learning_rate": 3.4987685308349384e-06, "loss": 0.90875024, "num_input_tokens_seen": 45015690, "step": 2108, "time_per_iteration": 2.806622266769409 }, { "auxiliary_loss_clip": 0.01134639, "auxiliary_loss_mlp": 0.01066764, "balance_loss_clip": 1.04777157, "balance_loss_mlp": 1.0421952, "epoch": 0.2535922563578428, "flos": 15815490963840.0, "grad_norm": 2.6829325320312765, "language_loss": 0.61343902, "learning_rate": 3.4982526325020497e-06, "loss": 0.6354531, "num_input_tokens_seen": 45032660, "step": 2109, "time_per_iteration": 2.752070665359497 }, { "auxiliary_loss_clip": 0.01154625, "auxiliary_loss_mlp": 0.0104792, "balance_loss_clip": 1.05333877, "balance_loss_mlp": 1.02605712, "epoch": 0.25371249924848194, "flos": 16318922031360.0, "grad_norm": 2.574315821839663, "language_loss": 0.82149005, "learning_rate": 3.4977365068854273e-06, "loss": 0.84351557, "num_input_tokens_seen": 45048280, "step": 2110, "time_per_iteration": 2.7903265953063965 }, { "auxiliary_loss_clip": 0.01143441, "auxiliary_loss_mlp": 0.01057052, "balance_loss_clip": 1.0513339, "balance_loss_mlp": 1.03545165, "epoch": 0.25383274213912105, "flos": 21761615364480.0, "grad_norm": 1.9714232736690296, "language_loss": 0.73977041, "learning_rate": 3.4972201540633676e-06, "loss": 0.76177537, "num_input_tokens_seen": 45067635, "step": 2111, "time_per_iteration": 2.8147754669189453 }, { "auxiliary_loss_clip": 0.01145543, "auxiliary_loss_mlp": 0.01063557, "balance_loss_clip": 1.05114186, "balance_loss_mlp": 1.03969169, "epoch": 0.2539529850297601, "flos": 21396870708480.0, "grad_norm": 2.6890022361350545, "language_loss": 0.85250837, "learning_rate": 3.4967035741142008e-06, "loss": 0.8745994, "num_input_tokens_seen": 45086455, "step": 2112, "time_per_iteration": 2.7504804134368896 }, { "auxiliary_loss_clip": 0.01138035, "auxiliary_loss_mlp": 0.01057915, "balance_loss_clip": 1.05543184, "balance_loss_mlp": 1.03770876, "epoch": 0.2540732279203992, "flos": 25228467319680.0, "grad_norm": 2.248982031415634, "language_loss": 0.81947476, "learning_rate": 3.4961867671162917e-06, "loss": 0.84143424, "num_input_tokens_seen": 45106385, "step": 2113, "time_per_iteration": 2.7481560707092285 }, { "auxiliary_loss_clip": 0.0117728, "auxiliary_loss_mlp": 0.01053095, "balance_loss_clip": 1.05322552, "balance_loss_mlp": 1.0307312, "epoch": 0.2541934708110383, "flos": 19427386037760.0, "grad_norm": 3.569256004766985, "language_loss": 0.76774418, "learning_rate": 3.4956697331480402e-06, "loss": 0.790048, "num_input_tokens_seen": 45124955, "step": 2114, "time_per_iteration": 2.707498788833618 }, { "auxiliary_loss_clip": 0.01142211, "auxiliary_loss_mlp": 0.01057377, "balance_loss_clip": 1.04990935, "balance_loss_mlp": 1.03423846, "epoch": 0.2543137137016774, "flos": 23949436855680.0, "grad_norm": 1.6963350982469378, "language_loss": 0.80340308, "learning_rate": 3.495152472287879e-06, "loss": 0.82539898, "num_input_tokens_seen": 45145665, "step": 2115, "time_per_iteration": 2.8164918422698975 }, { "auxiliary_loss_clip": 0.01142557, "auxiliary_loss_mlp": 0.0105439, "balance_loss_clip": 1.053159, "balance_loss_mlp": 1.03207374, "epoch": 0.2544339565923165, "flos": 25593283802880.0, "grad_norm": 1.944198632399214, "language_loss": 0.74029684, "learning_rate": 3.4946349846142766e-06, "loss": 0.76226628, "num_input_tokens_seen": 45164805, "step": 2116, "time_per_iteration": 2.7776248455047607 }, { "auxiliary_loss_clip": 0.01179715, "auxiliary_loss_mlp": 0.01046534, "balance_loss_clip": 1.05668831, "balance_loss_mlp": 1.02531445, "epoch": 0.25455419948295555, "flos": 21689470897920.0, "grad_norm": 1.8821398390282449, "language_loss": 0.75721455, "learning_rate": 3.4941172702057353e-06, "loss": 0.779477, "num_input_tokens_seen": 45184865, "step": 2117, "time_per_iteration": 2.7481861114501953 }, { "auxiliary_loss_clip": 0.01155916, "auxiliary_loss_mlp": 0.01054283, "balance_loss_clip": 1.0572561, "balance_loss_mlp": 1.03264654, "epoch": 0.25467444237359466, "flos": 26250341339520.0, "grad_norm": 2.802548898064399, "language_loss": 0.80236208, "learning_rate": 3.4935993291407924e-06, "loss": 0.82446408, "num_input_tokens_seen": 45203690, "step": 2118, "time_per_iteration": 3.680609941482544 }, { "auxiliary_loss_clip": 0.01144545, "auxiliary_loss_mlp": 0.01056847, "balance_loss_clip": 1.05007982, "balance_loss_mlp": 1.03463817, "epoch": 0.25479468526423377, "flos": 26979686997120.0, "grad_norm": 2.370686044242973, "language_loss": 0.71275872, "learning_rate": 3.4930811614980183e-06, "loss": 0.73477268, "num_input_tokens_seen": 45225385, "step": 2119, "time_per_iteration": 2.8190553188323975 }, { "auxiliary_loss_clip": 0.01157935, "auxiliary_loss_mlp": 0.01056321, "balance_loss_clip": 1.05210829, "balance_loss_mlp": 1.03429151, "epoch": 0.2549149281548728, "flos": 23475811098240.0, "grad_norm": 1.698879655349236, "language_loss": 0.79065955, "learning_rate": 3.4925627673560198e-06, "loss": 0.81280208, "num_input_tokens_seen": 45246045, "step": 2120, "time_per_iteration": 2.7386507987976074 }, { "auxiliary_loss_clip": 0.01138962, "auxiliary_loss_mlp": 0.01052057, "balance_loss_clip": 1.05144668, "balance_loss_mlp": 1.0300746, "epoch": 0.25503517104551193, "flos": 25812302981760.0, "grad_norm": 1.7116951807544698, "language_loss": 0.88375705, "learning_rate": 3.4920441467934357e-06, "loss": 0.90566725, "num_input_tokens_seen": 45266560, "step": 2121, "time_per_iteration": 2.8003361225128174 }, { "auxiliary_loss_clip": 0.0112452, "auxiliary_loss_mlp": 0.01050387, "balance_loss_clip": 1.0468508, "balance_loss_mlp": 1.02892971, "epoch": 0.25515541393615104, "flos": 26645106787200.0, "grad_norm": 2.1753079487282956, "language_loss": 0.8251105, "learning_rate": 3.491525299888941e-06, "loss": 0.84685957, "num_input_tokens_seen": 45285405, "step": 2122, "time_per_iteration": 3.7675986289978027 }, { "auxiliary_loss_clip": 0.01041954, "auxiliary_loss_mlp": 0.00756789, "balance_loss_clip": 1.02803254, "balance_loss_mlp": 1.00012362, "epoch": 0.2552756568267901, "flos": 65955945847680.0, "grad_norm": 0.8863081642403734, "language_loss": 0.62690121, "learning_rate": 3.491006226721244e-06, "loss": 0.64488864, "num_input_tokens_seen": 45349615, "step": 2123, "time_per_iteration": 3.2984426021575928 }, { "auxiliary_loss_clip": 0.01154953, "auxiliary_loss_mlp": 0.00777152, "balance_loss_clip": 1.05787992, "balance_loss_mlp": 1.00029707, "epoch": 0.2553958997174292, "flos": 17931096161280.0, "grad_norm": 2.243420491633644, "language_loss": 0.78130847, "learning_rate": 3.4904869273690882e-06, "loss": 0.80062956, "num_input_tokens_seen": 45367505, "step": 2124, "time_per_iteration": 2.746201753616333 }, { "auxiliary_loss_clip": 0.01169036, "auxiliary_loss_mlp": 0.01051119, "balance_loss_clip": 1.05618691, "balance_loss_mlp": 1.0296973, "epoch": 0.2555161426080683, "flos": 23367791923200.0, "grad_norm": 2.4618951497532398, "language_loss": 0.88727146, "learning_rate": 3.489967401911251e-06, "loss": 0.90947306, "num_input_tokens_seen": 45386805, "step": 2125, "time_per_iteration": 2.703204393386841 }, { "auxiliary_loss_clip": 0.01184696, "auxiliary_loss_mlp": 0.01049404, "balance_loss_clip": 1.05736804, "balance_loss_mlp": 1.02540779, "epoch": 0.2556363854987074, "flos": 40625130723840.0, "grad_norm": 1.5879452987697953, "language_loss": 0.69365621, "learning_rate": 3.4894476504265428e-06, "loss": 0.71599722, "num_input_tokens_seen": 45411045, "step": 2126, "time_per_iteration": 3.5857932567596436 }, { "auxiliary_loss_clip": 0.01055261, "auxiliary_loss_mlp": 0.01003096, "balance_loss_clip": 1.02244616, "balance_loss_mlp": 1.00062788, "epoch": 0.2557566283893465, "flos": 68019443389440.0, "grad_norm": 9.653067720985353, "language_loss": 0.54382169, "learning_rate": 3.4889276729938104e-06, "loss": 0.56440526, "num_input_tokens_seen": 45469575, "step": 2127, "time_per_iteration": 3.183098077774048 }, { "auxiliary_loss_clip": 0.01152411, "auxiliary_loss_mlp": 0.0105204, "balance_loss_clip": 1.05521095, "balance_loss_mlp": 1.0309639, "epoch": 0.2558768712799856, "flos": 22635645004800.0, "grad_norm": 2.6570681437584502, "language_loss": 0.80699444, "learning_rate": 3.488407469691934e-06, "loss": 0.82903898, "num_input_tokens_seen": 45490270, "step": 2128, "time_per_iteration": 2.664337158203125 }, { "auxiliary_loss_clip": 0.01146846, "auxiliary_loss_mlp": 0.0105528, "balance_loss_clip": 1.0488565, "balance_loss_mlp": 1.0318315, "epoch": 0.25599711417062465, "flos": 26396354125440.0, "grad_norm": 4.0569201418063345, "language_loss": 0.8074379, "learning_rate": 3.487887040599828e-06, "loss": 0.82945919, "num_input_tokens_seen": 45510070, "step": 2129, "time_per_iteration": 2.756997585296631 }, { "auxiliary_loss_clip": 0.01184439, "auxiliary_loss_mlp": 0.01068222, "balance_loss_clip": 1.05877542, "balance_loss_mlp": 1.04697871, "epoch": 0.25611735706126376, "flos": 22852042490880.0, "grad_norm": 2.73274996592111, "language_loss": 0.76234806, "learning_rate": 3.4873663857964407e-06, "loss": 0.78487468, "num_input_tokens_seen": 45527285, "step": 2130, "time_per_iteration": 2.670480251312256 }, { "auxiliary_loss_clip": 0.01127587, "auxiliary_loss_mlp": 0.01044536, "balance_loss_clip": 1.05428791, "balance_loss_mlp": 1.02351904, "epoch": 0.2562375999519028, "flos": 23367863750400.0, "grad_norm": 1.9853175520373396, "language_loss": 0.66536415, "learning_rate": 3.4868455053607556e-06, "loss": 0.68708539, "num_input_tokens_seen": 45546900, "step": 2131, "time_per_iteration": 2.807893753051758 }, { "auxiliary_loss_clip": 0.01170971, "auxiliary_loss_mlp": 0.01071581, "balance_loss_clip": 1.05501962, "balance_loss_mlp": 1.04590297, "epoch": 0.2563578428425419, "flos": 22856962654080.0, "grad_norm": 2.0984957642123163, "language_loss": 0.71815723, "learning_rate": 3.486324399371789e-06, "loss": 0.7405827, "num_input_tokens_seen": 45566200, "step": 2132, "time_per_iteration": 2.824913263320923 }, { "auxiliary_loss_clip": 0.01133023, "auxiliary_loss_mlp": 0.01056909, "balance_loss_clip": 1.05276334, "balance_loss_mlp": 1.03691745, "epoch": 0.25647808573318104, "flos": 21653883498240.0, "grad_norm": 2.085148479455921, "language_loss": 0.78719866, "learning_rate": 3.485803067908593e-06, "loss": 0.80909801, "num_input_tokens_seen": 45585710, "step": 2133, "time_per_iteration": 2.792893171310425 }, { "auxiliary_loss_clip": 0.01086031, "auxiliary_loss_mlp": 0.0106251, "balance_loss_clip": 1.03924501, "balance_loss_mlp": 1.04021811, "epoch": 0.2565983286238201, "flos": 33730569659520.0, "grad_norm": 1.8800373951577634, "language_loss": 0.79773808, "learning_rate": 3.485281511050253e-06, "loss": 0.81922352, "num_input_tokens_seen": 45607845, "step": 2134, "time_per_iteration": 2.994410514831543 }, { "auxiliary_loss_clip": 0.01163458, "auxiliary_loss_mlp": 0.01051334, "balance_loss_clip": 1.05270767, "balance_loss_mlp": 1.02960253, "epoch": 0.2567185715144592, "flos": 16216002587520.0, "grad_norm": 2.4372751191286506, "language_loss": 0.89819086, "learning_rate": 3.484759728875889e-06, "loss": 0.92033875, "num_input_tokens_seen": 45623210, "step": 2135, "time_per_iteration": 2.744702100753784 }, { "auxiliary_loss_clip": 0.01111545, "auxiliary_loss_mlp": 0.01056851, "balance_loss_clip": 1.04729831, "balance_loss_mlp": 1.03672838, "epoch": 0.2568388144050983, "flos": 17458475984640.0, "grad_norm": 1.933765115944544, "language_loss": 0.8110193, "learning_rate": 3.4842377214646543e-06, "loss": 0.83270323, "num_input_tokens_seen": 45641505, "step": 2136, "time_per_iteration": 2.9271111488342285 }, { "auxiliary_loss_clip": 0.01177121, "auxiliary_loss_mlp": 0.01048319, "balance_loss_clip": 1.05640984, "balance_loss_mlp": 1.0265038, "epoch": 0.25695905729573737, "flos": 20887442069760.0, "grad_norm": 1.7946651268381286, "language_loss": 0.66808432, "learning_rate": 3.483715488895737e-06, "loss": 0.69033873, "num_input_tokens_seen": 45661835, "step": 2137, "time_per_iteration": 2.639573812484741 }, { "auxiliary_loss_clip": 0.0112186, "auxiliary_loss_mlp": 0.01050239, "balance_loss_clip": 1.04570818, "balance_loss_mlp": 1.02820885, "epoch": 0.2570793001863765, "flos": 24717278914560.0, "grad_norm": 1.8873675996179429, "language_loss": 0.78467989, "learning_rate": 3.48319303124836e-06, "loss": 0.8064009, "num_input_tokens_seen": 45682215, "step": 2138, "time_per_iteration": 2.8003742694854736 }, { "auxiliary_loss_clip": 0.01149215, "auxiliary_loss_mlp": 0.01050191, "balance_loss_clip": 1.05321431, "balance_loss_mlp": 1.02810121, "epoch": 0.2571995430770156, "flos": 26906896085760.0, "grad_norm": 2.1255049776019668, "language_loss": 0.66758579, "learning_rate": 3.4826703486017798e-06, "loss": 0.68957984, "num_input_tokens_seen": 45701840, "step": 2139, "time_per_iteration": 2.8201210498809814 }, { "auxiliary_loss_clip": 0.01161612, "auxiliary_loss_mlp": 0.01053422, "balance_loss_clip": 1.05329108, "balance_loss_mlp": 1.03102231, "epoch": 0.25731978596765465, "flos": 19792561656960.0, "grad_norm": 1.765198013454836, "language_loss": 0.7733674, "learning_rate": 3.4821474410352867e-06, "loss": 0.79551768, "num_input_tokens_seen": 45720500, "step": 2140, "time_per_iteration": 2.670071601867676 }, { "auxiliary_loss_clip": 0.0104076, "auxiliary_loss_mlp": 0.01026473, "balance_loss_clip": 1.02864909, "balance_loss_mlp": 1.02311182, "epoch": 0.25744002885829376, "flos": 70564970471040.0, "grad_norm": 0.897542021166421, "language_loss": 0.62744176, "learning_rate": 3.481624308628205e-06, "loss": 0.64811414, "num_input_tokens_seen": 45781870, "step": 2141, "time_per_iteration": 3.436234474182129 }, { "auxiliary_loss_clip": 0.01150156, "auxiliary_loss_mlp": 0.01066474, "balance_loss_clip": 1.05215394, "balance_loss_mlp": 1.04430079, "epoch": 0.25756027174893287, "flos": 18038181582720.0, "grad_norm": 8.334122136877003, "language_loss": 0.9977681, "learning_rate": 3.481100951459893e-06, "loss": 1.01993442, "num_input_tokens_seen": 45794890, "step": 2142, "time_per_iteration": 2.6821396350860596 }, { "auxiliary_loss_clip": 0.01160605, "auxiliary_loss_mlp": 0.01045813, "balance_loss_clip": 1.05219758, "balance_loss_mlp": 1.02340138, "epoch": 0.2576805146395719, "flos": 22674069578880.0, "grad_norm": 1.875852183149724, "language_loss": 0.78790593, "learning_rate": 3.4805773696097453e-06, "loss": 0.80997008, "num_input_tokens_seen": 45815780, "step": 2143, "time_per_iteration": 2.755647897720337 }, { "auxiliary_loss_clip": 0.01144533, "auxiliary_loss_mlp": 0.01052285, "balance_loss_clip": 1.05421901, "balance_loss_mlp": 1.0300405, "epoch": 0.25780075753021103, "flos": 16472225278080.0, "grad_norm": 2.055042555402434, "language_loss": 0.87876153, "learning_rate": 3.4800535631571874e-06, "loss": 0.90072966, "num_input_tokens_seen": 45831310, "step": 2144, "time_per_iteration": 3.7452807426452637 }, { "auxiliary_loss_clip": 0.0115844, "auxiliary_loss_mlp": 0.01058443, "balance_loss_clip": 1.0546844, "balance_loss_mlp": 1.03470826, "epoch": 0.25792100042085014, "flos": 22820297846400.0, "grad_norm": 2.8480802830073393, "language_loss": 0.76068181, "learning_rate": 3.4795295321816804e-06, "loss": 0.78285062, "num_input_tokens_seen": 45850135, "step": 2145, "time_per_iteration": 2.781001567840576 }, { "auxiliary_loss_clip": 0.01138562, "auxiliary_loss_mlp": 0.01053737, "balance_loss_clip": 1.0496316, "balance_loss_mlp": 1.03157544, "epoch": 0.2580412433114892, "flos": 18697286194560.0, "grad_norm": 2.080536782332373, "language_loss": 0.90885288, "learning_rate": 3.47900527676272e-06, "loss": 0.93077588, "num_input_tokens_seen": 45868470, "step": 2146, "time_per_iteration": 2.8543617725372314 }, { "auxiliary_loss_clip": 0.0118144, "auxiliary_loss_mlp": 0.01053987, "balance_loss_clip": 1.05891037, "balance_loss_mlp": 1.03090858, "epoch": 0.2581614862021283, "flos": 14283146810880.0, "grad_norm": 1.9701191229555692, "language_loss": 0.88676405, "learning_rate": 3.478480796979835e-06, "loss": 0.90911841, "num_input_tokens_seen": 45886355, "step": 2147, "time_per_iteration": 2.71502423286438 }, { "auxiliary_loss_clip": 0.0114583, "auxiliary_loss_mlp": 0.01056102, "balance_loss_clip": 1.05105412, "balance_loss_mlp": 1.033059, "epoch": 0.25828172909276736, "flos": 29498281856640.0, "grad_norm": 1.874157217018246, "language_loss": 0.77936381, "learning_rate": 3.4779560929125894e-06, "loss": 0.80138314, "num_input_tokens_seen": 45907900, "step": 2148, "time_per_iteration": 3.903233051300049 }, { "auxiliary_loss_clip": 0.01032321, "auxiliary_loss_mlp": 0.01008532, "balance_loss_clip": 1.02071643, "balance_loss_mlp": 1.00585008, "epoch": 0.2584019719834065, "flos": 67114387376640.0, "grad_norm": 0.6691116932463965, "language_loss": 0.56927335, "learning_rate": 3.4774311646405783e-06, "loss": 0.58968198, "num_input_tokens_seen": 45977805, "step": 2149, "time_per_iteration": 3.408961534500122 }, { "auxiliary_loss_clip": 0.01128165, "auxiliary_loss_mlp": 0.01054323, "balance_loss_clip": 1.05038464, "balance_loss_mlp": 1.03111243, "epoch": 0.2585222148740456, "flos": 22893555634560.0, "grad_norm": 1.9966355662390545, "language_loss": 0.8358205, "learning_rate": 3.476906012243435e-06, "loss": 0.85764539, "num_input_tokens_seen": 45996715, "step": 2150, "time_per_iteration": 2.8099875450134277 }, { "auxiliary_loss_clip": 0.01156024, "auxiliary_loss_mlp": 0.01047958, "balance_loss_clip": 1.05421329, "balance_loss_mlp": 1.0259043, "epoch": 0.25864245776468464, "flos": 28909202808960.0, "grad_norm": 1.5885413411162832, "language_loss": 0.81284887, "learning_rate": 3.476380635800824e-06, "loss": 0.8348887, "num_input_tokens_seen": 46017915, "step": 2151, "time_per_iteration": 2.778433322906494 }, { "auxiliary_loss_clip": 0.01151605, "auxiliary_loss_mlp": 0.01049898, "balance_loss_clip": 1.05577302, "balance_loss_mlp": 1.0283097, "epoch": 0.25876270065532375, "flos": 14793185980800.0, "grad_norm": 2.306397951206394, "language_loss": 0.85829639, "learning_rate": 3.475855035392444e-06, "loss": 0.88031149, "num_input_tokens_seen": 46033235, "step": 2152, "time_per_iteration": 3.602783441543579 }, { "auxiliary_loss_clip": 0.01106687, "auxiliary_loss_mlp": 0.01049461, "balance_loss_clip": 1.04881108, "balance_loss_mlp": 1.02716851, "epoch": 0.25888294354596286, "flos": 60467821810560.0, "grad_norm": 3.8564117491274215, "language_loss": 0.71430689, "learning_rate": 3.475329211098029e-06, "loss": 0.73586839, "num_input_tokens_seen": 46056390, "step": 2153, "time_per_iteration": 3.1350085735321045 }, { "auxiliary_loss_clip": 0.01134011, "auxiliary_loss_mlp": 0.01056634, "balance_loss_clip": 1.05583477, "balance_loss_mlp": 1.03409195, "epoch": 0.2590031864366019, "flos": 27851166771840.0, "grad_norm": 2.4041178002121506, "language_loss": 0.82601571, "learning_rate": 3.4748031629973453e-06, "loss": 0.84792221, "num_input_tokens_seen": 46077120, "step": 2154, "time_per_iteration": 2.8376495838165283 }, { "auxiliary_loss_clip": 0.01022634, "auxiliary_loss_mlp": 0.01003212, "balance_loss_clip": 1.02328873, "balance_loss_mlp": 1.00013661, "epoch": 0.25912342932724103, "flos": 62422444206720.0, "grad_norm": 0.9160342267489127, "language_loss": 0.56645125, "learning_rate": 3.4742768911701944e-06, "loss": 0.58670974, "num_input_tokens_seen": 46139815, "step": 2155, "time_per_iteration": 3.4331297874450684 }, { "auxiliary_loss_clip": 0.01174414, "auxiliary_loss_mlp": 0.01068526, "balance_loss_clip": 1.0588572, "balance_loss_mlp": 1.04266953, "epoch": 0.25924367221788014, "flos": 12378839368320.0, "grad_norm": 9.76678147193117, "language_loss": 0.70792937, "learning_rate": 3.4737503956964113e-06, "loss": 0.73035884, "num_input_tokens_seen": 46152120, "step": 2156, "time_per_iteration": 2.674590587615967 }, { "auxiliary_loss_clip": 0.0114935, "auxiliary_loss_mlp": 0.0105963, "balance_loss_clip": 1.05368257, "balance_loss_mlp": 1.0337497, "epoch": 0.2593639151085192, "flos": 14575208296320.0, "grad_norm": 2.1029917799351905, "language_loss": 0.67778814, "learning_rate": 3.473223676655865e-06, "loss": 0.69987798, "num_input_tokens_seen": 46170120, "step": 2157, "time_per_iteration": 2.691101551055908 }, { "auxiliary_loss_clip": 0.01145905, "auxiliary_loss_mlp": 0.01053855, "balance_loss_clip": 1.05341268, "balance_loss_mlp": 1.02901173, "epoch": 0.2594841579991583, "flos": 15230937029760.0, "grad_norm": 2.0566593828757678, "language_loss": 0.79968297, "learning_rate": 3.472696734128459e-06, "loss": 0.82168055, "num_input_tokens_seen": 46187985, "step": 2158, "time_per_iteration": 2.722317934036255 }, { "auxiliary_loss_clip": 0.01170615, "auxiliary_loss_mlp": 0.01058909, "balance_loss_clip": 1.05832553, "balance_loss_mlp": 1.03603268, "epoch": 0.2596044008897974, "flos": 23623583650560.0, "grad_norm": 1.7169391253507156, "language_loss": 0.75982285, "learning_rate": 3.4721695681941286e-06, "loss": 0.78211802, "num_input_tokens_seen": 46207025, "step": 2159, "time_per_iteration": 2.7959036827087402 }, { "auxiliary_loss_clip": 0.01149445, "auxiliary_loss_mlp": 0.00777322, "balance_loss_clip": 1.05346501, "balance_loss_mlp": 1.0003047, "epoch": 0.25972464378043647, "flos": 13772281628160.0, "grad_norm": 2.412805404553067, "language_loss": 0.82534277, "learning_rate": 3.471642178932845e-06, "loss": 0.84461045, "num_input_tokens_seen": 46225670, "step": 2160, "time_per_iteration": 2.6861774921417236 }, { "auxiliary_loss_clip": 0.01155114, "auxiliary_loss_mlp": 0.01055703, "balance_loss_clip": 1.05329418, "balance_loss_mlp": 1.03323245, "epoch": 0.2598448866710756, "flos": 19573578391680.0, "grad_norm": 2.745732879901547, "language_loss": 0.89599049, "learning_rate": 3.471114566424613e-06, "loss": 0.91809869, "num_input_tokens_seen": 46244130, "step": 2161, "time_per_iteration": 2.754741907119751 }, { "auxiliary_loss_clip": 0.01152318, "auxiliary_loss_mlp": 0.01056408, "balance_loss_clip": 1.05567718, "balance_loss_mlp": 1.03328145, "epoch": 0.25996512956171464, "flos": 21653237053440.0, "grad_norm": 1.9529921434383257, "language_loss": 0.75551695, "learning_rate": 3.4705867307494715e-06, "loss": 0.77760422, "num_input_tokens_seen": 46263200, "step": 2162, "time_per_iteration": 2.671079635620117 }, { "auxiliary_loss_clip": 0.01170822, "auxiliary_loss_mlp": 0.01059345, "balance_loss_clip": 1.0588578, "balance_loss_mlp": 1.03659964, "epoch": 0.26008537245235375, "flos": 18223480869120.0, "grad_norm": 2.9937168763921487, "language_loss": 0.84531921, "learning_rate": 3.470058671987492e-06, "loss": 0.86762094, "num_input_tokens_seen": 46281465, "step": 2163, "time_per_iteration": 2.7611234188079834 }, { "auxiliary_loss_clip": 0.0117344, "auxiliary_loss_mlp": 0.01059131, "balance_loss_clip": 1.05924654, "balance_loss_mlp": 1.03495526, "epoch": 0.26020561534299286, "flos": 24645385843200.0, "grad_norm": 2.0327732519513746, "language_loss": 0.84477746, "learning_rate": 3.4695303902187805e-06, "loss": 0.86710322, "num_input_tokens_seen": 46301020, "step": 2164, "time_per_iteration": 2.665801525115967 }, { "auxiliary_loss_clip": 0.01135776, "auxiliary_loss_mlp": 0.01056072, "balance_loss_clip": 1.05327046, "balance_loss_mlp": 1.03218293, "epoch": 0.2603258582336319, "flos": 25773662926080.0, "grad_norm": 2.812885218603597, "language_loss": 0.78510976, "learning_rate": 3.469001885523478e-06, "loss": 0.80702817, "num_input_tokens_seen": 46321740, "step": 2165, "time_per_iteration": 2.8067002296447754 }, { "auxiliary_loss_clip": 0.01178489, "auxiliary_loss_mlp": 0.01059867, "balance_loss_clip": 1.05737519, "balance_loss_mlp": 1.03843307, "epoch": 0.260446101124271, "flos": 28766314506240.0, "grad_norm": 1.734226499313362, "language_loss": 0.81138539, "learning_rate": 3.4684731579817568e-06, "loss": 0.83376896, "num_input_tokens_seen": 46342730, "step": 2166, "time_per_iteration": 2.7252304553985596 }, { "auxiliary_loss_clip": 0.01119911, "auxiliary_loss_mlp": 0.01061708, "balance_loss_clip": 1.05723357, "balance_loss_mlp": 1.04059553, "epoch": 0.26056634401491013, "flos": 25666757072640.0, "grad_norm": 1.5043837378608154, "language_loss": 0.76786661, "learning_rate": 3.4679442076738247e-06, "loss": 0.78968281, "num_input_tokens_seen": 46362445, "step": 2167, "time_per_iteration": 2.9554810523986816 }, { "auxiliary_loss_clip": 0.01185479, "auxiliary_loss_mlp": 0.01054075, "balance_loss_clip": 1.06149352, "balance_loss_mlp": 1.0318898, "epoch": 0.2606865869055492, "flos": 27052765217280.0, "grad_norm": 2.2279446055780516, "language_loss": 0.83798027, "learning_rate": 3.4674150346799245e-06, "loss": 0.86037576, "num_input_tokens_seen": 46382145, "step": 2168, "time_per_iteration": 2.78726863861084 }, { "auxiliary_loss_clip": 0.01150502, "auxiliary_loss_mlp": 0.01051533, "balance_loss_clip": 1.05470562, "balance_loss_mlp": 1.02931237, "epoch": 0.2608068297961883, "flos": 17712615686400.0, "grad_norm": 3.3176397652429612, "language_loss": 0.80561662, "learning_rate": 3.4668856390803295e-06, "loss": 0.82763702, "num_input_tokens_seen": 46400025, "step": 2169, "time_per_iteration": 2.7460014820098877 }, { "auxiliary_loss_clip": 0.0115323, "auxiliary_loss_mlp": 0.01054571, "balance_loss_clip": 1.05390263, "balance_loss_mlp": 1.0333513, "epoch": 0.2609270726868274, "flos": 18551632544640.0, "grad_norm": 2.1629253286884533, "language_loss": 0.89724714, "learning_rate": 3.4663560209553495e-06, "loss": 0.91932511, "num_input_tokens_seen": 46418090, "step": 2170, "time_per_iteration": 3.643789768218994 }, { "auxiliary_loss_clip": 0.01140033, "auxiliary_loss_mlp": 0.01052042, "balance_loss_clip": 1.05156851, "balance_loss_mlp": 1.02881992, "epoch": 0.26104731557746647, "flos": 21835699165440.0, "grad_norm": 1.7472130239321486, "language_loss": 0.79373884, "learning_rate": 3.4658261803853267e-06, "loss": 0.81565958, "num_input_tokens_seen": 46436015, "step": 2171, "time_per_iteration": 2.703005075454712 }, { "auxiliary_loss_clip": 0.0114744, "auxiliary_loss_mlp": 0.0105252, "balance_loss_clip": 1.05619001, "balance_loss_mlp": 1.02990556, "epoch": 0.2611675584681056, "flos": 21689650465920.0, "grad_norm": 2.3141796450961856, "language_loss": 0.8030293, "learning_rate": 3.4652961174506383e-06, "loss": 0.8250289, "num_input_tokens_seen": 46455885, "step": 2172, "time_per_iteration": 2.740769624710083 }, { "auxiliary_loss_clip": 0.01050417, "auxiliary_loss_mlp": 0.01002876, "balance_loss_clip": 1.02241015, "balance_loss_mlp": 0.99997884, "epoch": 0.2612878013587447, "flos": 71862101389440.0, "grad_norm": 0.9486351015713668, "language_loss": 0.58141875, "learning_rate": 3.464765832231694e-06, "loss": 0.60195172, "num_input_tokens_seen": 46510050, "step": 2173, "time_per_iteration": 4.287001848220825 }, { "auxiliary_loss_clip": 0.01165027, "auxiliary_loss_mlp": 0.01054743, "balance_loss_clip": 1.05530453, "balance_loss_mlp": 1.03364253, "epoch": 0.26140804424938374, "flos": 20227511445120.0, "grad_norm": 1.8472914785829047, "language_loss": 0.70932257, "learning_rate": 3.4642353248089373e-06, "loss": 0.73152024, "num_input_tokens_seen": 46528810, "step": 2174, "time_per_iteration": 3.695401430130005 }, { "auxiliary_loss_clip": 0.01145065, "auxiliary_loss_mlp": 0.0105736, "balance_loss_clip": 1.05095589, "balance_loss_mlp": 1.03391123, "epoch": 0.26152828714002285, "flos": 25557085872000.0, "grad_norm": 3.8240333008173297, "language_loss": 0.8017633, "learning_rate": 3.463704595262846e-06, "loss": 0.82378751, "num_input_tokens_seen": 46549690, "step": 2175, "time_per_iteration": 2.7881603240966797 }, { "auxiliary_loss_clip": 0.01133802, "auxiliary_loss_mlp": 0.01062617, "balance_loss_clip": 1.05235362, "balance_loss_mlp": 1.04158843, "epoch": 0.26164853003066196, "flos": 25446516831360.0, "grad_norm": 15.104319848095919, "language_loss": 0.70571673, "learning_rate": 3.463173643673931e-06, "loss": 0.72768092, "num_input_tokens_seen": 46572215, "step": 2176, "time_per_iteration": 2.8726961612701416 }, { "auxiliary_loss_clip": 0.01056925, "auxiliary_loss_mlp": 0.01003083, "balance_loss_clip": 1.02335238, "balance_loss_mlp": 1.00053239, "epoch": 0.261768772921301, "flos": 53944580568960.0, "grad_norm": 0.8936582199075712, "language_loss": 0.63515168, "learning_rate": 3.4626424701227387e-06, "loss": 0.65575176, "num_input_tokens_seen": 46627275, "step": 2177, "time_per_iteration": 3.179335117340088 }, { "auxiliary_loss_clip": 0.01066778, "auxiliary_loss_mlp": 0.01002685, "balance_loss_clip": 1.02443409, "balance_loss_mlp": 1.00028884, "epoch": 0.26188901581194013, "flos": 70687606481280.0, "grad_norm": 0.8719465291966472, "language_loss": 0.55836344, "learning_rate": 3.4621110746898452e-06, "loss": 0.57905805, "num_input_tokens_seen": 46695135, "step": 2178, "time_per_iteration": 4.2188029289245605 }, { "auxiliary_loss_clip": 0.01164675, "auxiliary_loss_mlp": 0.01052993, "balance_loss_clip": 1.05389476, "balance_loss_mlp": 1.02968717, "epoch": 0.2620092587025792, "flos": 21069580959360.0, "grad_norm": 1.8816141784304057, "language_loss": 0.74827349, "learning_rate": 3.4615794574558654e-06, "loss": 0.77045017, "num_input_tokens_seen": 46714145, "step": 2179, "time_per_iteration": 2.701866388320923 }, { "auxiliary_loss_clip": 0.01151057, "auxiliary_loss_mlp": 0.01051751, "balance_loss_clip": 1.05483532, "balance_loss_mlp": 1.02916098, "epoch": 0.2621295015932183, "flos": 18369601395840.0, "grad_norm": 2.1526584612108235, "language_loss": 0.84133273, "learning_rate": 3.4610476185014436e-06, "loss": 0.86336082, "num_input_tokens_seen": 46731405, "step": 2180, "time_per_iteration": 2.7298223972320557 }, { "auxiliary_loss_clip": 0.01178384, "auxiliary_loss_mlp": 0.01053793, "balance_loss_clip": 1.05450916, "balance_loss_mlp": 1.03120327, "epoch": 0.2622497444838574, "flos": 23659997063040.0, "grad_norm": 2.1397163311755367, "language_loss": 0.80077893, "learning_rate": 3.4605155579072597e-06, "loss": 0.82310069, "num_input_tokens_seen": 46751260, "step": 2181, "time_per_iteration": 2.610633134841919 }, { "auxiliary_loss_clip": 0.01117361, "auxiliary_loss_mlp": 0.01060505, "balance_loss_clip": 1.04847121, "balance_loss_mlp": 1.03693771, "epoch": 0.26236998737449646, "flos": 22123810154880.0, "grad_norm": 1.7794577967525933, "language_loss": 0.71425152, "learning_rate": 3.459983275754027e-06, "loss": 0.73603016, "num_input_tokens_seen": 46770155, "step": 2182, "time_per_iteration": 2.7174105644226074 }, { "auxiliary_loss_clip": 0.01177897, "auxiliary_loss_mlp": 0.01055064, "balance_loss_clip": 1.05513108, "balance_loss_mlp": 1.0329628, "epoch": 0.26249023026513557, "flos": 17895185539200.0, "grad_norm": 21.38166463416786, "language_loss": 0.79670537, "learning_rate": 3.4594507721224918e-06, "loss": 0.81903499, "num_input_tokens_seen": 46788805, "step": 2183, "time_per_iteration": 2.6335527896881104 }, { "auxiliary_loss_clip": 0.01151648, "auxiliary_loss_mlp": 0.0105879, "balance_loss_clip": 1.05232942, "balance_loss_mlp": 1.03728485, "epoch": 0.2626104731557747, "flos": 18332936588160.0, "grad_norm": 1.920750100815487, "language_loss": 0.81817037, "learning_rate": 3.4589180470934353e-06, "loss": 0.84027475, "num_input_tokens_seen": 46808670, "step": 2184, "time_per_iteration": 2.6721620559692383 }, { "auxiliary_loss_clip": 0.01168207, "auxiliary_loss_mlp": 0.0104898, "balance_loss_clip": 1.05321312, "balance_loss_mlp": 1.02553153, "epoch": 0.26273071604641374, "flos": 19317714837120.0, "grad_norm": 2.610187029508347, "language_loss": 0.76934636, "learning_rate": 3.4583851007476713e-06, "loss": 0.79151821, "num_input_tokens_seen": 46827140, "step": 2185, "time_per_iteration": 2.6490020751953125 }, { "auxiliary_loss_clip": 0.01142243, "auxiliary_loss_mlp": 0.01060114, "balance_loss_clip": 1.05329478, "balance_loss_mlp": 1.03604531, "epoch": 0.26285095893705285, "flos": 18327477720960.0, "grad_norm": 2.198116796009087, "language_loss": 0.68842661, "learning_rate": 3.4578519331660464e-06, "loss": 0.71045017, "num_input_tokens_seen": 46844135, "step": 2186, "time_per_iteration": 2.6793601512908936 }, { "auxiliary_loss_clip": 0.01164978, "auxiliary_loss_mlp": 0.01057896, "balance_loss_clip": 1.05734706, "balance_loss_mlp": 1.03749979, "epoch": 0.26297120182769196, "flos": 20193827466240.0, "grad_norm": 2.7048656635137935, "language_loss": 0.82493174, "learning_rate": 3.4573185444294426e-06, "loss": 0.84716046, "num_input_tokens_seen": 46862500, "step": 2187, "time_per_iteration": 2.595212697982788 }, { "auxiliary_loss_clip": 0.01148483, "auxiliary_loss_mlp": 0.00778971, "balance_loss_clip": 1.05249834, "balance_loss_mlp": 1.00044286, "epoch": 0.263091444718331, "flos": 22418421505920.0, "grad_norm": 1.7487935367094365, "language_loss": 0.78756022, "learning_rate": 3.456784934618774e-06, "loss": 0.80683476, "num_input_tokens_seen": 46883665, "step": 2188, "time_per_iteration": 2.726773262023926 }, { "auxiliary_loss_clip": 0.01144004, "auxiliary_loss_mlp": 0.01057371, "balance_loss_clip": 1.04976118, "balance_loss_mlp": 1.03487659, "epoch": 0.2632116876089701, "flos": 19024827338880.0, "grad_norm": 1.9874423133651173, "language_loss": 0.79646266, "learning_rate": 3.4562511038149897e-06, "loss": 0.81847638, "num_input_tokens_seen": 46899160, "step": 2189, "time_per_iteration": 2.6522278785705566 }, { "auxiliary_loss_clip": 0.01013872, "auxiliary_loss_mlp": 0.01011651, "balance_loss_clip": 1.01435518, "balance_loss_mlp": 1.0087899, "epoch": 0.26333193049960923, "flos": 67308054531840.0, "grad_norm": 0.9531729694897607, "language_loss": 0.57757068, "learning_rate": 3.4557170520990705e-06, "loss": 0.59782594, "num_input_tokens_seen": 46959835, "step": 2190, "time_per_iteration": 3.336300849914551 }, { "auxiliary_loss_clip": 0.01160459, "auxiliary_loss_mlp": 0.01059445, "balance_loss_clip": 1.05321956, "balance_loss_mlp": 1.03845263, "epoch": 0.2634521733902483, "flos": 25048806468480.0, "grad_norm": 1.600849695984271, "language_loss": 0.8648324, "learning_rate": 3.4551827795520324e-06, "loss": 0.88703144, "num_input_tokens_seen": 46982720, "step": 2191, "time_per_iteration": 2.7471158504486084 }, { "auxiliary_loss_clip": 0.01159635, "auxiliary_loss_mlp": 0.01050065, "balance_loss_clip": 1.05266547, "balance_loss_mlp": 1.02923918, "epoch": 0.2635724162808874, "flos": 20594985534720.0, "grad_norm": 1.9023448471303643, "language_loss": 0.85027444, "learning_rate": 3.4546482862549226e-06, "loss": 0.87237144, "num_input_tokens_seen": 47003035, "step": 2192, "time_per_iteration": 2.7040963172912598 }, { "auxiliary_loss_clip": 0.01130957, "auxiliary_loss_mlp": 0.01057111, "balance_loss_clip": 1.05126619, "balance_loss_mlp": 1.03485465, "epoch": 0.2636926591715265, "flos": 19244636616960.0, "grad_norm": 2.3395000698416206, "language_loss": 0.78895426, "learning_rate": 3.4541135722888253e-06, "loss": 0.81083494, "num_input_tokens_seen": 47019625, "step": 2193, "time_per_iteration": 2.7301363945007324 }, { "auxiliary_loss_clip": 0.0117569, "auxiliary_loss_mlp": 0.01044568, "balance_loss_clip": 1.0560075, "balance_loss_mlp": 1.02258623, "epoch": 0.26381290206216557, "flos": 28804882734720.0, "grad_norm": 1.8469583121350017, "language_loss": 0.80511552, "learning_rate": 3.453578637734854e-06, "loss": 0.82731807, "num_input_tokens_seen": 47040815, "step": 2194, "time_per_iteration": 2.6992692947387695 }, { "auxiliary_loss_clip": 0.01180323, "auxiliary_loss_mlp": 0.01063055, "balance_loss_clip": 1.05869126, "balance_loss_mlp": 1.04133511, "epoch": 0.2639331449528047, "flos": 25008909436800.0, "grad_norm": 1.6425041317681905, "language_loss": 0.78423035, "learning_rate": 3.4530434826741605e-06, "loss": 0.80666411, "num_input_tokens_seen": 47061755, "step": 2195, "time_per_iteration": 2.7014212608337402 }, { "auxiliary_loss_clip": 0.01146873, "auxiliary_loss_mlp": 0.01053065, "balance_loss_clip": 1.05290723, "balance_loss_mlp": 1.03085661, "epoch": 0.26405338784344373, "flos": 46535775465600.0, "grad_norm": 1.7120080711087504, "language_loss": 0.68618977, "learning_rate": 3.452508107187926e-06, "loss": 0.70818919, "num_input_tokens_seen": 47085130, "step": 2196, "time_per_iteration": 3.7365407943725586 }, { "auxiliary_loss_clip": 0.01114128, "auxiliary_loss_mlp": 0.01059437, "balance_loss_clip": 1.04782939, "balance_loss_mlp": 1.03548825, "epoch": 0.26417363073408284, "flos": 21179467641600.0, "grad_norm": 2.0542402012978016, "language_loss": 0.77674592, "learning_rate": 3.451972511357366e-06, "loss": 0.79848158, "num_input_tokens_seen": 47104675, "step": 2197, "time_per_iteration": 2.781613826751709 }, { "auxiliary_loss_clip": 0.01160465, "auxiliary_loss_mlp": 0.01059654, "balance_loss_clip": 1.05451298, "balance_loss_mlp": 1.03825605, "epoch": 0.26429387362472195, "flos": 22674751937280.0, "grad_norm": 2.0115858252221988, "language_loss": 0.84824383, "learning_rate": 3.45143669526373e-06, "loss": 0.87044501, "num_input_tokens_seen": 47124435, "step": 2198, "time_per_iteration": 2.6835122108459473 }, { "auxiliary_loss_clip": 0.01042789, "auxiliary_loss_mlp": 0.0100356, "balance_loss_clip": 1.02003622, "balance_loss_mlp": 1.00068688, "epoch": 0.264414116515361, "flos": 67180534272000.0, "grad_norm": 0.7937846385127687, "language_loss": 0.63221723, "learning_rate": 3.450900658988302e-06, "loss": 0.6526807, "num_input_tokens_seen": 47185985, "step": 2199, "time_per_iteration": 3.1912662982940674 }, { "auxiliary_loss_clip": 0.01142646, "auxiliary_loss_mlp": 0.0105696, "balance_loss_clip": 1.05342579, "balance_loss_mlp": 1.03316545, "epoch": 0.2645343594060001, "flos": 25664709997440.0, "grad_norm": 2.4427392061787985, "language_loss": 0.77602255, "learning_rate": 3.450364402612397e-06, "loss": 0.79801857, "num_input_tokens_seen": 47203140, "step": 2200, "time_per_iteration": 3.721904993057251 }, { "auxiliary_loss_clip": 0.01146752, "auxiliary_loss_mlp": 0.01050892, "balance_loss_clip": 1.0488745, "balance_loss_mlp": 1.02843308, "epoch": 0.26465460229663923, "flos": 22491822948480.0, "grad_norm": 2.6620135309251314, "language_loss": 0.83700168, "learning_rate": 3.449827926217366e-06, "loss": 0.85897809, "num_input_tokens_seen": 47222575, "step": 2201, "time_per_iteration": 2.7049458026885986 }, { "auxiliary_loss_clip": 0.01152298, "auxiliary_loss_mlp": 0.01065986, "balance_loss_clip": 1.04938293, "balance_loss_mlp": 1.04436135, "epoch": 0.2647748451872783, "flos": 29388036038400.0, "grad_norm": 1.9266541113965596, "language_loss": 0.80598444, "learning_rate": 3.449291229884591e-06, "loss": 0.82816726, "num_input_tokens_seen": 47243815, "step": 2202, "time_per_iteration": 2.854071617126465 }, { "auxiliary_loss_clip": 0.01139057, "auxiliary_loss_mlp": 0.01054182, "balance_loss_clip": 1.05025697, "balance_loss_mlp": 1.03109062, "epoch": 0.2648950880779174, "flos": 26797799502720.0, "grad_norm": 2.549656230502875, "language_loss": 0.86789268, "learning_rate": 3.4487543136954887e-06, "loss": 0.88982505, "num_input_tokens_seen": 47263435, "step": 2203, "time_per_iteration": 2.7684688568115234 }, { "auxiliary_loss_clip": 0.01136541, "auxiliary_loss_mlp": 0.01050366, "balance_loss_clip": 1.05059886, "balance_loss_mlp": 1.02865791, "epoch": 0.2650153309685565, "flos": 28841008838400.0, "grad_norm": 1.8149005260991684, "language_loss": 0.91219509, "learning_rate": 3.448217177731509e-06, "loss": 0.93406415, "num_input_tokens_seen": 47283920, "step": 2204, "time_per_iteration": 3.575554370880127 }, { "auxiliary_loss_clip": 0.01146972, "auxiliary_loss_mlp": 0.01049669, "balance_loss_clip": 1.05575109, "balance_loss_mlp": 1.02853346, "epoch": 0.26513557385919556, "flos": 20303247271680.0, "grad_norm": 2.130124450853945, "language_loss": 0.77808475, "learning_rate": 3.4476798220741348e-06, "loss": 0.80005121, "num_input_tokens_seen": 47302800, "step": 2205, "time_per_iteration": 2.760577440261841 }, { "auxiliary_loss_clip": 0.0117597, "auxiliary_loss_mlp": 0.01062251, "balance_loss_clip": 1.05811, "balance_loss_mlp": 1.04165184, "epoch": 0.26525581674983467, "flos": 17676274101120.0, "grad_norm": 1.77502505738335, "language_loss": 0.78382659, "learning_rate": 3.4471422468048826e-06, "loss": 0.80620873, "num_input_tokens_seen": 47321525, "step": 2206, "time_per_iteration": 2.651613235473633 }, { "auxiliary_loss_clip": 0.01157467, "auxiliary_loss_mlp": 0.01053871, "balance_loss_clip": 1.05387306, "balance_loss_mlp": 1.03100681, "epoch": 0.2653760596404738, "flos": 26833746038400.0, "grad_norm": 4.871754188234827, "language_loss": 0.7297821, "learning_rate": 3.4466044520053022e-06, "loss": 0.75189543, "num_input_tokens_seen": 47340530, "step": 2207, "time_per_iteration": 2.8286123275756836 }, { "auxiliary_loss_clip": 0.01134319, "auxiliary_loss_mlp": 0.0105404, "balance_loss_clip": 1.04800069, "balance_loss_mlp": 1.03086531, "epoch": 0.26549630253111284, "flos": 22782160581120.0, "grad_norm": 2.3778040846800392, "language_loss": 0.60312152, "learning_rate": 3.446066437756977e-06, "loss": 0.62500513, "num_input_tokens_seen": 47359735, "step": 2208, "time_per_iteration": 2.7130720615386963 }, { "auxiliary_loss_clip": 0.0114698, "auxiliary_loss_mlp": 0.01048832, "balance_loss_clip": 1.05038321, "balance_loss_mlp": 1.02715933, "epoch": 0.26561654542175195, "flos": 23550002640000.0, "grad_norm": 2.4779167830740474, "language_loss": 0.7538507, "learning_rate": 3.4455282041415224e-06, "loss": 0.77580881, "num_input_tokens_seen": 47378945, "step": 2209, "time_per_iteration": 2.7465288639068604 }, { "auxiliary_loss_clip": 0.0113917, "auxiliary_loss_mlp": 0.01054423, "balance_loss_clip": 1.05187583, "balance_loss_mlp": 1.03151131, "epoch": 0.265736788312391, "flos": 26906680604160.0, "grad_norm": 2.634198396014623, "language_loss": 0.87430376, "learning_rate": 3.4449897512405894e-06, "loss": 0.8962397, "num_input_tokens_seen": 47398095, "step": 2210, "time_per_iteration": 2.699378728866577 }, { "auxiliary_loss_clip": 0.01100233, "auxiliary_loss_mlp": 0.00776482, "balance_loss_clip": 1.04463947, "balance_loss_mlp": 1.00034869, "epoch": 0.2658570312030301, "flos": 23477139901440.0, "grad_norm": 1.9134020496858268, "language_loss": 0.75100213, "learning_rate": 3.444451079135859e-06, "loss": 0.76976931, "num_input_tokens_seen": 47417605, "step": 2211, "time_per_iteration": 2.8285934925079346 }, { "auxiliary_loss_clip": 0.01112275, "auxiliary_loss_mlp": 0.00777735, "balance_loss_clip": 1.04430342, "balance_loss_mlp": 1.00036621, "epoch": 0.2659772740936692, "flos": 21866402315520.0, "grad_norm": 1.8959157440122967, "language_loss": 0.74076021, "learning_rate": 3.4439121879090493e-06, "loss": 0.7596603, "num_input_tokens_seen": 47435385, "step": 2212, "time_per_iteration": 2.749772548675537 }, { "auxiliary_loss_clip": 0.01154302, "auxiliary_loss_mlp": 0.01061482, "balance_loss_clip": 1.05486369, "balance_loss_mlp": 1.04027438, "epoch": 0.2660975169843083, "flos": 19793100360960.0, "grad_norm": 1.9495805283998426, "language_loss": 0.83081001, "learning_rate": 3.4433730776419082e-06, "loss": 0.85296786, "num_input_tokens_seen": 47454310, "step": 2213, "time_per_iteration": 2.753718137741089 }, { "auxiliary_loss_clip": 0.01164525, "auxiliary_loss_mlp": 0.00777661, "balance_loss_clip": 1.05299175, "balance_loss_mlp": 1.00041842, "epoch": 0.2662177598749474, "flos": 29018981750400.0, "grad_norm": 2.273009270601587, "language_loss": 0.80907267, "learning_rate": 3.4428337484162183e-06, "loss": 0.82849455, "num_input_tokens_seen": 47475120, "step": 2214, "time_per_iteration": 2.7474584579467773 }, { "auxiliary_loss_clip": 0.01143624, "auxiliary_loss_mlp": 0.01058382, "balance_loss_clip": 1.0518409, "balance_loss_mlp": 1.0363518, "epoch": 0.2663380027655865, "flos": 21762549118080.0, "grad_norm": 18.482365777507557, "language_loss": 0.84286594, "learning_rate": 3.442294200313797e-06, "loss": 0.86488605, "num_input_tokens_seen": 47493150, "step": 2215, "time_per_iteration": 2.651109457015991 }, { "auxiliary_loss_clip": 0.01060592, "auxiliary_loss_mlp": 0.01003263, "balance_loss_clip": 1.02043509, "balance_loss_mlp": 1.00054514, "epoch": 0.26645824565622556, "flos": 66980333819520.0, "grad_norm": 0.9652407705510174, "language_loss": 0.52756315, "learning_rate": 3.4417544334164916e-06, "loss": 0.54820168, "num_input_tokens_seen": 47557295, "step": 2216, "time_per_iteration": 3.2776429653167725 }, { "auxiliary_loss_clip": 0.01132953, "auxiliary_loss_mlp": 0.01050637, "balance_loss_clip": 1.05062711, "balance_loss_mlp": 1.03021622, "epoch": 0.26657848854686467, "flos": 25264198373760.0, "grad_norm": 2.137133034372931, "language_loss": 0.77659905, "learning_rate": 3.4412144478061854e-06, "loss": 0.79843497, "num_input_tokens_seen": 47579705, "step": 2217, "time_per_iteration": 2.736048460006714 }, { "auxiliary_loss_clip": 0.01080651, "auxiliary_loss_mlp": 0.01058243, "balance_loss_clip": 1.04172635, "balance_loss_mlp": 1.03539026, "epoch": 0.2666987314375038, "flos": 23696769611520.0, "grad_norm": 1.9767839988977496, "language_loss": 0.75597405, "learning_rate": 3.4406742435647925e-06, "loss": 0.77736306, "num_input_tokens_seen": 47599770, "step": 2218, "time_per_iteration": 2.889547109603882 }, { "auxiliary_loss_clip": 0.01161103, "auxiliary_loss_mlp": 0.01060421, "balance_loss_clip": 1.05608034, "balance_loss_mlp": 1.03929734, "epoch": 0.26681897432814283, "flos": 27048958375680.0, "grad_norm": 1.8644388173539597, "language_loss": 0.78782189, "learning_rate": 3.440133820774263e-06, "loss": 0.81003714, "num_input_tokens_seen": 47619580, "step": 2219, "time_per_iteration": 2.62516450881958 }, { "auxiliary_loss_clip": 0.01152859, "auxiliary_loss_mlp": 0.0105886, "balance_loss_clip": 1.05460978, "balance_loss_mlp": 1.0367825, "epoch": 0.26693921721878194, "flos": 28985944216320.0, "grad_norm": 2.515828438198556, "language_loss": 0.81576878, "learning_rate": 3.439593179516578e-06, "loss": 0.83788598, "num_input_tokens_seen": 47639490, "step": 2220, "time_per_iteration": 2.8066442012786865 }, { "auxiliary_loss_clip": 0.01147428, "auxiliary_loss_mlp": 0.01050412, "balance_loss_clip": 1.05006325, "balance_loss_mlp": 1.02904916, "epoch": 0.26705946010942105, "flos": 21507834798720.0, "grad_norm": 2.3372727635918693, "language_loss": 0.80924046, "learning_rate": 3.4390523198737524e-06, "loss": 0.83121884, "num_input_tokens_seen": 47658650, "step": 2221, "time_per_iteration": 2.6931211948394775 }, { "auxiliary_loss_clip": 0.01174577, "auxiliary_loss_mlp": 0.00777056, "balance_loss_clip": 1.05491161, "balance_loss_mlp": 1.00040066, "epoch": 0.2671797030000601, "flos": 21471277731840.0, "grad_norm": 1.9256295137404906, "language_loss": 0.73968601, "learning_rate": 3.4385112419278333e-06, "loss": 0.75920236, "num_input_tokens_seen": 47679875, "step": 2222, "time_per_iteration": 3.530299425125122 }, { "auxiliary_loss_clip": 0.01053319, "auxiliary_loss_mlp": 0.01001325, "balance_loss_clip": 1.0217185, "balance_loss_mlp": 0.99874973, "epoch": 0.2672999458906992, "flos": 64189929767040.0, "grad_norm": 0.7936928064299568, "language_loss": 0.6486361, "learning_rate": 3.4379699457609033e-06, "loss": 0.66918254, "num_input_tokens_seen": 47737700, "step": 2223, "time_per_iteration": 3.1147079467773438 }, { "auxiliary_loss_clip": 0.01139261, "auxiliary_loss_mlp": 0.01051869, "balance_loss_clip": 1.04915917, "balance_loss_mlp": 1.03047085, "epoch": 0.26742018878133833, "flos": 16909042573440.0, "grad_norm": 2.1793044767415575, "language_loss": 0.90133607, "learning_rate": 3.4374284314550755e-06, "loss": 0.92324734, "num_input_tokens_seen": 47756740, "step": 2224, "time_per_iteration": 2.7219254970550537 }, { "auxiliary_loss_clip": 0.01172938, "auxiliary_loss_mlp": 0.01049846, "balance_loss_clip": 1.0557332, "balance_loss_mlp": 1.02862692, "epoch": 0.2675404316719774, "flos": 20667560964480.0, "grad_norm": 1.8772489899037872, "language_loss": 0.8100915, "learning_rate": 3.436886699092498e-06, "loss": 0.83231938, "num_input_tokens_seen": 47775255, "step": 2225, "time_per_iteration": 4.600811004638672 }, { "auxiliary_loss_clip": 0.01183156, "auxiliary_loss_mlp": 0.01059542, "balance_loss_clip": 1.05759764, "balance_loss_mlp": 1.03723824, "epoch": 0.2676606745626165, "flos": 17485013157120.0, "grad_norm": 2.7400647526662736, "language_loss": 0.7148602, "learning_rate": 3.4363447487553502e-06, "loss": 0.73728716, "num_input_tokens_seen": 47788570, "step": 2226, "time_per_iteration": 2.6739742755889893 }, { "auxiliary_loss_clip": 0.01145372, "auxiliary_loss_mlp": 0.01046805, "balance_loss_clip": 1.05406868, "balance_loss_mlp": 1.02729058, "epoch": 0.26778091745325555, "flos": 27852675143040.0, "grad_norm": 3.3100148147408, "language_loss": 0.77994287, "learning_rate": 3.4358025805258455e-06, "loss": 0.80186462, "num_input_tokens_seen": 47808275, "step": 2227, "time_per_iteration": 2.729435443878174 }, { "auxiliary_loss_clip": 0.01130643, "auxiliary_loss_mlp": 0.01051721, "balance_loss_clip": 1.05106473, "balance_loss_mlp": 1.02930951, "epoch": 0.26790116034389466, "flos": 20955995176320.0, "grad_norm": 2.0545667198149755, "language_loss": 0.83209074, "learning_rate": 3.435260194486232e-06, "loss": 0.85391438, "num_input_tokens_seen": 47826245, "step": 2228, "time_per_iteration": 2.762946128845215 }, { "auxiliary_loss_clip": 0.01154464, "auxiliary_loss_mlp": 0.01054318, "balance_loss_clip": 1.05366135, "balance_loss_mlp": 1.03276491, "epoch": 0.2680214032345338, "flos": 18040659621120.0, "grad_norm": 2.2023837305424, "language_loss": 0.8228904, "learning_rate": 3.4347175907187875e-06, "loss": 0.84497827, "num_input_tokens_seen": 47843235, "step": 2229, "time_per_iteration": 2.6902925968170166 }, { "auxiliary_loss_clip": 0.01158722, "auxiliary_loss_mlp": 0.01060726, "balance_loss_clip": 1.05291247, "balance_loss_mlp": 1.0383265, "epoch": 0.26814164612517283, "flos": 22419427086720.0, "grad_norm": 1.8043883577265865, "language_loss": 0.87874824, "learning_rate": 3.4341747693058254e-06, "loss": 0.90094268, "num_input_tokens_seen": 47861710, "step": 2230, "time_per_iteration": 3.5757029056549072 }, { "auxiliary_loss_clip": 0.01074394, "auxiliary_loss_mlp": 0.01064219, "balance_loss_clip": 1.04332495, "balance_loss_mlp": 1.04255855, "epoch": 0.26826188901581194, "flos": 35627371159680.0, "grad_norm": 2.202852634583602, "language_loss": 0.77167588, "learning_rate": 3.4336317303296916e-06, "loss": 0.79306203, "num_input_tokens_seen": 47882685, "step": 2231, "time_per_iteration": 3.0122385025024414 }, { "auxiliary_loss_clip": 0.01158721, "auxiliary_loss_mlp": 0.01045854, "balance_loss_clip": 1.05603993, "balance_loss_mlp": 1.02602947, "epoch": 0.26838213190645105, "flos": 17639788861440.0, "grad_norm": 2.0739116282679317, "language_loss": 0.75606108, "learning_rate": 3.4330884738727635e-06, "loss": 0.77810675, "num_input_tokens_seen": 47900860, "step": 2232, "time_per_iteration": 2.817462921142578 }, { "auxiliary_loss_clip": 0.01113635, "auxiliary_loss_mlp": 0.01060368, "balance_loss_clip": 1.04770732, "balance_loss_mlp": 1.03901768, "epoch": 0.2685023747970901, "flos": 22674823764480.0, "grad_norm": 1.9356701422878666, "language_loss": 0.70699751, "learning_rate": 3.4325450000174535e-06, "loss": 0.72873753, "num_input_tokens_seen": 47917500, "step": 2233, "time_per_iteration": 2.7974960803985596 }, { "auxiliary_loss_clip": 0.01117757, "auxiliary_loss_mlp": 0.01065409, "balance_loss_clip": 1.0506928, "balance_loss_mlp": 1.0414362, "epoch": 0.2686226176877292, "flos": 20120533764480.0, "grad_norm": 2.176996301645797, "language_loss": 0.74115777, "learning_rate": 3.4320013088462067e-06, "loss": 0.7629894, "num_input_tokens_seen": 47934860, "step": 2234, "time_per_iteration": 2.8223328590393066 }, { "auxiliary_loss_clip": 0.01136211, "auxiliary_loss_mlp": 0.01059947, "balance_loss_clip": 1.04981208, "balance_loss_mlp": 1.03859663, "epoch": 0.2687428605783683, "flos": 21872040750720.0, "grad_norm": 1.5633394231406108, "language_loss": 0.81634289, "learning_rate": 3.431457400441499e-06, "loss": 0.83830446, "num_input_tokens_seen": 47955255, "step": 2235, "time_per_iteration": 2.7228925228118896 }, { "auxiliary_loss_clip": 0.01010459, "auxiliary_loss_mlp": 0.01011737, "balance_loss_clip": 1.02286196, "balance_loss_mlp": 1.00891161, "epoch": 0.2688631034690074, "flos": 69943320766080.0, "grad_norm": 0.952106776490963, "language_loss": 0.61062932, "learning_rate": 3.4309132748858424e-06, "loss": 0.63085127, "num_input_tokens_seen": 48016245, "step": 2236, "time_per_iteration": 3.466559648513794 }, { "auxiliary_loss_clip": 0.01159459, "auxiliary_loss_mlp": 0.01050622, "balance_loss_clip": 1.05548573, "balance_loss_mlp": 1.02954614, "epoch": 0.2689833463596465, "flos": 22856639431680.0, "grad_norm": 1.7509562529550486, "language_loss": 0.83960998, "learning_rate": 3.430368932261779e-06, "loss": 0.86171079, "num_input_tokens_seen": 48036600, "step": 2237, "time_per_iteration": 2.769575595855713 }, { "auxiliary_loss_clip": 0.01148158, "auxiliary_loss_mlp": 0.01056854, "balance_loss_clip": 1.05409205, "balance_loss_mlp": 1.034729, "epoch": 0.2691035892502856, "flos": 17200242132480.0, "grad_norm": 2.5906505076215613, "language_loss": 0.75172883, "learning_rate": 3.429824372651886e-06, "loss": 0.77377892, "num_input_tokens_seen": 48054750, "step": 2238, "time_per_iteration": 2.697763681411743 }, { "auxiliary_loss_clip": 0.0113262, "auxiliary_loss_mlp": 0.01056213, "balance_loss_clip": 1.05225337, "balance_loss_mlp": 1.03463578, "epoch": 0.26922383214092466, "flos": 17747484814080.0, "grad_norm": 2.306897548865082, "language_loss": 0.83693624, "learning_rate": 3.4292795961387732e-06, "loss": 0.85882461, "num_input_tokens_seen": 48072650, "step": 2239, "time_per_iteration": 2.7258999347686768 }, { "auxiliary_loss_clip": 0.01174137, "auxiliary_loss_mlp": 0.01043678, "balance_loss_clip": 1.05455327, "balance_loss_mlp": 1.02236319, "epoch": 0.26934407503156377, "flos": 16173376122240.0, "grad_norm": 2.365220004289925, "language_loss": 0.87475955, "learning_rate": 3.4287346028050818e-06, "loss": 0.89693767, "num_input_tokens_seen": 48088720, "step": 2240, "time_per_iteration": 2.6216607093811035 }, { "auxiliary_loss_clip": 0.0114119, "auxiliary_loss_mlp": 0.01045833, "balance_loss_clip": 1.05046391, "balance_loss_mlp": 1.0256269, "epoch": 0.2694643179222028, "flos": 23732895715200.0, "grad_norm": 1.5750960365289974, "language_loss": 0.7967695, "learning_rate": 3.4281893927334866e-06, "loss": 0.81863981, "num_input_tokens_seen": 48108630, "step": 2241, "time_per_iteration": 2.665254831314087 }, { "auxiliary_loss_clip": 0.01160485, "auxiliary_loss_mlp": 0.01045335, "balance_loss_clip": 1.05288696, "balance_loss_mlp": 1.02479517, "epoch": 0.26958456081284193, "flos": 24718140840960.0, "grad_norm": 1.9407611664097097, "language_loss": 0.75372547, "learning_rate": 3.4276439660066963e-06, "loss": 0.77578366, "num_input_tokens_seen": 48128330, "step": 2242, "time_per_iteration": 2.6758103370666504 }, { "auxiliary_loss_clip": 0.01172631, "auxiliary_loss_mlp": 0.01059195, "balance_loss_clip": 1.05672145, "balance_loss_mlp": 1.03766549, "epoch": 0.26970480370348104, "flos": 18112588606080.0, "grad_norm": 2.4630493990751563, "language_loss": 0.83937675, "learning_rate": 3.427098322707452e-06, "loss": 0.86169505, "num_input_tokens_seen": 48144295, "step": 2243, "time_per_iteration": 2.5901591777801514 }, { "auxiliary_loss_clip": 0.01167496, "auxiliary_loss_mlp": 0.01043282, "balance_loss_clip": 1.06328559, "balance_loss_mlp": 1.02232468, "epoch": 0.2698250465941201, "flos": 10816546250880.0, "grad_norm": 2.1742805709178543, "language_loss": 0.89815462, "learning_rate": 3.426552462918526e-06, "loss": 0.9202624, "num_input_tokens_seen": 48162230, "step": 2244, "time_per_iteration": 2.6587705612182617 }, { "auxiliary_loss_clip": 0.01173392, "auxiliary_loss_mlp": 0.01049537, "balance_loss_clip": 1.05699325, "balance_loss_mlp": 1.02887774, "epoch": 0.2699452894847592, "flos": 17308117653120.0, "grad_norm": 2.334114510276634, "language_loss": 0.73194551, "learning_rate": 3.426006386722726e-06, "loss": 0.75417477, "num_input_tokens_seen": 48180290, "step": 2245, "time_per_iteration": 2.636901617050171 }, { "auxiliary_loss_clip": 0.01139815, "auxiliary_loss_mlp": 0.01045535, "balance_loss_clip": 1.055933, "balance_loss_mlp": 1.02488756, "epoch": 0.2700655323753983, "flos": 18078150441600.0, "grad_norm": 3.387457346701958, "language_loss": 0.92494953, "learning_rate": 3.4254600942028914e-06, "loss": 0.94680297, "num_input_tokens_seen": 48198165, "step": 2246, "time_per_iteration": 2.7682595252990723 }, { "auxiliary_loss_clip": 0.01145155, "auxiliary_loss_mlp": 0.01057306, "balance_loss_clip": 1.05299628, "balance_loss_mlp": 1.03706408, "epoch": 0.2701857752660374, "flos": 18186636493440.0, "grad_norm": 2.1976546062445257, "language_loss": 0.82502741, "learning_rate": 3.424913585441893e-06, "loss": 0.8470521, "num_input_tokens_seen": 48216000, "step": 2247, "time_per_iteration": 3.699070930480957 }, { "auxiliary_loss_clip": 0.01155814, "auxiliary_loss_mlp": 0.01044102, "balance_loss_clip": 1.05411625, "balance_loss_mlp": 1.02245367, "epoch": 0.2703060181566765, "flos": 16319496648960.0, "grad_norm": 2.336656115825762, "language_loss": 0.87281352, "learning_rate": 3.4243668605226374e-06, "loss": 0.8948127, "num_input_tokens_seen": 48233025, "step": 2248, "time_per_iteration": 2.7021636962890625 }, { "auxiliary_loss_clip": 0.01134057, "auxiliary_loss_mlp": 0.010502, "balance_loss_clip": 1.04898357, "balance_loss_mlp": 1.02877796, "epoch": 0.2704262610473156, "flos": 19572357329280.0, "grad_norm": 2.6663352350578555, "language_loss": 0.8293519, "learning_rate": 3.423819919528061e-06, "loss": 0.8511945, "num_input_tokens_seen": 48251110, "step": 2249, "time_per_iteration": 2.8133974075317383 }, { "auxiliary_loss_clip": 0.01125582, "auxiliary_loss_mlp": 0.01054348, "balance_loss_clip": 1.04637909, "balance_loss_mlp": 1.03371274, "epoch": 0.27054650393795465, "flos": 20740746925440.0, "grad_norm": 3.566039746072348, "language_loss": 0.78407544, "learning_rate": 3.4232727625411355e-06, "loss": 0.80587476, "num_input_tokens_seen": 48270215, "step": 2250, "time_per_iteration": 3.0059401988983154 }, { "auxiliary_loss_clip": 0.01101025, "auxiliary_loss_mlp": 0.01049493, "balance_loss_clip": 1.04554057, "balance_loss_mlp": 1.02922773, "epoch": 0.27066674682859376, "flos": 18658322916480.0, "grad_norm": 1.7423201404208286, "language_loss": 0.86442798, "learning_rate": 3.4227253896448626e-06, "loss": 0.88593316, "num_input_tokens_seen": 48288075, "step": 2251, "time_per_iteration": 3.7202296257019043 }, { "auxiliary_loss_clip": 0.01172051, "auxiliary_loss_mlp": 0.0104992, "balance_loss_clip": 1.05443585, "balance_loss_mlp": 1.03079844, "epoch": 0.2707869897192329, "flos": 23002759958400.0, "grad_norm": 2.2693969452420353, "language_loss": 0.82194257, "learning_rate": 3.42217780092228e-06, "loss": 0.84416223, "num_input_tokens_seen": 48306415, "step": 2252, "time_per_iteration": 2.6630051136016846 }, { "auxiliary_loss_clip": 0.01039735, "auxiliary_loss_mlp": 0.01004536, "balance_loss_clip": 1.02572489, "balance_loss_mlp": 1.00166357, "epoch": 0.27090723260987193, "flos": 58323240293760.0, "grad_norm": 0.792442783357695, "language_loss": 0.6038177, "learning_rate": 3.421629996456456e-06, "loss": 0.62426043, "num_input_tokens_seen": 48365035, "step": 2253, "time_per_iteration": 3.240696668624878 }, { "auxiliary_loss_clip": 0.0115717, "auxiliary_loss_mlp": 0.01048979, "balance_loss_clip": 1.05248368, "balance_loss_mlp": 1.02725947, "epoch": 0.27102747550051104, "flos": 11984540797440.0, "grad_norm": 7.320737069023727, "language_loss": 0.82272553, "learning_rate": 3.421081976330491e-06, "loss": 0.84478706, "num_input_tokens_seen": 48383550, "step": 2254, "time_per_iteration": 2.6654953956604004 }, { "auxiliary_loss_clip": 0.01141644, "auxiliary_loss_mlp": 0.01046881, "balance_loss_clip": 1.04961467, "balance_loss_mlp": 1.02599537, "epoch": 0.27114771839115015, "flos": 19900401264000.0, "grad_norm": 2.068478266834834, "language_loss": 0.87840652, "learning_rate": 3.4205337406275207e-06, "loss": 0.90029168, "num_input_tokens_seen": 48403670, "step": 2255, "time_per_iteration": 2.712491273880005 }, { "auxiliary_loss_clip": 0.01170393, "auxiliary_loss_mlp": 0.01050282, "balance_loss_clip": 1.05281878, "balance_loss_mlp": 1.02903903, "epoch": 0.2712679612817892, "flos": 18331966920960.0, "grad_norm": 3.585937732875927, "language_loss": 0.7516306, "learning_rate": 3.4199852894307114e-06, "loss": 0.77383739, "num_input_tokens_seen": 48420420, "step": 2256, "time_per_iteration": 3.5420312881469727 }, { "auxiliary_loss_clip": 0.01108411, "auxiliary_loss_mlp": 0.01045069, "balance_loss_clip": 1.04867351, "balance_loss_mlp": 1.02386129, "epoch": 0.2713882041724283, "flos": 24460302038400.0, "grad_norm": 2.3592519965639296, "language_loss": 0.78821421, "learning_rate": 3.419436622823262e-06, "loss": 0.80974901, "num_input_tokens_seen": 48441140, "step": 2257, "time_per_iteration": 2.950802803039551 }, { "auxiliary_loss_clip": 0.0114184, "auxiliary_loss_mlp": 0.01055051, "balance_loss_clip": 1.05119324, "balance_loss_mlp": 1.03674006, "epoch": 0.27150844706306737, "flos": 23039317025280.0, "grad_norm": 2.1528350281167694, "language_loss": 0.74088752, "learning_rate": 3.4188877408884063e-06, "loss": 0.76285642, "num_input_tokens_seen": 48461845, "step": 2258, "time_per_iteration": 2.7402219772338867 }, { "auxiliary_loss_clip": 0.01143807, "auxiliary_loss_mlp": 0.01058192, "balance_loss_clip": 1.05245471, "balance_loss_mlp": 1.03516126, "epoch": 0.2716286899537065, "flos": 22563644192640.0, "grad_norm": 3.716312259912949, "language_loss": 0.65155756, "learning_rate": 3.4183386437094088e-06, "loss": 0.67357755, "num_input_tokens_seen": 48478510, "step": 2259, "time_per_iteration": 2.7406251430511475 }, { "auxiliary_loss_clip": 0.01148773, "auxiliary_loss_mlp": 0.010524, "balance_loss_clip": 1.0517689, "balance_loss_mlp": 1.03116918, "epoch": 0.2717489328443456, "flos": 13115044523520.0, "grad_norm": 3.2463639153936534, "language_loss": 0.82043231, "learning_rate": 3.417789331369565e-06, "loss": 0.84244406, "num_input_tokens_seen": 48494300, "step": 2260, "time_per_iteration": 2.7715351581573486 }, { "auxiliary_loss_clip": 0.01175969, "auxiliary_loss_mlp": 0.0105409, "balance_loss_clip": 1.05599499, "balance_loss_mlp": 1.03208351, "epoch": 0.27186917573498465, "flos": 29278688060160.0, "grad_norm": 4.8574701515910705, "language_loss": 0.91476226, "learning_rate": 3.4172398039522088e-06, "loss": 0.93706286, "num_input_tokens_seen": 48515585, "step": 2261, "time_per_iteration": 2.709296464920044 }, { "auxiliary_loss_clip": 0.01161373, "auxiliary_loss_mlp": 0.01058183, "balance_loss_clip": 1.05162835, "balance_loss_mlp": 1.03682065, "epoch": 0.27198941862562376, "flos": 26032220000640.0, "grad_norm": 2.072859930251919, "language_loss": 0.79983377, "learning_rate": 3.4166900615407e-06, "loss": 0.82202935, "num_input_tokens_seen": 48533500, "step": 2262, "time_per_iteration": 2.7116892337799072 }, { "auxiliary_loss_clip": 0.01158754, "auxiliary_loss_mlp": 0.01053344, "balance_loss_clip": 1.05365264, "balance_loss_mlp": 1.03413928, "epoch": 0.27210966151626287, "flos": 32780983760640.0, "grad_norm": 2.7279644533635854, "language_loss": 0.75199097, "learning_rate": 3.416140104218436e-06, "loss": 0.77411199, "num_input_tokens_seen": 48552865, "step": 2263, "time_per_iteration": 2.7596447467803955 }, { "auxiliary_loss_clip": 0.01041499, "auxiliary_loss_mlp": 0.00757296, "balance_loss_clip": 1.01929808, "balance_loss_mlp": 1.00032318, "epoch": 0.2722299044069019, "flos": 65471043219840.0, "grad_norm": 1.266610269338226, "language_loss": 0.69697142, "learning_rate": 3.4155899320688437e-06, "loss": 0.71495926, "num_input_tokens_seen": 48618940, "step": 2264, "time_per_iteration": 3.305976152420044 }, { "auxiliary_loss_clip": 0.01108949, "auxiliary_loss_mlp": 0.01053254, "balance_loss_clip": 1.04525876, "balance_loss_mlp": 1.03146267, "epoch": 0.27235014729754103, "flos": 15334143782400.0, "grad_norm": 2.573855225689633, "language_loss": 0.74233556, "learning_rate": 3.415039545175384e-06, "loss": 0.76395762, "num_input_tokens_seen": 48634665, "step": 2265, "time_per_iteration": 2.733311176300049 }, { "auxiliary_loss_clip": 0.0116019, "auxiliary_loss_mlp": 0.01054976, "balance_loss_clip": 1.05360496, "balance_loss_mlp": 1.03455591, "epoch": 0.27247039018818014, "flos": 21872363973120.0, "grad_norm": 2.378454932858091, "language_loss": 0.65324426, "learning_rate": 3.414488943621551e-06, "loss": 0.67539591, "num_input_tokens_seen": 48653330, "step": 2266, "time_per_iteration": 2.6896958351135254 }, { "auxiliary_loss_clip": 0.01160915, "auxiliary_loss_mlp": 0.01054193, "balance_loss_clip": 1.05779707, "balance_loss_mlp": 1.03263962, "epoch": 0.2725906330788192, "flos": 18695490514560.0, "grad_norm": 2.7372943794956806, "language_loss": 0.73738527, "learning_rate": 3.41393812749087e-06, "loss": 0.75953633, "num_input_tokens_seen": 48671375, "step": 2267, "time_per_iteration": 2.61541748046875 }, { "auxiliary_loss_clip": 0.01147475, "auxiliary_loss_mlp": 0.01056953, "balance_loss_clip": 1.053473, "balance_loss_mlp": 1.03686643, "epoch": 0.2727108759694583, "flos": 17886099398400.0, "grad_norm": 3.4025740075002293, "language_loss": 0.72024453, "learning_rate": 3.4133870968668984e-06, "loss": 0.74228883, "num_input_tokens_seen": 48686175, "step": 2268, "time_per_iteration": 2.733548641204834 }, { "auxiliary_loss_clip": 0.01149764, "auxiliary_loss_mlp": 0.0107021, "balance_loss_clip": 1.05401087, "balance_loss_mlp": 1.04753685, "epoch": 0.2728311188600974, "flos": 24461666755200.0, "grad_norm": 1.8014405297764444, "language_loss": 0.78586447, "learning_rate": 3.412835851833229e-06, "loss": 0.80806422, "num_input_tokens_seen": 48708370, "step": 2269, "time_per_iteration": 2.8183817863464355 }, { "auxiliary_loss_clip": 0.01160826, "auxiliary_loss_mlp": 0.01062232, "balance_loss_clip": 1.05772829, "balance_loss_mlp": 1.04063094, "epoch": 0.2729513617507365, "flos": 30993314757120.0, "grad_norm": 1.8355210924866285, "language_loss": 0.77774668, "learning_rate": 3.4122843924734834e-06, "loss": 0.79997724, "num_input_tokens_seen": 48730670, "step": 2270, "time_per_iteration": 2.73002028465271 }, { "auxiliary_loss_clip": 0.01144208, "auxiliary_loss_mlp": 0.0106219, "balance_loss_clip": 1.05176568, "balance_loss_mlp": 1.03894365, "epoch": 0.2730716046413756, "flos": 19094637421440.0, "grad_norm": 2.0140094677932847, "language_loss": 0.87943637, "learning_rate": 3.411732718871319e-06, "loss": 0.90150034, "num_input_tokens_seen": 48746510, "step": 2271, "time_per_iteration": 2.7088675498962402 }, { "auxiliary_loss_clip": 0.01167041, "auxiliary_loss_mlp": 0.01056531, "balance_loss_clip": 1.05464852, "balance_loss_mlp": 1.03715944, "epoch": 0.27319184753201464, "flos": 26944566474240.0, "grad_norm": 1.799806779458238, "language_loss": 0.78739405, "learning_rate": 3.4111808311104227e-06, "loss": 0.8096298, "num_input_tokens_seen": 48768825, "step": 2272, "time_per_iteration": 2.6420481204986572 }, { "auxiliary_loss_clip": 0.01150724, "auxiliary_loss_mlp": 0.01066124, "balance_loss_clip": 1.04916728, "balance_loss_mlp": 1.04429698, "epoch": 0.27331209042265375, "flos": 31759828012800.0, "grad_norm": 1.8683697785123163, "language_loss": 0.69551396, "learning_rate": 3.410628729274517e-06, "loss": 0.71768248, "num_input_tokens_seen": 48790345, "step": 2273, "time_per_iteration": 3.784278392791748 }, { "auxiliary_loss_clip": 0.01143075, "auxiliary_loss_mlp": 0.00776731, "balance_loss_clip": 1.05090809, "balance_loss_mlp": 1.00051188, "epoch": 0.27343233331329286, "flos": 25739081107200.0, "grad_norm": 2.251467674591456, "language_loss": 0.8286553, "learning_rate": 3.4100764134473546e-06, "loss": 0.8478533, "num_input_tokens_seen": 48809630, "step": 2274, "time_per_iteration": 2.770777702331543 }, { "auxiliary_loss_clip": 0.01174447, "auxiliary_loss_mlp": 0.0104992, "balance_loss_clip": 1.05960584, "balance_loss_mlp": 1.03064418, "epoch": 0.2735525762039319, "flos": 24389414547840.0, "grad_norm": 2.155984385483107, "language_loss": 0.84435165, "learning_rate": 3.4095238837127215e-06, "loss": 0.86659527, "num_input_tokens_seen": 48828770, "step": 2275, "time_per_iteration": 2.703373670578003 }, { "auxiliary_loss_clip": 0.01128684, "auxiliary_loss_mlp": 0.01041861, "balance_loss_clip": 1.04580092, "balance_loss_mlp": 1.02358603, "epoch": 0.27367281909457103, "flos": 14465357527680.0, "grad_norm": 2.029732296570765, "language_loss": 0.7961973, "learning_rate": 3.4089711401544355e-06, "loss": 0.81790268, "num_input_tokens_seen": 48846365, "step": 2276, "time_per_iteration": 2.7110695838928223 }, { "auxiliary_loss_clip": 0.01155094, "auxiliary_loss_mlp": 0.01053378, "balance_loss_clip": 1.05017173, "balance_loss_mlp": 1.03159881, "epoch": 0.27379306198521014, "flos": 23476996247040.0, "grad_norm": 3.200421536551335, "language_loss": 0.67676818, "learning_rate": 3.4084181828563486e-06, "loss": 0.6988529, "num_input_tokens_seen": 48863085, "step": 2277, "time_per_iteration": 3.645752429962158 }, { "auxiliary_loss_clip": 0.01115099, "auxiliary_loss_mlp": 0.01056162, "balance_loss_clip": 1.04881358, "balance_loss_mlp": 1.0374347, "epoch": 0.2739133048758492, "flos": 17458152762240.0, "grad_norm": 1.847994005298833, "language_loss": 0.70727414, "learning_rate": 3.4078650119023428e-06, "loss": 0.72898674, "num_input_tokens_seen": 48881400, "step": 2278, "time_per_iteration": 2.789236545562744 }, { "auxiliary_loss_clip": 0.0110766, "auxiliary_loss_mlp": 0.01060879, "balance_loss_clip": 1.04520953, "balance_loss_mlp": 1.03695369, "epoch": 0.2740335477664883, "flos": 19273113123840.0, "grad_norm": 2.212959655231877, "language_loss": 0.7406857, "learning_rate": 3.4073116273763337e-06, "loss": 0.76237106, "num_input_tokens_seen": 48895845, "step": 2279, "time_per_iteration": 2.7886714935302734 }, { "auxiliary_loss_clip": 0.01150352, "auxiliary_loss_mlp": 0.01062538, "balance_loss_clip": 1.05032229, "balance_loss_mlp": 1.0404129, "epoch": 0.2741537906571274, "flos": 26104723603200.0, "grad_norm": 2.3487672774345656, "language_loss": 0.81297648, "learning_rate": 3.40675802936227e-06, "loss": 0.8351053, "num_input_tokens_seen": 48916630, "step": 2280, "time_per_iteration": 2.738656520843506 }, { "auxiliary_loss_clip": 0.01140441, "auxiliary_loss_mlp": 0.01058023, "balance_loss_clip": 1.04945993, "balance_loss_mlp": 1.03462195, "epoch": 0.27427403354776647, "flos": 34164190644480.0, "grad_norm": 2.134202257522924, "language_loss": 0.72014838, "learning_rate": 3.4062042179441318e-06, "loss": 0.74213302, "num_input_tokens_seen": 48937100, "step": 2281, "time_per_iteration": 2.82364559173584 }, { "auxiliary_loss_clip": 0.01153699, "auxiliary_loss_mlp": 0.01047384, "balance_loss_clip": 1.05234396, "balance_loss_mlp": 1.0285008, "epoch": 0.2743942764384056, "flos": 18766988536320.0, "grad_norm": 2.067436442149842, "language_loss": 0.80492198, "learning_rate": 3.4056501932059314e-06, "loss": 0.82693279, "num_input_tokens_seen": 48955175, "step": 2282, "time_per_iteration": 3.5626401901245117 }, { "auxiliary_loss_clip": 0.01059502, "auxiliary_loss_mlp": 0.01005087, "balance_loss_clip": 1.01723015, "balance_loss_mlp": 1.00247657, "epoch": 0.2745145193290447, "flos": 64904048058240.0, "grad_norm": 0.7726415418197247, "language_loss": 0.58142948, "learning_rate": 3.405095955231715e-06, "loss": 0.60207546, "num_input_tokens_seen": 49006830, "step": 2283, "time_per_iteration": 3.1746504306793213 }, { "auxiliary_loss_clip": 0.01162009, "auxiliary_loss_mlp": 0.01046953, "balance_loss_clip": 1.05183816, "balance_loss_mlp": 1.02629375, "epoch": 0.27463476221968375, "flos": 16136926796160.0, "grad_norm": 3.388942125377841, "language_loss": 0.94106752, "learning_rate": 3.4045415041055585e-06, "loss": 0.96315712, "num_input_tokens_seen": 49022470, "step": 2284, "time_per_iteration": 2.647075891494751 }, { "auxiliary_loss_clip": 0.0114942, "auxiliary_loss_mlp": 0.01051714, "balance_loss_clip": 1.05188513, "balance_loss_mlp": 1.02951741, "epoch": 0.27475500511032286, "flos": 10376712213120.0, "grad_norm": 2.269802138081457, "language_loss": 0.7845124, "learning_rate": 3.4039868399115728e-06, "loss": 0.80652374, "num_input_tokens_seen": 49037110, "step": 2285, "time_per_iteration": 2.6510157585144043 }, { "auxiliary_loss_clip": 0.01113796, "auxiliary_loss_mlp": 0.01053066, "balance_loss_clip": 1.05025053, "balance_loss_mlp": 1.03195357, "epoch": 0.27487524800096197, "flos": 17311062568320.0, "grad_norm": 2.5691618483827585, "language_loss": 0.80190849, "learning_rate": 3.4034319627339003e-06, "loss": 0.82357705, "num_input_tokens_seen": 49053975, "step": 2286, "time_per_iteration": 2.745729684829712 }, { "auxiliary_loss_clip": 0.01148222, "auxiliary_loss_mlp": 0.01045447, "balance_loss_clip": 1.05347943, "balance_loss_mlp": 1.02329791, "epoch": 0.274995490891601, "flos": 27120205002240.0, "grad_norm": 2.6164750602407447, "language_loss": 0.69669056, "learning_rate": 3.402876872656715e-06, "loss": 0.71862721, "num_input_tokens_seen": 49072295, "step": 2287, "time_per_iteration": 2.742685317993164 }, { "auxiliary_loss_clip": 0.01146749, "auxiliary_loss_mlp": 0.01047587, "balance_loss_clip": 1.05445087, "balance_loss_mlp": 1.0273695, "epoch": 0.27511573378224013, "flos": 23436093634560.0, "grad_norm": 2.4424055415676746, "language_loss": 0.89491177, "learning_rate": 3.402321569764223e-06, "loss": 0.91685522, "num_input_tokens_seen": 49091600, "step": 2288, "time_per_iteration": 2.7738192081451416 }, { "auxiliary_loss_clip": 0.01124126, "auxiliary_loss_mlp": 0.00776905, "balance_loss_clip": 1.04819298, "balance_loss_mlp": 1.0006448, "epoch": 0.2752359766728792, "flos": 16722019434240.0, "grad_norm": 2.2235396632948703, "language_loss": 0.83644181, "learning_rate": 3.4017660541406635e-06, "loss": 0.85545218, "num_input_tokens_seen": 49107665, "step": 2289, "time_per_iteration": 2.732215166091919 }, { "auxiliary_loss_clip": 0.01144714, "auxiliary_loss_mlp": 0.01052644, "balance_loss_clip": 1.04839563, "balance_loss_mlp": 1.03186536, "epoch": 0.2753562195635183, "flos": 25297738698240.0, "grad_norm": 2.0691717596538814, "language_loss": 0.74403578, "learning_rate": 3.4012103258703092e-06, "loss": 0.76600933, "num_input_tokens_seen": 49126420, "step": 2290, "time_per_iteration": 2.8421833515167236 }, { "auxiliary_loss_clip": 0.01129867, "auxiliary_loss_mlp": 0.01052221, "balance_loss_clip": 1.05000103, "balance_loss_mlp": 1.03219414, "epoch": 0.2754764624541574, "flos": 27338972785920.0, "grad_norm": 1.9036244128616082, "language_loss": 0.82770383, "learning_rate": 3.4006543850374616e-06, "loss": 0.84952468, "num_input_tokens_seen": 49141470, "step": 2291, "time_per_iteration": 2.7507283687591553 }, { "auxiliary_loss_clip": 0.0115806, "auxiliary_loss_mlp": 0.01046322, "balance_loss_clip": 1.05033708, "balance_loss_mlp": 1.02709365, "epoch": 0.27559670534479647, "flos": 17238379397760.0, "grad_norm": 2.5997413615592095, "language_loss": 0.74784482, "learning_rate": 3.400098231726458e-06, "loss": 0.7698887, "num_input_tokens_seen": 49158570, "step": 2292, "time_per_iteration": 2.7036893367767334 }, { "auxiliary_loss_clip": 0.01134974, "auxiliary_loss_mlp": 0.01048486, "balance_loss_clip": 1.04869914, "balance_loss_mlp": 1.02622938, "epoch": 0.2757169482354356, "flos": 21939085486080.0, "grad_norm": 2.118577192026239, "language_loss": 0.87018633, "learning_rate": 3.3995418660216657e-06, "loss": 0.89202088, "num_input_tokens_seen": 49176025, "step": 2293, "time_per_iteration": 2.7897512912750244 }, { "auxiliary_loss_clip": 0.01178655, "auxiliary_loss_mlp": 0.01058135, "balance_loss_clip": 1.05612946, "balance_loss_mlp": 1.035676, "epoch": 0.2758371911260747, "flos": 20850669521280.0, "grad_norm": 2.210049623305375, "language_loss": 0.80720448, "learning_rate": 3.3989852880074848e-06, "loss": 0.82957238, "num_input_tokens_seen": 49197455, "step": 2294, "time_per_iteration": 2.6764280796051025 }, { "auxiliary_loss_clip": 0.01049047, "auxiliary_loss_mlp": 0.01023786, "balance_loss_clip": 1.03091431, "balance_loss_mlp": 1.01930392, "epoch": 0.27595743401671374, "flos": 69269063592960.0, "grad_norm": 0.7522065269171416, "language_loss": 0.6056782, "learning_rate": 3.398428497768348e-06, "loss": 0.62640655, "num_input_tokens_seen": 49262625, "step": 2295, "time_per_iteration": 3.3835251331329346 }, { "auxiliary_loss_clip": 0.01137607, "auxiliary_loss_mlp": 0.01049766, "balance_loss_clip": 1.04912508, "balance_loss_mlp": 1.02973843, "epoch": 0.27607767690735285, "flos": 21215019127680.0, "grad_norm": 1.9000454315670763, "language_loss": 0.7187252, "learning_rate": 3.3978714953887205e-06, "loss": 0.74059892, "num_input_tokens_seen": 49282380, "step": 2296, "time_per_iteration": 2.7491815090179443 }, { "auxiliary_loss_clip": 0.01108528, "auxiliary_loss_mlp": 0.01056303, "balance_loss_clip": 1.04539192, "balance_loss_mlp": 1.03499997, "epoch": 0.27619791979799196, "flos": 24825334003200.0, "grad_norm": 4.3011258385088675, "language_loss": 0.86010987, "learning_rate": 3.397314280953098e-06, "loss": 0.88175815, "num_input_tokens_seen": 49303205, "step": 2297, "time_per_iteration": 2.897944211959839 }, { "auxiliary_loss_clip": 0.01143364, "auxiliary_loss_mlp": 0.01056389, "balance_loss_clip": 1.05246806, "balance_loss_mlp": 1.03505087, "epoch": 0.276318162688631, "flos": 24753548672640.0, "grad_norm": 2.0411418613322465, "language_loss": 0.80119193, "learning_rate": 3.3967568545460108e-06, "loss": 0.82318944, "num_input_tokens_seen": 49322745, "step": 2298, "time_per_iteration": 2.6762046813964844 }, { "auxiliary_loss_clip": 0.01150819, "auxiliary_loss_mlp": 0.01051821, "balance_loss_clip": 1.05024755, "balance_loss_mlp": 1.03278327, "epoch": 0.27643840557927013, "flos": 18150007599360.0, "grad_norm": 3.144289553553304, "language_loss": 0.80773962, "learning_rate": 3.3961992162520185e-06, "loss": 0.82976604, "num_input_tokens_seen": 49341370, "step": 2299, "time_per_iteration": 3.6692111492156982 }, { "auxiliary_loss_clip": 0.01164053, "auxiliary_loss_mlp": 0.01057003, "balance_loss_clip": 1.05467832, "balance_loss_mlp": 1.03665423, "epoch": 0.27655864846990924, "flos": 24823933372800.0, "grad_norm": 2.31421562201722, "language_loss": 0.72295976, "learning_rate": 3.3956413661557156e-06, "loss": 0.74517035, "num_input_tokens_seen": 49361545, "step": 2300, "time_per_iteration": 2.6323232650756836 }, { "auxiliary_loss_clip": 0.01138116, "auxiliary_loss_mlp": 0.01053355, "balance_loss_clip": 1.05011284, "balance_loss_mlp": 1.0316236, "epoch": 0.2766788913605483, "flos": 20266582464000.0, "grad_norm": 4.823259890530801, "language_loss": 0.66083926, "learning_rate": 3.3950833043417273e-06, "loss": 0.68275398, "num_input_tokens_seen": 49379690, "step": 2301, "time_per_iteration": 2.7787158489227295 }, { "auxiliary_loss_clip": 0.01162841, "auxiliary_loss_mlp": 0.0106623, "balance_loss_clip": 1.05631804, "balance_loss_mlp": 1.04539275, "epoch": 0.2767991342511874, "flos": 21470272151040.0, "grad_norm": 2.357873981109303, "language_loss": 0.73501372, "learning_rate": 3.3945250308947105e-06, "loss": 0.75730443, "num_input_tokens_seen": 49395995, "step": 2302, "time_per_iteration": 2.6653828620910645 }, { "auxiliary_loss_clip": 0.0105526, "auxiliary_loss_mlp": 0.01064017, "balance_loss_clip": 1.02304566, "balance_loss_mlp": 1.06138217, "epoch": 0.2769193771418265, "flos": 66002627571840.0, "grad_norm": 1.2718200910438255, "language_loss": 0.68377167, "learning_rate": 3.3939665458993556e-06, "loss": 0.7049644, "num_input_tokens_seen": 49450415, "step": 2303, "time_per_iteration": 4.175766468048096 }, { "auxiliary_loss_clip": 0.01144584, "auxiliary_loss_mlp": 0.01060432, "balance_loss_clip": 1.0517149, "balance_loss_mlp": 1.04104865, "epoch": 0.27703962003246557, "flos": 20704441253760.0, "grad_norm": 3.292091334911594, "language_loss": 0.76981467, "learning_rate": 3.3934078494403843e-06, "loss": 0.79186481, "num_input_tokens_seen": 49469990, "step": 2304, "time_per_iteration": 2.717217445373535 }, { "auxiliary_loss_clip": 0.01089482, "auxiliary_loss_mlp": 0.00776364, "balance_loss_clip": 1.04607487, "balance_loss_mlp": 1.00072122, "epoch": 0.2771598629231047, "flos": 22929897219840.0, "grad_norm": 2.276724097635661, "language_loss": 0.81506813, "learning_rate": 3.3928489416025495e-06, "loss": 0.83372653, "num_input_tokens_seen": 49490835, "step": 2305, "time_per_iteration": 2.973511219024658 }, { "auxiliary_loss_clip": 0.01148786, "auxiliary_loss_mlp": 0.01054941, "balance_loss_clip": 1.05421603, "balance_loss_mlp": 1.03382885, "epoch": 0.27728010581374374, "flos": 18369457741440.0, "grad_norm": 4.473047028584793, "language_loss": 0.79429078, "learning_rate": 3.392289822470638e-06, "loss": 0.81632805, "num_input_tokens_seen": 49508815, "step": 2306, "time_per_iteration": 2.814077138900757 }, { "auxiliary_loss_clip": 0.01144341, "auxiliary_loss_mlp": 0.01056098, "balance_loss_clip": 1.0523448, "balance_loss_mlp": 1.03518867, "epoch": 0.27740034870438285, "flos": 19427637432960.0, "grad_norm": 2.23832736218816, "language_loss": 0.75487918, "learning_rate": 3.3917304921294674e-06, "loss": 0.7768836, "num_input_tokens_seen": 49526980, "step": 2307, "time_per_iteration": 2.7090096473693848 }, { "auxiliary_loss_clip": 0.01161862, "auxiliary_loss_mlp": 0.01049427, "balance_loss_clip": 1.0529635, "balance_loss_mlp": 1.02973318, "epoch": 0.27752059159502196, "flos": 21614776565760.0, "grad_norm": 1.9401228604555478, "language_loss": 0.80782127, "learning_rate": 3.3911709506638876e-06, "loss": 0.82993412, "num_input_tokens_seen": 49546290, "step": 2308, "time_per_iteration": 3.763765335083008 }, { "auxiliary_loss_clip": 0.01123732, "auxiliary_loss_mlp": 0.00777259, "balance_loss_clip": 1.04591465, "balance_loss_mlp": 1.00064445, "epoch": 0.277640834485661, "flos": 26608011016320.0, "grad_norm": 1.9906101842530992, "language_loss": 0.8104285, "learning_rate": 3.390611198158781e-06, "loss": 0.82943839, "num_input_tokens_seen": 49564165, "step": 2309, "time_per_iteration": 2.77022385597229 }, { "auxiliary_loss_clip": 0.01176681, "auxiliary_loss_mlp": 0.01051832, "balance_loss_clip": 1.0560658, "balance_loss_mlp": 1.03025544, "epoch": 0.2777610773763001, "flos": 19492814661120.0, "grad_norm": 8.905249045802583, "language_loss": 0.89925182, "learning_rate": 3.3900512346990612e-06, "loss": 0.92153692, "num_input_tokens_seen": 49580155, "step": 2310, "time_per_iteration": 2.6348648071289062 }, { "auxiliary_loss_clip": 0.01119011, "auxiliary_loss_mlp": 0.0105292, "balance_loss_clip": 1.04423833, "balance_loss_mlp": 1.03105712, "epoch": 0.27788132026693924, "flos": 38290650001920.0, "grad_norm": 1.9304635991266506, "language_loss": 0.66157734, "learning_rate": 3.389491060369674e-06, "loss": 0.68329668, "num_input_tokens_seen": 49605830, "step": 2311, "time_per_iteration": 2.8680455684661865 }, { "auxiliary_loss_clip": 0.01116255, "auxiliary_loss_mlp": 0.01043561, "balance_loss_clip": 1.04829347, "balance_loss_mlp": 1.02384412, "epoch": 0.2780015631575783, "flos": 22382546797440.0, "grad_norm": 2.697372162967087, "language_loss": 0.89835024, "learning_rate": 3.388930675255598e-06, "loss": 0.91994846, "num_input_tokens_seen": 49625680, "step": 2312, "time_per_iteration": 2.845043420791626 }, { "auxiliary_loss_clip": 0.01150603, "auxiliary_loss_mlp": 0.01050603, "balance_loss_clip": 1.05102825, "balance_loss_mlp": 1.02993202, "epoch": 0.2781218060482174, "flos": 12203200840320.0, "grad_norm": 2.20954719121036, "language_loss": 0.79359674, "learning_rate": 3.388370079441843e-06, "loss": 0.81560874, "num_input_tokens_seen": 49641195, "step": 2313, "time_per_iteration": 2.669322967529297 }, { "auxiliary_loss_clip": 0.01132628, "auxiliary_loss_mlp": 0.0106342, "balance_loss_clip": 1.05044985, "balance_loss_mlp": 1.04271364, "epoch": 0.2782420489388565, "flos": 18107632529280.0, "grad_norm": 2.356366444833387, "language_loss": 0.92913181, "learning_rate": 3.3878092730134505e-06, "loss": 0.95109224, "num_input_tokens_seen": 49659180, "step": 2314, "time_per_iteration": 2.694766044616699 }, { "auxiliary_loss_clip": 0.01150305, "auxiliary_loss_mlp": 0.01056101, "balance_loss_clip": 1.05105495, "balance_loss_mlp": 1.03633618, "epoch": 0.27836229182949557, "flos": 18514752255360.0, "grad_norm": 2.427191137910305, "language_loss": 0.80961406, "learning_rate": 3.3872482560554947e-06, "loss": 0.83167815, "num_input_tokens_seen": 49677955, "step": 2315, "time_per_iteration": 2.615368604660034 }, { "auxiliary_loss_clip": 0.01052967, "auxiliary_loss_mlp": 0.01011399, "balance_loss_clip": 1.02028811, "balance_loss_mlp": 1.00887191, "epoch": 0.2784825347201347, "flos": 67079230940160.0, "grad_norm": 0.8117719130124749, "language_loss": 0.57008755, "learning_rate": 3.386687028653082e-06, "loss": 0.59073126, "num_input_tokens_seen": 49740800, "step": 2316, "time_per_iteration": 3.2857937812805176 }, { "auxiliary_loss_clip": 0.01122016, "auxiliary_loss_mlp": 0.0105342, "balance_loss_clip": 1.05114388, "balance_loss_mlp": 1.03184366, "epoch": 0.2786027776107738, "flos": 22631119891200.0, "grad_norm": 1.8764848701598655, "language_loss": 0.8529405, "learning_rate": 3.386125590891349e-06, "loss": 0.87469482, "num_input_tokens_seen": 49757675, "step": 2317, "time_per_iteration": 2.7216100692749023 }, { "auxiliary_loss_clip": 0.01131872, "auxiliary_loss_mlp": 0.01046459, "balance_loss_clip": 1.04780531, "balance_loss_mlp": 1.02644336, "epoch": 0.27872302050141284, "flos": 15778826156160.0, "grad_norm": 2.420642567010155, "language_loss": 0.83196461, "learning_rate": 3.3855639428554657e-06, "loss": 0.85374796, "num_input_tokens_seen": 49775205, "step": 2318, "time_per_iteration": 2.6884055137634277 }, { "auxiliary_loss_clip": 0.01118531, "auxiliary_loss_mlp": 0.01047389, "balance_loss_clip": 1.04597497, "balance_loss_mlp": 1.02757668, "epoch": 0.27884326339205195, "flos": 22126970551680.0, "grad_norm": 1.9929632663005303, "language_loss": 0.80499303, "learning_rate": 3.385002084630635e-06, "loss": 0.82665217, "num_input_tokens_seen": 49794175, "step": 2319, "time_per_iteration": 2.7587690353393555 }, { "auxiliary_loss_clip": 0.01158028, "auxiliary_loss_mlp": 0.01049717, "balance_loss_clip": 1.05111265, "balance_loss_mlp": 1.02990401, "epoch": 0.278963506282691, "flos": 20558715776640.0, "grad_norm": 2.5377416514275803, "language_loss": 0.84960103, "learning_rate": 3.384440016302088e-06, "loss": 0.87167847, "num_input_tokens_seen": 49812850, "step": 2320, "time_per_iteration": 2.6652841567993164 }, { "auxiliary_loss_clip": 0.01153849, "auxiliary_loss_mlp": 0.01051702, "balance_loss_clip": 1.05143189, "balance_loss_mlp": 1.02970743, "epoch": 0.2790837491733301, "flos": 21942928241280.0, "grad_norm": 2.4640300074284975, "language_loss": 0.6273827, "learning_rate": 3.3838777379550923e-06, "loss": 0.6494382, "num_input_tokens_seen": 49832295, "step": 2321, "time_per_iteration": 2.7113637924194336 }, { "auxiliary_loss_clip": 0.01146, "auxiliary_loss_mlp": 0.01045682, "balance_loss_clip": 1.05056834, "balance_loss_mlp": 1.02572644, "epoch": 0.27920399206396923, "flos": 26286790665600.0, "grad_norm": 2.4203541698943836, "language_loss": 0.78767204, "learning_rate": 3.383315249674944e-06, "loss": 0.80958885, "num_input_tokens_seen": 49850860, "step": 2322, "time_per_iteration": 2.692425012588501 }, { "auxiliary_loss_clip": 0.0113715, "auxiliary_loss_mlp": 0.01046932, "balance_loss_clip": 1.04991281, "balance_loss_mlp": 1.02618957, "epoch": 0.2793242349546083, "flos": 25400981364480.0, "grad_norm": 3.021534505053916, "language_loss": 0.85845363, "learning_rate": 3.3827525515469715e-06, "loss": 0.88029444, "num_input_tokens_seen": 49865765, "step": 2323, "time_per_iteration": 2.8237195014953613 }, { "auxiliary_loss_clip": 0.01121359, "auxiliary_loss_mlp": 0.0104718, "balance_loss_clip": 1.04488981, "balance_loss_mlp": 1.02640188, "epoch": 0.2794444778452474, "flos": 20850346298880.0, "grad_norm": 1.9998012986238611, "language_loss": 0.71276236, "learning_rate": 3.3821896436565367e-06, "loss": 0.73444772, "num_input_tokens_seen": 49885425, "step": 2324, "time_per_iteration": 2.705504894256592 }, { "auxiliary_loss_clip": 0.01159617, "auxiliary_loss_mlp": 0.01047304, "balance_loss_clip": 1.05510509, "balance_loss_mlp": 1.02552438, "epoch": 0.2795647207358865, "flos": 21576244250880.0, "grad_norm": 2.2299882990548303, "language_loss": 0.70479727, "learning_rate": 3.381626526089032e-06, "loss": 0.72686648, "num_input_tokens_seen": 49904990, "step": 2325, "time_per_iteration": 3.636967658996582 }, { "auxiliary_loss_clip": 0.01138747, "auxiliary_loss_mlp": 0.01068477, "balance_loss_clip": 1.046821, "balance_loss_mlp": 1.044909, "epoch": 0.27968496362652556, "flos": 21471744608640.0, "grad_norm": 2.1078610693331483, "language_loss": 0.78891551, "learning_rate": 3.3810631989298815e-06, "loss": 0.81098777, "num_input_tokens_seen": 49924600, "step": 2326, "time_per_iteration": 2.6812474727630615 }, { "auxiliary_loss_clip": 0.01129915, "auxiliary_loss_mlp": 0.01057351, "balance_loss_clip": 1.05286586, "balance_loss_mlp": 1.03542888, "epoch": 0.2798052065171647, "flos": 23258695340160.0, "grad_norm": 2.264561722650664, "language_loss": 0.84171855, "learning_rate": 3.3804996622645423e-06, "loss": 0.86359125, "num_input_tokens_seen": 49942600, "step": 2327, "time_per_iteration": 2.75063157081604 }, { "auxiliary_loss_clip": 0.01168957, "auxiliary_loss_mlp": 0.01050888, "balance_loss_clip": 1.05367112, "balance_loss_mlp": 1.03126574, "epoch": 0.2799254494078038, "flos": 21539328048000.0, "grad_norm": 2.00844810271837, "language_loss": 0.89575601, "learning_rate": 3.3799359161785015e-06, "loss": 0.9179545, "num_input_tokens_seen": 49962250, "step": 2328, "time_per_iteration": 2.6735641956329346 }, { "auxiliary_loss_clip": 0.01157319, "auxiliary_loss_mlp": 0.01051644, "balance_loss_clip": 1.05251288, "balance_loss_mlp": 1.03110433, "epoch": 0.28004569229844284, "flos": 26393912000640.0, "grad_norm": 1.6409223448436463, "language_loss": 0.85609877, "learning_rate": 3.3793719607572798e-06, "loss": 0.87818837, "num_input_tokens_seen": 49983215, "step": 2329, "time_per_iteration": 3.6477372646331787 }, { "auxiliary_loss_clip": 0.01127652, "auxiliary_loss_mlp": 0.01052205, "balance_loss_clip": 1.04691792, "balance_loss_mlp": 1.03072405, "epoch": 0.28016593518908195, "flos": 33547676584320.0, "grad_norm": 3.3730771208280137, "language_loss": 0.76767153, "learning_rate": 3.378807796086428e-06, "loss": 0.78947008, "num_input_tokens_seen": 50006075, "step": 2330, "time_per_iteration": 2.7921106815338135 }, { "auxiliary_loss_clip": 0.01175159, "auxiliary_loss_mlp": 0.01055708, "balance_loss_clip": 1.05668974, "balance_loss_mlp": 1.03626537, "epoch": 0.28028617807972106, "flos": 15340823712000.0, "grad_norm": 2.030135466172166, "language_loss": 0.77005184, "learning_rate": 3.37824342225153e-06, "loss": 0.79236054, "num_input_tokens_seen": 50022495, "step": 2331, "time_per_iteration": 2.577950954437256 }, { "auxiliary_loss_clip": 0.01120139, "auxiliary_loss_mlp": 0.01044676, "balance_loss_clip": 1.04970813, "balance_loss_mlp": 1.02607918, "epoch": 0.2804064209703601, "flos": 25520277409920.0, "grad_norm": 1.8440036506024102, "language_loss": 0.7786665, "learning_rate": 3.3776788393382006e-06, "loss": 0.80031455, "num_input_tokens_seen": 50041975, "step": 2332, "time_per_iteration": 2.7631025314331055 }, { "auxiliary_loss_clip": 0.01172808, "auxiliary_loss_mlp": 0.01050129, "balance_loss_clip": 1.05525041, "balance_loss_mlp": 1.02943456, "epoch": 0.2805266638609992, "flos": 29351766280320.0, "grad_norm": 2.5676979353476597, "language_loss": 0.76622087, "learning_rate": 3.3771140474320872e-06, "loss": 0.78845024, "num_input_tokens_seen": 50061925, "step": 2333, "time_per_iteration": 2.697148084640503 }, { "auxiliary_loss_clip": 0.01144854, "auxiliary_loss_mlp": 0.01050524, "balance_loss_clip": 1.05436254, "balance_loss_mlp": 1.02873254, "epoch": 0.28064690675163834, "flos": 21463735875840.0, "grad_norm": 1.9047507002740485, "language_loss": 0.79476601, "learning_rate": 3.3765490466188664e-06, "loss": 0.81671983, "num_input_tokens_seen": 50079325, "step": 2334, "time_per_iteration": 3.634676218032837 }, { "auxiliary_loss_clip": 0.0112717, "auxiliary_loss_mlp": 0.01055783, "balance_loss_clip": 1.04765403, "balance_loss_mlp": 1.03320467, "epoch": 0.2807671496422774, "flos": 20995640812800.0, "grad_norm": 2.608262988973345, "language_loss": 0.73830128, "learning_rate": 3.3759838369842508e-06, "loss": 0.76013076, "num_input_tokens_seen": 50097400, "step": 2335, "time_per_iteration": 2.7510149478912354 }, { "auxiliary_loss_clip": 0.01136488, "auxiliary_loss_mlp": 0.01050809, "balance_loss_clip": 1.05268288, "balance_loss_mlp": 1.02975631, "epoch": 0.2808873925329165, "flos": 21506577822720.0, "grad_norm": 4.150386692169821, "language_loss": 0.7298758, "learning_rate": 3.375418418613981e-06, "loss": 0.75174874, "num_input_tokens_seen": 50116425, "step": 2336, "time_per_iteration": 2.7228519916534424 }, { "auxiliary_loss_clip": 0.01146833, "auxiliary_loss_mlp": 0.01048846, "balance_loss_clip": 1.05096745, "balance_loss_mlp": 1.02909255, "epoch": 0.28100763542355556, "flos": 16070815814400.0, "grad_norm": 3.6573079796589463, "language_loss": 0.83530265, "learning_rate": 3.374852791593831e-06, "loss": 0.85725945, "num_input_tokens_seen": 50132625, "step": 2337, "time_per_iteration": 2.7127180099487305 }, { "auxiliary_loss_clip": 0.01129191, "auxiliary_loss_mlp": 0.01046589, "balance_loss_clip": 1.0473485, "balance_loss_mlp": 1.02554822, "epoch": 0.28112787831419467, "flos": 19062605468160.0, "grad_norm": 3.1268100995157835, "language_loss": 0.54373419, "learning_rate": 3.374286956009605e-06, "loss": 0.56549203, "num_input_tokens_seen": 50151190, "step": 2338, "time_per_iteration": 2.738860607147217 }, { "auxiliary_loss_clip": 0.01158262, "auxiliary_loss_mlp": 0.01046406, "balance_loss_clip": 1.05634069, "balance_loss_mlp": 1.02547264, "epoch": 0.2812481212048338, "flos": 12823629482880.0, "grad_norm": 2.842831624487549, "language_loss": 0.74731499, "learning_rate": 3.3737209119471405e-06, "loss": 0.76936167, "num_input_tokens_seen": 50167700, "step": 2339, "time_per_iteration": 2.6492772102355957 }, { "auxiliary_loss_clip": 0.0116599, "auxiliary_loss_mlp": 0.01054732, "balance_loss_clip": 1.05613434, "balance_loss_mlp": 1.03330994, "epoch": 0.28136836409547283, "flos": 15633064765440.0, "grad_norm": 3.653351655274911, "language_loss": 0.63641858, "learning_rate": 3.373154659492306e-06, "loss": 0.65862578, "num_input_tokens_seen": 50185840, "step": 2340, "time_per_iteration": 2.5825278759002686 }, { "auxiliary_loss_clip": 0.01146958, "auxiliary_loss_mlp": 0.01044701, "balance_loss_clip": 1.05194509, "balance_loss_mlp": 1.02540112, "epoch": 0.28148860698611194, "flos": 19933726106880.0, "grad_norm": 2.0489741153096417, "language_loss": 0.85139543, "learning_rate": 3.3725881987310016e-06, "loss": 0.87331206, "num_input_tokens_seen": 50203375, "step": 2341, "time_per_iteration": 2.6891791820526123 }, { "auxiliary_loss_clip": 0.01141571, "auxiliary_loss_mlp": 0.01050773, "balance_loss_clip": 1.04871631, "balance_loss_mlp": 1.03138983, "epoch": 0.28160884987675106, "flos": 17457219008640.0, "grad_norm": 2.2681333935763606, "language_loss": 0.87808347, "learning_rate": 3.372021529749159e-06, "loss": 0.90000695, "num_input_tokens_seen": 50222435, "step": 2342, "time_per_iteration": 2.7112014293670654 }, { "auxiliary_loss_clip": 0.01107973, "auxiliary_loss_mlp": 0.01057331, "balance_loss_clip": 1.04884219, "balance_loss_mlp": 1.03642178, "epoch": 0.2817290927673901, "flos": 16834743290880.0, "grad_norm": 2.0764712520949704, "language_loss": 0.9215669, "learning_rate": 3.3714546526327405e-06, "loss": 0.9432199, "num_input_tokens_seen": 50240435, "step": 2343, "time_per_iteration": 2.7410123348236084 }, { "auxiliary_loss_clip": 0.01141577, "auxiliary_loss_mlp": 0.01047435, "balance_loss_clip": 1.04960907, "balance_loss_mlp": 1.02769446, "epoch": 0.2818493356580292, "flos": 15414081500160.0, "grad_norm": 2.1591429770523347, "language_loss": 0.8793807, "learning_rate": 3.3708875674677423e-06, "loss": 0.90127087, "num_input_tokens_seen": 50258410, "step": 2344, "time_per_iteration": 2.7935585975646973 }, { "auxiliary_loss_clip": 0.01151899, "auxiliary_loss_mlp": 0.01051326, "balance_loss_clip": 1.05313277, "balance_loss_mlp": 1.03128648, "epoch": 0.28196957854866833, "flos": 20412451595520.0, "grad_norm": 2.3569438747493985, "language_loss": 0.83683527, "learning_rate": 3.37032027434019e-06, "loss": 0.85886753, "num_input_tokens_seen": 50277930, "step": 2345, "time_per_iteration": 2.6787514686584473 }, { "auxiliary_loss_clip": 0.01169058, "auxiliary_loss_mlp": 0.01057349, "balance_loss_clip": 1.05400836, "balance_loss_mlp": 1.03496146, "epoch": 0.2820898214393074, "flos": 19973120348160.0, "grad_norm": 1.8329997526092248, "language_loss": 0.82600713, "learning_rate": 3.369752773336141e-06, "loss": 0.84827119, "num_input_tokens_seen": 50297410, "step": 2346, "time_per_iteration": 2.6957619190216064 }, { "auxiliary_loss_clip": 0.01145524, "auxiliary_loss_mlp": 0.01051933, "balance_loss_clip": 1.05020142, "balance_loss_mlp": 1.02887797, "epoch": 0.2822100643299465, "flos": 22528308188160.0, "grad_norm": 2.3147861415667985, "language_loss": 0.78487021, "learning_rate": 3.3691850645416864e-06, "loss": 0.80684477, "num_input_tokens_seen": 50317120, "step": 2347, "time_per_iteration": 2.7155418395996094 }, { "auxiliary_loss_clip": 0.01161501, "auxiliary_loss_mlp": 0.01056256, "balance_loss_clip": 1.05262315, "balance_loss_mlp": 1.03550172, "epoch": 0.2823303072205856, "flos": 11546682007680.0, "grad_norm": 2.2876620008625457, "language_loss": 0.83126462, "learning_rate": 3.368617148042945e-06, "loss": 0.85344219, "num_input_tokens_seen": 50334790, "step": 2348, "time_per_iteration": 2.680718421936035 }, { "auxiliary_loss_clip": 0.01140335, "auxiliary_loss_mlp": 0.01057954, "balance_loss_clip": 1.04844308, "balance_loss_mlp": 1.03375471, "epoch": 0.28245055011122466, "flos": 18259894281600.0, "grad_norm": 1.8323301824631273, "language_loss": 0.84677613, "learning_rate": 3.368049023926071e-06, "loss": 0.86875904, "num_input_tokens_seen": 50353785, "step": 2349, "time_per_iteration": 2.853503942489624 }, { "auxiliary_loss_clip": 0.01149891, "auxiliary_loss_mlp": 0.01045426, "balance_loss_clip": 1.05039322, "balance_loss_mlp": 1.02630436, "epoch": 0.2825707930018638, "flos": 24608110504320.0, "grad_norm": 1.733479729712723, "language_loss": 0.83771777, "learning_rate": 3.3674806922772476e-06, "loss": 0.85967088, "num_input_tokens_seen": 50374670, "step": 2350, "time_per_iteration": 2.730602741241455 }, { "auxiliary_loss_clip": 0.01137936, "auxiliary_loss_mlp": 0.01055406, "balance_loss_clip": 1.05081749, "balance_loss_mlp": 1.03363824, "epoch": 0.28269103589250283, "flos": 25226994862080.0, "grad_norm": 2.3557438730706095, "language_loss": 0.74822962, "learning_rate": 3.3669121531826904e-06, "loss": 0.77016306, "num_input_tokens_seen": 50395650, "step": 2351, "time_per_iteration": 3.8074631690979004 }, { "auxiliary_loss_clip": 0.01125791, "auxiliary_loss_mlp": 0.01048331, "balance_loss_clip": 1.04907179, "balance_loss_mlp": 1.02719474, "epoch": 0.28281127878314194, "flos": 19281552819840.0, "grad_norm": 2.5624016191124275, "language_loss": 0.83052337, "learning_rate": 3.366343406728647e-06, "loss": 0.85226458, "num_input_tokens_seen": 50415100, "step": 2352, "time_per_iteration": 2.772077798843384 }, { "auxiliary_loss_clip": 0.01149113, "auxiliary_loss_mlp": 0.01059207, "balance_loss_clip": 1.04755974, "balance_loss_mlp": 1.03786862, "epoch": 0.28293152167378105, "flos": 23878405710720.0, "grad_norm": 2.117129646120235, "language_loss": 0.68720424, "learning_rate": 3.3657744530013946e-06, "loss": 0.70928741, "num_input_tokens_seen": 50434335, "step": 2353, "time_per_iteration": 2.77225399017334 }, { "auxiliary_loss_clip": 0.01161899, "auxiliary_loss_mlp": 0.01046236, "balance_loss_clip": 1.05194902, "balance_loss_mlp": 1.02556539, "epoch": 0.2830517645644201, "flos": 43866965928960.0, "grad_norm": 1.905137448716512, "language_loss": 0.70795631, "learning_rate": 3.3652052920872437e-06, "loss": 0.73003769, "num_input_tokens_seen": 50457200, "step": 2354, "time_per_iteration": 2.8549606800079346 }, { "auxiliary_loss_clip": 0.01149416, "auxiliary_loss_mlp": 0.010575, "balance_loss_clip": 1.05375206, "balance_loss_mlp": 1.03550577, "epoch": 0.2831720074550592, "flos": 26651750803200.0, "grad_norm": 1.9193781504433962, "language_loss": 0.85470605, "learning_rate": 3.3646359240725355e-06, "loss": 0.87677521, "num_input_tokens_seen": 50476390, "step": 2355, "time_per_iteration": 4.659854412078857 }, { "auxiliary_loss_clip": 0.01153948, "auxiliary_loss_mlp": 0.00778414, "balance_loss_clip": 1.05199718, "balance_loss_mlp": 1.0005424, "epoch": 0.2832922503456983, "flos": 31029979564800.0, "grad_norm": 2.5689260788505086, "language_loss": 0.68006325, "learning_rate": 3.364066349043643e-06, "loss": 0.69938684, "num_input_tokens_seen": 50497595, "step": 2356, "time_per_iteration": 2.8252131938934326 }, { "auxiliary_loss_clip": 0.01139069, "auxiliary_loss_mlp": 0.01041696, "balance_loss_clip": 1.04844725, "balance_loss_mlp": 1.02220559, "epoch": 0.2834124932363374, "flos": 20405699838720.0, "grad_norm": 1.8371008745380295, "language_loss": 0.81947196, "learning_rate": 3.363496567086969e-06, "loss": 0.84127963, "num_input_tokens_seen": 50514690, "step": 2357, "time_per_iteration": 2.6716010570526123 }, { "auxiliary_loss_clip": 0.01168193, "auxiliary_loss_mlp": 0.01046906, "balance_loss_clip": 1.05314386, "balance_loss_mlp": 1.02621102, "epoch": 0.2835327361269765, "flos": 39384848056320.0, "grad_norm": 2.585493270045415, "language_loss": 0.75626934, "learning_rate": 3.3629265782889506e-06, "loss": 0.77842033, "num_input_tokens_seen": 50536515, "step": 2358, "time_per_iteration": 2.7364001274108887 }, { "auxiliary_loss_clip": 0.01126946, "auxiliary_loss_mlp": 0.01051402, "balance_loss_clip": 1.04613221, "balance_loss_mlp": 1.03201866, "epoch": 0.2836529790176156, "flos": 30261598801920.0, "grad_norm": 2.179504159862064, "language_loss": 0.71888, "learning_rate": 3.362356382736054e-06, "loss": 0.74066341, "num_input_tokens_seen": 50557120, "step": 2359, "time_per_iteration": 2.742805242538452 }, { "auxiliary_loss_clip": 0.01126256, "auxiliary_loss_mlp": 0.01052799, "balance_loss_clip": 1.04598927, "balance_loss_mlp": 1.03327227, "epoch": 0.28377322190825466, "flos": 12677796264960.0, "grad_norm": 31.480015855872526, "language_loss": 0.90907866, "learning_rate": 3.361785980514777e-06, "loss": 0.93086922, "num_input_tokens_seen": 50573320, "step": 2360, "time_per_iteration": 3.5105013847351074 }, { "auxiliary_loss_clip": 0.01099325, "auxiliary_loss_mlp": 0.01054737, "balance_loss_clip": 1.04615307, "balance_loss_mlp": 1.03363705, "epoch": 0.28389346479889377, "flos": 18296666830080.0, "grad_norm": 1.857640468596874, "language_loss": 0.7661522, "learning_rate": 3.361215371711649e-06, "loss": 0.78769279, "num_input_tokens_seen": 50592415, "step": 2361, "time_per_iteration": 2.671480417251587 }, { "auxiliary_loss_clip": 0.01124791, "auxiliary_loss_mlp": 0.01047618, "balance_loss_clip": 1.04734373, "balance_loss_mlp": 1.02742362, "epoch": 0.2840137076895329, "flos": 20406992728320.0, "grad_norm": 2.3448497959005894, "language_loss": 0.83729523, "learning_rate": 3.3606445564132326e-06, "loss": 0.85901928, "num_input_tokens_seen": 50609710, "step": 2362, "time_per_iteration": 2.598505735397339 }, { "auxiliary_loss_clip": 0.01169583, "auxiliary_loss_mlp": 0.00775488, "balance_loss_clip": 1.05422163, "balance_loss_mlp": 1.00056159, "epoch": 0.28413395058017193, "flos": 20048030161920.0, "grad_norm": 2.2727053734306946, "language_loss": 0.82349825, "learning_rate": 3.360073534706118e-06, "loss": 0.84294903, "num_input_tokens_seen": 50626865, "step": 2363, "time_per_iteration": 2.536546230316162 }, { "auxiliary_loss_clip": 0.01147381, "auxiliary_loss_mlp": 0.01052097, "balance_loss_clip": 1.05295086, "balance_loss_mlp": 1.03097284, "epoch": 0.28425419347081105, "flos": 37663613256960.0, "grad_norm": 2.174418271766171, "language_loss": 0.75776708, "learning_rate": 3.35950230667693e-06, "loss": 0.77976185, "num_input_tokens_seen": 50648560, "step": 2364, "time_per_iteration": 2.7987377643585205 }, { "auxiliary_loss_clip": 0.01158935, "auxiliary_loss_mlp": 0.01047959, "balance_loss_clip": 1.05275583, "balance_loss_mlp": 1.02689457, "epoch": 0.28437443636145016, "flos": 13845072539520.0, "grad_norm": 3.5517835204401944, "language_loss": 0.86493075, "learning_rate": 3.358930872412323e-06, "loss": 0.88699967, "num_input_tokens_seen": 50665725, "step": 2365, "time_per_iteration": 2.6218597888946533 }, { "auxiliary_loss_clip": 0.01156247, "auxiliary_loss_mlp": 0.01050838, "balance_loss_clip": 1.05431724, "balance_loss_mlp": 1.03153801, "epoch": 0.2844946792520892, "flos": 22747794243840.0, "grad_norm": 1.680024626949993, "language_loss": 0.81062055, "learning_rate": 3.3583592319989825e-06, "loss": 0.83269137, "num_input_tokens_seen": 50685095, "step": 2366, "time_per_iteration": 2.6299092769622803 }, { "auxiliary_loss_clip": 0.01168183, "auxiliary_loss_mlp": 0.0105696, "balance_loss_clip": 1.05613208, "balance_loss_mlp": 1.03533554, "epoch": 0.2846149221427283, "flos": 32415987709440.0, "grad_norm": 2.5859130254172213, "language_loss": 0.68962026, "learning_rate": 3.357787385523627e-06, "loss": 0.71187174, "num_input_tokens_seen": 50706500, "step": 2367, "time_per_iteration": 2.8162195682525635 }, { "auxiliary_loss_clip": 0.01116546, "auxiliary_loss_mlp": 0.01063044, "balance_loss_clip": 1.04842234, "balance_loss_mlp": 1.0408715, "epoch": 0.2847351650333674, "flos": 28475976873600.0, "grad_norm": 2.123233064103427, "language_loss": 0.82812208, "learning_rate": 3.3572153330730048e-06, "loss": 0.84991801, "num_input_tokens_seen": 50727595, "step": 2368, "time_per_iteration": 2.807873487472534 }, { "auxiliary_loss_clip": 0.01043989, "auxiliary_loss_mlp": 0.01011677, "balance_loss_clip": 1.02967048, "balance_loss_mlp": 1.00841033, "epoch": 0.2848554079240065, "flos": 55753399704960.0, "grad_norm": 0.8311297769932001, "language_loss": 0.64745879, "learning_rate": 3.3566430747338956e-06, "loss": 0.66801536, "num_input_tokens_seen": 50782800, "step": 2369, "time_per_iteration": 3.246250629425049 }, { "auxiliary_loss_clip": 0.01159, "auxiliary_loss_mlp": 0.01045806, "balance_loss_clip": 1.05251074, "balance_loss_mlp": 1.02498043, "epoch": 0.2849756508146456, "flos": 11836875985920.0, "grad_norm": 2.2435107933446674, "language_loss": 0.86714059, "learning_rate": 3.35607061059311e-06, "loss": 0.88918865, "num_input_tokens_seen": 50797730, "step": 2370, "time_per_iteration": 2.6382176876068115 }, { "auxiliary_loss_clip": 0.01168051, "auxiliary_loss_mlp": 0.01047128, "balance_loss_clip": 1.05541778, "balance_loss_mlp": 1.02680326, "epoch": 0.28509589370528465, "flos": 25155209531520.0, "grad_norm": 1.8885262543270815, "language_loss": 0.74645197, "learning_rate": 3.3554979407374917e-06, "loss": 0.7686038, "num_input_tokens_seen": 50819840, "step": 2371, "time_per_iteration": 2.805164337158203 }, { "auxiliary_loss_clip": 0.0115494, "auxiliary_loss_mlp": 0.0104764, "balance_loss_clip": 1.05073929, "balance_loss_mlp": 1.02744555, "epoch": 0.28521613659592376, "flos": 19974808287360.0, "grad_norm": 2.1840659519830714, "language_loss": 0.73527324, "learning_rate": 3.3549250652539134e-06, "loss": 0.75729907, "num_input_tokens_seen": 50838935, "step": 2372, "time_per_iteration": 2.6797356605529785 }, { "auxiliary_loss_clip": 0.01144706, "auxiliary_loss_mlp": 0.0106019, "balance_loss_clip": 1.04819894, "balance_loss_mlp": 1.03640747, "epoch": 0.2853363794865629, "flos": 23367971491200.0, "grad_norm": 1.8912814376176046, "language_loss": 0.81659734, "learning_rate": 3.3543519842292794e-06, "loss": 0.83864629, "num_input_tokens_seen": 50858590, "step": 2373, "time_per_iteration": 2.6874265670776367 }, { "auxiliary_loss_clip": 0.01169484, "auxiliary_loss_mlp": 0.00776151, "balance_loss_clip": 1.05519211, "balance_loss_mlp": 1.00058913, "epoch": 0.28545662237720193, "flos": 19861940776320.0, "grad_norm": 2.127653472456805, "language_loss": 0.83644235, "learning_rate": 3.353778697750527e-06, "loss": 0.85589868, "num_input_tokens_seen": 50876995, "step": 2374, "time_per_iteration": 2.685371160507202 }, { "auxiliary_loss_clip": 0.01134267, "auxiliary_loss_mlp": 0.01056011, "balance_loss_clip": 1.04769659, "balance_loss_mlp": 1.03512526, "epoch": 0.28557686526784104, "flos": 23879016241920.0, "grad_norm": 1.7474082966501059, "language_loss": 0.89503139, "learning_rate": 3.353205205904622e-06, "loss": 0.91693413, "num_input_tokens_seen": 50896105, "step": 2375, "time_per_iteration": 2.7183635234832764 }, { "auxiliary_loss_clip": 0.01150359, "auxiliary_loss_mlp": 0.01058174, "balance_loss_clip": 1.05210698, "balance_loss_mlp": 1.03545296, "epoch": 0.28569710815848015, "flos": 44890384233600.0, "grad_norm": 2.3091165149426165, "language_loss": 0.71997666, "learning_rate": 3.3526315087785637e-06, "loss": 0.74206203, "num_input_tokens_seen": 50917220, "step": 2376, "time_per_iteration": 2.9494380950927734 }, { "auxiliary_loss_clip": 0.01101597, "auxiliary_loss_mlp": 0.01049884, "balance_loss_clip": 1.04664207, "balance_loss_mlp": 1.02951097, "epoch": 0.2858173510491192, "flos": 26829759628800.0, "grad_norm": 1.989994068024539, "language_loss": 0.81017864, "learning_rate": 3.3520576064593805e-06, "loss": 0.83169341, "num_input_tokens_seen": 50937175, "step": 2377, "time_per_iteration": 3.9026939868927 }, { "auxiliary_loss_clip": 0.01164495, "auxiliary_loss_mlp": 0.01050049, "balance_loss_clip": 1.05490625, "balance_loss_mlp": 1.02937841, "epoch": 0.2859375939397583, "flos": 23148916398720.0, "grad_norm": 1.752465309076695, "language_loss": 0.81713057, "learning_rate": 3.3514834990341337e-06, "loss": 0.83927608, "num_input_tokens_seen": 50957500, "step": 2378, "time_per_iteration": 2.826185703277588 }, { "auxiliary_loss_clip": 0.01148243, "auxiliary_loss_mlp": 0.01048295, "balance_loss_clip": 1.05200088, "balance_loss_mlp": 1.02854156, "epoch": 0.2860578368303974, "flos": 12129799397760.0, "grad_norm": 5.224203409412885, "language_loss": 0.92977315, "learning_rate": 3.3509091865899144e-06, "loss": 0.95173854, "num_input_tokens_seen": 50972690, "step": 2379, "time_per_iteration": 2.726144552230835 }, { "auxiliary_loss_clip": 0.01176313, "auxiliary_loss_mlp": 0.01052897, "balance_loss_clip": 1.05830669, "balance_loss_mlp": 1.03164244, "epoch": 0.2861780797210365, "flos": 19938035738880.0, "grad_norm": 1.871484407406434, "language_loss": 0.70608079, "learning_rate": 3.350334669213846e-06, "loss": 0.72837287, "num_input_tokens_seen": 50990095, "step": 2380, "time_per_iteration": 3.6968162059783936 }, { "auxiliary_loss_clip": 0.01157018, "auxiliary_loss_mlp": 0.01040394, "balance_loss_clip": 1.05546188, "balance_loss_mlp": 1.02065301, "epoch": 0.2862983226116756, "flos": 27563127609600.0, "grad_norm": 2.1125193049372464, "language_loss": 0.75375181, "learning_rate": 3.3497599469930816e-06, "loss": 0.77572596, "num_input_tokens_seen": 51008305, "step": 2381, "time_per_iteration": 3.6156694889068604 }, { "auxiliary_loss_clip": 0.01170285, "auxiliary_loss_mlp": 0.01040826, "balance_loss_clip": 1.05400944, "balance_loss_mlp": 1.02190781, "epoch": 0.28641856550231465, "flos": 22053964158720.0, "grad_norm": 2.640038899586222, "language_loss": 0.8322171, "learning_rate": 3.349185020014807e-06, "loss": 0.85432816, "num_input_tokens_seen": 51025570, "step": 2382, "time_per_iteration": 2.684147357940674 }, { "auxiliary_loss_clip": 0.01161943, "auxiliary_loss_mlp": 0.01043748, "balance_loss_clip": 1.0539397, "balance_loss_mlp": 1.02330351, "epoch": 0.28653880839295376, "flos": 22378775869440.0, "grad_norm": 1.8851114691554707, "language_loss": 0.74285066, "learning_rate": 3.348609888366237e-06, "loss": 0.76490754, "num_input_tokens_seen": 51044585, "step": 2383, "time_per_iteration": 2.6736676692962646 }, { "auxiliary_loss_clip": 0.01098566, "auxiliary_loss_mlp": 0.01047128, "balance_loss_clip": 1.04650164, "balance_loss_mlp": 1.02675509, "epoch": 0.28665905128359287, "flos": 23367971491200.0, "grad_norm": 5.168579833488864, "language_loss": 0.62265956, "learning_rate": 3.348034552134619e-06, "loss": 0.64411652, "num_input_tokens_seen": 51063990, "step": 2384, "time_per_iteration": 2.860769510269165 }, { "auxiliary_loss_clip": 0.01114845, "auxiliary_loss_mlp": 0.01053432, "balance_loss_clip": 1.04916072, "balance_loss_mlp": 1.03208137, "epoch": 0.2867792941742319, "flos": 20881695893760.0, "grad_norm": 3.5051162346834914, "language_loss": 0.84705675, "learning_rate": 3.3474590114072316e-06, "loss": 0.86873955, "num_input_tokens_seen": 51081990, "step": 2385, "time_per_iteration": 2.8222179412841797 }, { "auxiliary_loss_clip": 0.0113267, "auxiliary_loss_mlp": 0.01042917, "balance_loss_clip": 1.05218768, "balance_loss_mlp": 1.0217098, "epoch": 0.28689953706487104, "flos": 20664005518080.0, "grad_norm": 1.9119573538362455, "language_loss": 0.83159339, "learning_rate": 3.3468832662713836e-06, "loss": 0.85334933, "num_input_tokens_seen": 51100235, "step": 2386, "time_per_iteration": 3.5351216793060303 }, { "auxiliary_loss_clip": 0.01134289, "auxiliary_loss_mlp": 0.01045425, "balance_loss_clip": 1.05306923, "balance_loss_mlp": 1.0269711, "epoch": 0.28701977995551015, "flos": 12675533708160.0, "grad_norm": 2.3834319518735247, "language_loss": 0.84203196, "learning_rate": 3.346307316814415e-06, "loss": 0.86382908, "num_input_tokens_seen": 51115405, "step": 2387, "time_per_iteration": 2.7610490322113037 }, { "auxiliary_loss_clip": 0.01158198, "auxiliary_loss_mlp": 0.01053868, "balance_loss_clip": 1.05593634, "balance_loss_mlp": 1.03133774, "epoch": 0.2871400228461492, "flos": 21252366293760.0, "grad_norm": 2.247175219450743, "language_loss": 0.75825191, "learning_rate": 3.3457311631236965e-06, "loss": 0.78037262, "num_input_tokens_seen": 51136390, "step": 2388, "time_per_iteration": 2.766594409942627 }, { "auxiliary_loss_clip": 0.01136828, "auxiliary_loss_mlp": 0.01052845, "balance_loss_clip": 1.05113864, "balance_loss_mlp": 1.03149486, "epoch": 0.2872602657367883, "flos": 25119262995840.0, "grad_norm": 1.92408430980879, "language_loss": 0.8476088, "learning_rate": 3.345154805286631e-06, "loss": 0.86950547, "num_input_tokens_seen": 51156650, "step": 2389, "time_per_iteration": 2.7771875858306885 }, { "auxiliary_loss_clip": 0.01152041, "auxiliary_loss_mlp": 0.01053202, "balance_loss_clip": 1.05178618, "balance_loss_mlp": 1.03061152, "epoch": 0.2873805086274274, "flos": 16646606830080.0, "grad_norm": 3.0940625744053607, "language_loss": 0.76307493, "learning_rate": 3.344578243390651e-06, "loss": 0.78512734, "num_input_tokens_seen": 51172210, "step": 2390, "time_per_iteration": 2.6873650550842285 }, { "auxiliary_loss_clip": 0.01140978, "auxiliary_loss_mlp": 0.01048624, "balance_loss_clip": 1.05066144, "balance_loss_mlp": 1.0280484, "epoch": 0.2875007515180665, "flos": 17420123237760.0, "grad_norm": 3.12364967055019, "language_loss": 0.78334713, "learning_rate": 3.3440014775232206e-06, "loss": 0.80524313, "num_input_tokens_seen": 51190265, "step": 2391, "time_per_iteration": 2.7134039402008057 }, { "auxiliary_loss_clip": 0.01131399, "auxiliary_loss_mlp": 0.01047819, "balance_loss_clip": 1.05183601, "balance_loss_mlp": 1.02592051, "epoch": 0.2876209944087056, "flos": 23434190213760.0, "grad_norm": 2.181245131118638, "language_loss": 0.71353412, "learning_rate": 3.343424507771834e-06, "loss": 0.73532629, "num_input_tokens_seen": 51208475, "step": 2392, "time_per_iteration": 2.8375930786132812 }, { "auxiliary_loss_clip": 0.01133093, "auxiliary_loss_mlp": 0.01049814, "balance_loss_clip": 1.05230045, "balance_loss_mlp": 1.02946496, "epoch": 0.2877412372993447, "flos": 13735509079680.0, "grad_norm": 1.7369742401664858, "language_loss": 0.86467314, "learning_rate": 3.342847334224018e-06, "loss": 0.88650227, "num_input_tokens_seen": 51225875, "step": 2393, "time_per_iteration": 2.7043631076812744 }, { "auxiliary_loss_clip": 0.01063458, "auxiliary_loss_mlp": 0.01016646, "balance_loss_clip": 1.03169274, "balance_loss_mlp": 1.01371336, "epoch": 0.28786148018998375, "flos": 58079695104000.0, "grad_norm": 0.9673874564241041, "language_loss": 0.62365884, "learning_rate": 3.342269956967329e-06, "loss": 0.6444599, "num_input_tokens_seen": 51287780, "step": 2394, "time_per_iteration": 3.3433656692504883 }, { "auxiliary_loss_clip": 0.01165023, "auxiliary_loss_mlp": 0.01045983, "balance_loss_clip": 1.05455661, "balance_loss_mlp": 1.02410841, "epoch": 0.28798172308062286, "flos": 23435052140160.0, "grad_norm": 2.826433359642448, "language_loss": 0.71464062, "learning_rate": 3.341692376089355e-06, "loss": 0.73675066, "num_input_tokens_seen": 51303335, "step": 2395, "time_per_iteration": 2.7457289695739746 }, { "auxiliary_loss_clip": 0.01154248, "auxiliary_loss_mlp": 0.01047131, "balance_loss_clip": 1.05238855, "balance_loss_mlp": 1.02629328, "epoch": 0.288101965971262, "flos": 25110033200640.0, "grad_norm": 3.857000110910892, "language_loss": 0.84637135, "learning_rate": 3.3411145916777146e-06, "loss": 0.8683852, "num_input_tokens_seen": 51317495, "step": 2396, "time_per_iteration": 2.85599946975708 }, { "auxiliary_loss_clip": 0.01135996, "auxiliary_loss_mlp": 0.01056404, "balance_loss_clip": 1.04955292, "balance_loss_mlp": 1.03569698, "epoch": 0.28822220886190103, "flos": 16252559654400.0, "grad_norm": 2.545654971233812, "language_loss": 0.90968961, "learning_rate": 3.3405366038200566e-06, "loss": 0.93161356, "num_input_tokens_seen": 51336430, "step": 2397, "time_per_iteration": 2.7156753540039062 }, { "auxiliary_loss_clip": 0.01153525, "auxiliary_loss_mlp": 0.01060461, "balance_loss_clip": 1.05826473, "balance_loss_mlp": 1.03915811, "epoch": 0.28834245175254014, "flos": 24535642815360.0, "grad_norm": 2.469738475126247, "language_loss": 0.84579492, "learning_rate": 3.3399584126040617e-06, "loss": 0.86793476, "num_input_tokens_seen": 51355930, "step": 2398, "time_per_iteration": 2.835986375808716 }, { "auxiliary_loss_clip": 0.0116957, "auxiliary_loss_mlp": 0.00775017, "balance_loss_clip": 1.05424953, "balance_loss_mlp": 1.0005753, "epoch": 0.2884626946431792, "flos": 24571445696640.0, "grad_norm": 2.6944119961686797, "language_loss": 0.90550637, "learning_rate": 3.339380018117441e-06, "loss": 0.92495227, "num_input_tokens_seen": 51376765, "step": 2399, "time_per_iteration": 2.900350570678711 }, { "auxiliary_loss_clip": 0.0115196, "auxiliary_loss_mlp": 0.01052683, "balance_loss_clip": 1.05464792, "balance_loss_mlp": 1.03303742, "epoch": 0.2885829375338183, "flos": 16544657053440.0, "grad_norm": 2.4142588402804512, "language_loss": 0.7831946, "learning_rate": 3.3388014204479366e-06, "loss": 0.80524099, "num_input_tokens_seen": 51394570, "step": 2400, "time_per_iteration": 2.679990530014038 }, { "auxiliary_loss_clip": 0.011755, "auxiliary_loss_mlp": 0.01046743, "balance_loss_clip": 1.0585655, "balance_loss_mlp": 1.02572644, "epoch": 0.2887031804244574, "flos": 24061226958720.0, "grad_norm": 2.3544830724587795, "language_loss": 0.91364038, "learning_rate": 3.338222619683321e-06, "loss": 0.93586278, "num_input_tokens_seen": 51414535, "step": 2401, "time_per_iteration": 2.7957844734191895 }, { "auxiliary_loss_clip": 0.01153009, "auxiliary_loss_mlp": 0.01053797, "balance_loss_clip": 1.05558586, "balance_loss_mlp": 1.03198159, "epoch": 0.2888234233150965, "flos": 23330696152320.0, "grad_norm": 8.015071523580978, "language_loss": 0.73611015, "learning_rate": 3.337643615911398e-06, "loss": 0.75817823, "num_input_tokens_seen": 51434160, "step": 2402, "time_per_iteration": 2.7056684494018555 }, { "auxiliary_loss_clip": 0.01159895, "auxiliary_loss_mlp": 0.01049332, "balance_loss_clip": 1.05481696, "balance_loss_mlp": 1.02798164, "epoch": 0.2889436662057356, "flos": 22272767856000.0, "grad_norm": 3.188225253408502, "language_loss": 0.7878356, "learning_rate": 3.3370644092200026e-06, "loss": 0.80992782, "num_input_tokens_seen": 51451435, "step": 2403, "time_per_iteration": 3.6966564655303955 }, { "auxiliary_loss_clip": 0.01123696, "auxiliary_loss_mlp": 0.0105701, "balance_loss_clip": 1.05215752, "balance_loss_mlp": 1.03507519, "epoch": 0.2890639090963747, "flos": 21616931381760.0, "grad_norm": 2.120290354365618, "language_loss": 0.78470224, "learning_rate": 3.3364849996969985e-06, "loss": 0.80650932, "num_input_tokens_seen": 51471455, "step": 2404, "time_per_iteration": 2.8764467239379883 }, { "auxiliary_loss_clip": 0.01160283, "auxiliary_loss_mlp": 0.01053417, "balance_loss_clip": 1.05712605, "balance_loss_mlp": 1.03423667, "epoch": 0.28918415198701375, "flos": 28585540333440.0, "grad_norm": 2.0275841109518233, "language_loss": 0.85505211, "learning_rate": 3.335905387430283e-06, "loss": 0.87718916, "num_input_tokens_seen": 51492890, "step": 2405, "time_per_iteration": 2.795382499694824 }, { "auxiliary_loss_clip": 0.01147469, "auxiliary_loss_mlp": 0.01049875, "balance_loss_clip": 1.0516789, "balance_loss_mlp": 1.02870393, "epoch": 0.28930439487765286, "flos": 21944688007680.0, "grad_norm": 1.7732028148785244, "language_loss": 0.83107519, "learning_rate": 3.335325572507782e-06, "loss": 0.85304862, "num_input_tokens_seen": 51513390, "step": 2406, "time_per_iteration": 2.802424192428589 }, { "auxiliary_loss_clip": 0.01171612, "auxiliary_loss_mlp": 0.00775985, "balance_loss_clip": 1.05833054, "balance_loss_mlp": 1.00054836, "epoch": 0.28942463776829197, "flos": 19281911955840.0, "grad_norm": 1.6566166072832904, "language_loss": 0.74172807, "learning_rate": 3.3347455550174537e-06, "loss": 0.76120406, "num_input_tokens_seen": 51532730, "step": 2407, "time_per_iteration": 5.2451066970825195 }, { "auxiliary_loss_clip": 0.0112865, "auxiliary_loss_mlp": 0.01049485, "balance_loss_clip": 1.04884005, "balance_loss_mlp": 1.02726412, "epoch": 0.289544880658931, "flos": 14645700737280.0, "grad_norm": 2.796487587066844, "language_loss": 0.68656975, "learning_rate": 3.3341653350472864e-06, "loss": 0.70835108, "num_input_tokens_seen": 51549560, "step": 2408, "time_per_iteration": 2.7987682819366455 }, { "auxiliary_loss_clip": 0.01186373, "auxiliary_loss_mlp": 0.01057094, "balance_loss_clip": 1.05905807, "balance_loss_mlp": 1.03300214, "epoch": 0.28966512354957014, "flos": 28621881918720.0, "grad_norm": 4.814626310584187, "language_loss": 0.6922102, "learning_rate": 3.333584912685298e-06, "loss": 0.71464485, "num_input_tokens_seen": 51568180, "step": 2409, "time_per_iteration": 2.8075613975524902 }, { "auxiliary_loss_clip": 0.01043303, "auxiliary_loss_mlp": 0.01002387, "balance_loss_clip": 1.03126144, "balance_loss_mlp": 0.99941844, "epoch": 0.28978536644020925, "flos": 64711784511360.0, "grad_norm": 0.9127123325710516, "language_loss": 0.55538833, "learning_rate": 3.3330042880195385e-06, "loss": 0.57584524, "num_input_tokens_seen": 51622530, "step": 2410, "time_per_iteration": 3.30709171295166 }, { "auxiliary_loss_clip": 0.01142519, "auxiliary_loss_mlp": 0.01043832, "balance_loss_clip": 1.05426967, "balance_loss_mlp": 1.0245564, "epoch": 0.2899056093308483, "flos": 18624638937600.0, "grad_norm": 1.7230387897033483, "language_loss": 0.78414404, "learning_rate": 3.3324234611380888e-06, "loss": 0.8060075, "num_input_tokens_seen": 51641260, "step": 2411, "time_per_iteration": 2.8146815299987793 }, { "auxiliary_loss_clip": 0.01126835, "auxiliary_loss_mlp": 0.010463, "balance_loss_clip": 1.05364144, "balance_loss_mlp": 1.02676153, "epoch": 0.2900258522214874, "flos": 22893735202560.0, "grad_norm": 2.366476863035214, "language_loss": 0.81843281, "learning_rate": 3.3318424321290596e-06, "loss": 0.84016412, "num_input_tokens_seen": 51660975, "step": 2412, "time_per_iteration": 3.639340400695801 }, { "auxiliary_loss_clip": 0.01035575, "auxiliary_loss_mlp": 0.01013112, "balance_loss_clip": 1.02761412, "balance_loss_mlp": 1.00996447, "epoch": 0.2901460951121265, "flos": 71106036013440.0, "grad_norm": 1.2245165201706827, "language_loss": 0.59900415, "learning_rate": 3.3312612010805917e-06, "loss": 0.61949104, "num_input_tokens_seen": 51720550, "step": 2413, "time_per_iteration": 3.3030717372894287 }, { "auxiliary_loss_clip": 0.01136311, "auxiliary_loss_mlp": 0.01049665, "balance_loss_clip": 1.05270433, "balance_loss_mlp": 1.02897072, "epoch": 0.2902663380027656, "flos": 32160986081280.0, "grad_norm": 2.1447734285742865, "language_loss": 0.70615357, "learning_rate": 3.330679768080858e-06, "loss": 0.72801334, "num_input_tokens_seen": 51744435, "step": 2414, "time_per_iteration": 2.813324451446533 }, { "auxiliary_loss_clip": 0.01155786, "auxiliary_loss_mlp": 0.01057594, "balance_loss_clip": 1.05451894, "balance_loss_mlp": 1.03819919, "epoch": 0.2903865808934047, "flos": 29351658539520.0, "grad_norm": 8.002553653923849, "language_loss": 0.83463585, "learning_rate": 3.3300981332180627e-06, "loss": 0.85676974, "num_input_tokens_seen": 51763640, "step": 2415, "time_per_iteration": 2.848191499710083 }, { "auxiliary_loss_clip": 0.0113708, "auxiliary_loss_mlp": 0.01052876, "balance_loss_clip": 1.05272007, "balance_loss_mlp": 1.03162074, "epoch": 0.29050682378404374, "flos": 17089026647040.0, "grad_norm": 2.034418814977364, "language_loss": 0.80457234, "learning_rate": 3.3295162965804373e-06, "loss": 0.82647192, "num_input_tokens_seen": 51782135, "step": 2416, "time_per_iteration": 2.7760345935821533 }, { "auxiliary_loss_clip": 0.01130221, "auxiliary_loss_mlp": 0.01051599, "balance_loss_clip": 1.05224943, "balance_loss_mlp": 1.03189361, "epoch": 0.29062706667468285, "flos": 17858233422720.0, "grad_norm": 2.8562885483454767, "language_loss": 0.78635418, "learning_rate": 3.328934258256247e-06, "loss": 0.80817246, "num_input_tokens_seen": 51800200, "step": 2417, "time_per_iteration": 2.768050193786621 }, { "auxiliary_loss_clip": 0.01157305, "auxiliary_loss_mlp": 0.01051729, "balance_loss_clip": 1.05320191, "balance_loss_mlp": 1.03075957, "epoch": 0.29074730956532197, "flos": 24279815174400.0, "grad_norm": 2.470046679982397, "language_loss": 0.66907728, "learning_rate": 3.3283520183337856e-06, "loss": 0.69116765, "num_input_tokens_seen": 51819905, "step": 2418, "time_per_iteration": 2.8707172870635986 }, { "auxiliary_loss_clip": 0.01143935, "auxiliary_loss_mlp": 0.01047019, "balance_loss_clip": 1.055161, "balance_loss_mlp": 1.02697992, "epoch": 0.290867552455961, "flos": 22340961826560.0, "grad_norm": 2.0778832215359753, "language_loss": 0.6941005, "learning_rate": 3.3277695769013797e-06, "loss": 0.71600997, "num_input_tokens_seen": 51839350, "step": 2419, "time_per_iteration": 2.9129726886749268 }, { "auxiliary_loss_clip": 0.01154706, "auxiliary_loss_mlp": 0.01060853, "balance_loss_clip": 1.05364895, "balance_loss_mlp": 1.0399555, "epoch": 0.29098779534660013, "flos": 23186155824000.0, "grad_norm": 2.4013118507164224, "language_loss": 0.77468216, "learning_rate": 3.327186934047385e-06, "loss": 0.79683769, "num_input_tokens_seen": 51858045, "step": 2420, "time_per_iteration": 2.8681342601776123 }, { "auxiliary_loss_clip": 0.01134581, "auxiliary_loss_mlp": 0.01051068, "balance_loss_clip": 1.04931211, "balance_loss_mlp": 1.03039706, "epoch": 0.29110803823723924, "flos": 15304194817920.0, "grad_norm": 2.9192776691750772, "language_loss": 0.65778208, "learning_rate": 3.3266040898601877e-06, "loss": 0.67963862, "num_input_tokens_seen": 51875880, "step": 2421, "time_per_iteration": 2.9091577529907227 }, { "auxiliary_loss_clip": 0.01109476, "auxiliary_loss_mlp": 0.01065982, "balance_loss_clip": 1.04786372, "balance_loss_mlp": 1.04371357, "epoch": 0.2912282811278783, "flos": 22595352923520.0, "grad_norm": 1.9085060755112202, "language_loss": 0.77833307, "learning_rate": 3.3260210444282045e-06, "loss": 0.80008757, "num_input_tokens_seen": 51893835, "step": 2422, "time_per_iteration": 2.880452871322632 }, { "auxiliary_loss_clip": 0.01151299, "auxiliary_loss_mlp": 0.01040474, "balance_loss_clip": 1.05351591, "balance_loss_mlp": 1.01949275, "epoch": 0.2913485240185174, "flos": 24497900599680.0, "grad_norm": 2.2194591604218137, "language_loss": 0.73375136, "learning_rate": 3.325437797839883e-06, "loss": 0.75566912, "num_input_tokens_seen": 51912205, "step": 2423, "time_per_iteration": 2.8847315311431885 }, { "auxiliary_loss_clip": 0.01175339, "auxiliary_loss_mlp": 0.01045153, "balance_loss_clip": 1.05806255, "balance_loss_mlp": 1.02445841, "epoch": 0.2914687669091565, "flos": 17931024334080.0, "grad_norm": 4.166896981692623, "language_loss": 0.74554765, "learning_rate": 3.3248543501837015e-06, "loss": 0.76775253, "num_input_tokens_seen": 51929410, "step": 2424, "time_per_iteration": 2.747042179107666 }, { "auxiliary_loss_clip": 0.01124932, "auxiliary_loss_mlp": 0.01046338, "balance_loss_clip": 1.05490315, "balance_loss_mlp": 1.0263828, "epoch": 0.2915890097997956, "flos": 22529313768960.0, "grad_norm": 2.1767070404917974, "language_loss": 0.77428776, "learning_rate": 3.3242707015481684e-06, "loss": 0.79600048, "num_input_tokens_seen": 51949345, "step": 2425, "time_per_iteration": 2.820485830307007 }, { "auxiliary_loss_clip": 0.01143426, "auxiliary_loss_mlp": 0.01045544, "balance_loss_clip": 1.05044532, "balance_loss_mlp": 1.02637529, "epoch": 0.2917092526904347, "flos": 13845216193920.0, "grad_norm": 1.925419521916142, "language_loss": 0.80457091, "learning_rate": 3.323686852021823e-06, "loss": 0.8264606, "num_input_tokens_seen": 51966855, "step": 2426, "time_per_iteration": 2.750706195831299 }, { "auxiliary_loss_clip": 0.0113312, "auxiliary_loss_mlp": 0.01046302, "balance_loss_clip": 1.04832625, "balance_loss_mlp": 1.02640617, "epoch": 0.2918294955810738, "flos": 22674859678080.0, "grad_norm": 2.0942047030673963, "language_loss": 0.79737872, "learning_rate": 3.323102801693235e-06, "loss": 0.81917298, "num_input_tokens_seen": 51985620, "step": 2427, "time_per_iteration": 2.7936692237854004 }, { "auxiliary_loss_clip": 0.01154616, "auxiliary_loss_mlp": 0.01063337, "balance_loss_clip": 1.0533452, "balance_loss_mlp": 1.04130697, "epoch": 0.29194973847171285, "flos": 23438284364160.0, "grad_norm": 2.7965877389040164, "language_loss": 0.80715621, "learning_rate": 3.322518550651003e-06, "loss": 0.82933581, "num_input_tokens_seen": 52004930, "step": 2428, "time_per_iteration": 2.7673232555389404 }, { "auxiliary_loss_clip": 0.01152035, "auxiliary_loss_mlp": 0.01050935, "balance_loss_clip": 1.05320919, "balance_loss_mlp": 1.030586, "epoch": 0.29206998136235196, "flos": 21909064694400.0, "grad_norm": 2.7641841065609616, "language_loss": 0.81418061, "learning_rate": 3.3219340989837586e-06, "loss": 0.83621025, "num_input_tokens_seen": 52024920, "step": 2429, "time_per_iteration": 3.688230514526367 }, { "auxiliary_loss_clip": 0.01143457, "auxiliary_loss_mlp": 0.01046975, "balance_loss_clip": 1.05100989, "balance_loss_mlp": 1.02800918, "epoch": 0.292190224252991, "flos": 23215925220480.0, "grad_norm": 2.036792072819414, "language_loss": 0.80666292, "learning_rate": 3.3213494467801625e-06, "loss": 0.82856721, "num_input_tokens_seen": 52044095, "step": 2430, "time_per_iteration": 2.7088356018066406 }, { "auxiliary_loss_clip": 0.01086431, "auxiliary_loss_mlp": 0.01047807, "balance_loss_clip": 1.03979182, "balance_loss_mlp": 1.02613485, "epoch": 0.2923104671436301, "flos": 20740818752640.0, "grad_norm": 2.2080301555409445, "language_loss": 0.71510839, "learning_rate": 3.3207645941289063e-06, "loss": 0.73645079, "num_input_tokens_seen": 52062440, "step": 2431, "time_per_iteration": 2.920485496520996 }, { "auxiliary_loss_clip": 0.01155093, "auxiliary_loss_mlp": 0.00774924, "balance_loss_clip": 1.05200183, "balance_loss_mlp": 1.00039649, "epoch": 0.29243071003426924, "flos": 35809114999680.0, "grad_norm": 2.675287857064476, "language_loss": 0.80377734, "learning_rate": 3.320179541118711e-06, "loss": 0.82307744, "num_input_tokens_seen": 52084940, "step": 2432, "time_per_iteration": 4.01216721534729 }, { "auxiliary_loss_clip": 0.01064758, "auxiliary_loss_mlp": 0.0100921, "balance_loss_clip": 1.03259254, "balance_loss_mlp": 1.00658786, "epoch": 0.2925509529249083, "flos": 58081598524800.0, "grad_norm": 1.0206067958736234, "language_loss": 0.60335821, "learning_rate": 3.3195942878383293e-06, "loss": 0.62409788, "num_input_tokens_seen": 52141040, "step": 2433, "time_per_iteration": 4.154206275939941 }, { "auxiliary_loss_clip": 0.01163418, "auxiliary_loss_mlp": 0.0104761, "balance_loss_clip": 1.05552101, "balance_loss_mlp": 1.02610481, "epoch": 0.2926711958155474, "flos": 21397122103680.0, "grad_norm": 2.066449487030179, "language_loss": 0.78029847, "learning_rate": 3.319008834376543e-06, "loss": 0.80240875, "num_input_tokens_seen": 52160730, "step": 2434, "time_per_iteration": 2.767174005508423 }, { "auxiliary_loss_clip": 0.01135308, "auxiliary_loss_mlp": 0.01051088, "balance_loss_clip": 1.04788232, "balance_loss_mlp": 1.02918935, "epoch": 0.2927914387061865, "flos": 23185796688000.0, "grad_norm": 2.4177501337243177, "language_loss": 0.88798988, "learning_rate": 3.3184231808221654e-06, "loss": 0.90985394, "num_input_tokens_seen": 52175055, "step": 2435, "time_per_iteration": 2.848109006881714 }, { "auxiliary_loss_clip": 0.01135658, "auxiliary_loss_mlp": 0.0104892, "balance_loss_clip": 1.05268407, "balance_loss_mlp": 1.02632928, "epoch": 0.29291168159682557, "flos": 22455553190400.0, "grad_norm": 2.199643446515838, "language_loss": 0.62435794, "learning_rate": 3.3178373272640394e-06, "loss": 0.6462037, "num_input_tokens_seen": 52194150, "step": 2436, "time_per_iteration": 2.7348134517669678 }, { "auxiliary_loss_clip": 0.01164692, "auxiliary_loss_mlp": 0.01045268, "balance_loss_clip": 1.05265439, "balance_loss_mlp": 1.02520561, "epoch": 0.2930319244874647, "flos": 21170632896000.0, "grad_norm": 2.3821932154529315, "language_loss": 0.85089743, "learning_rate": 3.3172512737910387e-06, "loss": 0.87299699, "num_input_tokens_seen": 52211660, "step": 2437, "time_per_iteration": 2.6387102603912354 }, { "auxiliary_loss_clip": 0.01155029, "auxiliary_loss_mlp": 0.01048189, "balance_loss_clip": 1.05106544, "balance_loss_mlp": 1.02643323, "epoch": 0.2931521673781038, "flos": 31357843931520.0, "grad_norm": 3.2937389096016405, "language_loss": 0.8849262, "learning_rate": 3.3166650204920674e-06, "loss": 0.9069584, "num_input_tokens_seen": 52232830, "step": 2438, "time_per_iteration": 3.6656410694122314 }, { "auxiliary_loss_clip": 0.01158273, "auxiliary_loss_mlp": 0.01051847, "balance_loss_clip": 1.05343652, "balance_loss_mlp": 1.03010321, "epoch": 0.29327241026874284, "flos": 24200990778240.0, "grad_norm": 1.7874668832068825, "language_loss": 0.81430924, "learning_rate": 3.316078567456059e-06, "loss": 0.83641046, "num_input_tokens_seen": 52250670, "step": 2439, "time_per_iteration": 2.7113072872161865 }, { "auxiliary_loss_clip": 0.01109989, "auxiliary_loss_mlp": 0.01051054, "balance_loss_clip": 1.04837346, "balance_loss_mlp": 1.02988243, "epoch": 0.29339265315938196, "flos": 24242611662720.0, "grad_norm": 1.6189348141465294, "language_loss": 0.75573856, "learning_rate": 3.3154919147719786e-06, "loss": 0.777349, "num_input_tokens_seen": 52271685, "step": 2440, "time_per_iteration": 2.763535976409912 }, { "auxiliary_loss_clip": 0.01156903, "auxiliary_loss_mlp": 0.01058044, "balance_loss_clip": 1.05292559, "balance_loss_mlp": 1.03615713, "epoch": 0.29351289605002107, "flos": 16946641134720.0, "grad_norm": 2.372974963572577, "language_loss": 0.86380994, "learning_rate": 3.31490506252882e-06, "loss": 0.88595939, "num_input_tokens_seen": 52291065, "step": 2441, "time_per_iteration": 2.7055654525756836 }, { "auxiliary_loss_clip": 0.01119764, "auxiliary_loss_mlp": 0.01046731, "balance_loss_clip": 1.04654896, "balance_loss_mlp": 1.02665603, "epoch": 0.2936331389406601, "flos": 19829082810240.0, "grad_norm": 2.087626205604625, "language_loss": 0.83899432, "learning_rate": 3.31431801081561e-06, "loss": 0.8606593, "num_input_tokens_seen": 52310000, "step": 2442, "time_per_iteration": 2.8045859336853027 }, { "auxiliary_loss_clip": 0.01031178, "auxiliary_loss_mlp": 0.01002562, "balance_loss_clip": 1.01483309, "balance_loss_mlp": 1.00019014, "epoch": 0.29375338183129923, "flos": 71416844398080.0, "grad_norm": 1.042869767178447, "language_loss": 0.67948151, "learning_rate": 3.313730759721402e-06, "loss": 0.69981897, "num_input_tokens_seen": 52372930, "step": 2443, "time_per_iteration": 3.367961883544922 }, { "auxiliary_loss_clip": 0.01136965, "auxiliary_loss_mlp": 0.01049761, "balance_loss_clip": 1.0483129, "balance_loss_mlp": 1.0292685, "epoch": 0.29387362472193834, "flos": 22054502862720.0, "grad_norm": 2.2569822115348592, "language_loss": 0.86471814, "learning_rate": 3.313143309335282e-06, "loss": 0.88658541, "num_input_tokens_seen": 52391420, "step": 2444, "time_per_iteration": 2.830517292022705 }, { "auxiliary_loss_clip": 0.01126285, "auxiliary_loss_mlp": 0.01052459, "balance_loss_clip": 1.05007231, "balance_loss_mlp": 1.03311133, "epoch": 0.2939938676125774, "flos": 22966418373120.0, "grad_norm": 2.4933392363561717, "language_loss": 0.85196805, "learning_rate": 3.3125556597463665e-06, "loss": 0.87375551, "num_input_tokens_seen": 52410725, "step": 2445, "time_per_iteration": 2.701253652572632 }, { "auxiliary_loss_clip": 0.01155518, "auxiliary_loss_mlp": 0.01049952, "balance_loss_clip": 1.05477023, "balance_loss_mlp": 1.02990079, "epoch": 0.2941141105032165, "flos": 31358705857920.0, "grad_norm": 1.7228387811836101, "language_loss": 0.66938019, "learning_rate": 3.311967811043801e-06, "loss": 0.69143486, "num_input_tokens_seen": 52432645, "step": 2446, "time_per_iteration": 2.755540609359741 }, { "auxiliary_loss_clip": 0.01158609, "auxiliary_loss_mlp": 0.01059096, "balance_loss_clip": 1.05617952, "balance_loss_mlp": 1.03978443, "epoch": 0.29423435339385556, "flos": 23222138273280.0, "grad_norm": 4.6236884335746735, "language_loss": 0.81454575, "learning_rate": 3.3113797633167617e-06, "loss": 0.83672279, "num_input_tokens_seen": 52450940, "step": 2447, "time_per_iteration": 2.665902614593506 }, { "auxiliary_loss_clip": 0.01171858, "auxiliary_loss_mlp": 0.01053597, "balance_loss_clip": 1.05656552, "balance_loss_mlp": 1.03451204, "epoch": 0.2943545962844947, "flos": 26864054138880.0, "grad_norm": 3.609566350955359, "language_loss": 0.6953612, "learning_rate": 3.310791516654455e-06, "loss": 0.71761572, "num_input_tokens_seen": 52468000, "step": 2448, "time_per_iteration": 2.7764759063720703 }, { "auxiliary_loss_clip": 0.01139135, "auxiliary_loss_mlp": 0.01055998, "balance_loss_clip": 1.05183077, "balance_loss_mlp": 1.03302598, "epoch": 0.2944748391751338, "flos": 20231677422720.0, "grad_norm": 2.2205330223989894, "language_loss": 0.79437989, "learning_rate": 3.3102030711461177e-06, "loss": 0.81633121, "num_input_tokens_seen": 52487575, "step": 2449, "time_per_iteration": 2.882816791534424 }, { "auxiliary_loss_clip": 0.01137144, "auxiliary_loss_mlp": 0.01047032, "balance_loss_clip": 1.05195534, "balance_loss_mlp": 1.02602744, "epoch": 0.29459508206577284, "flos": 15960965045760.0, "grad_norm": 2.03006763140756, "language_loss": 0.67835599, "learning_rate": 3.3096144268810156e-06, "loss": 0.70019782, "num_input_tokens_seen": 52506335, "step": 2450, "time_per_iteration": 2.734628438949585 }, { "auxiliary_loss_clip": 0.01149688, "auxiliary_loss_mlp": 0.01049549, "balance_loss_clip": 1.05245686, "balance_loss_mlp": 1.02760231, "epoch": 0.29471532495641195, "flos": 20412882558720.0, "grad_norm": 2.794763078566964, "language_loss": 0.72686112, "learning_rate": 3.3090255839484462e-06, "loss": 0.74885356, "num_input_tokens_seen": 52524330, "step": 2451, "time_per_iteration": 2.7057688236236572 }, { "auxiliary_loss_clip": 0.01145013, "auxiliary_loss_mlp": 0.01049536, "balance_loss_clip": 1.05002189, "balance_loss_mlp": 1.02829266, "epoch": 0.29483556784705106, "flos": 20376576887040.0, "grad_norm": 2.000972493496323, "language_loss": 0.85437727, "learning_rate": 3.3084365424377366e-06, "loss": 0.87632275, "num_input_tokens_seen": 52543095, "step": 2452, "time_per_iteration": 2.7034242153167725 }, { "auxiliary_loss_clip": 0.01034746, "auxiliary_loss_mlp": 0.01047237, "balance_loss_clip": 1.03389394, "balance_loss_mlp": 1.0441618, "epoch": 0.2949558107376901, "flos": 68555660595840.0, "grad_norm": 0.7360923243945813, "language_loss": 0.55926794, "learning_rate": 3.307847302438245e-06, "loss": 0.58008778, "num_input_tokens_seen": 52597075, "step": 2453, "time_per_iteration": 3.266730308532715 }, { "auxiliary_loss_clip": 0.01099739, "auxiliary_loss_mlp": 0.01064702, "balance_loss_clip": 1.04234862, "balance_loss_mlp": 1.04050279, "epoch": 0.2950760536283292, "flos": 16107085572480.0, "grad_norm": 2.985038519824887, "language_loss": 0.77816921, "learning_rate": 3.3072578640393562e-06, "loss": 0.79981363, "num_input_tokens_seen": 52614410, "step": 2454, "time_per_iteration": 2.844655752182007 }, { "auxiliary_loss_clip": 0.01141392, "auxiliary_loss_mlp": 0.01059382, "balance_loss_clip": 1.04949331, "balance_loss_mlp": 1.03813875, "epoch": 0.29519629651896834, "flos": 20483626394880.0, "grad_norm": 1.8326850846913436, "language_loss": 0.79474008, "learning_rate": 3.3066682273304886e-06, "loss": 0.81674778, "num_input_tokens_seen": 52632055, "step": 2455, "time_per_iteration": 3.680793285369873 }, { "auxiliary_loss_clip": 0.01160066, "auxiliary_loss_mlp": 0.00777222, "balance_loss_clip": 1.05103278, "balance_loss_mlp": 1.00045657, "epoch": 0.2953165394096074, "flos": 18916484941440.0, "grad_norm": 2.077440782278872, "language_loss": 0.79087615, "learning_rate": 3.3060783924010904e-06, "loss": 0.81024903, "num_input_tokens_seen": 52649980, "step": 2456, "time_per_iteration": 2.7570483684539795 }, { "auxiliary_loss_clip": 0.01131252, "auxiliary_loss_mlp": 0.01057184, "balance_loss_clip": 1.04832387, "balance_loss_mlp": 1.03552413, "epoch": 0.2954367823002465, "flos": 20624467622400.0, "grad_norm": 2.5059816653342883, "language_loss": 0.84844005, "learning_rate": 3.3054883593406387e-06, "loss": 0.87032443, "num_input_tokens_seen": 52664730, "step": 2457, "time_per_iteration": 2.794348955154419 }, { "auxiliary_loss_clip": 0.0114303, "auxiliary_loss_mlp": 0.01046955, "balance_loss_clip": 1.05069971, "balance_loss_mlp": 1.02683282, "epoch": 0.2955570251908856, "flos": 31175525473920.0, "grad_norm": 2.273009076263368, "language_loss": 0.65104496, "learning_rate": 3.3048981282386404e-06, "loss": 0.67294478, "num_input_tokens_seen": 52686040, "step": 2458, "time_per_iteration": 4.347769498825073 }, { "auxiliary_loss_clip": 0.01120232, "auxiliary_loss_mlp": 0.01057676, "balance_loss_clip": 1.04834604, "balance_loss_mlp": 1.03593254, "epoch": 0.29567726808152467, "flos": 21650328051840.0, "grad_norm": 4.083380478405425, "language_loss": 0.82650435, "learning_rate": 3.304307699184634e-06, "loss": 0.84828341, "num_input_tokens_seen": 52704630, "step": 2459, "time_per_iteration": 3.8383233547210693 }, { "auxiliary_loss_clip": 0.01140547, "auxiliary_loss_mlp": 0.01056095, "balance_loss_clip": 1.0510993, "balance_loss_mlp": 1.03575802, "epoch": 0.2957975109721638, "flos": 24243868638720.0, "grad_norm": 2.085448069431787, "language_loss": 0.78952277, "learning_rate": 3.3037170722681866e-06, "loss": 0.81148916, "num_input_tokens_seen": 52725465, "step": 2460, "time_per_iteration": 2.929508924484253 }, { "auxiliary_loss_clip": 0.01121812, "auxiliary_loss_mlp": 0.01048468, "balance_loss_clip": 1.04698372, "balance_loss_mlp": 1.02642632, "epoch": 0.29591775386280283, "flos": 13479717352320.0, "grad_norm": 2.4610609168459434, "language_loss": 0.67994988, "learning_rate": 3.3031262475788956e-06, "loss": 0.70165265, "num_input_tokens_seen": 52742405, "step": 2461, "time_per_iteration": 2.854872941970825 }, { "auxiliary_loss_clip": 0.01137199, "auxiliary_loss_mlp": 0.01056981, "balance_loss_clip": 1.04654694, "balance_loss_mlp": 1.03583288, "epoch": 0.29603799675344195, "flos": 17749783284480.0, "grad_norm": 1.8555900074363665, "language_loss": 0.7323277, "learning_rate": 3.3025352252063897e-06, "loss": 0.75426948, "num_input_tokens_seen": 52761100, "step": 2462, "time_per_iteration": 2.6943466663360596 }, { "auxiliary_loss_clip": 0.01153369, "auxiliary_loss_mlp": 0.01054803, "balance_loss_clip": 1.05208588, "balance_loss_mlp": 1.03449011, "epoch": 0.29615823964408106, "flos": 22783920347520.0, "grad_norm": 1.9718070318031722, "language_loss": 0.7518844, "learning_rate": 3.3019440052403252e-06, "loss": 0.77396613, "num_input_tokens_seen": 52780965, "step": 2463, "time_per_iteration": 2.746184825897217 }, { "auxiliary_loss_clip": 0.0114435, "auxiliary_loss_mlp": 0.01051442, "balance_loss_clip": 1.05086565, "balance_loss_mlp": 1.02951932, "epoch": 0.2962784825347201, "flos": 23514199758720.0, "grad_norm": 1.9047441323975196, "language_loss": 0.70608664, "learning_rate": 3.30135258777039e-06, "loss": 0.72804451, "num_input_tokens_seen": 52800335, "step": 2464, "time_per_iteration": 3.6251604557037354 }, { "auxiliary_loss_clip": 0.01158361, "auxiliary_loss_mlp": 0.00776706, "balance_loss_clip": 1.04993081, "balance_loss_mlp": 1.00052595, "epoch": 0.2963987254253592, "flos": 16362769559040.0, "grad_norm": 3.4776936802212406, "language_loss": 0.7067523, "learning_rate": 3.3007609728863024e-06, "loss": 0.72610295, "num_input_tokens_seen": 52818425, "step": 2465, "time_per_iteration": 2.6012980937957764 }, { "auxiliary_loss_clip": 0.01099343, "auxiliary_loss_mlp": 0.01048607, "balance_loss_clip": 1.04604852, "balance_loss_mlp": 1.02726841, "epoch": 0.29651896831599833, "flos": 33472263980160.0, "grad_norm": 1.8636254810791217, "language_loss": 0.72941059, "learning_rate": 3.300169160677809e-06, "loss": 0.75089002, "num_input_tokens_seen": 52842340, "step": 2466, "time_per_iteration": 2.8788015842437744 }, { "auxiliary_loss_clip": 0.01138385, "auxiliary_loss_mlp": 0.01051722, "balance_loss_clip": 1.05052543, "balance_loss_mlp": 1.03078914, "epoch": 0.2966392112066374, "flos": 23805363404160.0, "grad_norm": 3.1477061200298375, "language_loss": 0.77830118, "learning_rate": 3.2995771512346878e-06, "loss": 0.80020225, "num_input_tokens_seen": 52860690, "step": 2467, "time_per_iteration": 2.7111120223999023 }, { "auxiliary_loss_clip": 0.01173046, "auxiliary_loss_mlp": 0.00777416, "balance_loss_clip": 1.05419111, "balance_loss_mlp": 1.00052285, "epoch": 0.2967594540972765, "flos": 19938466702080.0, "grad_norm": 3.2809890550639396, "language_loss": 0.73143816, "learning_rate": 3.298984944646746e-06, "loss": 0.75094277, "num_input_tokens_seen": 52879370, "step": 2468, "time_per_iteration": 2.5657505989074707 }, { "auxiliary_loss_clip": 0.01156837, "auxiliary_loss_mlp": 0.00775275, "balance_loss_clip": 1.05435801, "balance_loss_mlp": 1.00047302, "epoch": 0.2968796969879156, "flos": 23732823888000.0, "grad_norm": 1.9479875672333777, "language_loss": 0.81506252, "learning_rate": 3.298392541003822e-06, "loss": 0.83438361, "num_input_tokens_seen": 52898775, "step": 2469, "time_per_iteration": 2.6327407360076904 }, { "auxiliary_loss_clip": 0.01140913, "auxiliary_loss_mlp": 0.01045168, "balance_loss_clip": 1.0507493, "balance_loss_mlp": 1.02431798, "epoch": 0.29699993987855466, "flos": 22893699288960.0, "grad_norm": 1.7341686553754432, "language_loss": 0.89616281, "learning_rate": 3.2977999403957806e-06, "loss": 0.91802359, "num_input_tokens_seen": 52917535, "step": 2470, "time_per_iteration": 2.6567118167877197 }, { "auxiliary_loss_clip": 0.0117028, "auxiliary_loss_mlp": 0.01051508, "balance_loss_clip": 1.05488288, "balance_loss_mlp": 1.0290246, "epoch": 0.2971201827691938, "flos": 33832555349760.0, "grad_norm": 2.1968663344189814, "language_loss": 0.67250198, "learning_rate": 3.2972071429125207e-06, "loss": 0.69471985, "num_input_tokens_seen": 52938755, "step": 2471, "time_per_iteration": 2.7191240787506104 }, { "auxiliary_loss_clip": 0.01127978, "auxiliary_loss_mlp": 0.01048661, "balance_loss_clip": 1.04701006, "balance_loss_mlp": 1.02759683, "epoch": 0.2972404256598329, "flos": 22054359208320.0, "grad_norm": 2.0313765371106123, "language_loss": 0.88089287, "learning_rate": 3.2966141486439682e-06, "loss": 0.9026593, "num_input_tokens_seen": 52957945, "step": 2472, "time_per_iteration": 2.6723928451538086 }, { "auxiliary_loss_clip": 0.01106516, "auxiliary_loss_mlp": 0.01050336, "balance_loss_clip": 1.04547548, "balance_loss_mlp": 1.02847314, "epoch": 0.29736066855047194, "flos": 31978595796480.0, "grad_norm": 2.20261717366861, "language_loss": 0.64318037, "learning_rate": 3.29602095768008e-06, "loss": 0.66474891, "num_input_tokens_seen": 52978460, "step": 2473, "time_per_iteration": 2.8040931224823 }, { "auxiliary_loss_clip": 0.01133174, "auxiliary_loss_mlp": 0.01051868, "balance_loss_clip": 1.04785347, "balance_loss_mlp": 1.0316025, "epoch": 0.29748091144111105, "flos": 33510401245440.0, "grad_norm": 2.1388782582551364, "language_loss": 0.6375013, "learning_rate": 3.2954275701108437e-06, "loss": 0.65935171, "num_input_tokens_seen": 52999640, "step": 2474, "time_per_iteration": 2.704301357269287 }, { "auxiliary_loss_clip": 0.01112265, "auxiliary_loss_mlp": 0.01061233, "balance_loss_clip": 1.04489112, "balance_loss_mlp": 1.03714037, "epoch": 0.29760115433175016, "flos": 41283373409280.0, "grad_norm": 2.1489742428821956, "language_loss": 0.68751001, "learning_rate": 3.294833986026275e-06, "loss": 0.70924497, "num_input_tokens_seen": 53022880, "step": 2475, "time_per_iteration": 2.8608932495117188 }, { "auxiliary_loss_clip": 0.01122198, "auxiliary_loss_mlp": 0.01046876, "balance_loss_clip": 1.04814959, "balance_loss_mlp": 1.02663457, "epoch": 0.2977213972223892, "flos": 24493339572480.0, "grad_norm": 2.2190621129804735, "language_loss": 0.85558152, "learning_rate": 3.29424020551642e-06, "loss": 0.87727225, "num_input_tokens_seen": 53041515, "step": 2476, "time_per_iteration": 2.7058205604553223 }, { "auxiliary_loss_clip": 0.01178627, "auxiliary_loss_mlp": 0.01054142, "balance_loss_clip": 1.05756068, "balance_loss_mlp": 1.03292322, "epoch": 0.2978416401130283, "flos": 21285116519040.0, "grad_norm": 2.790988200023865, "language_loss": 0.71949238, "learning_rate": 3.2936462286713546e-06, "loss": 0.74182004, "num_input_tokens_seen": 53059865, "step": 2477, "time_per_iteration": 2.5808145999908447 }, { "auxiliary_loss_clip": 0.01156255, "auxiliary_loss_mlp": 0.01046632, "balance_loss_clip": 1.05312777, "balance_loss_mlp": 1.02624726, "epoch": 0.2979618830036674, "flos": 25772154554880.0, "grad_norm": 2.412409010278721, "language_loss": 0.77284431, "learning_rate": 3.2930520555811846e-06, "loss": 0.79487324, "num_input_tokens_seen": 53079490, "step": 2478, "time_per_iteration": 2.6632626056671143 }, { "auxiliary_loss_clip": 0.01070544, "auxiliary_loss_mlp": 0.00779022, "balance_loss_clip": 1.04433262, "balance_loss_mlp": 1.00043845, "epoch": 0.2980821258943065, "flos": 23476996247040.0, "grad_norm": 1.7838726307429837, "language_loss": 0.79824716, "learning_rate": 3.292457686336046e-06, "loss": 0.81674278, "num_input_tokens_seen": 53098810, "step": 2479, "time_per_iteration": 2.808281660079956 }, { "auxiliary_loss_clip": 0.01055693, "auxiliary_loss_mlp": 0.01030008, "balance_loss_clip": 1.03384924, "balance_loss_mlp": 1.02731442, "epoch": 0.2982023687849456, "flos": 69752314195200.0, "grad_norm": 0.8566822675124878, "language_loss": 0.6125285, "learning_rate": 3.291863121026105e-06, "loss": 0.63338554, "num_input_tokens_seen": 53162590, "step": 2480, "time_per_iteration": 4.177576541900635 }, { "auxiliary_loss_clip": 0.01158757, "auxiliary_loss_mlp": 0.01052792, "balance_loss_clip": 1.05194855, "balance_loss_mlp": 1.0321573, "epoch": 0.29832261167558466, "flos": 29825930741760.0, "grad_norm": 2.2064457017053933, "language_loss": 0.76760733, "learning_rate": 3.2912683597415547e-06, "loss": 0.7897228, "num_input_tokens_seen": 53186675, "step": 2481, "time_per_iteration": 2.7117221355438232 }, { "auxiliary_loss_clip": 0.01131031, "auxiliary_loss_mlp": 0.0105231, "balance_loss_clip": 1.04947305, "balance_loss_mlp": 1.03090024, "epoch": 0.29844285456622377, "flos": 33910158683520.0, "grad_norm": 2.2190541633687078, "language_loss": 0.78155899, "learning_rate": 3.2906734025726213e-06, "loss": 0.80339241, "num_input_tokens_seen": 53205940, "step": 2482, "time_per_iteration": 2.7578117847442627 }, { "auxiliary_loss_clip": 0.0116543, "auxiliary_loss_mlp": 0.01065965, "balance_loss_clip": 1.05779278, "balance_loss_mlp": 1.04423285, "epoch": 0.2985630974568629, "flos": 23876933253120.0, "grad_norm": 6.496947624092455, "language_loss": 0.87902033, "learning_rate": 3.290078249609559e-06, "loss": 0.90133429, "num_input_tokens_seen": 53225360, "step": 2483, "time_per_iteration": 2.626493453979492 }, { "auxiliary_loss_clip": 0.01155352, "auxiliary_loss_mlp": 0.01052195, "balance_loss_clip": 1.05682516, "balance_loss_mlp": 1.02995062, "epoch": 0.29868334034750194, "flos": 21799106184960.0, "grad_norm": 2.471452312191652, "language_loss": 0.87970108, "learning_rate": 3.2894829009426514e-06, "loss": 0.90177655, "num_input_tokens_seen": 53243195, "step": 2484, "time_per_iteration": 3.6273152828216553 }, { "auxiliary_loss_clip": 0.01152708, "auxiliary_loss_mlp": 0.01046492, "balance_loss_clip": 1.0528053, "balance_loss_mlp": 1.02607107, "epoch": 0.29880358323814105, "flos": 25666649331840.0, "grad_norm": 1.954747792973726, "language_loss": 0.77685726, "learning_rate": 3.288887356662213e-06, "loss": 0.79884928, "num_input_tokens_seen": 53264530, "step": 2485, "time_per_iteration": 2.649144172668457 }, { "auxiliary_loss_clip": 0.01059274, "auxiliary_loss_mlp": 0.01001632, "balance_loss_clip": 1.03106952, "balance_loss_mlp": 0.99908084, "epoch": 0.29892382612878016, "flos": 71005846003200.0, "grad_norm": 0.7756385962618695, "language_loss": 0.59648895, "learning_rate": 3.288291616858588e-06, "loss": 0.61709797, "num_input_tokens_seen": 53319920, "step": 2486, "time_per_iteration": 3.087130546569824 }, { "auxiliary_loss_clip": 0.01113314, "auxiliary_loss_mlp": 0.010557, "balance_loss_clip": 1.05143559, "balance_loss_mlp": 1.0356971, "epoch": 0.2990440690194192, "flos": 25481134563840.0, "grad_norm": 1.7073956092895988, "language_loss": 0.77072656, "learning_rate": 3.287695681622149e-06, "loss": 0.79241675, "num_input_tokens_seen": 53339270, "step": 2487, "time_per_iteration": 2.7082161903381348 }, { "auxiliary_loss_clip": 0.01144459, "auxiliary_loss_mlp": 0.01054425, "balance_loss_clip": 1.05076623, "balance_loss_mlp": 1.03377759, "epoch": 0.2991643119100583, "flos": 23732357011200.0, "grad_norm": 2.2411692748037755, "language_loss": 0.81168735, "learning_rate": 3.2870995510432982e-06, "loss": 0.83367616, "num_input_tokens_seen": 53357750, "step": 2488, "time_per_iteration": 2.7341737747192383 }, { "auxiliary_loss_clip": 0.01147837, "auxiliary_loss_mlp": 0.0105826, "balance_loss_clip": 1.05177259, "balance_loss_mlp": 1.03936505, "epoch": 0.29928455480069743, "flos": 27417545786880.0, "grad_norm": 9.262922236633798, "language_loss": 0.76768124, "learning_rate": 3.2865032252124697e-06, "loss": 0.78974223, "num_input_tokens_seen": 53378265, "step": 2489, "time_per_iteration": 2.7072865962982178 }, { "auxiliary_loss_clip": 0.01138988, "auxiliary_loss_mlp": 0.01056839, "balance_loss_clip": 1.05074656, "balance_loss_mlp": 1.03808701, "epoch": 0.2994047976913365, "flos": 33692935184640.0, "grad_norm": 1.6269209406926346, "language_loss": 0.77656257, "learning_rate": 3.2859067042201243e-06, "loss": 0.79852086, "num_input_tokens_seen": 53400305, "step": 2490, "time_per_iteration": 3.7062320709228516 }, { "auxiliary_loss_clip": 0.01084566, "auxiliary_loss_mlp": 0.0105537, "balance_loss_clip": 1.04391837, "balance_loss_mlp": 1.03376937, "epoch": 0.2995250405819756, "flos": 16763963541120.0, "grad_norm": 2.9879908599112244, "language_loss": 0.78038472, "learning_rate": 3.2853099881567544e-06, "loss": 0.8017841, "num_input_tokens_seen": 53418705, "step": 2491, "time_per_iteration": 2.8713173866271973 }, { "auxiliary_loss_clip": 0.01160408, "auxiliary_loss_mlp": 0.01053432, "balance_loss_clip": 1.05211675, "balance_loss_mlp": 1.03485942, "epoch": 0.29964528347261465, "flos": 22963976248320.0, "grad_norm": 1.7581699130416018, "language_loss": 0.79400539, "learning_rate": 3.284713077112881e-06, "loss": 0.81614387, "num_input_tokens_seen": 53438135, "step": 2492, "time_per_iteration": 2.653409719467163 }, { "auxiliary_loss_clip": 0.01137792, "auxiliary_loss_mlp": 0.01055793, "balance_loss_clip": 1.05383718, "balance_loss_mlp": 1.03299999, "epoch": 0.29976552636325376, "flos": 16938021870720.0, "grad_norm": 3.0192684410694737, "language_loss": 0.8609128, "learning_rate": 3.284115971179056e-06, "loss": 0.88284862, "num_input_tokens_seen": 53452165, "step": 2493, "time_per_iteration": 2.6894586086273193 }, { "auxiliary_loss_clip": 0.01101816, "auxiliary_loss_mlp": 0.01052082, "balance_loss_clip": 1.0490799, "balance_loss_mlp": 1.03056443, "epoch": 0.2998857692538929, "flos": 17056455989760.0, "grad_norm": 1.8231632801005373, "language_loss": 0.78508818, "learning_rate": 3.283518670445859e-06, "loss": 0.80662715, "num_input_tokens_seen": 53470075, "step": 2494, "time_per_iteration": 2.7251107692718506 }, { "auxiliary_loss_clip": 0.0103737, "auxiliary_loss_mlp": 0.00756488, "balance_loss_clip": 1.02258623, "balance_loss_mlp": 1.00053084, "epoch": 0.30000601214453193, "flos": 68831528025600.0, "grad_norm": 0.7767568703498787, "language_loss": 0.54379177, "learning_rate": 3.2829211750038995e-06, "loss": 0.56173038, "num_input_tokens_seen": 53538705, "step": 2495, "time_per_iteration": 3.314610481262207 }, { "auxiliary_loss_clip": 0.01124104, "auxiliary_loss_mlp": 0.01047614, "balance_loss_clip": 1.04932368, "balance_loss_mlp": 1.02732468, "epoch": 0.30012625503517104, "flos": 17603267708160.0, "grad_norm": 2.203618124059072, "language_loss": 0.88900793, "learning_rate": 3.2823234849438183e-06, "loss": 0.91072512, "num_input_tokens_seen": 53556740, "step": 2496, "time_per_iteration": 2.699336528778076 }, { "auxiliary_loss_clip": 0.01141906, "auxiliary_loss_mlp": 0.01056711, "balance_loss_clip": 1.05185413, "balance_loss_mlp": 1.03601646, "epoch": 0.30024649792581015, "flos": 21252581775360.0, "grad_norm": 2.4133016799682956, "language_loss": 0.76118195, "learning_rate": 3.2817256003562836e-06, "loss": 0.78316808, "num_input_tokens_seen": 53577115, "step": 2497, "time_per_iteration": 2.6657238006591797 }, { "auxiliary_loss_clip": 0.01106098, "auxiliary_loss_mlp": 0.01056467, "balance_loss_clip": 1.04661846, "balance_loss_mlp": 1.0362494, "epoch": 0.3003667408164492, "flos": 23003262748800.0, "grad_norm": 1.8588579096590698, "language_loss": 0.66137326, "learning_rate": 3.281127521331995e-06, "loss": 0.6829989, "num_input_tokens_seen": 53598295, "step": 2498, "time_per_iteration": 2.776956081390381 }, { "auxiliary_loss_clip": 0.01060522, "auxiliary_loss_mlp": 0.01006039, "balance_loss_clip": 1.02247119, "balance_loss_mlp": 1.00347614, "epoch": 0.3004869837070883, "flos": 64232340750720.0, "grad_norm": 0.8791471695014219, "language_loss": 0.60656333, "learning_rate": 3.2805292479616798e-06, "loss": 0.62722886, "num_input_tokens_seen": 53657160, "step": 2499, "time_per_iteration": 3.165987730026245 }, { "auxiliary_loss_clip": 0.01143755, "auxiliary_loss_mlp": 0.01051255, "balance_loss_clip": 1.05085278, "balance_loss_mlp": 1.03040528, "epoch": 0.30060722659772743, "flos": 26248653400320.0, "grad_norm": 2.229718105119911, "language_loss": 0.91671264, "learning_rate": 3.2799307803360955e-06, "loss": 0.93866277, "num_input_tokens_seen": 53673090, "step": 2500, "time_per_iteration": 2.662382125854492 }, { "auxiliary_loss_clip": 0.01161896, "auxiliary_loss_mlp": 0.0104879, "balance_loss_clip": 1.05230105, "balance_loss_mlp": 1.03077769, "epoch": 0.3007274694883665, "flos": 24970879912320.0, "grad_norm": 1.5702778282498169, "language_loss": 0.81621873, "learning_rate": 3.27933211854603e-06, "loss": 0.83832562, "num_input_tokens_seen": 53692145, "step": 2501, "time_per_iteration": 2.598090648651123 }, { "auxiliary_loss_clip": 0.01140553, "auxiliary_loss_mlp": 0.01049928, "balance_loss_clip": 1.05207491, "balance_loss_mlp": 1.02907813, "epoch": 0.3008477123790056, "flos": 17055845458560.0, "grad_norm": 1.5585322877489258, "language_loss": 0.87047493, "learning_rate": 3.278733262682299e-06, "loss": 0.89237976, "num_input_tokens_seen": 53710000, "step": 2502, "time_per_iteration": 2.629251003265381 }, { "auxiliary_loss_clip": 0.01164964, "auxiliary_loss_mlp": 0.01046795, "balance_loss_clip": 1.051332, "balance_loss_mlp": 1.0258975, "epoch": 0.3009679552696447, "flos": 21506398254720.0, "grad_norm": 2.292345179430449, "language_loss": 0.83170444, "learning_rate": 3.2781342128357484e-06, "loss": 0.85382205, "num_input_tokens_seen": 53729355, "step": 2503, "time_per_iteration": 2.5724897384643555 }, { "auxiliary_loss_clip": 0.0112712, "auxiliary_loss_mlp": 0.01048674, "balance_loss_clip": 1.04899049, "balance_loss_mlp": 1.02975583, "epoch": 0.30108819816028376, "flos": 21134004001920.0, "grad_norm": 2.374142104943579, "language_loss": 0.80456156, "learning_rate": 3.2775349690972547e-06, "loss": 0.82631946, "num_input_tokens_seen": 53743505, "step": 2504, "time_per_iteration": 2.704957962036133 }, { "auxiliary_loss_clip": 0.01040253, "auxiliary_loss_mlp": 0.01007549, "balance_loss_clip": 1.01646876, "balance_loss_mlp": 1.00496244, "epoch": 0.30120844105092287, "flos": 71126434938240.0, "grad_norm": 0.766391931818719, "language_loss": 0.51847255, "learning_rate": 3.276935531557722e-06, "loss": 0.53895056, "num_input_tokens_seen": 53808725, "step": 2505, "time_per_iteration": 3.3051962852478027 }, { "auxiliary_loss_clip": 0.01121005, "auxiliary_loss_mlp": 0.01053794, "balance_loss_clip": 1.04618549, "balance_loss_mlp": 1.03182316, "epoch": 0.301328683941562, "flos": 20264571302400.0, "grad_norm": 3.5539085841946605, "language_loss": 0.7947669, "learning_rate": 3.2763359003080837e-06, "loss": 0.81651485, "num_input_tokens_seen": 53825680, "step": 2506, "time_per_iteration": 2.666975736618042 }, { "auxiliary_loss_clip": 0.01040853, "auxiliary_loss_mlp": 0.01002508, "balance_loss_clip": 1.02234864, "balance_loss_mlp": 0.99980211, "epoch": 0.30144892683220104, "flos": 70648212240000.0, "grad_norm": 0.8446494065402335, "language_loss": 0.62548721, "learning_rate": 3.2757360754393047e-06, "loss": 0.64592075, "num_input_tokens_seen": 53889750, "step": 2507, "time_per_iteration": 4.31151819229126 }, { "auxiliary_loss_clip": 0.01153927, "auxiliary_loss_mlp": 0.01048134, "balance_loss_clip": 1.05268621, "balance_loss_mlp": 1.02726018, "epoch": 0.30156916972284015, "flos": 22820549241600.0, "grad_norm": 3.828075955277589, "language_loss": 0.64121497, "learning_rate": 3.2751360570423767e-06, "loss": 0.66323555, "num_input_tokens_seen": 53908135, "step": 2508, "time_per_iteration": 2.629136085510254 }, { "auxiliary_loss_clip": 0.01142622, "auxiliary_loss_mlp": 0.0104895, "balance_loss_clip": 1.05033541, "balance_loss_mlp": 1.03019834, "epoch": 0.3016894126134792, "flos": 29899188529920.0, "grad_norm": 2.135403172171045, "language_loss": 0.75860441, "learning_rate": 3.2745358452083236e-06, "loss": 0.78052014, "num_input_tokens_seen": 53931035, "step": 2509, "time_per_iteration": 2.694061756134033 }, { "auxiliary_loss_clip": 0.01146723, "auxiliary_loss_mlp": 0.01041126, "balance_loss_clip": 1.05190742, "balance_loss_mlp": 1.02285075, "epoch": 0.3018096555041183, "flos": 21546331200000.0, "grad_norm": 1.4827180473846187, "language_loss": 0.82515895, "learning_rate": 3.2739354400281955e-06, "loss": 0.84703743, "num_input_tokens_seen": 53952255, "step": 2510, "time_per_iteration": 4.645521879196167 }, { "auxiliary_loss_clip": 0.0103444, "auxiliary_loss_mlp": 0.00756567, "balance_loss_clip": 1.0233686, "balance_loss_mlp": 1.00047624, "epoch": 0.3019298983947574, "flos": 59136294597120.0, "grad_norm": 0.8650941439976274, "language_loss": 0.63706315, "learning_rate": 3.2733348415930744e-06, "loss": 0.65497315, "num_input_tokens_seen": 54014125, "step": 2511, "time_per_iteration": 3.289489269256592 }, { "auxiliary_loss_clip": 0.01127105, "auxiliary_loss_mlp": 0.01048087, "balance_loss_clip": 1.05034995, "balance_loss_mlp": 1.02839375, "epoch": 0.3020501412853965, "flos": 34423070941440.0, "grad_norm": 2.238149649680198, "language_loss": 0.80707073, "learning_rate": 3.27273404999407e-06, "loss": 0.82882261, "num_input_tokens_seen": 54036345, "step": 2512, "time_per_iteration": 2.7948801517486572 }, { "auxiliary_loss_clip": 0.01044338, "auxiliary_loss_mlp": 0.01009951, "balance_loss_clip": 1.02551723, "balance_loss_mlp": 1.00738811, "epoch": 0.3021703841760356, "flos": 71008288128000.0, "grad_norm": 0.8120259240374451, "language_loss": 0.60448617, "learning_rate": 3.272133065322322e-06, "loss": 0.62502909, "num_input_tokens_seen": 54094615, "step": 2513, "time_per_iteration": 3.1875247955322266 }, { "auxiliary_loss_clip": 0.01161876, "auxiliary_loss_mlp": 0.01047273, "balance_loss_clip": 1.05087733, "balance_loss_mlp": 1.02731764, "epoch": 0.3022906270666747, "flos": 21510528318720.0, "grad_norm": 1.663207786937648, "language_loss": 0.79434741, "learning_rate": 3.271531887669e-06, "loss": 0.81643891, "num_input_tokens_seen": 54114675, "step": 2514, "time_per_iteration": 2.6214358806610107 }, { "auxiliary_loss_clip": 0.01124985, "auxiliary_loss_mlp": 0.0105695, "balance_loss_clip": 1.05136025, "balance_loss_mlp": 1.03511083, "epoch": 0.30241086995731375, "flos": 31132001168640.0, "grad_norm": 2.302972853090926, "language_loss": 0.63923842, "learning_rate": 3.2709305171253015e-06, "loss": 0.66105777, "num_input_tokens_seen": 54134795, "step": 2515, "time_per_iteration": 2.7476749420166016 }, { "auxiliary_loss_clip": 0.01154817, "auxiliary_loss_mlp": 0.01045263, "balance_loss_clip": 1.05444908, "balance_loss_mlp": 1.02647519, "epoch": 0.30253111284795287, "flos": 23511542152320.0, "grad_norm": 1.9850368238834797, "language_loss": 0.777619, "learning_rate": 3.2703289537824536e-06, "loss": 0.79961979, "num_input_tokens_seen": 54154595, "step": 2516, "time_per_iteration": 3.546783447265625 }, { "auxiliary_loss_clip": 0.0111937, "auxiliary_loss_mlp": 0.01055259, "balance_loss_clip": 1.05007362, "balance_loss_mlp": 1.0340395, "epoch": 0.302651355738592, "flos": 18725367651840.0, "grad_norm": 2.7020344585592637, "language_loss": 0.78731066, "learning_rate": 3.269727197731714e-06, "loss": 0.809057, "num_input_tokens_seen": 54167360, "step": 2517, "time_per_iteration": 2.6955177783966064 }, { "auxiliary_loss_clip": 0.01107545, "auxiliary_loss_mlp": 0.0105525, "balance_loss_clip": 1.04423654, "balance_loss_mlp": 1.03477001, "epoch": 0.30277159862923103, "flos": 22418888382720.0, "grad_norm": 1.8969029569762546, "language_loss": 0.7795434, "learning_rate": 3.269125249064367e-06, "loss": 0.8011713, "num_input_tokens_seen": 54187055, "step": 2518, "time_per_iteration": 2.6999921798706055 }, { "auxiliary_loss_clip": 0.01165472, "auxiliary_loss_mlp": 0.01052507, "balance_loss_clip": 1.05150414, "balance_loss_mlp": 1.03101325, "epoch": 0.30289184151987014, "flos": 22273126992000.0, "grad_norm": 1.7855256681099991, "language_loss": 0.82918799, "learning_rate": 3.2685231078717297e-06, "loss": 0.85136777, "num_input_tokens_seen": 54207245, "step": 2519, "time_per_iteration": 2.5835888385772705 }, { "auxiliary_loss_clip": 0.01123676, "auxiliary_loss_mlp": 0.00778154, "balance_loss_clip": 1.04823375, "balance_loss_mlp": 1.00046396, "epoch": 0.30301208441050925, "flos": 25225594231680.0, "grad_norm": 2.399558280669778, "language_loss": 0.75404882, "learning_rate": 3.267920774245145e-06, "loss": 0.77306718, "num_input_tokens_seen": 54226650, "step": 2520, "time_per_iteration": 2.725649118423462 }, { "auxiliary_loss_clip": 0.01159717, "auxiliary_loss_mlp": 0.01054254, "balance_loss_clip": 1.05345809, "balance_loss_mlp": 1.03178298, "epoch": 0.3031323273011483, "flos": 23039245198080.0, "grad_norm": 1.9891777543470508, "language_loss": 0.84830046, "learning_rate": 3.2673182482759876e-06, "loss": 0.87044013, "num_input_tokens_seen": 54245765, "step": 2521, "time_per_iteration": 2.637786626815796 }, { "auxiliary_loss_clip": 0.01155085, "auxiliary_loss_mlp": 0.01046974, "balance_loss_clip": 1.0543561, "balance_loss_mlp": 1.02654171, "epoch": 0.3032525701917874, "flos": 18876695650560.0, "grad_norm": 1.8349129894323821, "language_loss": 0.66448855, "learning_rate": 3.266715530055659e-06, "loss": 0.68650913, "num_input_tokens_seen": 54263915, "step": 2522, "time_per_iteration": 2.609919786453247 }, { "auxiliary_loss_clip": 0.01144202, "auxiliary_loss_mlp": 0.0104956, "balance_loss_clip": 1.04884815, "balance_loss_mlp": 1.02824521, "epoch": 0.30337281308242653, "flos": 17782641250560.0, "grad_norm": 3.1987891104687787, "language_loss": 0.80661035, "learning_rate": 3.2661126196755927e-06, "loss": 0.82854795, "num_input_tokens_seen": 54283025, "step": 2523, "time_per_iteration": 2.6108720302581787 }, { "auxiliary_loss_clip": 0.01062458, "auxiliary_loss_mlp": 0.01003182, "balance_loss_clip": 1.02552509, "balance_loss_mlp": 1.00070286, "epoch": 0.3034930559730656, "flos": 57824298426240.0, "grad_norm": 0.7838851915990797, "language_loss": 0.56014442, "learning_rate": 3.265509517227248e-06, "loss": 0.58080083, "num_input_tokens_seen": 54339840, "step": 2524, "time_per_iteration": 3.1330063343048096 }, { "auxiliary_loss_clip": 0.01142705, "auxiliary_loss_mlp": 0.01041919, "balance_loss_clip": 1.04835439, "balance_loss_mlp": 1.02142727, "epoch": 0.3036132988637047, "flos": 14755587419520.0, "grad_norm": 2.016704935877204, "language_loss": 0.80750644, "learning_rate": 3.264906222802115e-06, "loss": 0.82935268, "num_input_tokens_seen": 54357690, "step": 2525, "time_per_iteration": 2.6181230545043945 }, { "auxiliary_loss_clip": 0.01167847, "auxiliary_loss_mlp": 0.01047804, "balance_loss_clip": 1.05176115, "balance_loss_mlp": 1.02678752, "epoch": 0.30373354175434375, "flos": 21033203460480.0, "grad_norm": 3.7916735523928633, "language_loss": 0.77856648, "learning_rate": 3.264302736491715e-06, "loss": 0.80072296, "num_input_tokens_seen": 54377810, "step": 2526, "time_per_iteration": 2.576286554336548 }, { "auxiliary_loss_clip": 0.01149924, "auxiliary_loss_mlp": 0.01049859, "balance_loss_clip": 1.05482376, "balance_loss_mlp": 1.03015399, "epoch": 0.30385378464498286, "flos": 21143233797120.0, "grad_norm": 2.29060583136674, "language_loss": 0.87009561, "learning_rate": 3.263699058387594e-06, "loss": 0.89209342, "num_input_tokens_seen": 54395245, "step": 2527, "time_per_iteration": 2.6121702194213867 }, { "auxiliary_loss_clip": 0.01121804, "auxiliary_loss_mlp": 0.01050911, "balance_loss_clip": 1.0445807, "balance_loss_mlp": 1.02989483, "epoch": 0.30397402753562197, "flos": 20629244131200.0, "grad_norm": 2.246966435055922, "language_loss": 0.90160322, "learning_rate": 3.2630951885813315e-06, "loss": 0.92333043, "num_input_tokens_seen": 54412640, "step": 2528, "time_per_iteration": -0.004884243011474609 }, { "auxiliary_loss_clip": 0.01140879, "auxiliary_loss_mlp": 0.01045253, "balance_loss_clip": 1.05006146, "balance_loss_mlp": 1.02621555, "epoch": 0.304094270426261, "flos": 15085678429440.0, "grad_norm": 3.906838713469829, "language_loss": 0.78223473, "learning_rate": 3.262491127164533e-06, "loss": 0.8040961, "num_input_tokens_seen": 54431455, "step": 2529, "time_per_iteration": 2.653517007827759 }, { "auxiliary_loss_clip": 0.01146965, "auxiliary_loss_mlp": 0.00776822, "balance_loss_clip": 1.04809403, "balance_loss_mlp": 1.00054979, "epoch": 0.30421451331690014, "flos": 13845216193920.0, "grad_norm": 2.3823748148060333, "language_loss": 0.80543077, "learning_rate": 3.2618868742288337e-06, "loss": 0.82466871, "num_input_tokens_seen": 54448380, "step": 2530, "time_per_iteration": 2.6263062953948975 }, { "auxiliary_loss_clip": 0.01157472, "auxiliary_loss_mlp": 0.01044999, "balance_loss_clip": 1.05575585, "balance_loss_mlp": 1.02488828, "epoch": 0.30433475620753925, "flos": 17384212615680.0, "grad_norm": 2.135941356193469, "language_loss": 0.72312129, "learning_rate": 3.261282429865899e-06, "loss": 0.74514592, "num_input_tokens_seen": 54466385, "step": 2531, "time_per_iteration": 2.7056407928466797 }, { "auxiliary_loss_clip": 0.01140004, "auxiliary_loss_mlp": 0.0077695, "balance_loss_clip": 1.05021095, "balance_loss_mlp": 1.00050545, "epoch": 0.3044549990981783, "flos": 18916951818240.0, "grad_norm": 1.859638153948569, "language_loss": 0.7203027, "learning_rate": 3.2606777941674225e-06, "loss": 0.73947215, "num_input_tokens_seen": 54485040, "step": 2532, "time_per_iteration": 2.670280933380127 }, { "auxiliary_loss_clip": 0.01112632, "auxiliary_loss_mlp": 0.01048428, "balance_loss_clip": 1.04782093, "balance_loss_mlp": 1.02857924, "epoch": 0.3045752419888174, "flos": 21068431724160.0, "grad_norm": 2.14955493869767, "language_loss": 0.84697115, "learning_rate": 3.2600729672251276e-06, "loss": 0.86858183, "num_input_tokens_seen": 54502755, "step": 2533, "time_per_iteration": 3.7031233310699463 }, { "auxiliary_loss_clip": 0.01162005, "auxiliary_loss_mlp": 0.00775792, "balance_loss_clip": 1.05127323, "balance_loss_mlp": 1.0005132, "epoch": 0.3046954848794565, "flos": 29096405516160.0, "grad_norm": 2.136067951923354, "language_loss": 0.65406203, "learning_rate": 3.259467949130765e-06, "loss": 0.67343998, "num_input_tokens_seen": 54524165, "step": 2534, "time_per_iteration": 2.628826856613159 }, { "auxiliary_loss_clip": 0.01140328, "auxiliary_loss_mlp": 0.01050863, "balance_loss_clip": 1.04937565, "balance_loss_mlp": 1.03125286, "epoch": 0.3048157277700956, "flos": 20295346279680.0, "grad_norm": 3.1895647599814607, "language_loss": 0.82570463, "learning_rate": 3.2588627399761164e-06, "loss": 0.84761661, "num_input_tokens_seen": 54540160, "step": 2535, "time_per_iteration": 2.678102493286133 }, { "auxiliary_loss_clip": 0.011387, "auxiliary_loss_mlp": 0.01054416, "balance_loss_clip": 1.04741275, "balance_loss_mlp": 1.0344007, "epoch": 0.3049359706607347, "flos": 22739929165440.0, "grad_norm": 1.6920605845649914, "language_loss": 0.71081614, "learning_rate": 3.2582573398529903e-06, "loss": 0.73274726, "num_input_tokens_seen": 54557515, "step": 2536, "time_per_iteration": 3.632007598876953 }, { "auxiliary_loss_clip": 0.01124837, "auxiliary_loss_mlp": 0.01065678, "balance_loss_clip": 1.04776454, "balance_loss_mlp": 1.04602051, "epoch": 0.3050562135513738, "flos": 18434634969600.0, "grad_norm": 2.101357987473152, "language_loss": 0.73887283, "learning_rate": 3.2576517488532265e-06, "loss": 0.76077795, "num_input_tokens_seen": 54573865, "step": 2537, "time_per_iteration": 2.6436617374420166 }, { "auxiliary_loss_clip": 0.01146356, "auxiliary_loss_mlp": 0.01044183, "balance_loss_clip": 1.04773796, "balance_loss_mlp": 1.02525294, "epoch": 0.30517645644201286, "flos": 20370327920640.0, "grad_norm": 1.842065833670944, "language_loss": 0.87161601, "learning_rate": 3.257045967068692e-06, "loss": 0.89352137, "num_input_tokens_seen": 54593120, "step": 2538, "time_per_iteration": 2.605297803878784 }, { "auxiliary_loss_clip": 0.01167145, "auxiliary_loss_mlp": 0.01044251, "balance_loss_clip": 1.05407393, "balance_loss_mlp": 1.02482009, "epoch": 0.30529669933265197, "flos": 21945118970880.0, "grad_norm": 1.8286947070525268, "language_loss": 0.82400519, "learning_rate": 3.2564399945912848e-06, "loss": 0.84611917, "num_input_tokens_seen": 54612910, "step": 2539, "time_per_iteration": 2.599576473236084 }, { "auxiliary_loss_clip": 0.0111013, "auxiliary_loss_mlp": 0.01049387, "balance_loss_clip": 1.04479051, "balance_loss_mlp": 1.03067088, "epoch": 0.305416942223291, "flos": 21835411856640.0, "grad_norm": 2.845303710968983, "language_loss": 0.82398194, "learning_rate": 3.2558338315129287e-06, "loss": 0.84557712, "num_input_tokens_seen": 54631055, "step": 2540, "time_per_iteration": 2.6875882148742676 }, { "auxiliary_loss_clip": 0.01142488, "auxiliary_loss_mlp": 0.01051665, "balance_loss_clip": 1.05021095, "balance_loss_mlp": 1.02902699, "epoch": 0.30553718511393013, "flos": 33911810709120.0, "grad_norm": 2.1005301609074816, "language_loss": 0.75398135, "learning_rate": 3.2552274779255785e-06, "loss": 0.77592295, "num_input_tokens_seen": 54651985, "step": 2541, "time_per_iteration": 2.724919557571411 }, { "auxiliary_loss_clip": 0.01151096, "auxiliary_loss_mlp": 0.01048032, "balance_loss_clip": 1.05144656, "balance_loss_mlp": 1.02835059, "epoch": 0.30565742800456924, "flos": 22268530051200.0, "grad_norm": 2.3624714483188485, "language_loss": 0.77088058, "learning_rate": 3.2546209339212184e-06, "loss": 0.79287183, "num_input_tokens_seen": 54671005, "step": 2542, "time_per_iteration": 3.5294833183288574 }, { "auxiliary_loss_clip": 0.01136194, "auxiliary_loss_mlp": 0.01050878, "balance_loss_clip": 1.04717374, "balance_loss_mlp": 1.03000414, "epoch": 0.3057776708952083, "flos": 22565044823040.0, "grad_norm": 1.7182308663160955, "language_loss": 0.776793, "learning_rate": 3.25401419959186e-06, "loss": 0.79866374, "num_input_tokens_seen": 54691615, "step": 2543, "time_per_iteration": 2.6552584171295166 }, { "auxiliary_loss_clip": 0.01151744, "auxiliary_loss_mlp": 0.01062265, "balance_loss_clip": 1.05191696, "balance_loss_mlp": 1.04260731, "epoch": 0.3058979137858474, "flos": 21799213925760.0, "grad_norm": 2.1374831482925956, "language_loss": 0.76176637, "learning_rate": 3.253407275029545e-06, "loss": 0.78390646, "num_input_tokens_seen": 54710520, "step": 2544, "time_per_iteration": 2.6654410362243652 }, { "auxiliary_loss_clip": 0.01134393, "auxiliary_loss_mlp": 0.01056864, "balance_loss_clip": 1.05306268, "balance_loss_mlp": 1.03631282, "epoch": 0.3060181566764865, "flos": 26979435601920.0, "grad_norm": 3.269734051958002, "language_loss": 0.79974401, "learning_rate": 3.2528001603263425e-06, "loss": 0.82165658, "num_input_tokens_seen": 54732590, "step": 2545, "time_per_iteration": 2.8212943077087402 }, { "auxiliary_loss_clip": 0.011564, "auxiliary_loss_mlp": 0.01056232, "balance_loss_clip": 1.05563045, "balance_loss_mlp": 1.03522694, "epoch": 0.3061383995671256, "flos": 19865101173120.0, "grad_norm": 2.800903111861418, "language_loss": 0.81337619, "learning_rate": 3.2521928555743514e-06, "loss": 0.83550251, "num_input_tokens_seen": 54749935, "step": 2546, "time_per_iteration": 2.626070976257324 }, { "auxiliary_loss_clip": 0.01135502, "auxiliary_loss_mlp": 0.00777441, "balance_loss_clip": 1.04763603, "balance_loss_mlp": 1.00057387, "epoch": 0.3062586424577647, "flos": 22127509255680.0, "grad_norm": 2.631775278683457, "language_loss": 0.67853916, "learning_rate": 3.2515853608657e-06, "loss": 0.69766861, "num_input_tokens_seen": 54767935, "step": 2547, "time_per_iteration": 2.672780752182007 }, { "auxiliary_loss_clip": 0.01148036, "auxiliary_loss_mlp": 0.01052041, "balance_loss_clip": 1.05191374, "balance_loss_mlp": 1.02908111, "epoch": 0.3063788853484038, "flos": 20845497962880.0, "grad_norm": 2.787410953757698, "language_loss": 0.75150001, "learning_rate": 3.250977676292545e-06, "loss": 0.7735008, "num_input_tokens_seen": 54786175, "step": 2548, "time_per_iteration": 2.6120285987854004 }, { "auxiliary_loss_clip": 0.01141829, "auxiliary_loss_mlp": 0.01046835, "balance_loss_clip": 1.04971743, "balance_loss_mlp": 1.02510333, "epoch": 0.30649912823904285, "flos": 16209717707520.0, "grad_norm": 2.33802988088857, "language_loss": 0.79437029, "learning_rate": 3.2503698019470712e-06, "loss": 0.81625694, "num_input_tokens_seen": 54801945, "step": 2549, "time_per_iteration": 2.6226727962493896 }, { "auxiliary_loss_clip": 0.01152702, "auxiliary_loss_mlp": 0.01042598, "balance_loss_clip": 1.05143702, "balance_loss_mlp": 1.02160573, "epoch": 0.30661937112968196, "flos": 18617815353600.0, "grad_norm": 2.024844673322981, "language_loss": 0.78014553, "learning_rate": 3.249761737921492e-06, "loss": 0.80209851, "num_input_tokens_seen": 54818475, "step": 2550, "time_per_iteration": 2.5925748348236084 }, { "auxiliary_loss_clip": 0.01138564, "auxiliary_loss_mlp": 0.01051326, "balance_loss_clip": 1.04972899, "balance_loss_mlp": 1.03108394, "epoch": 0.30673961402032107, "flos": 31390809638400.0, "grad_norm": 1.9220620451398074, "language_loss": 0.74440479, "learning_rate": 3.249153484308051e-06, "loss": 0.76630378, "num_input_tokens_seen": 54837090, "step": 2551, "time_per_iteration": 2.6927571296691895 }, { "auxiliary_loss_clip": 0.01107151, "auxiliary_loss_mlp": 0.0105038, "balance_loss_clip": 1.04532516, "balance_loss_mlp": 1.02832675, "epoch": 0.3068598569109601, "flos": 20229809915520.0, "grad_norm": 2.240956510857225, "language_loss": 0.78417659, "learning_rate": 3.2485450411990194e-06, "loss": 0.80575192, "num_input_tokens_seen": 54856445, "step": 2552, "time_per_iteration": 2.7350571155548096 }, { "auxiliary_loss_clip": 0.01169177, "auxiliary_loss_mlp": 0.01061157, "balance_loss_clip": 1.05153108, "balance_loss_mlp": 1.04009283, "epoch": 0.30698009980159924, "flos": 29601991399680.0, "grad_norm": 3.402350694460275, "language_loss": 0.82256836, "learning_rate": 3.2479364086866983e-06, "loss": 0.84487164, "num_input_tokens_seen": 54876700, "step": 2553, "time_per_iteration": 2.7502450942993164 }, { "auxiliary_loss_clip": 0.01143789, "auxiliary_loss_mlp": 0.00776295, "balance_loss_clip": 1.05615115, "balance_loss_mlp": 1.00045812, "epoch": 0.30710034269223835, "flos": 23842423261440.0, "grad_norm": 1.7854431166400684, "language_loss": 0.81322825, "learning_rate": 3.247327586863416e-06, "loss": 0.83242905, "num_input_tokens_seen": 54897580, "step": 2554, "time_per_iteration": 2.800095558166504 }, { "auxiliary_loss_clip": 0.01133398, "auxiliary_loss_mlp": 0.01048176, "balance_loss_clip": 1.04750013, "balance_loss_mlp": 1.02700472, "epoch": 0.3072205855828774, "flos": 25884986152320.0, "grad_norm": 4.73626651355225, "language_loss": 0.76768088, "learning_rate": 3.2467185758215304e-06, "loss": 0.7894966, "num_input_tokens_seen": 54917320, "step": 2555, "time_per_iteration": 2.7453532218933105 }, { "auxiliary_loss_clip": 0.01131803, "auxiliary_loss_mlp": 0.0077559, "balance_loss_clip": 1.05307686, "balance_loss_mlp": 1.00047588, "epoch": 0.3073408284735165, "flos": 22236390357120.0, "grad_norm": 3.787595211453149, "language_loss": 0.85653597, "learning_rate": 3.246109375653428e-06, "loss": 0.87560987, "num_input_tokens_seen": 54934085, "step": 2556, "time_per_iteration": 2.733640670776367 }, { "auxiliary_loss_clip": 0.0116429, "auxiliary_loss_mlp": 0.01050875, "balance_loss_clip": 1.05103207, "balance_loss_mlp": 1.02990639, "epoch": 0.30746107136415557, "flos": 19500284689920.0, "grad_norm": 2.476933099910356, "language_loss": 0.78393883, "learning_rate": 3.2454999864515243e-06, "loss": 0.80609047, "num_input_tokens_seen": 54953460, "step": 2557, "time_per_iteration": 2.578181266784668 }, { "auxiliary_loss_clip": 0.01136459, "auxiliary_loss_mlp": 0.00776957, "balance_loss_clip": 1.05004048, "balance_loss_mlp": 1.00053811, "epoch": 0.3075813142547947, "flos": 21724806902400.0, "grad_norm": 1.8724259858517118, "language_loss": 0.69487065, "learning_rate": 3.244890408308263e-06, "loss": 0.71400481, "num_input_tokens_seen": 54974165, "step": 2558, "time_per_iteration": 2.6175742149353027 }, { "auxiliary_loss_clip": 0.01114958, "auxiliary_loss_mlp": 0.01048892, "balance_loss_clip": 1.04578257, "balance_loss_mlp": 1.02831638, "epoch": 0.3077015571454338, "flos": 24097963593600.0, "grad_norm": 2.21339847305816, "language_loss": 0.6123333, "learning_rate": 3.2442806413161165e-06, "loss": 0.63397181, "num_input_tokens_seen": 54993810, "step": 2559, "time_per_iteration": 3.690852165222168 }, { "auxiliary_loss_clip": 0.01119097, "auxiliary_loss_mlp": 0.0104613, "balance_loss_clip": 1.04758549, "balance_loss_mlp": 1.0245893, "epoch": 0.30782180003607285, "flos": 18405476104320.0, "grad_norm": 2.171099029053, "language_loss": 0.76233721, "learning_rate": 3.243670685567586e-06, "loss": 0.78398949, "num_input_tokens_seen": 55011210, "step": 2560, "time_per_iteration": 2.668231964111328 }, { "auxiliary_loss_clip": 0.01137635, "auxiliary_loss_mlp": 0.00775239, "balance_loss_clip": 1.04739833, "balance_loss_mlp": 1.00050521, "epoch": 0.30794204292671196, "flos": 23878549365120.0, "grad_norm": 2.6490823423983194, "language_loss": 0.80513388, "learning_rate": 3.2430605411552012e-06, "loss": 0.82426262, "num_input_tokens_seen": 55031325, "step": 2561, "time_per_iteration": 2.655155897140503 }, { "auxiliary_loss_clip": 0.01035955, "auxiliary_loss_mlp": 0.01013218, "balance_loss_clip": 1.02577376, "balance_loss_mlp": 1.01069069, "epoch": 0.30806228581735107, "flos": 67927800816000.0, "grad_norm": 0.8926062369908624, "language_loss": 0.70555204, "learning_rate": 3.2424502081715205e-06, "loss": 0.72604376, "num_input_tokens_seen": 55094440, "step": 2562, "time_per_iteration": 5.2543041706085205 }, { "auxiliary_loss_clip": 0.01137217, "auxiliary_loss_mlp": 0.01044136, "balance_loss_clip": 1.04820251, "balance_loss_mlp": 1.02359653, "epoch": 0.3081825287079901, "flos": 23843213360640.0, "grad_norm": 1.8301865373975026, "language_loss": 0.7844063, "learning_rate": 3.241839686709132e-06, "loss": 0.80621982, "num_input_tokens_seen": 55115375, "step": 2563, "time_per_iteration": 2.703256845474243 }, { "auxiliary_loss_clip": 0.01151784, "auxiliary_loss_mlp": 0.0105022, "balance_loss_clip": 1.04886365, "balance_loss_mlp": 1.02788019, "epoch": 0.30830277159862923, "flos": 16209969102720.0, "grad_norm": 4.300953706217355, "language_loss": 0.81998348, "learning_rate": 3.2412289768606495e-06, "loss": 0.84200346, "num_input_tokens_seen": 55131945, "step": 2564, "time_per_iteration": 2.5910837650299072 }, { "auxiliary_loss_clip": 0.01156372, "auxiliary_loss_mlp": 0.01055763, "balance_loss_clip": 1.05151272, "balance_loss_mlp": 1.03399503, "epoch": 0.30842301448926834, "flos": 29349503723520.0, "grad_norm": 2.312248490208794, "language_loss": 0.82835686, "learning_rate": 3.240618078718718e-06, "loss": 0.85047817, "num_input_tokens_seen": 55153405, "step": 2565, "time_per_iteration": 2.6644837856292725 }, { "auxiliary_loss_clip": 0.01128337, "auxiliary_loss_mlp": 0.01052398, "balance_loss_clip": 1.04840147, "balance_loss_mlp": 1.03035581, "epoch": 0.3085432573799074, "flos": 21945190798080.0, "grad_norm": 3.9004817492554773, "language_loss": 0.74483919, "learning_rate": 3.240006992376011e-06, "loss": 0.76664656, "num_input_tokens_seen": 55173030, "step": 2566, "time_per_iteration": 2.6854963302612305 }, { "auxiliary_loss_clip": 0.01144879, "auxiliary_loss_mlp": 0.01048637, "balance_loss_clip": 1.05150342, "balance_loss_mlp": 1.028216, "epoch": 0.3086635002705465, "flos": 22054718344320.0, "grad_norm": 2.974429594057894, "language_loss": 0.76062584, "learning_rate": 3.2393957179252284e-06, "loss": 0.782561, "num_input_tokens_seen": 55189565, "step": 2567, "time_per_iteration": 2.6350879669189453 }, { "auxiliary_loss_clip": 0.01169068, "auxiliary_loss_mlp": 0.01047632, "balance_loss_clip": 1.05396843, "balance_loss_mlp": 1.02677035, "epoch": 0.3087837431611856, "flos": 32665925520000.0, "grad_norm": 1.911063073932678, "language_loss": 0.80491364, "learning_rate": 3.2387842554591016e-06, "loss": 0.82708061, "num_input_tokens_seen": 55210380, "step": 2568, "time_per_iteration": 3.55898118019104 }, { "auxiliary_loss_clip": 0.01170206, "auxiliary_loss_mlp": 0.01055954, "balance_loss_clip": 1.05421543, "balance_loss_mlp": 1.03359067, "epoch": 0.3089039860518247, "flos": 17599245384960.0, "grad_norm": 2.2246788460049958, "language_loss": 0.87987733, "learning_rate": 3.238172605070388e-06, "loss": 0.90213895, "num_input_tokens_seen": 55225795, "step": 2569, "time_per_iteration": 2.5633738040924072 }, { "auxiliary_loss_clip": 0.01154861, "auxiliary_loss_mlp": 0.00776536, "balance_loss_clip": 1.05225658, "balance_loss_mlp": 1.00035834, "epoch": 0.3090242289424638, "flos": 14383839611520.0, "grad_norm": 2.5627318496152753, "language_loss": 0.78363109, "learning_rate": 3.2375607668518745e-06, "loss": 0.80294502, "num_input_tokens_seen": 55238830, "step": 2570, "time_per_iteration": 2.5333240032196045 }, { "auxiliary_loss_clip": 0.01134991, "auxiliary_loss_mlp": 0.01045549, "balance_loss_clip": 1.04978395, "balance_loss_mlp": 1.02426994, "epoch": 0.30914447183310284, "flos": 16068625084800.0, "grad_norm": 2.633232385884396, "language_loss": 0.89717162, "learning_rate": 3.236948740896377e-06, "loss": 0.91897702, "num_input_tokens_seen": 55253630, "step": 2571, "time_per_iteration": 2.628340244293213 }, { "auxiliary_loss_clip": 0.01155873, "auxiliary_loss_mlp": 0.0105892, "balance_loss_clip": 1.05160952, "balance_loss_mlp": 1.03913164, "epoch": 0.30926471472374195, "flos": 32230221546240.0, "grad_norm": 1.6439124991018064, "language_loss": 0.84476364, "learning_rate": 3.2363365272967384e-06, "loss": 0.86691153, "num_input_tokens_seen": 55276200, "step": 2572, "time_per_iteration": 2.6956613063812256 }, { "auxiliary_loss_clip": 0.01157908, "auxiliary_loss_mlp": 0.01064546, "balance_loss_clip": 1.05723798, "balance_loss_mlp": 1.04392326, "epoch": 0.30938495761438106, "flos": 20370722970240.0, "grad_norm": 2.1339488177201202, "language_loss": 0.81857419, "learning_rate": 3.235724126145832e-06, "loss": 0.8407988, "num_input_tokens_seen": 55292235, "step": 2573, "time_per_iteration": 2.611488103866577 }, { "auxiliary_loss_clip": 0.01143039, "auxiliary_loss_mlp": 0.01054939, "balance_loss_clip": 1.04831243, "balance_loss_mlp": 1.0326941, "epoch": 0.3095052005050201, "flos": 24061155131520.0, "grad_norm": 1.7420855683347647, "language_loss": 0.77336848, "learning_rate": 3.235111537536558e-06, "loss": 0.79534829, "num_input_tokens_seen": 55313050, "step": 2574, "time_per_iteration": 2.6587111949920654 }, { "auxiliary_loss_clip": 0.0115285, "auxiliary_loss_mlp": 0.0104272, "balance_loss_clip": 1.05042255, "balance_loss_mlp": 1.02319312, "epoch": 0.30962544339565923, "flos": 23401547729280.0, "grad_norm": 2.3830195707737776, "language_loss": 0.83020014, "learning_rate": 3.2344987615618456e-06, "loss": 0.85215586, "num_input_tokens_seen": 55332885, "step": 2575, "time_per_iteration": 2.624023675918579 }, { "auxiliary_loss_clip": 0.01129642, "auxiliary_loss_mlp": 0.01051297, "balance_loss_clip": 1.05375266, "balance_loss_mlp": 1.03068542, "epoch": 0.30974568628629834, "flos": 33799984692480.0, "grad_norm": 1.7810529599243385, "language_loss": 0.784504, "learning_rate": 3.2338857983146533e-06, "loss": 0.8063134, "num_input_tokens_seen": 55354385, "step": 2576, "time_per_iteration": 2.758218765258789 }, { "auxiliary_loss_clip": 0.01133212, "auxiliary_loss_mlp": 0.01050991, "balance_loss_clip": 1.04897451, "balance_loss_mlp": 1.02906811, "epoch": 0.3098659291769374, "flos": 20229594433920.0, "grad_norm": 2.728523592250545, "language_loss": 0.76318216, "learning_rate": 3.233272647887966e-06, "loss": 0.78502423, "num_input_tokens_seen": 55373275, "step": 2577, "time_per_iteration": 2.624692440032959 }, { "auxiliary_loss_clip": 0.01169813, "auxiliary_loss_mlp": 0.01054745, "balance_loss_clip": 1.05372059, "balance_loss_mlp": 1.03297722, "epoch": 0.3099861720675765, "flos": 24748556682240.0, "grad_norm": 2.2321831238925682, "language_loss": 0.8998841, "learning_rate": 3.2326593103747985e-06, "loss": 0.92212969, "num_input_tokens_seen": 55392290, "step": 2578, "time_per_iteration": 2.600324869155884 }, { "auxiliary_loss_clip": 0.01154866, "auxiliary_loss_mlp": 0.01052892, "balance_loss_clip": 1.0536952, "balance_loss_mlp": 1.03131497, "epoch": 0.3101064149582156, "flos": 11765485704960.0, "grad_norm": 3.9012288880474233, "language_loss": 0.84807515, "learning_rate": 3.2320457858681936e-06, "loss": 0.87015271, "num_input_tokens_seen": 55410680, "step": 2579, "time_per_iteration": 2.5820837020874023 }, { "auxiliary_loss_clip": 0.01139544, "auxiliary_loss_mlp": 0.01057221, "balance_loss_clip": 1.04812503, "balance_loss_mlp": 1.03753984, "epoch": 0.31022665784885467, "flos": 23033247626880.0, "grad_norm": 2.5422737605768555, "language_loss": 0.85744047, "learning_rate": 3.2314320744612228e-06, "loss": 0.87940812, "num_input_tokens_seen": 55425980, "step": 2580, "time_per_iteration": 2.6251375675201416 }, { "auxiliary_loss_clip": 0.01150643, "auxiliary_loss_mlp": 0.01049888, "balance_loss_clip": 1.05283403, "balance_loss_mlp": 1.0299319, "epoch": 0.3103469007394938, "flos": 16289188548480.0, "grad_norm": 1.9083273697137502, "language_loss": 0.76516604, "learning_rate": 3.2308181762469854e-06, "loss": 0.78717136, "num_input_tokens_seen": 55443925, "step": 2581, "time_per_iteration": 2.5777063369750977 }, { "auxiliary_loss_clip": 0.01176573, "auxiliary_loss_mlp": 0.01052507, "balance_loss_clip": 1.05518043, "balance_loss_mlp": 1.03076291, "epoch": 0.3104671436301329, "flos": 30515271626880.0, "grad_norm": 1.9953791094127602, "language_loss": 0.78153068, "learning_rate": 3.230204091318609e-06, "loss": 0.80382144, "num_input_tokens_seen": 55464465, "step": 2582, "time_per_iteration": 2.6248416900634766 }, { "auxiliary_loss_clip": 0.01168904, "auxiliary_loss_mlp": 0.00775786, "balance_loss_clip": 1.05311668, "balance_loss_mlp": 1.00021672, "epoch": 0.31058738652077195, "flos": 20047240062720.0, "grad_norm": 1.8896679909105352, "language_loss": 0.84613883, "learning_rate": 3.2295898197692503e-06, "loss": 0.86558574, "num_input_tokens_seen": 55483425, "step": 2583, "time_per_iteration": 2.6463043689727783 }, { "auxiliary_loss_clip": 0.01166724, "auxiliary_loss_mlp": 0.01052468, "balance_loss_clip": 1.05171394, "balance_loss_mlp": 1.03093934, "epoch": 0.31070762941141106, "flos": 28074639237120.0, "grad_norm": 1.877835242847518, "language_loss": 0.79031485, "learning_rate": 3.228975361692094e-06, "loss": 0.81250679, "num_input_tokens_seen": 55504445, "step": 2584, "time_per_iteration": 2.617004632949829 }, { "auxiliary_loss_clip": 0.01154914, "auxiliary_loss_mlp": 0.00777224, "balance_loss_clip": 1.04717386, "balance_loss_mlp": 1.00021863, "epoch": 0.31082787230205017, "flos": 20521907314560.0, "grad_norm": 2.179740188926991, "language_loss": 0.80348313, "learning_rate": 3.228360717180352e-06, "loss": 0.82280451, "num_input_tokens_seen": 55521970, "step": 2585, "time_per_iteration": 3.5510218143463135 }, { "auxiliary_loss_clip": 0.01057972, "auxiliary_loss_mlp": 0.00756598, "balance_loss_clip": 1.02090001, "balance_loss_mlp": 1.00014389, "epoch": 0.3109481151926892, "flos": 62445928723200.0, "grad_norm": 0.8148475842654495, "language_loss": 0.59534788, "learning_rate": 3.227745886327266e-06, "loss": 0.61349356, "num_input_tokens_seen": 55580665, "step": 2586, "time_per_iteration": 3.1204168796539307 }, { "auxiliary_loss_clip": 0.01057926, "auxiliary_loss_mlp": 0.01003036, "balance_loss_clip": 1.02111483, "balance_loss_mlp": 1.00059271, "epoch": 0.31106835808332833, "flos": 44746744723200.0, "grad_norm": 0.8079434519282797, "language_loss": 0.55839384, "learning_rate": 3.227130869226105e-06, "loss": 0.57900345, "num_input_tokens_seen": 55637825, "step": 2587, "time_per_iteration": 3.1360511779785156 }, { "auxiliary_loss_clip": 0.01157101, "auxiliary_loss_mlp": 0.01048563, "balance_loss_clip": 1.05240726, "balance_loss_mlp": 1.02730823, "epoch": 0.3111886009739674, "flos": 23403056100480.0, "grad_norm": 2.426694843537891, "language_loss": 0.82542562, "learning_rate": 3.226515665970167e-06, "loss": 0.8474822, "num_input_tokens_seen": 55655365, "step": 2588, "time_per_iteration": 4.536997079849243 }, { "auxiliary_loss_clip": 0.01150382, "auxiliary_loss_mlp": 0.01052882, "balance_loss_clip": 1.04764438, "balance_loss_mlp": 1.03132892, "epoch": 0.3113088438646065, "flos": 17530728192000.0, "grad_norm": 2.5972851243800257, "language_loss": 0.86531967, "learning_rate": 3.225900276652777e-06, "loss": 0.88735235, "num_input_tokens_seen": 55672140, "step": 2589, "time_per_iteration": 2.5636298656463623 }, { "auxiliary_loss_clip": 0.01146796, "auxiliary_loss_mlp": 0.01046516, "balance_loss_clip": 1.05022919, "balance_loss_mlp": 1.02721596, "epoch": 0.3114290867552456, "flos": 28365802882560.0, "grad_norm": 1.7018383552254015, "language_loss": 0.75701141, "learning_rate": 3.2252847013672906e-06, "loss": 0.77894455, "num_input_tokens_seen": 55694800, "step": 2590, "time_per_iteration": 2.677109479904175 }, { "auxiliary_loss_clip": 0.01118569, "auxiliary_loss_mlp": 0.01054666, "balance_loss_clip": 1.04548097, "balance_loss_mlp": 1.03201628, "epoch": 0.31154932964588467, "flos": 27379157126400.0, "grad_norm": 2.5431818505978208, "language_loss": 0.76646626, "learning_rate": 3.224668940207089e-06, "loss": 0.78819859, "num_input_tokens_seen": 55713785, "step": 2591, "time_per_iteration": 2.705871820449829 }, { "auxiliary_loss_clip": 0.01109333, "auxiliary_loss_mlp": 0.01054696, "balance_loss_clip": 1.04526043, "balance_loss_mlp": 1.03346527, "epoch": 0.3116695725365238, "flos": 26541864120960.0, "grad_norm": 2.1080200764599915, "language_loss": 0.87002343, "learning_rate": 3.2240529932655828e-06, "loss": 0.89166373, "num_input_tokens_seen": 55733050, "step": 2592, "time_per_iteration": 2.742946147918701 }, { "auxiliary_loss_clip": 0.0113976, "auxiliary_loss_mlp": 0.01043591, "balance_loss_clip": 1.05178761, "balance_loss_mlp": 1.02352786, "epoch": 0.3117898154271629, "flos": 21177600134400.0, "grad_norm": 3.4902565370157865, "language_loss": 0.88674128, "learning_rate": 3.223436860636211e-06, "loss": 0.90857482, "num_input_tokens_seen": 55748685, "step": 2593, "time_per_iteration": 2.6175215244293213 }, { "auxiliary_loss_clip": 0.01165694, "auxiliary_loss_mlp": 0.01047239, "balance_loss_clip": 1.05163097, "balance_loss_mlp": 1.02570963, "epoch": 0.31191005831780194, "flos": 27272430840960.0, "grad_norm": 1.749049591078349, "language_loss": 0.74082315, "learning_rate": 3.2228205424124403e-06, "loss": 0.76295245, "num_input_tokens_seen": 55771840, "step": 2594, "time_per_iteration": 3.506075859069824 }, { "auxiliary_loss_clip": 0.01127297, "auxiliary_loss_mlp": 0.01054808, "balance_loss_clip": 1.0472362, "balance_loss_mlp": 1.03205085, "epoch": 0.31203030120844105, "flos": 12963501043200.0, "grad_norm": 2.312090095108092, "language_loss": 0.7501061, "learning_rate": 3.222204038687765e-06, "loss": 0.77192712, "num_input_tokens_seen": 55784975, "step": 2595, "time_per_iteration": 2.582831621170044 }, { "auxiliary_loss_clip": 0.01146876, "auxiliary_loss_mlp": 0.01041411, "balance_loss_clip": 1.04753387, "balance_loss_mlp": 1.02168226, "epoch": 0.31215054409908016, "flos": 27562014288000.0, "grad_norm": 1.8343739192844635, "language_loss": 0.88127476, "learning_rate": 3.221587349555709e-06, "loss": 0.90315759, "num_input_tokens_seen": 55805235, "step": 2596, "time_per_iteration": 2.6658453941345215 }, { "auxiliary_loss_clip": 0.01141594, "auxiliary_loss_mlp": 0.01045466, "balance_loss_clip": 1.05049264, "balance_loss_mlp": 1.02485466, "epoch": 0.3122707869897192, "flos": 21506326427520.0, "grad_norm": 5.05358814014767, "language_loss": 0.69413024, "learning_rate": 3.2209704751098236e-06, "loss": 0.71600085, "num_input_tokens_seen": 55824265, "step": 2597, "time_per_iteration": 2.6446402072906494 }, { "auxiliary_loss_clip": 0.01148099, "auxiliary_loss_mlp": 0.01063856, "balance_loss_clip": 1.05248559, "balance_loss_mlp": 1.04161119, "epoch": 0.31239102988035833, "flos": 15187017674880.0, "grad_norm": 3.475667392480767, "language_loss": 0.82671583, "learning_rate": 3.2203534154436875e-06, "loss": 0.84883541, "num_input_tokens_seen": 55838620, "step": 2598, "time_per_iteration": 2.582808494567871 }, { "auxiliary_loss_clip": 0.01098032, "auxiliary_loss_mlp": 0.01061042, "balance_loss_clip": 1.04627633, "balance_loss_mlp": 1.03760552, "epoch": 0.31251127277099744, "flos": 22053712763520.0, "grad_norm": 1.9111116740142315, "language_loss": 0.7567659, "learning_rate": 3.2197361706509084e-06, "loss": 0.77835661, "num_input_tokens_seen": 55859375, "step": 2599, "time_per_iteration": 2.739089250564575 }, { "auxiliary_loss_clip": 0.01174432, "auxiliary_loss_mlp": 0.01057494, "balance_loss_clip": 1.05443358, "balance_loss_mlp": 1.03430808, "epoch": 0.3126315156616365, "flos": 15193984913280.0, "grad_norm": 2.8396234571735763, "language_loss": 0.84412169, "learning_rate": 3.2191187408251228e-06, "loss": 0.86644095, "num_input_tokens_seen": 55876535, "step": 2600, "time_per_iteration": 2.532607078552246 }, { "auxiliary_loss_clip": 0.01159827, "auxiliary_loss_mlp": 0.01052723, "balance_loss_clip": 1.05105567, "balance_loss_mlp": 1.03054965, "epoch": 0.3127517585522756, "flos": 18145338831360.0, "grad_norm": 3.2676354210694774, "language_loss": 0.78682894, "learning_rate": 3.218501126059993e-06, "loss": 0.80895448, "num_input_tokens_seen": 55891930, "step": 2601, "time_per_iteration": 2.564025402069092 }, { "auxiliary_loss_clip": 0.011512, "auxiliary_loss_mlp": 0.01056486, "balance_loss_clip": 1.04838812, "balance_loss_mlp": 1.03549302, "epoch": 0.31287200144291466, "flos": 21908633731200.0, "grad_norm": 2.195259717530327, "language_loss": 0.81474489, "learning_rate": 3.2178833264492116e-06, "loss": 0.83682173, "num_input_tokens_seen": 55910635, "step": 2602, "time_per_iteration": 2.611701488494873 }, { "auxiliary_loss_clip": 0.01160071, "auxiliary_loss_mlp": 0.01042271, "balance_loss_clip": 1.05244398, "balance_loss_mlp": 1.0204798, "epoch": 0.31299224433355377, "flos": 29896997800320.0, "grad_norm": 1.8074070337018495, "language_loss": 0.76330209, "learning_rate": 3.217265342086498e-06, "loss": 0.78532553, "num_input_tokens_seen": 55931125, "step": 2603, "time_per_iteration": 2.662874460220337 }, { "auxiliary_loss_clip": 0.01134441, "auxiliary_loss_mlp": 0.00776547, "balance_loss_clip": 1.05108047, "balance_loss_mlp": 1.00023174, "epoch": 0.3131124872241929, "flos": 11655886331520.0, "grad_norm": 2.3939243403905404, "language_loss": 0.72967094, "learning_rate": 3.216647173065599e-06, "loss": 0.74878073, "num_input_tokens_seen": 55946590, "step": 2604, "time_per_iteration": 2.6386570930480957 }, { "auxiliary_loss_clip": 0.01140236, "auxiliary_loss_mlp": 0.01045914, "balance_loss_clip": 1.05130649, "balance_loss_mlp": 1.02461171, "epoch": 0.31323273011483194, "flos": 49848785470080.0, "grad_norm": 3.0896584432284158, "language_loss": 0.73621655, "learning_rate": 3.216028819480292e-06, "loss": 0.7580781, "num_input_tokens_seen": 55967930, "step": 2605, "time_per_iteration": 2.8937089443206787 }, { "auxiliary_loss_clip": 0.01125069, "auxiliary_loss_mlp": 0.01043019, "balance_loss_clip": 1.04493177, "balance_loss_mlp": 1.02259827, "epoch": 0.31335297300547105, "flos": 22601278667520.0, "grad_norm": 2.4998152622745637, "language_loss": 0.75782859, "learning_rate": 3.2154102814243793e-06, "loss": 0.77950943, "num_input_tokens_seen": 55987070, "step": 2606, "time_per_iteration": 2.622152090072632 }, { "auxiliary_loss_clip": 0.01128191, "auxiliary_loss_mlp": 0.01049317, "balance_loss_clip": 1.04658794, "balance_loss_mlp": 1.02888405, "epoch": 0.31347321589611016, "flos": 34710858708480.0, "grad_norm": 5.000718007484262, "language_loss": 0.66950351, "learning_rate": 3.2147915589916937e-06, "loss": 0.69127864, "num_input_tokens_seen": 56008630, "step": 2607, "time_per_iteration": 2.7789833545684814 }, { "auxiliary_loss_clip": 0.01129452, "auxiliary_loss_mlp": 0.01047144, "balance_loss_clip": 1.04466271, "balance_loss_mlp": 1.02582896, "epoch": 0.3135934587867492, "flos": 19755789108480.0, "grad_norm": 2.030141025686705, "language_loss": 0.82960403, "learning_rate": 3.2141726522760938e-06, "loss": 0.85136998, "num_input_tokens_seen": 56026690, "step": 2608, "time_per_iteration": 2.635347366333008 }, { "auxiliary_loss_clip": 0.01046692, "auxiliary_loss_mlp": 0.01011779, "balance_loss_clip": 1.01962197, "balance_loss_mlp": 1.00929976, "epoch": 0.3137137016773883, "flos": 65815535583360.0, "grad_norm": 0.7096581730012497, "language_loss": 0.52648562, "learning_rate": 3.213553561371469e-06, "loss": 0.54707032, "num_input_tokens_seen": 56090425, "step": 2609, "time_per_iteration": 3.3258893489837646 }, { "auxiliary_loss_clip": 0.0111329, "auxiliary_loss_mlp": 0.01041922, "balance_loss_clip": 1.04603815, "balance_loss_mlp": 1.02185941, "epoch": 0.31383394456802743, "flos": 16252739222400.0, "grad_norm": 2.1882374574068333, "language_loss": 0.95535517, "learning_rate": 3.212934286371733e-06, "loss": 0.97690725, "num_input_tokens_seen": 56107135, "step": 2610, "time_per_iteration": 2.691107749938965 }, { "auxiliary_loss_clip": 0.01157818, "auxiliary_loss_mlp": 0.01051149, "balance_loss_clip": 1.05480945, "balance_loss_mlp": 1.02944088, "epoch": 0.3139541874586665, "flos": 38795517613440.0, "grad_norm": 2.2919405584239816, "language_loss": 0.83244896, "learning_rate": 3.2123148273708304e-06, "loss": 0.85453856, "num_input_tokens_seen": 56127325, "step": 2611, "time_per_iteration": 3.699021816253662 }, { "auxiliary_loss_clip": 0.01164606, "auxiliary_loss_mlp": 0.01053852, "balance_loss_clip": 1.05387688, "balance_loss_mlp": 1.03340757, "epoch": 0.3140744303493056, "flos": 25046328430080.0, "grad_norm": 1.933698277775771, "language_loss": 0.76709443, "learning_rate": 3.211695184462733e-06, "loss": 0.78927904, "num_input_tokens_seen": 56148500, "step": 2612, "time_per_iteration": 2.616441249847412 }, { "auxiliary_loss_clip": 0.01026298, "auxiliary_loss_mlp": 0.01002114, "balance_loss_clip": 1.01687813, "balance_loss_mlp": 0.99951553, "epoch": 0.3141946732399447, "flos": 72504254782080.0, "grad_norm": 0.886211760726558, "language_loss": 0.60468411, "learning_rate": 3.2110753577414383e-06, "loss": 0.62496829, "num_input_tokens_seen": 56210080, "step": 2613, "time_per_iteration": 4.140542984008789 }, { "auxiliary_loss_clip": 0.01143438, "auxiliary_loss_mlp": 0.01053469, "balance_loss_clip": 1.04767227, "balance_loss_mlp": 1.0324527, "epoch": 0.31431491613058377, "flos": 19239788280960.0, "grad_norm": 2.891047792305127, "language_loss": 0.78798997, "learning_rate": 3.2104553473009757e-06, "loss": 0.80995905, "num_input_tokens_seen": 56228200, "step": 2614, "time_per_iteration": 3.5783040523529053 }, { "auxiliary_loss_clip": 0.01109359, "auxiliary_loss_mlp": 0.01046006, "balance_loss_clip": 1.04242039, "balance_loss_mlp": 1.02467918, "epoch": 0.3144351590212229, "flos": 36210596290560.0, "grad_norm": 1.9257057857341187, "language_loss": 0.67924845, "learning_rate": 3.209835153235399e-06, "loss": 0.70080209, "num_input_tokens_seen": 56249755, "step": 2615, "time_per_iteration": 2.9222562313079834 }, { "auxiliary_loss_clip": 0.0110991, "auxiliary_loss_mlp": 0.0104484, "balance_loss_clip": 1.04274642, "balance_loss_mlp": 1.02358484, "epoch": 0.314555401911862, "flos": 18551740285440.0, "grad_norm": 1.7465399898933054, "language_loss": 0.67651361, "learning_rate": 3.2092147756387916e-06, "loss": 0.69806105, "num_input_tokens_seen": 56270080, "step": 2616, "time_per_iteration": 2.6847310066223145 }, { "auxiliary_loss_clip": 0.0113381, "auxiliary_loss_mlp": 0.01061911, "balance_loss_clip": 1.04849768, "balance_loss_mlp": 1.039464, "epoch": 0.31467564480250104, "flos": 16362877299840.0, "grad_norm": 4.7869284900474325, "language_loss": 0.8317396, "learning_rate": 3.208594214605264e-06, "loss": 0.85369682, "num_input_tokens_seen": 56288625, "step": 2617, "time_per_iteration": 2.630279302597046 }, { "auxiliary_loss_clip": 0.01125804, "auxiliary_loss_mlp": 0.01054162, "balance_loss_clip": 1.04454279, "balance_loss_mlp": 1.03332448, "epoch": 0.31479588769314015, "flos": 21652375127040.0, "grad_norm": 2.044888913700032, "language_loss": 0.77138937, "learning_rate": 3.2079734702289553e-06, "loss": 0.79318905, "num_input_tokens_seen": 56307520, "step": 2618, "time_per_iteration": 2.6364526748657227 }, { "auxiliary_loss_clip": 0.0104237, "auxiliary_loss_mlp": 0.007564, "balance_loss_clip": 1.01553321, "balance_loss_mlp": 1.00008035, "epoch": 0.3149161305837792, "flos": 66051072040320.0, "grad_norm": 0.8052707315580518, "language_loss": 0.60418016, "learning_rate": 3.207352542604031e-06, "loss": 0.62216789, "num_input_tokens_seen": 56369855, "step": 2619, "time_per_iteration": 3.257105827331543 }, { "auxiliary_loss_clip": 0.01111468, "auxiliary_loss_mlp": 0.01044503, "balance_loss_clip": 1.04199362, "balance_loss_mlp": 1.02565575, "epoch": 0.3150363734744183, "flos": 28987201192320.0, "grad_norm": 1.7961631543013628, "language_loss": 0.78357267, "learning_rate": 3.2067314318246864e-06, "loss": 0.80513239, "num_input_tokens_seen": 56390570, "step": 2620, "time_per_iteration": 3.57592511177063 }, { "auxiliary_loss_clip": 0.01129286, "auxiliary_loss_mlp": 0.01052931, "balance_loss_clip": 1.04840899, "balance_loss_mlp": 1.03373873, "epoch": 0.31515661636505743, "flos": 27636600879360.0, "grad_norm": 1.931838023905345, "language_loss": 0.77612281, "learning_rate": 3.206110137985143e-06, "loss": 0.79794502, "num_input_tokens_seen": 56410775, "step": 2621, "time_per_iteration": 2.6923911571502686 }, { "auxiliary_loss_clip": 0.01115697, "auxiliary_loss_mlp": 0.01058674, "balance_loss_clip": 1.04566932, "balance_loss_mlp": 1.03718042, "epoch": 0.3152768592556965, "flos": 24605632465920.0, "grad_norm": 2.894696900610709, "language_loss": 0.92634583, "learning_rate": 3.2054886611796505e-06, "loss": 0.9480896, "num_input_tokens_seen": 56429770, "step": 2622, "time_per_iteration": 2.6996521949768066 }, { "auxiliary_loss_clip": 0.0105346, "auxiliary_loss_mlp": 0.01005224, "balance_loss_clip": 1.01637483, "balance_loss_mlp": 1.00272083, "epoch": 0.3153971021463356, "flos": 68476908026880.0, "grad_norm": 0.8892479525497466, "language_loss": 0.63573575, "learning_rate": 3.204867001502487e-06, "loss": 0.6563226, "num_input_tokens_seen": 56488425, "step": 2623, "time_per_iteration": 3.1695218086242676 }, { "auxiliary_loss_clip": 0.01167201, "auxiliary_loss_mlp": 0.01057284, "balance_loss_clip": 1.05391836, "balance_loss_mlp": 1.0358386, "epoch": 0.3155173450369747, "flos": 25593714766080.0, "grad_norm": 2.4162266724595893, "language_loss": 0.80753636, "learning_rate": 3.2042451590479567e-06, "loss": 0.82978129, "num_input_tokens_seen": 56508940, "step": 2624, "time_per_iteration": 2.6091184616088867 }, { "auxiliary_loss_clip": 0.0116071, "auxiliary_loss_mlp": 0.0104386, "balance_loss_clip": 1.05067503, "balance_loss_mlp": 1.02544188, "epoch": 0.31563758792761376, "flos": 24309333175680.0, "grad_norm": 1.777270334656548, "language_loss": 0.86685771, "learning_rate": 3.203623133910394e-06, "loss": 0.88890344, "num_input_tokens_seen": 56527245, "step": 2625, "time_per_iteration": 2.6026430130004883 }, { "auxiliary_loss_clip": 0.0110185, "auxiliary_loss_mlp": 0.01043282, "balance_loss_clip": 1.04125142, "balance_loss_mlp": 1.02354121, "epoch": 0.31575783081825287, "flos": 31903865550720.0, "grad_norm": 2.9311859497877526, "language_loss": 0.77606618, "learning_rate": 3.203000926184158e-06, "loss": 0.79751754, "num_input_tokens_seen": 56546170, "step": 2626, "time_per_iteration": 2.7605221271514893 }, { "auxiliary_loss_clip": 0.01161389, "auxiliary_loss_mlp": 0.01045087, "balance_loss_clip": 1.04979873, "balance_loss_mlp": 1.02746773, "epoch": 0.315878073708892, "flos": 30810960385920.0, "grad_norm": 2.435009696704956, "language_loss": 0.77772802, "learning_rate": 3.202378535963639e-06, "loss": 0.79979277, "num_input_tokens_seen": 56567085, "step": 2627, "time_per_iteration": 2.639190912246704 }, { "auxiliary_loss_clip": 0.01130646, "auxiliary_loss_mlp": 0.00776382, "balance_loss_clip": 1.04513299, "balance_loss_mlp": 1.00015545, "epoch": 0.31599831659953104, "flos": 22200264253440.0, "grad_norm": 1.8012517843204714, "language_loss": 0.84087026, "learning_rate": 3.2017559633432516e-06, "loss": 0.85994053, "num_input_tokens_seen": 56586715, "step": 2628, "time_per_iteration": 2.6381731033325195 }, { "auxiliary_loss_clip": 0.01141075, "auxiliary_loss_mlp": 0.01059348, "balance_loss_clip": 1.04715514, "balance_loss_mlp": 1.03904629, "epoch": 0.31611855949017015, "flos": 25593463370880.0, "grad_norm": 2.249382878391304, "language_loss": 0.663046, "learning_rate": 3.2011332084174398e-06, "loss": 0.68505025, "num_input_tokens_seen": 56607585, "step": 2629, "time_per_iteration": 2.6380276679992676 }, { "auxiliary_loss_clip": 0.01149049, "auxiliary_loss_mlp": 0.01045263, "balance_loss_clip": 1.04890478, "balance_loss_mlp": 1.02430558, "epoch": 0.31623880238080926, "flos": 20594087694720.0, "grad_norm": 1.9780107890937126, "language_loss": 0.8936317, "learning_rate": 3.2005102712806756e-06, "loss": 0.91557485, "num_input_tokens_seen": 56626415, "step": 2630, "time_per_iteration": 2.616278886795044 }, { "auxiliary_loss_clip": 0.01157723, "auxiliary_loss_mlp": 0.01056961, "balance_loss_clip": 1.05322576, "balance_loss_mlp": 1.03689802, "epoch": 0.3163590452714483, "flos": 12784917600000.0, "grad_norm": 2.4674891645215733, "language_loss": 0.73482442, "learning_rate": 3.1998871520274575e-06, "loss": 0.7569713, "num_input_tokens_seen": 56641750, "step": 2631, "time_per_iteration": 2.5813772678375244 }, { "auxiliary_loss_clip": 0.01137929, "auxiliary_loss_mlp": 0.01052199, "balance_loss_clip": 1.0465529, "balance_loss_mlp": 1.03065848, "epoch": 0.3164792881620874, "flos": 23041292273280.0, "grad_norm": 2.645263753165168, "language_loss": 0.84772539, "learning_rate": 3.199263850752312e-06, "loss": 0.86962664, "num_input_tokens_seen": 56662585, "step": 2632, "time_per_iteration": 2.8840205669403076 }, { "auxiliary_loss_clip": 0.01156626, "auxiliary_loss_mlp": 0.01060377, "balance_loss_clip": 1.05304933, "balance_loss_mlp": 1.04071903, "epoch": 0.31659953105272653, "flos": 18296271780480.0, "grad_norm": 2.3456218152517163, "language_loss": 0.85236013, "learning_rate": 3.198640367549795e-06, "loss": 0.8745302, "num_input_tokens_seen": 56681480, "step": 2633, "time_per_iteration": 2.7483577728271484 }, { "auxiliary_loss_clip": 0.01153651, "auxiliary_loss_mlp": 0.00776036, "balance_loss_clip": 1.04955781, "balance_loss_mlp": 1.00012875, "epoch": 0.3167197739433656, "flos": 25703421880320.0, "grad_norm": 1.8546844552136645, "language_loss": 0.85771096, "learning_rate": 3.198016702514487e-06, "loss": 0.87700784, "num_input_tokens_seen": 56701760, "step": 2634, "time_per_iteration": 2.848972797393799 }, { "auxiliary_loss_clip": 0.01161273, "auxiliary_loss_mlp": 0.01056795, "balance_loss_clip": 1.05224514, "balance_loss_mlp": 1.0378052, "epoch": 0.3168400168340047, "flos": 23546016230400.0, "grad_norm": 1.74710406342021, "language_loss": 0.84814322, "learning_rate": 3.1973928557409972e-06, "loss": 0.87032396, "num_input_tokens_seen": 56719800, "step": 2635, "time_per_iteration": 2.6196751594543457 }, { "auxiliary_loss_clip": 0.01161592, "auxiliary_loss_mlp": 0.01046685, "balance_loss_clip": 1.05271506, "balance_loss_mlp": 1.02813601, "epoch": 0.31696025972464376, "flos": 28366449327360.0, "grad_norm": 15.546613482234733, "language_loss": 0.71290767, "learning_rate": 3.1967688273239636e-06, "loss": 0.73499048, "num_input_tokens_seen": 56739605, "step": 2636, "time_per_iteration": 2.684457778930664 }, { "auxiliary_loss_clip": 0.01124353, "auxiliary_loss_mlp": 0.01045501, "balance_loss_clip": 1.04799676, "balance_loss_mlp": 1.02630806, "epoch": 0.31708050261528287, "flos": 16399111144320.0, "grad_norm": 1.9754670630147788, "language_loss": 0.82098901, "learning_rate": 3.1961446173580503e-06, "loss": 0.84268761, "num_input_tokens_seen": 56756545, "step": 2637, "time_per_iteration": 3.6161422729492188 }, { "auxiliary_loss_clip": 0.01136319, "auxiliary_loss_mlp": 0.01057017, "balance_loss_clip": 1.04858923, "balance_loss_mlp": 1.03768134, "epoch": 0.317200745505922, "flos": 26212347728640.0, "grad_norm": 2.218371314072048, "language_loss": 0.77355886, "learning_rate": 3.1955202259379502e-06, "loss": 0.79549229, "num_input_tokens_seen": 56778275, "step": 2638, "time_per_iteration": 2.722179889678955 }, { "auxiliary_loss_clip": 0.01148758, "auxiliary_loss_mlp": 0.01041435, "balance_loss_clip": 1.04701853, "balance_loss_mlp": 1.02024007, "epoch": 0.31732098839656103, "flos": 31350876693120.0, "grad_norm": 2.21745094255961, "language_loss": 0.827775, "learning_rate": 3.194895653158381e-06, "loss": 0.84967697, "num_input_tokens_seen": 56797215, "step": 2639, "time_per_iteration": 3.737583637237549 }, { "auxiliary_loss_clip": 0.01052157, "auxiliary_loss_mlp": 0.01010706, "balance_loss_clip": 1.0151906, "balance_loss_mlp": 1.00836992, "epoch": 0.31744123128720014, "flos": 58989024835200.0, "grad_norm": 0.7883745366194495, "language_loss": 0.55538285, "learning_rate": 3.194270899114093e-06, "loss": 0.57601148, "num_input_tokens_seen": 56863010, "step": 2640, "time_per_iteration": 4.211897134780884 }, { "auxiliary_loss_clip": 0.01157606, "auxiliary_loss_mlp": 0.01054037, "balance_loss_clip": 1.05151606, "balance_loss_mlp": 1.03383112, "epoch": 0.31756147417783925, "flos": 17417573372160.0, "grad_norm": 2.167191157554067, "language_loss": 0.82574153, "learning_rate": 3.193645963899858e-06, "loss": 0.84785795, "num_input_tokens_seen": 56880625, "step": 2641, "time_per_iteration": 2.6130361557006836 }, { "auxiliary_loss_clip": 0.01132426, "auxiliary_loss_mlp": 0.01062025, "balance_loss_clip": 1.04729736, "balance_loss_mlp": 1.04093671, "epoch": 0.3176817170684783, "flos": 25481673267840.0, "grad_norm": 1.877058934371021, "language_loss": 0.84283602, "learning_rate": 3.193020847610479e-06, "loss": 0.86478055, "num_input_tokens_seen": 56900945, "step": 2642, "time_per_iteration": 2.674253463745117 }, { "auxiliary_loss_clip": 0.01131748, "auxiliary_loss_mlp": 0.01052582, "balance_loss_clip": 1.04823327, "balance_loss_mlp": 1.02972984, "epoch": 0.3178019599591174, "flos": 24972603765120.0, "grad_norm": 2.4768953708561536, "language_loss": 0.7139436, "learning_rate": 3.192395550340787e-06, "loss": 0.73578691, "num_input_tokens_seen": 56918895, "step": 2643, "time_per_iteration": 2.6483285427093506 }, { "auxiliary_loss_clip": 0.01153894, "auxiliary_loss_mlp": 0.01050581, "balance_loss_clip": 1.05172241, "balance_loss_mlp": 1.03023171, "epoch": 0.31792220284975653, "flos": 12422220019200.0, "grad_norm": 2.1144623761026216, "language_loss": 0.76608163, "learning_rate": 3.191770072185638e-06, "loss": 0.78812647, "num_input_tokens_seen": 56935890, "step": 2644, "time_per_iteration": 2.6182446479797363 }, { "auxiliary_loss_clip": 0.01151914, "auxiliary_loss_mlp": 0.01054879, "balance_loss_clip": 1.05294454, "balance_loss_mlp": 1.03385103, "epoch": 0.3180424457403956, "flos": 15485759089920.0, "grad_norm": 2.3059332597628424, "language_loss": 0.72512406, "learning_rate": 3.191144413239916e-06, "loss": 0.74719197, "num_input_tokens_seen": 56952460, "step": 2645, "time_per_iteration": 2.559443235397339 }, { "auxiliary_loss_clip": 0.01139631, "auxiliary_loss_mlp": 0.01056198, "balance_loss_clip": 1.04912257, "balance_loss_mlp": 1.0349071, "epoch": 0.3181626886310347, "flos": 26174964648960.0, "grad_norm": 3.359839664106832, "language_loss": 0.88185346, "learning_rate": 3.190518573598534e-06, "loss": 0.90381175, "num_input_tokens_seen": 56969065, "step": 2646, "time_per_iteration": 3.5823557376861572 }, { "auxiliary_loss_clip": 0.01129086, "auxiliary_loss_mlp": 0.01048129, "balance_loss_clip": 1.04633641, "balance_loss_mlp": 1.02694559, "epoch": 0.3182829315216738, "flos": 25483109811840.0, "grad_norm": 1.6675720492266253, "language_loss": 0.77385706, "learning_rate": 3.1898925533564308e-06, "loss": 0.7956292, "num_input_tokens_seen": 56990535, "step": 2647, "time_per_iteration": 2.6966307163238525 }, { "auxiliary_loss_clip": 0.01116705, "auxiliary_loss_mlp": 0.01048636, "balance_loss_clip": 1.04891109, "balance_loss_mlp": 1.02797711, "epoch": 0.31840317441231286, "flos": 18113701927680.0, "grad_norm": 1.9710435684670204, "language_loss": 0.63806111, "learning_rate": 3.1892663526085733e-06, "loss": 0.65971458, "num_input_tokens_seen": 57008910, "step": 2648, "time_per_iteration": 2.6458003520965576 }, { "auxiliary_loss_clip": 0.01051849, "auxiliary_loss_mlp": 0.01002932, "balance_loss_clip": 1.01518941, "balance_loss_mlp": 1.00038087, "epoch": 0.31852341730295197, "flos": 64741948957440.0, "grad_norm": 0.7645637312405539, "language_loss": 0.56881416, "learning_rate": 3.188639971449956e-06, "loss": 0.58936191, "num_input_tokens_seen": 57074960, "step": 2649, "time_per_iteration": 3.127678155899048 }, { "auxiliary_loss_clip": 0.01167655, "auxiliary_loss_mlp": 0.01053273, "balance_loss_clip": 1.05255485, "balance_loss_mlp": 1.03310323, "epoch": 0.318643660193591, "flos": 20668135582080.0, "grad_norm": 2.302352596559934, "language_loss": 0.72482818, "learning_rate": 3.1880134099756e-06, "loss": 0.74703741, "num_input_tokens_seen": 57094595, "step": 2650, "time_per_iteration": 2.544049024581909 }, { "auxiliary_loss_clip": 0.01145635, "auxiliary_loss_mlp": 0.01047885, "balance_loss_clip": 1.04750204, "balance_loss_mlp": 1.02796483, "epoch": 0.31876390308423014, "flos": 26943345411840.0, "grad_norm": 1.7740399172754509, "language_loss": 0.69959849, "learning_rate": 3.1873866682805535e-06, "loss": 0.72153366, "num_input_tokens_seen": 57115290, "step": 2651, "time_per_iteration": 2.6249194145202637 }, { "auxiliary_loss_clip": 0.01144719, "auxiliary_loss_mlp": 0.01056834, "balance_loss_clip": 1.05121231, "balance_loss_mlp": 1.03599572, "epoch": 0.31888414597486925, "flos": 18041916597120.0, "grad_norm": 15.85621899776131, "language_loss": 0.88350093, "learning_rate": 3.186759746459894e-06, "loss": 0.90551651, "num_input_tokens_seen": 57134400, "step": 2652, "time_per_iteration": 2.6421046257019043 }, { "auxiliary_loss_clip": 0.01136769, "auxiliary_loss_mlp": 0.01052291, "balance_loss_clip": 1.04977322, "balance_loss_mlp": 1.03274024, "epoch": 0.3190043888655083, "flos": 25149319701120.0, "grad_norm": 2.091525532854636, "language_loss": 0.79463518, "learning_rate": 3.1861326446087246e-06, "loss": 0.81652582, "num_input_tokens_seen": 57153140, "step": 2653, "time_per_iteration": 2.6949496269226074 }, { "auxiliary_loss_clip": 0.01156872, "auxiliary_loss_mlp": 0.01037331, "balance_loss_clip": 1.05363178, "balance_loss_mlp": 1.01599312, "epoch": 0.3191246317561474, "flos": 22053892331520.0, "grad_norm": 2.476475129491212, "language_loss": 0.71635884, "learning_rate": 3.1855053628221763e-06, "loss": 0.73830086, "num_input_tokens_seen": 57172395, "step": 2654, "time_per_iteration": 2.5816218852996826 }, { "auxiliary_loss_clip": 0.01117191, "auxiliary_loss_mlp": 0.01058714, "balance_loss_clip": 1.04291081, "balance_loss_mlp": 1.0370419, "epoch": 0.3192448746467865, "flos": 14901815687040.0, "grad_norm": 2.6562026863198653, "language_loss": 0.90046787, "learning_rate": 3.184877901195407e-06, "loss": 0.92222691, "num_input_tokens_seen": 57189090, "step": 2655, "time_per_iteration": 2.671908378601074 }, { "auxiliary_loss_clip": 0.01042597, "auxiliary_loss_mlp": 0.01010751, "balance_loss_clip": 1.02692437, "balance_loss_mlp": 1.00655484, "epoch": 0.3193651175374256, "flos": 67234832657280.0, "grad_norm": 0.7943163546168384, "language_loss": 0.62831497, "learning_rate": 3.184250259823602e-06, "loss": 0.64884853, "num_input_tokens_seen": 57251620, "step": 2656, "time_per_iteration": 3.2668607234954834 }, { "auxiliary_loss_clip": 0.01130936, "auxiliary_loss_mlp": 0.01046938, "balance_loss_clip": 1.04856157, "balance_loss_mlp": 1.02630281, "epoch": 0.3194853604280647, "flos": 12233077977600.0, "grad_norm": 2.385913739221802, "language_loss": 0.81861377, "learning_rate": 3.183622438801974e-06, "loss": 0.84039253, "num_input_tokens_seen": 57266910, "step": 2657, "time_per_iteration": 2.6571896076202393 }, { "auxiliary_loss_clip": 0.01166026, "auxiliary_loss_mlp": 0.01050872, "balance_loss_clip": 1.05540657, "balance_loss_mlp": 1.03253734, "epoch": 0.3196056033187038, "flos": 14939917038720.0, "grad_norm": 2.0195399085830905, "language_loss": 0.75178921, "learning_rate": 3.1829944382257637e-06, "loss": 0.77395815, "num_input_tokens_seen": 57285040, "step": 2658, "time_per_iteration": 2.5862855911254883 }, { "auxiliary_loss_clip": 0.01153024, "auxiliary_loss_mlp": 0.01056432, "balance_loss_clip": 1.05198598, "balance_loss_mlp": 1.03702426, "epoch": 0.31972584620934286, "flos": 23768878164480.0, "grad_norm": 2.9374735408683788, "language_loss": 0.81425756, "learning_rate": 3.1823662581902373e-06, "loss": 0.83635211, "num_input_tokens_seen": 57302725, "step": 2659, "time_per_iteration": 2.6441445350646973 }, { "auxiliary_loss_clip": 0.01111431, "auxiliary_loss_mlp": 0.01061985, "balance_loss_clip": 1.04255164, "balance_loss_mlp": 1.03828633, "epoch": 0.31984608909998197, "flos": 21251540280960.0, "grad_norm": 2.2187557598519647, "language_loss": 0.75097632, "learning_rate": 3.1817378987906896e-06, "loss": 0.77271056, "num_input_tokens_seen": 57322230, "step": 2660, "time_per_iteration": 2.6651134490966797 }, { "auxiliary_loss_clip": 0.01110734, "auxiliary_loss_mlp": 0.01051453, "balance_loss_clip": 1.04960907, "balance_loss_mlp": 1.02978086, "epoch": 0.3199663319906211, "flos": 18296235866880.0, "grad_norm": 2.2841762328654007, "language_loss": 0.79626107, "learning_rate": 3.181109360122442e-06, "loss": 0.81788296, "num_input_tokens_seen": 57339820, "step": 2661, "time_per_iteration": 2.691559314727783 }, { "auxiliary_loss_clip": 0.01127992, "auxiliary_loss_mlp": 0.01053591, "balance_loss_clip": 1.0501287, "balance_loss_mlp": 1.03147805, "epoch": 0.32008657488126013, "flos": 18733627779840.0, "grad_norm": 3.271507648720823, "language_loss": 0.78963965, "learning_rate": 3.1804806422808445e-06, "loss": 0.81145549, "num_input_tokens_seen": 57356955, "step": 2662, "time_per_iteration": 2.66413950920105 }, { "auxiliary_loss_clip": 0.01132401, "auxiliary_loss_mlp": 0.01057053, "balance_loss_clip": 1.04907978, "balance_loss_mlp": 1.03408158, "epoch": 0.32020681777189924, "flos": 20595344670720.0, "grad_norm": 1.7695283390201337, "language_loss": 0.72715235, "learning_rate": 3.1798517453612714e-06, "loss": 0.74904686, "num_input_tokens_seen": 57376760, "step": 2663, "time_per_iteration": 3.6463468074798584 }, { "auxiliary_loss_clip": 0.01150705, "auxiliary_loss_mlp": 0.01057007, "balance_loss_clip": 1.05368936, "balance_loss_mlp": 1.03550184, "epoch": 0.32032706066253835, "flos": 35261692750080.0, "grad_norm": 1.8629394730805655, "language_loss": 0.75545049, "learning_rate": 3.1792226694591265e-06, "loss": 0.77752763, "num_input_tokens_seen": 57398145, "step": 2664, "time_per_iteration": 2.7279741764068604 }, { "auxiliary_loss_clip": 0.01124391, "auxiliary_loss_mlp": 0.0105028, "balance_loss_clip": 1.04974973, "balance_loss_mlp": 1.03162384, "epoch": 0.3204473035531774, "flos": 15304230731520.0, "grad_norm": 2.0202580431763746, "language_loss": 0.8019436, "learning_rate": 3.178593414669841e-06, "loss": 0.8236903, "num_input_tokens_seen": 57416730, "step": 2665, "time_per_iteration": 2.6523213386535645 }, { "auxiliary_loss_clip": 0.01161422, "auxiliary_loss_mlp": 0.01059902, "balance_loss_clip": 1.05573845, "balance_loss_mlp": 1.03631043, "epoch": 0.3205675464438165, "flos": 24462564595200.0, "grad_norm": 2.173059043622253, "language_loss": 0.70907569, "learning_rate": 3.1779639810888707e-06, "loss": 0.73128891, "num_input_tokens_seen": 57436325, "step": 2666, "time_per_iteration": 4.620692491531372 }, { "auxiliary_loss_clip": 0.01150044, "auxiliary_loss_mlp": 0.01053492, "balance_loss_clip": 1.05135536, "balance_loss_mlp": 1.03346491, "epoch": 0.3206877893344556, "flos": 22456235548800.0, "grad_norm": 2.1449433452141027, "language_loss": 0.76238579, "learning_rate": 3.1773343688117013e-06, "loss": 0.78442109, "num_input_tokens_seen": 57457235, "step": 2667, "time_per_iteration": 2.60774564743042 }, { "auxiliary_loss_clip": 0.01143943, "auxiliary_loss_mlp": 0.00775899, "balance_loss_clip": 1.05095828, "balance_loss_mlp": 1.00021052, "epoch": 0.3208080322250947, "flos": 20412236113920.0, "grad_norm": 6.354086653591602, "language_loss": 0.83868879, "learning_rate": 3.1767045779338445e-06, "loss": 0.85788721, "num_input_tokens_seen": 57474895, "step": 2668, "time_per_iteration": 2.615778684616089 }, { "auxiliary_loss_clip": 0.01147491, "auxiliary_loss_mlp": 0.01051609, "balance_loss_clip": 1.0478996, "balance_loss_mlp": 1.03081894, "epoch": 0.3209282751157338, "flos": 21762118154880.0, "grad_norm": 2.3086265961679318, "language_loss": 0.91519189, "learning_rate": 3.176074608550839e-06, "loss": 0.9371829, "num_input_tokens_seen": 57490715, "step": 2669, "time_per_iteration": 2.6161093711853027 }, { "auxiliary_loss_clip": 0.01104459, "auxiliary_loss_mlp": 0.01061169, "balance_loss_clip": 1.04611468, "balance_loss_mlp": 1.03911495, "epoch": 0.32104851800637285, "flos": 22055041566720.0, "grad_norm": 2.35569800053326, "language_loss": 0.82315361, "learning_rate": 3.17544446075825e-06, "loss": 0.84480989, "num_input_tokens_seen": 57509880, "step": 2670, "time_per_iteration": 2.7326366901397705 }, { "auxiliary_loss_clip": 0.01142435, "auxiliary_loss_mlp": 0.01047851, "balance_loss_clip": 1.04930365, "balance_loss_mlp": 1.02946866, "epoch": 0.32116876089701196, "flos": 37012301896320.0, "grad_norm": 1.6588923417915449, "language_loss": 0.70871782, "learning_rate": 3.174814134651671e-06, "loss": 0.73062068, "num_input_tokens_seen": 57532430, "step": 2671, "time_per_iteration": 2.7734787464141846 }, { "auxiliary_loss_clip": 0.01159134, "auxiliary_loss_mlp": 0.01048281, "balance_loss_clip": 1.05067265, "balance_loss_mlp": 1.03031659, "epoch": 0.3212890037876511, "flos": 21979233912960.0, "grad_norm": 1.7087291047459154, "language_loss": 0.80454451, "learning_rate": 3.1741836303267215e-06, "loss": 0.82661867, "num_input_tokens_seen": 57551965, "step": 2672, "time_per_iteration": 2.5687108039855957 }, { "auxiliary_loss_clip": 0.01161723, "auxiliary_loss_mlp": 0.0105692, "balance_loss_clip": 1.05244696, "balance_loss_mlp": 1.03798926, "epoch": 0.32140924667829013, "flos": 10342345875840.0, "grad_norm": 2.0092675514498035, "language_loss": 0.74992687, "learning_rate": 3.1735529478790496e-06, "loss": 0.77211332, "num_input_tokens_seen": 57569955, "step": 2673, "time_per_iteration": 3.3529863357543945 }, { "auxiliary_loss_clip": 0.01150397, "auxiliary_loss_mlp": 0.01054523, "balance_loss_clip": 1.05012453, "balance_loss_mlp": 1.03356624, "epoch": 0.32152948956892924, "flos": 50798910072960.0, "grad_norm": 1.848456246637865, "language_loss": 0.7960813, "learning_rate": 3.172922087404328e-06, "loss": 0.81813049, "num_input_tokens_seen": 57592215, "step": 2674, "time_per_iteration": 2.9005346298217773 }, { "auxiliary_loss_clip": 0.01056393, "auxiliary_loss_mlp": 0.0102279, "balance_loss_clip": 1.01917505, "balance_loss_mlp": 1.02060819, "epoch": 0.32164973245956835, "flos": 63863250549120.0, "grad_norm": 0.7732708087113601, "language_loss": 0.5525198, "learning_rate": 3.1722910489982586e-06, "loss": 0.57331157, "num_input_tokens_seen": 57652575, "step": 2675, "time_per_iteration": 3.2274699211120605 }, { "auxiliary_loss_clip": 0.01133942, "auxiliary_loss_mlp": 0.01047879, "balance_loss_clip": 1.04812193, "balance_loss_mlp": 1.02793527, "epoch": 0.3217699753502074, "flos": 23513948363520.0, "grad_norm": 2.21774612809819, "language_loss": 0.80080849, "learning_rate": 3.1716598327565694e-06, "loss": 0.82262677, "num_input_tokens_seen": 57672215, "step": 2676, "time_per_iteration": 2.6534459590911865 }, { "auxiliary_loss_clip": 0.01161779, "auxiliary_loss_mlp": 0.01048935, "balance_loss_clip": 1.05246687, "balance_loss_mlp": 1.03011191, "epoch": 0.3218902182408465, "flos": 19062533640960.0, "grad_norm": 1.5725507124975275, "language_loss": 0.84204066, "learning_rate": 3.171028438775015e-06, "loss": 0.8641479, "num_input_tokens_seen": 57691410, "step": 2677, "time_per_iteration": 2.5961053371429443 }, { "auxiliary_loss_clip": 0.01162141, "auxiliary_loss_mlp": 0.01043375, "balance_loss_clip": 1.05184674, "balance_loss_mlp": 1.02347922, "epoch": 0.3220104611314856, "flos": 20375571306240.0, "grad_norm": 2.059158000724236, "language_loss": 0.8404004, "learning_rate": 3.170396867149377e-06, "loss": 0.86245561, "num_input_tokens_seen": 57709415, "step": 2678, "time_per_iteration": 2.587770700454712 }, { "auxiliary_loss_clip": 0.01105339, "auxiliary_loss_mlp": 0.01060543, "balance_loss_clip": 1.04584289, "balance_loss_mlp": 1.04009891, "epoch": 0.3221307040221247, "flos": 20117014231680.0, "grad_norm": 2.1075815213965052, "language_loss": 0.8634702, "learning_rate": 3.1697651179754653e-06, "loss": 0.88512909, "num_input_tokens_seen": 57728075, "step": 2679, "time_per_iteration": 2.6972551345825195 }, { "auxiliary_loss_clip": 0.01128565, "auxiliary_loss_mlp": 0.01049823, "balance_loss_clip": 1.05241084, "balance_loss_mlp": 1.03048754, "epoch": 0.3222509469127638, "flos": 23987789602560.0, "grad_norm": 1.9376338379418607, "language_loss": 0.73244977, "learning_rate": 3.1691331913491153e-06, "loss": 0.75423366, "num_input_tokens_seen": 57750645, "step": 2680, "time_per_iteration": 2.7104175090789795 }, { "auxiliary_loss_clip": 0.01160322, "auxiliary_loss_mlp": 0.01043377, "balance_loss_clip": 1.05069697, "balance_loss_mlp": 1.02189565, "epoch": 0.32237118980340285, "flos": 17675735397120.0, "grad_norm": 2.3932294408507016, "language_loss": 0.84916347, "learning_rate": 3.1685010873661898e-06, "loss": 0.87120044, "num_input_tokens_seen": 57769820, "step": 2681, "time_per_iteration": 2.556800603866577 }, { "auxiliary_loss_clip": 0.01145364, "auxiliary_loss_mlp": 0.01051474, "balance_loss_clip": 1.04869294, "balance_loss_mlp": 1.03125644, "epoch": 0.32249143269404196, "flos": 23147982645120.0, "grad_norm": 2.732540526699581, "language_loss": 0.79866564, "learning_rate": 3.167868806122578e-06, "loss": 0.82063401, "num_input_tokens_seen": 57788870, "step": 2682, "time_per_iteration": 2.6139097213745117 }, { "auxiliary_loss_clip": 0.01144768, "auxiliary_loss_mlp": 0.01057526, "balance_loss_clip": 1.05202234, "balance_loss_mlp": 1.03740346, "epoch": 0.32261167558468107, "flos": 24422308427520.0, "grad_norm": 2.1063606281213763, "language_loss": 0.66467124, "learning_rate": 3.1672363477141968e-06, "loss": 0.6866942, "num_input_tokens_seen": 57808165, "step": 2683, "time_per_iteration": 2.6591429710388184 }, { "auxiliary_loss_clip": 0.01142585, "auxiliary_loss_mlp": 0.0106005, "balance_loss_clip": 1.04791033, "balance_loss_mlp": 1.03945053, "epoch": 0.3227319184753201, "flos": 30367175852160.0, "grad_norm": 2.160618189010879, "language_loss": 0.84920096, "learning_rate": 3.1666037122369903e-06, "loss": 0.87122726, "num_input_tokens_seen": 57828825, "step": 2684, "time_per_iteration": 2.695279598236084 }, { "auxiliary_loss_clip": 0.01143758, "auxiliary_loss_mlp": 0.0104908, "balance_loss_clip": 1.04691505, "balance_loss_mlp": 1.02894568, "epoch": 0.32285216136595923, "flos": 16946174257920.0, "grad_norm": 2.6872940125860665, "language_loss": 0.86492747, "learning_rate": 3.165970899786928e-06, "loss": 0.88685584, "num_input_tokens_seen": 57846740, "step": 2685, "time_per_iteration": 2.5951242446899414 }, { "auxiliary_loss_clip": 0.01129483, "auxiliary_loss_mlp": 0.0104703, "balance_loss_clip": 1.04880786, "balance_loss_mlp": 1.02625155, "epoch": 0.32297240425659834, "flos": 21981532383360.0, "grad_norm": 1.6459204514146895, "language_loss": 0.75219965, "learning_rate": 3.1653379104600067e-06, "loss": 0.77396482, "num_input_tokens_seen": 57866885, "step": 2686, "time_per_iteration": 2.7415244579315186 }, { "auxiliary_loss_clip": 0.01149526, "auxiliary_loss_mlp": 0.01049509, "balance_loss_clip": 1.04908681, "balance_loss_mlp": 1.02939844, "epoch": 0.3230926471472374, "flos": 22748045639040.0, "grad_norm": 1.881391600106447, "language_loss": 0.69627702, "learning_rate": 3.164704744352251e-06, "loss": 0.71826744, "num_input_tokens_seen": 57887690, "step": 2687, "time_per_iteration": 2.6027565002441406 }, { "auxiliary_loss_clip": 0.01144466, "auxiliary_loss_mlp": 0.01044865, "balance_loss_clip": 1.04516113, "balance_loss_mlp": 1.0254699, "epoch": 0.3232128900378765, "flos": 16942977947520.0, "grad_norm": 1.7862321270339738, "language_loss": 0.80736852, "learning_rate": 3.164071401559713e-06, "loss": 0.8292619, "num_input_tokens_seen": 57905090, "step": 2688, "time_per_iteration": 2.6036994457244873 }, { "auxiliary_loss_clip": 0.01139469, "auxiliary_loss_mlp": 0.01053715, "balance_loss_clip": 1.04785204, "balance_loss_mlp": 1.03375924, "epoch": 0.3233331329285156, "flos": 24023736138240.0, "grad_norm": 1.7685830878470756, "language_loss": 0.71142173, "learning_rate": 3.1634378821784674e-06, "loss": 0.7333535, "num_input_tokens_seen": 57925305, "step": 2689, "time_per_iteration": 3.6435635089874268 }, { "auxiliary_loss_clip": 0.01129935, "auxiliary_loss_mlp": 0.01051697, "balance_loss_clip": 1.05239892, "balance_loss_mlp": 1.03064466, "epoch": 0.3234533758191547, "flos": 18113845582080.0, "grad_norm": 2.1891207794221406, "language_loss": 0.7387625, "learning_rate": 3.1628041863046208e-06, "loss": 0.76057881, "num_input_tokens_seen": 57942720, "step": 2690, "time_per_iteration": 2.661607503890991 }, { "auxiliary_loss_clip": 0.01168321, "auxiliary_loss_mlp": 0.01052898, "balance_loss_clip": 1.05259132, "balance_loss_mlp": 1.03127337, "epoch": 0.3235736187097938, "flos": 16946138344320.0, "grad_norm": 5.428911083266787, "language_loss": 0.91003835, "learning_rate": 3.162170314034304e-06, "loss": 0.9322505, "num_input_tokens_seen": 57960135, "step": 2691, "time_per_iteration": 2.527146100997925 }, { "auxiliary_loss_clip": 0.01163188, "auxiliary_loss_mlp": 0.01046871, "balance_loss_clip": 1.0505631, "balance_loss_mlp": 1.02810717, "epoch": 0.3236938616004329, "flos": 22127150119680.0, "grad_norm": 1.662715609368348, "language_loss": 0.80936009, "learning_rate": 3.1615362654636738e-06, "loss": 0.83146071, "num_input_tokens_seen": 57980875, "step": 2692, "time_per_iteration": 4.524076223373413 }, { "auxiliary_loss_clip": 0.01117581, "auxiliary_loss_mlp": 0.0104316, "balance_loss_clip": 1.04741573, "balance_loss_mlp": 1.02477813, "epoch": 0.32381410449107195, "flos": 17164618819200.0, "grad_norm": 1.8645561561906547, "language_loss": 0.87102211, "learning_rate": 3.1609020406889163e-06, "loss": 0.8926295, "num_input_tokens_seen": 57998310, "step": 2693, "time_per_iteration": 2.628246545791626 }, { "auxiliary_loss_clip": 0.01137407, "auxiliary_loss_mlp": 0.01047919, "balance_loss_clip": 1.04684782, "balance_loss_mlp": 1.02801108, "epoch": 0.32393434738171106, "flos": 16578125550720.0, "grad_norm": 1.9129238501316363, "language_loss": 0.84956157, "learning_rate": 3.1602676398062416e-06, "loss": 0.87141478, "num_input_tokens_seen": 58017220, "step": 2694, "time_per_iteration": 2.6230642795562744 }, { "auxiliary_loss_clip": 0.01148158, "auxiliary_loss_mlp": 0.01046825, "balance_loss_clip": 1.05057311, "balance_loss_mlp": 1.02827621, "epoch": 0.3240545902723502, "flos": 25483612602240.0, "grad_norm": 4.108518964153062, "language_loss": 0.61671722, "learning_rate": 3.1596330629118886e-06, "loss": 0.63866711, "num_input_tokens_seen": 58037190, "step": 2695, "time_per_iteration": 2.6606943607330322 }, { "auxiliary_loss_clip": 0.01105015, "auxiliary_loss_mlp": 0.01069079, "balance_loss_clip": 1.04400969, "balance_loss_mlp": 1.0472157, "epoch": 0.32417483316298923, "flos": 35845851634560.0, "grad_norm": 2.1254645214933006, "language_loss": 0.7308777, "learning_rate": 3.1589983101021223e-06, "loss": 0.75261861, "num_input_tokens_seen": 58055820, "step": 2696, "time_per_iteration": 2.8060195446014404 }, { "auxiliary_loss_clip": 0.01136892, "auxiliary_loss_mlp": 0.01050103, "balance_loss_clip": 1.04728842, "balance_loss_mlp": 1.03108931, "epoch": 0.32429507605362834, "flos": 30080501406720.0, "grad_norm": 5.933668584059653, "language_loss": 0.84750116, "learning_rate": 3.1583633814732337e-06, "loss": 0.86937112, "num_input_tokens_seen": 58075340, "step": 2697, "time_per_iteration": 2.6954567432403564 }, { "auxiliary_loss_clip": 0.01162228, "auxiliary_loss_mlp": 0.01039638, "balance_loss_clip": 1.05034924, "balance_loss_mlp": 1.02132702, "epoch": 0.3244153189442674, "flos": 18223265387520.0, "grad_norm": 6.250798583670777, "language_loss": 0.71907288, "learning_rate": 3.157728277121541e-06, "loss": 0.74109161, "num_input_tokens_seen": 58093515, "step": 2698, "time_per_iteration": 2.5838987827301025 }, { "auxiliary_loss_clip": 0.01164161, "auxiliary_loss_mlp": 0.01046608, "balance_loss_clip": 1.05123997, "balance_loss_mlp": 1.02596092, "epoch": 0.3245355618349065, "flos": 17710317216000.0, "grad_norm": 3.245460231607872, "language_loss": 0.78787202, "learning_rate": 3.1570929971433897e-06, "loss": 0.80997968, "num_input_tokens_seen": 58109300, "step": 2699, "time_per_iteration": 3.4117767810821533 }, { "auxiliary_loss_clip": 0.01150508, "auxiliary_loss_mlp": 0.01055692, "balance_loss_clip": 1.05051172, "balance_loss_mlp": 1.03469944, "epoch": 0.3246558047255456, "flos": 23440798316160.0, "grad_norm": 1.9529118986796583, "language_loss": 0.83907837, "learning_rate": 3.1564575416351504e-06, "loss": 0.86114037, "num_input_tokens_seen": 58128000, "step": 2700, "time_per_iteration": 2.6093366146087646 }, { "auxiliary_loss_clip": 0.0116298, "auxiliary_loss_mlp": 0.01044405, "balance_loss_clip": 1.05084169, "balance_loss_mlp": 1.02478302, "epoch": 0.32477604761618467, "flos": 21760861178880.0, "grad_norm": 1.8203485579063305, "language_loss": 0.74135542, "learning_rate": 3.155821910693221e-06, "loss": 0.76342928, "num_input_tokens_seen": 58147415, "step": 2701, "time_per_iteration": 2.6109347343444824 }, { "auxiliary_loss_clip": 0.01134025, "auxiliary_loss_mlp": 0.01045788, "balance_loss_clip": 1.0465858, "balance_loss_mlp": 1.02449739, "epoch": 0.3248962905068238, "flos": 19828328624640.0, "grad_norm": 4.093166303185511, "language_loss": 0.8596229, "learning_rate": 3.1551861044140275e-06, "loss": 0.88142103, "num_input_tokens_seen": 58167050, "step": 2702, "time_per_iteration": 2.612567901611328 }, { "auxiliary_loss_clip": 0.01104716, "auxiliary_loss_mlp": 0.01054053, "balance_loss_clip": 1.04503179, "balance_loss_mlp": 1.03390694, "epoch": 0.3250165333974629, "flos": 23948215793280.0, "grad_norm": 7.47356878922595, "language_loss": 0.77444947, "learning_rate": 3.15455012289402e-06, "loss": 0.7960372, "num_input_tokens_seen": 58186695, "step": 2703, "time_per_iteration": 2.7236454486846924 }, { "auxiliary_loss_clip": 0.01150507, "auxiliary_loss_mlp": 0.01054642, "balance_loss_clip": 1.05088329, "balance_loss_mlp": 1.0345428, "epoch": 0.32513677628810195, "flos": 23989333887360.0, "grad_norm": 1.8410053241627613, "language_loss": 0.84207809, "learning_rate": 3.153913966229677e-06, "loss": 0.86412954, "num_input_tokens_seen": 58205815, "step": 2704, "time_per_iteration": 2.6549034118652344 }, { "auxiliary_loss_clip": 0.01045413, "auxiliary_loss_mlp": 0.0101903, "balance_loss_clip": 1.018399, "balance_loss_mlp": 1.01682496, "epoch": 0.32525701917874106, "flos": 70655790009600.0, "grad_norm": 0.6671557430182823, "language_loss": 0.50272304, "learning_rate": 3.1532776345175027e-06, "loss": 0.5233674, "num_input_tokens_seen": 58270960, "step": 2705, "time_per_iteration": 3.1619787216186523 }, { "auxiliary_loss_clip": 0.01156392, "auxiliary_loss_mlp": 0.01052919, "balance_loss_clip": 1.04737163, "balance_loss_mlp": 1.03349972, "epoch": 0.32537726206938017, "flos": 19682639061120.0, "grad_norm": 2.1127573244779967, "language_loss": 0.78467643, "learning_rate": 3.1526411278540285e-06, "loss": 0.80676955, "num_input_tokens_seen": 58289390, "step": 2706, "time_per_iteration": 2.5406243801116943 }, { "auxiliary_loss_clip": 0.01142775, "auxiliary_loss_mlp": 0.01048848, "balance_loss_clip": 1.04865837, "balance_loss_mlp": 1.02767682, "epoch": 0.3254975049600192, "flos": 28760999293440.0, "grad_norm": 2.4596447744727796, "language_loss": 0.81123948, "learning_rate": 3.1520044463358116e-06, "loss": 0.83315569, "num_input_tokens_seen": 58306120, "step": 2707, "time_per_iteration": 2.682624340057373 }, { "auxiliary_loss_clip": 0.01145198, "auxiliary_loss_mlp": 0.0104734, "balance_loss_clip": 1.04747021, "balance_loss_mlp": 1.02839732, "epoch": 0.32561774785065833, "flos": 18877378008960.0, "grad_norm": 1.6435390040074807, "language_loss": 0.8032704, "learning_rate": 3.151367590059436e-06, "loss": 0.82519579, "num_input_tokens_seen": 58324545, "step": 2708, "time_per_iteration": 2.5780484676361084 }, { "auxiliary_loss_clip": 0.01164202, "auxiliary_loss_mlp": 0.00774421, "balance_loss_clip": 1.05125284, "balance_loss_mlp": 1.00022364, "epoch": 0.32573799074129745, "flos": 23112107936640.0, "grad_norm": 2.2108864742218945, "language_loss": 0.87305593, "learning_rate": 3.1507305591215117e-06, "loss": 0.89244217, "num_input_tokens_seen": 58342455, "step": 2709, "time_per_iteration": 2.556419610977173 }, { "auxiliary_loss_clip": 0.01045599, "auxiliary_loss_mlp": 0.01002559, "balance_loss_clip": 1.01815271, "balance_loss_mlp": 1.00023425, "epoch": 0.3258582336319365, "flos": 71237650423680.0, "grad_norm": 0.6980420115877923, "language_loss": 0.55705023, "learning_rate": 3.150093353618677e-06, "loss": 0.57753181, "num_input_tokens_seen": 58407185, "step": 2710, "time_per_iteration": 3.2421669960021973 }, { "auxiliary_loss_clip": 0.0115467, "auxiliary_loss_mlp": 0.01056852, "balance_loss_clip": 1.04941034, "balance_loss_mlp": 1.03569269, "epoch": 0.3259784765225756, "flos": 22456020067200.0, "grad_norm": 2.6338537302594265, "language_loss": 0.88196743, "learning_rate": 3.149455973647596e-06, "loss": 0.90408266, "num_input_tokens_seen": 58425245, "step": 2711, "time_per_iteration": 2.5860812664031982 }, { "auxiliary_loss_clip": 0.01116381, "auxiliary_loss_mlp": 0.01054611, "balance_loss_clip": 1.04220915, "balance_loss_mlp": 1.03177094, "epoch": 0.32609871941321467, "flos": 20484811543680.0, "grad_norm": 2.393541819217028, "language_loss": 0.77020466, "learning_rate": 3.1488184193049563e-06, "loss": 0.79191458, "num_input_tokens_seen": 58444780, "step": 2712, "time_per_iteration": 2.641777276992798 }, { "auxiliary_loss_clip": 0.01156322, "auxiliary_loss_mlp": 0.01050334, "balance_loss_clip": 1.04891491, "balance_loss_mlp": 1.0316298, "epoch": 0.3262189623038538, "flos": 22416805393920.0, "grad_norm": 1.741987083009857, "language_loss": 0.72078133, "learning_rate": 3.1481806906874767e-06, "loss": 0.74284792, "num_input_tokens_seen": 58466090, "step": 2713, "time_per_iteration": 2.5828278064727783 }, { "auxiliary_loss_clip": 0.01155654, "auxiliary_loss_mlp": 0.01048007, "balance_loss_clip": 1.04900837, "balance_loss_mlp": 1.02956569, "epoch": 0.3263392051944929, "flos": 20923496346240.0, "grad_norm": 1.7725296635245533, "language_loss": 0.87649715, "learning_rate": 3.147542787891899e-06, "loss": 0.89853382, "num_input_tokens_seen": 58485435, "step": 2714, "time_per_iteration": 2.572761297225952 }, { "auxiliary_loss_clip": 0.01138007, "auxiliary_loss_mlp": 0.0104662, "balance_loss_clip": 1.05221403, "balance_loss_mlp": 1.02693844, "epoch": 0.32645944808513194, "flos": 24025172682240.0, "grad_norm": 2.262376632070194, "language_loss": 0.75157249, "learning_rate": 3.1469047110149926e-06, "loss": 0.77341878, "num_input_tokens_seen": 58504175, "step": 2715, "time_per_iteration": 3.689948558807373 }, { "auxiliary_loss_clip": 0.01105986, "auxiliary_loss_mlp": 0.01050709, "balance_loss_clip": 1.04452622, "balance_loss_mlp": 1.03039575, "epoch": 0.32657969097577105, "flos": 21032413361280.0, "grad_norm": 2.163052926055696, "language_loss": 0.85575259, "learning_rate": 3.146266460153554e-06, "loss": 0.87731957, "num_input_tokens_seen": 58523885, "step": 2716, "time_per_iteration": 2.701594114303589 }, { "auxiliary_loss_clip": 0.01135409, "auxiliary_loss_mlp": 0.00776144, "balance_loss_clip": 1.04824233, "balance_loss_mlp": 1.00018585, "epoch": 0.32669993386641016, "flos": 22710267509760.0, "grad_norm": 2.586139195913629, "language_loss": 0.79915822, "learning_rate": 3.145628035404404e-06, "loss": 0.81827372, "num_input_tokens_seen": 58543085, "step": 2717, "time_per_iteration": 2.663771390914917 }, { "auxiliary_loss_clip": 0.01047106, "auxiliary_loss_mlp": 0.01007286, "balance_loss_clip": 1.01993537, "balance_loss_mlp": 1.00489032, "epoch": 0.3268201767570492, "flos": 72105718406400.0, "grad_norm": 0.8863922420496092, "language_loss": 0.57476819, "learning_rate": 3.1449894368643922e-06, "loss": 0.59531212, "num_input_tokens_seen": 58605400, "step": 2718, "time_per_iteration": 5.1589133739471436 }, { "auxiliary_loss_clip": 0.01120566, "auxiliary_loss_mlp": 0.01046762, "balance_loss_clip": 1.04864788, "balance_loss_mlp": 1.02770066, "epoch": 0.32694041964768833, "flos": 24535175938560.0, "grad_norm": 1.6903819597948162, "language_loss": 0.71226013, "learning_rate": 3.1443506646303934e-06, "loss": 0.73393333, "num_input_tokens_seen": 58626700, "step": 2719, "time_per_iteration": 2.702737808227539 }, { "auxiliary_loss_clip": 0.01153735, "auxiliary_loss_mlp": 0.01043803, "balance_loss_clip": 1.04992414, "balance_loss_mlp": 1.02399015, "epoch": 0.32706066253832744, "flos": 33183003755520.0, "grad_norm": 2.2469291130592985, "language_loss": 0.67029172, "learning_rate": 3.1437117187993086e-06, "loss": 0.69226712, "num_input_tokens_seen": 58649020, "step": 2720, "time_per_iteration": 2.7136569023132324 }, { "auxiliary_loss_clip": 0.01115501, "auxiliary_loss_mlp": 0.01056822, "balance_loss_clip": 1.04401755, "balance_loss_mlp": 1.03460145, "epoch": 0.3271809054289665, "flos": 24061622008320.0, "grad_norm": 1.656620156912782, "language_loss": 0.80023807, "learning_rate": 3.143072599468065e-06, "loss": 0.82196134, "num_input_tokens_seen": 58668845, "step": 2721, "time_per_iteration": 2.731837034225464 }, { "auxiliary_loss_clip": 0.01133292, "auxiliary_loss_mlp": 0.01046226, "balance_loss_clip": 1.04883599, "balance_loss_mlp": 1.02750969, "epoch": 0.3273011483196056, "flos": 38253769712640.0, "grad_norm": 1.5654554805762464, "language_loss": 0.75884557, "learning_rate": 3.1424333067336174e-06, "loss": 0.78064084, "num_input_tokens_seen": 58691610, "step": 2722, "time_per_iteration": 2.7974629402160645 }, { "auxiliary_loss_clip": 0.01153198, "auxiliary_loss_mlp": 0.01041693, "balance_loss_clip": 1.04875278, "balance_loss_mlp": 1.02209544, "epoch": 0.3274213912102447, "flos": 29054389582080.0, "grad_norm": 2.400630868621271, "language_loss": 0.7843731, "learning_rate": 3.141793840692945e-06, "loss": 0.80632198, "num_input_tokens_seen": 58712360, "step": 2723, "time_per_iteration": 2.6576061248779297 }, { "auxiliary_loss_clip": 0.01128406, "auxiliary_loss_mlp": 0.01051876, "balance_loss_clip": 1.04608512, "balance_loss_mlp": 1.03040636, "epoch": 0.32754163410088377, "flos": 29133249891840.0, "grad_norm": 2.842681073476286, "language_loss": 0.6104725, "learning_rate": 3.1411542014430553e-06, "loss": 0.63227528, "num_input_tokens_seen": 58733440, "step": 2724, "time_per_iteration": 3.6208572387695312 }, { "auxiliary_loss_clip": 0.01119637, "auxiliary_loss_mlp": 0.01049238, "balance_loss_clip": 1.04226303, "balance_loss_mlp": 1.03079605, "epoch": 0.3276618769915229, "flos": 20631075724800.0, "grad_norm": 1.7965255191881666, "language_loss": 0.81479281, "learning_rate": 3.1405143890809804e-06, "loss": 0.83648157, "num_input_tokens_seen": 58752735, "step": 2725, "time_per_iteration": 2.6692845821380615 }, { "auxiliary_loss_clip": 0.01129727, "auxiliary_loss_mlp": 0.01052873, "balance_loss_clip": 1.04711092, "balance_loss_mlp": 1.0333581, "epoch": 0.327782119882162, "flos": 18657425076480.0, "grad_norm": 1.7045027857306514, "language_loss": 0.70262933, "learning_rate": 3.1398744037037796e-06, "loss": 0.72445536, "num_input_tokens_seen": 58772070, "step": 2726, "time_per_iteration": 2.6259822845458984 }, { "auxiliary_loss_clip": 0.01135002, "auxiliary_loss_mlp": 0.01046974, "balance_loss_clip": 1.04678047, "balance_loss_mlp": 1.02685153, "epoch": 0.32790236277280105, "flos": 21795802133760.0, "grad_norm": 2.4129008725155874, "language_loss": 0.83860373, "learning_rate": 3.139234245408538e-06, "loss": 0.86042351, "num_input_tokens_seen": 58790950, "step": 2727, "time_per_iteration": 2.656778573989868 }, { "auxiliary_loss_clip": 0.01121545, "auxiliary_loss_mlp": 0.00773828, "balance_loss_clip": 1.04580808, "balance_loss_mlp": 1.00020003, "epoch": 0.32802260566344016, "flos": 23331414424320.0, "grad_norm": 1.3887029534521127, "language_loss": 0.75929952, "learning_rate": 3.1385939142923666e-06, "loss": 0.7782532, "num_input_tokens_seen": 58813340, "step": 2728, "time_per_iteration": 2.685924768447876 }, { "auxiliary_loss_clip": 0.01138356, "auxiliary_loss_mlp": 0.01046834, "balance_loss_clip": 1.04780769, "balance_loss_mlp": 1.02481604, "epoch": 0.3281428485540792, "flos": 24206988349440.0, "grad_norm": 2.001024690019547, "language_loss": 0.78403735, "learning_rate": 3.137953410452405e-06, "loss": 0.80588919, "num_input_tokens_seen": 58833610, "step": 2729, "time_per_iteration": 2.6724839210510254 }, { "auxiliary_loss_clip": 0.01131495, "auxiliary_loss_mlp": 0.0105237, "balance_loss_clip": 1.04426837, "balance_loss_mlp": 1.03367817, "epoch": 0.3282630914447183, "flos": 34128962380800.0, "grad_norm": 1.8045932534586129, "language_loss": 0.74428487, "learning_rate": 3.1373127339858146e-06, "loss": 0.76612359, "num_input_tokens_seen": 58856210, "step": 2730, "time_per_iteration": 2.74821400642395 }, { "auxiliary_loss_clip": 0.01113942, "auxiliary_loss_mlp": 0.01049583, "balance_loss_clip": 1.04061079, "balance_loss_mlp": 1.03091478, "epoch": 0.32838333433535744, "flos": 27600726170880.0, "grad_norm": 1.7652753841155844, "language_loss": 0.74598944, "learning_rate": 3.136671884989787e-06, "loss": 0.76762468, "num_input_tokens_seen": 58876120, "step": 2731, "time_per_iteration": 2.723985433578491 }, { "auxiliary_loss_clip": 0.01102848, "auxiliary_loss_mlp": 0.0105527, "balance_loss_clip": 1.04482973, "balance_loss_mlp": 1.03461134, "epoch": 0.3285035772259965, "flos": 12349500935040.0, "grad_norm": 2.802482638399822, "language_loss": 0.87343097, "learning_rate": 3.1360308635615383e-06, "loss": 0.8950122, "num_input_tokens_seen": 58894660, "step": 2732, "time_per_iteration": 2.7189581394195557 }, { "auxiliary_loss_clip": 0.01146948, "auxiliary_loss_mlp": 0.01049187, "balance_loss_clip": 1.05022287, "balance_loss_mlp": 1.02683544, "epoch": 0.3286238201166356, "flos": 24316084932480.0, "grad_norm": 2.548600174151031, "language_loss": 0.78982407, "learning_rate": 3.135389669798311e-06, "loss": 0.8117854, "num_input_tokens_seen": 58912720, "step": 2733, "time_per_iteration": 2.6776211261749268 }, { "auxiliary_loss_clip": 0.01145263, "auxiliary_loss_mlp": 0.00774833, "balance_loss_clip": 1.0474298, "balance_loss_mlp": 1.00020695, "epoch": 0.3287440630072747, "flos": 21392812471680.0, "grad_norm": 2.1320306314101085, "language_loss": 0.79936373, "learning_rate": 3.134748303797373e-06, "loss": 0.81856459, "num_input_tokens_seen": 58930090, "step": 2734, "time_per_iteration": 2.633493661880493 }, { "auxiliary_loss_clip": 0.0111096, "auxiliary_loss_mlp": 0.01072066, "balance_loss_clip": 1.04292524, "balance_loss_mlp": 1.04742551, "epoch": 0.32886430589791377, "flos": 23732536579200.0, "grad_norm": 1.9410829362909476, "language_loss": 0.8123486, "learning_rate": 3.1341067656560203e-06, "loss": 0.83417886, "num_input_tokens_seen": 58947935, "step": 2735, "time_per_iteration": 2.7231645584106445 }, { "auxiliary_loss_clip": 0.01147236, "auxiliary_loss_mlp": 0.01047333, "balance_loss_clip": 1.04907346, "balance_loss_mlp": 1.02645957, "epoch": 0.3289845487885529, "flos": 22418708814720.0, "grad_norm": 1.9156169505207146, "language_loss": 0.85704327, "learning_rate": 3.133465055471572e-06, "loss": 0.87898886, "num_input_tokens_seen": 58967720, "step": 2736, "time_per_iteration": 2.6208434104919434 }, { "auxiliary_loss_clip": 0.01120828, "auxiliary_loss_mlp": 0.01049087, "balance_loss_clip": 1.04419005, "balance_loss_mlp": 1.02799881, "epoch": 0.329104791679192, "flos": 19682603147520.0, "grad_norm": 2.2477577969307965, "language_loss": 0.65996057, "learning_rate": 3.1328231733413767e-06, "loss": 0.6816597, "num_input_tokens_seen": 58984360, "step": 2737, "time_per_iteration": 2.6695358753204346 }, { "auxiliary_loss_clip": 0.01143231, "auxiliary_loss_mlp": 0.01063296, "balance_loss_clip": 1.0462085, "balance_loss_mlp": 1.04183817, "epoch": 0.32922503456983104, "flos": 15997234803840.0, "grad_norm": 2.7973671087624226, "language_loss": 0.910326, "learning_rate": 3.1321811193628067e-06, "loss": 0.93239135, "num_input_tokens_seen": 59002505, "step": 2738, "time_per_iteration": 2.5887110233306885 }, { "auxiliary_loss_clip": 0.01149953, "auxiliary_loss_mlp": 0.00774699, "balance_loss_clip": 1.04776561, "balance_loss_mlp": 1.00019574, "epoch": 0.32934527746047015, "flos": 26834069260800.0, "grad_norm": 2.133275787801232, "language_loss": 0.70368254, "learning_rate": 3.131538893633261e-06, "loss": 0.72292906, "num_input_tokens_seen": 59022065, "step": 2739, "time_per_iteration": 2.6375882625579834 }, { "auxiliary_loss_clip": 0.01161411, "auxiliary_loss_mlp": 0.01056872, "balance_loss_clip": 1.04973233, "balance_loss_mlp": 1.03730965, "epoch": 0.32946552035110926, "flos": 23403774372480.0, "grad_norm": 2.504763552552192, "language_loss": 0.77984053, "learning_rate": 3.130896496250165e-06, "loss": 0.80202335, "num_input_tokens_seen": 59041890, "step": 2740, "time_per_iteration": 3.550654411315918 }, { "auxiliary_loss_clip": 0.01160454, "auxiliary_loss_mlp": 0.01046643, "balance_loss_clip": 1.04792202, "balance_loss_mlp": 1.02728331, "epoch": 0.3295857632417483, "flos": 14172470029440.0, "grad_norm": 2.171130020541096, "language_loss": 0.8665691, "learning_rate": 3.1302539273109693e-06, "loss": 0.88864005, "num_input_tokens_seen": 59058715, "step": 2741, "time_per_iteration": 2.558293342590332 }, { "auxiliary_loss_clip": 0.01130762, "auxiliary_loss_mlp": 0.01052258, "balance_loss_clip": 1.04646897, "balance_loss_mlp": 1.03058553, "epoch": 0.32970600613238743, "flos": 22196708807040.0, "grad_norm": 1.7120324194494736, "language_loss": 0.80443937, "learning_rate": 3.1296111869131513e-06, "loss": 0.82626957, "num_input_tokens_seen": 59076140, "step": 2742, "time_per_iteration": 2.6551663875579834 }, { "auxiliary_loss_clip": 0.01160377, "auxiliary_loss_mlp": 0.01046719, "balance_loss_clip": 1.04898834, "balance_loss_mlp": 1.02646542, "epoch": 0.32982624902302654, "flos": 22053784590720.0, "grad_norm": 2.3210910113345604, "language_loss": 0.86065865, "learning_rate": 3.1289682751542153e-06, "loss": 0.88272959, "num_input_tokens_seen": 59095700, "step": 2743, "time_per_iteration": 2.594581365585327 }, { "auxiliary_loss_clip": 0.01143556, "auxiliary_loss_mlp": 0.01051239, "balance_loss_clip": 1.0450443, "balance_loss_mlp": 1.03156972, "epoch": 0.3299464919136656, "flos": 18661626967680.0, "grad_norm": 2.097158240023893, "language_loss": 0.71628582, "learning_rate": 3.1283251921316883e-06, "loss": 0.73823375, "num_input_tokens_seen": 59113445, "step": 2744, "time_per_iteration": 4.575470447540283 }, { "auxiliary_loss_clip": 0.01108246, "auxiliary_loss_mlp": 0.01047601, "balance_loss_clip": 1.04491234, "balance_loss_mlp": 1.02931404, "epoch": 0.3300667348043047, "flos": 13407357404160.0, "grad_norm": 2.5988241058166786, "language_loss": 0.8080852, "learning_rate": 3.1276819379431277e-06, "loss": 0.82964373, "num_input_tokens_seen": 59131535, "step": 2745, "time_per_iteration": 2.678248405456543 }, { "auxiliary_loss_clip": 0.01144951, "auxiliary_loss_mlp": 0.00775698, "balance_loss_clip": 1.04881108, "balance_loss_mlp": 1.00022614, "epoch": 0.33018697769494376, "flos": 15742556398080.0, "grad_norm": 2.165945596350961, "language_loss": 0.75096023, "learning_rate": 3.1270385126861134e-06, "loss": 0.77016675, "num_input_tokens_seen": 59149520, "step": 2746, "time_per_iteration": 2.601346731185913 }, { "auxiliary_loss_clip": 0.01160819, "auxiliary_loss_mlp": 0.01050778, "balance_loss_clip": 1.04992127, "balance_loss_mlp": 1.03015447, "epoch": 0.3303072205855829, "flos": 18258601392000.0, "grad_norm": 2.272058528762308, "language_loss": 0.82223403, "learning_rate": 3.1263949164582533e-06, "loss": 0.84434998, "num_input_tokens_seen": 59169170, "step": 2747, "time_per_iteration": 2.5871071815490723 }, { "auxiliary_loss_clip": 0.01162587, "auxiliary_loss_mlp": 0.0104585, "balance_loss_clip": 1.0493722, "balance_loss_mlp": 1.02585912, "epoch": 0.330427463476222, "flos": 17749424148480.0, "grad_norm": 2.7598608210282416, "language_loss": 0.78400421, "learning_rate": 3.1257511493571797e-06, "loss": 0.80608863, "num_input_tokens_seen": 59187675, "step": 2748, "time_per_iteration": 2.5423567295074463 }, { "auxiliary_loss_clip": 0.01126022, "auxiliary_loss_mlp": 0.01049473, "balance_loss_clip": 1.04603171, "balance_loss_mlp": 1.02980304, "epoch": 0.33054770636686104, "flos": 27162580072320.0, "grad_norm": 2.6748078926241488, "language_loss": 0.78945965, "learning_rate": 3.125107211480552e-06, "loss": 0.81121457, "num_input_tokens_seen": 59207610, "step": 2749, "time_per_iteration": 2.7259063720703125 }, { "auxiliary_loss_clip": 0.01088996, "auxiliary_loss_mlp": 0.01048343, "balance_loss_clip": 1.038414, "balance_loss_mlp": 1.02843523, "epoch": 0.33066794925750015, "flos": 20117193799680.0, "grad_norm": 1.7331294582222592, "language_loss": 0.79599369, "learning_rate": 3.124463102926054e-06, "loss": 0.81736708, "num_input_tokens_seen": 59226945, "step": 2750, "time_per_iteration": 3.5911989212036133 }, { "auxiliary_loss_clip": 0.01046075, "auxiliary_loss_mlp": 0.01011971, "balance_loss_clip": 1.02409267, "balance_loss_mlp": 1.00961101, "epoch": 0.33078819214813926, "flos": 70642609718400.0, "grad_norm": 0.762179911223777, "language_loss": 0.61607599, "learning_rate": 3.1238188237913984e-06, "loss": 0.6366564, "num_input_tokens_seen": 59291485, "step": 2751, "time_per_iteration": 3.273850202560425 }, { "auxiliary_loss_clip": 0.01164155, "auxiliary_loss_mlp": 0.01050179, "balance_loss_clip": 1.04853129, "balance_loss_mlp": 1.02870905, "epoch": 0.3309084350387783, "flos": 21141940907520.0, "grad_norm": 2.4291592099169192, "language_loss": 0.76529372, "learning_rate": 3.1231743741743202e-06, "loss": 0.78743708, "num_input_tokens_seen": 59310990, "step": 2752, "time_per_iteration": 2.6041526794433594 }, { "auxiliary_loss_clip": 0.0114335, "auxiliary_loss_mlp": 0.01048328, "balance_loss_clip": 1.04605401, "balance_loss_mlp": 1.02739477, "epoch": 0.3310286779294174, "flos": 14209350318720.0, "grad_norm": 2.379817842887499, "language_loss": 0.83475167, "learning_rate": 3.122529754172582e-06, "loss": 0.85666841, "num_input_tokens_seen": 59327875, "step": 2753, "time_per_iteration": 2.5531022548675537 }, { "auxiliary_loss_clip": 0.0114848, "auxiliary_loss_mlp": 0.01055028, "balance_loss_clip": 1.04952407, "balance_loss_mlp": 1.03479838, "epoch": 0.33114892082005654, "flos": 20778130005120.0, "grad_norm": 2.172937161584224, "language_loss": 0.72277987, "learning_rate": 3.1218849638839736e-06, "loss": 0.74481499, "num_input_tokens_seen": 59347135, "step": 2754, "time_per_iteration": 2.621527910232544 }, { "auxiliary_loss_clip": 0.01110837, "auxiliary_loss_mlp": 0.01058984, "balance_loss_clip": 1.04061437, "balance_loss_mlp": 1.03603601, "epoch": 0.3312691637106956, "flos": 17090750499840.0, "grad_norm": 3.331272284555186, "language_loss": 0.78621542, "learning_rate": 3.121240003406307e-06, "loss": 0.8079136, "num_input_tokens_seen": 59365985, "step": 2755, "time_per_iteration": 2.660635471343994 }, { "auxiliary_loss_clip": 0.0113053, "auxiliary_loss_mlp": 0.01053279, "balance_loss_clip": 1.04734731, "balance_loss_mlp": 1.03185713, "epoch": 0.3313894066013347, "flos": 29456230008960.0, "grad_norm": 3.7416877212156034, "language_loss": 0.72482729, "learning_rate": 3.120594872837425e-06, "loss": 0.74666536, "num_input_tokens_seen": 59384655, "step": 2756, "time_per_iteration": 2.708507776260376 }, { "auxiliary_loss_clip": 0.01047114, "auxiliary_loss_mlp": 0.00756494, "balance_loss_clip": 1.01968622, "balance_loss_mlp": 1.00035655, "epoch": 0.3315096494919738, "flos": 61419242280960.0, "grad_norm": 0.8298114957716708, "language_loss": 0.62335593, "learning_rate": 3.1199495722751906e-06, "loss": 0.64139205, "num_input_tokens_seen": 59444185, "step": 2757, "time_per_iteration": 3.2405714988708496 }, { "auxiliary_loss_clip": 0.0111582, "auxiliary_loss_mlp": 0.0105032, "balance_loss_clip": 1.04581261, "balance_loss_mlp": 1.03010225, "epoch": 0.33162989238261287, "flos": 21653057485440.0, "grad_norm": 2.560375418326839, "language_loss": 0.83831823, "learning_rate": 3.1193041018174972e-06, "loss": 0.85997963, "num_input_tokens_seen": 59464900, "step": 2758, "time_per_iteration": 2.7021632194519043 }, { "auxiliary_loss_clip": 0.01156877, "auxiliary_loss_mlp": 0.01051617, "balance_loss_clip": 1.05205083, "balance_loss_mlp": 1.03015924, "epoch": 0.331750135273252, "flos": 22674787850880.0, "grad_norm": 3.1017293317257715, "language_loss": 0.94471216, "learning_rate": 3.118658461562261e-06, "loss": 0.96679711, "num_input_tokens_seen": 59481000, "step": 2759, "time_per_iteration": 2.6123600006103516 }, { "auxiliary_loss_clip": 0.01134185, "auxiliary_loss_mlp": 0.01056255, "balance_loss_clip": 1.05004144, "balance_loss_mlp": 1.03672862, "epoch": 0.33187037816389103, "flos": 22746896403840.0, "grad_norm": 2.249071761794524, "language_loss": 0.85059434, "learning_rate": 3.118012651607426e-06, "loss": 0.87249875, "num_input_tokens_seen": 59502605, "step": 2760, "time_per_iteration": 2.616345167160034 }, { "auxiliary_loss_clip": 0.01164943, "auxiliary_loss_mlp": 0.01042666, "balance_loss_clip": 1.05136871, "balance_loss_mlp": 1.0216136, "epoch": 0.33199062105453014, "flos": 19203769918080.0, "grad_norm": 2.319138734510208, "language_loss": 0.83564317, "learning_rate": 3.1173666720509603e-06, "loss": 0.8577193, "num_input_tokens_seen": 59519540, "step": 2761, "time_per_iteration": 2.590277671813965 }, { "auxiliary_loss_clip": 0.01140885, "auxiliary_loss_mlp": 0.01042423, "balance_loss_clip": 1.05029464, "balance_loss_mlp": 1.0217284, "epoch": 0.33211086394516925, "flos": 31577006764800.0, "grad_norm": 1.9267460008699573, "language_loss": 0.68222541, "learning_rate": 3.116720522990859e-06, "loss": 0.70405853, "num_input_tokens_seen": 59540415, "step": 2762, "time_per_iteration": 2.684875011444092 }, { "auxiliary_loss_clip": 0.01104878, "auxiliary_loss_mlp": 0.01060277, "balance_loss_clip": 1.04652059, "balance_loss_mlp": 1.03939128, "epoch": 0.3322311068358083, "flos": 17932496791680.0, "grad_norm": 4.506921355789327, "language_loss": 0.62259305, "learning_rate": 3.116074204525142e-06, "loss": 0.64424455, "num_input_tokens_seen": 59558590, "step": 2763, "time_per_iteration": 2.7533600330352783 }, { "auxiliary_loss_clip": 0.01144548, "auxiliary_loss_mlp": 0.01049561, "balance_loss_clip": 1.05047631, "balance_loss_mlp": 1.02816308, "epoch": 0.3323513497264474, "flos": 32269831269120.0, "grad_norm": 3.5518882400352823, "language_loss": 0.83516175, "learning_rate": 3.1154277167518553e-06, "loss": 0.85710287, "num_input_tokens_seen": 59580205, "step": 2764, "time_per_iteration": 2.7139408588409424 }, { "auxiliary_loss_clip": 0.01034385, "auxiliary_loss_mlp": 0.01012055, "balance_loss_clip": 1.01736295, "balance_loss_mlp": 1.0089314, "epoch": 0.33247159261708653, "flos": 52668674588160.0, "grad_norm": 0.7787189648897279, "language_loss": 0.59459591, "learning_rate": 3.114781059769072e-06, "loss": 0.61506033, "num_input_tokens_seen": 59631530, "step": 2765, "time_per_iteration": 3.0952796936035156 }, { "auxiliary_loss_clip": 0.01137138, "auxiliary_loss_mlp": 0.01046715, "balance_loss_clip": 1.04807496, "balance_loss_mlp": 1.02532911, "epoch": 0.3325918355077256, "flos": 27125232906240.0, "grad_norm": 2.8527460714164734, "language_loss": 0.6704632, "learning_rate": 3.1141342336748874e-06, "loss": 0.69230175, "num_input_tokens_seen": 59651090, "step": 2766, "time_per_iteration": 2.651412010192871 }, { "auxiliary_loss_clip": 0.01149417, "auxiliary_loss_mlp": 0.01059167, "balance_loss_clip": 1.05260015, "balance_loss_mlp": 1.04023671, "epoch": 0.3327120783983647, "flos": 23664414435840.0, "grad_norm": 1.667697647782282, "language_loss": 0.82112038, "learning_rate": 3.1134872385674253e-06, "loss": 0.84320623, "num_input_tokens_seen": 59675245, "step": 2767, "time_per_iteration": 3.6683692932128906 }, { "auxiliary_loss_clip": 0.01139443, "auxiliary_loss_mlp": 0.01053494, "balance_loss_clip": 1.04823327, "balance_loss_mlp": 1.03182173, "epoch": 0.3328323212890038, "flos": 19171378828800.0, "grad_norm": 2.8926070226800724, "language_loss": 0.8569299, "learning_rate": 3.1128400745448353e-06, "loss": 0.87885928, "num_input_tokens_seen": 59694625, "step": 2768, "time_per_iteration": 2.6252832412719727 }, { "auxiliary_loss_clip": 0.01155635, "auxiliary_loss_mlp": 0.01049387, "balance_loss_clip": 1.05322659, "balance_loss_mlp": 1.03009868, "epoch": 0.33295256417964286, "flos": 37706347463040.0, "grad_norm": 2.130165622236748, "language_loss": 0.6292845, "learning_rate": 3.11219274170529e-06, "loss": 0.6513347, "num_input_tokens_seen": 59716435, "step": 2769, "time_per_iteration": 2.773837089538574 }, { "auxiliary_loss_clip": 0.01128618, "auxiliary_loss_mlp": 0.0104786, "balance_loss_clip": 1.04697788, "balance_loss_mlp": 1.02848864, "epoch": 0.333072807070282, "flos": 26505989412480.0, "grad_norm": 1.8356172617931203, "language_loss": 0.81706029, "learning_rate": 3.1115452401469903e-06, "loss": 0.83882511, "num_input_tokens_seen": 59736835, "step": 2770, "time_per_iteration": 3.6135103702545166 }, { "auxiliary_loss_clip": 0.01104848, "auxiliary_loss_mlp": 0.01058945, "balance_loss_clip": 1.04471755, "balance_loss_mlp": 1.03876352, "epoch": 0.3331930499609211, "flos": 21430913823360.0, "grad_norm": 2.1907343560711228, "language_loss": 0.8647992, "learning_rate": 3.1108975699681613e-06, "loss": 0.88643718, "num_input_tokens_seen": 59754230, "step": 2771, "time_per_iteration": 3.6898679733276367 }, { "auxiliary_loss_clip": 0.01119292, "auxiliary_loss_mlp": 0.01050372, "balance_loss_clip": 1.04573989, "balance_loss_mlp": 1.03187084, "epoch": 0.33331329285156014, "flos": 20659947281280.0, "grad_norm": 2.012655043200273, "language_loss": 0.71501529, "learning_rate": 3.1102497312670542e-06, "loss": 0.73671192, "num_input_tokens_seen": 59772235, "step": 2772, "time_per_iteration": 2.6393630504608154 }, { "auxiliary_loss_clip": 0.0112164, "auxiliary_loss_mlp": 0.01062085, "balance_loss_clip": 1.0432992, "balance_loss_mlp": 1.04004335, "epoch": 0.33343353574219925, "flos": 28001596930560.0, "grad_norm": 2.1604645404524736, "language_loss": 0.8084172, "learning_rate": 3.109601724141946e-06, "loss": 0.8302545, "num_input_tokens_seen": 59791230, "step": 2773, "time_per_iteration": 2.6721317768096924 }, { "auxiliary_loss_clip": 0.01129542, "auxiliary_loss_mlp": 0.01058857, "balance_loss_clip": 1.04747605, "balance_loss_mlp": 1.03785205, "epoch": 0.33355377863283836, "flos": 23764963582080.0, "grad_norm": 1.8613610520236035, "language_loss": 0.68300021, "learning_rate": 3.108953548691138e-06, "loss": 0.70488423, "num_input_tokens_seen": 59811315, "step": 2774, "time_per_iteration": 2.6802055835723877 }, { "auxiliary_loss_clip": 0.01166689, "auxiliary_loss_mlp": 0.01047638, "balance_loss_clip": 1.05136466, "balance_loss_mlp": 1.0268836, "epoch": 0.3336740215234774, "flos": 37779677078400.0, "grad_norm": 2.4778388759591325, "language_loss": 0.73010808, "learning_rate": 3.108305205012959e-06, "loss": 0.75225139, "num_input_tokens_seen": 59832010, "step": 2775, "time_per_iteration": 2.6928865909576416 }, { "auxiliary_loss_clip": 0.01139012, "auxiliary_loss_mlp": 0.01052747, "balance_loss_clip": 1.05007792, "balance_loss_mlp": 1.03219569, "epoch": 0.3337942644141165, "flos": 25519056347520.0, "grad_norm": 2.4542808611811386, "language_loss": 0.8723197, "learning_rate": 3.107656693205761e-06, "loss": 0.89423734, "num_input_tokens_seen": 59851450, "step": 2776, "time_per_iteration": 3.518981695175171 }, { "auxiliary_loss_clip": 0.0117363, "auxiliary_loss_mlp": 0.0104905, "balance_loss_clip": 1.05365276, "balance_loss_mlp": 1.02655506, "epoch": 0.3339145073047556, "flos": 25989844930560.0, "grad_norm": 4.694965994893502, "language_loss": 0.7056421, "learning_rate": 3.107008013367924e-06, "loss": 0.72786891, "num_input_tokens_seen": 59870245, "step": 2777, "time_per_iteration": 2.579043388366699 }, { "auxiliary_loss_clip": 0.01127087, "auxiliary_loss_mlp": 0.01048868, "balance_loss_clip": 1.04834938, "balance_loss_mlp": 1.02890086, "epoch": 0.3340347501953947, "flos": 19062569554560.0, "grad_norm": 2.4418625760547874, "language_loss": 0.86948186, "learning_rate": 3.1063591655978507e-06, "loss": 0.89124143, "num_input_tokens_seen": 59886195, "step": 2778, "time_per_iteration": 2.6378085613250732 }, { "auxiliary_loss_clip": 0.01096217, "auxiliary_loss_mlp": 0.01046785, "balance_loss_clip": 1.04111505, "balance_loss_mlp": 1.02597094, "epoch": 0.3341549930860338, "flos": 18109715518080.0, "grad_norm": 3.120903127157724, "language_loss": 0.80078018, "learning_rate": 3.105710149993972e-06, "loss": 0.82221019, "num_input_tokens_seen": 59905525, "step": 2779, "time_per_iteration": 2.686845302581787 }, { "auxiliary_loss_clip": 0.0116884, "auxiliary_loss_mlp": 0.01045393, "balance_loss_clip": 1.05452561, "balance_loss_mlp": 1.02480507, "epoch": 0.33427523597667286, "flos": 22674967418880.0, "grad_norm": 3.541572175796515, "language_loss": 0.85411817, "learning_rate": 3.1050609666547427e-06, "loss": 0.87626052, "num_input_tokens_seen": 59925085, "step": 2780, "time_per_iteration": 2.5984854698181152 }, { "auxiliary_loss_clip": 0.01135926, "auxiliary_loss_mlp": 0.01053832, "balance_loss_clip": 1.05027175, "balance_loss_mlp": 1.03391171, "epoch": 0.33439547886731197, "flos": 22638338524800.0, "grad_norm": 3.475303009560451, "language_loss": 0.77759856, "learning_rate": 3.104411615678644e-06, "loss": 0.79949605, "num_input_tokens_seen": 59943935, "step": 2781, "time_per_iteration": 2.6765661239624023 }, { "auxiliary_loss_clip": 0.01134002, "auxiliary_loss_mlp": 0.01052165, "balance_loss_clip": 1.04883158, "balance_loss_mlp": 1.03095746, "epoch": 0.3345157217579511, "flos": 24096383395200.0, "grad_norm": 3.307343373872197, "language_loss": 0.73736703, "learning_rate": 3.1037620971641803e-06, "loss": 0.75922871, "num_input_tokens_seen": 59963725, "step": 2782, "time_per_iteration": 2.680881977081299 }, { "auxiliary_loss_clip": 0.01167095, "auxiliary_loss_mlp": 0.01049818, "balance_loss_clip": 1.05187106, "balance_loss_mlp": 1.02861035, "epoch": 0.33463596464859013, "flos": 18989491334400.0, "grad_norm": 3.7756639738329905, "language_loss": 0.6485666, "learning_rate": 3.1031124112098844e-06, "loss": 0.67073578, "num_input_tokens_seen": 59981935, "step": 2783, "time_per_iteration": 2.5976569652557373 }, { "auxiliary_loss_clip": 0.01140712, "auxiliary_loss_mlp": 0.01044525, "balance_loss_clip": 1.05027783, "balance_loss_mlp": 1.02553511, "epoch": 0.33475620753922924, "flos": 20375607219840.0, "grad_norm": 2.357378788017526, "language_loss": 0.7209326, "learning_rate": 3.1024625579143127e-06, "loss": 0.74278498, "num_input_tokens_seen": 59999455, "step": 2784, "time_per_iteration": 2.59187388420105 }, { "auxiliary_loss_clip": 0.01166045, "auxiliary_loss_mlp": 0.01052128, "balance_loss_clip": 1.05430555, "balance_loss_mlp": 1.03180289, "epoch": 0.33487645042986836, "flos": 18182578256640.0, "grad_norm": 2.44665812266502, "language_loss": 0.72902989, "learning_rate": 3.101812537376048e-06, "loss": 0.75121164, "num_input_tokens_seen": 60018475, "step": 2785, "time_per_iteration": 2.5761470794677734 }, { "auxiliary_loss_clip": 0.01133458, "auxiliary_loss_mlp": 0.0077549, "balance_loss_clip": 1.0494895, "balance_loss_mlp": 1.00024748, "epoch": 0.3349966933205074, "flos": 25848824135040.0, "grad_norm": 2.050507794336368, "language_loss": 0.84248066, "learning_rate": 3.1011623496936973e-06, "loss": 0.86157006, "num_input_tokens_seen": 60036770, "step": 2786, "time_per_iteration": 2.6552019119262695 }, { "auxiliary_loss_clip": 0.01167383, "auxiliary_loss_mlp": 0.0104917, "balance_loss_clip": 1.0563314, "balance_loss_mlp": 1.02994132, "epoch": 0.3351169362111465, "flos": 28111447699200.0, "grad_norm": 1.9645758785527483, "language_loss": 0.69602996, "learning_rate": 3.100511994965893e-06, "loss": 0.71819556, "num_input_tokens_seen": 60056725, "step": 2787, "time_per_iteration": 2.6234796047210693 }, { "auxiliary_loss_clip": 0.01148281, "auxiliary_loss_mlp": 0.01058476, "balance_loss_clip": 1.0533601, "balance_loss_mlp": 1.0395577, "epoch": 0.33523717910178563, "flos": 22673315393280.0, "grad_norm": 1.7528899534567335, "language_loss": 0.84315956, "learning_rate": 3.0998614732912947e-06, "loss": 0.8652271, "num_input_tokens_seen": 60076100, "step": 2788, "time_per_iteration": 2.606600761413574 }, { "auxiliary_loss_clip": 0.01153785, "auxiliary_loss_mlp": 0.01058426, "balance_loss_clip": 1.05572891, "balance_loss_mlp": 1.03880394, "epoch": 0.3353574219924247, "flos": 15669801400320.0, "grad_norm": 5.376965677556769, "language_loss": 0.6781022, "learning_rate": 3.0992107847685855e-06, "loss": 0.70022428, "num_input_tokens_seen": 60093815, "step": 2789, "time_per_iteration": 2.5857245922088623 }, { "auxiliary_loss_clip": 0.01145174, "auxiliary_loss_mlp": 0.01062333, "balance_loss_clip": 1.05371523, "balance_loss_mlp": 1.03937292, "epoch": 0.3354776648830638, "flos": 24790644443520.0, "grad_norm": 1.8404322376307143, "language_loss": 0.7934866, "learning_rate": 3.0985599294964736e-06, "loss": 0.81556159, "num_input_tokens_seen": 60113370, "step": 2790, "time_per_iteration": 2.6531689167022705 }, { "auxiliary_loss_clip": 0.01138747, "auxiliary_loss_mlp": 0.01058678, "balance_loss_clip": 1.05272436, "balance_loss_mlp": 1.03770947, "epoch": 0.33559790777370285, "flos": 28694852398080.0, "grad_norm": 2.1811720792118003, "language_loss": 0.6984117, "learning_rate": 3.097908907573695e-06, "loss": 0.72038597, "num_input_tokens_seen": 60131350, "step": 2791, "time_per_iteration": 2.711247682571411 }, { "auxiliary_loss_clip": 0.01104621, "auxiliary_loss_mlp": 0.01056424, "balance_loss_clip": 1.04965115, "balance_loss_mlp": 1.03669477, "epoch": 0.33571815066434196, "flos": 22235779825920.0, "grad_norm": 6.512349873928185, "language_loss": 0.89549786, "learning_rate": 3.0972577190990067e-06, "loss": 0.9171083, "num_input_tokens_seen": 60149830, "step": 2792, "time_per_iteration": 3.726475477218628 }, { "auxiliary_loss_clip": 0.01130996, "auxiliary_loss_mlp": 0.01054713, "balance_loss_clip": 1.04995179, "balance_loss_mlp": 1.03572273, "epoch": 0.3358383935549811, "flos": 23842279607040.0, "grad_norm": 1.9059869384284849, "language_loss": 0.79830194, "learning_rate": 3.096606364171196e-06, "loss": 0.82015908, "num_input_tokens_seen": 60169620, "step": 2793, "time_per_iteration": 2.831233263015747 }, { "auxiliary_loss_clip": 0.01110309, "auxiliary_loss_mlp": 0.01070318, "balance_loss_clip": 1.04763877, "balance_loss_mlp": 1.04968274, "epoch": 0.33595863644562013, "flos": 22267308988800.0, "grad_norm": 1.905282445293862, "language_loss": 0.84934437, "learning_rate": 3.0959548428890703e-06, "loss": 0.87115061, "num_input_tokens_seen": 60188490, "step": 2794, "time_per_iteration": 2.9430904388427734 }, { "auxiliary_loss_clip": 0.01150948, "auxiliary_loss_mlp": 0.01058443, "balance_loss_clip": 1.05337667, "balance_loss_mlp": 1.03834426, "epoch": 0.33607887933625924, "flos": 20119779578880.0, "grad_norm": 1.8225719255205386, "language_loss": 0.84444135, "learning_rate": 3.095303155351468e-06, "loss": 0.86653525, "num_input_tokens_seen": 60208695, "step": 2795, "time_per_iteration": 2.7921619415283203 }, { "auxiliary_loss_clip": 0.01103896, "auxiliary_loss_mlp": 0.01065106, "balance_loss_clip": 1.04689622, "balance_loss_mlp": 1.04330254, "epoch": 0.33619912222689835, "flos": 19318109886720.0, "grad_norm": 2.2860865203088676, "language_loss": 0.79395878, "learning_rate": 3.0946513016572464e-06, "loss": 0.81564885, "num_input_tokens_seen": 60227600, "step": 2796, "time_per_iteration": 4.774228096008301 }, { "auxiliary_loss_clip": 0.01159416, "auxiliary_loss_mlp": 0.01050721, "balance_loss_clip": 1.05271447, "balance_loss_mlp": 1.03007436, "epoch": 0.3363193651175374, "flos": 16800664262400.0, "grad_norm": 2.622377007213688, "language_loss": 0.76963884, "learning_rate": 3.0939992819052938e-06, "loss": 0.79174018, "num_input_tokens_seen": 60245110, "step": 2797, "time_per_iteration": 2.607781410217285 }, { "auxiliary_loss_clip": 0.01141835, "auxiliary_loss_mlp": 0.01063244, "balance_loss_clip": 1.05240405, "balance_loss_mlp": 1.04329991, "epoch": 0.3364396080081765, "flos": 23550289948800.0, "grad_norm": 2.1033635277954743, "language_loss": 0.80560696, "learning_rate": 3.0933470961945193e-06, "loss": 0.82765782, "num_input_tokens_seen": 60263405, "step": 2798, "time_per_iteration": 2.645747184753418 }, { "auxiliary_loss_clip": 0.01138633, "auxiliary_loss_mlp": 0.010569, "balance_loss_clip": 1.05212975, "balance_loss_mlp": 1.03735018, "epoch": 0.3365598508988156, "flos": 28037902602240.0, "grad_norm": 2.0047938182060627, "language_loss": 0.67859113, "learning_rate": 3.0926947446238597e-06, "loss": 0.70054644, "num_input_tokens_seen": 60282975, "step": 2799, "time_per_iteration": 2.680166244506836 }, { "auxiliary_loss_clip": 0.0116009, "auxiliary_loss_mlp": 0.01051822, "balance_loss_clip": 1.05248046, "balance_loss_mlp": 1.03062654, "epoch": 0.3366800937894547, "flos": 16982767238400.0, "grad_norm": 2.8895714640108636, "language_loss": 0.82943749, "learning_rate": 3.092042227292276e-06, "loss": 0.85155666, "num_input_tokens_seen": 60299810, "step": 2800, "time_per_iteration": 2.6279497146606445 }, { "auxiliary_loss_clip": 0.01163369, "auxiliary_loss_mlp": 0.01052962, "balance_loss_clip": 1.05411303, "balance_loss_mlp": 1.03434181, "epoch": 0.3368003366800938, "flos": 23915321913600.0, "grad_norm": 1.6843619649999044, "language_loss": 0.87992489, "learning_rate": 3.0913895442987557e-06, "loss": 0.90208828, "num_input_tokens_seen": 60320775, "step": 2801, "time_per_iteration": 2.6471505165100098 }, { "auxiliary_loss_clip": 0.01131415, "auxiliary_loss_mlp": 0.00774671, "balance_loss_clip": 1.05142605, "balance_loss_mlp": 1.00034738, "epoch": 0.3369205795707329, "flos": 24791219061120.0, "grad_norm": 1.7070154023037754, "language_loss": 0.85996127, "learning_rate": 3.090736695742308e-06, "loss": 0.87902212, "num_input_tokens_seen": 60341905, "step": 2802, "time_per_iteration": 3.6060309410095215 }, { "auxiliary_loss_clip": 0.01107367, "auxiliary_loss_mlp": 0.0106098, "balance_loss_clip": 1.04590392, "balance_loss_mlp": 1.04231215, "epoch": 0.33704082246137196, "flos": 17931096161280.0, "grad_norm": 2.5134445011495408, "language_loss": 0.51679951, "learning_rate": 3.0900836817219713e-06, "loss": 0.53848296, "num_input_tokens_seen": 60358335, "step": 2803, "time_per_iteration": 2.692148447036743 }, { "auxiliary_loss_clip": 0.0116718, "auxiliary_loss_mlp": 0.01053207, "balance_loss_clip": 1.05488586, "balance_loss_mlp": 1.03310859, "epoch": 0.33716106535201107, "flos": 21286517149440.0, "grad_norm": 1.9933532774641995, "language_loss": 0.83692837, "learning_rate": 3.089430502336807e-06, "loss": 0.85913229, "num_input_tokens_seen": 60378305, "step": 2804, "time_per_iteration": 2.6271703243255615 }, { "auxiliary_loss_clip": 0.01156709, "auxiliary_loss_mlp": 0.01050513, "balance_loss_clip": 1.05349612, "balance_loss_mlp": 1.03010488, "epoch": 0.3372813082426502, "flos": 18402962152320.0, "grad_norm": 2.5120696558745186, "language_loss": 0.90608835, "learning_rate": 3.088777157685902e-06, "loss": 0.92816055, "num_input_tokens_seen": 60393895, "step": 2805, "time_per_iteration": 2.6395187377929688 }, { "auxiliary_loss_clip": 0.01132692, "auxiliary_loss_mlp": 0.01047341, "balance_loss_clip": 1.04957044, "balance_loss_mlp": 1.02827954, "epoch": 0.33740155113328923, "flos": 17201391367680.0, "grad_norm": 2.3225453170561474, "language_loss": 0.85718763, "learning_rate": 3.088123647868367e-06, "loss": 0.87898797, "num_input_tokens_seen": 60410445, "step": 2806, "time_per_iteration": 2.6468992233276367 }, { "auxiliary_loss_clip": 0.01159008, "auxiliary_loss_mlp": 0.01050523, "balance_loss_clip": 1.05399442, "balance_loss_mlp": 1.03112721, "epoch": 0.33752179402392835, "flos": 29058950609280.0, "grad_norm": 2.0605124807945163, "language_loss": 0.81444943, "learning_rate": 3.0874699729833405e-06, "loss": 0.83654475, "num_input_tokens_seen": 60431815, "step": 2807, "time_per_iteration": 2.6683566570281982 }, { "auxiliary_loss_clip": 0.01134485, "auxiliary_loss_mlp": 0.01052159, "balance_loss_clip": 1.05051529, "balance_loss_mlp": 1.03229916, "epoch": 0.3376420369145674, "flos": 25080730680960.0, "grad_norm": 1.9895702710398047, "language_loss": 0.79963833, "learning_rate": 3.086816133129983e-06, "loss": 0.82150483, "num_input_tokens_seen": 60452075, "step": 2808, "time_per_iteration": 2.7063186168670654 }, { "auxiliary_loss_clip": 0.01169152, "auxiliary_loss_mlp": 0.01056386, "balance_loss_clip": 1.05933714, "balance_loss_mlp": 1.03662109, "epoch": 0.3377622798052065, "flos": 27490624007040.0, "grad_norm": 2.0347063997711157, "language_loss": 0.76375139, "learning_rate": 3.0861621284074826e-06, "loss": 0.78600675, "num_input_tokens_seen": 60472600, "step": 2809, "time_per_iteration": 2.654705286026001 }, { "auxiliary_loss_clip": 0.01145492, "auxiliary_loss_mlp": 0.01046708, "balance_loss_clip": 1.05355382, "balance_loss_mlp": 1.0268476, "epoch": 0.3378825226958456, "flos": 21975211589760.0, "grad_norm": 1.775261685705555, "language_loss": 0.73000646, "learning_rate": 3.085507958915051e-06, "loss": 0.75192839, "num_input_tokens_seen": 60491030, "step": 2810, "time_per_iteration": 2.664161205291748 }, { "auxiliary_loss_clip": 0.01136084, "auxiliary_loss_mlp": 0.0105149, "balance_loss_clip": 1.04808891, "balance_loss_mlp": 1.02976978, "epoch": 0.3380027655864847, "flos": 42523189200000.0, "grad_norm": 4.932689818645291, "language_loss": 0.7174527, "learning_rate": 3.084853624751925e-06, "loss": 0.7393285, "num_input_tokens_seen": 60512615, "step": 2811, "time_per_iteration": 2.8304805755615234 }, { "auxiliary_loss_clip": 0.01131839, "auxiliary_loss_mlp": 0.01050759, "balance_loss_clip": 1.05319142, "balance_loss_mlp": 1.03041017, "epoch": 0.3381230084771238, "flos": 26725080418560.0, "grad_norm": 2.2596512167547123, "language_loss": 0.85503542, "learning_rate": 3.0841991260173668e-06, "loss": 0.87686139, "num_input_tokens_seen": 60532520, "step": 2812, "time_per_iteration": 2.7475507259368896 }, { "auxiliary_loss_clip": 0.01170071, "auxiliary_loss_mlp": 0.01054539, "balance_loss_clip": 1.05760062, "balance_loss_mlp": 1.03415453, "epoch": 0.3382432513677629, "flos": 22710375250560.0, "grad_norm": 2.054701014405002, "language_loss": 0.80166161, "learning_rate": 3.0835444628106634e-06, "loss": 0.82390773, "num_input_tokens_seen": 60551500, "step": 2813, "time_per_iteration": 2.5993313789367676 }, { "auxiliary_loss_clip": 0.01168901, "auxiliary_loss_mlp": 0.00774997, "balance_loss_clip": 1.05634069, "balance_loss_mlp": 1.00026345, "epoch": 0.33836349425840195, "flos": 22122409524480.0, "grad_norm": 1.7358776798263964, "language_loss": 0.83352017, "learning_rate": 3.082889635231126e-06, "loss": 0.85295916, "num_input_tokens_seen": 60570160, "step": 2814, "time_per_iteration": 2.5794146060943604 }, { "auxiliary_loss_clip": 0.01144411, "auxiliary_loss_mlp": 0.01058365, "balance_loss_clip": 1.05253088, "balance_loss_mlp": 1.03821838, "epoch": 0.33848373714904106, "flos": 27308090067840.0, "grad_norm": 2.227452369830557, "language_loss": 0.76444727, "learning_rate": 3.0822346433780925e-06, "loss": 0.78647506, "num_input_tokens_seen": 60590885, "step": 2815, "time_per_iteration": 2.6549878120422363 }, { "auxiliary_loss_clip": 0.01156231, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.05455685, "balance_loss_mlp": 1.0203135, "epoch": 0.3386039800396802, "flos": 25848716394240.0, "grad_norm": 2.856919828189717, "language_loss": 0.87279773, "learning_rate": 3.0815794873509237e-06, "loss": 0.89477122, "num_input_tokens_seen": 60609170, "step": 2816, "time_per_iteration": 2.656491994857788 }, { "auxiliary_loss_clip": 0.01167263, "auxiliary_loss_mlp": 0.01046506, "balance_loss_clip": 1.05742037, "balance_loss_mlp": 1.02736115, "epoch": 0.33872422293031923, "flos": 18880646146560.0, "grad_norm": 2.086057384851497, "language_loss": 0.72563618, "learning_rate": 3.0809241672490066e-06, "loss": 0.74777389, "num_input_tokens_seen": 60627340, "step": 2817, "time_per_iteration": 2.576915979385376 }, { "auxiliary_loss_clip": 0.01142507, "auxiliary_loss_mlp": 0.01041881, "balance_loss_clip": 1.05385971, "balance_loss_mlp": 1.02230644, "epoch": 0.33884446582095834, "flos": 23146977064320.0, "grad_norm": 1.8196145846418532, "language_loss": 0.85251617, "learning_rate": 3.080268683171753e-06, "loss": 0.87436002, "num_input_tokens_seen": 60647630, "step": 2818, "time_per_iteration": 3.591740369796753 }, { "auxiliary_loss_clip": 0.01151297, "auxiliary_loss_mlp": 0.01054249, "balance_loss_clip": 1.05030656, "balance_loss_mlp": 1.03512788, "epoch": 0.33896470871159745, "flos": 15997342544640.0, "grad_norm": 2.1261270967299466, "language_loss": 0.89391994, "learning_rate": 3.0796130352185985e-06, "loss": 0.91597539, "num_input_tokens_seen": 60664485, "step": 2819, "time_per_iteration": 2.6082217693328857 }, { "auxiliary_loss_clip": 0.01125804, "auxiliary_loss_mlp": 0.0077632, "balance_loss_clip": 1.04633737, "balance_loss_mlp": 1.00030816, "epoch": 0.3390849516022365, "flos": 34495754112000.0, "grad_norm": 1.9274489630075795, "language_loss": 0.66623962, "learning_rate": 3.0789572234890057e-06, "loss": 0.68526089, "num_input_tokens_seen": 60686125, "step": 2820, "time_per_iteration": 2.737733840942383 }, { "auxiliary_loss_clip": 0.01139543, "auxiliary_loss_mlp": 0.01050488, "balance_loss_clip": 1.05103564, "balance_loss_mlp": 1.03042555, "epoch": 0.3392051944928756, "flos": 16180307447040.0, "grad_norm": 1.871923978100527, "language_loss": 0.77216458, "learning_rate": 3.0783012480824596e-06, "loss": 0.79406494, "num_input_tokens_seen": 60705270, "step": 2821, "time_per_iteration": 2.602771043777466 }, { "auxiliary_loss_clip": 0.01172116, "auxiliary_loss_mlp": 0.01048569, "balance_loss_clip": 1.0589056, "balance_loss_mlp": 1.02861333, "epoch": 0.33932543738351467, "flos": 17086656349440.0, "grad_norm": 2.6354425750777164, "language_loss": 0.74506199, "learning_rate": 3.077645109098471e-06, "loss": 0.76726878, "num_input_tokens_seen": 60721540, "step": 2822, "time_per_iteration": 4.474803686141968 }, { "auxiliary_loss_clip": 0.01119047, "auxiliary_loss_mlp": 0.0104825, "balance_loss_clip": 1.05040944, "balance_loss_mlp": 1.02822316, "epoch": 0.3394456802741538, "flos": 22126970551680.0, "grad_norm": 3.5663308312709487, "language_loss": 0.72239304, "learning_rate": 3.076988806636577e-06, "loss": 0.744066, "num_input_tokens_seen": 60739300, "step": 2823, "time_per_iteration": 2.6770904064178467 }, { "auxiliary_loss_clip": 0.01150273, "auxiliary_loss_mlp": 0.00776601, "balance_loss_clip": 1.05661368, "balance_loss_mlp": 1.00025368, "epoch": 0.3395659231647929, "flos": 25226887121280.0, "grad_norm": 1.9838693871403732, "language_loss": 0.89039475, "learning_rate": 3.0763323407963377e-06, "loss": 0.90966344, "num_input_tokens_seen": 60758910, "step": 2824, "time_per_iteration": 2.717909812927246 }, { "auxiliary_loss_clip": 0.01154598, "auxiliary_loss_mlp": 0.01050137, "balance_loss_clip": 1.05316067, "balance_loss_mlp": 1.02997899, "epoch": 0.33968616605543195, "flos": 29096477343360.0, "grad_norm": 2.1945729063606065, "language_loss": 0.80444479, "learning_rate": 3.075675711677337e-06, "loss": 0.82649213, "num_input_tokens_seen": 60779005, "step": 2825, "time_per_iteration": 2.686950445175171 }, { "auxiliary_loss_clip": 0.01138474, "auxiliary_loss_mlp": 0.01047444, "balance_loss_clip": 1.05509424, "balance_loss_mlp": 1.02872825, "epoch": 0.33980640894607106, "flos": 21433966479360.0, "grad_norm": 1.9197009447134594, "language_loss": 0.78047621, "learning_rate": 3.0750189193791865e-06, "loss": 0.80233538, "num_input_tokens_seen": 60798590, "step": 2826, "time_per_iteration": 2.6082072257995605 }, { "auxiliary_loss_clip": 0.0114911, "auxiliary_loss_mlp": 0.01047933, "balance_loss_clip": 1.05224371, "balance_loss_mlp": 1.02771473, "epoch": 0.33992665183671017, "flos": 32490035596800.0, "grad_norm": 7.530346711809784, "language_loss": 0.70406598, "learning_rate": 3.0743619640015203e-06, "loss": 0.72603643, "num_input_tokens_seen": 60818840, "step": 2827, "time_per_iteration": 2.8114850521087646 }, { "auxiliary_loss_clip": 0.01145267, "auxiliary_loss_mlp": 0.0104769, "balance_loss_clip": 1.05177331, "balance_loss_mlp": 1.02787781, "epoch": 0.3400468947273492, "flos": 17055414495360.0, "grad_norm": 3.617408868032532, "language_loss": 0.92393196, "learning_rate": 3.073704845643999e-06, "loss": 0.94586152, "num_input_tokens_seen": 60835965, "step": 2828, "time_per_iteration": 3.5592610836029053 }, { "auxiliary_loss_clip": 0.01158032, "auxiliary_loss_mlp": 0.01051764, "balance_loss_clip": 1.05414987, "balance_loss_mlp": 1.02992475, "epoch": 0.34016713761798834, "flos": 16872988296960.0, "grad_norm": 5.558890611956699, "language_loss": 0.77317643, "learning_rate": 3.0730475644063063e-06, "loss": 0.7952745, "num_input_tokens_seen": 60851065, "step": 2829, "time_per_iteration": 2.8076958656311035 }, { "auxiliary_loss_clip": 0.01133887, "auxiliary_loss_mlp": 0.00775163, "balance_loss_clip": 1.05214, "balance_loss_mlp": 1.00029325, "epoch": 0.34028738050862745, "flos": 21907161273600.0, "grad_norm": 1.737545080734447, "language_loss": 0.65184367, "learning_rate": 3.072390120388151e-06, "loss": 0.67093414, "num_input_tokens_seen": 60869390, "step": 2830, "time_per_iteration": 2.723407506942749 }, { "auxiliary_loss_clip": 0.01159315, "auxiliary_loss_mlp": 0.01063209, "balance_loss_clip": 1.05720639, "balance_loss_mlp": 1.04153645, "epoch": 0.3404076233992665, "flos": 22746034477440.0, "grad_norm": 2.399818635277508, "language_loss": 0.70991659, "learning_rate": 3.071732513689267e-06, "loss": 0.73214185, "num_input_tokens_seen": 60887925, "step": 2831, "time_per_iteration": 2.6337225437164307 }, { "auxiliary_loss_clip": 0.01156607, "auxiliary_loss_mlp": 0.01051065, "balance_loss_clip": 1.05734527, "balance_loss_mlp": 1.03068018, "epoch": 0.3405278662899056, "flos": 17052361839360.0, "grad_norm": 2.683913952975046, "language_loss": 0.67548382, "learning_rate": 3.0710747444094134e-06, "loss": 0.69756049, "num_input_tokens_seen": 60905955, "step": 2832, "time_per_iteration": 2.6101930141448975 }, { "auxiliary_loss_clip": 0.01149843, "auxiliary_loss_mlp": 0.01053404, "balance_loss_clip": 1.05579317, "balance_loss_mlp": 1.03293586, "epoch": 0.3406481091805447, "flos": 42813131783040.0, "grad_norm": 2.180052114382107, "language_loss": 0.64896655, "learning_rate": 3.070416812648372e-06, "loss": 0.67099899, "num_input_tokens_seen": 60929405, "step": 2833, "time_per_iteration": 2.890221118927002 }, { "auxiliary_loss_clip": 0.01126205, "auxiliary_loss_mlp": 0.01054475, "balance_loss_clip": 1.05019379, "balance_loss_mlp": 1.03454328, "epoch": 0.3407683520711838, "flos": 26761457917440.0, "grad_norm": 2.0599437440045874, "language_loss": 0.64821708, "learning_rate": 3.069758718505951e-06, "loss": 0.67002392, "num_input_tokens_seen": 60951145, "step": 2834, "time_per_iteration": 2.7307326793670654 }, { "auxiliary_loss_clip": 0.01170766, "auxiliary_loss_mlp": 0.01058283, "balance_loss_clip": 1.05901921, "balance_loss_mlp": 1.03795803, "epoch": 0.3408885949618229, "flos": 28767643309440.0, "grad_norm": 1.7417666950515065, "language_loss": 0.80250496, "learning_rate": 3.0691004620819836e-06, "loss": 0.82479542, "num_input_tokens_seen": 60971275, "step": 2835, "time_per_iteration": 2.649868965148926 }, { "auxiliary_loss_clip": 0.0102901, "auxiliary_loss_mlp": 0.01019751, "balance_loss_clip": 1.02821553, "balance_loss_mlp": 1.01565015, "epoch": 0.341008837852462, "flos": 63576252881280.0, "grad_norm": 0.8040749506907823, "language_loss": 0.60187888, "learning_rate": 3.0684420434763254e-06, "loss": 0.62236655, "num_input_tokens_seen": 61037460, "step": 2836, "time_per_iteration": 3.316394567489624 }, { "auxiliary_loss_clip": 0.01121756, "auxiliary_loss_mlp": 0.01054631, "balance_loss_clip": 1.05210042, "balance_loss_mlp": 1.03527129, "epoch": 0.34112908074310105, "flos": 20812173120000.0, "grad_norm": 1.8020218610863863, "language_loss": 0.76947874, "learning_rate": 3.06778346278886e-06, "loss": 0.7912426, "num_input_tokens_seen": 61056295, "step": 2837, "time_per_iteration": 2.6673426628112793 }, { "auxiliary_loss_clip": 0.01172802, "auxiliary_loss_mlp": 0.01052779, "balance_loss_clip": 1.05896688, "balance_loss_mlp": 1.03109491, "epoch": 0.34124932363374016, "flos": 24976446520320.0, "grad_norm": 2.0392419200596503, "language_loss": 0.79086369, "learning_rate": 3.0671247201194906e-06, "loss": 0.81311953, "num_input_tokens_seen": 61078430, "step": 2838, "time_per_iteration": 2.657527446746826 }, { "auxiliary_loss_clip": 0.0113606, "auxiliary_loss_mlp": 0.01058287, "balance_loss_clip": 1.05398118, "balance_loss_mlp": 1.03722286, "epoch": 0.3413695665243792, "flos": 28402970480640.0, "grad_norm": 1.9822098368713315, "language_loss": 0.75442439, "learning_rate": 3.066465815568151e-06, "loss": 0.7763679, "num_input_tokens_seen": 61099260, "step": 2839, "time_per_iteration": 2.813870668411255 }, { "auxiliary_loss_clip": 0.01154881, "auxiliary_loss_mlp": 0.01049336, "balance_loss_clip": 1.0533514, "balance_loss_mlp": 1.02933264, "epoch": 0.34148980941501833, "flos": 25302012416640.0, "grad_norm": 2.0459532901473807, "language_loss": 0.69167048, "learning_rate": 3.0658067492347947e-06, "loss": 0.71371263, "num_input_tokens_seen": 61121900, "step": 2840, "time_per_iteration": 2.704383373260498 }, { "auxiliary_loss_clip": 0.01084143, "auxiliary_loss_mlp": 0.01051761, "balance_loss_clip": 1.04612541, "balance_loss_mlp": 1.03089929, "epoch": 0.34161005230565744, "flos": 17530081747200.0, "grad_norm": 2.5390722865374107, "language_loss": 0.66639757, "learning_rate": 3.065147521219402e-06, "loss": 0.6877566, "num_input_tokens_seen": 61141155, "step": 2841, "time_per_iteration": 2.7606639862060547 }, { "auxiliary_loss_clip": 0.01125005, "auxiliary_loss_mlp": 0.01060361, "balance_loss_clip": 1.04852104, "balance_loss_mlp": 1.03961873, "epoch": 0.3417302951962965, "flos": 43650101566080.0, "grad_norm": 1.9269297453365517, "language_loss": 0.74389386, "learning_rate": 3.064488131621977e-06, "loss": 0.76574755, "num_input_tokens_seen": 61164480, "step": 2842, "time_per_iteration": 2.835660219192505 }, { "auxiliary_loss_clip": 0.01144005, "auxiliary_loss_mlp": 0.01048275, "balance_loss_clip": 1.05022216, "balance_loss_mlp": 1.02804542, "epoch": 0.3418505380869356, "flos": 30882207012480.0, "grad_norm": 2.310507800187704, "language_loss": 0.73570049, "learning_rate": 3.063828580542549e-06, "loss": 0.75762331, "num_input_tokens_seen": 61185675, "step": 2843, "time_per_iteration": 2.6993954181671143 }, { "auxiliary_loss_clip": 0.01142154, "auxiliary_loss_mlp": 0.01048752, "balance_loss_clip": 1.05315447, "balance_loss_mlp": 1.02889132, "epoch": 0.3419707809775747, "flos": 19463871277440.0, "grad_norm": 2.0478040533528037, "language_loss": 0.7363255, "learning_rate": 3.0631688680811706e-06, "loss": 0.75823462, "num_input_tokens_seen": 61205300, "step": 2844, "time_per_iteration": 3.589906930923462 }, { "auxiliary_loss_clip": 0.01172229, "auxiliary_loss_mlp": 0.01052975, "balance_loss_clip": 1.05766892, "balance_loss_mlp": 1.03310239, "epoch": 0.3420910238682138, "flos": 28727818104960.0, "grad_norm": 2.9398954058034086, "language_loss": 0.75367099, "learning_rate": 3.062508994337921e-06, "loss": 0.77592301, "num_input_tokens_seen": 61224905, "step": 2845, "time_per_iteration": 2.6322224140167236 }, { "auxiliary_loss_clip": 0.01154242, "auxiliary_loss_mlp": 0.01048892, "balance_loss_clip": 1.05171013, "balance_loss_mlp": 1.02780378, "epoch": 0.3422112667588529, "flos": 21397265758080.0, "grad_norm": 4.071509757360904, "language_loss": 0.79023981, "learning_rate": 3.0618489594129013e-06, "loss": 0.81227112, "num_input_tokens_seen": 61243045, "step": 2846, "time_per_iteration": 2.6214139461517334 }, { "auxiliary_loss_clip": 0.01136047, "auxiliary_loss_mlp": 0.01049472, "balance_loss_clip": 1.05428529, "balance_loss_mlp": 1.0290277, "epoch": 0.342331509649492, "flos": 13881450038400.0, "grad_norm": 2.0600948168878963, "language_loss": 0.71075952, "learning_rate": 3.061188763406239e-06, "loss": 0.7326147, "num_input_tokens_seen": 61259190, "step": 2847, "time_per_iteration": 2.676740884780884 }, { "auxiliary_loss_clip": 0.01139113, "auxiliary_loss_mlp": 0.01068382, "balance_loss_clip": 1.05364966, "balance_loss_mlp": 1.04575634, "epoch": 0.34245175254013105, "flos": 28621450955520.0, "grad_norm": 7.609730168053527, "language_loss": 0.81816959, "learning_rate": 3.060528406418085e-06, "loss": 0.84024447, "num_input_tokens_seen": 61279040, "step": 2848, "time_per_iteration": 4.702430248260498 }, { "auxiliary_loss_clip": 0.01134114, "auxiliary_loss_mlp": 0.01045185, "balance_loss_clip": 1.05226564, "balance_loss_mlp": 1.02490771, "epoch": 0.34257199543077016, "flos": 34127058960000.0, "grad_norm": 1.5432426214526513, "language_loss": 0.61922598, "learning_rate": 3.0598678885486145e-06, "loss": 0.64101899, "num_input_tokens_seen": 61301580, "step": 2849, "time_per_iteration": 2.7296512126922607 }, { "auxiliary_loss_clip": 0.01129376, "auxiliary_loss_mlp": 0.00777793, "balance_loss_clip": 1.04964304, "balance_loss_mlp": 1.00025892, "epoch": 0.34269223832140927, "flos": 19974018188160.0, "grad_norm": 1.6956368721794106, "language_loss": 0.74502122, "learning_rate": 3.0592072098980282e-06, "loss": 0.76409292, "num_input_tokens_seen": 61321240, "step": 2850, "time_per_iteration": 2.6597113609313965 }, { "auxiliary_loss_clip": 0.01135448, "auxiliary_loss_mlp": 0.01052339, "balance_loss_clip": 1.05209982, "balance_loss_mlp": 1.03135824, "epoch": 0.3428124812120483, "flos": 27235658292480.0, "grad_norm": 2.7056574639228947, "language_loss": 0.72929347, "learning_rate": 3.0585463705665514e-06, "loss": 0.75117135, "num_input_tokens_seen": 61341615, "step": 2851, "time_per_iteration": 2.667696952819824 }, { "auxiliary_loss_clip": 0.01125658, "auxiliary_loss_mlp": 0.01058095, "balance_loss_clip": 1.04894233, "balance_loss_mlp": 1.03658938, "epoch": 0.34293272410268744, "flos": 24570871079040.0, "grad_norm": 2.1619534199759434, "language_loss": 0.70556343, "learning_rate": 3.0578853706544304e-06, "loss": 0.72740096, "num_input_tokens_seen": 61359005, "step": 2852, "time_per_iteration": 2.704174757003784 }, { "auxiliary_loss_clip": 0.01131304, "auxiliary_loss_mlp": 0.00776149, "balance_loss_clip": 1.05194473, "balance_loss_mlp": 1.00033545, "epoch": 0.34305296699332655, "flos": 21506865131520.0, "grad_norm": 2.799395648399289, "language_loss": 0.65635967, "learning_rate": 3.0572242102619404e-06, "loss": 0.67543423, "num_input_tokens_seen": 61376160, "step": 2853, "time_per_iteration": 2.65948224067688 }, { "auxiliary_loss_clip": 0.01136252, "auxiliary_loss_mlp": 0.01048108, "balance_loss_clip": 1.05122733, "balance_loss_mlp": 1.02846265, "epoch": 0.3431732098839656, "flos": 24056665931520.0, "grad_norm": 2.200773947265631, "language_loss": 0.80541956, "learning_rate": 3.0565628894893784e-06, "loss": 0.82726312, "num_input_tokens_seen": 61396795, "step": 2854, "time_per_iteration": 3.5776588916778564 }, { "auxiliary_loss_clip": 0.0114519, "auxiliary_loss_mlp": 0.01056713, "balance_loss_clip": 1.05157042, "balance_loss_mlp": 1.03530288, "epoch": 0.3432934527746047, "flos": 16800879744000.0, "grad_norm": 3.552847047576464, "language_loss": 0.74394315, "learning_rate": 3.0559014084370655e-06, "loss": 0.76596224, "num_input_tokens_seen": 61415320, "step": 2855, "time_per_iteration": 2.580394744873047 }, { "auxiliary_loss_clip": 0.01152725, "auxiliary_loss_mlp": 0.01054976, "balance_loss_clip": 1.05418146, "balance_loss_mlp": 1.0338285, "epoch": 0.34341369566524377, "flos": 23439720908160.0, "grad_norm": 2.011172894176608, "language_loss": 0.78675473, "learning_rate": 3.055239767205349e-06, "loss": 0.80883181, "num_input_tokens_seen": 61437070, "step": 2856, "time_per_iteration": 2.6823391914367676 }, { "auxiliary_loss_clip": 0.01153596, "auxiliary_loss_mlp": 0.01055461, "balance_loss_clip": 1.05845392, "balance_loss_mlp": 1.03496945, "epoch": 0.3435339385558829, "flos": 17267466435840.0, "grad_norm": 1.8458949157847448, "language_loss": 0.78198969, "learning_rate": 3.054577965894599e-06, "loss": 0.80408025, "num_input_tokens_seen": 61453215, "step": 2857, "time_per_iteration": 2.620321750640869 }, { "auxiliary_loss_clip": 0.01145738, "auxiliary_loss_mlp": 0.01052389, "balance_loss_clip": 1.0526371, "balance_loss_mlp": 1.03209937, "epoch": 0.343654181446522, "flos": 22199366413440.0, "grad_norm": 1.6324861072573194, "language_loss": 0.70068777, "learning_rate": 3.0539160046052094e-06, "loss": 0.72266901, "num_input_tokens_seen": 61472915, "step": 2858, "time_per_iteration": 2.64688777923584 }, { "auxiliary_loss_clip": 0.01137844, "auxiliary_loss_mlp": 0.01047285, "balance_loss_clip": 1.05159163, "balance_loss_mlp": 1.02411056, "epoch": 0.34377442433716104, "flos": 19901801894400.0, "grad_norm": 2.6632743177030758, "language_loss": 0.70579255, "learning_rate": 3.0532538834376003e-06, "loss": 0.72764385, "num_input_tokens_seen": 61492475, "step": 2859, "time_per_iteration": 2.6669366359710693 }, { "auxiliary_loss_clip": 0.01159823, "auxiliary_loss_mlp": 0.01045473, "balance_loss_clip": 1.05471432, "balance_loss_mlp": 1.02475452, "epoch": 0.34389466722780015, "flos": 22197678474240.0, "grad_norm": 4.872236754001591, "language_loss": 0.77934361, "learning_rate": 3.0525916024922143e-06, "loss": 0.80139655, "num_input_tokens_seen": 61511660, "step": 2860, "time_per_iteration": 2.658752918243408 }, { "auxiliary_loss_clip": 0.01141365, "auxiliary_loss_mlp": 0.01047261, "balance_loss_clip": 1.05191004, "balance_loss_mlp": 1.02637601, "epoch": 0.34401491011843927, "flos": 18624567110400.0, "grad_norm": 5.179057471243604, "language_loss": 0.83853745, "learning_rate": 3.0519291618695193e-06, "loss": 0.86042374, "num_input_tokens_seen": 61529060, "step": 2861, "time_per_iteration": 2.7143945693969727 }, { "auxiliary_loss_clip": 0.01118949, "auxiliary_loss_mlp": 0.01068679, "balance_loss_clip": 1.04689693, "balance_loss_mlp": 1.04654169, "epoch": 0.3441351530090783, "flos": 17858197509120.0, "grad_norm": 1.7094047334068911, "language_loss": 0.75762802, "learning_rate": 3.0512665616700065e-06, "loss": 0.7795043, "num_input_tokens_seen": 61548125, "step": 2862, "time_per_iteration": 2.6630513668060303 }, { "auxiliary_loss_clip": 0.01104186, "auxiliary_loss_mlp": 0.01058057, "balance_loss_clip": 1.04416633, "balance_loss_mlp": 1.03780365, "epoch": 0.34425539589971743, "flos": 23112754381440.0, "grad_norm": 1.967054705446216, "language_loss": 0.89452434, "learning_rate": 3.0506038019941933e-06, "loss": 0.91614681, "num_input_tokens_seen": 61568135, "step": 2863, "time_per_iteration": 2.7323219776153564 }, { "auxiliary_loss_clip": 0.01129085, "auxiliary_loss_mlp": 0.0105299, "balance_loss_clip": 1.05046308, "balance_loss_mlp": 1.03178287, "epoch": 0.34437563879035654, "flos": 21907699977600.0, "grad_norm": 2.6892767328197182, "language_loss": 0.6763798, "learning_rate": 3.049940882942617e-06, "loss": 0.69820058, "num_input_tokens_seen": 61586920, "step": 2864, "time_per_iteration": 2.6850168704986572 }, { "auxiliary_loss_clip": 0.01169509, "auxiliary_loss_mlp": 0.01057159, "balance_loss_clip": 1.05481195, "balance_loss_mlp": 1.03647637, "epoch": 0.3444958816809956, "flos": 23076915586560.0, "grad_norm": 2.0119295103208636, "language_loss": 0.80500126, "learning_rate": 3.0492778046158448e-06, "loss": 0.82726789, "num_input_tokens_seen": 61608340, "step": 2865, "time_per_iteration": 2.5710396766662598 }, { "auxiliary_loss_clip": 0.0115001, "auxiliary_loss_mlp": 0.01052701, "balance_loss_clip": 1.0528934, "balance_loss_mlp": 1.0327338, "epoch": 0.3446161245716347, "flos": 21908633731200.0, "grad_norm": 3.794973288408887, "language_loss": 0.76789695, "learning_rate": 3.0486145671144633e-06, "loss": 0.78992409, "num_input_tokens_seen": 61628130, "step": 2866, "time_per_iteration": 2.5974059104919434 }, { "auxiliary_loss_clip": 0.01086657, "auxiliary_loss_mlp": 0.01055632, "balance_loss_clip": 1.04804897, "balance_loss_mlp": 1.03367352, "epoch": 0.3447363674622738, "flos": 25112834461440.0, "grad_norm": 2.4782742131080435, "language_loss": 0.77563596, "learning_rate": 3.047951170539086e-06, "loss": 0.79705882, "num_input_tokens_seen": 61647755, "step": 2867, "time_per_iteration": 2.743694543838501 }, { "auxiliary_loss_clip": 0.01120569, "auxiliary_loss_mlp": 0.01043432, "balance_loss_clip": 1.05276489, "balance_loss_mlp": 1.02549112, "epoch": 0.3448566103529129, "flos": 11984684451840.0, "grad_norm": 1.8914934943947463, "language_loss": 0.8401925, "learning_rate": 3.047287614990349e-06, "loss": 0.8618325, "num_input_tokens_seen": 61665675, "step": 2868, "time_per_iteration": 2.6390113830566406 }, { "auxiliary_loss_clip": 0.01135533, "auxiliary_loss_mlp": 0.01068919, "balance_loss_clip": 1.05052769, "balance_loss_mlp": 1.04523182, "epoch": 0.344976853243552, "flos": 40187882465280.0, "grad_norm": 4.913736965220831, "language_loss": 0.61949658, "learning_rate": 3.046623900568914e-06, "loss": 0.64154112, "num_input_tokens_seen": 61688240, "step": 2869, "time_per_iteration": 2.8081464767456055 }, { "auxiliary_loss_clip": 0.01139233, "auxiliary_loss_mlp": 0.01049914, "balance_loss_clip": 1.05154061, "balance_loss_mlp": 1.02739573, "epoch": 0.34509709613419104, "flos": 28723652127360.0, "grad_norm": 2.3069541038944883, "language_loss": 0.70484197, "learning_rate": 3.045960027375465e-06, "loss": 0.72673345, "num_input_tokens_seen": 61706075, "step": 2870, "time_per_iteration": 3.6722633838653564 }, { "auxiliary_loss_clip": 0.01160233, "auxiliary_loss_mlp": 0.01052017, "balance_loss_clip": 1.0532887, "balance_loss_mlp": 1.02947497, "epoch": 0.34521733902483015, "flos": 29967597982080.0, "grad_norm": 2.698936125806755, "language_loss": 0.83002567, "learning_rate": 3.045295995510711e-06, "loss": 0.85214823, "num_input_tokens_seen": 61723045, "step": 2871, "time_per_iteration": 2.6597280502319336 }, { "auxiliary_loss_clip": 0.01134781, "auxiliary_loss_mlp": 0.01044837, "balance_loss_clip": 1.05208302, "balance_loss_mlp": 1.02625275, "epoch": 0.34533758191546926, "flos": 27923059843200.0, "grad_norm": 1.9472646153847157, "language_loss": 0.73927397, "learning_rate": 3.0446318050753865e-06, "loss": 0.76107013, "num_input_tokens_seen": 61743525, "step": 2872, "time_per_iteration": 2.691751718521118 }, { "auxiliary_loss_clip": 0.01147211, "auxiliary_loss_mlp": 0.01062671, "balance_loss_clip": 1.05166888, "balance_loss_mlp": 1.04115367, "epoch": 0.3454578248061083, "flos": 27125879351040.0, "grad_norm": 2.153625020900668, "language_loss": 0.77779782, "learning_rate": 3.0439674561702474e-06, "loss": 0.79989666, "num_input_tokens_seen": 61763025, "step": 2873, "time_per_iteration": 3.584350347518921 }, { "auxiliary_loss_clip": 0.01150437, "auxiliary_loss_mlp": 0.01048696, "balance_loss_clip": 1.05376077, "balance_loss_mlp": 1.02866888, "epoch": 0.3455780676967474, "flos": 19024899166080.0, "grad_norm": 2.527704503715098, "language_loss": 0.87662196, "learning_rate": 3.043302948896076e-06, "loss": 0.89861333, "num_input_tokens_seen": 61781630, "step": 2874, "time_per_iteration": 3.6746201515197754 }, { "auxiliary_loss_clip": 0.01105309, "auxiliary_loss_mlp": 0.01049477, "balance_loss_clip": 1.04663026, "balance_loss_mlp": 1.02788877, "epoch": 0.34569831058738654, "flos": 34496005507200.0, "grad_norm": 2.3893590274760497, "language_loss": 0.60681605, "learning_rate": 3.0426382833536756e-06, "loss": 0.62836397, "num_input_tokens_seen": 61804985, "step": 2875, "time_per_iteration": 2.8210132122039795 }, { "auxiliary_loss_clip": 0.01120699, "auxiliary_loss_mlp": 0.01056312, "balance_loss_clip": 1.04632664, "balance_loss_mlp": 1.03417516, "epoch": 0.3458185534780256, "flos": 31138681098240.0, "grad_norm": 2.6981036410444355, "language_loss": 0.7790972, "learning_rate": 3.041973459643877e-06, "loss": 0.80086726, "num_input_tokens_seen": 61824440, "step": 2876, "time_per_iteration": 2.7374675273895264 }, { "auxiliary_loss_clip": 0.01105713, "auxiliary_loss_mlp": 0.01053426, "balance_loss_clip": 1.04591548, "balance_loss_mlp": 1.03199244, "epoch": 0.3459387963686647, "flos": 32452508862720.0, "grad_norm": 3.0678127390491787, "language_loss": 0.6680907, "learning_rate": 3.0413084778675334e-06, "loss": 0.68968213, "num_input_tokens_seen": 61845690, "step": 2877, "time_per_iteration": 2.8164288997650146 }, { "auxiliary_loss_clip": 0.0113376, "auxiliary_loss_mlp": 0.00776049, "balance_loss_clip": 1.05013275, "balance_loss_mlp": 1.00021267, "epoch": 0.3460590392593038, "flos": 24675658030080.0, "grad_norm": 1.8752052734366547, "language_loss": 0.8392781, "learning_rate": 3.0406433381255214e-06, "loss": 0.85837615, "num_input_tokens_seen": 61863725, "step": 2878, "time_per_iteration": 2.6915831565856934 }, { "auxiliary_loss_clip": 0.01154171, "auxiliary_loss_mlp": 0.01045401, "balance_loss_clip": 1.05640709, "balance_loss_mlp": 1.02555263, "epoch": 0.34617928214994287, "flos": 18807316531200.0, "grad_norm": 6.004334654425356, "language_loss": 0.8224647, "learning_rate": 3.0399780405187425e-06, "loss": 0.84446037, "num_input_tokens_seen": 61882720, "step": 2879, "time_per_iteration": 2.611752986907959 }, { "auxiliary_loss_clip": 0.01147753, "auxiliary_loss_mlp": 0.0105313, "balance_loss_clip": 1.05132473, "balance_loss_mlp": 1.03267407, "epoch": 0.346299525040582, "flos": 24857653265280.0, "grad_norm": 2.0814135653898833, "language_loss": 0.78381312, "learning_rate": 3.0393125851481216e-06, "loss": 0.80582196, "num_input_tokens_seen": 61902595, "step": 2880, "time_per_iteration": 3.61816668510437 }, { "auxiliary_loss_clip": 0.01122947, "auxiliary_loss_mlp": 0.01052436, "balance_loss_clip": 1.05057073, "balance_loss_mlp": 1.02969074, "epoch": 0.3464197679312211, "flos": 16434914025600.0, "grad_norm": 5.901522658550006, "language_loss": 0.86372662, "learning_rate": 3.038646972114608e-06, "loss": 0.8854804, "num_input_tokens_seen": 61918920, "step": 2881, "time_per_iteration": 2.6348774433135986 }, { "auxiliary_loss_clip": 0.01123417, "auxiliary_loss_mlp": 0.01048389, "balance_loss_clip": 1.05023742, "balance_loss_mlp": 1.02740836, "epoch": 0.34654001082186014, "flos": 22382474970240.0, "grad_norm": 2.1679077428755114, "language_loss": 0.67442006, "learning_rate": 3.037981201519174e-06, "loss": 0.69613814, "num_input_tokens_seen": 61939520, "step": 2882, "time_per_iteration": 2.728142261505127 }, { "auxiliary_loss_clip": 0.01156279, "auxiliary_loss_mlp": 0.01048478, "balance_loss_clip": 1.05882645, "balance_loss_mlp": 1.02821231, "epoch": 0.34666025371249926, "flos": 19573901614080.0, "grad_norm": 2.297582288295924, "language_loss": 0.71626866, "learning_rate": 3.0373152734628175e-06, "loss": 0.7383163, "num_input_tokens_seen": 61957800, "step": 2883, "time_per_iteration": 2.610135316848755 }, { "auxiliary_loss_clip": 0.01148984, "auxiliary_loss_mlp": 0.01053267, "balance_loss_clip": 1.05174243, "balance_loss_mlp": 1.03203547, "epoch": 0.34678049660313837, "flos": 15267637751040.0, "grad_norm": 1.8802299978222436, "language_loss": 0.76203203, "learning_rate": 3.0366491880465584e-06, "loss": 0.78405452, "num_input_tokens_seen": 61975820, "step": 2884, "time_per_iteration": 2.600931167602539 }, { "auxiliary_loss_clip": 0.01170076, "auxiliary_loss_mlp": 0.01062269, "balance_loss_clip": 1.05548811, "balance_loss_mlp": 1.0418005, "epoch": 0.3469007394937774, "flos": 21181550630400.0, "grad_norm": 1.8492800438509025, "language_loss": 0.82086408, "learning_rate": 3.035982945371443e-06, "loss": 0.84318757, "num_input_tokens_seen": 61997515, "step": 2885, "time_per_iteration": 2.6006550788879395 }, { "auxiliary_loss_clip": 0.0114734, "auxiliary_loss_mlp": 0.01052426, "balance_loss_clip": 1.05385566, "balance_loss_mlp": 1.03388858, "epoch": 0.34702098238441653, "flos": 22375471818240.0, "grad_norm": 6.141562374074781, "language_loss": 0.85491443, "learning_rate": 3.035316545538537e-06, "loss": 0.87691212, "num_input_tokens_seen": 62016310, "step": 2886, "time_per_iteration": 2.669180393218994 }, { "auxiliary_loss_clip": 0.01142054, "auxiliary_loss_mlp": 0.01045403, "balance_loss_clip": 1.05837321, "balance_loss_mlp": 1.02674651, "epoch": 0.3471412252750556, "flos": 22929430343040.0, "grad_norm": 2.010671782525189, "language_loss": 0.79323584, "learning_rate": 3.034649988648935e-06, "loss": 0.81511039, "num_input_tokens_seen": 62036075, "step": 2887, "time_per_iteration": 2.7093207836151123 }, { "auxiliary_loss_clip": 0.01143055, "auxiliary_loss_mlp": 0.01051946, "balance_loss_clip": 1.05285835, "balance_loss_mlp": 1.03045297, "epoch": 0.3472614681656947, "flos": 21324259365120.0, "grad_norm": 1.7041220283520055, "language_loss": 0.80419123, "learning_rate": 3.033983274803752e-06, "loss": 0.82614124, "num_input_tokens_seen": 62055865, "step": 2888, "time_per_iteration": 2.677701234817505 }, { "auxiliary_loss_clip": 0.01135699, "auxiliary_loss_mlp": 0.01051483, "balance_loss_clip": 1.05176759, "balance_loss_mlp": 1.02945352, "epoch": 0.3473817110563338, "flos": 23475739271040.0, "grad_norm": 8.026684437715158, "language_loss": 0.72555196, "learning_rate": 3.0333164041041283e-06, "loss": 0.74742383, "num_input_tokens_seen": 62072180, "step": 2889, "time_per_iteration": 2.6689231395721436 }, { "auxiliary_loss_clip": 0.01107999, "auxiliary_loss_mlp": 0.01055702, "balance_loss_clip": 1.05065584, "balance_loss_mlp": 1.03537655, "epoch": 0.34750195394697286, "flos": 22346025644160.0, "grad_norm": 1.8610106008734213, "language_loss": 0.71770215, "learning_rate": 3.032649376651228e-06, "loss": 0.73933917, "num_input_tokens_seen": 62091600, "step": 2890, "time_per_iteration": 2.750581741333008 }, { "auxiliary_loss_clip": 0.01131881, "auxiliary_loss_mlp": 0.01045149, "balance_loss_clip": 1.050179, "balance_loss_mlp": 1.02457356, "epoch": 0.347622196837612, "flos": 29095004885760.0, "grad_norm": 2.2996160528160687, "language_loss": 0.75767201, "learning_rate": 3.031982192546238e-06, "loss": 0.77944231, "num_input_tokens_seen": 62114695, "step": 2891, "time_per_iteration": 2.7392265796661377 }, { "auxiliary_loss_clip": 0.01156182, "auxiliary_loss_mlp": 0.01047194, "balance_loss_clip": 1.05455184, "balance_loss_mlp": 1.02734566, "epoch": 0.3477424397282511, "flos": 22455732758400.0, "grad_norm": 2.5906891901573257, "language_loss": 0.94599456, "learning_rate": 3.0313148518903696e-06, "loss": 0.96802837, "num_input_tokens_seen": 62134520, "step": 2892, "time_per_iteration": 2.603435754776001 }, { "auxiliary_loss_clip": 0.01142081, "auxiliary_loss_mlp": 0.010501, "balance_loss_clip": 1.05589867, "balance_loss_mlp": 1.03046584, "epoch": 0.34786268261889014, "flos": 15778790242560.0, "grad_norm": 2.123295721155425, "language_loss": 0.81305134, "learning_rate": 3.030647354784859e-06, "loss": 0.8349731, "num_input_tokens_seen": 62151560, "step": 2893, "time_per_iteration": 2.606149673461914 }, { "auxiliary_loss_clip": 0.01125476, "auxiliary_loss_mlp": 0.01046061, "balance_loss_clip": 1.04952884, "balance_loss_mlp": 1.02590251, "epoch": 0.34798292550952925, "flos": 20777627214720.0, "grad_norm": 1.93270734480392, "language_loss": 0.7706393, "learning_rate": 3.029979701330964e-06, "loss": 0.7923547, "num_input_tokens_seen": 62170985, "step": 2894, "time_per_iteration": 2.6950888633728027 }, { "auxiliary_loss_clip": 0.01148621, "auxiliary_loss_mlp": 0.01052115, "balance_loss_clip": 1.05640078, "balance_loss_mlp": 1.03188539, "epoch": 0.34810316840016836, "flos": 19937820257280.0, "grad_norm": 2.42369573465443, "language_loss": 0.80265933, "learning_rate": 3.029311891629966e-06, "loss": 0.82466674, "num_input_tokens_seen": 62189440, "step": 2895, "time_per_iteration": 2.640444755554199 }, { "auxiliary_loss_clip": 0.0113751, "auxiliary_loss_mlp": 0.01043611, "balance_loss_clip": 1.05261195, "balance_loss_mlp": 1.02470446, "epoch": 0.3482234112908074, "flos": 23623296341760.0, "grad_norm": 2.227860885115362, "language_loss": 0.74824321, "learning_rate": 3.0286439257831744e-06, "loss": 0.77005446, "num_input_tokens_seen": 62208910, "step": 2896, "time_per_iteration": 3.6873369216918945 }, { "auxiliary_loss_clip": 0.01180333, "auxiliary_loss_mlp": 0.01060344, "balance_loss_clip": 1.05955195, "balance_loss_mlp": 1.03840995, "epoch": 0.3483436541814465, "flos": 23986712194560.0, "grad_norm": 1.8914326358886617, "language_loss": 0.7138595, "learning_rate": 3.0279758038919156e-06, "loss": 0.73626626, "num_input_tokens_seen": 62227135, "step": 2897, "time_per_iteration": 2.608701467514038 }, { "auxiliary_loss_clip": 0.01157483, "auxiliary_loss_mlp": 0.01063693, "balance_loss_clip": 1.05626535, "balance_loss_mlp": 1.04336786, "epoch": 0.34846389707208564, "flos": 22638338524800.0, "grad_norm": 2.2181238661398184, "language_loss": 0.78255105, "learning_rate": 3.0273075260575455e-06, "loss": 0.80476284, "num_input_tokens_seen": 62246035, "step": 2898, "time_per_iteration": 2.6256179809570312 }, { "auxiliary_loss_clip": 0.01145018, "auxiliary_loss_mlp": 0.01047682, "balance_loss_clip": 1.05321336, "balance_loss_mlp": 1.0264504, "epoch": 0.3485841399627247, "flos": 21792857218560.0, "grad_norm": 3.0897361683989417, "language_loss": 0.80558658, "learning_rate": 3.0266390923814396e-06, "loss": 0.82751358, "num_input_tokens_seen": 62264095, "step": 2899, "time_per_iteration": 3.5871973037719727 }, { "auxiliary_loss_clip": 0.01153223, "auxiliary_loss_mlp": 0.01056749, "balance_loss_clip": 1.06223512, "balance_loss_mlp": 1.03653109, "epoch": 0.3487043828533638, "flos": 17019036996480.0, "grad_norm": 1.961370230084132, "language_loss": 0.81829756, "learning_rate": 3.0259705029650008e-06, "loss": 0.8403973, "num_input_tokens_seen": 62282025, "step": 2900, "time_per_iteration": 3.598101854324341 }, { "auxiliary_loss_clip": 0.01156822, "auxiliary_loss_mlp": 0.01049736, "balance_loss_clip": 1.05456877, "balance_loss_mlp": 1.03091323, "epoch": 0.34882462574400286, "flos": 22601135013120.0, "grad_norm": 1.7491376675040704, "language_loss": 0.73244965, "learning_rate": 3.025301757909652e-06, "loss": 0.75451529, "num_input_tokens_seen": 62302220, "step": 2901, "time_per_iteration": 2.6490695476531982 }, { "auxiliary_loss_clip": 0.01132256, "auxiliary_loss_mlp": 0.00776383, "balance_loss_clip": 1.0533694, "balance_loss_mlp": 1.00019765, "epoch": 0.34894486863464197, "flos": 29861518141440.0, "grad_norm": 1.5608563646264617, "language_loss": 0.80491042, "learning_rate": 3.024632857316842e-06, "loss": 0.82399684, "num_input_tokens_seen": 62323535, "step": 2902, "time_per_iteration": 2.711099863052368 }, { "auxiliary_loss_clip": 0.01158163, "auxiliary_loss_mlp": 0.01046486, "balance_loss_clip": 1.05542874, "balance_loss_mlp": 1.0266614, "epoch": 0.3490651115252811, "flos": 22122265870080.0, "grad_norm": 1.95035732770343, "language_loss": 0.7759549, "learning_rate": 3.0239638012880412e-06, "loss": 0.79800135, "num_input_tokens_seen": 62343430, "step": 2903, "time_per_iteration": 2.612111806869507 }, { "auxiliary_loss_clip": 0.01112221, "auxiliary_loss_mlp": 0.01053365, "balance_loss_clip": 1.04778194, "balance_loss_mlp": 1.03145468, "epoch": 0.34918535441592014, "flos": 12676682943360.0, "grad_norm": 2.719479508677994, "language_loss": 0.81351829, "learning_rate": 3.0232945899247466e-06, "loss": 0.8351742, "num_input_tokens_seen": 62360365, "step": 2904, "time_per_iteration": 2.669416666030884 }, { "auxiliary_loss_clip": 0.01155647, "auxiliary_loss_mlp": 0.01053482, "balance_loss_clip": 1.05261028, "balance_loss_mlp": 1.03242993, "epoch": 0.34930559730655925, "flos": 23185617120000.0, "grad_norm": 2.2198908220193414, "language_loss": 0.77594024, "learning_rate": 3.022625223328476e-06, "loss": 0.79803151, "num_input_tokens_seen": 62382105, "step": 2905, "time_per_iteration": 2.6392130851745605 }, { "auxiliary_loss_clip": 0.01163965, "auxiliary_loss_mlp": 0.01053925, "balance_loss_clip": 1.05701065, "balance_loss_mlp": 1.03255093, "epoch": 0.34942584019719836, "flos": 22855023319680.0, "grad_norm": 1.6830111829232348, "language_loss": 0.68976402, "learning_rate": 3.0219557016007723e-06, "loss": 0.71194297, "num_input_tokens_seen": 62402235, "step": 2906, "time_per_iteration": 3.5733375549316406 }, { "auxiliary_loss_clip": 0.01147243, "auxiliary_loss_mlp": 0.0105852, "balance_loss_clip": 1.05331564, "balance_loss_mlp": 1.03836191, "epoch": 0.3495460830878374, "flos": 24426043441920.0, "grad_norm": 2.9950452807622283, "language_loss": 0.7005868, "learning_rate": 3.021286024843202e-06, "loss": 0.72264445, "num_input_tokens_seen": 62420430, "step": 2907, "time_per_iteration": 2.6543824672698975 }, { "auxiliary_loss_clip": 0.01070598, "auxiliary_loss_mlp": 0.0100589, "balance_loss_clip": 1.0312674, "balance_loss_mlp": 1.00212276, "epoch": 0.3496663259784765, "flos": 70008749389440.0, "grad_norm": 1.064822825114661, "language_loss": 0.64719826, "learning_rate": 3.0206161931573526e-06, "loss": 0.66796309, "num_input_tokens_seen": 62472980, "step": 2908, "time_per_iteration": 3.0999932289123535 }, { "auxiliary_loss_clip": 0.01132906, "auxiliary_loss_mlp": 0.01044335, "balance_loss_clip": 1.04781556, "balance_loss_mlp": 1.02507126, "epoch": 0.34978656886911563, "flos": 28692805322880.0, "grad_norm": 1.6915334195451261, "language_loss": 0.92749417, "learning_rate": 3.0199462066448388e-06, "loss": 0.94926661, "num_input_tokens_seen": 62495175, "step": 2909, "time_per_iteration": 2.695802688598633 }, { "auxiliary_loss_clip": 0.01159978, "auxiliary_loss_mlp": 0.01050304, "balance_loss_clip": 1.05691671, "balance_loss_mlp": 1.0287745, "epoch": 0.3499068117597547, "flos": 21142156389120.0, "grad_norm": 1.7803282991436338, "language_loss": 0.69430029, "learning_rate": 3.019276065407296e-06, "loss": 0.71640313, "num_input_tokens_seen": 62514295, "step": 2910, "time_per_iteration": 2.5670905113220215 }, { "auxiliary_loss_clip": 0.01116992, "auxiliary_loss_mlp": 0.0105618, "balance_loss_clip": 1.0474894, "balance_loss_mlp": 1.03478158, "epoch": 0.3500270546503938, "flos": 22782699285120.0, "grad_norm": 2.031233805522864, "language_loss": 0.80333239, "learning_rate": 3.018605769546385e-06, "loss": 0.82506412, "num_input_tokens_seen": 62534850, "step": 2911, "time_per_iteration": 2.692523717880249 }, { "auxiliary_loss_clip": 0.01152302, "auxiliary_loss_mlp": 0.0104692, "balance_loss_clip": 1.05101442, "balance_loss_mlp": 1.02658319, "epoch": 0.3501472975410329, "flos": 22894058424960.0, "grad_norm": 3.2214562907018482, "language_loss": 0.79396057, "learning_rate": 3.017935319163788e-06, "loss": 0.81595278, "num_input_tokens_seen": 62553810, "step": 2912, "time_per_iteration": 2.6386423110961914 }, { "auxiliary_loss_clip": 0.01158104, "auxiliary_loss_mlp": 0.01055388, "balance_loss_clip": 1.05358577, "balance_loss_mlp": 1.03319073, "epoch": 0.35026754043167196, "flos": 25446588658560.0, "grad_norm": 1.8101059532753592, "language_loss": 0.70533776, "learning_rate": 3.017264714361213e-06, "loss": 0.72747266, "num_input_tokens_seen": 62573460, "step": 2913, "time_per_iteration": 2.6178293228149414 }, { "auxiliary_loss_clip": 0.01138575, "auxiliary_loss_mlp": 0.00776578, "balance_loss_clip": 1.05203581, "balance_loss_mlp": 1.00028753, "epoch": 0.3503877833223111, "flos": 19573757959680.0, "grad_norm": 3.176973579291576, "language_loss": 0.82237244, "learning_rate": 3.016593955240389e-06, "loss": 0.841524, "num_input_tokens_seen": 62592150, "step": 2914, "time_per_iteration": 2.6512043476104736 }, { "auxiliary_loss_clip": 0.0105501, "auxiliary_loss_mlp": 0.01002735, "balance_loss_clip": 1.02666247, "balance_loss_mlp": 0.99956435, "epoch": 0.3505080262129502, "flos": 65072075880960.0, "grad_norm": 0.8416674448733638, "language_loss": 0.6376611, "learning_rate": 3.015923041903071e-06, "loss": 0.65823853, "num_input_tokens_seen": 62658275, "step": 2915, "time_per_iteration": 3.262662172317505 }, { "auxiliary_loss_clip": 0.01154504, "auxiliary_loss_mlp": 0.01056014, "balance_loss_clip": 1.05443358, "balance_loss_mlp": 1.03593874, "epoch": 0.35062826910358924, "flos": 29314562768640.0, "grad_norm": 2.195902148717065, "language_loss": 0.8375327, "learning_rate": 3.0152519744510347e-06, "loss": 0.85963786, "num_input_tokens_seen": 62678075, "step": 2916, "time_per_iteration": 2.6644999980926514 }, { "auxiliary_loss_clip": 0.01125722, "auxiliary_loss_mlp": 0.01048524, "balance_loss_clip": 1.04722989, "balance_loss_mlp": 1.02855694, "epoch": 0.35074851199422835, "flos": 23987717775360.0, "grad_norm": 2.138928702965886, "language_loss": 0.82472563, "learning_rate": 3.014580752986081e-06, "loss": 0.84646809, "num_input_tokens_seen": 62696950, "step": 2917, "time_per_iteration": 2.688502073287964 }, { "auxiliary_loss_clip": 0.011164, "auxiliary_loss_mlp": 0.01053303, "balance_loss_clip": 1.04867435, "balance_loss_mlp": 1.0327394, "epoch": 0.3508687548848674, "flos": 15224436668160.0, "grad_norm": 2.235468718713301, "language_loss": 0.78506643, "learning_rate": 3.0139093776100345e-06, "loss": 0.80676341, "num_input_tokens_seen": 62713540, "step": 2918, "time_per_iteration": 2.64717435836792 }, { "auxiliary_loss_clip": 0.01163667, "auxiliary_loss_mlp": 0.0105533, "balance_loss_clip": 1.05362177, "balance_loss_mlp": 1.03650689, "epoch": 0.3509889977755065, "flos": 21361750185600.0, "grad_norm": 1.867400167119909, "language_loss": 0.75676394, "learning_rate": 3.013237848424741e-06, "loss": 0.77895391, "num_input_tokens_seen": 62732925, "step": 2919, "time_per_iteration": 2.5868992805480957 }, { "auxiliary_loss_clip": 0.01145618, "auxiliary_loss_mlp": 0.01044334, "balance_loss_clip": 1.05530453, "balance_loss_mlp": 1.02445042, "epoch": 0.35110924066614563, "flos": 19135360465920.0, "grad_norm": 2.8356702540882406, "language_loss": 0.75115728, "learning_rate": 3.012566165532072e-06, "loss": 0.77305686, "num_input_tokens_seen": 62751715, "step": 2920, "time_per_iteration": 2.654865264892578 }, { "auxiliary_loss_clip": 0.01103576, "auxiliary_loss_mlp": 0.01056402, "balance_loss_clip": 1.04768991, "balance_loss_mlp": 1.03580272, "epoch": 0.3512294835567847, "flos": 21980885938560.0, "grad_norm": 2.4548336475937003, "language_loss": 0.76657474, "learning_rate": 3.0118943290339207e-06, "loss": 0.78817463, "num_input_tokens_seen": 62771925, "step": 2921, "time_per_iteration": 2.7267701625823975 }, { "auxiliary_loss_clip": 0.01116555, "auxiliary_loss_mlp": 0.01052286, "balance_loss_clip": 1.04483795, "balance_loss_mlp": 1.0311029, "epoch": 0.3513497264474238, "flos": 17817294896640.0, "grad_norm": 3.5036247327248216, "language_loss": 0.68519306, "learning_rate": 3.011222339032204e-06, "loss": 0.70688152, "num_input_tokens_seen": 62790075, "step": 2922, "time_per_iteration": 3.6490793228149414 }, { "auxiliary_loss_clip": 0.01165625, "auxiliary_loss_mlp": 0.01060199, "balance_loss_clip": 1.05559468, "balance_loss_mlp": 1.03956389, "epoch": 0.3514699693380629, "flos": 26943417239040.0, "grad_norm": 3.059333423043453, "language_loss": 0.68925464, "learning_rate": 3.0105501956288626e-06, "loss": 0.71151292, "num_input_tokens_seen": 62810545, "step": 2923, "time_per_iteration": 2.6490302085876465 }, { "auxiliary_loss_clip": 0.01158264, "auxiliary_loss_mlp": 0.01053778, "balance_loss_clip": 1.05179787, "balance_loss_mlp": 1.03241563, "epoch": 0.35159021222870196, "flos": 15267565923840.0, "grad_norm": 2.2844301024406684, "language_loss": 0.72726977, "learning_rate": 3.0098778989258602e-06, "loss": 0.74939018, "num_input_tokens_seen": 62829155, "step": 2924, "time_per_iteration": 2.5888447761535645 }, { "auxiliary_loss_clip": 0.01126558, "auxiliary_loss_mlp": 0.01065695, "balance_loss_clip": 1.05291665, "balance_loss_mlp": 1.04219866, "epoch": 0.35171045511934107, "flos": 13984154000640.0, "grad_norm": 2.017794866233196, "language_loss": 0.88087368, "learning_rate": 3.009205449025183e-06, "loss": 0.90279615, "num_input_tokens_seen": 62845350, "step": 2925, "time_per_iteration": 3.620793342590332 }, { "auxiliary_loss_clip": 0.01115157, "auxiliary_loss_mlp": 0.01063162, "balance_loss_clip": 1.0447855, "balance_loss_mlp": 1.04110885, "epoch": 0.3518306980099802, "flos": 14283434119680.0, "grad_norm": 1.85450558095865, "language_loss": 0.62841249, "learning_rate": 3.008532846028842e-06, "loss": 0.65019572, "num_input_tokens_seen": 62862110, "step": 2926, "time_per_iteration": 2.6361215114593506 }, { "auxiliary_loss_clip": 0.01172734, "auxiliary_loss_mlp": 0.01050715, "balance_loss_clip": 1.0573585, "balance_loss_mlp": 1.02892375, "epoch": 0.35195094090061924, "flos": 27052872958080.0, "grad_norm": 2.792146996300992, "language_loss": 0.72415888, "learning_rate": 3.0078600900388694e-06, "loss": 0.74639344, "num_input_tokens_seen": 62882415, "step": 2927, "time_per_iteration": 3.6173248291015625 }, { "auxiliary_loss_clip": 0.01114781, "auxiliary_loss_mlp": 0.01053088, "balance_loss_clip": 1.04552412, "balance_loss_mlp": 1.03284585, "epoch": 0.35207118379125835, "flos": 25629266252160.0, "grad_norm": 2.0876432191182492, "language_loss": 0.74096435, "learning_rate": 3.007187181157323e-06, "loss": 0.7626431, "num_input_tokens_seen": 62902425, "step": 2928, "time_per_iteration": 2.6971724033355713 }, { "auxiliary_loss_clip": 0.0108887, "auxiliary_loss_mlp": 0.01052967, "balance_loss_clip": 1.04173005, "balance_loss_mlp": 1.0328207, "epoch": 0.35219142668189746, "flos": 18004713085440.0, "grad_norm": 3.5498045547937354, "language_loss": 0.68345386, "learning_rate": 3.006514119486282e-06, "loss": 0.70487225, "num_input_tokens_seen": 62919255, "step": 2929, "time_per_iteration": 2.7040796279907227 }, { "auxiliary_loss_clip": 0.01120337, "auxiliary_loss_mlp": 0.01054139, "balance_loss_clip": 1.048033, "balance_loss_mlp": 1.03218091, "epoch": 0.3523116695725365, "flos": 14028109269120.0, "grad_norm": 1.8198334891216281, "language_loss": 0.70306599, "learning_rate": 3.005840905127849e-06, "loss": 0.72481072, "num_input_tokens_seen": 62936160, "step": 2930, "time_per_iteration": 2.6962668895721436 }, { "auxiliary_loss_clip": 0.01165707, "auxiliary_loss_mlp": 0.01049863, "balance_loss_clip": 1.05792832, "balance_loss_mlp": 1.02946675, "epoch": 0.3524319124631756, "flos": 21433966479360.0, "grad_norm": 2.561149096172719, "language_loss": 0.86885607, "learning_rate": 3.0051675381841516e-06, "loss": 0.89101183, "num_input_tokens_seen": 62953470, "step": 2931, "time_per_iteration": 2.628161907196045 }, { "auxiliary_loss_clip": 0.01085269, "auxiliary_loss_mlp": 0.00777647, "balance_loss_clip": 1.04383755, "balance_loss_mlp": 1.00028992, "epoch": 0.3525521553538147, "flos": 26322773114880.0, "grad_norm": 1.8182468284025661, "language_loss": 0.76745808, "learning_rate": 3.0044940187573363e-06, "loss": 0.78608721, "num_input_tokens_seen": 62974480, "step": 2932, "time_per_iteration": 3.719459295272827 }, { "auxiliary_loss_clip": 0.01153943, "auxiliary_loss_mlp": 0.01059486, "balance_loss_clip": 1.05264568, "balance_loss_mlp": 1.03994703, "epoch": 0.3526723982444538, "flos": 21543314457600.0, "grad_norm": 2.052160916743595, "language_loss": 0.65033746, "learning_rate": 3.003820346949578e-06, "loss": 0.67247176, "num_input_tokens_seen": 62992560, "step": 2933, "time_per_iteration": 2.603430986404419 }, { "auxiliary_loss_clip": 0.01167434, "auxiliary_loss_mlp": 0.01055045, "balance_loss_clip": 1.05342603, "balance_loss_mlp": 1.03370643, "epoch": 0.3527926411350929, "flos": 23733649900800.0, "grad_norm": 2.062324495003538, "language_loss": 0.79378831, "learning_rate": 3.003146522863071e-06, "loss": 0.8160131, "num_input_tokens_seen": 63013445, "step": 2934, "time_per_iteration": 2.6170694828033447 }, { "auxiliary_loss_clip": 0.01141277, "auxiliary_loss_mlp": 0.01055689, "balance_loss_clip": 1.05313766, "balance_loss_mlp": 1.03494692, "epoch": 0.35291288402573195, "flos": 30445461544320.0, "grad_norm": 2.795789209154241, "language_loss": 0.85808247, "learning_rate": 3.0024725466000345e-06, "loss": 0.88005215, "num_input_tokens_seen": 63033400, "step": 2935, "time_per_iteration": 2.705249309539795 }, { "auxiliary_loss_clip": 0.01159465, "auxiliary_loss_mlp": 0.01051546, "balance_loss_clip": 1.05903614, "balance_loss_mlp": 1.03166246, "epoch": 0.35303312691637107, "flos": 23112179763840.0, "grad_norm": 1.7537232136356586, "language_loss": 0.78930724, "learning_rate": 3.0017984182627087e-06, "loss": 0.81141734, "num_input_tokens_seen": 63052725, "step": 2936, "time_per_iteration": 2.662428855895996 }, { "auxiliary_loss_clip": 0.01125748, "auxiliary_loss_mlp": 0.00776095, "balance_loss_clip": 1.04785049, "balance_loss_mlp": 1.00026608, "epoch": 0.3531533698070102, "flos": 21835699165440.0, "grad_norm": 1.927821619059833, "language_loss": 0.82296431, "learning_rate": 3.00112413795336e-06, "loss": 0.84198272, "num_input_tokens_seen": 63072560, "step": 2937, "time_per_iteration": 2.725297451019287 }, { "auxiliary_loss_clip": 0.01138368, "auxiliary_loss_mlp": 0.01056965, "balance_loss_clip": 1.04990327, "balance_loss_mlp": 1.0357455, "epoch": 0.35327361269764923, "flos": 15778969810560.0, "grad_norm": 2.485865002183311, "language_loss": 0.7996527, "learning_rate": 3.000449705774275e-06, "loss": 0.82160604, "num_input_tokens_seen": 63090800, "step": 2938, "time_per_iteration": 2.6412997245788574 }, { "auxiliary_loss_clip": 0.01155522, "auxiliary_loss_mlp": 0.01052277, "balance_loss_clip": 1.05504084, "balance_loss_mlp": 1.03247583, "epoch": 0.35339385558828834, "flos": 22090413484800.0, "grad_norm": 2.341786940415543, "language_loss": 0.71818876, "learning_rate": 2.9997751218277654e-06, "loss": 0.74026674, "num_input_tokens_seen": 63108955, "step": 2939, "time_per_iteration": 2.770033597946167 }, { "auxiliary_loss_clip": 0.01170958, "auxiliary_loss_mlp": 0.01056716, "balance_loss_clip": 1.05930781, "balance_loss_mlp": 1.03593826, "epoch": 0.35351409847892745, "flos": 24165008328960.0, "grad_norm": 2.166413787555567, "language_loss": 0.78061283, "learning_rate": 2.999100386216166e-06, "loss": 0.80288959, "num_input_tokens_seen": 63127895, "step": 2940, "time_per_iteration": 2.5832042694091797 }, { "auxiliary_loss_clip": 0.01144172, "auxiliary_loss_mlp": 0.01048631, "balance_loss_clip": 1.0537051, "balance_loss_mlp": 1.02825856, "epoch": 0.3536343413695665, "flos": 27052298340480.0, "grad_norm": 5.523879050461161, "language_loss": 0.74493957, "learning_rate": 2.998425499041831e-06, "loss": 0.76686758, "num_input_tokens_seen": 63148410, "step": 2941, "time_per_iteration": 2.7268617153167725 }, { "auxiliary_loss_clip": 0.0105728, "auxiliary_loss_mlp": 0.01012737, "balance_loss_clip": 1.02939415, "balance_loss_mlp": 1.0098995, "epoch": 0.3537545842602056, "flos": 65991066370560.0, "grad_norm": 1.270259026531448, "language_loss": 0.64560282, "learning_rate": 2.997750460407142e-06, "loss": 0.66630292, "num_input_tokens_seen": 63209765, "step": 2942, "time_per_iteration": 3.264694929122925 }, { "auxiliary_loss_clip": 0.01135549, "auxiliary_loss_mlp": 0.01048618, "balance_loss_clip": 1.04944539, "balance_loss_mlp": 1.02575326, "epoch": 0.35387482715084473, "flos": 18436897526400.0, "grad_norm": 3.74452812997859, "language_loss": 0.7020697, "learning_rate": 2.997075270414501e-06, "loss": 0.72391135, "num_input_tokens_seen": 63226980, "step": 2943, "time_per_iteration": 2.653993844985962 }, { "auxiliary_loss_clip": 0.01042335, "auxiliary_loss_mlp": 0.01006155, "balance_loss_clip": 1.0241189, "balance_loss_mlp": 1.00349665, "epoch": 0.3539950700414838, "flos": 65588579498880.0, "grad_norm": 0.7126058902719579, "language_loss": 0.57732737, "learning_rate": 2.9963999291663347e-06, "loss": 0.59781229, "num_input_tokens_seen": 63292760, "step": 2944, "time_per_iteration": 3.3216211795806885 }, { "auxiliary_loss_clip": 0.01123652, "auxiliary_loss_mlp": 0.01053187, "balance_loss_clip": 1.05352902, "balance_loss_mlp": 1.03144324, "epoch": 0.3541153129321229, "flos": 20521655919360.0, "grad_norm": 3.5039416711978864, "language_loss": 0.7412045, "learning_rate": 2.9957244367650915e-06, "loss": 0.76297289, "num_input_tokens_seen": 63309005, "step": 2945, "time_per_iteration": 2.70156192779541 }, { "auxiliary_loss_clip": 0.01107541, "auxiliary_loss_mlp": 0.01057343, "balance_loss_clip": 1.05067933, "balance_loss_mlp": 1.03558731, "epoch": 0.354235555822762, "flos": 19573578391680.0, "grad_norm": 2.143242738156709, "language_loss": 0.8405298, "learning_rate": 2.9950487933132425e-06, "loss": 0.86217856, "num_input_tokens_seen": 63326420, "step": 2946, "time_per_iteration": 2.722440719604492 }, { "auxiliary_loss_clip": 0.0116157, "auxiliary_loss_mlp": 0.01055221, "balance_loss_clip": 1.05564392, "balance_loss_mlp": 1.03199863, "epoch": 0.35435579871340106, "flos": 20777268078720.0, "grad_norm": 2.1235453394342767, "language_loss": 0.7110191, "learning_rate": 2.994372998913283e-06, "loss": 0.73318702, "num_input_tokens_seen": 63344925, "step": 2947, "time_per_iteration": 2.634809732437134 }, { "auxiliary_loss_clip": 0.01144006, "auxiliary_loss_mlp": 0.01053839, "balance_loss_clip": 1.0534302, "balance_loss_mlp": 1.03244138, "epoch": 0.35447604160404017, "flos": 23951807153280.0, "grad_norm": 3.436170606626004, "language_loss": 0.62328207, "learning_rate": 2.99369705366773e-06, "loss": 0.64526057, "num_input_tokens_seen": 63365170, "step": 2948, "time_per_iteration": 3.6897330284118652 }, { "auxiliary_loss_clip": 0.01137143, "auxiliary_loss_mlp": 0.01052353, "balance_loss_clip": 1.05228305, "balance_loss_mlp": 1.03116906, "epoch": 0.3545962844946792, "flos": 23435662671360.0, "grad_norm": 2.5894885569234143, "language_loss": 0.82155436, "learning_rate": 2.9930209576791244e-06, "loss": 0.84344935, "num_input_tokens_seen": 63383645, "step": 2949, "time_per_iteration": 2.6894032955169678 }, { "auxiliary_loss_clip": 0.01151262, "auxiliary_loss_mlp": 0.01049663, "balance_loss_clip": 1.05256152, "balance_loss_mlp": 1.03019619, "epoch": 0.35471652738531834, "flos": 22085134185600.0, "grad_norm": 5.058531270490593, "language_loss": 0.63799059, "learning_rate": 2.9923447110500285e-06, "loss": 0.65999985, "num_input_tokens_seen": 63402390, "step": 2950, "time_per_iteration": 2.7131857872009277 }, { "auxiliary_loss_clip": 0.01144782, "auxiliary_loss_mlp": 0.01057308, "balance_loss_clip": 1.05375493, "balance_loss_mlp": 1.03612494, "epoch": 0.35483677027595745, "flos": 27341881787520.0, "grad_norm": 1.4907904022393912, "language_loss": 0.75552106, "learning_rate": 2.9916683138830295e-06, "loss": 0.777542, "num_input_tokens_seen": 63423055, "step": 2951, "time_per_iteration": 3.6830363273620605 }, { "auxiliary_loss_clip": 0.01141522, "auxiliary_loss_mlp": 0.01050956, "balance_loss_clip": 1.0547905, "balance_loss_mlp": 1.03008246, "epoch": 0.3549570131665965, "flos": 13516166678400.0, "grad_norm": 2.52025201568681, "language_loss": 0.81271851, "learning_rate": 2.9909917662807353e-06, "loss": 0.83464336, "num_input_tokens_seen": 63440855, "step": 2952, "time_per_iteration": 2.745417594909668 }, { "auxiliary_loss_clip": 0.01149876, "auxiliary_loss_mlp": 0.01056988, "balance_loss_clip": 1.05044723, "balance_loss_mlp": 1.0348748, "epoch": 0.3550772560572356, "flos": 20887549810560.0, "grad_norm": 2.4107884245555464, "language_loss": 0.69326925, "learning_rate": 2.9903150683457783e-06, "loss": 0.71533787, "num_input_tokens_seen": 63459400, "step": 2953, "time_per_iteration": 2.7746498584747314 }, { "auxiliary_loss_clip": 0.01139987, "auxiliary_loss_mlp": 0.01051862, "balance_loss_clip": 1.04944372, "balance_loss_mlp": 1.031322, "epoch": 0.3551974989478747, "flos": 20194042947840.0, "grad_norm": 2.164373818725, "language_loss": 0.65218818, "learning_rate": 2.9896382201808126e-06, "loss": 0.67410672, "num_input_tokens_seen": 63476800, "step": 2954, "time_per_iteration": 4.228388786315918 }, { "auxiliary_loss_clip": 0.01165335, "auxiliary_loss_mlp": 0.0105196, "balance_loss_clip": 1.05320644, "balance_loss_mlp": 1.03003752, "epoch": 0.3553177418385138, "flos": 19828831415040.0, "grad_norm": 22.482168573308968, "language_loss": 0.8131454, "learning_rate": 2.988961221888516e-06, "loss": 0.83531833, "num_input_tokens_seen": 63493475, "step": 2955, "time_per_iteration": 2.6807103157043457 }, { "auxiliary_loss_clip": 0.01121032, "auxiliary_loss_mlp": 0.01052543, "balance_loss_clip": 1.04841602, "balance_loss_mlp": 1.03143144, "epoch": 0.3554379847291529, "flos": 14829132516480.0, "grad_norm": 2.964941880435686, "language_loss": 0.79493386, "learning_rate": 2.988284073571589e-06, "loss": 0.81666958, "num_input_tokens_seen": 63509560, "step": 2956, "time_per_iteration": 2.81964111328125 }, { "auxiliary_loss_clip": 0.0115175, "auxiliary_loss_mlp": 0.00776727, "balance_loss_clip": 1.05190229, "balance_loss_mlp": 1.00024581, "epoch": 0.355558227619792, "flos": 20485350247680.0, "grad_norm": 2.4287015324872114, "language_loss": 0.7292785, "learning_rate": 2.9876067753327528e-06, "loss": 0.74856329, "num_input_tokens_seen": 63527290, "step": 2957, "time_per_iteration": 2.7374134063720703 }, { "auxiliary_loss_clip": 0.01158467, "auxiliary_loss_mlp": 0.01063309, "balance_loss_clip": 1.05437982, "balance_loss_mlp": 1.04193461, "epoch": 0.35567847051043106, "flos": 37663613256960.0, "grad_norm": 1.991506479485817, "language_loss": 0.8054657, "learning_rate": 2.986929327274754e-06, "loss": 0.82768345, "num_input_tokens_seen": 63547870, "step": 2958, "time_per_iteration": 3.6612961292266846 }, { "auxiliary_loss_clip": 0.01154507, "auxiliary_loss_mlp": 0.01063346, "balance_loss_clip": 1.05533659, "balance_loss_mlp": 1.04290128, "epoch": 0.35579871340107017, "flos": 26943058103040.0, "grad_norm": 1.6618626415084314, "language_loss": 0.7873807, "learning_rate": 2.9862517295003617e-06, "loss": 0.80955923, "num_input_tokens_seen": 63568285, "step": 2959, "time_per_iteration": 2.6530299186706543 }, { "auxiliary_loss_clip": 0.01127189, "auxiliary_loss_mlp": 0.01051582, "balance_loss_clip": 1.04747498, "balance_loss_mlp": 1.03061354, "epoch": 0.3559189562917093, "flos": 28293335193600.0, "grad_norm": 1.993116015869338, "language_loss": 0.72492814, "learning_rate": 2.9855739821123654e-06, "loss": 0.74671584, "num_input_tokens_seen": 63589865, "step": 2960, "time_per_iteration": 2.7302029132843018 }, { "auxiliary_loss_clip": 0.01151335, "auxiliary_loss_mlp": 0.010466, "balance_loss_clip": 1.05321693, "balance_loss_mlp": 1.02641773, "epoch": 0.35603919918234833, "flos": 25664063552640.0, "grad_norm": 1.8738884945156962, "language_loss": 0.82192504, "learning_rate": 2.98489608521358e-06, "loss": 0.84390438, "num_input_tokens_seen": 63609805, "step": 2961, "time_per_iteration": 2.68438982963562 }, { "auxiliary_loss_clip": 0.01158379, "auxiliary_loss_mlp": 0.00776017, "balance_loss_clip": 1.05348647, "balance_loss_mlp": 1.00029325, "epoch": 0.35615944207298744, "flos": 23000856537600.0, "grad_norm": 3.205404237054497, "language_loss": 0.7920962, "learning_rate": 2.9842180389068425e-06, "loss": 0.81144011, "num_input_tokens_seen": 63627115, "step": 2962, "time_per_iteration": 2.6417717933654785 }, { "auxiliary_loss_clip": 0.01037555, "auxiliary_loss_mlp": 0.01014773, "balance_loss_clip": 1.03270721, "balance_loss_mlp": 1.01212692, "epoch": 0.35627968496362655, "flos": 68251283723520.0, "grad_norm": 0.7673368232984866, "language_loss": 0.59258616, "learning_rate": 2.98353984329501e-06, "loss": 0.61310947, "num_input_tokens_seen": 63691460, "step": 2963, "time_per_iteration": 3.303643226623535 }, { "auxiliary_loss_clip": 0.01142599, "auxiliary_loss_mlp": 0.01051342, "balance_loss_clip": 1.05095947, "balance_loss_mlp": 1.02995634, "epoch": 0.3563999278542656, "flos": 22641714403200.0, "grad_norm": 1.6106270995662368, "language_loss": 0.70743322, "learning_rate": 2.982861498480965e-06, "loss": 0.72937262, "num_input_tokens_seen": 63713840, "step": 2964, "time_per_iteration": 2.6688735485076904 }, { "auxiliary_loss_clip": 0.01117985, "auxiliary_loss_mlp": 0.01057753, "balance_loss_clip": 1.04456902, "balance_loss_mlp": 1.03881049, "epoch": 0.3565201707449047, "flos": 25952533678080.0, "grad_norm": 3.2510673847073543, "language_loss": 0.82688451, "learning_rate": 2.9821830045676122e-06, "loss": 0.84864187, "num_input_tokens_seen": 63733540, "step": 2965, "time_per_iteration": 2.7175862789154053 }, { "auxiliary_loss_clip": 0.01168609, "auxiliary_loss_mlp": 0.01051434, "balance_loss_clip": 1.05510902, "balance_loss_mlp": 1.03030968, "epoch": 0.3566404136355438, "flos": 28475725478400.0, "grad_norm": 2.325780107244553, "language_loss": 0.7313149, "learning_rate": 2.9815043616578793e-06, "loss": 0.75351536, "num_input_tokens_seen": 63754335, "step": 2966, "time_per_iteration": 2.64094614982605 }, { "auxiliary_loss_clip": 0.01125295, "auxiliary_loss_mlp": 0.01045278, "balance_loss_clip": 1.04809761, "balance_loss_mlp": 1.02521539, "epoch": 0.3567606565261829, "flos": 38363117690880.0, "grad_norm": 2.062452253903864, "language_loss": 0.77001333, "learning_rate": 2.9808255698547145e-06, "loss": 0.79171908, "num_input_tokens_seen": 63777135, "step": 2967, "time_per_iteration": 2.826209545135498 }, { "auxiliary_loss_clip": 0.01153642, "auxiliary_loss_mlp": 0.01057187, "balance_loss_clip": 1.05420303, "balance_loss_mlp": 1.03705311, "epoch": 0.356880899416822, "flos": 21981029592960.0, "grad_norm": 2.666066847448079, "language_loss": 0.79701006, "learning_rate": 2.9801466292610913e-06, "loss": 0.81911838, "num_input_tokens_seen": 63797020, "step": 2968, "time_per_iteration": 2.6153581142425537 }, { "auxiliary_loss_clip": 0.01150913, "auxiliary_loss_mlp": 0.01053332, "balance_loss_clip": 1.05220771, "balance_loss_mlp": 1.03200495, "epoch": 0.35700114230746105, "flos": 18989132198400.0, "grad_norm": 2.9961084016088217, "language_loss": 0.80762869, "learning_rate": 2.979467539980003e-06, "loss": 0.82967114, "num_input_tokens_seen": 63813810, "step": 2969, "time_per_iteration": 2.6145224571228027 }, { "auxiliary_loss_clip": 0.0115577, "auxiliary_loss_mlp": 0.01054996, "balance_loss_clip": 1.05327749, "balance_loss_mlp": 1.03537369, "epoch": 0.35712138519810016, "flos": 19756112330880.0, "grad_norm": 1.9057458623146626, "language_loss": 0.76838183, "learning_rate": 2.978788302114468e-06, "loss": 0.79048949, "num_input_tokens_seen": 63830925, "step": 2970, "time_per_iteration": 2.627052068710327 }, { "auxiliary_loss_clip": 0.01151793, "auxiliary_loss_mlp": 0.01055889, "balance_loss_clip": 1.05185437, "balance_loss_mlp": 1.03455102, "epoch": 0.35724162808873927, "flos": 35183012008320.0, "grad_norm": 3.982957263003453, "language_loss": 0.83676207, "learning_rate": 2.9781089157675255e-06, "loss": 0.85883892, "num_input_tokens_seen": 63849385, "step": 2971, "time_per_iteration": 2.697190999984741 }, { "auxiliary_loss_clip": 0.01149995, "auxiliary_loss_mlp": 0.01053621, "balance_loss_clip": 1.05341208, "balance_loss_mlp": 1.0339396, "epoch": 0.3573618709793783, "flos": 25556726736000.0, "grad_norm": 1.6228816976667524, "language_loss": 0.88530487, "learning_rate": 2.977429381042238e-06, "loss": 0.907341, "num_input_tokens_seen": 63870060, "step": 2972, "time_per_iteration": 2.6474344730377197 }, { "auxiliary_loss_clip": 0.01139123, "auxiliary_loss_mlp": 0.01046439, "balance_loss_clip": 1.05049968, "balance_loss_mlp": 1.02742481, "epoch": 0.35748211387001744, "flos": 29132352051840.0, "grad_norm": 2.69053844272139, "language_loss": 0.89079517, "learning_rate": 2.9767496980416913e-06, "loss": 0.91265076, "num_input_tokens_seen": 63889355, "step": 2973, "time_per_iteration": 2.703545331954956 }, { "auxiliary_loss_clip": 0.01132464, "auxiliary_loss_mlp": 0.0105314, "balance_loss_clip": 1.04678786, "balance_loss_mlp": 1.03215909, "epoch": 0.35760235676065655, "flos": 13954169122560.0, "grad_norm": 2.4503469690448156, "language_loss": 0.8108331, "learning_rate": 2.9760698668689914e-06, "loss": 0.83268905, "num_input_tokens_seen": 63905580, "step": 2974, "time_per_iteration": 3.582085609436035 }, { "auxiliary_loss_clip": 0.01150051, "auxiliary_loss_mlp": 0.01050318, "balance_loss_clip": 1.05066466, "balance_loss_mlp": 1.0300405, "epoch": 0.3577225996512956, "flos": 44018688977280.0, "grad_norm": 1.987879511912726, "language_loss": 0.71494502, "learning_rate": 2.975389887627269e-06, "loss": 0.73694873, "num_input_tokens_seen": 63928180, "step": 2975, "time_per_iteration": 2.7955803871154785 }, { "auxiliary_loss_clip": 0.01130506, "auxiliary_loss_mlp": 0.01060435, "balance_loss_clip": 1.04920733, "balance_loss_mlp": 1.04109895, "epoch": 0.3578428425419347, "flos": 17055199013760.0, "grad_norm": 2.225216672772553, "language_loss": 0.89887762, "learning_rate": 2.9747097604196764e-06, "loss": 0.92078704, "num_input_tokens_seen": 63944825, "step": 2976, "time_per_iteration": 2.72145414352417 }, { "auxiliary_loss_clip": 0.01027082, "auxiliary_loss_mlp": 0.01025368, "balance_loss_clip": 1.0281961, "balance_loss_mlp": 1.02198279, "epoch": 0.3579630854325738, "flos": 71676550707840.0, "grad_norm": 0.6835420822374932, "language_loss": 0.56675661, "learning_rate": 2.9740294853493875e-06, "loss": 0.58728111, "num_input_tokens_seen": 64016385, "step": 2977, "time_per_iteration": 4.483656644821167 }, { "auxiliary_loss_clip": 0.01120796, "auxiliary_loss_mlp": 0.01045897, "balance_loss_clip": 1.04879963, "balance_loss_mlp": 1.02558327, "epoch": 0.3580833283232129, "flos": 25046651652480.0, "grad_norm": 2.511637877851402, "language_loss": 0.67002845, "learning_rate": 2.9733490625196008e-06, "loss": 0.69169539, "num_input_tokens_seen": 64036245, "step": 2978, "time_per_iteration": 2.865234136581421 }, { "auxiliary_loss_clip": 0.0111628, "auxiliary_loss_mlp": 0.01065933, "balance_loss_clip": 1.04835057, "balance_loss_mlp": 1.04404604, "epoch": 0.358203571213852, "flos": 13953127628160.0, "grad_norm": 6.018110608122781, "language_loss": 0.75814927, "learning_rate": 2.9726684920335353e-06, "loss": 0.77997136, "num_input_tokens_seen": 64054110, "step": 2979, "time_per_iteration": 4.271493434906006 }, { "auxiliary_loss_clip": 0.0116777, "auxiliary_loss_mlp": 0.00776501, "balance_loss_clip": 1.05236125, "balance_loss_mlp": 1.00026941, "epoch": 0.35832381410449105, "flos": 20302457172480.0, "grad_norm": 2.3127392662155746, "language_loss": 0.82155657, "learning_rate": 2.971987773994432e-06, "loss": 0.84099925, "num_input_tokens_seen": 64070295, "step": 2980, "time_per_iteration": 2.754307746887207 }, { "auxiliary_loss_clip": 0.01140985, "auxiliary_loss_mlp": 0.01053699, "balance_loss_clip": 1.04781628, "balance_loss_mlp": 1.0326345, "epoch": 0.35844405699513016, "flos": 16983234115200.0, "grad_norm": 2.2425812603044433, "language_loss": 0.83077395, "learning_rate": 2.9713069085055566e-06, "loss": 0.85272074, "num_input_tokens_seen": 64088605, "step": 2981, "time_per_iteration": 2.773059368133545 }, { "auxiliary_loss_clip": 0.01124369, "auxiliary_loss_mlp": 0.01050045, "balance_loss_clip": 1.04906058, "balance_loss_mlp": 1.0311383, "epoch": 0.35856429988576927, "flos": 23216858974080.0, "grad_norm": 1.568331835656707, "language_loss": 0.78927851, "learning_rate": 2.9706258956701958e-06, "loss": 0.81102264, "num_input_tokens_seen": 64108595, "step": 2982, "time_per_iteration": 2.766974687576294 }, { "auxiliary_loss_clip": 0.01152092, "auxiliary_loss_mlp": 0.0104559, "balance_loss_clip": 1.04868197, "balance_loss_mlp": 1.02322626, "epoch": 0.3586845427764083, "flos": 23034576430080.0, "grad_norm": 4.298336660937478, "language_loss": 0.77948403, "learning_rate": 2.9699447355916575e-06, "loss": 0.80146086, "num_input_tokens_seen": 64127405, "step": 2983, "time_per_iteration": 2.8455872535705566 }, { "auxiliary_loss_clip": 0.01166986, "auxiliary_loss_mlp": 0.00775217, "balance_loss_clip": 1.05501544, "balance_loss_mlp": 1.0002079, "epoch": 0.35880478566704743, "flos": 20010682995840.0, "grad_norm": 2.1293092751302978, "language_loss": 0.73795795, "learning_rate": 2.969263428373275e-06, "loss": 0.75738001, "num_input_tokens_seen": 64145755, "step": 2984, "time_per_iteration": 3.5454611778259277 }, { "auxiliary_loss_clip": 0.0114042, "auxiliary_loss_mlp": 0.01043813, "balance_loss_clip": 1.05034614, "balance_loss_mlp": 1.02397668, "epoch": 0.35892502855768654, "flos": 13699095667200.0, "grad_norm": 6.040916025809988, "language_loss": 0.79325992, "learning_rate": 2.9685819741184007e-06, "loss": 0.81510222, "num_input_tokens_seen": 64164195, "step": 2985, "time_per_iteration": 2.69281005859375 }, { "auxiliary_loss_clip": 0.01118265, "auxiliary_loss_mlp": 0.01040851, "balance_loss_clip": 1.04743958, "balance_loss_mlp": 1.02077579, "epoch": 0.3590452714483256, "flos": 18114096977280.0, "grad_norm": 7.364373550186952, "language_loss": 0.69082475, "learning_rate": 2.967900372930411e-06, "loss": 0.71241593, "num_input_tokens_seen": 64182705, "step": 2986, "time_per_iteration": 2.820580005645752 }, { "auxiliary_loss_clip": 0.01128463, "auxiliary_loss_mlp": 0.01053626, "balance_loss_clip": 1.04615343, "balance_loss_mlp": 1.03104746, "epoch": 0.3591655143389647, "flos": 17749352321280.0, "grad_norm": 3.067549089327632, "language_loss": 0.79433542, "learning_rate": 2.9672186249127046e-06, "loss": 0.81615627, "num_input_tokens_seen": 64202170, "step": 2987, "time_per_iteration": 2.7614920139312744 }, { "auxiliary_loss_clip": 0.01137845, "auxiliary_loss_mlp": 0.01050886, "balance_loss_clip": 1.05058944, "balance_loss_mlp": 1.03078747, "epoch": 0.3592857572296038, "flos": 25224409082880.0, "grad_norm": 2.0442264774753984, "language_loss": 0.79056156, "learning_rate": 2.9665367301687014e-06, "loss": 0.81244892, "num_input_tokens_seen": 64220415, "step": 2988, "time_per_iteration": 2.7440781593322754 }, { "auxiliary_loss_clip": 0.01127717, "auxiliary_loss_mlp": 0.01046355, "balance_loss_clip": 1.0472486, "balance_loss_mlp": 1.02517128, "epoch": 0.3594060001202429, "flos": 29384408764800.0, "grad_norm": 1.8243748868219984, "language_loss": 0.76589626, "learning_rate": 2.965854688801845e-06, "loss": 0.78763694, "num_input_tokens_seen": 64242475, "step": 2989, "time_per_iteration": 2.775728702545166 }, { "auxiliary_loss_clip": 0.0114519, "auxiliary_loss_mlp": 0.01044034, "balance_loss_clip": 1.04629111, "balance_loss_mlp": 1.02366149, "epoch": 0.359526243010882, "flos": 17052900543360.0, "grad_norm": 1.8968353640131466, "language_loss": 0.7666052, "learning_rate": 2.9651725009156005e-06, "loss": 0.78849745, "num_input_tokens_seen": 64260220, "step": 2990, "time_per_iteration": 2.7189323902130127 }, { "auxiliary_loss_clip": 0.01131229, "auxiliary_loss_mlp": 0.01053072, "balance_loss_clip": 1.04805589, "balance_loss_mlp": 1.03117287, "epoch": 0.3596464859015211, "flos": 22965089569920.0, "grad_norm": 1.9198278568553673, "language_loss": 0.74424332, "learning_rate": 2.964490166613454e-06, "loss": 0.76608634, "num_input_tokens_seen": 64280145, "step": 2991, "time_per_iteration": 2.8636367321014404 }, { "auxiliary_loss_clip": 0.01066091, "auxiliary_loss_mlp": 0.0101753, "balance_loss_clip": 1.02857912, "balance_loss_mlp": 1.01493156, "epoch": 0.35976672879216015, "flos": 54739462590720.0, "grad_norm": 0.7610523708499184, "language_loss": 0.57791495, "learning_rate": 2.963807685998917e-06, "loss": 0.59875119, "num_input_tokens_seen": 64336010, "step": 2992, "time_per_iteration": 3.240088701248169 }, { "auxiliary_loss_clip": 0.01114729, "auxiliary_loss_mlp": 0.01045395, "balance_loss_clip": 1.04681277, "balance_loss_mlp": 1.02671492, "epoch": 0.35988697168279926, "flos": 43139020901760.0, "grad_norm": 15.480314828050629, "language_loss": 0.78122509, "learning_rate": 2.9631250591755196e-06, "loss": 0.8028264, "num_input_tokens_seen": 64358725, "step": 2993, "time_per_iteration": 3.0559449195861816 }, { "auxiliary_loss_clip": 0.01132756, "auxiliary_loss_mlp": 0.01060925, "balance_loss_clip": 1.04868126, "balance_loss_mlp": 1.04042077, "epoch": 0.36000721457343837, "flos": 35845600239360.0, "grad_norm": 3.447017118003742, "language_loss": 0.5787912, "learning_rate": 2.962442286246817e-06, "loss": 0.60072803, "num_input_tokens_seen": 64381555, "step": 2994, "time_per_iteration": 2.9090182781219482 }, { "auxiliary_loss_clip": 0.01140111, "auxiliary_loss_mlp": 0.01044114, "balance_loss_clip": 1.04980278, "balance_loss_mlp": 1.02405071, "epoch": 0.3601274574640774, "flos": 18291100222080.0, "grad_norm": 1.6595895481988527, "language_loss": 0.69863069, "learning_rate": 2.9617593673163853e-06, "loss": 0.72047293, "num_input_tokens_seen": 64400375, "step": 2995, "time_per_iteration": 2.997558355331421 }, { "auxiliary_loss_clip": 0.01138191, "auxiliary_loss_mlp": 0.01047882, "balance_loss_clip": 1.04782224, "balance_loss_mlp": 1.02796161, "epoch": 0.36024770035471654, "flos": 13333955961600.0, "grad_norm": 2.525238167079836, "language_loss": 0.77068233, "learning_rate": 2.9610763024878216e-06, "loss": 0.79254305, "num_input_tokens_seen": 64415880, "step": 2996, "time_per_iteration": 2.7427921295166016 }, { "auxiliary_loss_clip": 0.01128859, "auxiliary_loss_mlp": 0.01056582, "balance_loss_clip": 1.04741836, "balance_loss_mlp": 1.03524351, "epoch": 0.3603679432453556, "flos": 20267013427200.0, "grad_norm": 1.866916565428917, "language_loss": 0.91804886, "learning_rate": 2.960393091864747e-06, "loss": 0.93990326, "num_input_tokens_seen": 64434260, "step": 2997, "time_per_iteration": 2.826279878616333 }, { "auxiliary_loss_clip": 0.01136089, "auxiliary_loss_mlp": 0.01043952, "balance_loss_clip": 1.04799151, "balance_loss_mlp": 1.02461576, "epoch": 0.3604881861359947, "flos": 22451135817600.0, "grad_norm": 1.9575973777684212, "language_loss": 0.74796331, "learning_rate": 2.959709735550804e-06, "loss": 0.76976371, "num_input_tokens_seen": 64453855, "step": 2998, "time_per_iteration": 2.788922071456909 }, { "auxiliary_loss_clip": 0.01116348, "auxiliary_loss_mlp": 0.01043192, "balance_loss_clip": 1.04652214, "balance_loss_mlp": 1.02397561, "epoch": 0.3606084290266338, "flos": 22054251467520.0, "grad_norm": 2.073781746870594, "language_loss": 0.75814021, "learning_rate": 2.9590262336496575e-06, "loss": 0.77973557, "num_input_tokens_seen": 64473585, "step": 2999, "time_per_iteration": 2.9101624488830566 }, { "auxiliary_loss_clip": 0.01119161, "auxiliary_loss_mlp": 0.01048023, "balance_loss_clip": 1.04940534, "balance_loss_mlp": 1.02518213, "epoch": 0.36072867191727287, "flos": 15632921111040.0, "grad_norm": 2.322525302369529, "language_loss": 0.85711354, "learning_rate": 2.9583425862649936e-06, "loss": 0.87878537, "num_input_tokens_seen": 64491720, "step": 3000, "time_per_iteration": 3.7559802532196045 }, { "auxiliary_loss_clip": 0.01168841, "auxiliary_loss_mlp": 0.01046664, "balance_loss_clip": 1.05447292, "balance_loss_mlp": 1.02490842, "epoch": 0.360848914807912, "flos": 19677000625920.0, "grad_norm": 2.303722540997165, "language_loss": 0.7419138, "learning_rate": 2.9576587935005215e-06, "loss": 0.76406884, "num_input_tokens_seen": 64509800, "step": 3001, "time_per_iteration": 2.6691231727600098 }, { "auxiliary_loss_clip": 0.01154714, "auxiliary_loss_mlp": 0.01052019, "balance_loss_clip": 1.05169582, "balance_loss_mlp": 1.02924979, "epoch": 0.3609691576985511, "flos": 18877808972160.0, "grad_norm": 3.4848106949226456, "language_loss": 0.71855724, "learning_rate": 2.9569748554599713e-06, "loss": 0.74062455, "num_input_tokens_seen": 64525410, "step": 3002, "time_per_iteration": 2.758241891860962 }, { "auxiliary_loss_clip": 0.01133318, "auxiliary_loss_mlp": 0.01055013, "balance_loss_clip": 1.04936171, "balance_loss_mlp": 1.03663075, "epoch": 0.36108940058919015, "flos": 42224088648960.0, "grad_norm": 2.061553533088709, "language_loss": 0.73608923, "learning_rate": 2.956290772247097e-06, "loss": 0.75797248, "num_input_tokens_seen": 64544085, "step": 3003, "time_per_iteration": 4.172752380371094 }, { "auxiliary_loss_clip": 0.01105839, "auxiliary_loss_mlp": 0.01054627, "balance_loss_clip": 1.0476954, "balance_loss_mlp": 1.03539896, "epoch": 0.36120964347982926, "flos": 23185150243200.0, "grad_norm": 1.843203566892578, "language_loss": 0.73341066, "learning_rate": 2.9556065439656724e-06, "loss": 0.75501531, "num_input_tokens_seen": 64563135, "step": 3004, "time_per_iteration": 2.8245575428009033 }, { "auxiliary_loss_clip": 0.01090456, "auxiliary_loss_mlp": 0.01054984, "balance_loss_clip": 1.04068613, "balance_loss_mlp": 1.03425312, "epoch": 0.36132988637046837, "flos": 18113055482880.0, "grad_norm": 1.9388981306681607, "language_loss": 0.81652695, "learning_rate": 2.9549221707194952e-06, "loss": 0.83798134, "num_input_tokens_seen": 64581985, "step": 3005, "time_per_iteration": 4.346343040466309 }, { "auxiliary_loss_clip": 0.01153414, "auxiliary_loss_mlp": 0.01050944, "balance_loss_clip": 1.05292034, "balance_loss_mlp": 1.03032112, "epoch": 0.3614501292611074, "flos": 27813101333760.0, "grad_norm": 2.7619705664785488, "language_loss": 0.73099637, "learning_rate": 2.954237652612384e-06, "loss": 0.75303996, "num_input_tokens_seen": 64601035, "step": 3006, "time_per_iteration": 2.769684076309204 }, { "auxiliary_loss_clip": 0.01134572, "auxiliary_loss_mlp": 0.01053207, "balance_loss_clip": 1.05072379, "balance_loss_mlp": 1.03333449, "epoch": 0.36157037215174653, "flos": 22634926732800.0, "grad_norm": 1.907899585792785, "language_loss": 0.84522682, "learning_rate": 2.9535529897481796e-06, "loss": 0.86710459, "num_input_tokens_seen": 64618580, "step": 3007, "time_per_iteration": 2.8404417037963867 }, { "auxiliary_loss_clip": 0.01165298, "auxiliary_loss_mlp": 0.01049035, "balance_loss_clip": 1.05510187, "balance_loss_mlp": 1.02839959, "epoch": 0.36169061504238564, "flos": 12600839376000.0, "grad_norm": 2.3602146125165877, "language_loss": 0.7744953, "learning_rate": 2.9528681822307446e-06, "loss": 0.79663861, "num_input_tokens_seen": 64635430, "step": 3008, "time_per_iteration": 2.702430009841919 }, { "auxiliary_loss_clip": 0.0114411, "auxiliary_loss_mlp": 0.00775029, "balance_loss_clip": 1.05256069, "balance_loss_mlp": 1.00025916, "epoch": 0.3618108579330247, "flos": 26684644682880.0, "grad_norm": 2.4571230975678557, "language_loss": 0.82298613, "learning_rate": 2.952183230163964e-06, "loss": 0.84217751, "num_input_tokens_seen": 64655005, "step": 3009, "time_per_iteration": 3.621417999267578 }, { "auxiliary_loss_clip": 0.01119413, "auxiliary_loss_mlp": 0.01040955, "balance_loss_clip": 1.04769683, "balance_loss_mlp": 1.02146423, "epoch": 0.3619311008236638, "flos": 22817029708800.0, "grad_norm": 2.2273873832452686, "language_loss": 0.73059994, "learning_rate": 2.9514981336517448e-06, "loss": 0.75220364, "num_input_tokens_seen": 64674775, "step": 3010, "time_per_iteration": 2.8263285160064697 }, { "auxiliary_loss_clip": 0.01151544, "auxiliary_loss_mlp": 0.01057595, "balance_loss_clip": 1.05216134, "balance_loss_mlp": 1.03583968, "epoch": 0.36205134371430286, "flos": 25919603884800.0, "grad_norm": 2.9354129301291962, "language_loss": 0.81348801, "learning_rate": 2.950812892798015e-06, "loss": 0.8355794, "num_input_tokens_seen": 64695670, "step": 3011, "time_per_iteration": 2.8586347103118896 }, { "auxiliary_loss_clip": 0.01107668, "auxiliary_loss_mlp": 0.00774812, "balance_loss_clip": 1.04766023, "balance_loss_mlp": 1.00028372, "epoch": 0.362171586604942, "flos": 26139592730880.0, "grad_norm": 2.0356957438960426, "language_loss": 0.87511504, "learning_rate": 2.9501275077067256e-06, "loss": 0.89393985, "num_input_tokens_seen": 64716290, "step": 3012, "time_per_iteration": 2.8625848293304443 }, { "auxiliary_loss_clip": 0.01081321, "auxiliary_loss_mlp": 0.01061186, "balance_loss_clip": 1.04172158, "balance_loss_mlp": 1.04006183, "epoch": 0.3622918294955811, "flos": 28074208273920.0, "grad_norm": 1.7089420889343512, "language_loss": 0.88432968, "learning_rate": 2.949441978481848e-06, "loss": 0.90575469, "num_input_tokens_seen": 64737190, "step": 3013, "time_per_iteration": 3.039064407348633 }, { "auxiliary_loss_clip": 0.01134266, "auxiliary_loss_mlp": 0.01062897, "balance_loss_clip": 1.05186963, "balance_loss_mlp": 1.04261959, "epoch": 0.36241207238622014, "flos": 19828005402240.0, "grad_norm": 1.985329935250447, "language_loss": 0.79972905, "learning_rate": 2.9487563052273778e-06, "loss": 0.82170069, "num_input_tokens_seen": 64753950, "step": 3014, "time_per_iteration": 2.766286611557007 }, { "auxiliary_loss_clip": 0.01148875, "auxiliary_loss_mlp": 0.01053654, "balance_loss_clip": 1.05557203, "balance_loss_mlp": 1.03385365, "epoch": 0.36253231527685925, "flos": 21397158017280.0, "grad_norm": 2.201112833717477, "language_loss": 0.85798228, "learning_rate": 2.94807048804733e-06, "loss": 0.88000757, "num_input_tokens_seen": 64773570, "step": 3015, "time_per_iteration": 2.699500799179077 }, { "auxiliary_loss_clip": 0.01129051, "auxiliary_loss_mlp": 0.01063446, "balance_loss_clip": 1.0494107, "balance_loss_mlp": 1.04182148, "epoch": 0.36265255816749836, "flos": 18362885552640.0, "grad_norm": 2.4079676481203145, "language_loss": 0.89982289, "learning_rate": 2.9473845270457434e-06, "loss": 0.92174786, "num_input_tokens_seen": 64790385, "step": 3016, "time_per_iteration": 2.797403573989868 }, { "auxiliary_loss_clip": 0.01128242, "auxiliary_loss_mlp": 0.01051375, "balance_loss_clip": 1.04911971, "balance_loss_mlp": 1.03157461, "epoch": 0.3627728010581374, "flos": 18660046769280.0, "grad_norm": 4.300009772643868, "language_loss": 0.69894564, "learning_rate": 2.946698422326677e-06, "loss": 0.72074175, "num_input_tokens_seen": 64807845, "step": 3017, "time_per_iteration": 2.70731258392334 }, { "auxiliary_loss_clip": 0.01114822, "auxiliary_loss_mlp": 0.01046182, "balance_loss_clip": 1.04652262, "balance_loss_mlp": 1.02466476, "epoch": 0.36289304394877653, "flos": 27524272072320.0, "grad_norm": 2.639906230291476, "language_loss": 0.80098367, "learning_rate": 2.946012173994213e-06, "loss": 0.82259381, "num_input_tokens_seen": 64827630, "step": 3018, "time_per_iteration": 2.8984971046447754 }, { "auxiliary_loss_clip": 0.0114295, "auxiliary_loss_mlp": 0.01054531, "balance_loss_clip": 1.05205607, "balance_loss_mlp": 1.03476655, "epoch": 0.36301328683941564, "flos": 34533244932480.0, "grad_norm": 1.7746238055172547, "language_loss": 0.67714053, "learning_rate": 2.945325782152454e-06, "loss": 0.6991154, "num_input_tokens_seen": 64850665, "step": 3019, "time_per_iteration": 2.887699604034424 }, { "auxiliary_loss_clip": 0.01136963, "auxiliary_loss_mlp": 0.01051762, "balance_loss_clip": 1.04784942, "balance_loss_mlp": 1.03218794, "epoch": 0.3631335297300547, "flos": 19025976574080.0, "grad_norm": 2.5536230513714573, "language_loss": 0.78573161, "learning_rate": 2.9446392469055257e-06, "loss": 0.80761886, "num_input_tokens_seen": 64868700, "step": 3020, "time_per_iteration": 2.746054172515869 }, { "auxiliary_loss_clip": 0.01117187, "auxiliary_loss_mlp": 0.01053257, "balance_loss_clip": 1.05000472, "balance_loss_mlp": 1.03331327, "epoch": 0.3632537726206938, "flos": 19536769929600.0, "grad_norm": 2.2604765493558925, "language_loss": 0.79685664, "learning_rate": 2.9439525683575745e-06, "loss": 0.81856108, "num_input_tokens_seen": 64887620, "step": 3021, "time_per_iteration": 2.8648061752319336 }, { "auxiliary_loss_clip": 0.01167193, "auxiliary_loss_mlp": 0.01058045, "balance_loss_clip": 1.05506766, "balance_loss_mlp": 1.03789902, "epoch": 0.3633740155113329, "flos": 21068611292160.0, "grad_norm": 2.1509011695854796, "language_loss": 0.7527281, "learning_rate": 2.9432657466127694e-06, "loss": 0.77498049, "num_input_tokens_seen": 64907190, "step": 3022, "time_per_iteration": 2.7066879272460938 }, { "auxiliary_loss_clip": 0.01114195, "auxiliary_loss_mlp": 0.01047143, "balance_loss_clip": 1.05060029, "balance_loss_mlp": 1.02752161, "epoch": 0.36349425840197197, "flos": 20298722158080.0, "grad_norm": 1.7547223925015838, "language_loss": 0.76831412, "learning_rate": 2.9425787817753007e-06, "loss": 0.78992754, "num_input_tokens_seen": 64925850, "step": 3023, "time_per_iteration": 2.7256715297698975 }, { "auxiliary_loss_clip": 0.01127443, "auxiliary_loss_mlp": 0.01049356, "balance_loss_clip": 1.05130601, "balance_loss_mlp": 1.02876842, "epoch": 0.3636145012926111, "flos": 29716762331520.0, "grad_norm": 1.60129034425442, "language_loss": 0.713319, "learning_rate": 2.94189167394938e-06, "loss": 0.73508698, "num_input_tokens_seen": 64948285, "step": 3024, "time_per_iteration": 2.7757208347320557 }, { "auxiliary_loss_clip": 0.01166878, "auxiliary_loss_mlp": 0.01055526, "balance_loss_clip": 1.0563072, "balance_loss_mlp": 1.03555834, "epoch": 0.3637347441832502, "flos": 21431847576960.0, "grad_norm": 2.033189668702686, "language_loss": 0.81104743, "learning_rate": 2.941204423239241e-06, "loss": 0.8332715, "num_input_tokens_seen": 64967160, "step": 3025, "time_per_iteration": 2.6526923179626465 }, { "auxiliary_loss_clip": 0.01149678, "auxiliary_loss_mlp": 0.01056814, "balance_loss_clip": 1.05173373, "balance_loss_mlp": 1.03476, "epoch": 0.36385498707388925, "flos": 29533941083520.0, "grad_norm": 2.0779865394021786, "language_loss": 0.76056892, "learning_rate": 2.9405170297491395e-06, "loss": 0.78263384, "num_input_tokens_seen": 64987155, "step": 3026, "time_per_iteration": 3.7566850185394287 }, { "auxiliary_loss_clip": 0.01088693, "auxiliary_loss_mlp": 0.00774836, "balance_loss_clip": 1.04767156, "balance_loss_mlp": 1.0002799, "epoch": 0.36397522996452836, "flos": 22236569925120.0, "grad_norm": 2.0694715247921533, "language_loss": 0.80543453, "learning_rate": 2.939829493583353e-06, "loss": 0.82406986, "num_input_tokens_seen": 65003800, "step": 3027, "time_per_iteration": 2.9190967082977295 }, { "auxiliary_loss_clip": 0.01116543, "auxiliary_loss_mlp": 0.01047454, "balance_loss_clip": 1.04539013, "balance_loss_mlp": 1.02733183, "epoch": 0.3640954728551674, "flos": 21506505995520.0, "grad_norm": 2.5374159742480593, "language_loss": 0.82843804, "learning_rate": 2.939141814846179e-06, "loss": 0.85007799, "num_input_tokens_seen": 65021215, "step": 3028, "time_per_iteration": 2.768193244934082 }, { "auxiliary_loss_clip": 0.01137626, "auxiliary_loss_mlp": 0.01046783, "balance_loss_clip": 1.04915094, "balance_loss_mlp": 1.02726805, "epoch": 0.3642157157458065, "flos": 17712867081600.0, "grad_norm": 3.9324128877693627, "language_loss": 0.82199281, "learning_rate": 2.938453993641938e-06, "loss": 0.8438369, "num_input_tokens_seen": 65039590, "step": 3029, "time_per_iteration": 3.634014844894409 }, { "auxiliary_loss_clip": 0.01136883, "auxiliary_loss_mlp": 0.01045762, "balance_loss_clip": 1.05233955, "balance_loss_mlp": 1.02641463, "epoch": 0.36433595863644563, "flos": 17639537466240.0, "grad_norm": 2.139738273343421, "language_loss": 0.70460731, "learning_rate": 2.937766030074973e-06, "loss": 0.72643375, "num_input_tokens_seen": 65056845, "step": 3030, "time_per_iteration": 2.757909059524536 }, { "auxiliary_loss_clip": 0.01130924, "auxiliary_loss_mlp": 0.01056204, "balance_loss_clip": 1.05073416, "balance_loss_mlp": 1.03574777, "epoch": 0.3644562015270847, "flos": 26833279161600.0, "grad_norm": 2.148484075954788, "language_loss": 0.82491338, "learning_rate": 2.937077924249646e-06, "loss": 0.84678465, "num_input_tokens_seen": 65079435, "step": 3031, "time_per_iteration": 4.396740198135376 }, { "auxiliary_loss_clip": 0.01140104, "auxiliary_loss_mlp": 0.01049699, "balance_loss_clip": 1.04968405, "balance_loss_mlp": 1.02960014, "epoch": 0.3645764444177238, "flos": 14282715847680.0, "grad_norm": 2.7664003342799846, "language_loss": 0.75655985, "learning_rate": 2.9363896762703443e-06, "loss": 0.77845788, "num_input_tokens_seen": 65096500, "step": 3032, "time_per_iteration": 2.632689952850342 }, { "auxiliary_loss_clip": 0.01164254, "auxiliary_loss_mlp": 0.01048224, "balance_loss_clip": 1.0535717, "balance_loss_mlp": 1.0273031, "epoch": 0.3646966873083629, "flos": 20667489137280.0, "grad_norm": 1.742969770140816, "language_loss": 0.84524423, "learning_rate": 2.9357012862414725e-06, "loss": 0.86736906, "num_input_tokens_seen": 65115860, "step": 3033, "time_per_iteration": 2.611147403717041 }, { "auxiliary_loss_clip": 0.01153117, "auxiliary_loss_mlp": 0.01044543, "balance_loss_clip": 1.05483794, "balance_loss_mlp": 1.0255295, "epoch": 0.36481693019900197, "flos": 27782613665280.0, "grad_norm": 1.9572092892006452, "language_loss": 0.71826762, "learning_rate": 2.9350127542674593e-06, "loss": 0.74024421, "num_input_tokens_seen": 65138070, "step": 3034, "time_per_iteration": 2.769723415374756 }, { "auxiliary_loss_clip": 0.0114425, "auxiliary_loss_mlp": 0.01044036, "balance_loss_clip": 1.0530647, "balance_loss_mlp": 1.02416372, "epoch": 0.3649371730896411, "flos": 19712588025600.0, "grad_norm": 2.039516219478625, "language_loss": 0.76531923, "learning_rate": 2.934324080452755e-06, "loss": 0.78720212, "num_input_tokens_seen": 65155860, "step": 3035, "time_per_iteration": 3.6211235523223877 }, { "auxiliary_loss_clip": 0.01113457, "auxiliary_loss_mlp": 0.00777863, "balance_loss_clip": 1.04274321, "balance_loss_mlp": 1.00030899, "epoch": 0.3650574159802802, "flos": 24750496016640.0, "grad_norm": 1.4386360007317358, "language_loss": 0.78124404, "learning_rate": 2.9336352649018307e-06, "loss": 0.80015719, "num_input_tokens_seen": 65175930, "step": 3036, "time_per_iteration": 2.741374969482422 }, { "auxiliary_loss_clip": 0.01138699, "auxiliary_loss_mlp": 0.0104544, "balance_loss_clip": 1.05202711, "balance_loss_mlp": 1.02420843, "epoch": 0.36517765887091924, "flos": 32853487363200.0, "grad_norm": 2.095863756485924, "language_loss": 0.70197034, "learning_rate": 2.9329463077191783e-06, "loss": 0.72381175, "num_input_tokens_seen": 65199305, "step": 3037, "time_per_iteration": 2.779229164123535 }, { "auxiliary_loss_clip": 0.01111906, "auxiliary_loss_mlp": 0.01057955, "balance_loss_clip": 1.04790771, "balance_loss_mlp": 1.0334574, "epoch": 0.36529790176155835, "flos": 20120318282880.0, "grad_norm": 3.6342065077488375, "language_loss": 0.63995969, "learning_rate": 2.9322572090093135e-06, "loss": 0.66165829, "num_input_tokens_seen": 65218010, "step": 3038, "time_per_iteration": 2.751215934753418 }, { "auxiliary_loss_clip": 0.01109097, "auxiliary_loss_mlp": 0.01048372, "balance_loss_clip": 1.04577112, "balance_loss_mlp": 1.02779651, "epoch": 0.36541814465219746, "flos": 17639573379840.0, "grad_norm": 3.262123845454111, "language_loss": 0.76425266, "learning_rate": 2.9315679688767713e-06, "loss": 0.7858274, "num_input_tokens_seen": 65236020, "step": 3039, "time_per_iteration": 2.686016321182251 }, { "auxiliary_loss_clip": 0.01134047, "auxiliary_loss_mlp": 0.01059485, "balance_loss_clip": 1.05041516, "balance_loss_mlp": 1.03850389, "epoch": 0.3655383875428365, "flos": 22674356887680.0, "grad_norm": 1.6575253209291143, "language_loss": 0.66731668, "learning_rate": 2.9308785874261085e-06, "loss": 0.68925202, "num_input_tokens_seen": 65256210, "step": 3040, "time_per_iteration": 2.646735906600952 }, { "auxiliary_loss_clip": 0.01166248, "auxiliary_loss_mlp": 0.01047562, "balance_loss_clip": 1.05608201, "balance_loss_mlp": 1.0274756, "epoch": 0.36565863043347563, "flos": 21981173247360.0, "grad_norm": 1.7571953084595155, "language_loss": 0.81866086, "learning_rate": 2.9301890647619045e-06, "loss": 0.84079891, "num_input_tokens_seen": 65275505, "step": 3041, "time_per_iteration": 2.5967042446136475 }, { "auxiliary_loss_clip": 0.01143274, "auxiliary_loss_mlp": 0.01053065, "balance_loss_clip": 1.05249143, "balance_loss_mlp": 1.03293049, "epoch": 0.36577887332411474, "flos": 24827632473600.0, "grad_norm": 3.9708392776433112, "language_loss": 0.80429196, "learning_rate": 2.929499400988759e-06, "loss": 0.82625532, "num_input_tokens_seen": 65296665, "step": 3042, "time_per_iteration": 2.730752468109131 }, { "auxiliary_loss_clip": 0.01152038, "auxiliary_loss_mlp": 0.01056291, "balance_loss_clip": 1.05308867, "balance_loss_mlp": 1.03566778, "epoch": 0.3658991162147538, "flos": 28293191539200.0, "grad_norm": 2.107847850352803, "language_loss": 0.64830297, "learning_rate": 2.9288095962112927e-06, "loss": 0.67038631, "num_input_tokens_seen": 65317370, "step": 3043, "time_per_iteration": 2.6413567066192627 }, { "auxiliary_loss_clip": 0.01166156, "auxiliary_loss_mlp": 0.01040497, "balance_loss_clip": 1.05689025, "balance_loss_mlp": 1.01993322, "epoch": 0.3660193591053929, "flos": 17785550252160.0, "grad_norm": 2.1201868061375544, "language_loss": 0.85410285, "learning_rate": 2.9281196505341503e-06, "loss": 0.87616944, "num_input_tokens_seen": 65334540, "step": 3044, "time_per_iteration": 2.563852548599243 }, { "auxiliary_loss_clip": 0.01107251, "auxiliary_loss_mlp": 0.00775613, "balance_loss_clip": 1.04841089, "balance_loss_mlp": 1.00021911, "epoch": 0.36613960199603196, "flos": 10342776839040.0, "grad_norm": 2.5941767645045326, "language_loss": 0.78860152, "learning_rate": 2.9274295640619946e-06, "loss": 0.80743015, "num_input_tokens_seen": 65351670, "step": 3045, "time_per_iteration": 2.694221019744873 }, { "auxiliary_loss_clip": 0.01127866, "auxiliary_loss_mlp": 0.01049205, "balance_loss_clip": 1.04797173, "balance_loss_mlp": 1.02898729, "epoch": 0.36625984488667107, "flos": 19755609540480.0, "grad_norm": 2.0385141468944115, "language_loss": 0.78810608, "learning_rate": 2.9267393368995103e-06, "loss": 0.8098768, "num_input_tokens_seen": 65370900, "step": 3046, "time_per_iteration": 2.6613330841064453 }, { "auxiliary_loss_clip": 0.01165155, "auxiliary_loss_mlp": 0.01056213, "balance_loss_clip": 1.05657423, "balance_loss_mlp": 1.03603053, "epoch": 0.3663800877773102, "flos": 17674262939520.0, "grad_norm": 2.8949716536366616, "language_loss": 0.74624169, "learning_rate": 2.926048969151407e-06, "loss": 0.76845539, "num_input_tokens_seen": 65388185, "step": 3047, "time_per_iteration": 2.5187788009643555 }, { "auxiliary_loss_clip": 0.01110347, "auxiliary_loss_mlp": 0.01056028, "balance_loss_clip": 1.05025005, "balance_loss_mlp": 1.03633475, "epoch": 0.36650033066794924, "flos": 20303606407680.0, "grad_norm": 1.8051018683480287, "language_loss": 0.6851567, "learning_rate": 2.92535846092241e-06, "loss": 0.70682049, "num_input_tokens_seen": 65407200, "step": 3048, "time_per_iteration": 2.717883825302124 }, { "auxiliary_loss_clip": 0.01145348, "auxiliary_loss_mlp": 0.0106017, "balance_loss_clip": 1.05433989, "balance_loss_mlp": 1.03911746, "epoch": 0.36662057355858835, "flos": 24716237420160.0, "grad_norm": 5.292011019217305, "language_loss": 0.82689583, "learning_rate": 2.9246678123172704e-06, "loss": 0.8489511, "num_input_tokens_seen": 65427290, "step": 3049, "time_per_iteration": 2.704925537109375 }, { "auxiliary_loss_clip": 0.01167759, "auxiliary_loss_mlp": 0.01048722, "balance_loss_clip": 1.05465877, "balance_loss_mlp": 1.02825427, "epoch": 0.36674081644922746, "flos": 12385267902720.0, "grad_norm": 2.5980082101885746, "language_loss": 0.74735117, "learning_rate": 2.9239770234407596e-06, "loss": 0.76951605, "num_input_tokens_seen": 65445595, "step": 3050, "time_per_iteration": 2.5954980850219727 }, { "auxiliary_loss_clip": 0.01154965, "auxiliary_loss_mlp": 0.01041566, "balance_loss_clip": 1.0541544, "balance_loss_mlp": 1.02095485, "epoch": 0.3668610593398665, "flos": 21105922544640.0, "grad_norm": 1.8159860208114271, "language_loss": 0.68366849, "learning_rate": 2.9232860943976686e-06, "loss": 0.70563376, "num_input_tokens_seen": 65466330, "step": 3051, "time_per_iteration": 2.591996192932129 }, { "auxiliary_loss_clip": 0.0114245, "auxiliary_loss_mlp": 0.01041987, "balance_loss_clip": 1.05621338, "balance_loss_mlp": 1.02252018, "epoch": 0.3669813022305056, "flos": 26758082039040.0, "grad_norm": 1.657007143204837, "language_loss": 0.84209526, "learning_rate": 2.9225950252928115e-06, "loss": 0.86393964, "num_input_tokens_seen": 65487180, "step": 3052, "time_per_iteration": 3.531677484512329 }, { "auxiliary_loss_clip": 0.01149782, "auxiliary_loss_mlp": 0.01046052, "balance_loss_clip": 1.05235696, "balance_loss_mlp": 1.02625132, "epoch": 0.36710154512114473, "flos": 19099521671040.0, "grad_norm": 3.4897957545212255, "language_loss": 0.81615782, "learning_rate": 2.9219038162310217e-06, "loss": 0.83811605, "num_input_tokens_seen": 65505380, "step": 3053, "time_per_iteration": 2.5118906497955322 }, { "auxiliary_loss_clip": 0.01085726, "auxiliary_loss_mlp": 0.00776456, "balance_loss_clip": 1.04409122, "balance_loss_mlp": 1.0002296, "epoch": 0.3672217880117838, "flos": 20812029465600.0, "grad_norm": 2.0779086444073678, "language_loss": 0.824889, "learning_rate": 2.921212467317157e-06, "loss": 0.84351087, "num_input_tokens_seen": 65524825, "step": 3054, "time_per_iteration": 2.75626802444458 }, { "auxiliary_loss_clip": 0.0112485, "auxiliary_loss_mlp": 0.01058271, "balance_loss_clip": 1.04741848, "balance_loss_mlp": 1.03224719, "epoch": 0.3673420309024229, "flos": 13590394133760.0, "grad_norm": 3.3173827011952812, "language_loss": 0.80066228, "learning_rate": 2.920520978656093e-06, "loss": 0.82249349, "num_input_tokens_seen": 65541790, "step": 3055, "time_per_iteration": 3.8203766345977783 }, { "auxiliary_loss_clip": 0.01166339, "auxiliary_loss_mlp": 0.00774793, "balance_loss_clip": 1.05834198, "balance_loss_mlp": 1.00027514, "epoch": 0.367462273793062, "flos": 28986877969920.0, "grad_norm": 2.032965879587134, "language_loss": 0.76787484, "learning_rate": 2.919829350352729e-06, "loss": 0.78728616, "num_input_tokens_seen": 65563395, "step": 3056, "time_per_iteration": 2.6590652465820312 }, { "auxiliary_loss_clip": 0.01074634, "auxiliary_loss_mlp": 0.01011506, "balance_loss_clip": 1.03759074, "balance_loss_mlp": 1.00905061, "epoch": 0.36758251668370107, "flos": 62643148346880.0, "grad_norm": 0.7670938104502283, "language_loss": 0.59954023, "learning_rate": 2.919137582511983e-06, "loss": 0.62040168, "num_input_tokens_seen": 65619835, "step": 3057, "time_per_iteration": 4.1025238037109375 }, { "auxiliary_loss_clip": 0.01133211, "auxiliary_loss_mlp": 0.01051718, "balance_loss_clip": 1.05274796, "balance_loss_mlp": 1.0319171, "epoch": 0.3677027595743402, "flos": 12713886455040.0, "grad_norm": 2.346636951702379, "language_loss": 0.64097416, "learning_rate": 2.918445675238797e-06, "loss": 0.66282338, "num_input_tokens_seen": 65636760, "step": 3058, "time_per_iteration": 2.6641616821289062 }, { "auxiliary_loss_clip": 0.01165809, "auxiliary_loss_mlp": 0.01054548, "balance_loss_clip": 1.0527792, "balance_loss_mlp": 1.03374612, "epoch": 0.36782300246497923, "flos": 25046579825280.0, "grad_norm": 2.438985193019249, "language_loss": 0.69968772, "learning_rate": 2.917753628638132e-06, "loss": 0.72189128, "num_input_tokens_seen": 65657065, "step": 3059, "time_per_iteration": 2.6123099327087402 }, { "auxiliary_loss_clip": 0.01139881, "auxiliary_loss_mlp": 0.01053264, "balance_loss_clip": 1.0516609, "balance_loss_mlp": 1.03277206, "epoch": 0.36794324535561834, "flos": 17419512706560.0, "grad_norm": 2.394525485317144, "language_loss": 0.70372123, "learning_rate": 2.9170614428149716e-06, "loss": 0.72565269, "num_input_tokens_seen": 65675400, "step": 3060, "time_per_iteration": 2.642340660095215 }, { "auxiliary_loss_clip": 0.01129377, "auxiliary_loss_mlp": 0.01056355, "balance_loss_clip": 1.05248451, "balance_loss_mlp": 1.03412247, "epoch": 0.36806348824625745, "flos": 24089128848000.0, "grad_norm": 2.604442295807965, "language_loss": 0.86810839, "learning_rate": 2.9163691178743195e-06, "loss": 0.88996577, "num_input_tokens_seen": 65694050, "step": 3061, "time_per_iteration": 3.6186881065368652 }, { "auxiliary_loss_clip": 0.01149417, "auxiliary_loss_mlp": 0.01050347, "balance_loss_clip": 1.05310214, "balance_loss_mlp": 1.03157175, "epoch": 0.3681837311368965, "flos": 20521871400960.0, "grad_norm": 1.985887939013417, "language_loss": 0.77402264, "learning_rate": 2.9156766539212006e-06, "loss": 0.79602027, "num_input_tokens_seen": 65711695, "step": 3062, "time_per_iteration": 2.62459135055542 }, { "auxiliary_loss_clip": 0.01152573, "auxiliary_loss_mlp": 0.01044046, "balance_loss_clip": 1.05070841, "balance_loss_mlp": 1.0247463, "epoch": 0.3683039740275356, "flos": 21466644877440.0, "grad_norm": 2.0395642371954557, "language_loss": 0.72069865, "learning_rate": 2.9149840510606614e-06, "loss": 0.74266481, "num_input_tokens_seen": 65730350, "step": 3063, "time_per_iteration": 2.631171464920044 }, { "auxiliary_loss_clip": 0.01059209, "auxiliary_loss_mlp": 0.00756331, "balance_loss_clip": 1.03265131, "balance_loss_mlp": 1.00060868, "epoch": 0.36842421691817473, "flos": 70380999987840.0, "grad_norm": 1.013882068607707, "language_loss": 0.64117622, "learning_rate": 2.914291309397769e-06, "loss": 0.65933168, "num_input_tokens_seen": 65787820, "step": 3064, "time_per_iteration": 3.301393508911133 }, { "auxiliary_loss_clip": 0.0108285, "auxiliary_loss_mlp": 0.01050206, "balance_loss_clip": 1.03939486, "balance_loss_mlp": 1.02803326, "epoch": 0.3685444598088138, "flos": 23331378510720.0, "grad_norm": 2.7054045926493755, "language_loss": 0.78814793, "learning_rate": 2.9135984290376117e-06, "loss": 0.80947852, "num_input_tokens_seen": 65806685, "step": 3065, "time_per_iteration": 2.7843005657196045 }, { "auxiliary_loss_clip": 0.01101012, "auxiliary_loss_mlp": 0.01054858, "balance_loss_clip": 1.0480684, "balance_loss_mlp": 1.03460455, "epoch": 0.3686647026994529, "flos": 23070271570560.0, "grad_norm": 1.8037111122136096, "language_loss": 0.82502627, "learning_rate": 2.9129054100853e-06, "loss": 0.84658492, "num_input_tokens_seen": 65825525, "step": 3066, "time_per_iteration": 2.7667593955993652 }, { "auxiliary_loss_clip": 0.01143943, "auxiliary_loss_mlp": 0.01048588, "balance_loss_clip": 1.05151308, "balance_loss_mlp": 1.02806044, "epoch": 0.368784945590092, "flos": 25119909440640.0, "grad_norm": 1.7168383349902403, "language_loss": 0.76032293, "learning_rate": 2.912212252645963e-06, "loss": 0.78224826, "num_input_tokens_seen": 65848110, "step": 3067, "time_per_iteration": 2.735621452331543 }, { "auxiliary_loss_clip": 0.01158253, "auxiliary_loss_mlp": 0.01050384, "balance_loss_clip": 1.05255175, "balance_loss_mlp": 1.02902174, "epoch": 0.36890518848073106, "flos": 18442284566400.0, "grad_norm": 2.4251885770057244, "language_loss": 0.76675737, "learning_rate": 2.9115189568247523e-06, "loss": 0.78884375, "num_input_tokens_seen": 65865670, "step": 3068, "time_per_iteration": 2.5922982692718506 }, { "auxiliary_loss_clip": 0.01100047, "auxiliary_loss_mlp": 0.01045499, "balance_loss_clip": 1.04891419, "balance_loss_mlp": 1.02503061, "epoch": 0.36902543137137017, "flos": 16362446336640.0, "grad_norm": 2.012200662036113, "language_loss": 0.92406404, "learning_rate": 2.910825522726841e-06, "loss": 0.94551951, "num_input_tokens_seen": 65883195, "step": 3069, "time_per_iteration": 2.695699453353882 }, { "auxiliary_loss_clip": 0.01106897, "auxiliary_loss_mlp": 0.01051406, "balance_loss_clip": 1.04678333, "balance_loss_mlp": 1.0312717, "epoch": 0.3691456742620093, "flos": 12275596702080.0, "grad_norm": 2.833174029574726, "language_loss": 0.77330947, "learning_rate": 2.9101319504574215e-06, "loss": 0.79489243, "num_input_tokens_seen": 65899635, "step": 3070, "time_per_iteration": 2.7017440795898438 }, { "auxiliary_loss_clip": 0.01139518, "auxiliary_loss_mlp": 0.01047082, "balance_loss_clip": 1.04730225, "balance_loss_mlp": 1.02645826, "epoch": 0.36926591715264834, "flos": 17786412178560.0, "grad_norm": 2.6744092169750613, "language_loss": 0.7590363, "learning_rate": 2.909438240121709e-06, "loss": 0.78090227, "num_input_tokens_seen": 65919910, "step": 3071, "time_per_iteration": 2.6811001300811768 }, { "auxiliary_loss_clip": 0.01131818, "auxiliary_loss_mlp": 0.01045671, "balance_loss_clip": 1.05161238, "balance_loss_mlp": 1.02639532, "epoch": 0.36938616004328745, "flos": 28948309741440.0, "grad_norm": 1.7792100329557359, "language_loss": 0.70005703, "learning_rate": 2.908744391824939e-06, "loss": 0.72183192, "num_input_tokens_seen": 65940930, "step": 3072, "time_per_iteration": 2.727421283721924 }, { "auxiliary_loss_clip": 0.01100496, "auxiliary_loss_mlp": 0.01046129, "balance_loss_clip": 1.04353261, "balance_loss_mlp": 1.02569687, "epoch": 0.36950640293392656, "flos": 29205394358400.0, "grad_norm": 1.8989377313909603, "language_loss": 0.79326713, "learning_rate": 2.908050405672367e-06, "loss": 0.81473339, "num_input_tokens_seen": 65960475, "step": 3073, "time_per_iteration": 2.7923364639282227 }, { "auxiliary_loss_clip": 0.01142399, "auxiliary_loss_mlp": 0.01051834, "balance_loss_clip": 1.0481863, "balance_loss_mlp": 1.03127015, "epoch": 0.3696266458245656, "flos": 24827776128000.0, "grad_norm": 2.072893861305551, "language_loss": 0.79427785, "learning_rate": 2.9073562817692703e-06, "loss": 0.81622016, "num_input_tokens_seen": 65979160, "step": 3074, "time_per_iteration": 2.6824851036071777 }, { "auxiliary_loss_clip": 0.01030764, "auxiliary_loss_mlp": 0.01003998, "balance_loss_clip": 1.02864718, "balance_loss_mlp": 1.00141144, "epoch": 0.3697468887152047, "flos": 59887257264000.0, "grad_norm": 0.7343075359141303, "language_loss": 0.56524497, "learning_rate": 2.9066620202209468e-06, "loss": 0.58559257, "num_input_tokens_seen": 66041650, "step": 3075, "time_per_iteration": 3.2417807579040527 }, { "auxiliary_loss_clip": 0.01113653, "auxiliary_loss_mlp": 0.01051659, "balance_loss_clip": 1.0460813, "balance_loss_mlp": 1.03049982, "epoch": 0.3698671316058438, "flos": 26137581569280.0, "grad_norm": 1.966596154940471, "language_loss": 0.7780785, "learning_rate": 2.905967621132716e-06, "loss": 0.79973161, "num_input_tokens_seen": 66059260, "step": 3076, "time_per_iteration": 2.7194628715515137 }, { "auxiliary_loss_clip": 0.01143285, "auxiliary_loss_mlp": 0.01050546, "balance_loss_clip": 1.04934359, "balance_loss_mlp": 1.02855229, "epoch": 0.3699873744964829, "flos": 24607464059520.0, "grad_norm": 2.159422795947731, "language_loss": 0.75434631, "learning_rate": 2.9052730846099172e-06, "loss": 0.77628464, "num_input_tokens_seen": 66080605, "step": 3077, "time_per_iteration": 2.7004384994506836 }, { "auxiliary_loss_clip": 0.01040679, "auxiliary_loss_mlp": 0.01009925, "balance_loss_clip": 1.02241147, "balance_loss_mlp": 1.0077914, "epoch": 0.370107617387122, "flos": 64885340050560.0, "grad_norm": 0.8506702795287231, "language_loss": 0.60845339, "learning_rate": 2.9045784107579123e-06, "loss": 0.62895942, "num_input_tokens_seen": 66140710, "step": 3078, "time_per_iteration": 4.194579124450684 }, { "auxiliary_loss_clip": 0.01164526, "auxiliary_loss_mlp": 0.01046876, "balance_loss_clip": 1.05307674, "balance_loss_mlp": 1.02573979, "epoch": 0.37022786027776106, "flos": 15961683317760.0, "grad_norm": 1.9024805089245636, "language_loss": 0.66971755, "learning_rate": 2.9038835996820807e-06, "loss": 0.69183159, "num_input_tokens_seen": 66158320, "step": 3079, "time_per_iteration": 2.6172006130218506 }, { "auxiliary_loss_clip": 0.01127256, "auxiliary_loss_mlp": 0.01058059, "balance_loss_clip": 1.04673994, "balance_loss_mlp": 1.03912854, "epoch": 0.37034810316840017, "flos": 18546927863040.0, "grad_norm": 2.4818460621992764, "language_loss": 0.79649156, "learning_rate": 2.903188651487826e-06, "loss": 0.81834471, "num_input_tokens_seen": 66176875, "step": 3080, "time_per_iteration": 2.7347214221954346 }, { "auxiliary_loss_clip": 0.01152287, "auxiliary_loss_mlp": 0.01052462, "balance_loss_clip": 1.05232263, "balance_loss_mlp": 1.03288758, "epoch": 0.3704683460590393, "flos": 17821927751040.0, "grad_norm": 2.2622123439322297, "language_loss": 0.87165821, "learning_rate": 2.902493566280571e-06, "loss": 0.89370573, "num_input_tokens_seen": 66194980, "step": 3081, "time_per_iteration": 3.4978952407836914 }, { "auxiliary_loss_clip": 0.01138787, "auxiliary_loss_mlp": 0.01056821, "balance_loss_clip": 1.04998577, "balance_loss_mlp": 1.03628111, "epoch": 0.37058858894967833, "flos": 14134081368960.0, "grad_norm": 2.3162184819374354, "language_loss": 0.81255829, "learning_rate": 2.9017983441657595e-06, "loss": 0.83451432, "num_input_tokens_seen": 66212310, "step": 3082, "time_per_iteration": 2.6434884071350098 }, { "auxiliary_loss_clip": 0.011126, "auxiliary_loss_mlp": 0.01055459, "balance_loss_clip": 1.0456655, "balance_loss_mlp": 1.03381085, "epoch": 0.37070883184031744, "flos": 13954492344960.0, "grad_norm": 2.8108268844766235, "language_loss": 0.75327861, "learning_rate": 2.9011029852488564e-06, "loss": 0.77495921, "num_input_tokens_seen": 66229545, "step": 3083, "time_per_iteration": 3.6848363876342773 }, { "auxiliary_loss_clip": 0.01061596, "auxiliary_loss_mlp": 0.01004208, "balance_loss_clip": 1.02453279, "balance_loss_mlp": 1.00181198, "epoch": 0.37082907473095655, "flos": 52315419306240.0, "grad_norm": 1.0122517250898082, "language_loss": 0.62454307, "learning_rate": 2.9004074896353465e-06, "loss": 0.64520109, "num_input_tokens_seen": 66283545, "step": 3084, "time_per_iteration": 3.0689477920532227 }, { "auxiliary_loss_clip": 0.01159228, "auxiliary_loss_mlp": 0.01049793, "balance_loss_clip": 1.05391026, "balance_loss_mlp": 1.03101754, "epoch": 0.3709493176215956, "flos": 15998096730240.0, "grad_norm": 2.0984481980304497, "language_loss": 0.81257439, "learning_rate": 2.8997118574307362e-06, "loss": 0.83466458, "num_input_tokens_seen": 66300500, "step": 3085, "time_per_iteration": 2.5775508880615234 }, { "auxiliary_loss_clip": 0.01132458, "auxiliary_loss_mlp": 0.01052812, "balance_loss_clip": 1.04983616, "balance_loss_mlp": 1.03284454, "epoch": 0.3710695605122347, "flos": 20959837931520.0, "grad_norm": 2.187025480461133, "language_loss": 0.74278551, "learning_rate": 2.899016088740553e-06, "loss": 0.76463819, "num_input_tokens_seen": 66318610, "step": 3086, "time_per_iteration": 2.6590073108673096 }, { "auxiliary_loss_clip": 0.01110356, "auxiliary_loss_mlp": 0.01048066, "balance_loss_clip": 1.04652262, "balance_loss_mlp": 1.02859926, "epoch": 0.37118980340287383, "flos": 14355578586240.0, "grad_norm": 2.2356754209312553, "language_loss": 0.79154313, "learning_rate": 2.898320183670344e-06, "loss": 0.81312734, "num_input_tokens_seen": 66336025, "step": 3087, "time_per_iteration": 3.648120403289795 }, { "auxiliary_loss_clip": 0.0111342, "auxiliary_loss_mlp": 0.01057824, "balance_loss_clip": 1.05175066, "balance_loss_mlp": 1.03696227, "epoch": 0.3713100462935129, "flos": 25885381201920.0, "grad_norm": 1.9135344937863459, "language_loss": 0.88656032, "learning_rate": 2.8976241423256767e-06, "loss": 0.90827274, "num_input_tokens_seen": 66356120, "step": 3088, "time_per_iteration": 2.7567577362060547 }, { "auxiliary_loss_clip": 0.01126524, "auxiliary_loss_mlp": 0.01062125, "balance_loss_clip": 1.0450592, "balance_loss_mlp": 1.04169273, "epoch": 0.371430289184152, "flos": 30518934814080.0, "grad_norm": 3.1813807926767175, "language_loss": 0.68215591, "learning_rate": 2.896927964812142e-06, "loss": 0.70404243, "num_input_tokens_seen": 66376685, "step": 3089, "time_per_iteration": 2.758586883544922 }, { "auxiliary_loss_clip": 0.01135963, "auxiliary_loss_mlp": 0.0105959, "balance_loss_clip": 1.05243933, "balance_loss_mlp": 1.0395149, "epoch": 0.37155053207479105, "flos": 15742233175680.0, "grad_norm": 2.519068682410345, "language_loss": 0.75188327, "learning_rate": 2.8962316512353465e-06, "loss": 0.77383876, "num_input_tokens_seen": 66394230, "step": 3090, "time_per_iteration": 2.617279529571533 }, { "auxiliary_loss_clip": 0.01096614, "auxiliary_loss_mlp": 0.01053229, "balance_loss_clip": 1.04464805, "balance_loss_mlp": 1.03191423, "epoch": 0.37167077496543016, "flos": 23404061681280.0, "grad_norm": 1.7868350008556457, "language_loss": 0.75105178, "learning_rate": 2.8955352017009233e-06, "loss": 0.77255017, "num_input_tokens_seen": 66413475, "step": 3091, "time_per_iteration": 2.7706828117370605 }, { "auxiliary_loss_clip": 0.01136485, "auxiliary_loss_mlp": 0.01059847, "balance_loss_clip": 1.0501771, "balance_loss_mlp": 1.0396533, "epoch": 0.3717910178560693, "flos": 22088653718400.0, "grad_norm": 1.9108979588609434, "language_loss": 0.77491015, "learning_rate": 2.8948386163145212e-06, "loss": 0.79687345, "num_input_tokens_seen": 66432685, "step": 3092, "time_per_iteration": 2.7097859382629395 }, { "auxiliary_loss_clip": 0.0114646, "auxiliary_loss_mlp": 0.01055848, "balance_loss_clip": 1.04942143, "balance_loss_mlp": 1.03725171, "epoch": 0.3719112607467083, "flos": 26939969533440.0, "grad_norm": 1.9035421579766036, "language_loss": 0.79400814, "learning_rate": 2.8941418951818135e-06, "loss": 0.81603122, "num_input_tokens_seen": 66452245, "step": 3093, "time_per_iteration": 2.6472928524017334 }, { "auxiliary_loss_clip": 0.01124589, "auxiliary_loss_mlp": 0.01050678, "balance_loss_clip": 1.04714513, "balance_loss_mlp": 1.03097296, "epoch": 0.37203150363734744, "flos": 12166500119040.0, "grad_norm": 3.066183233652135, "language_loss": 0.71384233, "learning_rate": 2.8934450384084903e-06, "loss": 0.73559499, "num_input_tokens_seen": 66469760, "step": 3094, "time_per_iteration": 2.6759958267211914 }, { "auxiliary_loss_clip": 0.01128979, "auxiliary_loss_mlp": 0.01064515, "balance_loss_clip": 1.04742312, "balance_loss_mlp": 1.04259193, "epoch": 0.37215174652798655, "flos": 23697595624320.0, "grad_norm": 2.577712659248644, "language_loss": 0.69517565, "learning_rate": 2.8927480461002653e-06, "loss": 0.71711063, "num_input_tokens_seen": 66489730, "step": 3095, "time_per_iteration": 2.6627612113952637 }, { "auxiliary_loss_clip": 0.01135682, "auxiliary_loss_mlp": 0.01059417, "balance_loss_clip": 1.04619992, "balance_loss_mlp": 1.0362426, "epoch": 0.3722719894186256, "flos": 17887751424000.0, "grad_norm": 2.1973603638613293, "language_loss": 0.86229461, "learning_rate": 2.892050918362872e-06, "loss": 0.88424563, "num_input_tokens_seen": 66504785, "step": 3096, "time_per_iteration": 2.6424458026885986 }, { "auxiliary_loss_clip": 0.01015983, "auxiliary_loss_mlp": 0.01017731, "balance_loss_clip": 1.03685641, "balance_loss_mlp": 1.01484585, "epoch": 0.3723922323092647, "flos": 62419891363200.0, "grad_norm": 0.8539718174348975, "language_loss": 0.55875766, "learning_rate": 2.8913536553020626e-06, "loss": 0.57909483, "num_input_tokens_seen": 66558840, "step": 3097, "time_per_iteration": 3.476919651031494 }, { "auxiliary_loss_clip": 0.01099304, "auxiliary_loss_mlp": 0.01068525, "balance_loss_clip": 1.03982902, "balance_loss_mlp": 1.04564893, "epoch": 0.3725124751999038, "flos": 23039747988480.0, "grad_norm": 2.2250572661810266, "language_loss": 0.84847206, "learning_rate": 2.8906562570236137e-06, "loss": 0.87015033, "num_input_tokens_seen": 66576750, "step": 3098, "time_per_iteration": 3.0370166301727295 }, { "auxiliary_loss_clip": 0.01092675, "auxiliary_loss_mlp": 0.01045929, "balance_loss_clip": 1.04220915, "balance_loss_mlp": 1.02681923, "epoch": 0.3726327180905429, "flos": 20920551431040.0, "grad_norm": 1.5529932476085064, "language_loss": 0.76512277, "learning_rate": 2.889958723633318e-06, "loss": 0.78650886, "num_input_tokens_seen": 66595690, "step": 3099, "time_per_iteration": 2.7612111568450928 }, { "auxiliary_loss_clip": 0.01124006, "auxiliary_loss_mlp": 0.01051061, "balance_loss_clip": 1.04885292, "balance_loss_mlp": 1.03279805, "epoch": 0.372752960981182, "flos": 30592156688640.0, "grad_norm": 1.88186801916178, "language_loss": 0.73974311, "learning_rate": 2.889261055236992e-06, "loss": 0.76149374, "num_input_tokens_seen": 66617905, "step": 3100, "time_per_iteration": 2.7492775917053223 }, { "auxiliary_loss_clip": 0.01139369, "auxiliary_loss_mlp": 0.01046699, "balance_loss_clip": 1.05430567, "balance_loss_mlp": 1.02785242, "epoch": 0.3728732038718211, "flos": 25116749043840.0, "grad_norm": 2.478799721847512, "language_loss": 0.8251313, "learning_rate": 2.8885632519404704e-06, "loss": 0.84699202, "num_input_tokens_seen": 66638175, "step": 3101, "time_per_iteration": 2.6912312507629395 }, { "auxiliary_loss_clip": 0.01133384, "auxiliary_loss_mlp": 0.01045536, "balance_loss_clip": 1.04878807, "balance_loss_mlp": 1.02610505, "epoch": 0.37299344676246016, "flos": 25302048330240.0, "grad_norm": 2.027268394087065, "language_loss": 0.75702786, "learning_rate": 2.8878653138496107e-06, "loss": 0.77881706, "num_input_tokens_seen": 66658670, "step": 3102, "time_per_iteration": 2.67328143119812 }, { "auxiliary_loss_clip": 0.01090782, "auxiliary_loss_mlp": 0.01060321, "balance_loss_clip": 1.03970194, "balance_loss_mlp": 1.03789794, "epoch": 0.37311368965309927, "flos": 23842531002240.0, "grad_norm": 2.279583090726877, "language_loss": 0.76815641, "learning_rate": 2.8871672410702878e-06, "loss": 0.78966743, "num_input_tokens_seen": 66676030, "step": 3103, "time_per_iteration": 2.795228958129883 }, { "auxiliary_loss_clip": 0.01135214, "auxiliary_loss_mlp": 0.01045637, "balance_loss_clip": 1.04876637, "balance_loss_mlp": 1.02595544, "epoch": 0.3732339325437384, "flos": 25811943845760.0, "grad_norm": 2.1112415333399106, "language_loss": 0.81969863, "learning_rate": 2.8864690337084008e-06, "loss": 0.84150714, "num_input_tokens_seen": 66695305, "step": 3104, "time_per_iteration": 3.86253023147583 }, { "auxiliary_loss_clip": 0.01142091, "auxiliary_loss_mlp": 0.01060012, "balance_loss_clip": 1.0485642, "balance_loss_mlp": 1.03828049, "epoch": 0.37335417543437743, "flos": 26208433146240.0, "grad_norm": 1.7786531340397331, "language_loss": 0.78355879, "learning_rate": 2.885770691869866e-06, "loss": 0.80557978, "num_input_tokens_seen": 66716185, "step": 3105, "time_per_iteration": 2.735905885696411 }, { "auxiliary_loss_clip": 0.01142277, "auxiliary_loss_mlp": 0.01052205, "balance_loss_clip": 1.04780173, "balance_loss_mlp": 1.03379965, "epoch": 0.37347441832501654, "flos": 24023879792640.0, "grad_norm": 8.006167041005668, "language_loss": 0.74142587, "learning_rate": 2.8850722156606207e-06, "loss": 0.76337069, "num_input_tokens_seen": 66734575, "step": 3106, "time_per_iteration": 3.6302576065063477 }, { "auxiliary_loss_clip": 0.01137785, "auxiliary_loss_mlp": 0.01051926, "balance_loss_clip": 1.04775763, "balance_loss_mlp": 1.03234029, "epoch": 0.3735946612156556, "flos": 19714922409600.0, "grad_norm": 1.6076540505102725, "language_loss": 0.67174047, "learning_rate": 2.8843736051866252e-06, "loss": 0.69363761, "num_input_tokens_seen": 66753500, "step": 3107, "time_per_iteration": 2.6227996349334717 }, { "auxiliary_loss_clip": 0.01109036, "auxiliary_loss_mlp": 0.00776142, "balance_loss_clip": 1.04522943, "balance_loss_mlp": 1.00029683, "epoch": 0.3737149041062947, "flos": 23039604334080.0, "grad_norm": 2.057782953791128, "language_loss": 0.69535917, "learning_rate": 2.8836748605538557e-06, "loss": 0.71421093, "num_input_tokens_seen": 66775140, "step": 3108, "time_per_iteration": 2.7966952323913574 }, { "auxiliary_loss_clip": 0.01140492, "auxiliary_loss_mlp": 0.01040376, "balance_loss_clip": 1.04898858, "balance_loss_mlp": 1.02088523, "epoch": 0.3738351469969338, "flos": 34678108483200.0, "grad_norm": 2.3616061289250796, "language_loss": 0.63115197, "learning_rate": 2.882975981868313e-06, "loss": 0.65296066, "num_input_tokens_seen": 66795525, "step": 3109, "time_per_iteration": 3.78651762008667 }, { "auxiliary_loss_clip": 0.0115018, "auxiliary_loss_mlp": 0.01049197, "balance_loss_clip": 1.05128014, "balance_loss_mlp": 1.02920532, "epoch": 0.3739553898875729, "flos": 43507967448960.0, "grad_norm": 2.2761036948432856, "language_loss": 0.68769813, "learning_rate": 2.882276969236016e-06, "loss": 0.70969194, "num_input_tokens_seen": 66816885, "step": 3110, "time_per_iteration": 2.8280327320098877 }, { "auxiliary_loss_clip": 0.01129319, "auxiliary_loss_mlp": 0.01045108, "balance_loss_clip": 1.04566026, "balance_loss_mlp": 1.02433002, "epoch": 0.374075632778212, "flos": 12856487448960.0, "grad_norm": 2.8566190372428224, "language_loss": 0.763699, "learning_rate": 2.881577822763005e-06, "loss": 0.78544331, "num_input_tokens_seen": 66834835, "step": 3111, "time_per_iteration": 2.6567161083221436 }, { "auxiliary_loss_clip": 0.01148162, "auxiliary_loss_mlp": 0.01047256, "balance_loss_clip": 1.04861736, "balance_loss_mlp": 1.02876687, "epoch": 0.3741958756688511, "flos": 26024031699840.0, "grad_norm": 1.8564666187493957, "language_loss": 0.87365609, "learning_rate": 2.880878542555338e-06, "loss": 0.89561027, "num_input_tokens_seen": 66852600, "step": 3112, "time_per_iteration": 2.6915364265441895 }, { "auxiliary_loss_clip": 0.01163871, "auxiliary_loss_mlp": 0.0104679, "balance_loss_clip": 1.0514493, "balance_loss_mlp": 1.0259999, "epoch": 0.37431611855949015, "flos": 21433894652160.0, "grad_norm": 2.377919745464871, "language_loss": 0.80515587, "learning_rate": 2.8801791287190976e-06, "loss": 0.82726246, "num_input_tokens_seen": 66870595, "step": 3113, "time_per_iteration": 3.5360279083251953 }, { "auxiliary_loss_clip": 0.01153659, "auxiliary_loss_mlp": 0.01049586, "balance_loss_clip": 1.05083525, "balance_loss_mlp": 1.02933216, "epoch": 0.37443636145012926, "flos": 24207096090240.0, "grad_norm": 3.0713082897269888, "language_loss": 0.86201322, "learning_rate": 2.8794795813603817e-06, "loss": 0.88404572, "num_input_tokens_seen": 66886060, "step": 3114, "time_per_iteration": 2.6215505599975586 }, { "auxiliary_loss_clip": 0.01150162, "auxiliary_loss_mlp": 0.01048546, "balance_loss_clip": 1.04938316, "balance_loss_mlp": 1.02847099, "epoch": 0.3745566043407684, "flos": 15378601841280.0, "grad_norm": 2.006478314243709, "language_loss": 0.81672883, "learning_rate": 2.878779900585314e-06, "loss": 0.83871591, "num_input_tokens_seen": 66903900, "step": 3115, "time_per_iteration": 2.5849149227142334 }, { "auxiliary_loss_clip": 0.01139028, "auxiliary_loss_mlp": 0.01043713, "balance_loss_clip": 1.05058682, "balance_loss_mlp": 1.02431798, "epoch": 0.37467684723140743, "flos": 24608218245120.0, "grad_norm": 1.7762210043909097, "language_loss": 0.75680315, "learning_rate": 2.8780800865000336e-06, "loss": 0.7786305, "num_input_tokens_seen": 66925210, "step": 3116, "time_per_iteration": 2.657369375228882 }, { "auxiliary_loss_clip": 0.01051071, "auxiliary_loss_mlp": 0.01001973, "balance_loss_clip": 1.02304792, "balance_loss_mlp": 0.9997679, "epoch": 0.37479709012204654, "flos": 64377491610240.0, "grad_norm": 0.9864802156413605, "language_loss": 0.59200418, "learning_rate": 2.877380139210702e-06, "loss": 0.61253452, "num_input_tokens_seen": 66983880, "step": 3117, "time_per_iteration": 3.1232550144195557 }, { "auxiliary_loss_clip": 0.01128236, "auxiliary_loss_mlp": 0.01040493, "balance_loss_clip": 1.04766262, "balance_loss_mlp": 1.01961946, "epoch": 0.37491733301268565, "flos": 23803962773760.0, "grad_norm": 1.9105494100054514, "language_loss": 0.76697469, "learning_rate": 2.876680058823501e-06, "loss": 0.78866202, "num_input_tokens_seen": 67004280, "step": 3118, "time_per_iteration": 2.7332637310028076 }, { "auxiliary_loss_clip": 0.01126064, "auxiliary_loss_mlp": 0.01046827, "balance_loss_clip": 1.0460037, "balance_loss_mlp": 1.02544045, "epoch": 0.3750375759033247, "flos": 32160950167680.0, "grad_norm": 2.16394891118166, "language_loss": 0.66064233, "learning_rate": 2.8759798454446314e-06, "loss": 0.68237126, "num_input_tokens_seen": 67027445, "step": 3119, "time_per_iteration": 2.738985538482666 }, { "auxiliary_loss_clip": 0.01152472, "auxiliary_loss_mlp": 0.01064724, "balance_loss_clip": 1.05285311, "balance_loss_mlp": 1.0451858, "epoch": 0.3751578187939638, "flos": 23367791923200.0, "grad_norm": 2.3084171493959316, "language_loss": 0.81194597, "learning_rate": 2.8752794991803173e-06, "loss": 0.83411789, "num_input_tokens_seen": 67045130, "step": 3120, "time_per_iteration": 2.6731245517730713 }, { "auxiliary_loss_clip": 0.01132838, "auxiliary_loss_mlp": 0.01051014, "balance_loss_clip": 1.04850757, "balance_loss_mlp": 1.03306079, "epoch": 0.37527806168460287, "flos": 14605731878400.0, "grad_norm": 2.402856156492908, "language_loss": 0.75434613, "learning_rate": 2.8745790201367976e-06, "loss": 0.77618468, "num_input_tokens_seen": 67060885, "step": 3121, "time_per_iteration": 2.616372585296631 }, { "auxiliary_loss_clip": 0.01164194, "auxiliary_loss_mlp": 0.01052564, "balance_loss_clip": 1.05024731, "balance_loss_mlp": 1.03312135, "epoch": 0.375398304575242, "flos": 26390823431040.0, "grad_norm": 2.318273198204424, "language_loss": 0.84272885, "learning_rate": 2.8738784084203373e-06, "loss": 0.86489642, "num_input_tokens_seen": 67080960, "step": 3122, "time_per_iteration": 2.645777702331543 }, { "auxiliary_loss_clip": 0.01127875, "auxiliary_loss_mlp": 0.01047329, "balance_loss_clip": 1.04690146, "balance_loss_mlp": 1.02897096, "epoch": 0.3755185474658811, "flos": 22236605838720.0, "grad_norm": 2.3148662292322824, "language_loss": 0.79018378, "learning_rate": 2.873177664137216e-06, "loss": 0.81193584, "num_input_tokens_seen": 67101890, "step": 3123, "time_per_iteration": 2.680345296859741 }, { "auxiliary_loss_clip": 0.0111602, "auxiliary_loss_mlp": 0.01042784, "balance_loss_clip": 1.04730284, "balance_loss_mlp": 1.02338839, "epoch": 0.37563879035652015, "flos": 30812935633920.0, "grad_norm": 5.330110039063163, "language_loss": 0.69232076, "learning_rate": 2.8724767873937384e-06, "loss": 0.71390879, "num_input_tokens_seen": 67126010, "step": 3124, "time_per_iteration": 2.782834529876709 }, { "auxiliary_loss_clip": 0.01135679, "auxiliary_loss_mlp": 0.01049978, "balance_loss_clip": 1.05048287, "balance_loss_mlp": 1.03184605, "epoch": 0.37575903324715926, "flos": 20773533064320.0, "grad_norm": 2.527759334453394, "language_loss": 0.87607354, "learning_rate": 2.871775778296225e-06, "loss": 0.89793009, "num_input_tokens_seen": 67143100, "step": 3125, "time_per_iteration": 2.672624349594116 }, { "auxiliary_loss_clip": 0.01153686, "auxiliary_loss_mlp": 0.01054157, "balance_loss_clip": 1.05204821, "balance_loss_mlp": 1.03294957, "epoch": 0.37587927613779837, "flos": 18697681244160.0, "grad_norm": 2.258085824687612, "language_loss": 0.78160298, "learning_rate": 2.8710746369510196e-06, "loss": 0.80368137, "num_input_tokens_seen": 67161085, "step": 3126, "time_per_iteration": 2.604952335357666 }, { "auxiliary_loss_clip": 0.01130994, "auxiliary_loss_mlp": 0.01054377, "balance_loss_clip": 1.04842043, "balance_loss_mlp": 1.03280032, "epoch": 0.3759995190284374, "flos": 13624796384640.0, "grad_norm": 2.7129401162005817, "language_loss": 0.8348496, "learning_rate": 2.8703733634644846e-06, "loss": 0.85670334, "num_input_tokens_seen": 67175840, "step": 3127, "time_per_iteration": 2.598921775817871 }, { "auxiliary_loss_clip": 0.01158255, "auxiliary_loss_mlp": 0.01051801, "balance_loss_clip": 1.05267131, "balance_loss_mlp": 1.03378892, "epoch": 0.37611976191907653, "flos": 20484847457280.0, "grad_norm": 2.3715864584022417, "language_loss": 0.79684603, "learning_rate": 2.869671957943002e-06, "loss": 0.8189466, "num_input_tokens_seen": 67194995, "step": 3128, "time_per_iteration": 2.6029176712036133 }, { "auxiliary_loss_clip": 0.01129368, "auxiliary_loss_mlp": 0.01052705, "balance_loss_clip": 1.05277181, "balance_loss_mlp": 1.03261805, "epoch": 0.37624000480971564, "flos": 21141797253120.0, "grad_norm": 2.502121084453165, "language_loss": 0.74001324, "learning_rate": 2.8689704204929747e-06, "loss": 0.76183403, "num_input_tokens_seen": 67214175, "step": 3129, "time_per_iteration": 2.655381441116333 }, { "auxiliary_loss_clip": 0.01160223, "auxiliary_loss_mlp": 0.01055451, "balance_loss_clip": 1.05053377, "balance_loss_mlp": 1.03636551, "epoch": 0.3763602477003547, "flos": 22564470205440.0, "grad_norm": 2.3424316235070677, "language_loss": 0.80934948, "learning_rate": 2.8682687512208253e-06, "loss": 0.83150625, "num_input_tokens_seen": 67233185, "step": 3130, "time_per_iteration": 3.5231270790100098 }, { "auxiliary_loss_clip": 0.01155535, "auxiliary_loss_mlp": 0.01054548, "balance_loss_clip": 1.05025661, "balance_loss_mlp": 1.03467572, "epoch": 0.3764804905909938, "flos": 27526857851520.0, "grad_norm": 2.1295653324742614, "language_loss": 0.80461776, "learning_rate": 2.8675669502329972e-06, "loss": 0.82671869, "num_input_tokens_seen": 67254715, "step": 3131, "time_per_iteration": 2.6965627670288086 }, { "auxiliary_loss_clip": 0.01149991, "auxiliary_loss_mlp": 0.00774949, "balance_loss_clip": 1.0519712, "balance_loss_mlp": 1.00032222, "epoch": 0.3766007334816329, "flos": 22528092706560.0, "grad_norm": 2.9024762176248045, "language_loss": 0.85755503, "learning_rate": 2.866865017635952e-06, "loss": 0.87680441, "num_input_tokens_seen": 67272535, "step": 3132, "time_per_iteration": 3.5546071529388428 }, { "auxiliary_loss_clip": 0.01126066, "auxiliary_loss_mlp": 0.01051935, "balance_loss_clip": 1.05174041, "balance_loss_mlp": 1.03143096, "epoch": 0.376720976372272, "flos": 25957166532480.0, "grad_norm": 1.553581620721888, "language_loss": 0.79213691, "learning_rate": 2.866162953536174e-06, "loss": 0.81391692, "num_input_tokens_seen": 67293505, "step": 3133, "time_per_iteration": 2.863638162612915 }, { "auxiliary_loss_clip": 0.01133297, "auxiliary_loss_mlp": 0.00774764, "balance_loss_clip": 1.04747295, "balance_loss_mlp": 1.00028622, "epoch": 0.3768412192629111, "flos": 18041162411520.0, "grad_norm": 1.7675841827149337, "language_loss": 0.74769992, "learning_rate": 2.8654607580401634e-06, "loss": 0.7667805, "num_input_tokens_seen": 67313240, "step": 3134, "time_per_iteration": 2.6727027893066406 }, { "auxiliary_loss_clip": 0.01049211, "auxiliary_loss_mlp": 0.01017916, "balance_loss_clip": 1.02289724, "balance_loss_mlp": 1.01559186, "epoch": 0.3769614621535502, "flos": 62989472304000.0, "grad_norm": 0.886754555325537, "language_loss": 0.65185529, "learning_rate": 2.8647584312544446e-06, "loss": 0.67252654, "num_input_tokens_seen": 67378445, "step": 3135, "time_per_iteration": 4.182028293609619 }, { "auxiliary_loss_clip": 0.01113661, "auxiliary_loss_mlp": 0.00774518, "balance_loss_clip": 1.0432719, "balance_loss_mlp": 1.00032949, "epoch": 0.37708170504418925, "flos": 23661685002240.0, "grad_norm": 2.1001966132191616, "language_loss": 0.85224879, "learning_rate": 2.864055973285559e-06, "loss": 0.87113059, "num_input_tokens_seen": 67400445, "step": 3136, "time_per_iteration": 2.745769500732422 }, { "auxiliary_loss_clip": 0.01121184, "auxiliary_loss_mlp": 0.01052014, "balance_loss_clip": 1.04565597, "balance_loss_mlp": 1.03389382, "epoch": 0.37720194793482836, "flos": 24423170353920.0, "grad_norm": 1.772444117124088, "language_loss": 0.86351919, "learning_rate": 2.8633533842400698e-06, "loss": 0.8852511, "num_input_tokens_seen": 67420645, "step": 3137, "time_per_iteration": 2.691955804824829 }, { "auxiliary_loss_clip": 0.01151057, "auxiliary_loss_mlp": 0.00776772, "balance_loss_clip": 1.05061293, "balance_loss_mlp": 1.0003984, "epoch": 0.3773221908254674, "flos": 20996502739200.0, "grad_norm": 1.7856658245048944, "language_loss": 0.77553844, "learning_rate": 2.862650664224558e-06, "loss": 0.79481673, "num_input_tokens_seen": 67439495, "step": 3138, "time_per_iteration": 2.669537305831909 }, { "auxiliary_loss_clip": 0.01143938, "auxiliary_loss_mlp": 0.01042514, "balance_loss_clip": 1.05156159, "balance_loss_mlp": 1.02496624, "epoch": 0.37744243371610653, "flos": 37631724958080.0, "grad_norm": 1.7462986717139857, "language_loss": 0.69769973, "learning_rate": 2.861947813345627e-06, "loss": 0.7195642, "num_input_tokens_seen": 67462195, "step": 3139, "time_per_iteration": 3.673076868057251 }, { "auxiliary_loss_clip": 0.01164506, "auxiliary_loss_mlp": 0.00775972, "balance_loss_clip": 1.05480385, "balance_loss_mlp": 1.0003556, "epoch": 0.37756267660674564, "flos": 26140526484480.0, "grad_norm": 5.11495924989834, "language_loss": 0.72399867, "learning_rate": 2.8612448317098974e-06, "loss": 0.74340343, "num_input_tokens_seen": 67482530, "step": 3140, "time_per_iteration": 2.638946533203125 }, { "auxiliary_loss_clip": 0.01123454, "auxiliary_loss_mlp": 0.00775428, "balance_loss_clip": 1.04558921, "balance_loss_mlp": 1.00035071, "epoch": 0.3776829194973847, "flos": 19427888828160.0, "grad_norm": 2.234297867668687, "language_loss": 0.83201718, "learning_rate": 2.8605417194240114e-06, "loss": 0.85100597, "num_input_tokens_seen": 67500890, "step": 3141, "time_per_iteration": 2.7202765941619873 }, { "auxiliary_loss_clip": 0.01140487, "auxiliary_loss_mlp": 0.01045165, "balance_loss_clip": 1.04753911, "balance_loss_mlp": 1.02696228, "epoch": 0.3778031623880238, "flos": 17382309194880.0, "grad_norm": 3.469346435563887, "language_loss": 0.78740954, "learning_rate": 2.8598384765946315e-06, "loss": 0.80926609, "num_input_tokens_seen": 67519545, "step": 3142, "time_per_iteration": 2.585569143295288 }, { "auxiliary_loss_clip": 0.01158715, "auxiliary_loss_mlp": 0.0104254, "balance_loss_clip": 1.0495528, "balance_loss_mlp": 1.02501607, "epoch": 0.3779234052786629, "flos": 27125843437440.0, "grad_norm": 1.9585874899478515, "language_loss": 0.71523219, "learning_rate": 2.8591351033284377e-06, "loss": 0.73724473, "num_input_tokens_seen": 67539275, "step": 3143, "time_per_iteration": 2.630934953689575 }, { "auxiliary_loss_clip": 0.01150907, "auxiliary_loss_mlp": 0.01050012, "balance_loss_clip": 1.05248904, "balance_loss_mlp": 1.03083134, "epoch": 0.37804364816930197, "flos": 19682639061120.0, "grad_norm": 3.687705988108315, "language_loss": 0.83912289, "learning_rate": 2.8584315997321325e-06, "loss": 0.86113209, "num_input_tokens_seen": 67558280, "step": 3144, "time_per_iteration": 2.605609893798828 }, { "auxiliary_loss_clip": 0.01158645, "auxiliary_loss_mlp": 0.01040047, "balance_loss_clip": 1.05087137, "balance_loss_mlp": 1.02105701, "epoch": 0.3781638910599411, "flos": 22702905221760.0, "grad_norm": 2.497052359033352, "language_loss": 0.78572392, "learning_rate": 2.8577279659124356e-06, "loss": 0.80771083, "num_input_tokens_seen": 67575955, "step": 3145, "time_per_iteration": 2.558730363845825 }, { "auxiliary_loss_clip": 0.01139291, "auxiliary_loss_mlp": 0.01038812, "balance_loss_clip": 1.04692435, "balance_loss_mlp": 1.02070451, "epoch": 0.3782841339505802, "flos": 14647604158080.0, "grad_norm": 1.9486312672738257, "language_loss": 0.83675468, "learning_rate": 2.857024201976089e-06, "loss": 0.85853577, "num_input_tokens_seen": 67593515, "step": 3146, "time_per_iteration": 2.6034774780273438 }, { "auxiliary_loss_clip": 0.01134369, "auxiliary_loss_mlp": 0.01040484, "balance_loss_clip": 1.04845452, "balance_loss_mlp": 1.02092206, "epoch": 0.37840437684121925, "flos": 32818223185920.0, "grad_norm": 2.0823990426563372, "language_loss": 0.72709525, "learning_rate": 2.8563203080298516e-06, "loss": 0.74884379, "num_input_tokens_seen": 67614290, "step": 3147, "time_per_iteration": 2.7494921684265137 }, { "auxiliary_loss_clip": 0.0113803, "auxiliary_loss_mlp": 0.00774407, "balance_loss_clip": 1.04974687, "balance_loss_mlp": 1.00035143, "epoch": 0.37852461973185836, "flos": 18369206346240.0, "grad_norm": 2.3475199425278652, "language_loss": 0.89340883, "learning_rate": 2.855616284180505e-06, "loss": 0.91253316, "num_input_tokens_seen": 67631340, "step": 3148, "time_per_iteration": 2.6186978816986084 }, { "auxiliary_loss_clip": 0.01050583, "auxiliary_loss_mlp": 0.01003058, "balance_loss_clip": 1.02361894, "balance_loss_mlp": 1.00099564, "epoch": 0.37864486262249747, "flos": 59500680117120.0, "grad_norm": 0.8901905605228482, "language_loss": 0.66120666, "learning_rate": 2.8549121305348477e-06, "loss": 0.68174309, "num_input_tokens_seen": 67691125, "step": 3149, "time_per_iteration": 3.1700499057769775 }, { "auxiliary_loss_clip": 0.01146941, "auxiliary_loss_mlp": 0.01040465, "balance_loss_clip": 1.05104589, "balance_loss_mlp": 1.02308488, "epoch": 0.3787651055131365, "flos": 23363015414400.0, "grad_norm": 2.356890219822281, "language_loss": 0.83546883, "learning_rate": 2.8542078471997006e-06, "loss": 0.85734284, "num_input_tokens_seen": 67708740, "step": 3150, "time_per_iteration": 2.616117477416992 }, { "auxiliary_loss_clip": 0.011483, "auxiliary_loss_mlp": 0.01043168, "balance_loss_clip": 1.05142736, "balance_loss_mlp": 1.02485704, "epoch": 0.37888534840377563, "flos": 24601394661120.0, "grad_norm": 1.7190148072688927, "language_loss": 0.7594806, "learning_rate": 2.8535034342819013e-06, "loss": 0.78139532, "num_input_tokens_seen": 67726150, "step": 3151, "time_per_iteration": 2.643587589263916 }, { "auxiliary_loss_clip": 0.01156245, "auxiliary_loss_mlp": 0.01048689, "balance_loss_clip": 1.05103278, "balance_loss_mlp": 1.03024745, "epoch": 0.37900559129441475, "flos": 23986891762560.0, "grad_norm": 1.7780189097646693, "language_loss": 0.72702968, "learning_rate": 2.85279889188831e-06, "loss": 0.74907899, "num_input_tokens_seen": 67746525, "step": 3152, "time_per_iteration": 2.7521469593048096 }, { "auxiliary_loss_clip": 0.01119962, "auxiliary_loss_mlp": 0.01065869, "balance_loss_clip": 1.04468489, "balance_loss_mlp": 1.04462624, "epoch": 0.3791258341850538, "flos": 24644667571200.0, "grad_norm": 2.1421942147187494, "language_loss": 0.81125349, "learning_rate": 2.852094220125805e-06, "loss": 0.83311182, "num_input_tokens_seen": 67766035, "step": 3153, "time_per_iteration": 2.7276222705841064 }, { "auxiliary_loss_clip": 0.0115236, "auxiliary_loss_mlp": 0.01050645, "balance_loss_clip": 1.05495274, "balance_loss_mlp": 1.0312382, "epoch": 0.3792460770756929, "flos": 17420841509760.0, "grad_norm": 2.3775555289025485, "language_loss": 0.71424401, "learning_rate": 2.8513894191012846e-06, "loss": 0.736274, "num_input_tokens_seen": 67785015, "step": 3154, "time_per_iteration": 2.6797034740448 }, { "auxiliary_loss_clip": 0.0116149, "auxiliary_loss_mlp": 0.01047957, "balance_loss_clip": 1.05206108, "balance_loss_mlp": 1.02969408, "epoch": 0.37936631996633197, "flos": 24206557386240.0, "grad_norm": 1.5971225855731377, "language_loss": 0.78905034, "learning_rate": 2.8506844889216664e-06, "loss": 0.81114483, "num_input_tokens_seen": 67804400, "step": 3155, "time_per_iteration": 2.6095945835113525 }, { "auxiliary_loss_clip": 0.01054619, "auxiliary_loss_mlp": 0.01006996, "balance_loss_clip": 1.03358614, "balance_loss_mlp": 1.00454009, "epoch": 0.3794865628569711, "flos": 70297114752000.0, "grad_norm": 0.8637061698659256, "language_loss": 0.62825572, "learning_rate": 2.849979429693887e-06, "loss": 0.6488719, "num_input_tokens_seen": 67865385, "step": 3156, "time_per_iteration": 4.1756861209869385 }, { "auxiliary_loss_clip": 0.0115976, "auxiliary_loss_mlp": 0.01054092, "balance_loss_clip": 1.05242038, "balance_loss_mlp": 1.03283691, "epoch": 0.3796068057476102, "flos": 15779364860160.0, "grad_norm": 2.0923864830825063, "language_loss": 0.73921239, "learning_rate": 2.8492742415249042e-06, "loss": 0.76135087, "num_input_tokens_seen": 67883030, "step": 3157, "time_per_iteration": 2.559828758239746 }, { "auxiliary_loss_clip": 0.01160193, "auxiliary_loss_mlp": 0.01048397, "balance_loss_clip": 1.05245602, "balance_loss_mlp": 1.02913284, "epoch": 0.37972704863824924, "flos": 25191694771200.0, "grad_norm": 1.6708804205960397, "language_loss": 0.76585627, "learning_rate": 2.848568924521694e-06, "loss": 0.78794217, "num_input_tokens_seen": 67903810, "step": 3158, "time_per_iteration": 3.6010029315948486 }, { "auxiliary_loss_clip": 0.01140327, "auxiliary_loss_mlp": 0.01059145, "balance_loss_clip": 1.04687679, "balance_loss_mlp": 1.03924918, "epoch": 0.37984729152888835, "flos": 26210372480640.0, "grad_norm": 2.106186851247063, "language_loss": 0.73905683, "learning_rate": 2.8478634787912526e-06, "loss": 0.76105154, "num_input_tokens_seen": 67921865, "step": 3159, "time_per_iteration": 2.65862774848938 }, { "auxiliary_loss_clip": 0.01144967, "auxiliary_loss_mlp": 0.01056014, "balance_loss_clip": 1.04807484, "balance_loss_mlp": 1.03634477, "epoch": 0.37996753441952746, "flos": 25629302165760.0, "grad_norm": 2.42233320961187, "language_loss": 0.76327664, "learning_rate": 2.847157904440596e-06, "loss": 0.78528643, "num_input_tokens_seen": 67941595, "step": 3160, "time_per_iteration": 2.609419107437134 }, { "auxiliary_loss_clip": 0.01146689, "auxiliary_loss_mlp": 0.01058738, "balance_loss_clip": 1.05228877, "balance_loss_mlp": 1.04009366, "epoch": 0.3800877773101665, "flos": 20118414862080.0, "grad_norm": 2.349544935183062, "language_loss": 0.73879206, "learning_rate": 2.846452201576759e-06, "loss": 0.76084632, "num_input_tokens_seen": 67960970, "step": 3161, "time_per_iteration": 3.5991482734680176 }, { "auxiliary_loss_clip": 0.01041565, "auxiliary_loss_mlp": 0.01014415, "balance_loss_clip": 1.02327454, "balance_loss_mlp": 1.01229334, "epoch": 0.38020802020080563, "flos": 63053608037760.0, "grad_norm": 0.8446504669248668, "language_loss": 0.62817609, "learning_rate": 2.845746370306795e-06, "loss": 0.64873594, "num_input_tokens_seen": 68026160, "step": 3162, "time_per_iteration": 3.306689977645874 }, { "auxiliary_loss_clip": 0.01149709, "auxiliary_loss_mlp": 0.01048436, "balance_loss_clip": 1.05158472, "balance_loss_mlp": 1.02901685, "epoch": 0.38032826309144474, "flos": 21288420570240.0, "grad_norm": 1.9431057437084578, "language_loss": 0.7819978, "learning_rate": 2.84504041073778e-06, "loss": 0.80397922, "num_input_tokens_seen": 68044575, "step": 3163, "time_per_iteration": 2.6018364429473877 }, { "auxiliary_loss_clip": 0.01128363, "auxiliary_loss_mlp": 0.01079099, "balance_loss_clip": 1.04735446, "balance_loss_mlp": 1.05675912, "epoch": 0.3804485059820838, "flos": 18954119416320.0, "grad_norm": 1.7682001156686713, "language_loss": 0.79114407, "learning_rate": 2.844334322976806e-06, "loss": 0.81321871, "num_input_tokens_seen": 68064790, "step": 3164, "time_per_iteration": 2.6571409702301025 }, { "auxiliary_loss_clip": 0.01113456, "auxiliary_loss_mlp": 0.0105665, "balance_loss_clip": 1.04455614, "balance_loss_mlp": 1.0353359, "epoch": 0.3805687488727229, "flos": 21833759831040.0, "grad_norm": 2.978451467212197, "language_loss": 0.83363825, "learning_rate": 2.8436281071309866e-06, "loss": 0.85533929, "num_input_tokens_seen": 68083330, "step": 3165, "time_per_iteration": 3.5599186420440674 }, { "auxiliary_loss_clip": 0.01020817, "auxiliary_loss_mlp": 0.01005891, "balance_loss_clip": 1.02109671, "balance_loss_mlp": 1.00372112, "epoch": 0.380688991763362, "flos": 58546209968640.0, "grad_norm": 0.727817317200585, "language_loss": 0.52991569, "learning_rate": 2.842921763307455e-06, "loss": 0.55018276, "num_input_tokens_seen": 68146140, "step": 3166, "time_per_iteration": 3.230886459350586 }, { "auxiliary_loss_clip": 0.0113002, "auxiliary_loss_mlp": 0.01056775, "balance_loss_clip": 1.04886532, "balance_loss_mlp": 1.0380708, "epoch": 0.38080923465400107, "flos": 23799509487360.0, "grad_norm": 2.5452856124180263, "language_loss": 0.82730711, "learning_rate": 2.842215291613361e-06, "loss": 0.8491751, "num_input_tokens_seen": 68164520, "step": 3167, "time_per_iteration": 2.667527198791504 }, { "auxiliary_loss_clip": 0.00987884, "auxiliary_loss_mlp": 0.01000209, "balance_loss_clip": 1.01703274, "balance_loss_mlp": 0.99805146, "epoch": 0.3809294775446402, "flos": 54969866380800.0, "grad_norm": 0.8320511038495647, "language_loss": 0.59171557, "learning_rate": 2.8415086921558774e-06, "loss": 0.61159647, "num_input_tokens_seen": 68227945, "step": 3168, "time_per_iteration": 3.3776049613952637 }, { "auxiliary_loss_clip": 0.01119515, "auxiliary_loss_mlp": 0.01045904, "balance_loss_clip": 1.04251671, "balance_loss_mlp": 1.02606726, "epoch": 0.38104972043527924, "flos": 24643697904000.0, "grad_norm": 1.6329631839969583, "language_loss": 0.78681862, "learning_rate": 2.840801965042194e-06, "loss": 0.80847281, "num_input_tokens_seen": 68247405, "step": 3169, "time_per_iteration": 2.8681819438934326 }, { "auxiliary_loss_clip": 0.01124471, "auxiliary_loss_mlp": 0.01046396, "balance_loss_clip": 1.04526746, "balance_loss_mlp": 1.02679789, "epoch": 0.38116996332591835, "flos": 22856783086080.0, "grad_norm": 2.6244462193817695, "language_loss": 0.83713019, "learning_rate": 2.840095110379521e-06, "loss": 0.85883886, "num_input_tokens_seen": 68266925, "step": 3170, "time_per_iteration": 2.6267459392547607 }, { "auxiliary_loss_clip": 0.01022471, "auxiliary_loss_mlp": 0.01001707, "balance_loss_clip": 1.0252223, "balance_loss_mlp": 0.99915582, "epoch": 0.38129020621655746, "flos": 60836160804480.0, "grad_norm": 0.7343568885887056, "language_loss": 0.53902906, "learning_rate": 2.8393881282750884e-06, "loss": 0.55927086, "num_input_tokens_seen": 68329755, "step": 3171, "time_per_iteration": 3.1967339515686035 }, { "auxiliary_loss_clip": 0.01134826, "auxiliary_loss_mlp": 0.01044193, "balance_loss_clip": 1.05040348, "balance_loss_mlp": 1.02421331, "epoch": 0.3814104491071965, "flos": 21648101408640.0, "grad_norm": 2.1184111930040075, "language_loss": 0.78674567, "learning_rate": 2.838681018836144e-06, "loss": 0.80853587, "num_input_tokens_seen": 68347075, "step": 3172, "time_per_iteration": 2.6746222972869873 }, { "auxiliary_loss_clip": 0.01121603, "auxiliary_loss_mlp": 0.00774238, "balance_loss_clip": 1.04484963, "balance_loss_mlp": 1.00053287, "epoch": 0.3815306919978356, "flos": 19099090707840.0, "grad_norm": 2.0555281304333777, "language_loss": 0.77935344, "learning_rate": 2.837973782169955e-06, "loss": 0.79831183, "num_input_tokens_seen": 68365450, "step": 3173, "time_per_iteration": 2.6251580715179443 }, { "auxiliary_loss_clip": 0.01060041, "auxiliary_loss_mlp": 0.01006072, "balance_loss_clip": 1.02405214, "balance_loss_mlp": 1.00372362, "epoch": 0.38165093488847474, "flos": 67067918156160.0, "grad_norm": 0.8088476912981893, "language_loss": 0.59130502, "learning_rate": 2.8372664183838096e-06, "loss": 0.61196613, "num_input_tokens_seen": 68428470, "step": 3174, "time_per_iteration": 3.2124385833740234 }, { "auxiliary_loss_clip": 0.01160804, "auxiliary_loss_mlp": 0.01058358, "balance_loss_clip": 1.0518651, "balance_loss_mlp": 1.03945112, "epoch": 0.3817711777791138, "flos": 22341105480960.0, "grad_norm": 2.267415013101407, "language_loss": 0.68126822, "learning_rate": 2.836558927585015e-06, "loss": 0.70345986, "num_input_tokens_seen": 68445440, "step": 3175, "time_per_iteration": 2.598942995071411 }, { "auxiliary_loss_clip": 0.01151296, "auxiliary_loss_mlp": 0.01043372, "balance_loss_clip": 1.05271459, "balance_loss_mlp": 1.02490687, "epoch": 0.3818914206697529, "flos": 22820621068800.0, "grad_norm": 2.4040671526951747, "language_loss": 0.82481039, "learning_rate": 2.8358513098808957e-06, "loss": 0.84675711, "num_input_tokens_seen": 68465755, "step": 3176, "time_per_iteration": 2.632481098175049 }, { "auxiliary_loss_clip": 0.01104202, "auxiliary_loss_mlp": 0.01057915, "balance_loss_clip": 1.04865193, "balance_loss_mlp": 1.03580165, "epoch": 0.382011663560392, "flos": 24386074583040.0, "grad_norm": 1.8232411195870026, "language_loss": 0.76878971, "learning_rate": 2.835143565378798e-06, "loss": 0.79041088, "num_input_tokens_seen": 68486220, "step": 3177, "time_per_iteration": 2.739481210708618 }, { "auxiliary_loss_clip": 0.01094984, "auxiliary_loss_mlp": 0.01048658, "balance_loss_clip": 1.04403114, "balance_loss_mlp": 1.02821374, "epoch": 0.38213190645103107, "flos": 21981568296960.0, "grad_norm": 1.9659492040533426, "language_loss": 0.78328812, "learning_rate": 2.8344356941860847e-06, "loss": 0.80472457, "num_input_tokens_seen": 68505850, "step": 3178, "time_per_iteration": 2.7671337127685547 }, { "auxiliary_loss_clip": 0.01117038, "auxiliary_loss_mlp": 0.01062407, "balance_loss_clip": 1.04736567, "balance_loss_mlp": 1.04145038, "epoch": 0.3822521493416702, "flos": 35516945773440.0, "grad_norm": 2.488428098621802, "language_loss": 0.65940434, "learning_rate": 2.8337276964101403e-06, "loss": 0.68119872, "num_input_tokens_seen": 68526290, "step": 3179, "time_per_iteration": 2.7792625427246094 }, { "auxiliary_loss_clip": 0.01151357, "auxiliary_loss_mlp": 0.01043999, "balance_loss_clip": 1.05274427, "balance_loss_mlp": 1.02425778, "epoch": 0.3823723922323093, "flos": 21069904181760.0, "grad_norm": 1.8792963935126619, "language_loss": 0.76484042, "learning_rate": 2.833019572158367e-06, "loss": 0.78679401, "num_input_tokens_seen": 68544725, "step": 3180, "time_per_iteration": 2.6593639850616455 }, { "auxiliary_loss_clip": 0.01136702, "auxiliary_loss_mlp": 0.01056393, "balance_loss_clip": 1.05049896, "balance_loss_mlp": 1.03714108, "epoch": 0.38249263512294834, "flos": 19789149864960.0, "grad_norm": 2.272092656231092, "language_loss": 0.79966617, "learning_rate": 2.8323113215381872e-06, "loss": 0.8215971, "num_input_tokens_seen": 68563070, "step": 3181, "time_per_iteration": 2.6993086338043213 }, { "auxiliary_loss_clip": 0.0112112, "auxiliary_loss_mlp": 0.01054482, "balance_loss_clip": 1.04570818, "balance_loss_mlp": 1.0333941, "epoch": 0.38261287801358745, "flos": 21433930565760.0, "grad_norm": 4.718543607993711, "language_loss": 0.76322138, "learning_rate": 2.831602944657042e-06, "loss": 0.78497744, "num_input_tokens_seen": 68581150, "step": 3182, "time_per_iteration": 3.627214193344116 }, { "auxiliary_loss_clip": 0.01139501, "auxiliary_loss_mlp": 0.01049035, "balance_loss_clip": 1.0478574, "balance_loss_mlp": 1.02909136, "epoch": 0.38273312090422656, "flos": 21981568296960.0, "grad_norm": 2.536902809172987, "language_loss": 0.74233919, "learning_rate": 2.830894441622391e-06, "loss": 0.76422453, "num_input_tokens_seen": 68597800, "step": 3183, "time_per_iteration": 2.6577224731445312 }, { "auxiliary_loss_clip": 0.01123077, "auxiliary_loss_mlp": 0.00778096, "balance_loss_clip": 1.04551911, "balance_loss_mlp": 1.00053835, "epoch": 0.3828533637948656, "flos": 24790895838720.0, "grad_norm": 2.046433745693914, "language_loss": 0.79849541, "learning_rate": 2.8301858125417134e-06, "loss": 0.81750715, "num_input_tokens_seen": 68617640, "step": 3184, "time_per_iteration": 3.612048387527466 }, { "auxiliary_loss_clip": 0.0113236, "auxiliary_loss_mlp": 0.01040081, "balance_loss_clip": 1.04981208, "balance_loss_mlp": 1.02168751, "epoch": 0.38297360668550473, "flos": 22455445449600.0, "grad_norm": 1.9251117458154976, "language_loss": 0.74178827, "learning_rate": 2.8294770575225082e-06, "loss": 0.76351261, "num_input_tokens_seen": 68637770, "step": 3185, "time_per_iteration": 2.6716365814208984 }, { "auxiliary_loss_clip": 0.01150253, "auxiliary_loss_mlp": 0.01051459, "balance_loss_clip": 1.05135965, "balance_loss_mlp": 1.03156257, "epoch": 0.3830938495761438, "flos": 24896903852160.0, "grad_norm": 1.8761920559924026, "language_loss": 0.83854437, "learning_rate": 2.828768176672293e-06, "loss": 0.86056149, "num_input_tokens_seen": 68656885, "step": 3186, "time_per_iteration": 2.630845308303833 }, { "auxiliary_loss_clip": 0.01122682, "auxiliary_loss_mlp": 0.01049938, "balance_loss_clip": 1.0476135, "balance_loss_mlp": 1.0288384, "epoch": 0.3832140924667829, "flos": 33036236784000.0, "grad_norm": 1.9731063554154589, "language_loss": 0.71592677, "learning_rate": 2.8280591700986044e-06, "loss": 0.73765302, "num_input_tokens_seen": 68678750, "step": 3187, "time_per_iteration": 3.7067575454711914 }, { "auxiliary_loss_clip": 0.01136657, "auxiliary_loss_mlp": 0.01048291, "balance_loss_clip": 1.04769492, "balance_loss_mlp": 1.02820408, "epoch": 0.383334335357422, "flos": 31903721896320.0, "grad_norm": 2.6435521603129977, "language_loss": 0.75229347, "learning_rate": 2.827350037908999e-06, "loss": 0.77414292, "num_input_tokens_seen": 68698190, "step": 3188, "time_per_iteration": 2.743664503097534 }, { "auxiliary_loss_clip": 0.01126953, "auxiliary_loss_mlp": 0.01050115, "balance_loss_clip": 1.04725325, "balance_loss_mlp": 1.03021884, "epoch": 0.38345457824806106, "flos": 19791915212160.0, "grad_norm": 2.3506612820524353, "language_loss": 0.79145426, "learning_rate": 2.8266407802110496e-06, "loss": 0.81322491, "num_input_tokens_seen": 68716445, "step": 3189, "time_per_iteration": 2.678136110305786 }, { "auxiliary_loss_clip": 0.01094852, "auxiliary_loss_mlp": 0.01047025, "balance_loss_clip": 1.04530776, "balance_loss_mlp": 1.02586579, "epoch": 0.3835748211387002, "flos": 22419391173120.0, "grad_norm": 1.9937144519032406, "language_loss": 0.75756741, "learning_rate": 2.8259313971123515e-06, "loss": 0.77898622, "num_input_tokens_seen": 68737565, "step": 3190, "time_per_iteration": 2.8319995403289795 }, { "auxiliary_loss_clip": 0.01143114, "auxiliary_loss_mlp": 0.010419, "balance_loss_clip": 1.05042005, "balance_loss_mlp": 1.02450776, "epoch": 0.3836950640293393, "flos": 25118436983040.0, "grad_norm": 1.5217196663607153, "language_loss": 0.78208572, "learning_rate": 2.8252218887205166e-06, "loss": 0.80393589, "num_input_tokens_seen": 68758255, "step": 3191, "time_per_iteration": 3.5741517543792725 }, { "auxiliary_loss_clip": 0.01096756, "auxiliary_loss_mlp": 0.0106193, "balance_loss_clip": 1.0439887, "balance_loss_mlp": 1.04235625, "epoch": 0.38381530691997834, "flos": 21799213925760.0, "grad_norm": 1.9032031689414128, "language_loss": 0.8092286, "learning_rate": 2.824512255143178e-06, "loss": 0.83081543, "num_input_tokens_seen": 68777490, "step": 3192, "time_per_iteration": 2.7045395374298096 }, { "auxiliary_loss_clip": 0.01126485, "auxiliary_loss_mlp": 0.01045135, "balance_loss_clip": 1.04980326, "balance_loss_mlp": 1.02551305, "epoch": 0.38393554981061745, "flos": 21252689516160.0, "grad_norm": 2.211051058309033, "language_loss": 0.79238451, "learning_rate": 2.8238024964879855e-06, "loss": 0.81410074, "num_input_tokens_seen": 68798385, "step": 3193, "time_per_iteration": 2.6942179203033447 }, { "auxiliary_loss_clip": 0.01168223, "auxiliary_loss_mlp": 0.01044695, "balance_loss_clip": 1.05513954, "balance_loss_mlp": 1.02377415, "epoch": 0.38405579270125656, "flos": 17019360218880.0, "grad_norm": 2.8173749347340338, "language_loss": 0.76845241, "learning_rate": 2.8230926128626095e-06, "loss": 0.79058158, "num_input_tokens_seen": 68816880, "step": 3194, "time_per_iteration": 2.5938796997070312 }, { "auxiliary_loss_clip": 0.0113041, "auxiliary_loss_mlp": 0.01044135, "balance_loss_clip": 1.04826534, "balance_loss_mlp": 1.02252197, "epoch": 0.3841760355918956, "flos": 21835375943040.0, "grad_norm": 2.0289790021023615, "language_loss": 0.79333943, "learning_rate": 2.822382604374738e-06, "loss": 0.81508487, "num_input_tokens_seen": 68835805, "step": 3195, "time_per_iteration": 2.659923791885376 }, { "auxiliary_loss_clip": 0.01132277, "auxiliary_loss_mlp": 0.01055059, "balance_loss_clip": 1.05073524, "balance_loss_mlp": 1.03435218, "epoch": 0.3842962784825347, "flos": 25915114684800.0, "grad_norm": 3.6628259242318038, "language_loss": 0.65487671, "learning_rate": 2.8216724711320793e-06, "loss": 0.67675012, "num_input_tokens_seen": 68854930, "step": 3196, "time_per_iteration": 2.7016549110412598 }, { "auxiliary_loss_clip": 0.01160405, "auxiliary_loss_mlp": 0.00774018, "balance_loss_clip": 1.05220127, "balance_loss_mlp": 1.00067616, "epoch": 0.38441652137317384, "flos": 25337492075520.0, "grad_norm": 1.8952230109954602, "language_loss": 0.79616606, "learning_rate": 2.820962213242361e-06, "loss": 0.81551033, "num_input_tokens_seen": 68874260, "step": 3197, "time_per_iteration": 2.664355516433716 }, { "auxiliary_loss_clip": 0.01146998, "auxiliary_loss_mlp": 0.01053385, "balance_loss_clip": 1.0537523, "balance_loss_mlp": 1.03360796, "epoch": 0.3845367642638129, "flos": 18113486446080.0, "grad_norm": 35.91432007914154, "language_loss": 0.84448302, "learning_rate": 2.8202518308133264e-06, "loss": 0.86648679, "num_input_tokens_seen": 68891535, "step": 3198, "time_per_iteration": 2.5898282527923584 }, { "auxiliary_loss_clip": 0.01163663, "auxiliary_loss_mlp": 0.01048884, "balance_loss_clip": 1.05145884, "balance_loss_mlp": 1.02899981, "epoch": 0.384657007154452, "flos": 25228395492480.0, "grad_norm": 1.9578885495062792, "language_loss": 0.73149168, "learning_rate": 2.8195413239527426e-06, "loss": 0.75361711, "num_input_tokens_seen": 68911275, "step": 3199, "time_per_iteration": 2.632903575897217 }, { "auxiliary_loss_clip": 0.01144635, "auxiliary_loss_mlp": 0.01049918, "balance_loss_clip": 1.04975343, "balance_loss_mlp": 1.02933073, "epoch": 0.38477725004509106, "flos": 19865855358720.0, "grad_norm": 1.959536910818404, "language_loss": 0.8054598, "learning_rate": 2.8188306927683906e-06, "loss": 0.82740533, "num_input_tokens_seen": 68930745, "step": 3200, "time_per_iteration": 2.6041476726531982 }, { "auxiliary_loss_clip": 0.01138383, "auxiliary_loss_mlp": 0.01050851, "balance_loss_clip": 1.05226529, "balance_loss_mlp": 1.02962017, "epoch": 0.38489749293573017, "flos": 18259391491200.0, "grad_norm": 2.5740044492248466, "language_loss": 0.74823308, "learning_rate": 2.818119937368074e-06, "loss": 0.77012545, "num_input_tokens_seen": 68949380, "step": 3201, "time_per_iteration": 2.6297855377197266 }, { "auxiliary_loss_clip": 0.0115646, "auxiliary_loss_mlp": 0.01050253, "balance_loss_clip": 1.05249524, "balance_loss_mlp": 1.02936745, "epoch": 0.3850177358263693, "flos": 24389163152640.0, "grad_norm": 1.9553377738856517, "language_loss": 0.65442801, "learning_rate": 2.817409057859613e-06, "loss": 0.67649508, "num_input_tokens_seen": 68968370, "step": 3202, "time_per_iteration": 2.653902530670166 }, { "auxiliary_loss_clip": 0.0110943, "auxiliary_loss_mlp": 0.01054389, "balance_loss_clip": 1.04813159, "balance_loss_mlp": 1.03243041, "epoch": 0.38513797871700833, "flos": 17671533505920.0, "grad_norm": 2.2735329475094352, "language_loss": 0.79131842, "learning_rate": 2.8166980543508482e-06, "loss": 0.81295657, "num_input_tokens_seen": 68984260, "step": 3203, "time_per_iteration": 2.7138755321502686 }, { "auxiliary_loss_clip": 0.0116382, "auxiliary_loss_mlp": 0.01045407, "balance_loss_clip": 1.05551803, "balance_loss_mlp": 1.02634573, "epoch": 0.38525822160764744, "flos": 25739583897600.0, "grad_norm": 2.3561706820124946, "language_loss": 0.79937041, "learning_rate": 2.815986926949638e-06, "loss": 0.82146269, "num_input_tokens_seen": 69002760, "step": 3204, "time_per_iteration": 2.605586528778076 }, { "auxiliary_loss_clip": 0.01150538, "auxiliary_loss_mlp": 0.01052868, "balance_loss_clip": 1.05313623, "balance_loss_mlp": 1.03328156, "epoch": 0.38537846449828655, "flos": 20193647898240.0, "grad_norm": 1.8138963522788636, "language_loss": 0.80374718, "learning_rate": 2.8152756757638597e-06, "loss": 0.82578129, "num_input_tokens_seen": 69021260, "step": 3205, "time_per_iteration": 2.6479694843292236 }, { "auxiliary_loss_clip": 0.01150463, "auxiliary_loss_mlp": 0.01053273, "balance_loss_clip": 1.05389094, "balance_loss_mlp": 1.03363895, "epoch": 0.3854987073889256, "flos": 23039352938880.0, "grad_norm": 2.438541042235779, "language_loss": 0.84514546, "learning_rate": 2.8145643009014093e-06, "loss": 0.86718285, "num_input_tokens_seen": 69039755, "step": 3206, "time_per_iteration": 2.668074369430542 }, { "auxiliary_loss_clip": 0.0114691, "auxiliary_loss_mlp": 0.0104819, "balance_loss_clip": 1.05129004, "balance_loss_mlp": 1.02982032, "epoch": 0.3856189502795647, "flos": 20190631155840.0, "grad_norm": 2.0669253585786436, "language_loss": 0.79590428, "learning_rate": 2.813852802470202e-06, "loss": 0.8178553, "num_input_tokens_seen": 69057650, "step": 3207, "time_per_iteration": 2.6431491374969482 }, { "auxiliary_loss_clip": 0.0113409, "auxiliary_loss_mlp": 0.01044917, "balance_loss_clip": 1.05025601, "balance_loss_mlp": 1.02461529, "epoch": 0.38573919317020383, "flos": 25702631781120.0, "grad_norm": 1.6769842214144401, "language_loss": 0.72584087, "learning_rate": 2.8131411805781717e-06, "loss": 0.74763095, "num_input_tokens_seen": 69077775, "step": 3208, "time_per_iteration": 3.627300500869751 }, { "auxiliary_loss_clip": 0.01138397, "auxiliary_loss_mlp": 0.01049777, "balance_loss_clip": 1.05120635, "balance_loss_mlp": 1.02887928, "epoch": 0.3858594360608429, "flos": 29821405628160.0, "grad_norm": 19.632958231606608, "language_loss": 0.64010167, "learning_rate": 2.8124294353332707e-06, "loss": 0.66198343, "num_input_tokens_seen": 69096450, "step": 3209, "time_per_iteration": 2.7174947261810303 }, { "auxiliary_loss_clip": 0.01129462, "auxiliary_loss_mlp": 0.01048628, "balance_loss_clip": 1.05010223, "balance_loss_mlp": 1.03078198, "epoch": 0.385979678951482, "flos": 24790428961920.0, "grad_norm": 1.6373563442543266, "language_loss": 0.77294087, "learning_rate": 2.8117175668434713e-06, "loss": 0.79472172, "num_input_tokens_seen": 69116110, "step": 3210, "time_per_iteration": 3.637132406234741 }, { "auxiliary_loss_clip": 0.01165215, "auxiliary_loss_mlp": 0.01040056, "balance_loss_clip": 1.05344629, "balance_loss_mlp": 1.02001691, "epoch": 0.3860999218421211, "flos": 21287881866240.0, "grad_norm": 2.2625202934246618, "language_loss": 0.69460702, "learning_rate": 2.811005575216762e-06, "loss": 0.71665978, "num_input_tokens_seen": 69134825, "step": 3211, "time_per_iteration": 2.5671825408935547 }, { "auxiliary_loss_clip": 0.01115102, "auxiliary_loss_mlp": 0.01049698, "balance_loss_clip": 1.04607129, "balance_loss_mlp": 1.03025484, "epoch": 0.38622016473276016, "flos": 24536720223360.0, "grad_norm": 1.5513203827239062, "language_loss": 0.79054999, "learning_rate": 2.8102934605611513e-06, "loss": 0.81219798, "num_input_tokens_seen": 69156460, "step": 3212, "time_per_iteration": 3.7154150009155273 }, { "auxiliary_loss_clip": 0.01144931, "auxiliary_loss_mlp": 0.01050011, "balance_loss_clip": 1.05307722, "balance_loss_mlp": 1.02901769, "epoch": 0.3863404076233993, "flos": 20558212986240.0, "grad_norm": 2.03400423942248, "language_loss": 0.66947782, "learning_rate": 2.8095812229846665e-06, "loss": 0.69142723, "num_input_tokens_seen": 69176420, "step": 3213, "time_per_iteration": 2.6138086318969727 }, { "auxiliary_loss_clip": 0.01137649, "auxiliary_loss_mlp": 0.01056802, "balance_loss_clip": 1.04991984, "balance_loss_mlp": 1.03565454, "epoch": 0.3864606505140384, "flos": 22346277039360.0, "grad_norm": 2.330904439162565, "language_loss": 0.68803978, "learning_rate": 2.808868862595355e-06, "loss": 0.7099843, "num_input_tokens_seen": 69196665, "step": 3214, "time_per_iteration": 2.669231414794922 }, { "auxiliary_loss_clip": 0.01154969, "auxiliary_loss_mlp": 0.01050762, "balance_loss_clip": 1.05493546, "balance_loss_mlp": 1.03158092, "epoch": 0.38658089340467744, "flos": 25703601448320.0, "grad_norm": 2.0828006725417194, "language_loss": 0.798509, "learning_rate": 2.8081563795012795e-06, "loss": 0.8205663, "num_input_tokens_seen": 69216290, "step": 3215, "time_per_iteration": 2.682488441467285 }, { "auxiliary_loss_clip": 0.01144255, "auxiliary_loss_mlp": 0.01043132, "balance_loss_clip": 1.05140066, "balance_loss_mlp": 1.02303338, "epoch": 0.38670113629531655, "flos": 33802534558080.0, "grad_norm": 1.7111719562208365, "language_loss": 0.73528647, "learning_rate": 2.807443773810524e-06, "loss": 0.75716043, "num_input_tokens_seen": 69237550, "step": 3216, "time_per_iteration": 2.782177209854126 }, { "auxiliary_loss_clip": 0.01127425, "auxiliary_loss_mlp": 0.0104845, "balance_loss_clip": 1.05295253, "balance_loss_mlp": 1.02855361, "epoch": 0.3868213791859556, "flos": 23331522165120.0, "grad_norm": 1.8816519461132115, "language_loss": 0.89746612, "learning_rate": 2.80673104563119e-06, "loss": 0.91922486, "num_input_tokens_seen": 69258175, "step": 3217, "time_per_iteration": 3.676966905593872 }, { "auxiliary_loss_clip": 0.01146772, "auxiliary_loss_mlp": 0.01048778, "balance_loss_clip": 1.0522548, "balance_loss_mlp": 1.02989554, "epoch": 0.3869416220765947, "flos": 18441530380800.0, "grad_norm": 1.9266158967854363, "language_loss": 0.78822553, "learning_rate": 2.8060181950713976e-06, "loss": 0.81018102, "num_input_tokens_seen": 69274965, "step": 3218, "time_per_iteration": 2.570096969604492 }, { "auxiliary_loss_clip": 0.0112541, "auxiliary_loss_mlp": 0.01075713, "balance_loss_clip": 1.05092239, "balance_loss_mlp": 1.05416012, "epoch": 0.3870618649672338, "flos": 15632992938240.0, "grad_norm": 2.2342939843095064, "language_loss": 0.80880547, "learning_rate": 2.805305222239286e-06, "loss": 0.83081669, "num_input_tokens_seen": 69292220, "step": 3219, "time_per_iteration": 2.684450387954712 }, { "auxiliary_loss_clip": 0.01136934, "auxiliary_loss_mlp": 0.01048406, "balance_loss_clip": 1.05404115, "balance_loss_mlp": 1.02914226, "epoch": 0.3871821078578729, "flos": 23513804709120.0, "grad_norm": 2.94561465719325, "language_loss": 0.73778951, "learning_rate": 2.8045921272430118e-06, "loss": 0.7596429, "num_input_tokens_seen": 69311900, "step": 3220, "time_per_iteration": 2.693847179412842 }, { "auxiliary_loss_clip": 0.0116016, "auxiliary_loss_mlp": 0.01064153, "balance_loss_clip": 1.05313337, "balance_loss_mlp": 1.0416348, "epoch": 0.387302350748512, "flos": 17778259791360.0, "grad_norm": 2.7101982910900664, "language_loss": 0.76715398, "learning_rate": 2.803878910190753e-06, "loss": 0.78939712, "num_input_tokens_seen": 69328820, "step": 3221, "time_per_iteration": 2.604706048965454 }, { "auxiliary_loss_clip": 0.0115513, "auxiliary_loss_mlp": 0.01047158, "balance_loss_clip": 1.05457222, "balance_loss_mlp": 1.02844214, "epoch": 0.3874225936391511, "flos": 11503409097600.0, "grad_norm": 2.6612491940361074, "language_loss": 0.82062972, "learning_rate": 2.8031655711907017e-06, "loss": 0.84265268, "num_input_tokens_seen": 69342525, "step": 3222, "time_per_iteration": 2.7162578105926514 }, { "auxiliary_loss_clip": 0.01157641, "auxiliary_loss_mlp": 0.01051999, "balance_loss_clip": 1.05635953, "balance_loss_mlp": 1.03213894, "epoch": 0.38754283652979016, "flos": 21945154884480.0, "grad_norm": 5.262451338995011, "language_loss": 0.8072958, "learning_rate": 2.8024521103510723e-06, "loss": 0.82939219, "num_input_tokens_seen": 69359295, "step": 3223, "time_per_iteration": 2.6050491333007812 }, { "auxiliary_loss_clip": 0.01147422, "auxiliary_loss_mlp": 0.01051262, "balance_loss_clip": 1.04935241, "balance_loss_mlp": 1.03198564, "epoch": 0.38766307942042927, "flos": 21175984022400.0, "grad_norm": 2.099981746153724, "language_loss": 0.75395787, "learning_rate": 2.8017385277800952e-06, "loss": 0.77594471, "num_input_tokens_seen": 69377650, "step": 3224, "time_per_iteration": 2.638763427734375 }, { "auxiliary_loss_clip": 0.01131121, "auxiliary_loss_mlp": 0.01048549, "balance_loss_clip": 1.05157554, "balance_loss_mlp": 1.02794981, "epoch": 0.3877833223110684, "flos": 27417294391680.0, "grad_norm": 2.1943845243177735, "language_loss": 0.75009274, "learning_rate": 2.8010248235860213e-06, "loss": 0.77188939, "num_input_tokens_seen": 69397765, "step": 3225, "time_per_iteration": 2.7224934101104736 }, { "auxiliary_loss_clip": 0.01050726, "auxiliary_loss_mlp": 0.00756717, "balance_loss_clip": 1.03261113, "balance_loss_mlp": 1.00098062, "epoch": 0.38790356520170743, "flos": 64500019879680.0, "grad_norm": 0.8344295979703304, "language_loss": 0.62764335, "learning_rate": 2.8003109978771192e-06, "loss": 0.64571786, "num_input_tokens_seen": 69458930, "step": 3226, "time_per_iteration": 3.282952308654785 }, { "auxiliary_loss_clip": 0.01114508, "auxiliary_loss_mlp": 0.01048288, "balance_loss_clip": 1.04554594, "balance_loss_mlp": 1.02717638, "epoch": 0.38802380809234654, "flos": 22345415112960.0, "grad_norm": 2.198392126533368, "language_loss": 0.79098439, "learning_rate": 2.799597050761674e-06, "loss": 0.8126123, "num_input_tokens_seen": 69475135, "step": 3227, "time_per_iteration": 2.7091948986053467 }, { "auxiliary_loss_clip": 0.01169742, "auxiliary_loss_mlp": 0.01048414, "balance_loss_clip": 1.05639172, "balance_loss_mlp": 1.02812445, "epoch": 0.38814405098298566, "flos": 25261361199360.0, "grad_norm": 2.008094661780721, "language_loss": 0.78709567, "learning_rate": 2.7988829823479924e-06, "loss": 0.80927718, "num_input_tokens_seen": 69493525, "step": 3228, "time_per_iteration": 2.6165454387664795 }, { "auxiliary_loss_clip": 0.01132271, "auxiliary_loss_mlp": 0.01057985, "balance_loss_clip": 1.0490067, "balance_loss_mlp": 1.0355854, "epoch": 0.3882642938736247, "flos": 18841180078080.0, "grad_norm": 2.1454021492889335, "language_loss": 0.63913804, "learning_rate": 2.7981687927443976e-06, "loss": 0.66104066, "num_input_tokens_seen": 69510325, "step": 3229, "time_per_iteration": 2.60971736907959 }, { "auxiliary_loss_clip": 0.01145302, "auxiliary_loss_mlp": 0.01045747, "balance_loss_clip": 1.04791403, "balance_loss_mlp": 1.02741289, "epoch": 0.3883845367642638, "flos": 21652806090240.0, "grad_norm": 2.4139806351885356, "language_loss": 0.85246968, "learning_rate": 2.797454482059231e-06, "loss": 0.87438017, "num_input_tokens_seen": 69530480, "step": 3230, "time_per_iteration": 2.6730730533599854 }, { "auxiliary_loss_clip": 0.01160932, "auxiliary_loss_mlp": 0.01044628, "balance_loss_clip": 1.0530479, "balance_loss_mlp": 1.02581644, "epoch": 0.3885047796549029, "flos": 20557530627840.0, "grad_norm": 2.097958948612708, "language_loss": 0.8426373, "learning_rate": 2.7967400504008537e-06, "loss": 0.86469293, "num_input_tokens_seen": 69549780, "step": 3231, "time_per_iteration": 2.5644989013671875 }, { "auxiliary_loss_clip": 0.01024063, "auxiliary_loss_mlp": 0.0100492, "balance_loss_clip": 1.03076029, "balance_loss_mlp": 1.00258374, "epoch": 0.388625022545542, "flos": 64325491695360.0, "grad_norm": 0.8844534074706606, "language_loss": 0.57480544, "learning_rate": 2.7960254978776456e-06, "loss": 0.59509528, "num_input_tokens_seen": 69611870, "step": 3232, "time_per_iteration": 3.2563514709472656 }, { "auxiliary_loss_clip": 0.01164448, "auxiliary_loss_mlp": 0.01048825, "balance_loss_clip": 1.05440664, "balance_loss_mlp": 1.02981162, "epoch": 0.3887452654361811, "flos": 18113881495680.0, "grad_norm": 1.9645793466571126, "language_loss": 0.81555307, "learning_rate": 2.7953108245980006e-06, "loss": 0.83768582, "num_input_tokens_seen": 69630385, "step": 3233, "time_per_iteration": 2.578876495361328 }, { "auxiliary_loss_clip": 0.01131103, "auxiliary_loss_mlp": 0.01050799, "balance_loss_clip": 1.05197382, "balance_loss_mlp": 1.03319132, "epoch": 0.38886550832682015, "flos": 24975261371520.0, "grad_norm": 1.5145744262950138, "language_loss": 0.73763102, "learning_rate": 2.7945960306703365e-06, "loss": 0.75945008, "num_input_tokens_seen": 69653370, "step": 3234, "time_per_iteration": 3.641115188598633 }, { "auxiliary_loss_clip": 0.01148907, "auxiliary_loss_mlp": 0.01050653, "balance_loss_clip": 1.05165613, "balance_loss_mlp": 1.03152025, "epoch": 0.38898575121745926, "flos": 27199496275200.0, "grad_norm": 2.0783706492100884, "language_loss": 0.65810549, "learning_rate": 2.7938811162030865e-06, "loss": 0.6801011, "num_input_tokens_seen": 69673635, "step": 3235, "time_per_iteration": 2.684255838394165 }, { "auxiliary_loss_clip": 0.01147742, "auxiliary_loss_mlp": 0.01043279, "balance_loss_clip": 1.05207849, "balance_loss_mlp": 1.02490914, "epoch": 0.3891059941080984, "flos": 28763728727040.0, "grad_norm": 6.880930474667925, "language_loss": 0.82157779, "learning_rate": 2.793166081304702e-06, "loss": 0.84348798, "num_input_tokens_seen": 69694130, "step": 3236, "time_per_iteration": 3.578671455383301 }, { "auxiliary_loss_clip": 0.01132645, "auxiliary_loss_mlp": 0.01045243, "balance_loss_clip": 1.05137157, "balance_loss_mlp": 1.02504873, "epoch": 0.38922623699873743, "flos": 22893447893760.0, "grad_norm": 1.8592453625095642, "language_loss": 0.82611632, "learning_rate": 2.7924509260836543e-06, "loss": 0.84789515, "num_input_tokens_seen": 69713255, "step": 3237, "time_per_iteration": 2.72060489654541 }, { "auxiliary_loss_clip": 0.01125154, "auxiliary_loss_mlp": 0.01050357, "balance_loss_clip": 1.0500021, "balance_loss_mlp": 1.03065133, "epoch": 0.38934647988937654, "flos": 19792418002560.0, "grad_norm": 1.6426834295110615, "language_loss": 0.68584049, "learning_rate": 2.791735650648431e-06, "loss": 0.70759559, "num_input_tokens_seen": 69732375, "step": 3238, "time_per_iteration": 3.709494113922119 }, { "auxiliary_loss_clip": 0.01135971, "auxiliary_loss_mlp": 0.0105076, "balance_loss_clip": 1.05015409, "balance_loss_mlp": 1.0308882, "epoch": 0.38946672278001565, "flos": 19202081978880.0, "grad_norm": 2.0418205629410786, "language_loss": 0.74511141, "learning_rate": 2.791020255107538e-06, "loss": 0.76697874, "num_input_tokens_seen": 69749745, "step": 3239, "time_per_iteration": 2.6325552463531494 }, { "auxiliary_loss_clip": 0.0111878, "auxiliary_loss_mlp": 0.01054987, "balance_loss_clip": 1.0456543, "balance_loss_mlp": 1.03369641, "epoch": 0.3895869656706547, "flos": 24936477661440.0, "grad_norm": 1.6716824871977616, "language_loss": 0.80623239, "learning_rate": 2.7903047395695023e-06, "loss": 0.82797009, "num_input_tokens_seen": 69769645, "step": 3240, "time_per_iteration": 2.746135711669922 }, { "auxiliary_loss_clip": 0.01148285, "auxiliary_loss_mlp": 0.00776659, "balance_loss_clip": 1.05102444, "balance_loss_mlp": 1.00070953, "epoch": 0.3897072085612938, "flos": 24133622820480.0, "grad_norm": 2.058747256948741, "language_loss": 0.90133268, "learning_rate": 2.789589104142865e-06, "loss": 0.92058206, "num_input_tokens_seen": 69787270, "step": 3241, "time_per_iteration": 2.637000322341919 }, { "auxiliary_loss_clip": 0.01126628, "auxiliary_loss_mlp": 0.01049867, "balance_loss_clip": 1.05289423, "balance_loss_mlp": 1.02980363, "epoch": 0.3898274514519329, "flos": 17166342672000.0, "grad_norm": 3.401292811856127, "language_loss": 0.76373804, "learning_rate": 2.7888733489361895e-06, "loss": 0.78550303, "num_input_tokens_seen": 69805685, "step": 3242, "time_per_iteration": 2.625628709793091 }, { "auxiliary_loss_clip": 0.010588, "auxiliary_loss_mlp": 0.01011151, "balance_loss_clip": 1.02326334, "balance_loss_mlp": 1.00908875, "epoch": 0.389947694342572, "flos": 66074807952000.0, "grad_norm": 0.7262092508480852, "language_loss": 0.586999, "learning_rate": 2.788157474058054e-06, "loss": 0.6076985, "num_input_tokens_seen": 69867960, "step": 3243, "time_per_iteration": 4.231140613555908 }, { "auxiliary_loss_clip": 0.01154537, "auxiliary_loss_mlp": 0.01042022, "balance_loss_clip": 1.05023229, "balance_loss_mlp": 1.0239501, "epoch": 0.3900679372332111, "flos": 25740912700800.0, "grad_norm": 1.558264647719786, "language_loss": 0.69896781, "learning_rate": 2.7874414796170555e-06, "loss": 0.72093344, "num_input_tokens_seen": 69889450, "step": 3244, "time_per_iteration": 2.6309170722961426 }, { "auxiliary_loss_clip": 0.01145166, "auxiliary_loss_mlp": 0.01044908, "balance_loss_clip": 1.0521363, "balance_loss_mlp": 1.02545261, "epoch": 0.3901881801238502, "flos": 11801611808640.0, "grad_norm": 6.0450243888931965, "language_loss": 0.83781946, "learning_rate": 2.7867253657218113e-06, "loss": 0.85972017, "num_input_tokens_seen": 69903340, "step": 3245, "time_per_iteration": 2.5855042934417725 }, { "auxiliary_loss_clip": 0.01136659, "auxiliary_loss_mlp": 0.0077572, "balance_loss_clip": 1.05163717, "balance_loss_mlp": 1.00079203, "epoch": 0.39030842301448926, "flos": 27308951994240.0, "grad_norm": 2.408828301319729, "language_loss": 0.73099124, "learning_rate": 2.7860091324809544e-06, "loss": 0.75011504, "num_input_tokens_seen": 69924400, "step": 3246, "time_per_iteration": 2.7298853397369385 }, { "auxiliary_loss_clip": 0.01146977, "auxiliary_loss_mlp": 0.01046785, "balance_loss_clip": 1.05279303, "balance_loss_mlp": 1.02775955, "epoch": 0.39042866590512837, "flos": 27163334257920.0, "grad_norm": 1.7770725687971949, "language_loss": 0.81329811, "learning_rate": 2.7852927800031377e-06, "loss": 0.83523571, "num_input_tokens_seen": 69944565, "step": 3247, "time_per_iteration": 2.6674797534942627 }, { "auxiliary_loss_clip": 0.0113627, "auxiliary_loss_mlp": 0.0104486, "balance_loss_clip": 1.04932761, "balance_loss_mlp": 1.02572751, "epoch": 0.3905489087957674, "flos": 29716115886720.0, "grad_norm": 1.9915686175749183, "language_loss": 0.82631946, "learning_rate": 2.7845763083970298e-06, "loss": 0.84813076, "num_input_tokens_seen": 69964965, "step": 3248, "time_per_iteration": 2.693777322769165 }, { "auxiliary_loss_clip": 0.01141239, "auxiliary_loss_mlp": 0.01050966, "balance_loss_clip": 1.04955816, "balance_loss_mlp": 1.0298301, "epoch": 0.39066915168640653, "flos": 24498618871680.0, "grad_norm": 2.008874220945911, "language_loss": 0.81782961, "learning_rate": 2.7838597177713205e-06, "loss": 0.83975172, "num_input_tokens_seen": 69986055, "step": 3249, "time_per_iteration": 2.656094789505005 }, { "auxiliary_loss_clip": 0.01089104, "auxiliary_loss_mlp": 0.01042644, "balance_loss_clip": 1.04906654, "balance_loss_mlp": 1.02393997, "epoch": 0.39078939457704565, "flos": 20558572122240.0, "grad_norm": 1.7703862752685182, "language_loss": 0.73842716, "learning_rate": 2.7831430082347143e-06, "loss": 0.75974464, "num_input_tokens_seen": 70005260, "step": 3250, "time_per_iteration": 2.752460479736328 }, { "auxiliary_loss_clip": 0.01152477, "auxiliary_loss_mlp": 0.00774009, "balance_loss_clip": 1.05517817, "balance_loss_mlp": 1.00089633, "epoch": 0.3909096374676847, "flos": 22783417557120.0, "grad_norm": 2.3095104476316535, "language_loss": 0.82287276, "learning_rate": 2.7824261798959373e-06, "loss": 0.84213758, "num_input_tokens_seen": 70023440, "step": 3251, "time_per_iteration": 2.6434552669525146 }, { "auxiliary_loss_clip": 0.0113522, "auxiliary_loss_mlp": 0.0105375, "balance_loss_clip": 1.04746366, "balance_loss_mlp": 1.03393769, "epoch": 0.3910298803583238, "flos": 23003119094400.0, "grad_norm": 2.9617643394752444, "language_loss": 0.79749215, "learning_rate": 2.78170923286373e-06, "loss": 0.81938189, "num_input_tokens_seen": 70043040, "step": 3252, "time_per_iteration": 2.679412841796875 }, { "auxiliary_loss_clip": 0.01080791, "auxiliary_loss_mlp": 0.01056047, "balance_loss_clip": 1.04990077, "balance_loss_mlp": 1.03530478, "epoch": 0.3911501232489629, "flos": 24316264500480.0, "grad_norm": 2.438579784535443, "language_loss": 0.83941329, "learning_rate": 2.780992167246854e-06, "loss": 0.86078167, "num_input_tokens_seen": 70060565, "step": 3253, "time_per_iteration": 2.783379554748535 }, { "auxiliary_loss_clip": 0.01039072, "auxiliary_loss_mlp": 0.01000554, "balance_loss_clip": 1.02100122, "balance_loss_mlp": 0.99818134, "epoch": 0.391270366139602, "flos": 60869054684160.0, "grad_norm": 0.9764161151332152, "language_loss": 0.72097826, "learning_rate": 2.7802749831540883e-06, "loss": 0.74137449, "num_input_tokens_seen": 70119465, "step": 3254, "time_per_iteration": 3.2680747509002686 }, { "auxiliary_loss_clip": 0.01111385, "auxiliary_loss_mlp": 0.01045771, "balance_loss_clip": 1.04866385, "balance_loss_mlp": 1.02866435, "epoch": 0.3913906090302411, "flos": 21543494025600.0, "grad_norm": 3.065175219862326, "language_loss": 0.82303429, "learning_rate": 2.7795576806942268e-06, "loss": 0.8446058, "num_input_tokens_seen": 70138270, "step": 3255, "time_per_iteration": 2.7126848697662354 }, { "auxiliary_loss_clip": 0.01029688, "auxiliary_loss_mlp": 0.01003661, "balance_loss_clip": 1.01852882, "balance_loss_mlp": 1.00168216, "epoch": 0.3915108519208802, "flos": 49839953702400.0, "grad_norm": 0.7560868051628089, "language_loss": 0.54842162, "learning_rate": 2.778840259976085e-06, "loss": 0.56875515, "num_input_tokens_seen": 70193500, "step": 3256, "time_per_iteration": 3.1881778240203857 }, { "auxiliary_loss_clip": 0.01153651, "auxiliary_loss_mlp": 0.01047487, "balance_loss_clip": 1.05348396, "balance_loss_mlp": 1.02719772, "epoch": 0.39163109481151925, "flos": 16506447960960.0, "grad_norm": 2.112500917660574, "language_loss": 0.77295274, "learning_rate": 2.778122721108495e-06, "loss": 0.79496413, "num_input_tokens_seen": 70211730, "step": 3257, "time_per_iteration": 2.621177911758423 }, { "auxiliary_loss_clip": 0.01141055, "auxiliary_loss_mlp": 0.01047312, "balance_loss_clip": 1.04865766, "balance_loss_mlp": 1.02771354, "epoch": 0.39175133770215836, "flos": 26067484177920.0, "grad_norm": 2.3170705850531474, "language_loss": 0.88415009, "learning_rate": 2.7774050642003076e-06, "loss": 0.90603375, "num_input_tokens_seen": 70232540, "step": 3258, "time_per_iteration": 2.659087896347046 }, { "auxiliary_loss_clip": 0.01165886, "auxiliary_loss_mlp": 0.01057671, "balance_loss_clip": 1.05537367, "balance_loss_mlp": 1.03738165, "epoch": 0.3918715805927975, "flos": 21872076664320.0, "grad_norm": 2.387513426515727, "language_loss": 0.9331634, "learning_rate": 2.7766872893603896e-06, "loss": 0.95539898, "num_input_tokens_seen": 70252515, "step": 3259, "time_per_iteration": 2.6528124809265137 }, { "auxiliary_loss_clip": 0.01149092, "auxiliary_loss_mlp": 0.01052606, "balance_loss_clip": 1.05205798, "balance_loss_mlp": 1.03386605, "epoch": 0.39199182348343653, "flos": 20376181837440.0, "grad_norm": 1.844443981788724, "language_loss": 0.73098451, "learning_rate": 2.7759693966976275e-06, "loss": 0.75300151, "num_input_tokens_seen": 70271020, "step": 3260, "time_per_iteration": 3.711592197418213 }, { "auxiliary_loss_clip": 0.01120566, "auxiliary_loss_mlp": 0.01049669, "balance_loss_clip": 1.04705882, "balance_loss_mlp": 1.02978456, "epoch": 0.39211206637407564, "flos": 21683545153920.0, "grad_norm": 4.13458440412003, "language_loss": 0.85261405, "learning_rate": 2.7752513863209242e-06, "loss": 0.87431639, "num_input_tokens_seen": 70289600, "step": 3261, "time_per_iteration": 2.796182632446289 }, { "auxiliary_loss_clip": 0.01128104, "auxiliary_loss_mlp": 0.00774048, "balance_loss_clip": 1.04782486, "balance_loss_mlp": 1.00090265, "epoch": 0.39223230926471475, "flos": 21066276908160.0, "grad_norm": 1.7532631615543648, "language_loss": 0.84408623, "learning_rate": 2.774533258339203e-06, "loss": 0.8631078, "num_input_tokens_seen": 70307060, "step": 3262, "time_per_iteration": 3.646141767501831 }, { "auxiliary_loss_clip": 0.01106331, "auxiliary_loss_mlp": 0.01067004, "balance_loss_clip": 1.04428816, "balance_loss_mlp": 1.04442585, "epoch": 0.3923525521553538, "flos": 17603016312960.0, "grad_norm": 2.511828207561571, "language_loss": 0.79699826, "learning_rate": 2.7738150128614014e-06, "loss": 0.81873161, "num_input_tokens_seen": 70324465, "step": 3263, "time_per_iteration": 2.744596242904663 }, { "auxiliary_loss_clip": 0.01111058, "auxiliary_loss_mlp": 0.0105816, "balance_loss_clip": 1.04521179, "balance_loss_mlp": 1.03698826, "epoch": 0.3924727950459929, "flos": 20558284813440.0, "grad_norm": 1.9204651749845312, "language_loss": 0.89517665, "learning_rate": 2.7730966499964777e-06, "loss": 0.91686881, "num_input_tokens_seen": 70341415, "step": 3264, "time_per_iteration": 2.758502960205078 }, { "auxiliary_loss_clip": 0.01163114, "auxiliary_loss_mlp": 0.01043266, "balance_loss_clip": 1.05196142, "balance_loss_mlp": 1.02316737, "epoch": 0.39259303793663197, "flos": 16216110328320.0, "grad_norm": 3.1494350808435585, "language_loss": 0.8023389, "learning_rate": 2.772378169853408e-06, "loss": 0.82440269, "num_input_tokens_seen": 70358985, "step": 3265, "time_per_iteration": 4.0331878662109375 }, { "auxiliary_loss_clip": 0.01123984, "auxiliary_loss_mlp": 0.01041856, "balance_loss_clip": 1.05084908, "balance_loss_mlp": 1.02308011, "epoch": 0.3927132808272711, "flos": 16797001075200.0, "grad_norm": 1.9349224184918699, "language_loss": 0.74033821, "learning_rate": 2.771659572541183e-06, "loss": 0.76199663, "num_input_tokens_seen": 70376915, "step": 3266, "time_per_iteration": 2.7582876682281494 }, { "auxiliary_loss_clip": 0.01154382, "auxiliary_loss_mlp": 0.01047122, "balance_loss_clip": 1.05641007, "balance_loss_mlp": 1.02817965, "epoch": 0.3928335237179102, "flos": 20267228908800.0, "grad_norm": 2.0888227731438165, "language_loss": 0.87095392, "learning_rate": 2.7709408581688143e-06, "loss": 0.89296889, "num_input_tokens_seen": 70396900, "step": 3267, "time_per_iteration": 2.6286795139312744 }, { "auxiliary_loss_clip": 0.01125812, "auxiliary_loss_mlp": 0.01052457, "balance_loss_clip": 1.04972231, "balance_loss_mlp": 1.0340029, "epoch": 0.39295376660854925, "flos": 24973250209920.0, "grad_norm": 2.116708438525721, "language_loss": 0.87864316, "learning_rate": 2.7702220268453307e-06, "loss": 0.90042585, "num_input_tokens_seen": 70417260, "step": 3268, "time_per_iteration": 2.8309099674224854 }, { "auxiliary_loss_clip": 0.01143492, "auxiliary_loss_mlp": 0.01044581, "balance_loss_clip": 1.05247521, "balance_loss_mlp": 1.02557945, "epoch": 0.39307400949918836, "flos": 18697788984960.0, "grad_norm": 2.2551856223425406, "language_loss": 0.84736234, "learning_rate": 2.7695030786797785e-06, "loss": 0.86924314, "num_input_tokens_seen": 70433155, "step": 3269, "time_per_iteration": 3.5514962673187256 }, { "auxiliary_loss_clip": 0.01108253, "auxiliary_loss_mlp": 0.01059673, "balance_loss_clip": 1.04772437, "balance_loss_mlp": 1.03692818, "epoch": 0.39319425238982747, "flos": 22415476590720.0, "grad_norm": 2.1690184628328124, "language_loss": 0.74199456, "learning_rate": 2.7687840137812206e-06, "loss": 0.76367384, "num_input_tokens_seen": 70451240, "step": 3270, "time_per_iteration": 2.7287790775299072 }, { "auxiliary_loss_clip": 0.01041745, "auxiliary_loss_mlp": 0.01009083, "balance_loss_clip": 1.01887178, "balance_loss_mlp": 1.00678277, "epoch": 0.3933144952804665, "flos": 66192954762240.0, "grad_norm": 0.7907041187636611, "language_loss": 0.6198647, "learning_rate": 2.7680648322587395e-06, "loss": 0.64037299, "num_input_tokens_seen": 70516115, "step": 3271, "time_per_iteration": 3.2420260906219482 }, { "auxiliary_loss_clip": 0.01158547, "auxiliary_loss_mlp": 0.01052202, "balance_loss_clip": 1.05220377, "balance_loss_mlp": 1.03252065, "epoch": 0.39343473817110564, "flos": 15487159720320.0, "grad_norm": 1.8873992146555034, "language_loss": 0.80826318, "learning_rate": 2.7673455342214334e-06, "loss": 0.83037066, "num_input_tokens_seen": 70533105, "step": 3272, "time_per_iteration": 2.583782434463501 }, { "auxiliary_loss_clip": 0.01149226, "auxiliary_loss_mlp": 0.01044016, "balance_loss_clip": 1.05434632, "balance_loss_mlp": 1.02500176, "epoch": 0.39355498106174475, "flos": 21324905809920.0, "grad_norm": 1.9854764232739397, "language_loss": 0.7587651, "learning_rate": 2.7666261197784198e-06, "loss": 0.78069758, "num_input_tokens_seen": 70551920, "step": 3273, "time_per_iteration": 2.6598732471466064 }, { "auxiliary_loss_clip": 0.01128647, "auxiliary_loss_mlp": 0.01057617, "balance_loss_clip": 1.04904509, "balance_loss_mlp": 1.03890085, "epoch": 0.3936752239523838, "flos": 13296357400320.0, "grad_norm": 2.155309463062258, "language_loss": 0.76206625, "learning_rate": 2.7659065890388336e-06, "loss": 0.78392887, "num_input_tokens_seen": 70567920, "step": 3274, "time_per_iteration": 2.624551296234131 }, { "auxiliary_loss_clip": 0.01134536, "auxiliary_loss_mlp": 0.01049397, "balance_loss_clip": 1.0492928, "balance_loss_mlp": 1.02946568, "epoch": 0.3937954668430229, "flos": 16800161472000.0, "grad_norm": 2.1992079594532883, "language_loss": 0.8460778, "learning_rate": 2.7651869421118266e-06, "loss": 0.86791712, "num_input_tokens_seen": 70584530, "step": 3275, "time_per_iteration": 2.609614133834839 }, { "auxiliary_loss_clip": 0.01153616, "auxiliary_loss_mlp": 0.01042278, "balance_loss_clip": 1.05692434, "balance_loss_mlp": 1.02233422, "epoch": 0.393915709733662, "flos": 21064229832960.0, "grad_norm": 1.9293625463276352, "language_loss": 0.82927394, "learning_rate": 2.76446717910657e-06, "loss": 0.85123289, "num_input_tokens_seen": 70605235, "step": 3276, "time_per_iteration": 2.665724992752075 }, { "auxiliary_loss_clip": 0.01144939, "auxiliary_loss_mlp": 0.01045938, "balance_loss_clip": 1.05046725, "balance_loss_mlp": 1.02688789, "epoch": 0.3940359526243011, "flos": 17165265264000.0, "grad_norm": 2.647030211574263, "language_loss": 0.76859725, "learning_rate": 2.763747300132249e-06, "loss": 0.79050601, "num_input_tokens_seen": 70622675, "step": 3277, "time_per_iteration": 2.6912620067596436 }, { "auxiliary_loss_clip": 0.01159276, "auxiliary_loss_mlp": 0.01049781, "balance_loss_clip": 1.05314767, "balance_loss_mlp": 1.03099382, "epoch": 0.3941561955149402, "flos": 20995856294400.0, "grad_norm": 3.1253472091218444, "language_loss": 0.86324346, "learning_rate": 2.7630273052980704e-06, "loss": 0.88533401, "num_input_tokens_seen": 70643265, "step": 3278, "time_per_iteration": 2.6555330753326416 }, { "auxiliary_loss_clip": 0.0112327, "auxiliary_loss_mlp": 0.01058258, "balance_loss_clip": 1.04707897, "balance_loss_mlp": 1.0359304, "epoch": 0.39427643840557924, "flos": 18843406721280.0, "grad_norm": 2.173978946389979, "language_loss": 0.67072773, "learning_rate": 2.7623071947132554e-06, "loss": 0.69254303, "num_input_tokens_seen": 70660295, "step": 3279, "time_per_iteration": 2.6355342864990234 }, { "auxiliary_loss_clip": 0.01145268, "auxiliary_loss_mlp": 0.01051622, "balance_loss_clip": 1.05165267, "balance_loss_mlp": 1.03142738, "epoch": 0.39439668129621835, "flos": 23258659426560.0, "grad_norm": 1.9210038437817667, "language_loss": 0.79126924, "learning_rate": 2.7615869684870458e-06, "loss": 0.81323814, "num_input_tokens_seen": 70679605, "step": 3280, "time_per_iteration": 2.694204568862915 }, { "auxiliary_loss_clip": 0.01147151, "auxiliary_loss_mlp": 0.01044314, "balance_loss_clip": 1.05163431, "balance_loss_mlp": 1.02540755, "epoch": 0.39451692418685746, "flos": 26652289507200.0, "grad_norm": 1.8714841914408888, "language_loss": 0.85039079, "learning_rate": 2.7608666267286986e-06, "loss": 0.87230539, "num_input_tokens_seen": 70699835, "step": 3281, "time_per_iteration": 2.6648170948028564 }, { "auxiliary_loss_clip": 0.01094928, "auxiliary_loss_mlp": 0.01058307, "balance_loss_clip": 1.0451107, "balance_loss_mlp": 1.03711176, "epoch": 0.3946371670774965, "flos": 18258709132800.0, "grad_norm": 2.2999010414188215, "language_loss": 0.86858952, "learning_rate": 2.760146169547489e-06, "loss": 0.89012194, "num_input_tokens_seen": 70716600, "step": 3282, "time_per_iteration": 2.7631733417510986 }, { "auxiliary_loss_clip": 0.01137654, "auxiliary_loss_mlp": 0.01061107, "balance_loss_clip": 1.05130303, "balance_loss_mlp": 1.0421412, "epoch": 0.39475740996813563, "flos": 24206126423040.0, "grad_norm": 1.8455732619079364, "language_loss": 0.76540017, "learning_rate": 2.75942559705271e-06, "loss": 0.78738779, "num_input_tokens_seen": 70736335, "step": 3283, "time_per_iteration": 2.6939480304718018 }, { "auxiliary_loss_clip": 0.01145012, "auxiliary_loss_mlp": 0.01045821, "balance_loss_clip": 1.05048239, "balance_loss_mlp": 1.02571082, "epoch": 0.39487765285877474, "flos": 19317858491520.0, "grad_norm": 3.716499666116972, "language_loss": 0.89149618, "learning_rate": 2.7587049093536713e-06, "loss": 0.91340446, "num_input_tokens_seen": 70752665, "step": 3284, "time_per_iteration": 2.60657000541687 }, { "auxiliary_loss_clip": 0.01150225, "auxiliary_loss_mlp": 0.01051581, "balance_loss_clip": 1.05253959, "balance_loss_mlp": 1.03371143, "epoch": 0.3949978957494138, "flos": 17311744926720.0, "grad_norm": 2.087316659158383, "language_loss": 0.80324024, "learning_rate": 2.757984106559701e-06, "loss": 0.82525826, "num_input_tokens_seen": 70771650, "step": 3285, "time_per_iteration": 2.569645404815674 }, { "auxiliary_loss_clip": 0.01128773, "auxiliary_loss_mlp": 0.01059379, "balance_loss_clip": 1.0504961, "balance_loss_mlp": 1.03874421, "epoch": 0.3951181386400529, "flos": 36317861280000.0, "grad_norm": 2.9612809247225975, "language_loss": 0.71180409, "learning_rate": 2.7572631887801446e-06, "loss": 0.73368561, "num_input_tokens_seen": 70793275, "step": 3286, "time_per_iteration": 3.660470485687256 }, { "auxiliary_loss_clip": 0.01146671, "auxiliary_loss_mlp": 0.0103954, "balance_loss_clip": 1.0492208, "balance_loss_mlp": 1.01923871, "epoch": 0.395238381530692, "flos": 23110348170240.0, "grad_norm": 6.056114464720737, "language_loss": 0.76541114, "learning_rate": 2.7565421561243654e-06, "loss": 0.78727317, "num_input_tokens_seen": 70811440, "step": 3287, "time_per_iteration": 2.6439316272735596 }, { "auxiliary_loss_clip": 0.0111608, "auxiliary_loss_mlp": 0.01050747, "balance_loss_clip": 1.04883885, "balance_loss_mlp": 1.03219771, "epoch": 0.3953586244213311, "flos": 24347614095360.0, "grad_norm": 2.3380526496432554, "language_loss": 0.81920493, "learning_rate": 2.7558210087017413e-06, "loss": 0.84087318, "num_input_tokens_seen": 70831375, "step": 3288, "time_per_iteration": 3.6207783222198486 }, { "auxiliary_loss_clip": 0.01115102, "auxiliary_loss_mlp": 0.01047688, "balance_loss_clip": 1.04839396, "balance_loss_mlp": 1.02750623, "epoch": 0.3954788673119702, "flos": 23440080044160.0, "grad_norm": 2.9353713413221545, "language_loss": 0.73420072, "learning_rate": 2.7550997466216724e-06, "loss": 0.75582862, "num_input_tokens_seen": 70849170, "step": 3289, "time_per_iteration": 2.677673101425171 }, { "auxiliary_loss_clip": 0.01127095, "auxiliary_loss_mlp": 0.01052518, "balance_loss_clip": 1.0516541, "balance_loss_mlp": 1.03412414, "epoch": 0.3955991102026093, "flos": 17494063384320.0, "grad_norm": 2.489683604148565, "language_loss": 0.8146525, "learning_rate": 2.7543783699935714e-06, "loss": 0.83644867, "num_input_tokens_seen": 70867200, "step": 3290, "time_per_iteration": 3.693568468093872 }, { "auxiliary_loss_clip": 0.0114447, "auxiliary_loss_mlp": 0.01047711, "balance_loss_clip": 1.05180836, "balance_loss_mlp": 1.02955556, "epoch": 0.39571935309324835, "flos": 18221326053120.0, "grad_norm": 3.1562404711276555, "language_loss": 0.86320317, "learning_rate": 2.753656878926872e-06, "loss": 0.88512492, "num_input_tokens_seen": 70883080, "step": 3291, "time_per_iteration": 2.5999038219451904 }, { "auxiliary_loss_clip": 0.01122521, "auxiliary_loss_mlp": 0.01043728, "balance_loss_clip": 1.04636717, "balance_loss_mlp": 1.02330697, "epoch": 0.39583959598388746, "flos": 17748813617280.0, "grad_norm": 1.9639165907681002, "language_loss": 0.74235952, "learning_rate": 2.752935273531023e-06, "loss": 0.76402205, "num_input_tokens_seen": 70901230, "step": 3292, "time_per_iteration": 2.6462526321411133 }, { "auxiliary_loss_clip": 0.01153126, "auxiliary_loss_mlp": 0.01043636, "balance_loss_clip": 1.05334818, "balance_loss_mlp": 1.02207136, "epoch": 0.39595983887452657, "flos": 19352368483200.0, "grad_norm": 2.4629100216081112, "language_loss": 0.78601438, "learning_rate": 2.752213553915492e-06, "loss": 0.80798197, "num_input_tokens_seen": 70919585, "step": 3293, "time_per_iteration": 2.624016761779785 }, { "auxiliary_loss_clip": 0.01040701, "auxiliary_loss_mlp": 0.01010898, "balance_loss_clip": 1.02546406, "balance_loss_mlp": 1.00808513, "epoch": 0.3960800817651656, "flos": 60682282940160.0, "grad_norm": 0.8166650888036876, "language_loss": 0.66068983, "learning_rate": 2.751491720189762e-06, "loss": 0.68120587, "num_input_tokens_seen": 70977695, "step": 3294, "time_per_iteration": 3.199326992034912 }, { "auxiliary_loss_clip": 0.01136225, "auxiliary_loss_mlp": 0.00775064, "balance_loss_clip": 1.05203199, "balance_loss_mlp": 1.00086284, "epoch": 0.39620032465580474, "flos": 16836718538880.0, "grad_norm": 2.343933694056807, "language_loss": 0.91636467, "learning_rate": 2.7507697724633364e-06, "loss": 0.93547761, "num_input_tokens_seen": 70994455, "step": 3295, "time_per_iteration": 3.561577081680298 }, { "auxiliary_loss_clip": 0.01020016, "auxiliary_loss_mlp": 0.01002726, "balance_loss_clip": 1.01884031, "balance_loss_mlp": 1.00084257, "epoch": 0.3963205675464438, "flos": 69071445941760.0, "grad_norm": 0.7803553308594783, "language_loss": 0.54671681, "learning_rate": 2.7500477108457327e-06, "loss": 0.56694424, "num_input_tokens_seen": 71046465, "step": 3296, "time_per_iteration": 3.0931315422058105 }, { "auxiliary_loss_clip": 0.0115063, "auxiliary_loss_mlp": 0.01055289, "balance_loss_clip": 1.05304933, "balance_loss_mlp": 1.03397393, "epoch": 0.3964408104370829, "flos": 25667439431040.0, "grad_norm": 1.9045994412302956, "language_loss": 0.80163741, "learning_rate": 2.7493255354464877e-06, "loss": 0.82369661, "num_input_tokens_seen": 71064275, "step": 3297, "time_per_iteration": 2.682398796081543 }, { "auxiliary_loss_clip": 0.01044125, "auxiliary_loss_mlp": 0.01052036, "balance_loss_clip": 1.03863442, "balance_loss_mlp": 1.03402364, "epoch": 0.396561053327722, "flos": 24277480790400.0, "grad_norm": 2.0426302750807483, "language_loss": 0.76194525, "learning_rate": 2.748603246375156e-06, "loss": 0.78290689, "num_input_tokens_seen": 71082290, "step": 3298, "time_per_iteration": 3.041475534439087 }, { "auxiliary_loss_clip": 0.0116122, "auxiliary_loss_mlp": 0.01045378, "balance_loss_clip": 1.05503249, "balance_loss_mlp": 1.02657902, "epoch": 0.39668129621836107, "flos": 20522302364160.0, "grad_norm": 3.0463809442893055, "language_loss": 0.69351041, "learning_rate": 2.7478808437413055e-06, "loss": 0.71557635, "num_input_tokens_seen": 71101700, "step": 3299, "time_per_iteration": 2.804522752761841 }, { "auxiliary_loss_clip": 0.01106834, "auxiliary_loss_mlp": 0.01048897, "balance_loss_clip": 1.04719353, "balance_loss_mlp": 1.03020477, "epoch": 0.3968015391090002, "flos": 27052585649280.0, "grad_norm": 2.2961510097497384, "language_loss": 0.66201615, "learning_rate": 2.7471583276545263e-06, "loss": 0.68357348, "num_input_tokens_seen": 71122360, "step": 3300, "time_per_iteration": 2.7813832759857178 }, { "auxiliary_loss_clip": 0.01137435, "auxiliary_loss_mlp": 0.01048495, "balance_loss_clip": 1.05080247, "balance_loss_mlp": 1.02908731, "epoch": 0.3969217819996393, "flos": 12531819392640.0, "grad_norm": 2.235050186003028, "language_loss": 0.70578671, "learning_rate": 2.7464356982244224e-06, "loss": 0.72764599, "num_input_tokens_seen": 71140360, "step": 3301, "time_per_iteration": 2.680187940597534 }, { "auxiliary_loss_clip": 0.01041289, "auxiliary_loss_mlp": 0.01009842, "balance_loss_clip": 1.02114129, "balance_loss_mlp": 1.00795829, "epoch": 0.39704202489027834, "flos": 66241399230720.0, "grad_norm": 0.7784248718230181, "language_loss": 0.61671615, "learning_rate": 2.745712955560617e-06, "loss": 0.63722742, "num_input_tokens_seen": 71196565, "step": 3302, "time_per_iteration": 3.1667914390563965 }, { "auxiliary_loss_clip": 0.01095892, "auxiliary_loss_mlp": 0.01053651, "balance_loss_clip": 1.04509628, "balance_loss_mlp": 1.03320694, "epoch": 0.39716226778091746, "flos": 16982982720000.0, "grad_norm": 5.1031508321727514, "language_loss": 0.77331001, "learning_rate": 2.7449900997727496e-06, "loss": 0.79480547, "num_input_tokens_seen": 71214675, "step": 3303, "time_per_iteration": 2.742841958999634 }, { "auxiliary_loss_clip": 0.01129161, "auxiliary_loss_mlp": 0.010437, "balance_loss_clip": 1.05217266, "balance_loss_mlp": 1.02670097, "epoch": 0.39728251067155657, "flos": 23477139901440.0, "grad_norm": 2.037004688340957, "language_loss": 0.84213591, "learning_rate": 2.744267130970476e-06, "loss": 0.86386454, "num_input_tokens_seen": 71234400, "step": 3304, "time_per_iteration": 2.6983070373535156 }, { "auxiliary_loss_clip": 0.01130165, "auxiliary_loss_mlp": 0.01055576, "balance_loss_clip": 1.05174637, "balance_loss_mlp": 1.03719401, "epoch": 0.3974027535621956, "flos": 20704441253760.0, "grad_norm": 1.734030052157573, "language_loss": 0.76839435, "learning_rate": 2.7435440492634697e-06, "loss": 0.79025173, "num_input_tokens_seen": 71253725, "step": 3305, "time_per_iteration": 2.696723461151123 }, { "auxiliary_loss_clip": 0.01138901, "auxiliary_loss_mlp": 0.01053643, "balance_loss_clip": 1.05034852, "balance_loss_mlp": 1.03182793, "epoch": 0.39752299645283473, "flos": 21543278544000.0, "grad_norm": 3.59082538789412, "language_loss": 0.67186731, "learning_rate": 2.7428208547614228e-06, "loss": 0.69379282, "num_input_tokens_seen": 71273220, "step": 3306, "time_per_iteration": 2.6800026893615723 }, { "auxiliary_loss_clip": 0.01149911, "auxiliary_loss_mlp": 0.01051643, "balance_loss_clip": 1.05210412, "balance_loss_mlp": 1.03186584, "epoch": 0.39764323934347384, "flos": 19208295031680.0, "grad_norm": 1.8389447013947142, "language_loss": 0.772856, "learning_rate": 2.742097547574043e-06, "loss": 0.79487157, "num_input_tokens_seen": 71291445, "step": 3307, "time_per_iteration": 2.653571605682373 }, { "auxiliary_loss_clip": 0.01142348, "auxiliary_loss_mlp": 0.00775748, "balance_loss_clip": 1.05245948, "balance_loss_mlp": 1.00095606, "epoch": 0.3977634822341129, "flos": 20850202644480.0, "grad_norm": 2.191811415178671, "language_loss": 0.77894849, "learning_rate": 2.7413741278110544e-06, "loss": 0.79812944, "num_input_tokens_seen": 71310135, "step": 3308, "time_per_iteration": 2.63883376121521 }, { "auxiliary_loss_clip": 0.01143754, "auxiliary_loss_mlp": 0.01053465, "balance_loss_clip": 1.05419588, "balance_loss_mlp": 1.03299725, "epoch": 0.397883725124752, "flos": 39786042038400.0, "grad_norm": 2.597090777154629, "language_loss": 0.68658149, "learning_rate": 2.7406505955822016e-06, "loss": 0.70855367, "num_input_tokens_seen": 71331160, "step": 3309, "time_per_iteration": 2.7806901931762695 }, { "auxiliary_loss_clip": 0.01133668, "auxiliary_loss_mlp": 0.01065794, "balance_loss_clip": 1.04868984, "balance_loss_mlp": 1.04447949, "epoch": 0.39800396801539106, "flos": 17379507934080.0, "grad_norm": 3.1282570584172493, "language_loss": 0.65850866, "learning_rate": 2.7399269509972415e-06, "loss": 0.68050325, "num_input_tokens_seen": 71345315, "step": 3310, "time_per_iteration": 2.624429225921631 }, { "auxiliary_loss_clip": 0.01127529, "auxiliary_loss_mlp": 0.0105561, "balance_loss_clip": 1.04533827, "balance_loss_mlp": 1.03390157, "epoch": 0.3981242109060302, "flos": 19202764337280.0, "grad_norm": 2.637828499615343, "language_loss": 0.85368073, "learning_rate": 2.7392031941659514e-06, "loss": 0.87551206, "num_input_tokens_seen": 71363160, "step": 3311, "time_per_iteration": 3.5413875579833984 }, { "auxiliary_loss_clip": 0.01138032, "auxiliary_loss_mlp": 0.01053041, "balance_loss_clip": 1.05383873, "balance_loss_mlp": 1.03414643, "epoch": 0.3982444537966693, "flos": 24565124903040.0, "grad_norm": 1.8418946918713262, "language_loss": 0.85747576, "learning_rate": 2.7384793251981244e-06, "loss": 0.87938643, "num_input_tokens_seen": 71382145, "step": 3312, "time_per_iteration": 2.6634998321533203 }, { "auxiliary_loss_clip": 0.0115702, "auxiliary_loss_mlp": 0.01048856, "balance_loss_clip": 1.05493605, "balance_loss_mlp": 1.02935338, "epoch": 0.39836469668730834, "flos": 26213856099840.0, "grad_norm": 2.0924000905134066, "language_loss": 0.80649233, "learning_rate": 2.737755344203571e-06, "loss": 0.82855105, "num_input_tokens_seen": 71402095, "step": 3313, "time_per_iteration": 2.6860029697418213 }, { "auxiliary_loss_clip": 0.01150607, "auxiliary_loss_mlp": 0.01047367, "balance_loss_clip": 1.0552671, "balance_loss_mlp": 1.02817416, "epoch": 0.39848493957794745, "flos": 27636134002560.0, "grad_norm": 4.001297269627677, "language_loss": 0.79633462, "learning_rate": 2.7370312512921186e-06, "loss": 0.81831437, "num_input_tokens_seen": 71423875, "step": 3314, "time_per_iteration": 3.6872029304504395 }, { "auxiliary_loss_clip": 0.01143141, "auxiliary_loss_mlp": 0.01051109, "balance_loss_clip": 1.05090308, "balance_loss_mlp": 1.03054512, "epoch": 0.39860518246858656, "flos": 12239326944000.0, "grad_norm": 4.4748023120141305, "language_loss": 0.77257025, "learning_rate": 2.736307046573611e-06, "loss": 0.79451275, "num_input_tokens_seen": 71439745, "step": 3315, "time_per_iteration": 2.6474545001983643 }, { "auxiliary_loss_clip": 0.01159851, "auxiliary_loss_mlp": 0.01051327, "balance_loss_clip": 1.05228376, "balance_loss_mlp": 1.03283739, "epoch": 0.3987254253592256, "flos": 22379135005440.0, "grad_norm": 2.1534672716096814, "language_loss": 0.81829, "learning_rate": 2.73558273015791e-06, "loss": 0.84040183, "num_input_tokens_seen": 71459575, "step": 3316, "time_per_iteration": 3.583815574645996 }, { "auxiliary_loss_clip": 0.01162394, "auxiliary_loss_mlp": 0.01057934, "balance_loss_clip": 1.05392003, "balance_loss_mlp": 1.03772783, "epoch": 0.3988456682498647, "flos": 23514020190720.0, "grad_norm": 2.0461587353440116, "language_loss": 0.70757759, "learning_rate": 2.734858302154894e-06, "loss": 0.72978091, "num_input_tokens_seen": 71481075, "step": 3317, "time_per_iteration": 2.6625890731811523 }, { "auxiliary_loss_clip": 0.01132026, "auxiliary_loss_mlp": 0.01047663, "balance_loss_clip": 1.05031395, "balance_loss_mlp": 1.02783895, "epoch": 0.39896591114050384, "flos": 19208761908480.0, "grad_norm": 2.4698544110210663, "language_loss": 0.76330829, "learning_rate": 2.734133762674457e-06, "loss": 0.78510517, "num_input_tokens_seen": 71500665, "step": 3318, "time_per_iteration": 2.5912232398986816 }, { "auxiliary_loss_clip": 0.01138421, "auxiliary_loss_mlp": 0.01051221, "balance_loss_clip": 1.05205679, "balance_loss_mlp": 1.03130138, "epoch": 0.3990861540311429, "flos": 28401031146240.0, "grad_norm": 2.2582516347172676, "language_loss": 0.70849764, "learning_rate": 2.7334091118265124e-06, "loss": 0.73039407, "num_input_tokens_seen": 71522560, "step": 3319, "time_per_iteration": 2.682166814804077 }, { "auxiliary_loss_clip": 0.01049396, "auxiliary_loss_mlp": 0.01011886, "balance_loss_clip": 1.02403212, "balance_loss_mlp": 1.00981188, "epoch": 0.399206396921782, "flos": 61758563086080.0, "grad_norm": 0.6892226365254794, "language_loss": 0.57816905, "learning_rate": 2.732684349720989e-06, "loss": 0.59878182, "num_input_tokens_seen": 71590520, "step": 3320, "time_per_iteration": 3.2090647220611572 }, { "auxiliary_loss_clip": 0.01123557, "auxiliary_loss_mlp": 0.01051091, "balance_loss_clip": 1.04937768, "balance_loss_mlp": 1.03137398, "epoch": 0.3993266398124211, "flos": 28074567409920.0, "grad_norm": 1.8490444397060963, "language_loss": 0.75254154, "learning_rate": 2.7319594764678318e-06, "loss": 0.77428806, "num_input_tokens_seen": 71612620, "step": 3321, "time_per_iteration": 3.5788323879241943 }, { "auxiliary_loss_clip": 0.01112583, "auxiliary_loss_mlp": 0.01047753, "balance_loss_clip": 1.04767442, "balance_loss_mlp": 1.0274632, "epoch": 0.39944688270306017, "flos": 23225083188480.0, "grad_norm": 2.0329201341968375, "language_loss": 0.83463049, "learning_rate": 2.7312344921770044e-06, "loss": 0.85623384, "num_input_tokens_seen": 71634320, "step": 3322, "time_per_iteration": 2.771393060684204 }, { "auxiliary_loss_clip": 0.01132635, "auxiliary_loss_mlp": 0.01044719, "balance_loss_clip": 1.0459466, "balance_loss_mlp": 1.02457249, "epoch": 0.3995671255936993, "flos": 19390433921280.0, "grad_norm": 2.2248498310088203, "language_loss": 0.7838366, "learning_rate": 2.7305093969584857e-06, "loss": 0.80561018, "num_input_tokens_seen": 71653145, "step": 3323, "time_per_iteration": 2.650614023208618 }, { "auxiliary_loss_clip": 0.0114087, "auxiliary_loss_mlp": 0.01051846, "balance_loss_clip": 1.04832625, "balance_loss_mlp": 1.03165197, "epoch": 0.3996873684843384, "flos": 23842638743040.0, "grad_norm": 1.678909945045306, "language_loss": 0.80082011, "learning_rate": 2.729784190922272e-06, "loss": 0.82274735, "num_input_tokens_seen": 71674580, "step": 3324, "time_per_iteration": 2.6489663124084473 }, { "auxiliary_loss_clip": 0.01035199, "auxiliary_loss_mlp": 0.01008708, "balance_loss_clip": 1.0198586, "balance_loss_mlp": 1.00664616, "epoch": 0.39980761137497745, "flos": 66576877280640.0, "grad_norm": 0.9634802125302266, "language_loss": 0.57150191, "learning_rate": 2.729058874178378e-06, "loss": 0.59194094, "num_input_tokens_seen": 71745260, "step": 3325, "time_per_iteration": 3.2983317375183105 }, { "auxiliary_loss_clip": 0.01139035, "auxiliary_loss_mlp": 0.01046046, "balance_loss_clip": 1.05152082, "balance_loss_mlp": 1.02681756, "epoch": 0.39992785426561656, "flos": 28549162834560.0, "grad_norm": 1.8654557401036942, "language_loss": 0.69198966, "learning_rate": 2.7283334468368315e-06, "loss": 0.71384043, "num_input_tokens_seen": 71766540, "step": 3326, "time_per_iteration": 2.714139938354492 }, { "auxiliary_loss_clip": 0.01067917, "auxiliary_loss_mlp": 0.01058619, "balance_loss_clip": 1.04023564, "balance_loss_mlp": 1.03503895, "epoch": 0.4000480971562556, "flos": 15049408671360.0, "grad_norm": 1.9454597090841454, "language_loss": 0.72632313, "learning_rate": 2.72760790900768e-06, "loss": 0.7475884, "num_input_tokens_seen": 71783125, "step": 3327, "time_per_iteration": 2.870119094848633 }, { "auxiliary_loss_clip": 0.01165563, "auxiliary_loss_mlp": 0.01057621, "balance_loss_clip": 1.05602658, "balance_loss_mlp": 1.03869057, "epoch": 0.4001683400468947, "flos": 23915609222400.0, "grad_norm": 2.143266922918539, "language_loss": 0.79060364, "learning_rate": 2.7268822608009875e-06, "loss": 0.81283545, "num_input_tokens_seen": 71802500, "step": 3328, "time_per_iteration": 3.709540605545044 }, { "auxiliary_loss_clip": 0.01130209, "auxiliary_loss_mlp": 0.01057904, "balance_loss_clip": 1.04927289, "balance_loss_mlp": 1.03765023, "epoch": 0.40028858293753383, "flos": 24352677912960.0, "grad_norm": 1.835417410211438, "language_loss": 0.78467083, "learning_rate": 2.726156502326834e-06, "loss": 0.80655193, "num_input_tokens_seen": 71823800, "step": 3329, "time_per_iteration": 2.9016730785369873 }, { "auxiliary_loss_clip": 0.01012397, "auxiliary_loss_mlp": 0.01008205, "balance_loss_clip": 1.02645886, "balance_loss_mlp": 1.00595188, "epoch": 0.4004088258281729, "flos": 66787025800320.0, "grad_norm": 0.6995120930202042, "language_loss": 0.60292405, "learning_rate": 2.725430633695316e-06, "loss": 0.62313008, "num_input_tokens_seen": 71886880, "step": 3330, "time_per_iteration": 3.6213433742523193 }, { "auxiliary_loss_clip": 0.01055448, "auxiliary_loss_mlp": 0.01006675, "balance_loss_clip": 1.021173, "balance_loss_mlp": 1.00483942, "epoch": 0.400529068718812, "flos": 58598386473600.0, "grad_norm": 1.886406846385169, "language_loss": 0.57923102, "learning_rate": 2.7247046550165485e-06, "loss": 0.59985232, "num_input_tokens_seen": 71939005, "step": 3331, "time_per_iteration": 3.246225357055664 }, { "auxiliary_loss_clip": 0.01166473, "auxiliary_loss_mlp": 0.01064383, "balance_loss_clip": 1.05642045, "balance_loss_mlp": 1.04488063, "epoch": 0.4006493116094511, "flos": 25377460934400.0, "grad_norm": 1.6104742700894605, "language_loss": 0.75815022, "learning_rate": 2.7239785664006606e-06, "loss": 0.78045875, "num_input_tokens_seen": 71962545, "step": 3332, "time_per_iteration": 2.8272171020507812 }, { "auxiliary_loss_clip": 0.01047738, "auxiliary_loss_mlp": 0.01005684, "balance_loss_clip": 1.02196908, "balance_loss_mlp": 1.00368142, "epoch": 0.40076955450009016, "flos": 60280729822080.0, "grad_norm": 0.7781245973570016, "language_loss": 0.61814654, "learning_rate": 2.7232523679578002e-06, "loss": 0.63868076, "num_input_tokens_seen": 72025625, "step": 3333, "time_per_iteration": 3.2880234718322754 }, { "auxiliary_loss_clip": 0.01146925, "auxiliary_loss_mlp": 0.01047355, "balance_loss_clip": 1.05293989, "balance_loss_mlp": 1.02866352, "epoch": 0.4008897973907293, "flos": 16617268396800.0, "grad_norm": 2.445382868723667, "language_loss": 0.79306376, "learning_rate": 2.7225260597981295e-06, "loss": 0.81500655, "num_input_tokens_seen": 72043330, "step": 3334, "time_per_iteration": 2.7150774002075195 }, { "auxiliary_loss_clip": 0.01121581, "auxiliary_loss_mlp": 0.00776288, "balance_loss_clip": 1.04953229, "balance_loss_mlp": 1.00115764, "epoch": 0.4010100402813684, "flos": 15377344865280.0, "grad_norm": 2.5129205651639714, "language_loss": 0.78785348, "learning_rate": 2.721799642031831e-06, "loss": 0.80683219, "num_input_tokens_seen": 72059500, "step": 3335, "time_per_iteration": 2.8411972522735596 }, { "auxiliary_loss_clip": 0.01141797, "auxiliary_loss_mlp": 0.01056869, "balance_loss_clip": 1.04979324, "balance_loss_mlp": 1.0380342, "epoch": 0.40113028317200744, "flos": 13298835438720.0, "grad_norm": 2.690406701906372, "language_loss": 0.77703965, "learning_rate": 2.721073114769101e-06, "loss": 0.79902625, "num_input_tokens_seen": 72077175, "step": 3336, "time_per_iteration": 3.740947723388672 }, { "auxiliary_loss_clip": 0.01117975, "auxiliary_loss_mlp": 0.01046471, "balance_loss_clip": 1.04930568, "balance_loss_mlp": 1.02667022, "epoch": 0.40125052606264655, "flos": 20668027841280.0, "grad_norm": 2.0680891235506422, "language_loss": 0.74904454, "learning_rate": 2.7203464781201523e-06, "loss": 0.77068901, "num_input_tokens_seen": 72096490, "step": 3337, "time_per_iteration": 2.750037431716919 }, { "auxiliary_loss_clip": 0.01162228, "auxiliary_loss_mlp": 0.01043965, "balance_loss_clip": 1.05456662, "balance_loss_mlp": 1.02551091, "epoch": 0.40137076895328566, "flos": 24607679541120.0, "grad_norm": 2.5032527093063246, "language_loss": 0.78305399, "learning_rate": 2.719619732195215e-06, "loss": 0.80511594, "num_input_tokens_seen": 72118130, "step": 3338, "time_per_iteration": 2.678718328475952 }, { "auxiliary_loss_clip": 0.01126824, "auxiliary_loss_mlp": 0.0104705, "balance_loss_clip": 1.05051279, "balance_loss_mlp": 1.028108, "epoch": 0.4014910118439247, "flos": 24206593299840.0, "grad_norm": 1.5053513067292834, "language_loss": 0.72436607, "learning_rate": 2.7188928771045377e-06, "loss": 0.74610484, "num_input_tokens_seen": 72139450, "step": 3339, "time_per_iteration": 4.03430700302124 }, { "auxiliary_loss_clip": 0.01115997, "auxiliary_loss_mlp": 0.01047011, "balance_loss_clip": 1.04621065, "balance_loss_mlp": 1.02778232, "epoch": 0.4016112547345638, "flos": 26725080418560.0, "grad_norm": 1.800729048236489, "language_loss": 0.80045056, "learning_rate": 2.7181659129583815e-06, "loss": 0.82208061, "num_input_tokens_seen": 72159040, "step": 3340, "time_per_iteration": 2.803443431854248 }, { "auxiliary_loss_clip": 0.01127562, "auxiliary_loss_mlp": 0.01048265, "balance_loss_clip": 1.0475179, "balance_loss_mlp": 1.02861953, "epoch": 0.4017314976252029, "flos": 21288025520640.0, "grad_norm": 1.6407351129770364, "language_loss": 0.76076156, "learning_rate": 2.7174388398670276e-06, "loss": 0.78251988, "num_input_tokens_seen": 72178220, "step": 3341, "time_per_iteration": 4.1112024784088135 }, { "auxiliary_loss_clip": 0.01160161, "auxiliary_loss_mlp": 0.01044861, "balance_loss_clip": 1.05069613, "balance_loss_mlp": 1.02517939, "epoch": 0.401851740515842, "flos": 25484690010240.0, "grad_norm": 2.216644372415218, "language_loss": 0.9232415, "learning_rate": 2.716711657940773e-06, "loss": 0.94529176, "num_input_tokens_seen": 72199230, "step": 3342, "time_per_iteration": 2.677361249923706 }, { "auxiliary_loss_clip": 0.01020111, "auxiliary_loss_mlp": 0.01008097, "balance_loss_clip": 1.01384616, "balance_loss_mlp": 1.00618923, "epoch": 0.4019719834064811, "flos": 55395334978560.0, "grad_norm": 0.8240509400650288, "language_loss": 0.56457824, "learning_rate": 2.7159843672899284e-06, "loss": 0.58486038, "num_input_tokens_seen": 72263430, "step": 3343, "time_per_iteration": 3.428776264190674 }, { "auxiliary_loss_clip": 0.0114981, "auxiliary_loss_mlp": 0.01046971, "balance_loss_clip": 1.052858, "balance_loss_mlp": 1.02694416, "epoch": 0.40209222629712016, "flos": 18180100218240.0, "grad_norm": 1.9711653480632674, "language_loss": 0.81414437, "learning_rate": 2.715256968024825e-06, "loss": 0.8361122, "num_input_tokens_seen": 72280505, "step": 3344, "time_per_iteration": 2.655846357345581 }, { "auxiliary_loss_clip": 0.01140136, "auxiliary_loss_mlp": 0.01054968, "balance_loss_clip": 1.05210483, "balance_loss_mlp": 1.03593016, "epoch": 0.40221246918775927, "flos": 25961009287680.0, "grad_norm": 1.6522988259034141, "language_loss": 0.82207441, "learning_rate": 2.7145294602558083e-06, "loss": 0.84402549, "num_input_tokens_seen": 72301215, "step": 3345, "time_per_iteration": 2.7933688163757324 }, { "auxiliary_loss_clip": 0.01149977, "auxiliary_loss_mlp": 0.01046661, "balance_loss_clip": 1.05029225, "balance_loss_mlp": 1.02733731, "epoch": 0.4023327120783984, "flos": 33838912056960.0, "grad_norm": 1.9305348882260487, "language_loss": 0.71299165, "learning_rate": 2.713801844093241e-06, "loss": 0.73495805, "num_input_tokens_seen": 72322365, "step": 3346, "time_per_iteration": 2.749516248703003 }, { "auxiliary_loss_clip": 0.01151896, "auxiliary_loss_mlp": 0.01042832, "balance_loss_clip": 1.05568337, "balance_loss_mlp": 1.02428317, "epoch": 0.40245295496903744, "flos": 26900252069760.0, "grad_norm": 7.559038687157685, "language_loss": 0.88571835, "learning_rate": 2.7130741196475014e-06, "loss": 0.90766561, "num_input_tokens_seen": 72340495, "step": 3347, "time_per_iteration": 3.6367058753967285 }, { "auxiliary_loss_clip": 0.01138862, "auxiliary_loss_mlp": 0.01052455, "balance_loss_clip": 1.05042219, "balance_loss_mlp": 1.03171277, "epoch": 0.40257319785967655, "flos": 36902738436480.0, "grad_norm": 2.04812265399276, "language_loss": 0.79264504, "learning_rate": 2.7123462870289848e-06, "loss": 0.81455815, "num_input_tokens_seen": 72360545, "step": 3348, "time_per_iteration": 2.8462512493133545 }, { "auxiliary_loss_clip": 0.01136927, "auxiliary_loss_mlp": 0.01048874, "balance_loss_clip": 1.04896188, "balance_loss_mlp": 1.02922821, "epoch": 0.40269344075031566, "flos": 24353180703360.0, "grad_norm": 2.601343006756173, "language_loss": 0.81512088, "learning_rate": 2.711618346348102e-06, "loss": 0.83697891, "num_input_tokens_seen": 72381070, "step": 3349, "time_per_iteration": 2.7259719371795654 }, { "auxiliary_loss_clip": 0.01123984, "auxiliary_loss_mlp": 0.01056753, "balance_loss_clip": 1.0476656, "balance_loss_mlp": 1.03779817, "epoch": 0.4028136836409547, "flos": 14389657614720.0, "grad_norm": 1.723136753043678, "language_loss": 0.63647473, "learning_rate": 2.7108902977152825e-06, "loss": 0.65828204, "num_input_tokens_seen": 72398970, "step": 3350, "time_per_iteration": 2.8701767921447754 }, { "auxiliary_loss_clip": 0.01143824, "auxiliary_loss_mlp": 0.01055286, "balance_loss_clip": 1.05024493, "balance_loss_mlp": 1.03599811, "epoch": 0.4029339265315938, "flos": 26136037284480.0, "grad_norm": 2.283548472924911, "language_loss": 0.75358897, "learning_rate": 2.7101621412409704e-06, "loss": 0.77558011, "num_input_tokens_seen": 72418455, "step": 3351, "time_per_iteration": 2.682039260864258 }, { "auxiliary_loss_clip": 0.01162745, "auxiliary_loss_mlp": 0.01058994, "balance_loss_clip": 1.05231094, "balance_loss_mlp": 1.04021871, "epoch": 0.40305416942223293, "flos": 23256325042560.0, "grad_norm": 2.2503783562426034, "language_loss": 0.85787523, "learning_rate": 2.7094338770356256e-06, "loss": 0.88009262, "num_input_tokens_seen": 72437540, "step": 3352, "time_per_iteration": 2.7422242164611816 }, { "auxiliary_loss_clip": 0.01130924, "auxiliary_loss_mlp": 0.01055801, "balance_loss_clip": 1.0491538, "balance_loss_mlp": 1.03592932, "epoch": 0.403174412312872, "flos": 27089645506560.0, "grad_norm": 2.2061276965830268, "language_loss": 0.6425634, "learning_rate": 2.708705505209726e-06, "loss": 0.66443062, "num_input_tokens_seen": 72458315, "step": 3353, "time_per_iteration": 2.9390580654144287 }, { "auxiliary_loss_clip": 0.01102915, "auxiliary_loss_mlp": 0.01055207, "balance_loss_clip": 1.0453136, "balance_loss_mlp": 1.03408313, "epoch": 0.4032946552035111, "flos": 21756336065280.0, "grad_norm": 2.4711188146887633, "language_loss": 0.91699338, "learning_rate": 2.7079770258737646e-06, "loss": 0.93857461, "num_input_tokens_seen": 72476225, "step": 3354, "time_per_iteration": 2.767934799194336 }, { "auxiliary_loss_clip": 0.01116296, "auxiliary_loss_mlp": 0.01062242, "balance_loss_clip": 1.04469585, "balance_loss_mlp": 1.04177356, "epoch": 0.4034148980941502, "flos": 17343956448000.0, "grad_norm": 2.407015331862502, "language_loss": 0.75347221, "learning_rate": 2.707248439138251e-06, "loss": 0.77525753, "num_input_tokens_seen": 72492460, "step": 3355, "time_per_iteration": 2.704030990600586 }, { "auxiliary_loss_clip": 0.01133277, "auxiliary_loss_mlp": 0.01054664, "balance_loss_clip": 1.05281222, "balance_loss_mlp": 1.03575754, "epoch": 0.40353514098478926, "flos": 22017838055040.0, "grad_norm": 1.7661034802000293, "language_loss": 0.65050012, "learning_rate": 2.7065197451137114e-06, "loss": 0.67237949, "num_input_tokens_seen": 72513840, "step": 3356, "time_per_iteration": 2.855675220489502 }, { "auxiliary_loss_clip": 0.0113399, "auxiliary_loss_mlp": 0.01046309, "balance_loss_clip": 1.04841256, "balance_loss_mlp": 1.02742589, "epoch": 0.4036553838754284, "flos": 14246446089600.0, "grad_norm": 3.422717548933129, "language_loss": 0.67288435, "learning_rate": 2.7057909439106894e-06, "loss": 0.69468737, "num_input_tokens_seen": 72531695, "step": 3357, "time_per_iteration": 2.7955896854400635 }, { "auxiliary_loss_clip": 0.01139325, "auxiliary_loss_mlp": 0.00775876, "balance_loss_clip": 1.04921246, "balance_loss_mlp": 1.00121522, "epoch": 0.40377562676606743, "flos": 24790644443520.0, "grad_norm": 2.1713777035961854, "language_loss": 0.78424501, "learning_rate": 2.7050620356397417e-06, "loss": 0.803397, "num_input_tokens_seen": 72550645, "step": 3358, "time_per_iteration": 2.75260853767395 }, { "auxiliary_loss_clip": 0.01154112, "auxiliary_loss_mlp": 0.01043918, "balance_loss_clip": 1.0521152, "balance_loss_mlp": 1.02628708, "epoch": 0.40389586965670654, "flos": 24061226958720.0, "grad_norm": 1.8114522629887349, "language_loss": 0.72497928, "learning_rate": 2.7043330204114437e-06, "loss": 0.74695957, "num_input_tokens_seen": 72569355, "step": 3359, "time_per_iteration": 2.835797071456909 }, { "auxiliary_loss_clip": 0.01156018, "auxiliary_loss_mlp": 0.01048547, "balance_loss_clip": 1.05133724, "balance_loss_mlp": 1.03005743, "epoch": 0.40401611254734565, "flos": 16399613934720.0, "grad_norm": 2.02735985808142, "language_loss": 0.85499966, "learning_rate": 2.7036038983363862e-06, "loss": 0.87704527, "num_input_tokens_seen": 72585960, "step": 3360, "time_per_iteration": 2.6674156188964844 }, { "auxiliary_loss_clip": 0.01141672, "auxiliary_loss_mlp": 0.01057714, "balance_loss_clip": 1.04967821, "balance_loss_mlp": 1.03905785, "epoch": 0.4041363554379847, "flos": 23988220565760.0, "grad_norm": 1.839975021249097, "language_loss": 0.84294885, "learning_rate": 2.702874669525177e-06, "loss": 0.86494273, "num_input_tokens_seen": 72604440, "step": 3361, "time_per_iteration": 2.826188564300537 }, { "auxiliary_loss_clip": 0.01121353, "auxiliary_loss_mlp": 0.0105705, "balance_loss_clip": 1.04950881, "balance_loss_mlp": 1.03754759, "epoch": 0.4042565983286238, "flos": 28401964899840.0, "grad_norm": 2.1541241821564046, "language_loss": 0.69802535, "learning_rate": 2.7021453340884394e-06, "loss": 0.71980935, "num_input_tokens_seen": 72622165, "step": 3362, "time_per_iteration": 2.8915774822235107 }, { "auxiliary_loss_clip": 0.01122041, "auxiliary_loss_mlp": 0.00774425, "balance_loss_clip": 1.04696965, "balance_loss_mlp": 1.00116384, "epoch": 0.40437684121926293, "flos": 17710963660800.0, "grad_norm": 2.2013198237928986, "language_loss": 0.73073703, "learning_rate": 2.7014158921368125e-06, "loss": 0.74970174, "num_input_tokens_seen": 72640490, "step": 3363, "time_per_iteration": 3.9027795791625977 }, { "auxiliary_loss_clip": 0.01160505, "auxiliary_loss_mlp": 0.01041808, "balance_loss_clip": 1.05439901, "balance_loss_mlp": 1.02306807, "epoch": 0.404497084109902, "flos": 24018959629440.0, "grad_norm": 2.4962227654616234, "language_loss": 0.85317278, "learning_rate": 2.700686343780953e-06, "loss": 0.87519592, "num_input_tokens_seen": 72660360, "step": 3364, "time_per_iteration": 2.6744275093078613 }, { "auxiliary_loss_clip": 0.01134273, "auxiliary_loss_mlp": 0.010512, "balance_loss_clip": 1.04839563, "balance_loss_mlp": 1.03216267, "epoch": 0.4046173270005411, "flos": 22929861306240.0, "grad_norm": 2.1963403566592836, "language_loss": 0.88654137, "learning_rate": 2.699956689131532e-06, "loss": 0.90839612, "num_input_tokens_seen": 72680345, "step": 3365, "time_per_iteration": 2.6955816745758057 }, { "auxiliary_loss_clip": 0.01137503, "auxiliary_loss_mlp": 0.01044362, "balance_loss_clip": 1.05111444, "balance_loss_mlp": 1.02458501, "epoch": 0.4047375698911802, "flos": 20668135582080.0, "grad_norm": 3.3937077104369826, "language_loss": 0.85378355, "learning_rate": 2.699226928299238e-06, "loss": 0.87560219, "num_input_tokens_seen": 72698365, "step": 3366, "time_per_iteration": 3.621615171432495 }, { "auxiliary_loss_clip": 0.0114911, "auxiliary_loss_mlp": 0.01044953, "balance_loss_clip": 1.05141306, "balance_loss_mlp": 1.02642798, "epoch": 0.40485781278181926, "flos": 28912865996160.0, "grad_norm": 2.3608052434088664, "language_loss": 0.792665, "learning_rate": 2.698497061394774e-06, "loss": 0.81460559, "num_input_tokens_seen": 72716850, "step": 3367, "time_per_iteration": 2.795379161834717 }, { "auxiliary_loss_clip": 0.01133754, "auxiliary_loss_mlp": 0.00775353, "balance_loss_clip": 1.05346131, "balance_loss_mlp": 1.00124359, "epoch": 0.40497805567245837, "flos": 23148377694720.0, "grad_norm": 1.7872928246976487, "language_loss": 0.80806899, "learning_rate": 2.6977670885288627e-06, "loss": 0.82716012, "num_input_tokens_seen": 72738250, "step": 3368, "time_per_iteration": 3.992384910583496 }, { "auxiliary_loss_clip": 0.0112319, "auxiliary_loss_mlp": 0.01039634, "balance_loss_clip": 1.0474447, "balance_loss_mlp": 1.0205369, "epoch": 0.4050982985630975, "flos": 16289404030080.0, "grad_norm": 2.5934600850978313, "language_loss": 0.75086761, "learning_rate": 2.6970370098122378e-06, "loss": 0.77249587, "num_input_tokens_seen": 72755235, "step": 3369, "time_per_iteration": 2.7069435119628906 }, { "auxiliary_loss_clip": 0.01160809, "auxiliary_loss_mlp": 0.01044424, "balance_loss_clip": 1.0535754, "balance_loss_mlp": 1.02424145, "epoch": 0.40521854145373654, "flos": 34459484353920.0, "grad_norm": 1.8954298158528604, "language_loss": 0.86558557, "learning_rate": 2.6963068253556535e-06, "loss": 0.88763785, "num_input_tokens_seen": 72776620, "step": 3370, "time_per_iteration": 2.756805658340454 }, { "auxiliary_loss_clip": 0.01154624, "auxiliary_loss_mlp": 0.01052725, "balance_loss_clip": 1.05184174, "balance_loss_mlp": 1.03186321, "epoch": 0.40533878434437565, "flos": 25331099454720.0, "grad_norm": 2.047965018505182, "language_loss": 0.85695982, "learning_rate": 2.6955765352698763e-06, "loss": 0.87903327, "num_input_tokens_seen": 72796765, "step": 3371, "time_per_iteration": 2.751553773880005 }, { "auxiliary_loss_clip": 0.01166516, "auxiliary_loss_mlp": 0.01052032, "balance_loss_clip": 1.0545764, "balance_loss_mlp": 1.03219533, "epoch": 0.40545902723501476, "flos": 15012061505280.0, "grad_norm": 2.0603381958490887, "language_loss": 0.73354208, "learning_rate": 2.6948461396656923e-06, "loss": 0.75572753, "num_input_tokens_seen": 72814175, "step": 3372, "time_per_iteration": 2.654996633529663 }, { "auxiliary_loss_clip": 0.01155146, "auxiliary_loss_mlp": 0.0105207, "balance_loss_clip": 1.05592513, "balance_loss_mlp": 1.03207862, "epoch": 0.4055792701256538, "flos": 25521103422720.0, "grad_norm": 4.0306458506964775, "language_loss": 0.74365175, "learning_rate": 2.6941156386539013e-06, "loss": 0.76572388, "num_input_tokens_seen": 72834125, "step": 3373, "time_per_iteration": 3.6007187366485596 }, { "auxiliary_loss_clip": 0.01130674, "auxiliary_loss_mlp": 0.01064748, "balance_loss_clip": 1.05170631, "balance_loss_mlp": 1.04488766, "epoch": 0.4056995130162929, "flos": 19574583972480.0, "grad_norm": 1.9925927637192784, "language_loss": 0.81022173, "learning_rate": 2.6933850323453203e-06, "loss": 0.83217597, "num_input_tokens_seen": 72852570, "step": 3374, "time_per_iteration": 2.71124267578125 }, { "auxiliary_loss_clip": 0.01162344, "auxiliary_loss_mlp": 0.01043607, "balance_loss_clip": 1.05515528, "balance_loss_mlp": 1.02528477, "epoch": 0.405819755906932, "flos": 15413794191360.0, "grad_norm": 2.184264521290178, "language_loss": 0.75057375, "learning_rate": 2.6926543208507806e-06, "loss": 0.77263325, "num_input_tokens_seen": 72871250, "step": 3375, "time_per_iteration": 2.5772812366485596 }, { "auxiliary_loss_clip": 0.01151352, "auxiliary_loss_mlp": 0.01049415, "balance_loss_clip": 1.05287266, "balance_loss_mlp": 1.02869678, "epoch": 0.4059399987975711, "flos": 21433930565760.0, "grad_norm": 3.6377060578242353, "language_loss": 0.80314302, "learning_rate": 2.6919235042811316e-06, "loss": 0.82515073, "num_input_tokens_seen": 72890035, "step": 3376, "time_per_iteration": 2.6979880332946777 }, { "auxiliary_loss_clip": 0.01120349, "auxiliary_loss_mlp": 0.0105247, "balance_loss_clip": 1.04737616, "balance_loss_mlp": 1.02974844, "epoch": 0.4060602416882102, "flos": 25556942217600.0, "grad_norm": 2.079684267073448, "language_loss": 0.76701093, "learning_rate": 2.691192582747237e-06, "loss": 0.78873909, "num_input_tokens_seen": 72909665, "step": 3377, "time_per_iteration": 2.7736666202545166 }, { "auxiliary_loss_clip": 0.01162506, "auxiliary_loss_mlp": 0.01060503, "balance_loss_clip": 1.05374312, "balance_loss_mlp": 1.03930736, "epoch": 0.40618048457884925, "flos": 23766759262080.0, "grad_norm": 1.9072040384987796, "language_loss": 0.74222678, "learning_rate": 2.6904615563599765e-06, "loss": 0.76445687, "num_input_tokens_seen": 72929465, "step": 3378, "time_per_iteration": 2.6874797344207764 }, { "auxiliary_loss_clip": 0.01115157, "auxiliary_loss_mlp": 0.01049434, "balance_loss_clip": 1.0459851, "balance_loss_mlp": 1.02956176, "epoch": 0.40630072746948837, "flos": 17639681120640.0, "grad_norm": 1.8022235264771675, "language_loss": 0.83344579, "learning_rate": 2.6897304252302477e-06, "loss": 0.85509163, "num_input_tokens_seen": 72946785, "step": 3379, "time_per_iteration": 2.6757192611694336 }, { "auxiliary_loss_clip": 0.01020724, "auxiliary_loss_mlp": 0.01001481, "balance_loss_clip": 1.01463425, "balance_loss_mlp": 0.99974078, "epoch": 0.4064209703601275, "flos": 60836053063680.0, "grad_norm": 0.7929528644262043, "language_loss": 0.54817879, "learning_rate": 2.688999189468962e-06, "loss": 0.56840086, "num_input_tokens_seen": 73003215, "step": 3380, "time_per_iteration": 3.188892364501953 }, { "auxiliary_loss_clip": 0.01145273, "auxiliary_loss_mlp": 0.0104682, "balance_loss_clip": 1.05331779, "balance_loss_mlp": 1.0270195, "epoch": 0.40654121325076653, "flos": 24024346669440.0, "grad_norm": 3.05238654323623, "language_loss": 0.75436878, "learning_rate": 2.6882678491870464e-06, "loss": 0.77628982, "num_input_tokens_seen": 73023650, "step": 3381, "time_per_iteration": 2.712892532348633 }, { "auxiliary_loss_clip": 0.01152892, "auxiliary_loss_mlp": 0.01051892, "balance_loss_clip": 1.05361593, "balance_loss_mlp": 1.03247273, "epoch": 0.40666145614140564, "flos": 27344252085120.0, "grad_norm": 2.463566797713845, "language_loss": 0.71494126, "learning_rate": 2.6875364044954453e-06, "loss": 0.73698914, "num_input_tokens_seen": 73043880, "step": 3382, "time_per_iteration": 2.7244718074798584 }, { "auxiliary_loss_clip": 0.0112886, "auxiliary_loss_mlp": 0.01048764, "balance_loss_clip": 1.04486942, "balance_loss_mlp": 1.029917, "epoch": 0.40678169903204475, "flos": 26176724415360.0, "grad_norm": 1.760585168775834, "language_loss": 0.82492292, "learning_rate": 2.6868048555051185e-06, "loss": 0.84669918, "num_input_tokens_seen": 73065410, "step": 3383, "time_per_iteration": 2.799596071243286 }, { "auxiliary_loss_clip": 0.01138348, "auxiliary_loss_mlp": 0.01046667, "balance_loss_clip": 1.04676318, "balance_loss_mlp": 1.02622294, "epoch": 0.4069019419226838, "flos": 28622420622720.0, "grad_norm": 2.4981185743188745, "language_loss": 0.85861045, "learning_rate": 2.686073202327041e-06, "loss": 0.88046062, "num_input_tokens_seen": 73084410, "step": 3384, "time_per_iteration": 2.8023056983947754 }, { "auxiliary_loss_clip": 0.01120914, "auxiliary_loss_mlp": 0.01063593, "balance_loss_clip": 1.04510808, "balance_loss_mlp": 1.04420924, "epoch": 0.4070221848133229, "flos": 25229006023680.0, "grad_norm": 1.7992527843010881, "language_loss": 0.73383838, "learning_rate": 2.6853414450722043e-06, "loss": 0.75568342, "num_input_tokens_seen": 73104075, "step": 3385, "time_per_iteration": 2.735177516937256 }, { "auxiliary_loss_clip": 0.01147372, "auxiliary_loss_mlp": 0.01044451, "balance_loss_clip": 1.05136251, "balance_loss_mlp": 1.02506757, "epoch": 0.40714242770396203, "flos": 18405224709120.0, "grad_norm": 1.9305065408615791, "language_loss": 0.85606557, "learning_rate": 2.684609583851616e-06, "loss": 0.87798381, "num_input_tokens_seen": 73122250, "step": 3386, "time_per_iteration": 2.6666905879974365 }, { "auxiliary_loss_clip": 0.01108861, "auxiliary_loss_mlp": 0.01050334, "balance_loss_clip": 1.04465342, "balance_loss_mlp": 1.02958, "epoch": 0.4072626705946011, "flos": 30228920403840.0, "grad_norm": 1.6583576519576575, "language_loss": 0.80636334, "learning_rate": 2.683877618776297e-06, "loss": 0.82795525, "num_input_tokens_seen": 73144505, "step": 3387, "time_per_iteration": 2.902320384979248 }, { "auxiliary_loss_clip": 0.01128081, "auxiliary_loss_mlp": 0.01061399, "balance_loss_clip": 1.04488873, "balance_loss_mlp": 1.03810585, "epoch": 0.4073829134852402, "flos": 21834549930240.0, "grad_norm": 3.4220018134899144, "language_loss": 0.74817306, "learning_rate": 2.6831455499572876e-06, "loss": 0.77006787, "num_input_tokens_seen": 73162440, "step": 3388, "time_per_iteration": 3.6939756870269775 }, { "auxiliary_loss_clip": 0.01159062, "auxiliary_loss_mlp": 0.01049458, "balance_loss_clip": 1.05105841, "balance_loss_mlp": 1.0280602, "epoch": 0.40750315637587925, "flos": 25260211964160.0, "grad_norm": 2.115297302452377, "language_loss": 0.77646589, "learning_rate": 2.682413377505641e-06, "loss": 0.79855108, "num_input_tokens_seen": 73181245, "step": 3389, "time_per_iteration": 2.6523232460021973 }, { "auxiliary_loss_clip": 0.0114765, "auxiliary_loss_mlp": 0.0105198, "balance_loss_clip": 1.04918122, "balance_loss_mlp": 1.03135633, "epoch": 0.40762339926651836, "flos": 19712767593600.0, "grad_norm": 2.226169731142562, "language_loss": 0.76487744, "learning_rate": 2.6816811015324284e-06, "loss": 0.78687376, "num_input_tokens_seen": 73199295, "step": 3390, "time_per_iteration": 2.734196662902832 }, { "auxiliary_loss_clip": 0.01053011, "auxiliary_loss_mlp": 0.01007071, "balance_loss_clip": 1.01857519, "balance_loss_mlp": 1.00512791, "epoch": 0.40774364215715747, "flos": 71449307314560.0, "grad_norm": 0.7256709088958165, "language_loss": 0.56686556, "learning_rate": 2.6809487221487343e-06, "loss": 0.58746636, "num_input_tokens_seen": 73258780, "step": 3391, "time_per_iteration": 4.081084966659546 }, { "auxiliary_loss_clip": 0.01139904, "auxiliary_loss_mlp": 0.01065109, "balance_loss_clip": 1.04823983, "balance_loss_mlp": 1.04418778, "epoch": 0.4078638850477965, "flos": 15084134144640.0, "grad_norm": 2.511200984623158, "language_loss": 0.82039911, "learning_rate": 2.6802162394656605e-06, "loss": 0.84244919, "num_input_tokens_seen": 73275490, "step": 3392, "time_per_iteration": 2.6062169075012207 }, { "auxiliary_loss_clip": 0.01128386, "auxiliary_loss_mlp": 0.01048892, "balance_loss_clip": 1.0442102, "balance_loss_mlp": 1.02791071, "epoch": 0.40798412793843564, "flos": 23842890138240.0, "grad_norm": 1.9864927142025508, "language_loss": 0.71819055, "learning_rate": 2.679483653594324e-06, "loss": 0.73996329, "num_input_tokens_seen": 73297260, "step": 3393, "time_per_iteration": 2.7100088596343994 }, { "auxiliary_loss_clip": 0.01151461, "auxiliary_loss_mlp": 0.01045018, "balance_loss_clip": 1.05228686, "balance_loss_mlp": 1.02517009, "epoch": 0.40810437082907475, "flos": 21065774117760.0, "grad_norm": 2.8640715302923243, "language_loss": 0.76461387, "learning_rate": 2.678750964645857e-06, "loss": 0.78657871, "num_input_tokens_seen": 73316340, "step": 3394, "time_per_iteration": 3.6158394813537598 }, { "auxiliary_loss_clip": 0.01147657, "auxiliary_loss_mlp": 0.01046538, "balance_loss_clip": 1.05217719, "balance_loss_mlp": 1.02573562, "epoch": 0.4082246137197138, "flos": 11321377948800.0, "grad_norm": 2.370936130073143, "language_loss": 0.8318854, "learning_rate": 2.6780181727314094e-06, "loss": 0.85382736, "num_input_tokens_seen": 73331245, "step": 3395, "time_per_iteration": 2.614896297454834 }, { "auxiliary_loss_clip": 0.0112392, "auxiliary_loss_mlp": 0.00776678, "balance_loss_clip": 1.04934895, "balance_loss_mlp": 1.00138223, "epoch": 0.4083448566103529, "flos": 19062569554560.0, "grad_norm": 2.024045941045503, "language_loss": 0.77688003, "learning_rate": 2.6772852779621435e-06, "loss": 0.79588604, "num_input_tokens_seen": 73349105, "step": 3396, "time_per_iteration": 2.7130205631256104 }, { "auxiliary_loss_clip": 0.01138952, "auxiliary_loss_mlp": 0.00775282, "balance_loss_clip": 1.05218768, "balance_loss_mlp": 1.00120187, "epoch": 0.408465099500992, "flos": 23550254035200.0, "grad_norm": 2.061743614387476, "language_loss": 0.86499333, "learning_rate": 2.676552280449239e-06, "loss": 0.88413566, "num_input_tokens_seen": 73368990, "step": 3397, "time_per_iteration": 2.68619966506958 }, { "auxiliary_loss_clip": 0.01137136, "auxiliary_loss_mlp": 0.01052166, "balance_loss_clip": 1.04795313, "balance_loss_mlp": 1.03197169, "epoch": 0.4085853423916311, "flos": 12750012558720.0, "grad_norm": 3.5407574638126356, "language_loss": 0.75798142, "learning_rate": 2.6758191803038917e-06, "loss": 0.77987444, "num_input_tokens_seen": 73387485, "step": 3398, "time_per_iteration": 3.462184429168701 }, { "auxiliary_loss_clip": 0.01088874, "auxiliary_loss_mlp": 0.0104713, "balance_loss_clip": 1.04516757, "balance_loss_mlp": 1.02550578, "epoch": 0.4087055852822702, "flos": 24353072962560.0, "grad_norm": 1.7102455367453957, "language_loss": 0.82503653, "learning_rate": 2.6750859776373125e-06, "loss": 0.84639657, "num_input_tokens_seen": 73406940, "step": 3399, "time_per_iteration": 2.828777551651001 }, { "auxiliary_loss_clip": 0.01010543, "auxiliary_loss_mlp": 0.01006759, "balance_loss_clip": 1.02142739, "balance_loss_mlp": 1.00423133, "epoch": 0.4088258281729093, "flos": 66387950720640.0, "grad_norm": 0.7704047831745816, "language_loss": 0.60369396, "learning_rate": 2.674352672560727e-06, "loss": 0.62386692, "num_input_tokens_seen": 73468385, "step": 3400, "time_per_iteration": 3.3512630462646484 }, { "auxiliary_loss_clip": 0.01122061, "auxiliary_loss_mlp": 0.01060371, "balance_loss_clip": 1.04827845, "balance_loss_mlp": 1.03929532, "epoch": 0.40894607106354836, "flos": 20449260057600.0, "grad_norm": 1.6966525320044674, "language_loss": 0.77423167, "learning_rate": 2.673619265185377e-06, "loss": 0.79605603, "num_input_tokens_seen": 73488225, "step": 3401, "time_per_iteration": 2.7290849685668945 }, { "auxiliary_loss_clip": 0.01148142, "auxiliary_loss_mlp": 0.01048305, "balance_loss_clip": 1.05015635, "balance_loss_mlp": 1.02637053, "epoch": 0.40906631395418747, "flos": 27053627143680.0, "grad_norm": 1.6706718312099609, "language_loss": 0.78056657, "learning_rate": 2.672885755622521e-06, "loss": 0.802531, "num_input_tokens_seen": 73510640, "step": 3402, "time_per_iteration": 2.717770576477051 }, { "auxiliary_loss_clip": 0.01102474, "auxiliary_loss_mlp": 0.01044315, "balance_loss_clip": 1.0412879, "balance_loss_mlp": 1.02423966, "epoch": 0.4091865568448266, "flos": 25484151306240.0, "grad_norm": 6.234860509978173, "language_loss": 0.7037468, "learning_rate": 2.67215214398343e-06, "loss": 0.72521466, "num_input_tokens_seen": 73530655, "step": 3403, "time_per_iteration": 2.801133394241333 }, { "auxiliary_loss_clip": 0.01114825, "auxiliary_loss_mlp": 0.01047564, "balance_loss_clip": 1.04667926, "balance_loss_mlp": 1.02732205, "epoch": 0.40930679973546563, "flos": 28657864368000.0, "grad_norm": 4.758647291824505, "language_loss": 0.77882373, "learning_rate": 2.671418430379393e-06, "loss": 0.80044758, "num_input_tokens_seen": 73549340, "step": 3404, "time_per_iteration": 2.8137998580932617 }, { "auxiliary_loss_clip": 0.01158237, "auxiliary_loss_mlp": 0.01042469, "balance_loss_clip": 1.04957604, "balance_loss_mlp": 1.02310991, "epoch": 0.40942704262610474, "flos": 20886292834560.0, "grad_norm": 2.119531344574513, "language_loss": 0.83248079, "learning_rate": 2.670684614921715e-06, "loss": 0.8544879, "num_input_tokens_seen": 73568315, "step": 3405, "time_per_iteration": 2.698499917984009 }, { "auxiliary_loss_clip": 0.01139469, "auxiliary_loss_mlp": 0.01051272, "balance_loss_clip": 1.05137396, "balance_loss_mlp": 1.03141129, "epoch": 0.4095472855167438, "flos": 21618080616960.0, "grad_norm": 3.0260089976462132, "language_loss": 0.68985653, "learning_rate": 2.6699506977217128e-06, "loss": 0.71176398, "num_input_tokens_seen": 73588490, "step": 3406, "time_per_iteration": 2.6655631065368652 }, { "auxiliary_loss_clip": 0.01142455, "auxiliary_loss_mlp": 0.01049961, "balance_loss_clip": 1.04923093, "balance_loss_mlp": 1.03019619, "epoch": 0.4096675284073829, "flos": 27926112499200.0, "grad_norm": 2.1304093363675167, "language_loss": 0.70311695, "learning_rate": 2.6692166788907233e-06, "loss": 0.72504115, "num_input_tokens_seen": 73608685, "step": 3407, "time_per_iteration": 2.749342441558838 }, { "auxiliary_loss_clip": 0.01136492, "auxiliary_loss_mlp": 0.01050271, "balance_loss_clip": 1.04877853, "balance_loss_mlp": 1.03023171, "epoch": 0.409787771298022, "flos": 19206607092480.0, "grad_norm": 2.4767249599049697, "language_loss": 0.77078009, "learning_rate": 2.6684825585400957e-06, "loss": 0.79264778, "num_input_tokens_seen": 73627630, "step": 3408, "time_per_iteration": 2.6836724281311035 }, { "auxiliary_loss_clip": 0.01033065, "auxiliary_loss_mlp": 0.01006465, "balance_loss_clip": 1.01884162, "balance_loss_mlp": 1.00456929, "epoch": 0.4099080141886611, "flos": 59269234832640.0, "grad_norm": 0.8177595538556762, "language_loss": 0.65131462, "learning_rate": 2.6677483367811947e-06, "loss": 0.67171001, "num_input_tokens_seen": 73687670, "step": 3409, "time_per_iteration": 3.3841211795806885 }, { "auxiliary_loss_clip": 0.01150491, "auxiliary_loss_mlp": 0.01043921, "balance_loss_clip": 1.05042815, "balance_loss_mlp": 1.02462101, "epoch": 0.4100282570793002, "flos": 21906443001600.0, "grad_norm": 2.4759516979392004, "language_loss": 0.75241446, "learning_rate": 2.6670140137254028e-06, "loss": 0.77435863, "num_input_tokens_seen": 73707145, "step": 3410, "time_per_iteration": 2.6810362339019775 }, { "auxiliary_loss_clip": 0.01102833, "auxiliary_loss_mlp": 0.01046097, "balance_loss_clip": 1.04443598, "balance_loss_mlp": 1.02533042, "epoch": 0.4101484999699393, "flos": 18551596631040.0, "grad_norm": 2.5464386596624493, "language_loss": 0.89774567, "learning_rate": 2.666279589484115e-06, "loss": 0.91923499, "num_input_tokens_seen": 73725045, "step": 3411, "time_per_iteration": 2.7527804374694824 }, { "auxiliary_loss_clip": 0.01101153, "auxiliary_loss_mlp": 0.01053677, "balance_loss_clip": 1.04165971, "balance_loss_mlp": 1.03424644, "epoch": 0.41026874286057835, "flos": 19094529680640.0, "grad_norm": 1.9902122638621755, "language_loss": 0.81262875, "learning_rate": 2.6655450641687435e-06, "loss": 0.83417702, "num_input_tokens_seen": 73742610, "step": 3412, "time_per_iteration": 2.8118197917938232 }, { "auxiliary_loss_clip": 0.01157516, "auxiliary_loss_mlp": 0.0104931, "balance_loss_clip": 1.05166054, "balance_loss_mlp": 1.0305469, "epoch": 0.41038898575121746, "flos": 31209568588800.0, "grad_norm": 1.9607242297412193, "language_loss": 0.69207531, "learning_rate": 2.664810437890715e-06, "loss": 0.71414357, "num_input_tokens_seen": 73764280, "step": 3413, "time_per_iteration": 2.6685612201690674 }, { "auxiliary_loss_clip": 0.01089486, "auxiliary_loss_mlp": 0.01048895, "balance_loss_clip": 1.04652631, "balance_loss_mlp": 1.03032172, "epoch": 0.41050922864185657, "flos": 14355865895040.0, "grad_norm": 2.307706645176297, "language_loss": 0.79322231, "learning_rate": 2.6640757107614714e-06, "loss": 0.81460613, "num_input_tokens_seen": 73782375, "step": 3414, "time_per_iteration": 2.82629132270813 }, { "auxiliary_loss_clip": 0.01114383, "auxiliary_loss_mlp": 0.01046253, "balance_loss_clip": 1.04627705, "balance_loss_mlp": 1.02506971, "epoch": 0.4106294715324956, "flos": 30956290813440.0, "grad_norm": 2.378618719393673, "language_loss": 0.69639897, "learning_rate": 2.6633408828924697e-06, "loss": 0.7180053, "num_input_tokens_seen": 73801240, "step": 3415, "time_per_iteration": 3.8442203998565674 }, { "auxiliary_loss_clip": 0.01125291, "auxiliary_loss_mlp": 0.01050376, "balance_loss_clip": 1.04860401, "balance_loss_mlp": 1.03208899, "epoch": 0.41074971442313474, "flos": 24457321209600.0, "grad_norm": 1.487126939763218, "language_loss": 0.70106423, "learning_rate": 2.662605954395185e-06, "loss": 0.72282088, "num_input_tokens_seen": 73821200, "step": 3416, "time_per_iteration": 2.7914559841156006 }, { "auxiliary_loss_clip": 0.01145556, "auxiliary_loss_mlp": 0.01046614, "balance_loss_clip": 1.04822862, "balance_loss_mlp": 1.02831554, "epoch": 0.41086995731377385, "flos": 21542991235200.0, "grad_norm": 2.210744555213168, "language_loss": 0.84010875, "learning_rate": 2.6618709253811027e-06, "loss": 0.86203039, "num_input_tokens_seen": 73840655, "step": 3417, "time_per_iteration": 3.6878421306610107 }, { "auxiliary_loss_clip": 0.01151731, "auxiliary_loss_mlp": 0.0103532, "balance_loss_clip": 1.05004287, "balance_loss_mlp": 1.01832104, "epoch": 0.4109902002044129, "flos": 20702753314560.0, "grad_norm": 1.6862982563688846, "language_loss": 0.87689793, "learning_rate": 2.6611357959617277e-06, "loss": 0.89876842, "num_input_tokens_seen": 73860275, "step": 3418, "time_per_iteration": 2.6175780296325684 }, { "auxiliary_loss_clip": 0.01111239, "auxiliary_loss_mlp": 0.01051332, "balance_loss_clip": 1.04527223, "balance_loss_mlp": 1.03192484, "epoch": 0.411110443095052, "flos": 18179992477440.0, "grad_norm": 1.8857957937232082, "language_loss": 0.91356635, "learning_rate": 2.660400566248578e-06, "loss": 0.93519205, "num_input_tokens_seen": 73878400, "step": 3419, "time_per_iteration": 2.7770371437072754 }, { "auxiliary_loss_clip": 0.01117955, "auxiliary_loss_mlp": 0.01051444, "balance_loss_clip": 1.04580832, "balance_loss_mlp": 1.03127337, "epoch": 0.41123068598569107, "flos": 14575244209920.0, "grad_norm": 3.050056035654246, "language_loss": 0.6739943, "learning_rate": 2.6596652363531876e-06, "loss": 0.69568837, "num_input_tokens_seen": 73894275, "step": 3420, "time_per_iteration": 3.8800740242004395 }, { "auxiliary_loss_clip": 0.01155352, "auxiliary_loss_mlp": 0.01052324, "balance_loss_clip": 1.05142355, "balance_loss_mlp": 1.03298807, "epoch": 0.4113509288763302, "flos": 21177995184000.0, "grad_norm": 1.5696906474143282, "language_loss": 0.78300035, "learning_rate": 2.6589298063871055e-06, "loss": 0.80507708, "num_input_tokens_seen": 73914450, "step": 3421, "time_per_iteration": 2.722519874572754 }, { "auxiliary_loss_clip": 0.01158335, "auxiliary_loss_mlp": 0.01051944, "balance_loss_clip": 1.05309606, "balance_loss_mlp": 1.03257263, "epoch": 0.4114711717669693, "flos": 18442212739200.0, "grad_norm": 2.068526590676842, "language_loss": 0.69742006, "learning_rate": 2.658194276461895e-06, "loss": 0.71952283, "num_input_tokens_seen": 73932375, "step": 3422, "time_per_iteration": 2.6691086292266846 }, { "auxiliary_loss_clip": 0.0113066, "auxiliary_loss_mlp": 0.0106478, "balance_loss_clip": 1.04422593, "balance_loss_mlp": 1.04031813, "epoch": 0.41159141465760835, "flos": 27233395735680.0, "grad_norm": 2.8570442398962466, "language_loss": 0.67203951, "learning_rate": 2.6574586466891368e-06, "loss": 0.69399393, "num_input_tokens_seen": 73952850, "step": 3423, "time_per_iteration": 2.7631523609161377 }, { "auxiliary_loss_clip": 0.01130208, "auxiliary_loss_mlp": 0.00774588, "balance_loss_clip": 1.04810286, "balance_loss_mlp": 1.00106978, "epoch": 0.41171165754824746, "flos": 20006876154240.0, "grad_norm": 2.634716397834083, "language_loss": 0.64970648, "learning_rate": 2.6567229171804247e-06, "loss": 0.6687544, "num_input_tokens_seen": 73970735, "step": 3424, "time_per_iteration": 3.630786180496216 }, { "auxiliary_loss_clip": 0.01130076, "auxiliary_loss_mlp": 0.01046499, "balance_loss_clip": 1.04651523, "balance_loss_mlp": 1.02574444, "epoch": 0.41183190043888657, "flos": 18004318035840.0, "grad_norm": 3.104992056271375, "language_loss": 0.87944353, "learning_rate": 2.655987088047368e-06, "loss": 0.90120929, "num_input_tokens_seen": 73989080, "step": 3425, "time_per_iteration": 2.756964921951294 }, { "auxiliary_loss_clip": 0.01129556, "auxiliary_loss_mlp": 0.01049062, "balance_loss_clip": 1.04867387, "balance_loss_mlp": 1.02867699, "epoch": 0.4119521433295256, "flos": 27163370171520.0, "grad_norm": 2.2363849511799323, "language_loss": 0.78561008, "learning_rate": 2.6552511594015912e-06, "loss": 0.80739623, "num_input_tokens_seen": 74009470, "step": 3426, "time_per_iteration": 2.7424213886260986 }, { "auxiliary_loss_clip": 0.01130156, "auxiliary_loss_mlp": 0.01065938, "balance_loss_clip": 1.04361856, "balance_loss_mlp": 1.04397941, "epoch": 0.41207238622016473, "flos": 15122020014720.0, "grad_norm": 2.423600244696786, "language_loss": 0.85300779, "learning_rate": 2.654515131354735e-06, "loss": 0.87496877, "num_input_tokens_seen": 74027735, "step": 3427, "time_per_iteration": 2.733566999435425 }, { "auxiliary_loss_clip": 0.01124934, "auxiliary_loss_mlp": 0.01051757, "balance_loss_clip": 1.05056608, "balance_loss_mlp": 1.03167069, "epoch": 0.41219262911080384, "flos": 27052872958080.0, "grad_norm": 1.9760718907932215, "language_loss": 0.85197735, "learning_rate": 2.653779004018453e-06, "loss": 0.87374425, "num_input_tokens_seen": 74048300, "step": 3428, "time_per_iteration": 2.745460271835327 }, { "auxiliary_loss_clip": 0.01124317, "auxiliary_loss_mlp": 0.01051688, "balance_loss_clip": 1.04755688, "balance_loss_mlp": 1.03257847, "epoch": 0.4123128720014429, "flos": 24686360282880.0, "grad_norm": 1.9782793209409335, "language_loss": 0.82278973, "learning_rate": 2.653042777504417e-06, "loss": 0.84454972, "num_input_tokens_seen": 74070890, "step": 3429, "time_per_iteration": 2.8237459659576416 }, { "auxiliary_loss_clip": 0.01139056, "auxiliary_loss_mlp": 0.01052414, "balance_loss_clip": 1.04834008, "balance_loss_mlp": 1.03042006, "epoch": 0.412433114892082, "flos": 26244774731520.0, "grad_norm": 1.9670474214949984, "language_loss": 0.80244267, "learning_rate": 2.6523064519243105e-06, "loss": 0.82435739, "num_input_tokens_seen": 74090460, "step": 3430, "time_per_iteration": 2.71931791305542 }, { "auxiliary_loss_clip": 0.01146333, "auxiliary_loss_mlp": 0.0105182, "balance_loss_clip": 1.05033958, "balance_loss_mlp": 1.02984977, "epoch": 0.4125533577827211, "flos": 21361031913600.0, "grad_norm": 5.091708513849572, "language_loss": 0.78978312, "learning_rate": 2.6515700273898333e-06, "loss": 0.8117646, "num_input_tokens_seen": 74108335, "step": 3431, "time_per_iteration": 2.7059786319732666 }, { "auxiliary_loss_clip": 0.01123257, "auxiliary_loss_mlp": 0.01056337, "balance_loss_clip": 1.05035424, "balance_loss_mlp": 1.03685856, "epoch": 0.4126736006733602, "flos": 26067556005120.0, "grad_norm": 2.2804009287660487, "language_loss": 0.69062889, "learning_rate": 2.6508335040127018e-06, "loss": 0.71242476, "num_input_tokens_seen": 74128030, "step": 3432, "time_per_iteration": 2.717104911804199 }, { "auxiliary_loss_clip": 0.01150437, "auxiliary_loss_mlp": 0.01051061, "balance_loss_clip": 1.05340958, "balance_loss_mlp": 1.03092647, "epoch": 0.4127938435639993, "flos": 25666146541440.0, "grad_norm": 1.7633204822788575, "language_loss": 0.77169812, "learning_rate": 2.6500968819046446e-06, "loss": 0.79371309, "num_input_tokens_seen": 74148330, "step": 3433, "time_per_iteration": 2.7604875564575195 }, { "auxiliary_loss_clip": 0.01104044, "auxiliary_loss_mlp": 0.01054795, "balance_loss_clip": 1.04116869, "balance_loss_mlp": 1.03426719, "epoch": 0.4129140864546384, "flos": 17995914253440.0, "grad_norm": 2.730163150395288, "language_loss": 0.58928752, "learning_rate": 2.649360161177408e-06, "loss": 0.61087596, "num_input_tokens_seen": 74163390, "step": 3434, "time_per_iteration": 2.691239595413208 }, { "auxiliary_loss_clip": 0.01151671, "auxiliary_loss_mlp": 0.01055447, "balance_loss_clip": 1.05105948, "balance_loss_mlp": 1.03513408, "epoch": 0.41303432934527745, "flos": 23732895715200.0, "grad_norm": 2.225500304721389, "language_loss": 0.73583037, "learning_rate": 2.6486233419427504e-06, "loss": 0.75790155, "num_input_tokens_seen": 74183205, "step": 3435, "time_per_iteration": 2.744706392288208 }, { "auxiliary_loss_clip": 0.01111607, "auxiliary_loss_mlp": 0.01047569, "balance_loss_clip": 1.04586506, "balance_loss_mlp": 1.02705336, "epoch": 0.41315457223591656, "flos": 19755286318080.0, "grad_norm": 2.650060986820277, "language_loss": 0.75223869, "learning_rate": 2.6478864243124484e-06, "loss": 0.77383041, "num_input_tokens_seen": 74202870, "step": 3436, "time_per_iteration": 2.8158111572265625 }, { "auxiliary_loss_clip": 0.01146511, "auxiliary_loss_mlp": 0.01045878, "balance_loss_clip": 1.05058122, "balance_loss_mlp": 1.02762735, "epoch": 0.4132748151265556, "flos": 20923316778240.0, "grad_norm": 1.9247523261824357, "language_loss": 0.85293943, "learning_rate": 2.6471494083982903e-06, "loss": 0.87486327, "num_input_tokens_seen": 74222255, "step": 3437, "time_per_iteration": 2.618053674697876 }, { "auxiliary_loss_clip": 0.01124331, "auxiliary_loss_mlp": 0.01046411, "balance_loss_clip": 1.04670978, "balance_loss_mlp": 1.02575171, "epoch": 0.4133950580171947, "flos": 32232520016640.0, "grad_norm": 1.7802701298703487, "language_loss": 0.74822247, "learning_rate": 2.6464122943120818e-06, "loss": 0.76992983, "num_input_tokens_seen": 74242480, "step": 3438, "time_per_iteration": 2.844987392425537 }, { "auxiliary_loss_clip": 0.01121912, "auxiliary_loss_mlp": 0.01052153, "balance_loss_clip": 1.04828274, "balance_loss_mlp": 1.0325911, "epoch": 0.41351530090783384, "flos": 23292487059840.0, "grad_norm": 3.7864298147686446, "language_loss": 0.82191467, "learning_rate": 2.645675082165642e-06, "loss": 0.84365535, "num_input_tokens_seen": 74258690, "step": 3439, "time_per_iteration": 2.811208486557007 }, { "auxiliary_loss_clip": 0.011386, "auxiliary_loss_mlp": 0.01050823, "balance_loss_clip": 1.0507791, "balance_loss_mlp": 1.0303545, "epoch": 0.4136355437984729, "flos": 25593571111680.0, "grad_norm": 2.1750967868446986, "language_loss": 0.75720286, "learning_rate": 2.644937772070806e-06, "loss": 0.77909708, "num_input_tokens_seen": 74277135, "step": 3440, "time_per_iteration": 3.820018768310547 }, { "auxiliary_loss_clip": 0.0115555, "auxiliary_loss_mlp": 0.01044355, "balance_loss_clip": 1.05185556, "balance_loss_mlp": 1.02726018, "epoch": 0.413755786689112, "flos": 19828615933440.0, "grad_norm": 2.502009796851782, "language_loss": 0.83788937, "learning_rate": 2.6442003641394225e-06, "loss": 0.85988843, "num_input_tokens_seen": 74294730, "step": 3441, "time_per_iteration": 2.7029366493225098 }, { "auxiliary_loss_clip": 0.01125643, "auxiliary_loss_mlp": 0.01048435, "balance_loss_clip": 1.04652023, "balance_loss_mlp": 1.02821755, "epoch": 0.4138760295797511, "flos": 26870446759680.0, "grad_norm": 1.869593764975162, "language_loss": 0.83738112, "learning_rate": 2.643462858483356e-06, "loss": 0.85912192, "num_input_tokens_seen": 74315015, "step": 3442, "time_per_iteration": 2.794611930847168 }, { "auxiliary_loss_clip": 0.01103385, "auxiliary_loss_mlp": 0.01062367, "balance_loss_clip": 1.04622412, "balance_loss_mlp": 1.03951454, "epoch": 0.41399627247039017, "flos": 16399254798720.0, "grad_norm": 1.7287270320389707, "language_loss": 0.72507048, "learning_rate": 2.6427252552144856e-06, "loss": 0.74672794, "num_input_tokens_seen": 74333665, "step": 3443, "time_per_iteration": 3.815906047821045 }, { "auxiliary_loss_clip": 0.01156743, "auxiliary_loss_mlp": 0.01045228, "balance_loss_clip": 1.05222893, "balance_loss_mlp": 1.02462852, "epoch": 0.4141165153610293, "flos": 22930220442240.0, "grad_norm": 2.568857561226109, "language_loss": 0.7492792, "learning_rate": 2.6419875544447044e-06, "loss": 0.77129889, "num_input_tokens_seen": 74355065, "step": 3444, "time_per_iteration": 2.780832529067993 }, { "auxiliary_loss_clip": 0.01158476, "auxiliary_loss_mlp": 0.0104355, "balance_loss_clip": 1.05125451, "balance_loss_mlp": 1.024822, "epoch": 0.4142367582516684, "flos": 25192556697600.0, "grad_norm": 1.6479906564137428, "language_loss": 0.71760869, "learning_rate": 2.6412497562859218e-06, "loss": 0.73962891, "num_input_tokens_seen": 74376345, "step": 3445, "time_per_iteration": 2.709321975708008 }, { "auxiliary_loss_clip": 0.01146407, "auxiliary_loss_mlp": 0.01049292, "balance_loss_clip": 1.0483911, "balance_loss_mlp": 1.02936006, "epoch": 0.41435700114230745, "flos": 21690476478720.0, "grad_norm": 2.57363230944906, "language_loss": 0.76028597, "learning_rate": 2.6405118608500617e-06, "loss": 0.78224301, "num_input_tokens_seen": 74395170, "step": 3446, "time_per_iteration": 3.664015769958496 }, { "auxiliary_loss_clip": 0.01113069, "auxiliary_loss_mlp": 0.01055405, "balance_loss_clip": 1.05026579, "balance_loss_mlp": 1.03627157, "epoch": 0.41447724403294656, "flos": 25995160143360.0, "grad_norm": 1.7233768245836854, "language_loss": 0.81835103, "learning_rate": 2.6397738682490613e-06, "loss": 0.84003574, "num_input_tokens_seen": 74416070, "step": 3447, "time_per_iteration": 2.8839800357818604 }, { "auxiliary_loss_clip": 0.0115821, "auxiliary_loss_mlp": 0.01042333, "balance_loss_clip": 1.05208182, "balance_loss_mlp": 1.02412975, "epoch": 0.41459748692358567, "flos": 18259678800000.0, "grad_norm": 1.7192961371563427, "language_loss": 0.75583947, "learning_rate": 2.6390357785948734e-06, "loss": 0.77784491, "num_input_tokens_seen": 74433185, "step": 3448, "time_per_iteration": 2.676725149154663 }, { "auxiliary_loss_clip": 0.01147354, "auxiliary_loss_mlp": 0.01048465, "balance_loss_clip": 1.05150068, "balance_loss_mlp": 1.02726924, "epoch": 0.4147177298142247, "flos": 24168456034560.0, "grad_norm": 2.1713569754410424, "language_loss": 0.80481589, "learning_rate": 2.6382975919994667e-06, "loss": 0.826774, "num_input_tokens_seen": 74453760, "step": 3449, "time_per_iteration": 2.6945090293884277 }, { "auxiliary_loss_clip": 0.01131, "auxiliary_loss_mlp": 0.0103842, "balance_loss_clip": 1.04776061, "balance_loss_mlp": 1.02186179, "epoch": 0.41483797270486383, "flos": 20084659056000.0, "grad_norm": 1.645369467530342, "language_loss": 0.72770655, "learning_rate": 2.637559308574822e-06, "loss": 0.74940073, "num_input_tokens_seen": 74473505, "step": 3450, "time_per_iteration": 3.6324493885040283 }, { "auxiliary_loss_clip": 0.01159112, "auxiliary_loss_mlp": 0.01055617, "balance_loss_clip": 1.05220854, "balance_loss_mlp": 1.03655565, "epoch": 0.4149582155955029, "flos": 30081040110720.0, "grad_norm": 2.0159181096707623, "language_loss": 0.71537745, "learning_rate": 2.6368209284329376e-06, "loss": 0.73752475, "num_input_tokens_seen": 74494135, "step": 3451, "time_per_iteration": 2.74117112159729 }, { "auxiliary_loss_clip": 0.01141804, "auxiliary_loss_mlp": 0.01048607, "balance_loss_clip": 1.047791, "balance_loss_mlp": 1.02898502, "epoch": 0.415078458486142, "flos": 16764394504320.0, "grad_norm": 2.4964367874429816, "language_loss": 0.75512105, "learning_rate": 2.636082451685825e-06, "loss": 0.7770251, "num_input_tokens_seen": 74512335, "step": 3452, "time_per_iteration": 2.6177287101745605 }, { "auxiliary_loss_clip": 0.011346, "auxiliary_loss_mlp": 0.01049167, "balance_loss_clip": 1.0498234, "balance_loss_mlp": 1.02999806, "epoch": 0.4151987013767811, "flos": 26033692458240.0, "grad_norm": 1.6486252684809553, "language_loss": 0.86586046, "learning_rate": 2.6353438784455094e-06, "loss": 0.88769817, "num_input_tokens_seen": 74535620, "step": 3453, "time_per_iteration": 2.8586485385894775 }, { "auxiliary_loss_clip": 0.01129934, "auxiliary_loss_mlp": 0.01047627, "balance_loss_clip": 1.04877114, "balance_loss_mlp": 1.02652669, "epoch": 0.41531894426742016, "flos": 24608002763520.0, "grad_norm": 2.583303495380104, "language_loss": 0.71556574, "learning_rate": 2.6346052088240326e-06, "loss": 0.73734134, "num_input_tokens_seen": 74555140, "step": 3454, "time_per_iteration": 2.7413744926452637 }, { "auxiliary_loss_clip": 0.01135343, "auxiliary_loss_mlp": 0.0104507, "balance_loss_clip": 1.04955554, "balance_loss_mlp": 1.02545977, "epoch": 0.4154391871580593, "flos": 14975791747200.0, "grad_norm": 2.2892945554047843, "language_loss": 0.7699368, "learning_rate": 2.63386644293345e-06, "loss": 0.79174089, "num_input_tokens_seen": 74571485, "step": 3455, "time_per_iteration": 2.7253870964050293 }, { "auxiliary_loss_clip": 0.01113075, "auxiliary_loss_mlp": 0.01051306, "balance_loss_clip": 1.04337525, "balance_loss_mlp": 1.03152943, "epoch": 0.4155594300486984, "flos": 14647173194880.0, "grad_norm": 2.4242986428794024, "language_loss": 0.83277977, "learning_rate": 2.633127580885833e-06, "loss": 0.85442358, "num_input_tokens_seen": 74585985, "step": 3456, "time_per_iteration": 2.701796293258667 }, { "auxiliary_loss_clip": 0.01155517, "auxiliary_loss_mlp": 0.01049994, "balance_loss_clip": 1.05295253, "balance_loss_mlp": 1.03164732, "epoch": 0.41567967293933744, "flos": 29497276275840.0, "grad_norm": 2.1725719506794787, "language_loss": 0.65310639, "learning_rate": 2.632388622793265e-06, "loss": 0.67516148, "num_input_tokens_seen": 74605140, "step": 3457, "time_per_iteration": 2.7249975204467773 }, { "auxiliary_loss_clip": 0.01144599, "auxiliary_loss_mlp": 0.01051554, "balance_loss_clip": 1.05054784, "balance_loss_mlp": 1.03218305, "epoch": 0.41579991582997655, "flos": 19238387650560.0, "grad_norm": 4.401138781159467, "language_loss": 0.68084329, "learning_rate": 2.6316495687678457e-06, "loss": 0.7028048, "num_input_tokens_seen": 74623790, "step": 3458, "time_per_iteration": 2.6830530166625977 }, { "auxiliary_loss_clip": 0.01100799, "auxiliary_loss_mlp": 0.0105359, "balance_loss_clip": 1.04405618, "balance_loss_mlp": 1.03225148, "epoch": 0.41592015872061566, "flos": 24462061804800.0, "grad_norm": 2.5156131889002666, "language_loss": 0.76365077, "learning_rate": 2.6309104189216887e-06, "loss": 0.78519475, "num_input_tokens_seen": 74641355, "step": 3459, "time_per_iteration": 2.8296942710876465 }, { "auxiliary_loss_clip": 0.01106476, "auxiliary_loss_mlp": 0.0077698, "balance_loss_clip": 1.04351819, "balance_loss_mlp": 1.0008893, "epoch": 0.4160404016112547, "flos": 20775651966720.0, "grad_norm": 2.6827346445153495, "language_loss": 0.75146914, "learning_rate": 2.630171173366923e-06, "loss": 0.77030367, "num_input_tokens_seen": 74657155, "step": 3460, "time_per_iteration": 2.790969133377075 }, { "auxiliary_loss_clip": 0.01104453, "auxiliary_loss_mlp": 0.01050974, "balance_loss_clip": 1.04637384, "balance_loss_mlp": 1.03130484, "epoch": 0.41616064450189383, "flos": 13916462820480.0, "grad_norm": 3.440898270506517, "language_loss": 0.74216008, "learning_rate": 2.629431832215691e-06, "loss": 0.76371431, "num_input_tokens_seen": 74671960, "step": 3461, "time_per_iteration": 2.801380157470703 }, { "auxiliary_loss_clip": 0.01129997, "auxiliary_loss_mlp": 0.01041467, "balance_loss_clip": 1.04825342, "balance_loss_mlp": 1.02238202, "epoch": 0.41628088739253294, "flos": 20010826650240.0, "grad_norm": 1.8055734870682196, "language_loss": 0.8723048, "learning_rate": 2.628692395580151e-06, "loss": 0.89401948, "num_input_tokens_seen": 74692050, "step": 3462, "time_per_iteration": 2.7739462852478027 }, { "auxiliary_loss_clip": 0.01074409, "auxiliary_loss_mlp": 0.01054697, "balance_loss_clip": 1.04165423, "balance_loss_mlp": 1.03259611, "epoch": 0.416401130283172, "flos": 29168801377920.0, "grad_norm": 1.7841882908382913, "language_loss": 0.79132104, "learning_rate": 2.6279528635724747e-06, "loss": 0.81261206, "num_input_tokens_seen": 74712205, "step": 3463, "time_per_iteration": 2.841029167175293 }, { "auxiliary_loss_clip": 0.01147023, "auxiliary_loss_mlp": 0.01045617, "balance_loss_clip": 1.05067754, "balance_loss_mlp": 1.02446961, "epoch": 0.4165213731738111, "flos": 16246813478400.0, "grad_norm": 5.540875872794686, "language_loss": 0.78664315, "learning_rate": 2.627213236304848e-06, "loss": 0.80856955, "num_input_tokens_seen": 74729005, "step": 3464, "time_per_iteration": 2.7418723106384277 }, { "auxiliary_loss_clip": 0.01145294, "auxiliary_loss_mlp": 0.01047764, "balance_loss_clip": 1.05056334, "balance_loss_mlp": 1.02982259, "epoch": 0.4166416160644502, "flos": 33765438787200.0, "grad_norm": 4.0063539398508405, "language_loss": 0.70919263, "learning_rate": 2.626473513889472e-06, "loss": 0.73112321, "num_input_tokens_seen": 74751385, "step": 3465, "time_per_iteration": 2.758971929550171 }, { "auxiliary_loss_clip": 0.01137704, "auxiliary_loss_mlp": 0.0105025, "balance_loss_clip": 1.05106783, "balance_loss_mlp": 1.03037763, "epoch": 0.41676185895508927, "flos": 20917498775040.0, "grad_norm": 2.3378964694440967, "language_loss": 0.83068734, "learning_rate": 2.625733696438562e-06, "loss": 0.8525669, "num_input_tokens_seen": 74768890, "step": 3466, "time_per_iteration": 3.606262445449829 }, { "auxiliary_loss_clip": 0.01131414, "auxiliary_loss_mlp": 0.01053827, "balance_loss_clip": 1.05218673, "balance_loss_mlp": 1.03451467, "epoch": 0.4168821018457284, "flos": 18406122549120.0, "grad_norm": 1.985642102602639, "language_loss": 0.75163603, "learning_rate": 2.6249937840643476e-06, "loss": 0.77348852, "num_input_tokens_seen": 74787195, "step": 3467, "time_per_iteration": 2.790701389312744 }, { "auxiliary_loss_clip": 0.01158496, "auxiliary_loss_mlp": 0.00775062, "balance_loss_clip": 1.05447793, "balance_loss_mlp": 1.00090706, "epoch": 0.41700234473636744, "flos": 18698399516160.0, "grad_norm": 1.7110188943194686, "language_loss": 0.66979277, "learning_rate": 2.6242537768790733e-06, "loss": 0.6891284, "num_input_tokens_seen": 74806350, "step": 3468, "time_per_iteration": 2.630326986312866 }, { "auxiliary_loss_clip": 0.01146115, "auxiliary_loss_mlp": 0.01053207, "balance_loss_clip": 1.05138123, "balance_loss_mlp": 1.03219056, "epoch": 0.41712258762700655, "flos": 31033283616000.0, "grad_norm": 1.815995843887616, "language_loss": 0.68866253, "learning_rate": 2.6235136749949975e-06, "loss": 0.71065575, "num_input_tokens_seen": 74829800, "step": 3469, "time_per_iteration": 3.7582058906555176 }, { "auxiliary_loss_clip": 0.01160805, "auxiliary_loss_mlp": 0.01047484, "balance_loss_clip": 1.05354095, "balance_loss_mlp": 1.02861345, "epoch": 0.41724283051764566, "flos": 35914763877120.0, "grad_norm": 2.0338393841830302, "language_loss": 0.615569, "learning_rate": 2.6227734785243924e-06, "loss": 0.63765192, "num_input_tokens_seen": 74849760, "step": 3470, "time_per_iteration": 2.737337589263916 }, { "auxiliary_loss_clip": 0.01087307, "auxiliary_loss_mlp": 0.01043374, "balance_loss_clip": 1.04262829, "balance_loss_mlp": 1.02441978, "epoch": 0.4173630734082847, "flos": 25333649320320.0, "grad_norm": 2.635536039799712, "language_loss": 0.79179132, "learning_rate": 2.6220331875795466e-06, "loss": 0.81309819, "num_input_tokens_seen": 74869110, "step": 3471, "time_per_iteration": 2.8750522136688232 }, { "auxiliary_loss_clip": 0.0113791, "auxiliary_loss_mlp": 0.01065096, "balance_loss_clip": 1.04916191, "balance_loss_mlp": 1.04318547, "epoch": 0.4174833162989238, "flos": 26685398868480.0, "grad_norm": 1.7183890951877092, "language_loss": 0.7493813, "learning_rate": 2.62129280227276e-06, "loss": 0.7714113, "num_input_tokens_seen": 74889110, "step": 3472, "time_per_iteration": 3.647174119949341 }, { "auxiliary_loss_clip": 0.01150284, "auxiliary_loss_mlp": 0.01049288, "balance_loss_clip": 1.05121601, "balance_loss_mlp": 1.0295825, "epoch": 0.41760355918956293, "flos": 74739584010240.0, "grad_norm": 2.0028811905360207, "language_loss": 0.68500733, "learning_rate": 2.62055232271635e-06, "loss": 0.70700306, "num_input_tokens_seen": 74916260, "step": 3473, "time_per_iteration": 3.0614500045776367 }, { "auxiliary_loss_clip": 0.01105706, "auxiliary_loss_mlp": 0.01054027, "balance_loss_clip": 1.04302239, "balance_loss_mlp": 1.033499, "epoch": 0.417723802080202, "flos": 14317513148160.0, "grad_norm": 2.1609147560439212, "language_loss": 0.87988937, "learning_rate": 2.619811749022646e-06, "loss": 0.90148669, "num_input_tokens_seen": 74931570, "step": 3474, "time_per_iteration": 2.729032278060913 }, { "auxiliary_loss_clip": 0.0114922, "auxiliary_loss_mlp": 0.01053026, "balance_loss_clip": 1.05093384, "balance_loss_mlp": 1.03270054, "epoch": 0.4178440449708411, "flos": 14643797316480.0, "grad_norm": 2.436898220568134, "language_loss": 0.71456945, "learning_rate": 2.6190710813039917e-06, "loss": 0.73659188, "num_input_tokens_seen": 74944695, "step": 3475, "time_per_iteration": 2.777540922164917 }, { "auxiliary_loss_clip": 0.01101575, "auxiliary_loss_mlp": 0.00776406, "balance_loss_clip": 1.04341674, "balance_loss_mlp": 1.00075746, "epoch": 0.4179642878614802, "flos": 21507296094720.0, "grad_norm": 2.4308672588177616, "language_loss": 0.83450156, "learning_rate": 2.618330319672747e-06, "loss": 0.85328138, "num_input_tokens_seen": 74964115, "step": 3476, "time_per_iteration": 2.7717273235321045 }, { "auxiliary_loss_clip": 0.01157194, "auxiliary_loss_mlp": 0.01051466, "balance_loss_clip": 1.05090761, "balance_loss_mlp": 1.032655, "epoch": 0.41808453075211927, "flos": 18441997257600.0, "grad_norm": 2.5027245177968536, "language_loss": 0.92191935, "learning_rate": 2.617589464241284e-06, "loss": 0.94400597, "num_input_tokens_seen": 74978515, "step": 3477, "time_per_iteration": 3.5654029846191406 }, { "auxiliary_loss_clip": 0.01118743, "auxiliary_loss_mlp": 0.01048205, "balance_loss_clip": 1.0468843, "balance_loss_mlp": 1.0298115, "epoch": 0.4182047736427584, "flos": 20301020628480.0, "grad_norm": 2.0999815733587543, "language_loss": 0.74612433, "learning_rate": 2.6168485151219914e-06, "loss": 0.76779377, "num_input_tokens_seen": 74998135, "step": 3478, "time_per_iteration": 2.6669466495513916 }, { "auxiliary_loss_clip": 0.01149119, "auxiliary_loss_mlp": 0.01043717, "balance_loss_clip": 1.0522238, "balance_loss_mlp": 1.02379692, "epoch": 0.4183250165333975, "flos": 18876623823360.0, "grad_norm": 2.4220730549704848, "language_loss": 0.71792531, "learning_rate": 2.616107472427269e-06, "loss": 0.73985362, "num_input_tokens_seen": 75012830, "step": 3479, "time_per_iteration": 2.6559336185455322 }, { "auxiliary_loss_clip": 0.01150748, "auxiliary_loss_mlp": 0.01052154, "balance_loss_clip": 1.0505358, "balance_loss_mlp": 1.03107786, "epoch": 0.41844525942403654, "flos": 17740050698880.0, "grad_norm": 8.719628231417314, "language_loss": 0.76354051, "learning_rate": 2.615366336269533e-06, "loss": 0.78556949, "num_input_tokens_seen": 75026495, "step": 3480, "time_per_iteration": 2.6481099128723145 }, { "auxiliary_loss_clip": 0.01164194, "auxiliary_loss_mlp": 0.01054903, "balance_loss_clip": 1.05274463, "balance_loss_mlp": 1.03478098, "epoch": 0.41856550231467565, "flos": 18361377181440.0, "grad_norm": 2.2538238809942004, "language_loss": 0.80457693, "learning_rate": 2.6146251067612126e-06, "loss": 0.82676792, "num_input_tokens_seen": 75041970, "step": 3481, "time_per_iteration": 2.7098870277404785 }, { "auxiliary_loss_clip": 0.0114651, "auxiliary_loss_mlp": 0.01048626, "balance_loss_clip": 1.05518699, "balance_loss_mlp": 1.03075612, "epoch": 0.41868574520531476, "flos": 22781801445120.0, "grad_norm": 2.054994176136773, "language_loss": 0.82619143, "learning_rate": 2.6138837840147525e-06, "loss": 0.84814274, "num_input_tokens_seen": 75061005, "step": 3482, "time_per_iteration": 2.6713531017303467 }, { "auxiliary_loss_clip": 0.01117505, "auxiliary_loss_mlp": 0.01038264, "balance_loss_clip": 1.04715037, "balance_loss_mlp": 1.02028728, "epoch": 0.4188059880959538, "flos": 13699167494400.0, "grad_norm": 2.5556866260324123, "language_loss": 0.76389211, "learning_rate": 2.6131423681426103e-06, "loss": 0.7854498, "num_input_tokens_seen": 75076920, "step": 3483, "time_per_iteration": 2.7040958404541016 }, { "auxiliary_loss_clip": 0.01154564, "auxiliary_loss_mlp": 0.01041956, "balance_loss_clip": 1.05135012, "balance_loss_mlp": 1.02474236, "epoch": 0.41892623098659293, "flos": 37818281220480.0, "grad_norm": 2.045547271238905, "language_loss": 0.72807878, "learning_rate": 2.6124008592572587e-06, "loss": 0.75004399, "num_input_tokens_seen": 75100905, "step": 3484, "time_per_iteration": 2.779473066329956 }, { "auxiliary_loss_clip": 0.01160386, "auxiliary_loss_mlp": 0.01051291, "balance_loss_clip": 1.05155444, "balance_loss_mlp": 1.03084612, "epoch": 0.419046473877232, "flos": 23258874908160.0, "grad_norm": 2.162076023973905, "language_loss": 0.81907952, "learning_rate": 2.6116592574711835e-06, "loss": 0.8411963, "num_input_tokens_seen": 75119205, "step": 3485, "time_per_iteration": 2.643684148788452 }, { "auxiliary_loss_clip": 0.01163026, "auxiliary_loss_mlp": 0.01045038, "balance_loss_clip": 1.05418682, "balance_loss_mlp": 1.02560663, "epoch": 0.4191667167678711, "flos": 20741034234240.0, "grad_norm": 2.05910496395104, "language_loss": 0.84193575, "learning_rate": 2.6109175628968853e-06, "loss": 0.86401641, "num_input_tokens_seen": 75138970, "step": 3486, "time_per_iteration": 2.666721820831299 }, { "auxiliary_loss_clip": 0.0113473, "auxiliary_loss_mlp": 0.0105033, "balance_loss_clip": 1.04887676, "balance_loss_mlp": 1.03100657, "epoch": 0.4192869596585102, "flos": 23586416052480.0, "grad_norm": 2.191474115773646, "language_loss": 0.8285346, "learning_rate": 2.610175775646878e-06, "loss": 0.85038519, "num_input_tokens_seen": 75157550, "step": 3487, "time_per_iteration": 2.703878164291382 }, { "auxiliary_loss_clip": 0.01130692, "auxiliary_loss_mlp": 0.01054974, "balance_loss_clip": 1.04717767, "balance_loss_mlp": 1.03458881, "epoch": 0.41940720254914926, "flos": 25081269384960.0, "grad_norm": 9.72515927242669, "language_loss": 0.73609447, "learning_rate": 2.6094338958336907e-06, "loss": 0.75795108, "num_input_tokens_seen": 75176220, "step": 3488, "time_per_iteration": 2.7287847995758057 }, { "auxiliary_loss_clip": 0.01132352, "auxiliary_loss_mlp": 0.01047644, "balance_loss_clip": 1.05124569, "balance_loss_mlp": 1.02840412, "epoch": 0.41952744543978837, "flos": 15554132628480.0, "grad_norm": 3.0544235361619423, "language_loss": 0.8249445, "learning_rate": 2.608691923569867e-06, "loss": 0.84674448, "num_input_tokens_seen": 75193095, "step": 3489, "time_per_iteration": 2.659146308898926 }, { "auxiliary_loss_clip": 0.0114579, "auxiliary_loss_mlp": 0.0104407, "balance_loss_clip": 1.0513742, "balance_loss_mlp": 1.02633119, "epoch": 0.4196476883304275, "flos": 24644775312000.0, "grad_norm": 1.8460458921227334, "language_loss": 0.75789404, "learning_rate": 2.6079498589679616e-06, "loss": 0.77979267, "num_input_tokens_seen": 75214185, "step": 3490, "time_per_iteration": 2.769768476486206 }, { "auxiliary_loss_clip": 0.01089739, "auxiliary_loss_mlp": 0.01053759, "balance_loss_clip": 1.04167652, "balance_loss_mlp": 1.03153849, "epoch": 0.41976793122106654, "flos": 24531333183360.0, "grad_norm": 2.3207751240095433, "language_loss": 0.75801575, "learning_rate": 2.6072077021405465e-06, "loss": 0.77945077, "num_input_tokens_seen": 75233020, "step": 3491, "time_per_iteration": 2.8507227897644043 }, { "auxiliary_loss_clip": 0.01127549, "auxiliary_loss_mlp": 0.01053073, "balance_loss_clip": 1.0472095, "balance_loss_mlp": 1.03439295, "epoch": 0.41988817411170565, "flos": 21175301664000.0, "grad_norm": 2.0636123969695364, "language_loss": 0.6964376, "learning_rate": 2.6064654532002054e-06, "loss": 0.71824384, "num_input_tokens_seen": 75252030, "step": 3492, "time_per_iteration": 4.211406469345093 }, { "auxiliary_loss_clip": 0.01155132, "auxiliary_loss_mlp": 0.01050877, "balance_loss_clip": 1.05171049, "balance_loss_mlp": 1.03287685, "epoch": 0.42000841700234476, "flos": 31649402626560.0, "grad_norm": 1.8304631986204283, "language_loss": 0.75976574, "learning_rate": 2.6057231122595375e-06, "loss": 0.78182584, "num_input_tokens_seen": 75273340, "step": 3493, "time_per_iteration": 2.757357597351074 }, { "auxiliary_loss_clip": 0.01133111, "auxiliary_loss_mlp": 0.01051841, "balance_loss_clip": 1.0478071, "balance_loss_mlp": 1.03344738, "epoch": 0.4201286598929838, "flos": 21281525159040.0, "grad_norm": 1.8618130686354597, "language_loss": 0.73122382, "learning_rate": 2.604980679431154e-06, "loss": 0.75307333, "num_input_tokens_seen": 75291580, "step": 3494, "time_per_iteration": 3.713711977005005 }, { "auxiliary_loss_clip": 0.01144389, "auxiliary_loss_mlp": 0.01045804, "balance_loss_clip": 1.04989207, "balance_loss_mlp": 1.0270164, "epoch": 0.4202489027836229, "flos": 18546532813440.0, "grad_norm": 2.4094161738227022, "language_loss": 0.7457726, "learning_rate": 2.604238154827684e-06, "loss": 0.76767457, "num_input_tokens_seen": 75308205, "step": 3495, "time_per_iteration": 2.6466686725616455 }, { "auxiliary_loss_clip": 0.01149205, "auxiliary_loss_mlp": 0.01036555, "balance_loss_clip": 1.05337775, "balance_loss_mlp": 1.01923418, "epoch": 0.42036914567426203, "flos": 19317643009920.0, "grad_norm": 2.23835386616551, "language_loss": 0.72905254, "learning_rate": 2.6034955385617656e-06, "loss": 0.75091016, "num_input_tokens_seen": 75326535, "step": 3496, "time_per_iteration": 2.7069387435913086 }, { "auxiliary_loss_clip": 0.01028384, "auxiliary_loss_mlp": 0.0101176, "balance_loss_clip": 1.02186775, "balance_loss_mlp": 1.00969779, "epoch": 0.4204893885649011, "flos": 67842942935040.0, "grad_norm": 0.7239200689037788, "language_loss": 0.61634338, "learning_rate": 2.6027528307460544e-06, "loss": 0.63674486, "num_input_tokens_seen": 75390540, "step": 3497, "time_per_iteration": 4.281646490097046 }, { "auxiliary_loss_clip": 0.01161534, "auxiliary_loss_mlp": 0.01047485, "balance_loss_clip": 1.05446899, "balance_loss_mlp": 1.02901983, "epoch": 0.4206096314555402, "flos": 21908777385600.0, "grad_norm": 2.0415888603355614, "language_loss": 0.86752158, "learning_rate": 2.602010031493217e-06, "loss": 0.88961184, "num_input_tokens_seen": 75408770, "step": 3498, "time_per_iteration": 2.6508948802948 }, { "auxiliary_loss_clip": 0.01113039, "auxiliary_loss_mlp": 0.01045038, "balance_loss_clip": 1.04653907, "balance_loss_mlp": 1.02552354, "epoch": 0.42072987434617926, "flos": 29278185269760.0, "grad_norm": 2.0821270275133488, "language_loss": 0.86903691, "learning_rate": 2.6012671409159367e-06, "loss": 0.89061773, "num_input_tokens_seen": 75430105, "step": 3499, "time_per_iteration": 2.788019895553589 }, { "auxiliary_loss_clip": 0.01132636, "auxiliary_loss_mlp": 0.01046803, "balance_loss_clip": 1.05100846, "balance_loss_mlp": 1.02746725, "epoch": 0.42085011723681837, "flos": 27600726170880.0, "grad_norm": 1.8037046786444033, "language_loss": 0.81547618, "learning_rate": 2.6005241591269097e-06, "loss": 0.83727056, "num_input_tokens_seen": 75449475, "step": 3500, "time_per_iteration": 2.748887300491333 }, { "auxiliary_loss_clip": 0.011151, "auxiliary_loss_mlp": 0.01042941, "balance_loss_clip": 1.0505054, "balance_loss_mlp": 1.0256083, "epoch": 0.4209703601274575, "flos": 27818632028160.0, "grad_norm": 1.6957983645477486, "language_loss": 0.79377663, "learning_rate": 2.5997810862388454e-06, "loss": 0.81535709, "num_input_tokens_seen": 75469315, "step": 3501, "time_per_iteration": 2.8822224140167236 }, { "auxiliary_loss_clip": 0.01131263, "auxiliary_loss_mlp": 0.01050021, "balance_loss_clip": 1.04595685, "balance_loss_mlp": 1.02937365, "epoch": 0.42109060301809653, "flos": 27525529048320.0, "grad_norm": 2.8984160368210294, "language_loss": 0.75627267, "learning_rate": 2.599037922364467e-06, "loss": 0.77808547, "num_input_tokens_seen": 75488215, "step": 3502, "time_per_iteration": 3.7505393028259277 }, { "auxiliary_loss_clip": 0.01111297, "auxiliary_loss_mlp": 0.01043168, "balance_loss_clip": 1.04722643, "balance_loss_mlp": 1.02476215, "epoch": 0.42121084590873564, "flos": 29314275459840.0, "grad_norm": 2.730299919788673, "language_loss": 0.75131154, "learning_rate": 2.5982946676165112e-06, "loss": 0.77285618, "num_input_tokens_seen": 75507985, "step": 3503, "time_per_iteration": 2.822124481201172 }, { "auxiliary_loss_clip": 0.01026705, "auxiliary_loss_mlp": 0.01017821, "balance_loss_clip": 1.02506232, "balance_loss_mlp": 1.01528156, "epoch": 0.42133108879937475, "flos": 67398835178880.0, "grad_norm": 0.7308309144013905, "language_loss": 0.57554495, "learning_rate": 2.5975513221077313e-06, "loss": 0.59599018, "num_input_tokens_seen": 75571955, "step": 3504, "time_per_iteration": 3.379270315170288 }, { "auxiliary_loss_clip": 0.01123193, "auxiliary_loss_mlp": 0.01056052, "balance_loss_clip": 1.04671299, "balance_loss_mlp": 1.03696644, "epoch": 0.4214513316900138, "flos": 23106038538240.0, "grad_norm": 2.291773967136979, "language_loss": 0.88604122, "learning_rate": 2.5968078859508897e-06, "loss": 0.9078337, "num_input_tokens_seen": 75589155, "step": 3505, "time_per_iteration": 2.7207818031311035 }, { "auxiliary_loss_clip": 0.01144642, "auxiliary_loss_mlp": 0.01049377, "balance_loss_clip": 1.05029154, "balance_loss_mlp": 1.03132832, "epoch": 0.4215715745806529, "flos": 15336190857600.0, "grad_norm": 2.417551317251208, "language_loss": 0.79897988, "learning_rate": 2.5960643592587673e-06, "loss": 0.82092011, "num_input_tokens_seen": 75606565, "step": 3506, "time_per_iteration": 2.6654717922210693 }, { "auxiliary_loss_clip": 0.01118063, "auxiliary_loss_mlp": 0.01047814, "balance_loss_clip": 1.04515004, "balance_loss_mlp": 1.02903891, "epoch": 0.42169181747129203, "flos": 22127257860480.0, "grad_norm": 2.38610381796642, "language_loss": 0.8181684, "learning_rate": 2.5953207421441553e-06, "loss": 0.83982718, "num_input_tokens_seen": 75625165, "step": 3507, "time_per_iteration": 2.7239112854003906 }, { "auxiliary_loss_clip": 0.01117472, "auxiliary_loss_mlp": 0.01039863, "balance_loss_clip": 1.04790473, "balance_loss_mlp": 1.02194619, "epoch": 0.4218120603619311, "flos": 22630724841600.0, "grad_norm": 2.513472906147837, "language_loss": 0.74616086, "learning_rate": 2.5945770347198603e-06, "loss": 0.76773423, "num_input_tokens_seen": 75643320, "step": 3508, "time_per_iteration": 2.76920485496521 }, { "auxiliary_loss_clip": 0.01123164, "auxiliary_loss_mlp": 0.01059639, "balance_loss_clip": 1.04452991, "balance_loss_mlp": 1.03991032, "epoch": 0.4219323032525702, "flos": 19682818629120.0, "grad_norm": 1.8386056179859485, "language_loss": 0.81870365, "learning_rate": 2.593833237098701e-06, "loss": 0.84053171, "num_input_tokens_seen": 75660920, "step": 3509, "time_per_iteration": 2.6778228282928467 }, { "auxiliary_loss_clip": 0.01144792, "auxiliary_loss_mlp": 0.01045844, "balance_loss_clip": 1.04883826, "balance_loss_mlp": 1.02507782, "epoch": 0.4220525461432093, "flos": 30190747224960.0, "grad_norm": 8.709590271242154, "language_loss": 0.62014377, "learning_rate": 2.593089349393512e-06, "loss": 0.64205015, "num_input_tokens_seen": 75681410, "step": 3510, "time_per_iteration": 2.7298648357391357 }, { "auxiliary_loss_clip": 0.01143291, "auxiliary_loss_mlp": 0.01051607, "balance_loss_clip": 1.05181122, "balance_loss_mlp": 1.03286695, "epoch": 0.42217278903384836, "flos": 24315941278080.0, "grad_norm": 1.9283022040654063, "language_loss": 0.83555883, "learning_rate": 2.592345371717141e-06, "loss": 0.85750782, "num_input_tokens_seen": 75700940, "step": 3511, "time_per_iteration": 2.6755170822143555 }, { "auxiliary_loss_clip": 0.01141917, "auxiliary_loss_mlp": 0.01054285, "balance_loss_clip": 1.05113757, "balance_loss_mlp": 1.03511572, "epoch": 0.42229303192448747, "flos": 17092474352640.0, "grad_norm": 2.983572136367511, "language_loss": 0.71928549, "learning_rate": 2.591601304182448e-06, "loss": 0.74124753, "num_input_tokens_seen": 75718910, "step": 3512, "time_per_iteration": 2.7406628131866455 }, { "auxiliary_loss_clip": 0.01129135, "auxiliary_loss_mlp": 0.01050292, "balance_loss_clip": 1.04817367, "balance_loss_mlp": 1.03347111, "epoch": 0.4224132748151266, "flos": 22784530878720.0, "grad_norm": 1.7895027167950528, "language_loss": 0.79581797, "learning_rate": 2.5908571469023067e-06, "loss": 0.81761223, "num_input_tokens_seen": 75738395, "step": 3513, "time_per_iteration": 2.7074499130249023 }, { "auxiliary_loss_clip": 0.01154764, "auxiliary_loss_mlp": 0.01046509, "balance_loss_clip": 1.05043375, "balance_loss_mlp": 1.02812648, "epoch": 0.42253351770576564, "flos": 17819090576640.0, "grad_norm": 2.267329156185243, "language_loss": 0.75722963, "learning_rate": 2.5901128999896067e-06, "loss": 0.7792424, "num_input_tokens_seen": 75753825, "step": 3514, "time_per_iteration": 2.615837335586548 }, { "auxiliary_loss_clip": 0.011427, "auxiliary_loss_mlp": 0.01040922, "balance_loss_clip": 1.05034173, "balance_loss_mlp": 1.02305281, "epoch": 0.42265376059640475, "flos": 28512390286080.0, "grad_norm": 1.6166231374709108, "language_loss": 0.68337059, "learning_rate": 2.5893685635572487e-06, "loss": 0.70520681, "num_input_tokens_seen": 75774675, "step": 3515, "time_per_iteration": 2.753403663635254 }, { "auxiliary_loss_clip": 0.01128599, "auxiliary_loss_mlp": 0.01048091, "balance_loss_clip": 1.04800534, "balance_loss_mlp": 1.02882695, "epoch": 0.4227740034870438, "flos": 16253349753600.0, "grad_norm": 2.2732412963686746, "language_loss": 0.69155633, "learning_rate": 2.5886241377181483e-06, "loss": 0.71332324, "num_input_tokens_seen": 75793545, "step": 3516, "time_per_iteration": 2.6244521141052246 }, { "auxiliary_loss_clip": 0.01146201, "auxiliary_loss_mlp": 0.01042004, "balance_loss_clip": 1.04956937, "balance_loss_mlp": 1.02252507, "epoch": 0.4228942463776829, "flos": 25295691623040.0, "grad_norm": 2.664446572326574, "language_loss": 0.81526673, "learning_rate": 2.587879622585234e-06, "loss": 0.83714879, "num_input_tokens_seen": 75812145, "step": 3517, "time_per_iteration": 2.7315304279327393 }, { "auxiliary_loss_clip": 0.01143315, "auxiliary_loss_mlp": 0.01043659, "balance_loss_clip": 1.04995298, "balance_loss_mlp": 1.02483535, "epoch": 0.423014489268322, "flos": 26395779507840.0, "grad_norm": 3.0515067313705124, "language_loss": 0.75475872, "learning_rate": 2.5871350182714486e-06, "loss": 0.77662849, "num_input_tokens_seen": 75833025, "step": 3518, "time_per_iteration": 3.8205149173736572 }, { "auxiliary_loss_clip": 0.01156029, "auxiliary_loss_mlp": 0.01049061, "balance_loss_clip": 1.05247402, "balance_loss_mlp": 1.0310483, "epoch": 0.4231347321589611, "flos": 17274002711040.0, "grad_norm": 1.99886199449975, "language_loss": 0.80569202, "learning_rate": 2.586390324889748e-06, "loss": 0.82774293, "num_input_tokens_seen": 75848925, "step": 3519, "time_per_iteration": 2.7219715118408203 }, { "auxiliary_loss_clip": 0.01137525, "auxiliary_loss_mlp": 0.01043342, "balance_loss_clip": 1.0493089, "balance_loss_mlp": 1.02523422, "epoch": 0.4232549750496002, "flos": 22999635475200.0, "grad_norm": 2.237598252684917, "language_loss": 0.67625099, "learning_rate": 2.5856455425531003e-06, "loss": 0.69805968, "num_input_tokens_seen": 75870400, "step": 3520, "time_per_iteration": 2.7590172290802 }, { "auxiliary_loss_clip": 0.01136449, "auxiliary_loss_mlp": 0.01050156, "balance_loss_clip": 1.0479219, "balance_loss_mlp": 1.03228605, "epoch": 0.4233752179402393, "flos": 21248343970560.0, "grad_norm": 2.359512902425427, "language_loss": 0.80636621, "learning_rate": 2.5849006713744902e-06, "loss": 0.82823229, "num_input_tokens_seen": 75889195, "step": 3521, "time_per_iteration": 3.662604808807373 }, { "auxiliary_loss_clip": 0.01124936, "auxiliary_loss_mlp": 0.01049536, "balance_loss_clip": 1.04618478, "balance_loss_mlp": 1.03074861, "epoch": 0.42349546083087836, "flos": 20704297599360.0, "grad_norm": 2.2979577934147533, "language_loss": 0.72858953, "learning_rate": 2.5841557114669135e-06, "loss": 0.75033426, "num_input_tokens_seen": 75906055, "step": 3522, "time_per_iteration": 2.6937503814697266 }, { "auxiliary_loss_clip": 0.01161417, "auxiliary_loss_mlp": 0.01047425, "balance_loss_clip": 1.05160928, "balance_loss_mlp": 1.02819657, "epoch": 0.42361570372151747, "flos": 18585065128320.0, "grad_norm": 2.919766065974064, "language_loss": 0.67313606, "learning_rate": 2.58341066294338e-06, "loss": 0.6952244, "num_input_tokens_seen": 75922720, "step": 3523, "time_per_iteration": 2.637939214706421 }, { "auxiliary_loss_clip": 0.01106012, "auxiliary_loss_mlp": 0.00774219, "balance_loss_clip": 1.04397857, "balance_loss_mlp": 1.00079918, "epoch": 0.4237359466121566, "flos": 20959478795520.0, "grad_norm": 1.9962601560448263, "language_loss": 0.85062414, "learning_rate": 2.5826655259169124e-06, "loss": 0.86942649, "num_input_tokens_seen": 75941375, "step": 3524, "time_per_iteration": 3.6830623149871826 }, { "auxiliary_loss_clip": 0.01156239, "auxiliary_loss_mlp": 0.0104391, "balance_loss_clip": 1.05335927, "balance_loss_mlp": 1.02524209, "epoch": 0.42385618950279563, "flos": 18038181582720.0, "grad_norm": 1.9342007458631607, "language_loss": 0.90768117, "learning_rate": 2.5819203005005475e-06, "loss": 0.92968267, "num_input_tokens_seen": 75958710, "step": 3525, "time_per_iteration": 2.635692596435547 }, { "auxiliary_loss_clip": 0.01122351, "auxiliary_loss_mlp": 0.01045968, "balance_loss_clip": 1.04688585, "balance_loss_mlp": 1.02716875, "epoch": 0.42397643239343474, "flos": 23769129559680.0, "grad_norm": 1.6339467786180701, "language_loss": 0.78982997, "learning_rate": 2.581174986807336e-06, "loss": 0.81151319, "num_input_tokens_seen": 75978945, "step": 3526, "time_per_iteration": 2.6763172149658203 }, { "auxiliary_loss_clip": 0.01132511, "auxiliary_loss_mlp": 0.0077434, "balance_loss_clip": 1.04715681, "balance_loss_mlp": 1.00082791, "epoch": 0.42409667528407385, "flos": 16545088016640.0, "grad_norm": 2.228675862613618, "language_loss": 0.91268212, "learning_rate": 2.580429584950341e-06, "loss": 0.93175066, "num_input_tokens_seen": 75994695, "step": 3527, "time_per_iteration": 2.6378071308135986 }, { "auxiliary_loss_clip": 0.01121199, "auxiliary_loss_mlp": 0.01048546, "balance_loss_clip": 1.04871154, "balance_loss_mlp": 1.0280776, "epoch": 0.4242169181747129, "flos": 16034186920320.0, "grad_norm": 2.461100376048485, "language_loss": 0.66586161, "learning_rate": 2.5796840950426397e-06, "loss": 0.68755907, "num_input_tokens_seen": 76011780, "step": 3528, "time_per_iteration": 3.5252034664154053 }, { "auxiliary_loss_clip": 0.01131108, "auxiliary_loss_mlp": 0.01041146, "balance_loss_clip": 1.04647112, "balance_loss_mlp": 1.02245402, "epoch": 0.424337161065352, "flos": 20084012611200.0, "grad_norm": 1.9138532449621217, "language_loss": 0.65604031, "learning_rate": 2.578938517197322e-06, "loss": 0.67776287, "num_input_tokens_seen": 76029875, "step": 3529, "time_per_iteration": 2.6543641090393066 }, { "auxiliary_loss_clip": 0.01122338, "auxiliary_loss_mlp": 0.01050612, "balance_loss_clip": 1.04947567, "balance_loss_mlp": 1.0309782, "epoch": 0.4244574039559911, "flos": 23878369797120.0, "grad_norm": 2.676215259298792, "language_loss": 0.63321662, "learning_rate": 2.5781928515274916e-06, "loss": 0.65494615, "num_input_tokens_seen": 76048595, "step": 3530, "time_per_iteration": 2.7396080493927 }, { "auxiliary_loss_clip": 0.01144485, "auxiliary_loss_mlp": 0.01042978, "balance_loss_clip": 1.05030274, "balance_loss_mlp": 1.02467966, "epoch": 0.4245776468466302, "flos": 17565920542080.0, "grad_norm": 2.125907076503075, "language_loss": 0.68028831, "learning_rate": 2.577447098146265e-06, "loss": 0.70216292, "num_input_tokens_seen": 76065770, "step": 3531, "time_per_iteration": 2.670692205429077 }, { "auxiliary_loss_clip": 0.0111863, "auxiliary_loss_mlp": 0.01047816, "balance_loss_clip": 1.04770005, "balance_loss_mlp": 1.02997065, "epoch": 0.4246978897372693, "flos": 27776256958080.0, "grad_norm": 2.136411884467941, "language_loss": 0.79487401, "learning_rate": 2.5767012571667724e-06, "loss": 0.81653845, "num_input_tokens_seen": 76085250, "step": 3532, "time_per_iteration": 2.785841226577759 }, { "auxiliary_loss_clip": 0.01141581, "auxiliary_loss_mlp": 0.01045131, "balance_loss_clip": 1.04751539, "balance_loss_mlp": 1.02511597, "epoch": 0.42481813262790835, "flos": 15596615439360.0, "grad_norm": 1.8648622265875463, "language_loss": 0.68692565, "learning_rate": 2.5759553287021587e-06, "loss": 0.70879281, "num_input_tokens_seen": 76103580, "step": 3533, "time_per_iteration": 2.6349122524261475 }, { "auxiliary_loss_clip": 0.01125852, "auxiliary_loss_mlp": 0.01045822, "balance_loss_clip": 1.04985332, "balance_loss_mlp": 1.02808404, "epoch": 0.42493837551854746, "flos": 23951088881280.0, "grad_norm": 2.5660895543783573, "language_loss": 0.7781449, "learning_rate": 2.5752093128655786e-06, "loss": 0.79986161, "num_input_tokens_seen": 76121825, "step": 3534, "time_per_iteration": 2.7465786933898926 }, { "auxiliary_loss_clip": 0.01124006, "auxiliary_loss_mlp": 0.01049348, "balance_loss_clip": 1.04489613, "balance_loss_mlp": 1.02929735, "epoch": 0.4250586184091866, "flos": 20813466009600.0, "grad_norm": 2.1582945500200994, "language_loss": 0.7373631, "learning_rate": 2.574463209770204e-06, "loss": 0.75909656, "num_input_tokens_seen": 76141140, "step": 3535, "time_per_iteration": 2.649683952331543 }, { "auxiliary_loss_clip": 0.0111241, "auxiliary_loss_mlp": 0.0104671, "balance_loss_clip": 1.04462075, "balance_loss_mlp": 1.02737474, "epoch": 0.42517886129982563, "flos": 30371018607360.0, "grad_norm": 2.1798096277016876, "language_loss": 0.79367292, "learning_rate": 2.5737170195292165e-06, "loss": 0.81526417, "num_input_tokens_seen": 76164475, "step": 3536, "time_per_iteration": 2.8036673069000244 }, { "auxiliary_loss_clip": 0.01119416, "auxiliary_loss_mlp": 0.01038261, "balance_loss_clip": 1.0475024, "balance_loss_mlp": 1.02086878, "epoch": 0.42529910419046474, "flos": 20080636732800.0, "grad_norm": 1.96282936468929, "language_loss": 0.7809546, "learning_rate": 2.572970742255814e-06, "loss": 0.80253136, "num_input_tokens_seen": 76182965, "step": 3537, "time_per_iteration": 2.6892342567443848 }, { "auxiliary_loss_clip": 0.01141061, "auxiliary_loss_mlp": 0.01046798, "balance_loss_clip": 1.0509336, "balance_loss_mlp": 1.0299418, "epoch": 0.42541934708110385, "flos": 22632448694400.0, "grad_norm": 1.755310190604375, "language_loss": 0.8181268, "learning_rate": 2.5722243780632046e-06, "loss": 0.8400054, "num_input_tokens_seen": 76201230, "step": 3538, "time_per_iteration": 2.671661376953125 }, { "auxiliary_loss_clip": 0.01015878, "auxiliary_loss_mlp": 0.01019483, "balance_loss_clip": 1.01871967, "balance_loss_mlp": 1.01763535, "epoch": 0.4255395899717429, "flos": 66200676186240.0, "grad_norm": 0.75619005381897, "language_loss": 0.60501838, "learning_rate": 2.5714779270646125e-06, "loss": 0.62537193, "num_input_tokens_seen": 76262000, "step": 3539, "time_per_iteration": 3.260711908340454 }, { "auxiliary_loss_clip": 0.01135872, "auxiliary_loss_mlp": 0.00774049, "balance_loss_clip": 1.05278778, "balance_loss_mlp": 1.0007273, "epoch": 0.425659832862382, "flos": 17931814433280.0, "grad_norm": 3.445134126944061, "language_loss": 0.77876037, "learning_rate": 2.5707313893732735e-06, "loss": 0.79785961, "num_input_tokens_seen": 76280540, "step": 3540, "time_per_iteration": 2.7021117210388184 }, { "auxiliary_loss_clip": 0.01073652, "auxiliary_loss_mlp": 0.01041141, "balance_loss_clip": 1.04115081, "balance_loss_mlp": 1.02231789, "epoch": 0.4257800757530211, "flos": 24022550989440.0, "grad_norm": 1.9954964009560627, "language_loss": 0.77047861, "learning_rate": 2.5699847651024364e-06, "loss": 0.79162651, "num_input_tokens_seen": 76301180, "step": 3541, "time_per_iteration": 2.858510971069336 }, { "auxiliary_loss_clip": 0.01137365, "auxiliary_loss_mlp": 0.01052459, "balance_loss_clip": 1.05038297, "balance_loss_mlp": 1.03487575, "epoch": 0.4259003186436602, "flos": 23696015425920.0, "grad_norm": 3.846369364159312, "language_loss": 0.76891667, "learning_rate": 2.5692380543653627e-06, "loss": 0.79081488, "num_input_tokens_seen": 76319335, "step": 3542, "time_per_iteration": 2.6801326274871826 }, { "auxiliary_loss_clip": 0.01146943, "auxiliary_loss_mlp": 0.00774344, "balance_loss_clip": 1.05079937, "balance_loss_mlp": 1.00082302, "epoch": 0.4260205615342993, "flos": 15259772672640.0, "grad_norm": 2.2483105029766772, "language_loss": 0.6991713, "learning_rate": 2.5684912572753293e-06, "loss": 0.71838415, "num_input_tokens_seen": 76335010, "step": 3543, "time_per_iteration": 2.648073434829712 }, { "auxiliary_loss_clip": 0.01147999, "auxiliary_loss_mlp": 0.01041136, "balance_loss_clip": 1.04936576, "balance_loss_mlp": 1.02444696, "epoch": 0.4261408044249384, "flos": 30665306736000.0, "grad_norm": 1.7487020222291243, "language_loss": 0.84062153, "learning_rate": 2.5677443739456245e-06, "loss": 0.86251283, "num_input_tokens_seen": 76356670, "step": 3544, "time_per_iteration": 3.709200382232666 }, { "auxiliary_loss_clip": 0.01128653, "auxiliary_loss_mlp": 0.01047598, "balance_loss_clip": 1.050385, "balance_loss_mlp": 1.02962112, "epoch": 0.42626104731557746, "flos": 23257905240960.0, "grad_norm": 2.3160872531918826, "language_loss": 0.7945838, "learning_rate": 2.5669974044895495e-06, "loss": 0.81634629, "num_input_tokens_seen": 76373065, "step": 3545, "time_per_iteration": 2.7925050258636475 }, { "auxiliary_loss_clip": 0.01120458, "auxiliary_loss_mlp": 0.01038546, "balance_loss_clip": 1.04722345, "balance_loss_mlp": 1.02046204, "epoch": 0.42638129020621657, "flos": 25884770670720.0, "grad_norm": 1.7993622371004063, "language_loss": 0.79547501, "learning_rate": 2.5662503490204187e-06, "loss": 0.817065, "num_input_tokens_seen": 76393230, "step": 3546, "time_per_iteration": 2.772601366043091 }, { "auxiliary_loss_clip": 0.01126741, "auxiliary_loss_mlp": 0.0104114, "balance_loss_clip": 1.04675853, "balance_loss_mlp": 1.02360392, "epoch": 0.4265015330968556, "flos": 26502362138880.0, "grad_norm": 1.955760257579605, "language_loss": 0.76130313, "learning_rate": 2.5655032076515603e-06, "loss": 0.78298193, "num_input_tokens_seen": 76412555, "step": 3547, "time_per_iteration": 3.6000583171844482 }, { "auxiliary_loss_clip": 0.0113192, "auxiliary_loss_mlp": 0.01048309, "balance_loss_clip": 1.05060351, "balance_loss_mlp": 1.03054678, "epoch": 0.42662177598749473, "flos": 24389522288640.0, "grad_norm": 3.015218804265148, "language_loss": 0.82043219, "learning_rate": 2.5647559804963155e-06, "loss": 0.84223449, "num_input_tokens_seen": 76432485, "step": 3548, "time_per_iteration": 2.749866485595703 }, { "auxiliary_loss_clip": 0.01117103, "auxiliary_loss_mlp": 0.0104988, "balance_loss_clip": 1.05075967, "balance_loss_mlp": 1.0317843, "epoch": 0.42674201887813384, "flos": 23148629089920.0, "grad_norm": 2.1810166229878116, "language_loss": 0.78697187, "learning_rate": 2.5640086676680364e-06, "loss": 0.80864167, "num_input_tokens_seen": 76453980, "step": 3549, "time_per_iteration": 3.677529811859131 }, { "auxiliary_loss_clip": 0.01142872, "auxiliary_loss_mlp": 0.01041029, "balance_loss_clip": 1.05107141, "balance_loss_mlp": 1.02300465, "epoch": 0.4268622617687729, "flos": 21689614552320.0, "grad_norm": 2.168339909288458, "language_loss": 0.80847442, "learning_rate": 2.5632612692800923e-06, "loss": 0.83031344, "num_input_tokens_seen": 76473045, "step": 3550, "time_per_iteration": 2.6945133209228516 }, { "auxiliary_loss_clip": 0.0112367, "auxiliary_loss_mlp": 0.01055924, "balance_loss_clip": 1.05024958, "balance_loss_mlp": 1.03592062, "epoch": 0.426982504659412, "flos": 23440151871360.0, "grad_norm": 4.614827222440367, "language_loss": 0.75546575, "learning_rate": 2.5625137854458603e-06, "loss": 0.77726167, "num_input_tokens_seen": 76492060, "step": 3551, "time_per_iteration": 2.7406861782073975 }, { "auxiliary_loss_clip": 0.0112752, "auxiliary_loss_mlp": 0.01042327, "balance_loss_clip": 1.04721391, "balance_loss_mlp": 1.02481461, "epoch": 0.4271027475500511, "flos": 18916556768640.0, "grad_norm": 1.8452674022291518, "language_loss": 0.79866934, "learning_rate": 2.561766216278735e-06, "loss": 0.82036775, "num_input_tokens_seen": 76509655, "step": 3552, "time_per_iteration": 2.648714542388916 }, { "auxiliary_loss_clip": 0.01103219, "auxiliary_loss_mlp": 0.01045627, "balance_loss_clip": 1.04882956, "balance_loss_mlp": 1.0274117, "epoch": 0.4272229904406902, "flos": 26870554500480.0, "grad_norm": 2.3125200399625156, "language_loss": 0.81356382, "learning_rate": 2.561018561892121e-06, "loss": 0.83505225, "num_input_tokens_seen": 76528795, "step": 3553, "time_per_iteration": 2.7932839393615723 }, { "auxiliary_loss_clip": 0.01127923, "auxiliary_loss_mlp": 0.01045028, "balance_loss_clip": 1.04761624, "balance_loss_mlp": 1.02794576, "epoch": 0.4273432333313293, "flos": 23951376190080.0, "grad_norm": 1.8259359970791942, "language_loss": 0.76634026, "learning_rate": 2.5602708223994363e-06, "loss": 0.78806973, "num_input_tokens_seen": 76550660, "step": 3554, "time_per_iteration": 3.615952253341675 }, { "auxiliary_loss_clip": 0.01116141, "auxiliary_loss_mlp": 0.01046448, "balance_loss_clip": 1.04499888, "balance_loss_mlp": 1.02873361, "epoch": 0.4274634762219684, "flos": 29570354496000.0, "grad_norm": 2.2202868057527008, "language_loss": 0.67368788, "learning_rate": 2.559522997914115e-06, "loss": 0.69531375, "num_input_tokens_seen": 76570240, "step": 3555, "time_per_iteration": 2.8033854961395264 }, { "auxiliary_loss_clip": 0.01153678, "auxiliary_loss_mlp": 0.01039155, "balance_loss_clip": 1.05323267, "balance_loss_mlp": 1.02218509, "epoch": 0.42758371911260745, "flos": 21434146047360.0, "grad_norm": 2.6834156759787917, "language_loss": 0.849424, "learning_rate": 2.558775088549599e-06, "loss": 0.87135231, "num_input_tokens_seen": 76589820, "step": 3556, "time_per_iteration": 2.6874234676361084 }, { "auxiliary_loss_clip": 0.01148123, "auxiliary_loss_mlp": 0.01049084, "balance_loss_clip": 1.05124021, "balance_loss_mlp": 1.03004646, "epoch": 0.42770396200324656, "flos": 14752822072320.0, "grad_norm": 3.526893812544692, "language_loss": 0.66607416, "learning_rate": 2.5580270944193467e-06, "loss": 0.68804622, "num_input_tokens_seen": 76606640, "step": 3557, "time_per_iteration": 2.6750524044036865 }, { "auxiliary_loss_clip": 0.01052757, "auxiliary_loss_mlp": 0.01011532, "balance_loss_clip": 1.01865423, "balance_loss_mlp": 1.00985146, "epoch": 0.4278242048938857, "flos": 70654712601600.0, "grad_norm": 0.7428478775141893, "language_loss": 0.55480766, "learning_rate": 2.557279015636827e-06, "loss": 0.57545054, "num_input_tokens_seen": 76667050, "step": 3558, "time_per_iteration": 3.2051191329956055 }, { "auxiliary_loss_clip": 0.01040489, "auxiliary_loss_mlp": 0.0101226, "balance_loss_clip": 1.01692796, "balance_loss_mlp": 1.01049566, "epoch": 0.42794444778452473, "flos": 69366165033600.0, "grad_norm": 0.7849541562996375, "language_loss": 0.61265087, "learning_rate": 2.5565308523155245e-06, "loss": 0.63317835, "num_input_tokens_seen": 76726650, "step": 3559, "time_per_iteration": 3.1445960998535156 }, { "auxiliary_loss_clip": 0.01096863, "auxiliary_loss_mlp": 0.01047001, "balance_loss_clip": 1.04485142, "balance_loss_mlp": 1.02821326, "epoch": 0.42806469067516384, "flos": 18215328481920.0, "grad_norm": 2.4653654540383263, "language_loss": 0.8228091, "learning_rate": 2.5557826045689336e-06, "loss": 0.8442477, "num_input_tokens_seen": 76742890, "step": 3560, "time_per_iteration": 2.7066774368286133 }, { "auxiliary_loss_clip": 0.01023667, "auxiliary_loss_mlp": 0.01001121, "balance_loss_clip": 1.02336335, "balance_loss_mlp": 0.99901068, "epoch": 0.4281849335658029, "flos": 54535814432640.0, "grad_norm": 0.821540106700215, "language_loss": 0.587762, "learning_rate": 2.5550342725105643e-06, "loss": 0.60800982, "num_input_tokens_seen": 76801055, "step": 3561, "time_per_iteration": 3.217048168182373 }, { "auxiliary_loss_clip": 0.01143537, "auxiliary_loss_mlp": 0.01050396, "balance_loss_clip": 1.05266345, "balance_loss_mlp": 1.03129864, "epoch": 0.428305176456442, "flos": 17274828723840.0, "grad_norm": 2.4516238348604436, "language_loss": 0.81407648, "learning_rate": 2.554285856253937e-06, "loss": 0.83601582, "num_input_tokens_seen": 76819890, "step": 3562, "time_per_iteration": 2.652498722076416 }, { "auxiliary_loss_clip": 0.01125636, "auxiliary_loss_mlp": 0.01041824, "balance_loss_clip": 1.05015492, "balance_loss_mlp": 1.02577877, "epoch": 0.4284254193470811, "flos": 26359509749760.0, "grad_norm": 1.8785694114768285, "language_loss": 0.77754462, "learning_rate": 2.5535373559125855e-06, "loss": 0.79921919, "num_input_tokens_seen": 76840255, "step": 3563, "time_per_iteration": 2.7519640922546387 }, { "auxiliary_loss_clip": 0.01079249, "auxiliary_loss_mlp": 0.01051549, "balance_loss_clip": 1.04343462, "balance_loss_mlp": 1.03007936, "epoch": 0.42854566223772017, "flos": 29714248379520.0, "grad_norm": 1.680436224382597, "language_loss": 0.82188958, "learning_rate": 2.552788771600057e-06, "loss": 0.84319758, "num_input_tokens_seen": 76860565, "step": 3564, "time_per_iteration": 2.8773458003997803 }, { "auxiliary_loss_clip": 0.01125691, "auxiliary_loss_mlp": 0.01041054, "balance_loss_clip": 1.05271506, "balance_loss_mlp": 1.02444839, "epoch": 0.4286659051283593, "flos": 22018161277440.0, "grad_norm": 1.8298265202611468, "language_loss": 0.81697094, "learning_rate": 2.5520401034299118e-06, "loss": 0.83863837, "num_input_tokens_seen": 76878325, "step": 3565, "time_per_iteration": 2.7644972801208496 }, { "auxiliary_loss_clip": 0.01148569, "auxiliary_loss_mlp": 0.01044258, "balance_loss_clip": 1.05035162, "balance_loss_mlp": 1.02501774, "epoch": 0.4287861480189984, "flos": 13334422838400.0, "grad_norm": 4.860333383016429, "language_loss": 0.87725532, "learning_rate": 2.551291351515722e-06, "loss": 0.89918357, "num_input_tokens_seen": 76895340, "step": 3566, "time_per_iteration": 2.610842704772949 }, { "auxiliary_loss_clip": 0.01112975, "auxiliary_loss_mlp": 0.00774311, "balance_loss_clip": 1.0445441, "balance_loss_mlp": 1.00078166, "epoch": 0.42890639090963745, "flos": 26651535321600.0, "grad_norm": 1.729620519034002, "language_loss": 0.85739416, "learning_rate": 2.5505425159710726e-06, "loss": 0.87626696, "num_input_tokens_seen": 76915150, "step": 3567, "time_per_iteration": 2.7955338954925537 }, { "auxiliary_loss_clip": 0.01143266, "auxiliary_loss_mlp": 0.00773937, "balance_loss_clip": 1.05362105, "balance_loss_mlp": 1.000916, "epoch": 0.42902663380027656, "flos": 24055768091520.0, "grad_norm": 3.0214973369184044, "language_loss": 0.82890445, "learning_rate": 2.549793596909561e-06, "loss": 0.84807646, "num_input_tokens_seen": 76933770, "step": 3568, "time_per_iteration": 2.685670852661133 }, { "auxiliary_loss_clip": 0.0112298, "auxiliary_loss_mlp": 0.01046185, "balance_loss_clip": 1.04584122, "balance_loss_mlp": 1.02746964, "epoch": 0.42914687669091567, "flos": 15632561975040.0, "grad_norm": 2.616783816408893, "language_loss": 0.65962416, "learning_rate": 2.5490445944447976e-06, "loss": 0.68131584, "num_input_tokens_seen": 76952265, "step": 3569, "time_per_iteration": 2.717386245727539 }, { "auxiliary_loss_clip": 0.01142668, "auxiliary_loss_mlp": 0.01045513, "balance_loss_clip": 1.05189264, "balance_loss_mlp": 1.02782261, "epoch": 0.4292671195815547, "flos": 31467802440960.0, "grad_norm": 2.409302297647957, "language_loss": 0.65260458, "learning_rate": 2.548295508690406e-06, "loss": 0.67448634, "num_input_tokens_seen": 76973560, "step": 3570, "time_per_iteration": 3.7431375980377197 }, { "auxiliary_loss_clip": 0.01143839, "auxiliary_loss_mlp": 0.01048856, "balance_loss_clip": 1.04864836, "balance_loss_mlp": 1.0308907, "epoch": 0.42938736247219383, "flos": 30257756046720.0, "grad_norm": 1.8028978956183825, "language_loss": 0.76696229, "learning_rate": 2.5475463397600217e-06, "loss": 0.78888929, "num_input_tokens_seen": 76993640, "step": 3571, "time_per_iteration": 2.689208507537842 }, { "auxiliary_loss_clip": 0.01161066, "auxiliary_loss_mlp": 0.01057463, "balance_loss_clip": 1.05435336, "balance_loss_mlp": 1.0365535, "epoch": 0.42950760536283294, "flos": 29349683291520.0, "grad_norm": 4.9241122805501, "language_loss": 0.77271008, "learning_rate": 2.546797087767293e-06, "loss": 0.79489535, "num_input_tokens_seen": 77013765, "step": 3572, "time_per_iteration": 2.7278025150299072 }, { "auxiliary_loss_clip": 0.01098234, "auxiliary_loss_mlp": 0.01041287, "balance_loss_clip": 1.04528642, "balance_loss_mlp": 1.02376306, "epoch": 0.429627848253472, "flos": 26869943969280.0, "grad_norm": 2.21069919345904, "language_loss": 0.86835092, "learning_rate": 2.546047752825881e-06, "loss": 0.88974619, "num_input_tokens_seen": 77034370, "step": 3573, "time_per_iteration": 3.748542308807373 }, { "auxiliary_loss_clip": 0.01109332, "auxiliary_loss_mlp": 0.01051384, "balance_loss_clip": 1.04786038, "balance_loss_mlp": 1.03295398, "epoch": 0.4297480911441111, "flos": 13881270470400.0, "grad_norm": 2.053901098659698, "language_loss": 0.92833704, "learning_rate": 2.5452983350494595e-06, "loss": 0.94994414, "num_input_tokens_seen": 77049925, "step": 3574, "time_per_iteration": 2.7201602458953857 }, { "auxiliary_loss_clip": 0.011435, "auxiliary_loss_mlp": 0.00774175, "balance_loss_clip": 1.05096674, "balance_loss_mlp": 1.00101686, "epoch": 0.4298683340347502, "flos": 20741141975040.0, "grad_norm": 2.2955876914868703, "language_loss": 0.6534152, "learning_rate": 2.544548834551713e-06, "loss": 0.67259198, "num_input_tokens_seen": 77068930, "step": 3575, "time_per_iteration": 2.6826393604278564 }, { "auxiliary_loss_clip": 0.01111047, "auxiliary_loss_mlp": 0.00774268, "balance_loss_clip": 1.04496646, "balance_loss_mlp": 1.00077367, "epoch": 0.4299885769253893, "flos": 20882126856960.0, "grad_norm": 2.6702899202122747, "language_loss": 0.94234902, "learning_rate": 2.5437992514463424e-06, "loss": 0.96120214, "num_input_tokens_seen": 77082255, "step": 3576, "time_per_iteration": 3.5505683422088623 }, { "auxiliary_loss_clip": 0.01139024, "auxiliary_loss_mlp": 0.0105453, "balance_loss_clip": 1.0486095, "balance_loss_mlp": 1.03558755, "epoch": 0.4301088198160284, "flos": 25484618183040.0, "grad_norm": 1.7088976519425936, "language_loss": 0.88092917, "learning_rate": 2.5430495858470565e-06, "loss": 0.90286469, "num_input_tokens_seen": 77101725, "step": 3577, "time_per_iteration": 2.747847318649292 }, { "auxiliary_loss_clip": 0.01140769, "auxiliary_loss_mlp": 0.01043495, "balance_loss_clip": 1.05179143, "balance_loss_mlp": 1.02535188, "epoch": 0.43022906270666744, "flos": 18259427404800.0, "grad_norm": 3.355285238951717, "language_loss": 0.77922511, "learning_rate": 2.54229983786758e-06, "loss": 0.80106771, "num_input_tokens_seen": 77119670, "step": 3578, "time_per_iteration": 2.658405065536499 }, { "auxiliary_loss_clip": 0.01129591, "auxiliary_loss_mlp": 0.01050059, "balance_loss_clip": 1.04661632, "balance_loss_mlp": 1.03073525, "epoch": 0.43034930559730655, "flos": 23399536567680.0, "grad_norm": 1.8840506213520918, "language_loss": 0.85102338, "learning_rate": 2.541550007621651e-06, "loss": 0.8728199, "num_input_tokens_seen": 77138160, "step": 3579, "time_per_iteration": 2.7511403560638428 }, { "auxiliary_loss_clip": 0.0113894, "auxiliary_loss_mlp": 0.01041369, "balance_loss_clip": 1.04977942, "balance_loss_mlp": 1.02267742, "epoch": 0.43046954848794566, "flos": 28184382264960.0, "grad_norm": 2.0147933864909486, "language_loss": 0.80011559, "learning_rate": 2.5408000952230156e-06, "loss": 0.82191867, "num_input_tokens_seen": 77156950, "step": 3580, "time_per_iteration": 3.6953704357147217 }, { "auxiliary_loss_clip": 0.01125842, "auxiliary_loss_mlp": 0.01045403, "balance_loss_clip": 1.04750419, "balance_loss_mlp": 1.02585244, "epoch": 0.4305897913785847, "flos": 28580476515840.0, "grad_norm": 5.192440120771872, "language_loss": 0.90909517, "learning_rate": 2.5400501007854357e-06, "loss": 0.93080759, "num_input_tokens_seen": 77176395, "step": 3581, "time_per_iteration": 2.7644617557525635 }, { "auxiliary_loss_clip": 0.01105549, "auxiliary_loss_mlp": 0.01047582, "balance_loss_clip": 1.04642558, "balance_loss_mlp": 1.02834177, "epoch": 0.43071003426922383, "flos": 20448721353600.0, "grad_norm": 1.9857664317521384, "language_loss": 0.75763786, "learning_rate": 2.539300024422685e-06, "loss": 0.77916914, "num_input_tokens_seen": 77194340, "step": 3582, "time_per_iteration": 2.8157291412353516 }, { "auxiliary_loss_clip": 0.01023759, "auxiliary_loss_mlp": 0.0100297, "balance_loss_clip": 1.01806259, "balance_loss_mlp": 1.00113428, "epoch": 0.43083027715986294, "flos": 51997969883520.0, "grad_norm": 0.8106868974914015, "language_loss": 0.60968918, "learning_rate": 2.538549866248549e-06, "loss": 0.62995642, "num_input_tokens_seen": 77249320, "step": 3583, "time_per_iteration": 3.143799066543579 }, { "auxiliary_loss_clip": 0.01147713, "auxiliary_loss_mlp": 0.01044429, "balance_loss_clip": 1.05199575, "balance_loss_mlp": 1.0267148, "epoch": 0.430950520050502, "flos": 16690885320960.0, "grad_norm": 1.9002634780240546, "language_loss": 0.81057674, "learning_rate": 2.5377996263768274e-06, "loss": 0.83249819, "num_input_tokens_seen": 77267400, "step": 3584, "time_per_iteration": 2.7226450443267822 }, { "auxiliary_loss_clip": 0.01138836, "auxiliary_loss_mlp": 0.010462, "balance_loss_clip": 1.0480516, "balance_loss_mlp": 1.02667356, "epoch": 0.4310707629411411, "flos": 24608433726720.0, "grad_norm": 1.9584380026364758, "language_loss": 0.68647277, "learning_rate": 2.5370493049213293e-06, "loss": 0.70832312, "num_input_tokens_seen": 77287045, "step": 3585, "time_per_iteration": 2.678267240524292 }, { "auxiliary_loss_clip": 0.01063351, "auxiliary_loss_mlp": 0.01050269, "balance_loss_clip": 1.0409708, "balance_loss_mlp": 1.03061175, "epoch": 0.4311910058317802, "flos": 26432983019520.0, "grad_norm": 1.955943384403509, "language_loss": 0.79662448, "learning_rate": 2.536298901995878e-06, "loss": 0.81776071, "num_input_tokens_seen": 77306255, "step": 3586, "time_per_iteration": 3.0027568340301514 }, { "auxiliary_loss_clip": 0.0112905, "auxiliary_loss_mlp": 0.01051569, "balance_loss_clip": 1.048419, "balance_loss_mlp": 1.03173304, "epoch": 0.43131124872241927, "flos": 25155891889920.0, "grad_norm": 2.278628995538037, "language_loss": 0.80474114, "learning_rate": 2.535548417714311e-06, "loss": 0.82654732, "num_input_tokens_seen": 77325555, "step": 3587, "time_per_iteration": 2.9683475494384766 }, { "auxiliary_loss_clip": 0.01142014, "auxiliary_loss_mlp": 0.01047505, "balance_loss_clip": 1.04784262, "balance_loss_mlp": 1.02860999, "epoch": 0.4314314916130584, "flos": 21614812479360.0, "grad_norm": 2.6666607398852484, "language_loss": 0.87085587, "learning_rate": 2.534797852190474e-06, "loss": 0.8927511, "num_input_tokens_seen": 77345735, "step": 3588, "time_per_iteration": 2.6321966648101807 }, { "auxiliary_loss_clip": 0.01137363, "auxiliary_loss_mlp": 0.01061383, "balance_loss_clip": 1.04674447, "balance_loss_mlp": 1.04245234, "epoch": 0.4315517345036975, "flos": 19275016544640.0, "grad_norm": 1.9414643572460901, "language_loss": 0.81407994, "learning_rate": 2.5340472055382283e-06, "loss": 0.83606744, "num_input_tokens_seen": 77361765, "step": 3589, "time_per_iteration": 2.7024426460266113 }, { "auxiliary_loss_clip": 0.01115222, "auxiliary_loss_mlp": 0.01055727, "balance_loss_clip": 1.0458498, "balance_loss_mlp": 1.03509235, "epoch": 0.43167197739433655, "flos": 24273853516800.0, "grad_norm": 2.0497091037435493, "language_loss": 0.80899382, "learning_rate": 2.5332964778714468e-06, "loss": 0.83070326, "num_input_tokens_seen": 77378950, "step": 3590, "time_per_iteration": 2.7234177589416504 }, { "auxiliary_loss_clip": 0.01117448, "auxiliary_loss_mlp": 0.01053957, "balance_loss_clip": 1.04695785, "balance_loss_mlp": 1.03619444, "epoch": 0.43179222028497566, "flos": 16867816738560.0, "grad_norm": 11.889216925551727, "language_loss": 0.6645214, "learning_rate": 2.5325456693040123e-06, "loss": 0.68623543, "num_input_tokens_seen": 77396145, "step": 3591, "time_per_iteration": 2.7278270721435547 }, { "auxiliary_loss_clip": 0.01145747, "auxiliary_loss_mlp": 0.01053616, "balance_loss_clip": 1.04929638, "balance_loss_mlp": 1.03372049, "epoch": 0.43191246317561477, "flos": 17639214243840.0, "grad_norm": 2.135643280816635, "language_loss": 0.74571621, "learning_rate": 2.531794779949824e-06, "loss": 0.76770985, "num_input_tokens_seen": 77414045, "step": 3592, "time_per_iteration": 2.600290298461914 }, { "auxiliary_loss_clip": 0.01107878, "auxiliary_loss_mlp": 0.01057717, "balance_loss_clip": 1.04591513, "balance_loss_mlp": 1.03831017, "epoch": 0.4320327060662538, "flos": 23878800760320.0, "grad_norm": 2.1327699142077727, "language_loss": 0.88149893, "learning_rate": 2.5310438099227903e-06, "loss": 0.90315491, "num_input_tokens_seen": 77431310, "step": 3593, "time_per_iteration": 2.7618215084075928 }, { "auxiliary_loss_clip": 0.01038541, "auxiliary_loss_mlp": 0.01001784, "balance_loss_clip": 1.01432705, "balance_loss_mlp": 0.99981737, "epoch": 0.43215294895689293, "flos": 66394917959040.0, "grad_norm": 0.7980540263543427, "language_loss": 0.5331831, "learning_rate": 2.530292759336833e-06, "loss": 0.55358636, "num_input_tokens_seen": 77492045, "step": 3594, "time_per_iteration": 3.2798092365264893 }, { "auxiliary_loss_clip": 0.01123784, "auxiliary_loss_mlp": 0.01046141, "balance_loss_clip": 1.04745078, "balance_loss_mlp": 1.02607846, "epoch": 0.432273191847532, "flos": 20594267262720.0, "grad_norm": 4.126834893992486, "language_loss": 0.69703805, "learning_rate": 2.5295416283058855e-06, "loss": 0.71873736, "num_input_tokens_seen": 77510910, "step": 3595, "time_per_iteration": 2.6675896644592285 }, { "auxiliary_loss_clip": 0.01122067, "auxiliary_loss_mlp": 0.00775675, "balance_loss_clip": 1.04641366, "balance_loss_mlp": 1.00069213, "epoch": 0.4323934347381711, "flos": 19282127437440.0, "grad_norm": 1.7932157020325952, "language_loss": 0.65978825, "learning_rate": 2.5287904169438943e-06, "loss": 0.67876565, "num_input_tokens_seen": 77530115, "step": 3596, "time_per_iteration": 3.7543418407440186 }, { "auxiliary_loss_clip": 0.0109063, "auxiliary_loss_mlp": 0.01048027, "balance_loss_clip": 1.04529762, "balance_loss_mlp": 1.02863133, "epoch": 0.4325136776288102, "flos": 21726315273600.0, "grad_norm": 3.219168413598293, "language_loss": 0.64056242, "learning_rate": 2.528039125364817e-06, "loss": 0.66194892, "num_input_tokens_seen": 77548920, "step": 3597, "time_per_iteration": 2.7682607173919678 }, { "auxiliary_loss_clip": 0.01120844, "auxiliary_loss_mlp": 0.01043585, "balance_loss_clip": 1.04749465, "balance_loss_mlp": 1.02334321, "epoch": 0.43263392051944927, "flos": 22340746344960.0, "grad_norm": 2.177118897985349, "language_loss": 0.76136076, "learning_rate": 2.5272877536826246e-06, "loss": 0.78300506, "num_input_tokens_seen": 77567715, "step": 3598, "time_per_iteration": 2.7714407444000244 }, { "auxiliary_loss_clip": 0.01105857, "auxiliary_loss_mlp": 0.01052318, "balance_loss_clip": 1.04182971, "balance_loss_mlp": 1.03281546, "epoch": 0.4327541634100884, "flos": 29168406328320.0, "grad_norm": 2.670379864981005, "language_loss": 0.70452863, "learning_rate": 2.5265363020112986e-06, "loss": 0.72611046, "num_input_tokens_seen": 77588035, "step": 3599, "time_per_iteration": 3.9365761280059814 }, { "auxiliary_loss_clip": 0.01138153, "auxiliary_loss_mlp": 0.01040019, "balance_loss_clip": 1.05062318, "balance_loss_mlp": 1.02194643, "epoch": 0.4328744063007275, "flos": 26067448264320.0, "grad_norm": 1.910896045412882, "language_loss": 0.84039497, "learning_rate": 2.5257847704648344e-06, "loss": 0.86217672, "num_input_tokens_seen": 77609265, "step": 3600, "time_per_iteration": 2.6546292304992676 }, { "auxiliary_loss_clip": 0.01150893, "auxiliary_loss_mlp": 0.01041372, "balance_loss_clip": 1.04934669, "balance_loss_mlp": 1.02313268, "epoch": 0.43299464919136654, "flos": 16581357774720.0, "grad_norm": 2.4794025620047493, "language_loss": 0.75285006, "learning_rate": 2.525033159157239e-06, "loss": 0.7747727, "num_input_tokens_seen": 77625580, "step": 3601, "time_per_iteration": 2.664090156555176 }, { "auxiliary_loss_clip": 0.0114186, "auxiliary_loss_mlp": 0.01052874, "balance_loss_clip": 1.0499568, "balance_loss_mlp": 1.02967572, "epoch": 0.43311489208200565, "flos": 16107265140480.0, "grad_norm": 1.805202208101061, "language_loss": 0.77128154, "learning_rate": 2.52428146820253e-06, "loss": 0.79322886, "num_input_tokens_seen": 77643835, "step": 3602, "time_per_iteration": 3.5901761054992676 }, { "auxiliary_loss_clip": 0.01123258, "auxiliary_loss_mlp": 0.01043243, "balance_loss_clip": 1.04803562, "balance_loss_mlp": 1.02352548, "epoch": 0.43323513497264476, "flos": 22930220442240.0, "grad_norm": 1.6919877720122698, "language_loss": 0.81968379, "learning_rate": 2.52352969771474e-06, "loss": 0.84134883, "num_input_tokens_seen": 77663060, "step": 3603, "time_per_iteration": 2.7534170150756836 }, { "auxiliary_loss_clip": 0.01129916, "auxiliary_loss_mlp": 0.01040715, "balance_loss_clip": 1.04866707, "balance_loss_mlp": 1.02196383, "epoch": 0.4333553778632838, "flos": 25299031587840.0, "grad_norm": 2.0350836330294038, "language_loss": 0.88484573, "learning_rate": 2.5227778478079106e-06, "loss": 0.90655208, "num_input_tokens_seen": 77682470, "step": 3604, "time_per_iteration": 2.7547926902770996 }, { "auxiliary_loss_clip": 0.01131721, "auxiliary_loss_mlp": 0.01041874, "balance_loss_clip": 1.04638064, "balance_loss_mlp": 1.02511334, "epoch": 0.43347562075392293, "flos": 19387165783680.0, "grad_norm": 1.6673278154387905, "language_loss": 0.76726472, "learning_rate": 2.522025918596098e-06, "loss": 0.78900075, "num_input_tokens_seen": 77700770, "step": 3605, "time_per_iteration": 2.624504566192627 }, { "auxiliary_loss_clip": 0.01138313, "auxiliary_loss_mlp": 0.01040722, "balance_loss_clip": 1.04998839, "balance_loss_mlp": 1.02384162, "epoch": 0.43359586364456204, "flos": 26325969425280.0, "grad_norm": 1.3918885966768157, "language_loss": 0.65376163, "learning_rate": 2.521273910193368e-06, "loss": 0.67555189, "num_input_tokens_seen": 77723950, "step": 3606, "time_per_iteration": 3.686649799346924 }, { "auxiliary_loss_clip": 0.01148523, "auxiliary_loss_mlp": 0.01049697, "balance_loss_clip": 1.05184019, "balance_loss_mlp": 1.03118372, "epoch": 0.4337161065352011, "flos": 15989261984640.0, "grad_norm": 2.513934356703082, "language_loss": 0.87699229, "learning_rate": 2.5205218227138006e-06, "loss": 0.89897454, "num_input_tokens_seen": 77736905, "step": 3607, "time_per_iteration": 2.599158525466919 }, { "auxiliary_loss_clip": 0.01155243, "auxiliary_loss_mlp": 0.01044772, "balance_loss_clip": 1.04970384, "balance_loss_mlp": 1.02553129, "epoch": 0.4338363494258402, "flos": 20224710184320.0, "grad_norm": 2.505175678620874, "language_loss": 0.79312855, "learning_rate": 2.519769656271486e-06, "loss": 0.81512868, "num_input_tokens_seen": 77754325, "step": 3608, "time_per_iteration": 2.6003310680389404 }, { "auxiliary_loss_clip": 0.01093365, "auxiliary_loss_mlp": 0.01040779, "balance_loss_clip": 1.04276681, "balance_loss_mlp": 1.02197993, "epoch": 0.43395659231647926, "flos": 20083904870400.0, "grad_norm": 6.089771860365896, "language_loss": 0.67578745, "learning_rate": 2.5190174109805285e-06, "loss": 0.69712889, "num_input_tokens_seen": 77774150, "step": 3609, "time_per_iteration": 2.7216100692749023 }, { "auxiliary_loss_clip": 0.01115196, "auxiliary_loss_mlp": 0.01050799, "balance_loss_clip": 1.04463851, "balance_loss_mlp": 1.03134418, "epoch": 0.43407683520711837, "flos": 19901801894400.0, "grad_norm": 2.0734052978207758, "language_loss": 0.64194471, "learning_rate": 2.518265086955042e-06, "loss": 0.66360462, "num_input_tokens_seen": 77791870, "step": 3610, "time_per_iteration": 2.7155940532684326 }, { "auxiliary_loss_clip": 0.01154682, "auxiliary_loss_mlp": 0.01053927, "balance_loss_clip": 1.04974234, "balance_loss_mlp": 1.03500879, "epoch": 0.4341970780977575, "flos": 23108732058240.0, "grad_norm": 2.404206639148766, "language_loss": 0.83809245, "learning_rate": 2.5175126843091534e-06, "loss": 0.86017853, "num_input_tokens_seen": 77811240, "step": 3611, "time_per_iteration": 2.6668548583984375 }, { "auxiliary_loss_clip": 0.01128306, "auxiliary_loss_mlp": 0.01040993, "balance_loss_clip": 1.0466609, "balance_loss_mlp": 1.02234912, "epoch": 0.43431732098839654, "flos": 37408288406400.0, "grad_norm": 2.0487925469074786, "language_loss": 0.75737888, "learning_rate": 2.5167602031570034e-06, "loss": 0.77907187, "num_input_tokens_seen": 77831425, "step": 3612, "time_per_iteration": 2.824673891067505 }, { "auxiliary_loss_clip": 0.01154642, "auxiliary_loss_mlp": 0.01043641, "balance_loss_clip": 1.05150151, "balance_loss_mlp": 1.02556896, "epoch": 0.43443756387903565, "flos": 31868206323840.0, "grad_norm": 1.6496594397430453, "language_loss": 0.73227656, "learning_rate": 2.51600764361274e-06, "loss": 0.75425935, "num_input_tokens_seen": 77852950, "step": 3613, "time_per_iteration": 2.7193799018859863 }, { "auxiliary_loss_clip": 0.01151744, "auxiliary_loss_mlp": 0.01040797, "balance_loss_clip": 1.04976106, "balance_loss_mlp": 1.02286768, "epoch": 0.43455780676967476, "flos": 23477139901440.0, "grad_norm": 8.36789865939487, "language_loss": 0.78693259, "learning_rate": 2.5152550057905283e-06, "loss": 0.80885804, "num_input_tokens_seen": 77872840, "step": 3614, "time_per_iteration": 2.6164486408233643 }, { "auxiliary_loss_clip": 0.01142635, "auxiliary_loss_mlp": 0.00774627, "balance_loss_clip": 1.0501591, "balance_loss_mlp": 1.00069284, "epoch": 0.4346780496603138, "flos": 24207060176640.0, "grad_norm": 2.2201902258706414, "language_loss": 0.76940393, "learning_rate": 2.5145022898045415e-06, "loss": 0.7885766, "num_input_tokens_seen": 77892025, "step": 3615, "time_per_iteration": 2.712763547897339 }, { "auxiliary_loss_clip": 0.01133052, "auxiliary_loss_mlp": 0.01051088, "balance_loss_clip": 1.0472908, "balance_loss_mlp": 1.03150201, "epoch": 0.4347982925509529, "flos": 17092366611840.0, "grad_norm": 2.2838881176496573, "language_loss": 0.9014228, "learning_rate": 2.5137494957689664e-06, "loss": 0.92326427, "num_input_tokens_seen": 77907635, "step": 3616, "time_per_iteration": 2.655564546585083 }, { "auxiliary_loss_clip": 0.010294, "auxiliary_loss_mlp": 0.0100087, "balance_loss_clip": 1.01465714, "balance_loss_mlp": 0.99910623, "epoch": 0.43491853544159204, "flos": 60945544696320.0, "grad_norm": 0.7651340179867504, "language_loss": 0.57310963, "learning_rate": 2.5129966237980016e-06, "loss": 0.59341228, "num_input_tokens_seen": 77970630, "step": 3617, "time_per_iteration": 3.287935972213745 }, { "auxiliary_loss_clip": 0.01113936, "auxiliary_loss_mlp": 0.01042034, "balance_loss_clip": 1.04419982, "balance_loss_mlp": 1.02304411, "epoch": 0.4350387783322311, "flos": 21944652094080.0, "grad_norm": 1.8058939681288038, "language_loss": 0.7784481, "learning_rate": 2.512243674005857e-06, "loss": 0.80000782, "num_input_tokens_seen": 77989995, "step": 3618, "time_per_iteration": 2.7000880241394043 }, { "auxiliary_loss_clip": 0.01088362, "auxiliary_loss_mlp": 0.01052181, "balance_loss_clip": 1.04184341, "balance_loss_mlp": 1.03176081, "epoch": 0.4351590212228702, "flos": 25082705928960.0, "grad_norm": 2.034775970385249, "language_loss": 0.86265904, "learning_rate": 2.5114906465067537e-06, "loss": 0.88406444, "num_input_tokens_seen": 78010980, "step": 3619, "time_per_iteration": 2.7824172973632812 }, { "auxiliary_loss_clip": 0.01137149, "auxiliary_loss_mlp": 0.01046675, "balance_loss_clip": 1.04446352, "balance_loss_mlp": 1.02860272, "epoch": 0.4352792641135093, "flos": 21506541909120.0, "grad_norm": 2.6423152720622403, "language_loss": 0.7501049, "learning_rate": 2.5107375414149264e-06, "loss": 0.77194321, "num_input_tokens_seen": 78030225, "step": 3620, "time_per_iteration": 2.6756057739257812 }, { "auxiliary_loss_clip": 0.01094845, "auxiliary_loss_mlp": 0.01051395, "balance_loss_clip": 1.04043067, "balance_loss_mlp": 1.03315639, "epoch": 0.43539950700414837, "flos": 16253457494400.0, "grad_norm": 7.687513475938151, "language_loss": 0.71686947, "learning_rate": 2.5099843588446197e-06, "loss": 0.73833179, "num_input_tokens_seen": 78048545, "step": 3621, "time_per_iteration": 2.6959924697875977 }, { "auxiliary_loss_clip": 0.01107437, "auxiliary_loss_mlp": 0.01045977, "balance_loss_clip": 1.04555321, "balance_loss_mlp": 1.02680826, "epoch": 0.4355197498947875, "flos": 16691819074560.0, "grad_norm": 1.7920345688118178, "language_loss": 0.61391509, "learning_rate": 2.509231098910091e-06, "loss": 0.63544923, "num_input_tokens_seen": 78068415, "step": 3622, "time_per_iteration": 3.994269609451294 }, { "auxiliary_loss_clip": 0.01123407, "auxiliary_loss_mlp": 0.01051939, "balance_loss_clip": 1.04958022, "balance_loss_mlp": 1.03151822, "epoch": 0.4356399927854266, "flos": 16362733645440.0, "grad_norm": 2.3054244899145724, "language_loss": 0.7503345, "learning_rate": 2.508477761725611e-06, "loss": 0.77208799, "num_input_tokens_seen": 78086690, "step": 3623, "time_per_iteration": 2.5418431758880615 }, { "auxiliary_loss_clip": 0.01147747, "auxiliary_loss_mlp": 0.0105045, "balance_loss_clip": 1.05278945, "balance_loss_mlp": 1.03212714, "epoch": 0.43576023567606564, "flos": 17202037812480.0, "grad_norm": 2.5770718959920247, "language_loss": 0.80963051, "learning_rate": 2.507724347405458e-06, "loss": 0.83161247, "num_input_tokens_seen": 78104640, "step": 3624, "time_per_iteration": 2.59970760345459 }, { "auxiliary_loss_clip": 0.01097726, "auxiliary_loss_mlp": 0.01047934, "balance_loss_clip": 1.04223919, "balance_loss_mlp": 1.02982628, "epoch": 0.43588047856670475, "flos": 15917656222080.0, "grad_norm": 2.4114390379903146, "language_loss": 0.82113338, "learning_rate": 2.5069708560639243e-06, "loss": 0.84258997, "num_input_tokens_seen": 78122550, "step": 3625, "time_per_iteration": 3.661245107650757 }, { "auxiliary_loss_clip": 0.0111363, "auxiliary_loss_mlp": 0.01049033, "balance_loss_clip": 1.04448271, "balance_loss_mlp": 1.0302459, "epoch": 0.4360007214573438, "flos": 23659566099840.0, "grad_norm": 2.1000973492975255, "language_loss": 0.61414373, "learning_rate": 2.5062172878153158e-06, "loss": 0.63577044, "num_input_tokens_seen": 78141825, "step": 3626, "time_per_iteration": 2.6878411769866943 }, { "auxiliary_loss_clip": 0.01097152, "auxiliary_loss_mlp": 0.01045618, "balance_loss_clip": 1.04478097, "balance_loss_mlp": 1.02756977, "epoch": 0.4361209643479829, "flos": 21978767036160.0, "grad_norm": 2.0428548611385327, "language_loss": 0.87009132, "learning_rate": 2.505463642773947e-06, "loss": 0.89151901, "num_input_tokens_seen": 78161790, "step": 3627, "time_per_iteration": 3.7550854682922363 }, { "auxiliary_loss_clip": 0.01116115, "auxiliary_loss_mlp": 0.00774662, "balance_loss_clip": 1.04438746, "balance_loss_mlp": 1.00072515, "epoch": 0.43624120723862203, "flos": 17420159151360.0, "grad_norm": 2.460178190406278, "language_loss": 0.75569236, "learning_rate": 2.504709921054146e-06, "loss": 0.77460009, "num_input_tokens_seen": 78178605, "step": 3628, "time_per_iteration": 2.7105610370635986 }, { "auxiliary_loss_clip": 0.01106256, "auxiliary_loss_mlp": 0.0106196, "balance_loss_clip": 1.04106855, "balance_loss_mlp": 1.04194427, "epoch": 0.4363614501292611, "flos": 17895293280000.0, "grad_norm": 2.3501972535830733, "language_loss": 0.84085053, "learning_rate": 2.50395612277025e-06, "loss": 0.86253273, "num_input_tokens_seen": 78194460, "step": 3629, "time_per_iteration": 2.7724413871765137 }, { "auxiliary_loss_clip": 0.01132275, "auxiliary_loss_mlp": 0.01041162, "balance_loss_clip": 1.04883766, "balance_loss_mlp": 1.02211249, "epoch": 0.4364816930199002, "flos": 20302888135680.0, "grad_norm": 2.047569439666378, "language_loss": 0.72892606, "learning_rate": 2.503202248036612e-06, "loss": 0.75066042, "num_input_tokens_seen": 78213315, "step": 3630, "time_per_iteration": 2.72956919670105 }, { "auxiliary_loss_clip": 0.01152442, "auxiliary_loss_mlp": 0.01045743, "balance_loss_clip": 1.04936552, "balance_loss_mlp": 1.02762294, "epoch": 0.4366019359105393, "flos": 24061334699520.0, "grad_norm": 2.1408074445735847, "language_loss": 0.73851264, "learning_rate": 2.5024482969675927e-06, "loss": 0.76049447, "num_input_tokens_seen": 78233270, "step": 3631, "time_per_iteration": 2.618089437484741 }, { "auxiliary_loss_clip": 0.01105049, "auxiliary_loss_mlp": 0.01041896, "balance_loss_clip": 1.04518008, "balance_loss_mlp": 1.02420568, "epoch": 0.43672217880117836, "flos": 21754109422080.0, "grad_norm": 2.313897543216981, "language_loss": 0.84548628, "learning_rate": 2.501694269677566e-06, "loss": 0.86695576, "num_input_tokens_seen": 78251040, "step": 3632, "time_per_iteration": 3.613952159881592 }, { "auxiliary_loss_clip": 0.01146085, "auxiliary_loss_mlp": 0.01040066, "balance_loss_clip": 1.05050302, "balance_loss_mlp": 1.02199376, "epoch": 0.4368424216918175, "flos": 18035200753920.0, "grad_norm": 1.8817610677516603, "language_loss": 0.80208969, "learning_rate": 2.500940166280918e-06, "loss": 0.82395118, "num_input_tokens_seen": 78269470, "step": 3633, "time_per_iteration": 2.633976936340332 }, { "auxiliary_loss_clip": 0.01133669, "auxiliary_loss_mlp": 0.01043616, "balance_loss_clip": 1.04428911, "balance_loss_mlp": 1.02536488, "epoch": 0.4369626645824566, "flos": 25447127362560.0, "grad_norm": 1.8112538955335782, "language_loss": 0.79833907, "learning_rate": 2.500185986892045e-06, "loss": 0.82011199, "num_input_tokens_seen": 78288955, "step": 3634, "time_per_iteration": 2.730858564376831 }, { "auxiliary_loss_clip": 0.01136981, "auxiliary_loss_mlp": 0.0105597, "balance_loss_clip": 1.04688787, "balance_loss_mlp": 1.03544259, "epoch": 0.43708290747309564, "flos": 25302694775040.0, "grad_norm": 2.1652258151772465, "language_loss": 0.77697027, "learning_rate": 2.499431731625355e-06, "loss": 0.79889977, "num_input_tokens_seen": 78307980, "step": 3635, "time_per_iteration": 2.6676807403564453 }, { "auxiliary_loss_clip": 0.01156577, "auxiliary_loss_mlp": 0.01045258, "balance_loss_clip": 1.05040932, "balance_loss_mlp": 1.0269953, "epoch": 0.43720315036373475, "flos": 31575103344000.0, "grad_norm": 3.036930345772651, "language_loss": 0.79528552, "learning_rate": 2.4986774005952686e-06, "loss": 0.81730378, "num_input_tokens_seen": 78330355, "step": 3636, "time_per_iteration": 2.744180202484131 }, { "auxiliary_loss_clip": 0.01136417, "auxiliary_loss_mlp": 0.01036668, "balance_loss_clip": 1.04995418, "balance_loss_mlp": 1.01895308, "epoch": 0.43732339325437386, "flos": 23112000195840.0, "grad_norm": 2.5140026745050386, "language_loss": 0.847175, "learning_rate": 2.4979229939162166e-06, "loss": 0.86890578, "num_input_tokens_seen": 78349135, "step": 3637, "time_per_iteration": 2.687974691390991 }, { "auxiliary_loss_clip": 0.01137572, "auxiliary_loss_mlp": 0.01044836, "balance_loss_clip": 1.0487721, "balance_loss_mlp": 1.02611995, "epoch": 0.4374436361450129, "flos": 27746272080000.0, "grad_norm": 1.6363739159418944, "language_loss": 0.80687898, "learning_rate": 2.4971685117026433e-06, "loss": 0.82870305, "num_input_tokens_seen": 78368900, "step": 3638, "time_per_iteration": 2.6939055919647217 }, { "auxiliary_loss_clip": 0.01143819, "auxiliary_loss_mlp": 0.01048135, "balance_loss_clip": 1.05068326, "balance_loss_mlp": 1.02933526, "epoch": 0.437563879035652, "flos": 24172370616960.0, "grad_norm": 1.4414778281399556, "language_loss": 0.76669896, "learning_rate": 2.4964139540690018e-06, "loss": 0.78861851, "num_input_tokens_seen": 78392235, "step": 3639, "time_per_iteration": 2.7299060821533203 }, { "auxiliary_loss_clip": 0.01117281, "auxiliary_loss_mlp": 0.01047738, "balance_loss_clip": 1.04769301, "balance_loss_mlp": 1.02794909, "epoch": 0.4376841219262911, "flos": 23477211728640.0, "grad_norm": 1.981620823073047, "language_loss": 0.7319411, "learning_rate": 2.495659321129758e-06, "loss": 0.7535913, "num_input_tokens_seen": 78409980, "step": 3640, "time_per_iteration": 2.7600083351135254 }, { "auxiliary_loss_clip": 0.01138786, "auxiliary_loss_mlp": 0.01045981, "balance_loss_clip": 1.04802823, "balance_loss_mlp": 1.02887487, "epoch": 0.4378043648169302, "flos": 25447809720960.0, "grad_norm": 1.6951741026578475, "language_loss": 0.75517094, "learning_rate": 2.494904612999389e-06, "loss": 0.77701861, "num_input_tokens_seen": 78428690, "step": 3641, "time_per_iteration": 2.78122878074646 }, { "auxiliary_loss_clip": 0.0103933, "auxiliary_loss_mlp": 0.01014179, "balance_loss_clip": 1.01606989, "balance_loss_mlp": 1.01235485, "epoch": 0.4379246077075693, "flos": 53914056986880.0, "grad_norm": 0.7461742810198191, "language_loss": 0.56522852, "learning_rate": 2.4941498297923843e-06, "loss": 0.58576357, "num_input_tokens_seen": 78489260, "step": 3642, "time_per_iteration": 3.169175386428833 }, { "auxiliary_loss_clip": 0.01140186, "auxiliary_loss_mlp": 0.01047596, "balance_loss_clip": 1.05013251, "balance_loss_mlp": 1.02935719, "epoch": 0.43804485059820836, "flos": 20588305605120.0, "grad_norm": 1.7128845226189473, "language_loss": 0.69480777, "learning_rate": 2.4933949716232424e-06, "loss": 0.71668553, "num_input_tokens_seen": 78506785, "step": 3643, "time_per_iteration": 2.6482956409454346 }, { "auxiliary_loss_clip": 0.01111912, "auxiliary_loss_mlp": 0.01042478, "balance_loss_clip": 1.04787743, "balance_loss_mlp": 1.02494216, "epoch": 0.43816509348884747, "flos": 23876214981120.0, "grad_norm": 2.6483467793721798, "language_loss": 0.7377032, "learning_rate": 2.492640038606476e-06, "loss": 0.75924712, "num_input_tokens_seen": 78525150, "step": 3644, "time_per_iteration": 2.747770071029663 }, { "auxiliary_loss_clip": 0.01140039, "auxiliary_loss_mlp": 0.01040673, "balance_loss_clip": 1.04720581, "balance_loss_mlp": 1.02065802, "epoch": 0.4382853363794866, "flos": 14684448533760.0, "grad_norm": 2.0361269514624123, "language_loss": 0.78252172, "learning_rate": 2.491885030856608e-06, "loss": 0.80432886, "num_input_tokens_seen": 78543245, "step": 3645, "time_per_iteration": 2.7647411823272705 }, { "auxiliary_loss_clip": 0.01125601, "auxiliary_loss_mlp": 0.01045828, "balance_loss_clip": 1.04716253, "balance_loss_mlp": 1.02745771, "epoch": 0.43840557927012563, "flos": 17165301177600.0, "grad_norm": 6.324062654868039, "language_loss": 0.83052933, "learning_rate": 2.4911299484881713e-06, "loss": 0.85224366, "num_input_tokens_seen": 78560775, "step": 3646, "time_per_iteration": 2.628659963607788 }, { "auxiliary_loss_clip": 0.01119379, "auxiliary_loss_mlp": 0.0104823, "balance_loss_clip": 1.04501879, "balance_loss_mlp": 1.0306704, "epoch": 0.43852582216076474, "flos": 19390685316480.0, "grad_norm": 1.749045438685747, "language_loss": 0.81410885, "learning_rate": 2.490374791615712e-06, "loss": 0.83578497, "num_input_tokens_seen": 78580800, "step": 3647, "time_per_iteration": 2.740647792816162 }, { "auxiliary_loss_clip": 0.01158773, "auxiliary_loss_mlp": 0.00774583, "balance_loss_clip": 1.05213344, "balance_loss_mlp": 1.00086296, "epoch": 0.43864606505140386, "flos": 18075133699200.0, "grad_norm": 3.467367944698642, "language_loss": 0.77903676, "learning_rate": 2.4896195603537867e-06, "loss": 0.7983703, "num_input_tokens_seen": 78595410, "step": 3648, "time_per_iteration": 3.6560885906219482 }, { "auxiliary_loss_clip": 0.01098745, "auxiliary_loss_mlp": 0.0104584, "balance_loss_clip": 1.0464139, "balance_loss_mlp": 1.0285666, "epoch": 0.4387663079420429, "flos": 19644896845440.0, "grad_norm": 2.38596639697199, "language_loss": 0.74293685, "learning_rate": 2.488864254816964e-06, "loss": 0.76438272, "num_input_tokens_seen": 78614100, "step": 3649, "time_per_iteration": 2.733114004135132 }, { "auxiliary_loss_clip": 0.01144014, "auxiliary_loss_mlp": 0.010425, "balance_loss_clip": 1.04962814, "balance_loss_mlp": 1.02377212, "epoch": 0.438886550832682, "flos": 19719339782400.0, "grad_norm": 7.488613777507414, "language_loss": 0.68580317, "learning_rate": 2.4881088751198218e-06, "loss": 0.7076683, "num_input_tokens_seen": 78632260, "step": 3650, "time_per_iteration": 2.7172601222991943 }, { "auxiliary_loss_clip": 0.01130751, "auxiliary_loss_mlp": 0.01049071, "balance_loss_clip": 1.04663825, "balance_loss_mlp": 1.03027189, "epoch": 0.43900679372332113, "flos": 14536675981440.0, "grad_norm": 3.3928904262798536, "language_loss": 0.63121974, "learning_rate": 2.4873534213769517e-06, "loss": 0.653018, "num_input_tokens_seen": 78647490, "step": 3651, "time_per_iteration": 3.651576280593872 }, { "auxiliary_loss_clip": 0.01110223, "auxiliary_loss_mlp": 0.01041994, "balance_loss_clip": 1.04683328, "balance_loss_mlp": 1.02517366, "epoch": 0.4391270366139602, "flos": 24056234968320.0, "grad_norm": 1.6253776103767976, "language_loss": 0.71716368, "learning_rate": 2.4865978937029547e-06, "loss": 0.73868585, "num_input_tokens_seen": 78666470, "step": 3652, "time_per_iteration": 2.763521194458008 }, { "auxiliary_loss_clip": 0.01097709, "auxiliary_loss_mlp": 0.01046336, "balance_loss_clip": 1.04651952, "balance_loss_mlp": 1.02771521, "epoch": 0.4392472795045993, "flos": 31538510363520.0, "grad_norm": 2.3708370281243427, "language_loss": 0.66577321, "learning_rate": 2.485842292212445e-06, "loss": 0.68721366, "num_input_tokens_seen": 78687685, "step": 3653, "time_per_iteration": 3.830923080444336 }, { "auxiliary_loss_clip": 0.01159636, "auxiliary_loss_mlp": 0.01047216, "balance_loss_clip": 1.05381048, "balance_loss_mlp": 1.02852392, "epoch": 0.4393675223952384, "flos": 14866300114560.0, "grad_norm": 2.2304462197030173, "language_loss": 0.8065418, "learning_rate": 2.485086617020045e-06, "loss": 0.82861036, "num_input_tokens_seen": 78706180, "step": 3654, "time_per_iteration": 2.656306028366089 }, { "auxiliary_loss_clip": 0.01124582, "auxiliary_loss_mlp": 0.01046875, "balance_loss_clip": 1.04640317, "balance_loss_mlp": 1.02591777, "epoch": 0.43948776528587746, "flos": 14825900292480.0, "grad_norm": 2.3284650350047897, "language_loss": 0.81933188, "learning_rate": 2.4843308682403903e-06, "loss": 0.84104645, "num_input_tokens_seen": 78723095, "step": 3655, "time_per_iteration": 2.7646772861480713 }, { "auxiliary_loss_clip": 0.01153998, "auxiliary_loss_mlp": 0.01042706, "balance_loss_clip": 1.05077386, "balance_loss_mlp": 1.02575409, "epoch": 0.4396080081765166, "flos": 13914523486080.0, "grad_norm": 2.5878539121108055, "language_loss": 0.82904357, "learning_rate": 2.4835750459881294e-06, "loss": 0.85101062, "num_input_tokens_seen": 78739720, "step": 3656, "time_per_iteration": 2.720611095428467 }, { "auxiliary_loss_clip": 0.01120938, "auxiliary_loss_mlp": 0.01054748, "balance_loss_clip": 1.04452205, "balance_loss_mlp": 1.03532839, "epoch": 0.43972825106715563, "flos": 18222978078720.0, "grad_norm": 4.652423450316824, "language_loss": 0.82003045, "learning_rate": 2.4828191503779177e-06, "loss": 0.84178728, "num_input_tokens_seen": 78757820, "step": 3657, "time_per_iteration": 2.7061214447021484 }, { "auxiliary_loss_clip": 0.01118555, "auxiliary_loss_mlp": 0.01041858, "balance_loss_clip": 1.04850101, "balance_loss_mlp": 1.02377367, "epoch": 0.43984849395779474, "flos": 16873239692160.0, "grad_norm": 2.115043081683726, "language_loss": 0.89947313, "learning_rate": 2.482063181524425e-06, "loss": 0.92107725, "num_input_tokens_seen": 78773720, "step": 3658, "time_per_iteration": 3.7620885372161865 }, { "auxiliary_loss_clip": 0.01158689, "auxiliary_loss_mlp": 0.01051091, "balance_loss_clip": 1.05260575, "balance_loss_mlp": 1.03258944, "epoch": 0.43996873684843385, "flos": 18691504104960.0, "grad_norm": 2.2265583726282263, "language_loss": 0.81262159, "learning_rate": 2.4813071395423307e-06, "loss": 0.83471936, "num_input_tokens_seen": 78791285, "step": 3659, "time_per_iteration": 2.614751100540161 }, { "auxiliary_loss_clip": 0.01145849, "auxiliary_loss_mlp": 0.0104473, "balance_loss_clip": 1.05157173, "balance_loss_mlp": 1.02622914, "epoch": 0.4400889797390729, "flos": 23653460787840.0, "grad_norm": 2.2323179122813768, "language_loss": 0.64465874, "learning_rate": 2.4805510245463263e-06, "loss": 0.66656452, "num_input_tokens_seen": 78811440, "step": 3660, "time_per_iteration": 2.701805830001831 }, { "auxiliary_loss_clip": 0.01140194, "auxiliary_loss_mlp": 0.01042509, "balance_loss_clip": 1.04924822, "balance_loss_mlp": 1.02488947, "epoch": 0.440209222629712, "flos": 23149203707520.0, "grad_norm": 2.0817805352000067, "language_loss": 0.60499763, "learning_rate": 2.4797948366511137e-06, "loss": 0.62682462, "num_input_tokens_seen": 78831150, "step": 3661, "time_per_iteration": 2.6320977210998535 }, { "auxiliary_loss_clip": 0.01114558, "auxiliary_loss_mlp": 0.01045786, "balance_loss_clip": 1.04539895, "balance_loss_mlp": 1.02690339, "epoch": 0.4403294655203511, "flos": 24823394668800.0, "grad_norm": 2.0166479466197362, "language_loss": 0.75734639, "learning_rate": 2.4790385759714055e-06, "loss": 0.77894986, "num_input_tokens_seen": 78850215, "step": 3662, "time_per_iteration": 2.84318470954895 }, { "auxiliary_loss_clip": 0.0113732, "auxiliary_loss_mlp": 0.01041441, "balance_loss_clip": 1.05116868, "balance_loss_mlp": 1.02376258, "epoch": 0.4404497084109902, "flos": 22565080736640.0, "grad_norm": 1.8464436621556768, "language_loss": 0.71035779, "learning_rate": 2.478282242621926e-06, "loss": 0.73214537, "num_input_tokens_seen": 78870675, "step": 3663, "time_per_iteration": 2.7510268688201904 }, { "auxiliary_loss_clip": 0.01027242, "auxiliary_loss_mlp": 0.01007536, "balance_loss_clip": 1.02257466, "balance_loss_mlp": 1.0055455, "epoch": 0.4405699513016293, "flos": 64967073448320.0, "grad_norm": 0.8485366260157293, "language_loss": 0.59492147, "learning_rate": 2.477525836717411e-06, "loss": 0.6152693, "num_input_tokens_seen": 78938440, "step": 3664, "time_per_iteration": 3.536439895629883 }, { "auxiliary_loss_clip": 0.01144406, "auxiliary_loss_mlp": 0.01046759, "balance_loss_clip": 1.04981863, "balance_loss_mlp": 1.02806699, "epoch": 0.4406901941922684, "flos": 35661952978560.0, "grad_norm": 2.2264227291559884, "language_loss": 0.79682577, "learning_rate": 2.476769358372606e-06, "loss": 0.81873739, "num_input_tokens_seen": 78960090, "step": 3665, "time_per_iteration": 2.8628878593444824 }, { "auxiliary_loss_clip": 0.01111566, "auxiliary_loss_mlp": 0.01050111, "balance_loss_clip": 1.04783869, "balance_loss_mlp": 1.03237319, "epoch": 0.44081043708290746, "flos": 18040767361920.0, "grad_norm": 2.2862930439434708, "language_loss": 0.74701107, "learning_rate": 2.4760128077022683e-06, "loss": 0.76862776, "num_input_tokens_seen": 78978225, "step": 3666, "time_per_iteration": 2.7050974369049072 }, { "auxiliary_loss_clip": 0.01091223, "auxiliary_loss_mlp": 0.01051612, "balance_loss_clip": 1.04379582, "balance_loss_mlp": 1.03308678, "epoch": 0.44093067997354657, "flos": 30153507799680.0, "grad_norm": 2.910413725869674, "language_loss": 0.68498927, "learning_rate": 2.4752561848211672e-06, "loss": 0.70641762, "num_input_tokens_seen": 79000625, "step": 3667, "time_per_iteration": 2.8800323009490967 }, { "auxiliary_loss_clip": 0.01137723, "auxiliary_loss_mlp": 0.01044107, "balance_loss_clip": 1.05139899, "balance_loss_mlp": 1.02750158, "epoch": 0.4410509228641857, "flos": 23255068066560.0, "grad_norm": 1.8774475353345492, "language_loss": 0.71425653, "learning_rate": 2.4744994898440797e-06, "loss": 0.73607481, "num_input_tokens_seen": 79019415, "step": 3668, "time_per_iteration": 2.7396538257598877 }, { "auxiliary_loss_clip": 0.01118545, "auxiliary_loss_mlp": 0.01042976, "balance_loss_clip": 1.04396558, "balance_loss_mlp": 1.02529764, "epoch": 0.44117116575482473, "flos": 19500571998720.0, "grad_norm": 4.394120473977941, "language_loss": 0.83716452, "learning_rate": 2.473742722885797e-06, "loss": 0.85877973, "num_input_tokens_seen": 79038435, "step": 3669, "time_per_iteration": 2.8040640354156494 }, { "auxiliary_loss_clip": 0.01140854, "auxiliary_loss_mlp": 0.00774863, "balance_loss_clip": 1.05225277, "balance_loss_mlp": 1.00086975, "epoch": 0.44129140864546385, "flos": 27053124353280.0, "grad_norm": 2.6128073294649012, "language_loss": 0.65188825, "learning_rate": 2.4729858840611197e-06, "loss": 0.67104542, "num_input_tokens_seen": 79057345, "step": 3670, "time_per_iteration": 2.7819161415100098 }, { "auxiliary_loss_clip": 0.01150414, "auxiliary_loss_mlp": 0.01046297, "balance_loss_clip": 1.04955506, "balance_loss_mlp": 1.02845168, "epoch": 0.4414116515361029, "flos": 26102101910400.0, "grad_norm": 1.9776394532326544, "language_loss": 0.7286436, "learning_rate": 2.4722289734848605e-06, "loss": 0.75061071, "num_input_tokens_seen": 79077810, "step": 3671, "time_per_iteration": 2.6627769470214844 }, { "auxiliary_loss_clip": 0.01113719, "auxiliary_loss_mlp": 0.01046227, "balance_loss_clip": 1.05210888, "balance_loss_mlp": 1.02850008, "epoch": 0.441531894426742, "flos": 21906083865600.0, "grad_norm": 2.095984274798656, "language_loss": 0.77780253, "learning_rate": 2.471471991271841e-06, "loss": 0.799402, "num_input_tokens_seen": 79094935, "step": 3672, "time_per_iteration": 2.7560322284698486 }, { "auxiliary_loss_clip": 0.01132107, "auxiliary_loss_mlp": 0.01048623, "balance_loss_clip": 1.04655766, "balance_loss_mlp": 1.030074, "epoch": 0.4416521373173811, "flos": 23437099215360.0, "grad_norm": 2.8418674641994413, "language_loss": 0.79517102, "learning_rate": 2.470714937536896e-06, "loss": 0.81697834, "num_input_tokens_seen": 79113660, "step": 3673, "time_per_iteration": 2.653043508529663 }, { "auxiliary_loss_clip": 0.01098562, "auxiliary_loss_mlp": 0.01045392, "balance_loss_clip": 1.0451628, "balance_loss_mlp": 1.02739191, "epoch": 0.4417723802080202, "flos": 20334345471360.0, "grad_norm": 2.259462989483368, "language_loss": 0.70634806, "learning_rate": 2.469957812394868e-06, "loss": 0.72778761, "num_input_tokens_seen": 79132470, "step": 3674, "time_per_iteration": 3.739635705947876 }, { "auxiliary_loss_clip": 0.01154221, "auxiliary_loss_mlp": 0.01043531, "balance_loss_clip": 1.05239654, "balance_loss_mlp": 1.02617431, "epoch": 0.4418926230986593, "flos": 18880682060160.0, "grad_norm": 1.865490061266973, "language_loss": 0.7629801, "learning_rate": 2.4692006159606148e-06, "loss": 0.78495759, "num_input_tokens_seen": 79150000, "step": 3675, "time_per_iteration": 2.6170763969421387 }, { "auxiliary_loss_clip": 0.0115379, "auxiliary_loss_mlp": 0.01052719, "balance_loss_clip": 1.05080152, "balance_loss_mlp": 1.03343105, "epoch": 0.4420128659892984, "flos": 19464409981440.0, "grad_norm": 2.3977874085602084, "language_loss": 0.78389072, "learning_rate": 2.468443348349e-06, "loss": 0.80595583, "num_input_tokens_seen": 79167875, "step": 3676, "time_per_iteration": 3.5793404579162598 }, { "auxiliary_loss_clip": 0.0109968, "auxiliary_loss_mlp": 0.01042987, "balance_loss_clip": 1.04338741, "balance_loss_mlp": 1.02405643, "epoch": 0.44213310887993745, "flos": 17894359526400.0, "grad_norm": 3.198653465110518, "language_loss": 0.81980908, "learning_rate": 2.467686009674902e-06, "loss": 0.84123576, "num_input_tokens_seen": 79182325, "step": 3677, "time_per_iteration": 2.8050382137298584 }, { "auxiliary_loss_clip": 0.01138159, "auxiliary_loss_mlp": 0.01062658, "balance_loss_clip": 1.04882741, "balance_loss_mlp": 1.0425477, "epoch": 0.44225335177057656, "flos": 19204667758080.0, "grad_norm": 2.0160090388290133, "language_loss": 0.85524017, "learning_rate": 2.466928600053209e-06, "loss": 0.87724829, "num_input_tokens_seen": 79197630, "step": 3678, "time_per_iteration": 2.6892282962799072 }, { "auxiliary_loss_clip": 0.01122051, "auxiliary_loss_mlp": 0.01048095, "balance_loss_clip": 1.04548252, "balance_loss_mlp": 1.02996325, "epoch": 0.4423735946612157, "flos": 23471321898240.0, "grad_norm": 2.1950718088826773, "language_loss": 0.71741813, "learning_rate": 2.466171119598818e-06, "loss": 0.73911953, "num_input_tokens_seen": 79217600, "step": 3679, "time_per_iteration": 4.342228412628174 }, { "auxiliary_loss_clip": 0.01148092, "auxiliary_loss_mlp": 0.01053816, "balance_loss_clip": 1.04837155, "balance_loss_mlp": 1.03362191, "epoch": 0.44249383755185473, "flos": 26685398868480.0, "grad_norm": 1.8103731390218947, "language_loss": 0.77728069, "learning_rate": 2.465413568426639e-06, "loss": 0.79929978, "num_input_tokens_seen": 79238550, "step": 3680, "time_per_iteration": 2.829146146774292 }, { "auxiliary_loss_clip": 0.0113134, "auxiliary_loss_mlp": 0.01049149, "balance_loss_clip": 1.04541194, "balance_loss_mlp": 1.03147006, "epoch": 0.44261408044249384, "flos": 23147659422720.0, "grad_norm": 1.5924639502544549, "language_loss": 0.8180337, "learning_rate": 2.464655946651591e-06, "loss": 0.83983862, "num_input_tokens_seen": 79257555, "step": 3681, "time_per_iteration": 2.749382257461548 }, { "auxiliary_loss_clip": 0.01142945, "auxiliary_loss_mlp": 0.01042445, "balance_loss_clip": 1.04942083, "balance_loss_mlp": 1.02433717, "epoch": 0.44273432333313295, "flos": 24462564595200.0, "grad_norm": 2.0782912360734977, "language_loss": 0.81028485, "learning_rate": 2.4638982543886065e-06, "loss": 0.83213878, "num_input_tokens_seen": 79277595, "step": 3682, "time_per_iteration": 2.8146560192108154 }, { "auxiliary_loss_clip": 0.01145931, "auxiliary_loss_mlp": 0.01048564, "balance_loss_clip": 1.05079818, "balance_loss_mlp": 1.03061104, "epoch": 0.442854566223772, "flos": 17528932512000.0, "grad_norm": 2.2209503623556017, "language_loss": 0.86820877, "learning_rate": 2.4631404917526254e-06, "loss": 0.89015377, "num_input_tokens_seen": 79294550, "step": 3683, "time_per_iteration": 2.702960968017578 }, { "auxiliary_loss_clip": 0.01130252, "auxiliary_loss_mlp": 0.01059579, "balance_loss_clip": 1.04528284, "balance_loss_mlp": 1.04104161, "epoch": 0.4429748091144111, "flos": 24896293320960.0, "grad_norm": 1.927604956901547, "language_loss": 0.79226249, "learning_rate": 2.4623826588586e-06, "loss": 0.81416076, "num_input_tokens_seen": 79314820, "step": 3684, "time_per_iteration": 3.6846864223480225 }, { "auxiliary_loss_clip": 0.01124567, "auxiliary_loss_mlp": 0.0104783, "balance_loss_clip": 1.04698825, "balance_loss_mlp": 1.0292933, "epoch": 0.4430950520050502, "flos": 21614704738560.0, "grad_norm": 2.042213265581169, "language_loss": 0.82569575, "learning_rate": 2.461624755821492e-06, "loss": 0.84741974, "num_input_tokens_seen": 79334300, "step": 3685, "time_per_iteration": 2.8420920372009277 }, { "auxiliary_loss_clip": 0.011185, "auxiliary_loss_mlp": 0.01049067, "balance_loss_clip": 1.04767036, "balance_loss_mlp": 1.03118527, "epoch": 0.4432152948956893, "flos": 24572271709440.0, "grad_norm": 1.8208776945887546, "language_loss": 0.76541674, "learning_rate": 2.4608667827562763e-06, "loss": 0.78709239, "num_input_tokens_seen": 79353630, "step": 3686, "time_per_iteration": 2.849745273590088 }, { "auxiliary_loss_clip": 0.01145764, "auxiliary_loss_mlp": 0.01049803, "balance_loss_clip": 1.05041265, "balance_loss_mlp": 1.03069425, "epoch": 0.4433355377863284, "flos": 21762261809280.0, "grad_norm": 2.1160727441574774, "language_loss": 0.90433037, "learning_rate": 2.460108739777936e-06, "loss": 0.92628598, "num_input_tokens_seen": 79372765, "step": 3687, "time_per_iteration": 2.8278188705444336 }, { "auxiliary_loss_clip": 0.01124078, "auxiliary_loss_mlp": 0.01051941, "balance_loss_clip": 1.0486939, "balance_loss_mlp": 1.03514409, "epoch": 0.44345578067696745, "flos": 20084479488000.0, "grad_norm": 3.8152429896688824, "language_loss": 0.76346231, "learning_rate": 2.4593506270014656e-06, "loss": 0.78522253, "num_input_tokens_seen": 79391735, "step": 3688, "time_per_iteration": 2.7907536029815674 }, { "auxiliary_loss_clip": 0.01125998, "auxiliary_loss_mlp": 0.01046323, "balance_loss_clip": 1.04282355, "balance_loss_mlp": 1.02796435, "epoch": 0.44357602356760656, "flos": 24169497528960.0, "grad_norm": 1.6381188204577883, "language_loss": 0.81948519, "learning_rate": 2.45859244454187e-06, "loss": 0.84120846, "num_input_tokens_seen": 79411525, "step": 3689, "time_per_iteration": 2.8379201889038086 }, { "auxiliary_loss_clip": 0.01136192, "auxiliary_loss_mlp": 0.01036824, "balance_loss_clip": 1.04651809, "balance_loss_mlp": 1.02031314, "epoch": 0.44369626645824567, "flos": 22707717644160.0, "grad_norm": 1.6253801663380432, "language_loss": 0.66118437, "learning_rate": 2.4578341925141655e-06, "loss": 0.6829145, "num_input_tokens_seen": 79430740, "step": 3690, "time_per_iteration": 2.7180285453796387 }, { "auxiliary_loss_clip": 0.01146127, "auxiliary_loss_mlp": 0.01046908, "balance_loss_clip": 1.04878414, "balance_loss_mlp": 1.02757192, "epoch": 0.4438165093488847, "flos": 38030225420160.0, "grad_norm": 3.6309030594637717, "language_loss": 0.72401369, "learning_rate": 2.457075871033378e-06, "loss": 0.74594402, "num_input_tokens_seen": 79452615, "step": 3691, "time_per_iteration": 2.898557186126709 }, { "auxiliary_loss_clip": 0.01111845, "auxiliary_loss_mlp": 0.0104213, "balance_loss_clip": 1.04594707, "balance_loss_mlp": 1.02260303, "epoch": 0.44393675223952384, "flos": 15523213996800.0, "grad_norm": 2.167264361577877, "language_loss": 0.88507116, "learning_rate": 2.4563174802145445e-06, "loss": 0.90661091, "num_input_tokens_seen": 79469865, "step": 3692, "time_per_iteration": 2.9588961601257324 }, { "auxiliary_loss_clip": 0.01033872, "auxiliary_loss_mlp": 0.01027308, "balance_loss_clip": 1.01882148, "balance_loss_mlp": 1.02504313, "epoch": 0.44405699513016295, "flos": 64574893779840.0, "grad_norm": 0.6467173028287011, "language_loss": 0.48599917, "learning_rate": 2.455559020172712e-06, "loss": 0.50661099, "num_input_tokens_seen": 79537220, "step": 3693, "time_per_iteration": 3.377173900604248 }, { "auxiliary_loss_clip": 0.01104721, "auxiliary_loss_mlp": 0.01044798, "balance_loss_clip": 1.04798043, "balance_loss_mlp": 1.02419865, "epoch": 0.444177238020802, "flos": 23987394552960.0, "grad_norm": 1.8655393396585098, "language_loss": 0.89929593, "learning_rate": 2.4548004910229385e-06, "loss": 0.92079109, "num_input_tokens_seen": 79554795, "step": 3694, "time_per_iteration": 2.9622912406921387 }, { "auxiliary_loss_clip": 0.01143684, "auxiliary_loss_mlp": 0.00773889, "balance_loss_clip": 1.05042624, "balance_loss_mlp": 1.00069392, "epoch": 0.4442974809114411, "flos": 22563069575040.0, "grad_norm": 1.8572281865316427, "language_loss": 0.87158716, "learning_rate": 2.4540418928802913e-06, "loss": 0.89076287, "num_input_tokens_seen": 79573530, "step": 3695, "time_per_iteration": 2.7580409049987793 }, { "auxiliary_loss_clip": 0.01127341, "auxiliary_loss_mlp": 0.01055401, "balance_loss_clip": 1.04769826, "balance_loss_mlp": 1.03463495, "epoch": 0.4444177238020802, "flos": 17675699483520.0, "grad_norm": 2.21569853741666, "language_loss": 0.65794891, "learning_rate": 2.4532832258598506e-06, "loss": 0.67977631, "num_input_tokens_seen": 79591360, "step": 3696, "time_per_iteration": 2.7892839908599854 }, { "auxiliary_loss_clip": 0.01152252, "auxiliary_loss_mlp": 0.01038169, "balance_loss_clip": 1.05127907, "balance_loss_mlp": 1.01921415, "epoch": 0.4445379666927193, "flos": 28621594609920.0, "grad_norm": 1.7747703331471338, "language_loss": 0.80771911, "learning_rate": 2.4525244900767047e-06, "loss": 0.82962334, "num_input_tokens_seen": 79612175, "step": 3697, "time_per_iteration": 2.837811231613159 }, { "auxiliary_loss_clip": 0.0103464, "auxiliary_loss_mlp": 0.01000988, "balance_loss_clip": 1.01562166, "balance_loss_mlp": 0.99909228, "epoch": 0.4446582095833584, "flos": 70487370115200.0, "grad_norm": 0.7645857887663036, "language_loss": 0.60499108, "learning_rate": 2.4517656856459536e-06, "loss": 0.62534738, "num_input_tokens_seen": 79678020, "step": 3698, "time_per_iteration": 3.333629608154297 }, { "auxiliary_loss_clip": 0.01140338, "auxiliary_loss_mlp": 0.0104332, "balance_loss_clip": 1.0479852, "balance_loss_mlp": 1.02412796, "epoch": 0.4447784524739975, "flos": 26505199313280.0, "grad_norm": 1.7886147322306922, "language_loss": 0.67998785, "learning_rate": 2.4510068126827073e-06, "loss": 0.70182443, "num_input_tokens_seen": 79699020, "step": 3699, "time_per_iteration": 2.7666819095611572 }, { "auxiliary_loss_clip": 0.01125852, "auxiliary_loss_mlp": 0.01046689, "balance_loss_clip": 1.04642856, "balance_loss_mlp": 1.0287956, "epoch": 0.44489869536463655, "flos": 11656209553920.0, "grad_norm": 2.139383106998692, "language_loss": 0.81726462, "learning_rate": 2.450247871302086e-06, "loss": 0.83898997, "num_input_tokens_seen": 79716795, "step": 3700, "time_per_iteration": 3.6949281692504883 }, { "auxiliary_loss_clip": 0.01142644, "auxiliary_loss_mlp": 0.01049813, "balance_loss_clip": 1.04789257, "balance_loss_mlp": 1.03313589, "epoch": 0.44501893825527566, "flos": 20448469958400.0, "grad_norm": 2.5372629672679583, "language_loss": 0.83336502, "learning_rate": 2.44948886161922e-06, "loss": 0.85528958, "num_input_tokens_seen": 79735810, "step": 3701, "time_per_iteration": 2.616241455078125 }, { "auxiliary_loss_clip": 0.01142563, "auxiliary_loss_mlp": 0.01048527, "balance_loss_clip": 1.04861212, "balance_loss_mlp": 1.03138471, "epoch": 0.4451391811459148, "flos": 18261079430400.0, "grad_norm": 1.6194861121658135, "language_loss": 0.84947908, "learning_rate": 2.4487297837492524e-06, "loss": 0.87138993, "num_input_tokens_seen": 79754975, "step": 3702, "time_per_iteration": 3.6510791778564453 }, { "auxiliary_loss_clip": 0.01118993, "auxiliary_loss_mlp": 0.01045225, "balance_loss_clip": 1.04919553, "balance_loss_mlp": 1.02649724, "epoch": 0.44525942403655383, "flos": 16910155895040.0, "grad_norm": 2.1217051806593927, "language_loss": 0.62363732, "learning_rate": 2.4479706378073323e-06, "loss": 0.64527947, "num_input_tokens_seen": 79773515, "step": 3703, "time_per_iteration": 2.7800674438476562 }, { "auxiliary_loss_clip": 0.01106258, "auxiliary_loss_mlp": 0.01047707, "balance_loss_clip": 1.04410887, "balance_loss_mlp": 1.02969468, "epoch": 0.44537966692719294, "flos": 23258838994560.0, "grad_norm": 1.553484686344492, "language_loss": 0.83838618, "learning_rate": 2.447211423908623e-06, "loss": 0.85992587, "num_input_tokens_seen": 79793560, "step": 3704, "time_per_iteration": 2.8273279666900635 }, { "auxiliary_loss_clip": 0.01142339, "auxiliary_loss_mlp": 0.01042232, "balance_loss_clip": 1.04773235, "balance_loss_mlp": 1.02338469, "epoch": 0.445499909817832, "flos": 21724160457600.0, "grad_norm": 2.0483882866839433, "language_loss": 0.74947596, "learning_rate": 2.4464521421682966e-06, "loss": 0.77132165, "num_input_tokens_seen": 79811150, "step": 3705, "time_per_iteration": 3.834582805633545 }, { "auxiliary_loss_clip": 0.01134375, "auxiliary_loss_mlp": 0.0103712, "balance_loss_clip": 1.04652071, "balance_loss_mlp": 1.01932192, "epoch": 0.4456201527084711, "flos": 23987969170560.0, "grad_norm": 1.4240055754330658, "language_loss": 0.87994921, "learning_rate": 2.4456927927015345e-06, "loss": 0.9016642, "num_input_tokens_seen": 79832190, "step": 3706, "time_per_iteration": 2.6913697719573975 }, { "auxiliary_loss_clip": 0.01130761, "auxiliary_loss_mlp": 0.01046444, "balance_loss_clip": 1.04819942, "balance_loss_mlp": 1.02586889, "epoch": 0.4457403955991102, "flos": 18807065136000.0, "grad_norm": 2.1467939698637366, "language_loss": 0.76157236, "learning_rate": 2.4449333756235307e-06, "loss": 0.78334439, "num_input_tokens_seen": 79848905, "step": 3707, "time_per_iteration": 2.7454912662506104 }, { "auxiliary_loss_clip": 0.01142604, "auxiliary_loss_mlp": 0.01047277, "balance_loss_clip": 1.04915452, "balance_loss_mlp": 1.02874041, "epoch": 0.4458606384897493, "flos": 19207756327680.0, "grad_norm": 2.7016230431217347, "language_loss": 0.78719378, "learning_rate": 2.4441738910494876e-06, "loss": 0.80909264, "num_input_tokens_seen": 79863640, "step": 3708, "time_per_iteration": 2.74965238571167 }, { "auxiliary_loss_clip": 0.0113622, "auxiliary_loss_mlp": 0.01042379, "balance_loss_clip": 1.04773045, "balance_loss_mlp": 1.02368665, "epoch": 0.4459808813803884, "flos": 21361283308800.0, "grad_norm": 2.1409799960401696, "language_loss": 0.82426453, "learning_rate": 2.4434143390946176e-06, "loss": 0.8460505, "num_input_tokens_seen": 79882450, "step": 3709, "time_per_iteration": 2.859842300415039 }, { "auxiliary_loss_clip": 0.0111367, "auxiliary_loss_mlp": 0.01043017, "balance_loss_clip": 1.0442903, "balance_loss_mlp": 1.02450418, "epoch": 0.4461011242710275, "flos": 23288967527040.0, "grad_norm": 1.9312691793970134, "language_loss": 0.85320973, "learning_rate": 2.4426547198741457e-06, "loss": 0.8747766, "num_input_tokens_seen": 79900655, "step": 3710, "time_per_iteration": 3.8318822383880615 }, { "auxiliary_loss_clip": 0.01105468, "auxiliary_loss_mlp": 0.01046085, "balance_loss_clip": 1.04829156, "balance_loss_mlp": 1.02732182, "epoch": 0.44622136716166655, "flos": 20193001453440.0, "grad_norm": 2.066637491766877, "language_loss": 0.74805254, "learning_rate": 2.441895033503305e-06, "loss": 0.76956803, "num_input_tokens_seen": 79918575, "step": 3711, "time_per_iteration": 2.900352716445923 }, { "auxiliary_loss_clip": 0.01141964, "auxiliary_loss_mlp": 0.01049637, "balance_loss_clip": 1.04837072, "balance_loss_mlp": 1.02887058, "epoch": 0.44634161005230566, "flos": 21283033530240.0, "grad_norm": 1.8231548183510837, "language_loss": 0.82286334, "learning_rate": 2.4411352800973375e-06, "loss": 0.84477931, "num_input_tokens_seen": 79937010, "step": 3712, "time_per_iteration": 2.6832833290100098 }, { "auxiliary_loss_clip": 0.01115952, "auxiliary_loss_mlp": 0.01050368, "balance_loss_clip": 1.04682064, "balance_loss_mlp": 1.02976894, "epoch": 0.44646185294294477, "flos": 22929358515840.0, "grad_norm": 2.9311826224382433, "language_loss": 0.75218499, "learning_rate": 2.4403754597715005e-06, "loss": 0.77384818, "num_input_tokens_seen": 79956455, "step": 3713, "time_per_iteration": 2.8329918384552 }, { "auxiliary_loss_clip": 0.01130846, "auxiliary_loss_mlp": 0.01051275, "balance_loss_clip": 1.04567504, "balance_loss_mlp": 1.03134298, "epoch": 0.4465820958335838, "flos": 22637692080000.0, "grad_norm": 2.3495947738165133, "language_loss": 0.93159443, "learning_rate": 2.4396155726410553e-06, "loss": 0.95341563, "num_input_tokens_seen": 79975065, "step": 3714, "time_per_iteration": 2.810482978820801 }, { "auxiliary_loss_clip": 0.01146334, "auxiliary_loss_mlp": 0.01042977, "balance_loss_clip": 1.04968619, "balance_loss_mlp": 1.02397537, "epoch": 0.44670233872422294, "flos": 22672525294080.0, "grad_norm": 3.4512058606543836, "language_loss": 0.90919179, "learning_rate": 2.438855618821278e-06, "loss": 0.93108493, "num_input_tokens_seen": 79990865, "step": 3715, "time_per_iteration": 2.7395715713500977 }, { "auxiliary_loss_clip": 0.01135031, "auxiliary_loss_mlp": 0.01047471, "balance_loss_clip": 1.04698873, "balance_loss_mlp": 1.02827859, "epoch": 0.44682258161486205, "flos": 23582178247680.0, "grad_norm": 2.0115969459608087, "language_loss": 0.67422545, "learning_rate": 2.4380955984274517e-06, "loss": 0.69605052, "num_input_tokens_seen": 80009520, "step": 3716, "time_per_iteration": 2.7788453102111816 }, { "auxiliary_loss_clip": 0.01139284, "auxiliary_loss_mlp": 0.01055778, "balance_loss_clip": 1.04584348, "balance_loss_mlp": 1.0366447, "epoch": 0.4469428245055011, "flos": 26501356558080.0, "grad_norm": 1.7844575533819098, "language_loss": 0.76795816, "learning_rate": 2.4373355115748716e-06, "loss": 0.78990883, "num_input_tokens_seen": 80030350, "step": 3717, "time_per_iteration": 2.7957239151000977 }, { "auxiliary_loss_clip": 0.01124978, "auxiliary_loss_mlp": 0.01063074, "balance_loss_clip": 1.04917979, "balance_loss_mlp": 1.04166365, "epoch": 0.4470630673961402, "flos": 21504925797120.0, "grad_norm": 1.6540802368852316, "language_loss": 0.71853018, "learning_rate": 2.436575358378842e-06, "loss": 0.74041069, "num_input_tokens_seen": 80049840, "step": 3718, "time_per_iteration": 2.8233389854431152 }, { "auxiliary_loss_clip": 0.01141353, "auxiliary_loss_mlp": 0.0104421, "balance_loss_clip": 1.05242872, "balance_loss_mlp": 1.02479124, "epoch": 0.44718331028677927, "flos": 16173986653440.0, "grad_norm": 3.0264864836852396, "language_loss": 0.82926619, "learning_rate": 2.4358151389546782e-06, "loss": 0.85112184, "num_input_tokens_seen": 80066525, "step": 3719, "time_per_iteration": 2.7884926795959473 }, { "auxiliary_loss_clip": 0.01155758, "auxiliary_loss_mlp": 0.01050198, "balance_loss_clip": 1.05168247, "balance_loss_mlp": 1.03261423, "epoch": 0.4473035531774184, "flos": 19681238430720.0, "grad_norm": 2.305159661462121, "language_loss": 0.75363851, "learning_rate": 2.4350548534177035e-06, "loss": 0.77569807, "num_input_tokens_seen": 80083355, "step": 3720, "time_per_iteration": 2.706843376159668 }, { "auxiliary_loss_clip": 0.01113681, "auxiliary_loss_mlp": 0.0104677, "balance_loss_clip": 1.04636502, "balance_loss_mlp": 1.02981877, "epoch": 0.4474237960680575, "flos": 41427590515200.0, "grad_norm": 1.8352857378787308, "language_loss": 0.66322148, "learning_rate": 2.434294501883254e-06, "loss": 0.6848259, "num_input_tokens_seen": 80106450, "step": 3721, "time_per_iteration": 2.9795188903808594 }, { "auxiliary_loss_clip": 0.01115322, "auxiliary_loss_mlp": 0.01056445, "balance_loss_clip": 1.04266572, "balance_loss_mlp": 1.03511882, "epoch": 0.44754403895869654, "flos": 22891328991360.0, "grad_norm": 2.036651504551829, "language_loss": 0.65506101, "learning_rate": 2.433534084466674e-06, "loss": 0.67677867, "num_input_tokens_seen": 80125670, "step": 3722, "time_per_iteration": 2.8000147342681885 }, { "auxiliary_loss_clip": 0.01149121, "auxiliary_loss_mlp": 0.01044876, "balance_loss_clip": 1.04972875, "balance_loss_mlp": 1.02800798, "epoch": 0.44766428184933565, "flos": 25630271832960.0, "grad_norm": 1.7370861328214644, "language_loss": 0.71031249, "learning_rate": 2.4327736012833178e-06, "loss": 0.73225248, "num_input_tokens_seen": 80147390, "step": 3723, "time_per_iteration": 2.756582736968994 }, { "auxiliary_loss_clip": 0.011402, "auxiliary_loss_mlp": 0.01048362, "balance_loss_clip": 1.04694152, "balance_loss_mlp": 1.02914548, "epoch": 0.44778452473997477, "flos": 20448972748800.0, "grad_norm": 2.204010016814741, "language_loss": 0.76605093, "learning_rate": 2.4320130524485506e-06, "loss": 0.78793657, "num_input_tokens_seen": 80166185, "step": 3724, "time_per_iteration": 2.6511027812957764 }, { "auxiliary_loss_clip": 0.01119116, "auxiliary_loss_mlp": 0.01047696, "balance_loss_clip": 1.0487268, "balance_loss_mlp": 1.03085172, "epoch": 0.4479047676306138, "flos": 21975462984960.0, "grad_norm": 1.4564115337848156, "language_loss": 0.79575181, "learning_rate": 2.431252438077746e-06, "loss": 0.81741983, "num_input_tokens_seen": 80185685, "step": 3725, "time_per_iteration": 2.750310182571411 }, { "auxiliary_loss_clip": 0.01140229, "auxiliary_loss_mlp": 0.00774551, "balance_loss_clip": 1.04559207, "balance_loss_mlp": 1.00066352, "epoch": 0.44802501052125293, "flos": 21467219495040.0, "grad_norm": 2.31120072917709, "language_loss": 0.77512813, "learning_rate": 2.4304917582862906e-06, "loss": 0.794276, "num_input_tokens_seen": 80204865, "step": 3726, "time_per_iteration": 3.6132519245147705 }, { "auxiliary_loss_clip": 0.0114973, "auxiliary_loss_mlp": 0.01044694, "balance_loss_clip": 1.0480032, "balance_loss_mlp": 1.02624059, "epoch": 0.44814525341189204, "flos": 22126970551680.0, "grad_norm": 2.0053066342795933, "language_loss": 0.87641007, "learning_rate": 2.4297310131895774e-06, "loss": 0.89835435, "num_input_tokens_seen": 80223410, "step": 3727, "time_per_iteration": 2.6027913093566895 }, { "auxiliary_loss_clip": 0.01141468, "auxiliary_loss_mlp": 0.01041467, "balance_loss_clip": 1.0477879, "balance_loss_mlp": 1.02350211, "epoch": 0.4482654963025311, "flos": 16653933204480.0, "grad_norm": 2.383860377027818, "language_loss": 0.74738157, "learning_rate": 2.4289702029030113e-06, "loss": 0.76921093, "num_input_tokens_seen": 80240880, "step": 3728, "time_per_iteration": 3.6441729068756104 }, { "auxiliary_loss_clip": 0.01143744, "auxiliary_loss_mlp": 0.0104448, "balance_loss_clip": 1.052742, "balance_loss_mlp": 1.0262413, "epoch": 0.4483857391931702, "flos": 18841251905280.0, "grad_norm": 1.960746958547125, "language_loss": 0.83090174, "learning_rate": 2.4282093275420057e-06, "loss": 0.85278404, "num_input_tokens_seen": 80259910, "step": 3729, "time_per_iteration": 2.665210008621216 }, { "auxiliary_loss_clip": 0.01141366, "auxiliary_loss_mlp": 0.01047303, "balance_loss_clip": 1.0488807, "balance_loss_mlp": 1.02939236, "epoch": 0.4485059820838093, "flos": 20372590477440.0, "grad_norm": 2.1845085311305104, "language_loss": 0.7107693, "learning_rate": 2.4274483872219863e-06, "loss": 0.732656, "num_input_tokens_seen": 80277270, "step": 3730, "time_per_iteration": 2.703883409500122 }, { "auxiliary_loss_clip": 0.01134559, "auxiliary_loss_mlp": 0.0104943, "balance_loss_clip": 1.04683471, "balance_loss_mlp": 1.03134561, "epoch": 0.4486262249744484, "flos": 20047742853120.0, "grad_norm": 1.828871424629702, "language_loss": 0.93740094, "learning_rate": 2.426687382058386e-06, "loss": 0.95924085, "num_input_tokens_seen": 80295550, "step": 3731, "time_per_iteration": 4.035430908203125 }, { "auxiliary_loss_clip": 0.01041451, "auxiliary_loss_mlp": 0.01010149, "balance_loss_clip": 1.02179933, "balance_loss_mlp": 1.00818193, "epoch": 0.4487464678650875, "flos": 64595684776320.0, "grad_norm": 0.861344480575863, "language_loss": 0.59769535, "learning_rate": 2.425926312166649e-06, "loss": 0.61821139, "num_input_tokens_seen": 80348425, "step": 3732, "time_per_iteration": 3.1168553829193115 }, { "auxiliary_loss_clip": 0.01134578, "auxiliary_loss_mlp": 0.01052022, "balance_loss_clip": 1.04977131, "balance_loss_mlp": 1.03176856, "epoch": 0.4488667107557266, "flos": 20769798049920.0, "grad_norm": 2.26006954159454, "language_loss": 0.73232621, "learning_rate": 2.42516517766223e-06, "loss": 0.75419223, "num_input_tokens_seen": 80366505, "step": 3733, "time_per_iteration": 2.730698347091675 }, { "auxiliary_loss_clip": 0.01151617, "auxiliary_loss_mlp": 0.01044637, "balance_loss_clip": 1.05097687, "balance_loss_mlp": 1.02646995, "epoch": 0.44898695364636565, "flos": 23951735326080.0, "grad_norm": 1.9021684377811758, "language_loss": 0.67764401, "learning_rate": 2.4244039786605907e-06, "loss": 0.6996066, "num_input_tokens_seen": 80387510, "step": 3734, "time_per_iteration": 2.65805721282959 }, { "auxiliary_loss_clip": 0.01105246, "auxiliary_loss_mlp": 0.01046028, "balance_loss_clip": 1.04532981, "balance_loss_mlp": 1.02685928, "epoch": 0.44910719653700476, "flos": 18624351628800.0, "grad_norm": 2.336433285655665, "language_loss": 0.82402956, "learning_rate": 2.4236427152772055e-06, "loss": 0.84554231, "num_input_tokens_seen": 80405915, "step": 3735, "time_per_iteration": 2.7303600311279297 }, { "auxiliary_loss_clip": 0.01011806, "auxiliary_loss_mlp": 0.01001451, "balance_loss_clip": 1.0153358, "balance_loss_mlp": 0.9994961, "epoch": 0.4492274394276438, "flos": 57033435749760.0, "grad_norm": 0.8307595046937862, "language_loss": 0.57374948, "learning_rate": 2.422881387627557e-06, "loss": 0.59388214, "num_input_tokens_seen": 80458365, "step": 3736, "time_per_iteration": 3.9461214542388916 }, { "auxiliary_loss_clip": 0.01130879, "auxiliary_loss_mlp": 0.01045829, "balance_loss_clip": 1.04785323, "balance_loss_mlp": 1.0291636, "epoch": 0.4493476823182829, "flos": 23254888498560.0, "grad_norm": 1.678961500188787, "language_loss": 0.77491587, "learning_rate": 2.422119995827139e-06, "loss": 0.79668295, "num_input_tokens_seen": 80478490, "step": 3737, "time_per_iteration": 2.826085090637207 }, { "auxiliary_loss_clip": 0.01139472, "auxiliary_loss_mlp": 0.01051348, "balance_loss_clip": 1.04627562, "balance_loss_mlp": 1.03238201, "epoch": 0.44946792520892204, "flos": 15815131827840.0, "grad_norm": 3.376808876320522, "language_loss": 0.73823047, "learning_rate": 2.4213585399914528e-06, "loss": 0.76013863, "num_input_tokens_seen": 80495695, "step": 3738, "time_per_iteration": 2.924665689468384 }, { "auxiliary_loss_clip": 0.01139313, "auxiliary_loss_mlp": 0.01046677, "balance_loss_clip": 1.04885483, "balance_loss_mlp": 1.02814031, "epoch": 0.4495881680995611, "flos": 19610063631360.0, "grad_norm": 1.8739152073776992, "language_loss": 0.85451138, "learning_rate": 2.4205970202360113e-06, "loss": 0.87637126, "num_input_tokens_seen": 80515260, "step": 3739, "time_per_iteration": 2.8577721118927 }, { "auxiliary_loss_clip": 0.01091789, "auxiliary_loss_mlp": 0.01074001, "balance_loss_clip": 1.04280949, "balance_loss_mlp": 1.05498719, "epoch": 0.4497084109902002, "flos": 26031465815040.0, "grad_norm": 1.9664194791791139, "language_loss": 0.78080016, "learning_rate": 2.4198354366763354e-06, "loss": 0.80245805, "num_input_tokens_seen": 80533900, "step": 3740, "time_per_iteration": 2.8547468185424805 }, { "auxiliary_loss_clip": 0.01131892, "auxiliary_loss_mlp": 0.010445, "balance_loss_clip": 1.04691541, "balance_loss_mlp": 1.02680933, "epoch": 0.4498286538808393, "flos": 14793688771200.0, "grad_norm": 2.058134351330292, "language_loss": 0.78410685, "learning_rate": 2.4190737894279587e-06, "loss": 0.80587077, "num_input_tokens_seen": 80551270, "step": 3741, "time_per_iteration": 2.834019184112549 }, { "auxiliary_loss_clip": 0.01098384, "auxiliary_loss_mlp": 0.01059339, "balance_loss_clip": 1.03821433, "balance_loss_mlp": 1.03983617, "epoch": 0.44994889677147837, "flos": 15450171690240.0, "grad_norm": 2.181417436685164, "language_loss": 0.80805767, "learning_rate": 2.4183120786064203e-06, "loss": 0.82963485, "num_input_tokens_seen": 80568145, "step": 3742, "time_per_iteration": 2.75846266746521 }, { "auxiliary_loss_clip": 0.01143475, "auxiliary_loss_mlp": 0.00774529, "balance_loss_clip": 1.04975212, "balance_loss_mlp": 1.00068808, "epoch": 0.4500691396621175, "flos": 21798316085760.0, "grad_norm": 2.187127075408877, "language_loss": 0.85290313, "learning_rate": 2.417550304327273e-06, "loss": 0.87208319, "num_input_tokens_seen": 80586185, "step": 3743, "time_per_iteration": 2.7917685508728027 }, { "auxiliary_loss_clip": 0.01159652, "auxiliary_loss_mlp": 0.01053469, "balance_loss_clip": 1.05174708, "balance_loss_mlp": 1.03298914, "epoch": 0.4501893825527566, "flos": 32382016421760.0, "grad_norm": 1.5565520017541181, "language_loss": 0.75955546, "learning_rate": 2.4167884667060763e-06, "loss": 0.78168666, "num_input_tokens_seen": 80608895, "step": 3744, "time_per_iteration": 3.00099778175354 }, { "auxiliary_loss_clip": 0.01127999, "auxiliary_loss_mlp": 0.01058509, "balance_loss_clip": 1.04611063, "balance_loss_mlp": 1.03774309, "epoch": 0.45030962544339564, "flos": 16544944362240.0, "grad_norm": 2.4942500896082946, "language_loss": 0.87613189, "learning_rate": 2.4160265658584e-06, "loss": 0.89799702, "num_input_tokens_seen": 80623785, "step": 3745, "time_per_iteration": 2.6800789833068848 }, { "auxiliary_loss_clip": 0.01139803, "auxiliary_loss_mlp": 0.01053687, "balance_loss_clip": 1.04541552, "balance_loss_mlp": 1.03241968, "epoch": 0.45042986833403476, "flos": 19573039687680.0, "grad_norm": 2.1998696963672777, "language_loss": 0.69141233, "learning_rate": 2.4152646018998253e-06, "loss": 0.71334732, "num_input_tokens_seen": 80642735, "step": 3746, "time_per_iteration": 2.824856758117676 }, { "auxiliary_loss_clip": 0.01140844, "auxiliary_loss_mlp": 0.01049592, "balance_loss_clip": 1.04987717, "balance_loss_mlp": 1.03134131, "epoch": 0.45055011122467387, "flos": 23112467072640.0, "grad_norm": 1.8110622368661475, "language_loss": 0.71797961, "learning_rate": 2.4145025749459403e-06, "loss": 0.7398839, "num_input_tokens_seen": 80663760, "step": 3747, "time_per_iteration": 2.7078123092651367 }, { "auxiliary_loss_clip": 0.01079609, "auxiliary_loss_mlp": 0.01049847, "balance_loss_clip": 1.04577255, "balance_loss_mlp": 1.0315125, "epoch": 0.4506703541153129, "flos": 19934623946880.0, "grad_norm": 2.7440993832741296, "language_loss": 0.70363277, "learning_rate": 2.413740485112344e-06, "loss": 0.72492737, "num_input_tokens_seen": 80682100, "step": 3748, "time_per_iteration": 3.022780179977417 }, { "auxiliary_loss_clip": 0.01118428, "auxiliary_loss_mlp": 0.01046497, "balance_loss_clip": 1.04608667, "balance_loss_mlp": 1.02916443, "epoch": 0.45079059700595203, "flos": 19499530504320.0, "grad_norm": 1.767376968992053, "language_loss": 0.82318974, "learning_rate": 2.412978332514646e-06, "loss": 0.84483898, "num_input_tokens_seen": 80700880, "step": 3749, "time_per_iteration": 3.003253698348999 }, { "auxiliary_loss_clip": 0.01131971, "auxiliary_loss_mlp": 0.0104718, "balance_loss_clip": 1.04856896, "balance_loss_mlp": 1.02823734, "epoch": 0.4509108398965911, "flos": 27636313570560.0, "grad_norm": 3.2762250894169838, "language_loss": 0.72853732, "learning_rate": 2.4122161172684623e-06, "loss": 0.7503289, "num_input_tokens_seen": 80721675, "step": 3750, "time_per_iteration": 2.8003716468811035 }, { "auxiliary_loss_clip": 0.01126896, "auxiliary_loss_mlp": 0.01048402, "balance_loss_clip": 1.04701459, "balance_loss_mlp": 1.02926862, "epoch": 0.4510310827872302, "flos": 20995712640000.0, "grad_norm": 3.3910941321223693, "language_loss": 0.84217405, "learning_rate": 2.4114538394894216e-06, "loss": 0.86392701, "num_input_tokens_seen": 80739315, "step": 3751, "time_per_iteration": 2.754674196243286 }, { "auxiliary_loss_clip": 0.01121002, "auxiliary_loss_mlp": 0.01044361, "balance_loss_clip": 1.04202962, "balance_loss_mlp": 1.02636015, "epoch": 0.4511513256778693, "flos": 16216684945920.0, "grad_norm": 2.0071137289952836, "language_loss": 0.83280277, "learning_rate": 2.410691499293161e-06, "loss": 0.85445642, "num_input_tokens_seen": 80757470, "step": 3752, "time_per_iteration": 3.658449172973633 }, { "auxiliary_loss_clip": 0.01139176, "auxiliary_loss_mlp": 0.01053561, "balance_loss_clip": 1.04818881, "balance_loss_mlp": 1.03603721, "epoch": 0.45127156856850836, "flos": 25186702780800.0, "grad_norm": 2.0399974141437083, "language_loss": 0.74064279, "learning_rate": 2.409929096795326e-06, "loss": 0.76257014, "num_input_tokens_seen": 80777840, "step": 3753, "time_per_iteration": 2.7473433017730713 }, { "auxiliary_loss_clip": 0.01138456, "auxiliary_loss_mlp": 0.01046302, "balance_loss_clip": 1.04694891, "balance_loss_mlp": 1.02640605, "epoch": 0.4513918114591475, "flos": 20412523422720.0, "grad_norm": 2.0762707132387885, "language_loss": 0.79137409, "learning_rate": 2.409166632111573e-06, "loss": 0.81322169, "num_input_tokens_seen": 80795975, "step": 3754, "time_per_iteration": 3.5809779167175293 }, { "auxiliary_loss_clip": 0.01148209, "auxiliary_loss_mlp": 0.01043992, "balance_loss_clip": 1.04820275, "balance_loss_mlp": 1.02506149, "epoch": 0.4515120543497866, "flos": 26648482665600.0, "grad_norm": 1.6963233496537897, "language_loss": 0.80403113, "learning_rate": 2.4084041053575674e-06, "loss": 0.82595313, "num_input_tokens_seen": 80815395, "step": 3755, "time_per_iteration": 2.7002313137054443 }, { "auxiliary_loss_clip": 0.01130933, "auxiliary_loss_mlp": 0.01042935, "balance_loss_clip": 1.04812479, "balance_loss_mlp": 1.02563787, "epoch": 0.45163229724042564, "flos": 20595093275520.0, "grad_norm": 1.8187067374542916, "language_loss": 0.72488546, "learning_rate": 2.4076415166489834e-06, "loss": 0.74662417, "num_input_tokens_seen": 80834805, "step": 3756, "time_per_iteration": 3.670382022857666 }, { "auxiliary_loss_clip": 0.01106205, "auxiliary_loss_mlp": 0.01039716, "balance_loss_clip": 1.04548967, "balance_loss_mlp": 1.022645, "epoch": 0.45175254013106475, "flos": 21689004021120.0, "grad_norm": 1.6113494271013664, "language_loss": 0.78938961, "learning_rate": 2.406878866101506e-06, "loss": 0.81084883, "num_input_tokens_seen": 80853770, "step": 3757, "time_per_iteration": 2.796445369720459 }, { "auxiliary_loss_clip": 0.01151018, "auxiliary_loss_mlp": 0.01044998, "balance_loss_clip": 1.0508827, "balance_loss_mlp": 1.02759314, "epoch": 0.45187278302170386, "flos": 18878850466560.0, "grad_norm": 3.0558860169624897, "language_loss": 0.78342968, "learning_rate": 2.4061161538308273e-06, "loss": 0.80538982, "num_input_tokens_seen": 80870615, "step": 3758, "time_per_iteration": 2.6269664764404297 }, { "auxiliary_loss_clip": 0.01139297, "auxiliary_loss_mlp": 0.01041869, "balance_loss_clip": 1.04879332, "balance_loss_mlp": 1.02337933, "epoch": 0.4519930259123429, "flos": 18582479349120.0, "grad_norm": 2.166697103340705, "language_loss": 0.89365625, "learning_rate": 2.4053533799526523e-06, "loss": 0.91546786, "num_input_tokens_seen": 80886335, "step": 3759, "time_per_iteration": 2.6546220779418945 }, { "auxiliary_loss_clip": 0.01120768, "auxiliary_loss_mlp": 0.01037798, "balance_loss_clip": 1.04661453, "balance_loss_mlp": 1.02067983, "epoch": 0.452113268802982, "flos": 25192377129600.0, "grad_norm": 1.789788708238404, "language_loss": 0.86050379, "learning_rate": 2.404590544582691e-06, "loss": 0.8820895, "num_input_tokens_seen": 80904570, "step": 3760, "time_per_iteration": 2.798549175262451 }, { "auxiliary_loss_clip": 0.01105476, "auxiliary_loss_mlp": 0.01049848, "balance_loss_clip": 1.04316258, "balance_loss_mlp": 1.03016615, "epoch": 0.45223351169362114, "flos": 39378922312320.0, "grad_norm": 3.9144226742977817, "language_loss": 0.81232405, "learning_rate": 2.403827647836666e-06, "loss": 0.83387733, "num_input_tokens_seen": 80925125, "step": 3761, "time_per_iteration": 2.91182541847229 }, { "auxiliary_loss_clip": 0.01151594, "auxiliary_loss_mlp": 0.01044653, "balance_loss_clip": 1.04791474, "balance_loss_mlp": 1.02691507, "epoch": 0.4523537545842602, "flos": 21582169994880.0, "grad_norm": 3.6570174139150926, "language_loss": 0.69476897, "learning_rate": 2.4030646898303075e-06, "loss": 0.71673143, "num_input_tokens_seen": 80946615, "step": 3762, "time_per_iteration": 3.5473055839538574 }, { "auxiliary_loss_clip": 0.01130773, "auxiliary_loss_mlp": 0.01046741, "balance_loss_clip": 1.0460211, "balance_loss_mlp": 1.02813292, "epoch": 0.4524739974748993, "flos": 28439527547520.0, "grad_norm": 1.9342808094895816, "language_loss": 0.82293379, "learning_rate": 2.4023016706793566e-06, "loss": 0.84470904, "num_input_tokens_seen": 80966410, "step": 3763, "time_per_iteration": 2.777729034423828 }, { "auxiliary_loss_clip": 0.01021997, "auxiliary_loss_mlp": 0.01001249, "balance_loss_clip": 1.01464295, "balance_loss_mlp": 0.99885315, "epoch": 0.4525942403655384, "flos": 61556492148480.0, "grad_norm": 0.7762977011085939, "language_loss": 0.56883717, "learning_rate": 2.401538590499561e-06, "loss": 0.58906966, "num_input_tokens_seen": 81026865, "step": 3764, "time_per_iteration": 3.3058886528015137 }, { "auxiliary_loss_clip": 0.01140345, "auxiliary_loss_mlp": 0.00773924, "balance_loss_clip": 1.04939008, "balance_loss_mlp": 1.00059104, "epoch": 0.45271448325617747, "flos": 27529838680320.0, "grad_norm": 3.0468628856459774, "language_loss": 0.71886694, "learning_rate": 2.400775449406682e-06, "loss": 0.73800969, "num_input_tokens_seen": 81050060, "step": 3765, "time_per_iteration": 2.7692558765411377 }, { "auxiliary_loss_clip": 0.0113499, "auxiliary_loss_mlp": 0.01046405, "balance_loss_clip": 1.04572868, "balance_loss_mlp": 1.02933431, "epoch": 0.4528347261468166, "flos": 22452608275200.0, "grad_norm": 2.26593742983418, "language_loss": 0.73346853, "learning_rate": 2.400012247516485e-06, "loss": 0.75528252, "num_input_tokens_seen": 81070625, "step": 3766, "time_per_iteration": 2.6922643184661865 }, { "auxiliary_loss_clip": 0.01118756, "auxiliary_loss_mlp": 0.0104488, "balance_loss_clip": 1.04799628, "balance_loss_mlp": 1.02720165, "epoch": 0.45295496903745563, "flos": 21103875469440.0, "grad_norm": 1.878273462313026, "language_loss": 0.90267974, "learning_rate": 2.3992489849447484e-06, "loss": 0.92431611, "num_input_tokens_seen": 81089080, "step": 3767, "time_per_iteration": 2.7432024478912354 }, { "auxiliary_loss_clip": 0.0111812, "auxiliary_loss_mlp": 0.0104102, "balance_loss_clip": 1.046929, "balance_loss_mlp": 1.02357948, "epoch": 0.45307521192809475, "flos": 23221168606080.0, "grad_norm": 2.445258214028989, "language_loss": 0.78744781, "learning_rate": 2.3984856618072584e-06, "loss": 0.80903924, "num_input_tokens_seen": 81109115, "step": 3768, "time_per_iteration": 2.7415544986724854 }, { "auxiliary_loss_clip": 0.01119666, "auxiliary_loss_mlp": 0.01055292, "balance_loss_clip": 1.04780233, "balance_loss_mlp": 1.03785181, "epoch": 0.45319545481873386, "flos": 15560094286080.0, "grad_norm": 2.004089569488687, "language_loss": 0.7365678, "learning_rate": 2.3977222782198098e-06, "loss": 0.75831735, "num_input_tokens_seen": 81127750, "step": 3769, "time_per_iteration": 2.726949691772461 }, { "auxiliary_loss_clip": 0.01104291, "auxiliary_loss_mlp": 0.0104931, "balance_loss_clip": 1.04320681, "balance_loss_mlp": 1.02900863, "epoch": 0.4533156977093729, "flos": 21944759834880.0, "grad_norm": 2.406202333601711, "language_loss": 0.75293636, "learning_rate": 2.3969588342982077e-06, "loss": 0.77447236, "num_input_tokens_seen": 81147125, "step": 3770, "time_per_iteration": 2.7253196239471436 }, { "auxiliary_loss_clip": 0.01138985, "auxiliary_loss_mlp": 0.01042887, "balance_loss_clip": 1.04859173, "balance_loss_mlp": 1.02504158, "epoch": 0.453435940600012, "flos": 24242180699520.0, "grad_norm": 3.0553371908989932, "language_loss": 0.7247057, "learning_rate": 2.396195330158267e-06, "loss": 0.74652445, "num_input_tokens_seen": 81167015, "step": 3771, "time_per_iteration": 2.7030255794525146 }, { "auxiliary_loss_clip": 0.01155273, "auxiliary_loss_mlp": 0.01050721, "balance_loss_clip": 1.05153227, "balance_loss_mlp": 1.03157651, "epoch": 0.45355618349065113, "flos": 23440367352960.0, "grad_norm": 1.8067341148559322, "language_loss": 0.79346633, "learning_rate": 2.3954317659158094e-06, "loss": 0.81552625, "num_input_tokens_seen": 81187350, "step": 3772, "time_per_iteration": 2.6510791778564453 }, { "auxiliary_loss_clip": 0.01051824, "auxiliary_loss_mlp": 0.01002919, "balance_loss_clip": 1.01846218, "balance_loss_mlp": 1.0006541, "epoch": 0.4536764263812902, "flos": 66903161448960.0, "grad_norm": 0.8905682352379182, "language_loss": 0.56934559, "learning_rate": 2.394668141686667e-06, "loss": 0.5898931, "num_input_tokens_seen": 81249315, "step": 3773, "time_per_iteration": 3.224297285079956 }, { "auxiliary_loss_clip": 0.01134687, "auxiliary_loss_mlp": 0.0103879, "balance_loss_clip": 1.04554641, "balance_loss_mlp": 1.02192223, "epoch": 0.4537966692719293, "flos": 42739766254080.0, "grad_norm": 5.892454880679139, "language_loss": 0.69658446, "learning_rate": 2.3939044575866813e-06, "loss": 0.71831918, "num_input_tokens_seen": 81272065, "step": 3774, "time_per_iteration": 2.844761371612549 }, { "auxiliary_loss_clip": 0.01119724, "auxiliary_loss_mlp": 0.00775676, "balance_loss_clip": 1.04434967, "balance_loss_mlp": 1.00063014, "epoch": 0.4539169121625684, "flos": 35549480517120.0, "grad_norm": 2.2303224832252817, "language_loss": 0.75784898, "learning_rate": 2.3931407137317024e-06, "loss": 0.77680296, "num_input_tokens_seen": 81292220, "step": 3775, "time_per_iteration": 2.775083065032959 }, { "auxiliary_loss_clip": 0.01108319, "auxiliary_loss_mlp": 0.01054436, "balance_loss_clip": 1.04350948, "balance_loss_mlp": 1.03510022, "epoch": 0.45403715505320746, "flos": 18514716341760.0, "grad_norm": 2.630434395454302, "language_loss": 0.8485055, "learning_rate": 2.3923769102375907e-06, "loss": 0.8701331, "num_input_tokens_seen": 81311085, "step": 3776, "time_per_iteration": 2.726562738418579 }, { "auxiliary_loss_clip": 0.0111357, "auxiliary_loss_mlp": 0.01050717, "balance_loss_clip": 1.04438853, "balance_loss_mlp": 1.0328114, "epoch": 0.4541573979438466, "flos": 25045825639680.0, "grad_norm": 2.0590536340460206, "language_loss": 0.78371561, "learning_rate": 2.391613047220213e-06, "loss": 0.80535847, "num_input_tokens_seen": 81330985, "step": 3777, "time_per_iteration": 2.761937141418457 }, { "auxiliary_loss_clip": 0.01108541, "auxiliary_loss_mlp": 0.01046467, "balance_loss_clip": 1.04609227, "balance_loss_mlp": 1.02729774, "epoch": 0.4542776408344857, "flos": 18332397884160.0, "grad_norm": 2.043895786556363, "language_loss": 0.79196221, "learning_rate": 2.390849124795447e-06, "loss": 0.81351221, "num_input_tokens_seen": 81346985, "step": 3778, "time_per_iteration": 3.6981003284454346 }, { "auxiliary_loss_clip": 0.01154719, "auxiliary_loss_mlp": 0.01040317, "balance_loss_clip": 1.05328465, "balance_loss_mlp": 1.02324581, "epoch": 0.45439788372512474, "flos": 20701173116160.0, "grad_norm": 2.133846271483081, "language_loss": 0.84413201, "learning_rate": 2.3900851430791804e-06, "loss": 0.86608237, "num_input_tokens_seen": 81365005, "step": 3779, "time_per_iteration": 2.6501879692077637 }, { "auxiliary_loss_clip": 0.011579, "auxiliary_loss_mlp": 0.0104831, "balance_loss_clip": 1.05191696, "balance_loss_mlp": 1.02949893, "epoch": 0.45451812661576385, "flos": 22309432663680.0, "grad_norm": 2.1393460147465984, "language_loss": 0.84710336, "learning_rate": 2.389321102187307e-06, "loss": 0.86916554, "num_input_tokens_seen": 81383785, "step": 3780, "time_per_iteration": 3.5782365798950195 }, { "auxiliary_loss_clip": 0.01131155, "auxiliary_loss_mlp": 0.00774665, "balance_loss_clip": 1.05114412, "balance_loss_mlp": 1.00061381, "epoch": 0.4546383695064029, "flos": 21763303303680.0, "grad_norm": 2.2851472528296104, "language_loss": 0.81937206, "learning_rate": 2.3885570022357326e-06, "loss": 0.83843029, "num_input_tokens_seen": 81402915, "step": 3781, "time_per_iteration": 2.7031025886535645 }, { "auxiliary_loss_clip": 0.01026222, "auxiliary_loss_mlp": 0.01008854, "balance_loss_clip": 1.01951957, "balance_loss_mlp": 1.0060885, "epoch": 0.454758612397042, "flos": 64242755694720.0, "grad_norm": 0.7995307595252973, "language_loss": 0.60878021, "learning_rate": 2.38779284334037e-06, "loss": 0.62913096, "num_input_tokens_seen": 81467890, "step": 3782, "time_per_iteration": 3.310450792312622 }, { "auxiliary_loss_clip": 0.01092652, "auxiliary_loss_mlp": 0.01047995, "balance_loss_clip": 1.04372406, "balance_loss_mlp": 1.02858782, "epoch": 0.4548788552876811, "flos": 27304175485440.0, "grad_norm": 2.529342734854936, "language_loss": 0.78822744, "learning_rate": 2.387028625617141e-06, "loss": 0.80963397, "num_input_tokens_seen": 81487105, "step": 3783, "time_per_iteration": 3.7675094604492188 }, { "auxiliary_loss_clip": 0.01116699, "auxiliary_loss_mlp": 0.01045366, "balance_loss_clip": 1.04602623, "balance_loss_mlp": 1.02765155, "epoch": 0.4549990981783202, "flos": 22857142222080.0, "grad_norm": 10.730636940672504, "language_loss": 0.84837013, "learning_rate": 2.3862643491819766e-06, "loss": 0.86999083, "num_input_tokens_seen": 81505670, "step": 3784, "time_per_iteration": 2.676621913909912 }, { "auxiliary_loss_clip": 0.01137018, "auxiliary_loss_mlp": 0.010411, "balance_loss_clip": 1.04632711, "balance_loss_mlp": 1.02340961, "epoch": 0.4551193410689593, "flos": 23258587599360.0, "grad_norm": 1.7521264318214902, "language_loss": 0.84278333, "learning_rate": 2.3855000141508186e-06, "loss": 0.86456454, "num_input_tokens_seen": 81525825, "step": 3785, "time_per_iteration": 2.6584508419036865 }, { "auxiliary_loss_clip": 0.01133225, "auxiliary_loss_mlp": 0.01044235, "balance_loss_clip": 1.05257773, "balance_loss_mlp": 1.02449429, "epoch": 0.4552395839595984, "flos": 20777519473920.0, "grad_norm": 2.7273794660875033, "language_loss": 0.84289873, "learning_rate": 2.3847356206396143e-06, "loss": 0.86467326, "num_input_tokens_seen": 81543135, "step": 3786, "time_per_iteration": 2.722235918045044 }, { "auxiliary_loss_clip": 0.01157792, "auxiliary_loss_mlp": 0.01041939, "balance_loss_clip": 1.05517507, "balance_loss_mlp": 1.02306795, "epoch": 0.45535982685023746, "flos": 23257510191360.0, "grad_norm": 1.5247527549728537, "language_loss": 0.78705478, "learning_rate": 2.3839711687643227e-06, "loss": 0.80905205, "num_input_tokens_seen": 81564360, "step": 3787, "time_per_iteration": 2.596956968307495 }, { "auxiliary_loss_clip": 0.01139708, "auxiliary_loss_mlp": 0.0104294, "balance_loss_clip": 1.04864764, "balance_loss_mlp": 1.02408171, "epoch": 0.45548006974087657, "flos": 19646117907840.0, "grad_norm": 1.9744070593250838, "language_loss": 0.74129558, "learning_rate": 2.38320665864091e-06, "loss": 0.76312208, "num_input_tokens_seen": 81583710, "step": 3788, "time_per_iteration": 3.5236685276031494 }, { "auxiliary_loss_clip": 0.01093697, "auxiliary_loss_mlp": 0.01042607, "balance_loss_clip": 1.04513812, "balance_loss_mlp": 1.02473736, "epoch": 0.4556003126315157, "flos": 20047778766720.0, "grad_norm": 1.6294480754812124, "language_loss": 0.81797713, "learning_rate": 2.3824420903853516e-06, "loss": 0.83934015, "num_input_tokens_seen": 81602175, "step": 3789, "time_per_iteration": 2.7500784397125244 }, { "auxiliary_loss_clip": 0.01141429, "auxiliary_loss_mlp": 0.0104416, "balance_loss_clip": 1.05031574, "balance_loss_mlp": 1.0261358, "epoch": 0.45572055552215474, "flos": 22959738443520.0, "grad_norm": 5.18612901170333, "language_loss": 0.81704533, "learning_rate": 2.3816774641136324e-06, "loss": 0.83890122, "num_input_tokens_seen": 81619430, "step": 3790, "time_per_iteration": 2.649872303009033 }, { "auxiliary_loss_clip": 0.01142681, "auxiliary_loss_mlp": 0.00774133, "balance_loss_clip": 1.05039072, "balance_loss_mlp": 1.00056994, "epoch": 0.45584079841279385, "flos": 33109925535360.0, "grad_norm": 1.7336145343173757, "language_loss": 0.71395874, "learning_rate": 2.380912779941745e-06, "loss": 0.73312694, "num_input_tokens_seen": 81642550, "step": 3791, "time_per_iteration": 2.717714786529541 }, { "auxiliary_loss_clip": 0.01143999, "auxiliary_loss_mlp": 0.01047348, "balance_loss_clip": 1.04792368, "balance_loss_mlp": 1.02678466, "epoch": 0.45596104130343296, "flos": 27272179445760.0, "grad_norm": 2.374508850218331, "language_loss": 0.83782625, "learning_rate": 2.3801480379856918e-06, "loss": 0.85973978, "num_input_tokens_seen": 81664260, "step": 3792, "time_per_iteration": 2.7153191566467285 }, { "auxiliary_loss_clip": 0.0113374, "auxiliary_loss_mlp": 0.0104629, "balance_loss_clip": 1.05165744, "balance_loss_mlp": 1.02853918, "epoch": 0.456081284194072, "flos": 21579799697280.0, "grad_norm": 1.6931342221549637, "language_loss": 0.83812267, "learning_rate": 2.379383238361484e-06, "loss": 0.85992301, "num_input_tokens_seen": 81683620, "step": 3793, "time_per_iteration": 2.669926166534424 }, { "auxiliary_loss_clip": 0.01138647, "auxiliary_loss_mlp": 0.01041048, "balance_loss_clip": 1.04872704, "balance_loss_mlp": 1.02431095, "epoch": 0.4562015270847111, "flos": 35918822113920.0, "grad_norm": 2.6511292916707974, "language_loss": 0.79699981, "learning_rate": 2.3786183811851407e-06, "loss": 0.81879675, "num_input_tokens_seen": 81704325, "step": 3794, "time_per_iteration": 2.725191831588745 }, { "auxiliary_loss_clip": 0.01159458, "auxiliary_loss_mlp": 0.01046383, "balance_loss_clip": 1.05471945, "balance_loss_mlp": 1.0278101, "epoch": 0.45632176997535023, "flos": 13589783602560.0, "grad_norm": 1.7902534801751724, "language_loss": 0.80170405, "learning_rate": 2.3778534665726892e-06, "loss": 0.82376248, "num_input_tokens_seen": 81721155, "step": 3795, "time_per_iteration": 2.5521466732025146 }, { "auxiliary_loss_clip": 0.01130093, "auxiliary_loss_mlp": 0.01052741, "balance_loss_clip": 1.04799914, "balance_loss_mlp": 1.03536069, "epoch": 0.4564420128659893, "flos": 32635401937920.0, "grad_norm": 1.874915236731693, "language_loss": 0.72957623, "learning_rate": 2.377088494640168e-06, "loss": 0.75140452, "num_input_tokens_seen": 81742905, "step": 3796, "time_per_iteration": 2.7169129848480225 }, { "auxiliary_loss_clip": 0.01135845, "auxiliary_loss_mlp": 0.01042594, "balance_loss_clip": 1.04896116, "balance_loss_mlp": 1.02486789, "epoch": 0.4565622557566284, "flos": 20377690208640.0, "grad_norm": 1.672687254926935, "language_loss": 0.77960306, "learning_rate": 2.3763234655036216e-06, "loss": 0.80138743, "num_input_tokens_seen": 81762105, "step": 3797, "time_per_iteration": 2.6012537479400635 }, { "auxiliary_loss_clip": 0.01112455, "auxiliary_loss_mlp": 0.01058834, "balance_loss_clip": 1.04512107, "balance_loss_mlp": 1.03905702, "epoch": 0.45668249864726745, "flos": 25374372364800.0, "grad_norm": 2.238713887801648, "language_loss": 0.8712852, "learning_rate": 2.3755583792791046e-06, "loss": 0.8929981, "num_input_tokens_seen": 81781975, "step": 3798, "time_per_iteration": 2.6943697929382324 }, { "auxiliary_loss_clip": 0.01146091, "auxiliary_loss_mlp": 0.01043315, "balance_loss_clip": 1.05254698, "balance_loss_mlp": 1.02496862, "epoch": 0.45680274153790656, "flos": 15559806977280.0, "grad_norm": 1.8704203322765156, "language_loss": 0.74500847, "learning_rate": 2.3747932360826803e-06, "loss": 0.76690245, "num_input_tokens_seen": 81798905, "step": 3799, "time_per_iteration": 2.619823694229126 }, { "auxiliary_loss_clip": 0.01139527, "auxiliary_loss_mlp": 0.01045657, "balance_loss_clip": 1.04892445, "balance_loss_mlp": 1.02738225, "epoch": 0.4569229844285457, "flos": 19792884879360.0, "grad_norm": 2.463172263657892, "language_loss": 0.83114421, "learning_rate": 2.3740280360304205e-06, "loss": 0.85299611, "num_input_tokens_seen": 81816630, "step": 3800, "time_per_iteration": 2.6032752990722656 }, { "auxiliary_loss_clip": 0.01115564, "auxiliary_loss_mlp": 0.01045251, "balance_loss_clip": 1.04817808, "balance_loss_mlp": 1.02758408, "epoch": 0.45704322731918473, "flos": 24093941270400.0, "grad_norm": 2.437804944828677, "language_loss": 0.68321037, "learning_rate": 2.3732627792384038e-06, "loss": 0.70481849, "num_input_tokens_seen": 81837700, "step": 3801, "time_per_iteration": 2.7070882320404053 }, { "auxiliary_loss_clip": 0.01155007, "auxiliary_loss_mlp": 0.01039687, "balance_loss_clip": 1.05239058, "balance_loss_mlp": 1.02210402, "epoch": 0.45716347020982384, "flos": 31317803245440.0, "grad_norm": 2.2720379220657665, "language_loss": 0.75166214, "learning_rate": 2.3724974658227207e-06, "loss": 0.7736091, "num_input_tokens_seen": 81858490, "step": 3802, "time_per_iteration": 2.669055938720703 }, { "auxiliary_loss_clip": 0.01128246, "auxiliary_loss_mlp": 0.00774331, "balance_loss_clip": 1.04753673, "balance_loss_mlp": 1.00049973, "epoch": 0.45728371310046295, "flos": 26501392471680.0, "grad_norm": 2.843226771826462, "language_loss": 0.71305847, "learning_rate": 2.3717320958994687e-06, "loss": 0.73208427, "num_input_tokens_seen": 81876050, "step": 3803, "time_per_iteration": 2.690250873565674 }, { "auxiliary_loss_clip": 0.01111475, "auxiliary_loss_mlp": 0.01041802, "balance_loss_clip": 1.04195786, "balance_loss_mlp": 1.02392101, "epoch": 0.457403955991102, "flos": 17929408222080.0, "grad_norm": 15.1724273378735, "language_loss": 0.7037425, "learning_rate": 2.3709666695847534e-06, "loss": 0.72527528, "num_input_tokens_seen": 81894230, "step": 3804, "time_per_iteration": 3.6331610679626465 }, { "auxiliary_loss_clip": 0.01091532, "auxiliary_loss_mlp": 0.01049671, "balance_loss_clip": 1.04133689, "balance_loss_mlp": 1.03097892, "epoch": 0.4575241988817411, "flos": 42230660837760.0, "grad_norm": 1.779793346725392, "language_loss": 0.70420551, "learning_rate": 2.370201186994689e-06, "loss": 0.72561753, "num_input_tokens_seen": 81917915, "step": 3805, "time_per_iteration": 2.913113594055176 }, { "auxiliary_loss_clip": 0.01121682, "auxiliary_loss_mlp": 0.01041974, "balance_loss_clip": 1.04886031, "balance_loss_mlp": 1.02408087, "epoch": 0.45764444177238023, "flos": 30117309868800.0, "grad_norm": 1.7367999966327134, "language_loss": 0.70275879, "learning_rate": 2.369435648245399e-06, "loss": 0.72439539, "num_input_tokens_seen": 81938130, "step": 3806, "time_per_iteration": 3.6701459884643555 }, { "auxiliary_loss_clip": 0.01125284, "auxiliary_loss_mlp": 0.01050192, "balance_loss_clip": 1.04739869, "balance_loss_mlp": 1.03260839, "epoch": 0.4577646846630193, "flos": 24060293205120.0, "grad_norm": 1.836615898822247, "language_loss": 0.85025275, "learning_rate": 2.368670053453015e-06, "loss": 0.87200755, "num_input_tokens_seen": 81959820, "step": 3807, "time_per_iteration": 2.682600975036621 }, { "auxiliary_loss_clip": 0.01151454, "auxiliary_loss_mlp": 0.01050344, "balance_loss_clip": 1.05533409, "balance_loss_mlp": 1.03159213, "epoch": 0.4578849275536584, "flos": 17418578952960.0, "grad_norm": 2.930313437151442, "language_loss": 0.74674821, "learning_rate": 2.3679044027336757e-06, "loss": 0.76876622, "num_input_tokens_seen": 81975710, "step": 3808, "time_per_iteration": 3.604184150695801 }, { "auxiliary_loss_clip": 0.01154897, "auxiliary_loss_mlp": 0.01046839, "balance_loss_clip": 1.05103934, "balance_loss_mlp": 1.02842081, "epoch": 0.4580051704442975, "flos": 13510169107200.0, "grad_norm": 2.6672917751539593, "language_loss": 0.69092935, "learning_rate": 2.3671386962035326e-06, "loss": 0.71294665, "num_input_tokens_seen": 81993180, "step": 3809, "time_per_iteration": 2.607677459716797 }, { "auxiliary_loss_clip": 0.01145147, "auxiliary_loss_mlp": 0.01044838, "balance_loss_clip": 1.05153704, "balance_loss_mlp": 1.02557349, "epoch": 0.45812541333493656, "flos": 18037606965120.0, "grad_norm": 2.2801263492764017, "language_loss": 0.68596154, "learning_rate": 2.3663729339787405e-06, "loss": 0.70786142, "num_input_tokens_seen": 82010115, "step": 3810, "time_per_iteration": 2.602843761444092 }, { "auxiliary_loss_clip": 0.01156956, "auxiliary_loss_mlp": 0.01042216, "balance_loss_clip": 1.05348849, "balance_loss_mlp": 1.02348804, "epoch": 0.45824565622557567, "flos": 20222196232320.0, "grad_norm": 2.710372077813314, "language_loss": 0.7334857, "learning_rate": 2.365607116175466e-06, "loss": 0.75547743, "num_input_tokens_seen": 82025540, "step": 3811, "time_per_iteration": 2.6291301250457764 }, { "auxiliary_loss_clip": 0.01152707, "auxiliary_loss_mlp": 0.01040411, "balance_loss_clip": 1.04962611, "balance_loss_mlp": 1.0227325, "epoch": 0.4583658991162148, "flos": 19864885691520.0, "grad_norm": 3.3238490562886627, "language_loss": 0.66505545, "learning_rate": 2.3648412429098825e-06, "loss": 0.68698668, "num_input_tokens_seen": 82043890, "step": 3812, "time_per_iteration": 2.6200387477874756 }, { "auxiliary_loss_clip": 0.01111562, "auxiliary_loss_mlp": 0.01049018, "balance_loss_clip": 1.04549491, "balance_loss_mlp": 1.02727401, "epoch": 0.45848614200685384, "flos": 21029935322880.0, "grad_norm": 1.9960863855505082, "language_loss": 0.81839287, "learning_rate": 2.364075314298172e-06, "loss": 0.83999866, "num_input_tokens_seen": 82061345, "step": 3813, "time_per_iteration": 2.671572685241699 }, { "auxiliary_loss_clip": 0.01146274, "auxiliary_loss_mlp": 0.00774433, "balance_loss_clip": 1.05165148, "balance_loss_mlp": 1.00049663, "epoch": 0.45860638489749295, "flos": 21069293650560.0, "grad_norm": 2.3086025887172252, "language_loss": 0.70241725, "learning_rate": 2.3633093304565267e-06, "loss": 0.72162426, "num_input_tokens_seen": 82080400, "step": 3814, "time_per_iteration": 2.705230236053467 }, { "auxiliary_loss_clip": 0.01159984, "auxiliary_loss_mlp": 0.01045445, "balance_loss_clip": 1.05480003, "balance_loss_mlp": 1.02600229, "epoch": 0.458726627788132, "flos": 26833889692800.0, "grad_norm": 2.090780916044768, "language_loss": 0.63259447, "learning_rate": 2.3625432915011443e-06, "loss": 0.65464878, "num_input_tokens_seen": 82102310, "step": 3815, "time_per_iteration": 3.5108680725097656 }, { "auxiliary_loss_clip": 0.0112446, "auxiliary_loss_mlp": 0.01068557, "balance_loss_clip": 1.04958141, "balance_loss_mlp": 1.04774284, "epoch": 0.4588468706787711, "flos": 24097927680000.0, "grad_norm": 1.7491324118835248, "language_loss": 0.65613449, "learning_rate": 2.3617771975482334e-06, "loss": 0.67806464, "num_input_tokens_seen": 82121140, "step": 3816, "time_per_iteration": 2.6637537479400635 }, { "auxiliary_loss_clip": 0.01093781, "auxiliary_loss_mlp": 0.01053247, "balance_loss_clip": 1.04205441, "balance_loss_mlp": 1.03521109, "epoch": 0.4589671135694102, "flos": 17889331622400.0, "grad_norm": 1.5919392055961488, "language_loss": 0.74563199, "learning_rate": 2.3610110487140083e-06, "loss": 0.76710224, "num_input_tokens_seen": 82139575, "step": 3817, "time_per_iteration": 2.7158873081207275 }, { "auxiliary_loss_clip": 0.0112525, "auxiliary_loss_mlp": 0.01042426, "balance_loss_clip": 1.04851282, "balance_loss_mlp": 1.02437806, "epoch": 0.4590873564600493, "flos": 25626967781760.0, "grad_norm": 1.7152101772195676, "language_loss": 0.81107354, "learning_rate": 2.360244845114695e-06, "loss": 0.83275026, "num_input_tokens_seen": 82159195, "step": 3818, "time_per_iteration": 2.6909193992614746 }, { "auxiliary_loss_clip": 0.0111683, "auxiliary_loss_mlp": 0.0104328, "balance_loss_clip": 1.04546916, "balance_loss_mlp": 1.02535105, "epoch": 0.4592075993506884, "flos": 18514788168960.0, "grad_norm": 2.2057413898859752, "language_loss": 0.68295723, "learning_rate": 2.3594785868665245e-06, "loss": 0.70455837, "num_input_tokens_seen": 82175500, "step": 3819, "time_per_iteration": 2.638786792755127 }, { "auxiliary_loss_clip": 0.01119324, "auxiliary_loss_mlp": 0.00773632, "balance_loss_clip": 1.04671741, "balance_loss_mlp": 1.00041986, "epoch": 0.4593278422413275, "flos": 20631111638400.0, "grad_norm": 2.3266247537615268, "language_loss": 0.80688167, "learning_rate": 2.3587122740857386e-06, "loss": 0.82581127, "num_input_tokens_seen": 82192600, "step": 3820, "time_per_iteration": 2.6905932426452637 }, { "auxiliary_loss_clip": 0.01138775, "auxiliary_loss_mlp": 0.0104183, "balance_loss_clip": 1.05023932, "balance_loss_mlp": 1.02530766, "epoch": 0.45944808513196655, "flos": 21358517961600.0, "grad_norm": 1.7544907663787646, "language_loss": 0.78127712, "learning_rate": 2.357945906888586e-06, "loss": 0.80308318, "num_input_tokens_seen": 82212040, "step": 3821, "time_per_iteration": 2.7226502895355225 }, { "auxiliary_loss_clip": 0.01144566, "auxiliary_loss_mlp": 0.01044214, "balance_loss_clip": 1.05002511, "balance_loss_mlp": 1.0248065, "epoch": 0.45956832802260567, "flos": 21427789340160.0, "grad_norm": 3.3730169400140397, "language_loss": 0.80364263, "learning_rate": 2.357179485391324e-06, "loss": 0.82553041, "num_input_tokens_seen": 82229895, "step": 3822, "time_per_iteration": 2.624025583267212 }, { "auxiliary_loss_clip": 0.01150115, "auxiliary_loss_mlp": 0.01044032, "balance_loss_clip": 1.05071068, "balance_loss_mlp": 1.02627015, "epoch": 0.4596885709132448, "flos": 22382654538240.0, "grad_norm": 1.9547939902012104, "language_loss": 0.86437565, "learning_rate": 2.3564130097102173e-06, "loss": 0.88631713, "num_input_tokens_seen": 82249550, "step": 3823, "time_per_iteration": 2.6012861728668213 }, { "auxiliary_loss_clip": 0.01117727, "auxiliary_loss_mlp": 0.01052711, "balance_loss_clip": 1.04755867, "balance_loss_mlp": 1.03283882, "epoch": 0.45980881380388383, "flos": 28981957806720.0, "grad_norm": 1.7527376208482819, "language_loss": 0.75158393, "learning_rate": 2.355646479961541e-06, "loss": 0.77328837, "num_input_tokens_seen": 82268860, "step": 3824, "time_per_iteration": 2.7088332176208496 }, { "auxiliary_loss_clip": 0.01154622, "auxiliary_loss_mlp": 0.01046486, "balance_loss_clip": 1.05250311, "balance_loss_mlp": 1.0290339, "epoch": 0.45992905669452294, "flos": 33396599980800.0, "grad_norm": 1.9962824767658274, "language_loss": 0.71823347, "learning_rate": 2.354879896261576e-06, "loss": 0.74024451, "num_input_tokens_seen": 82289070, "step": 3825, "time_per_iteration": 2.669602870941162 }, { "auxiliary_loss_clip": 0.01108859, "auxiliary_loss_mlp": 0.010468, "balance_loss_clip": 1.04340291, "balance_loss_mlp": 1.02839375, "epoch": 0.46004929958516205, "flos": 36318184502400.0, "grad_norm": 1.9773413222670708, "language_loss": 0.57161242, "learning_rate": 2.3541132587266133e-06, "loss": 0.59316897, "num_input_tokens_seen": 82311790, "step": 3826, "time_per_iteration": 2.79390287399292 }, { "auxiliary_loss_clip": 0.01122739, "auxiliary_loss_mlp": 0.01046378, "balance_loss_clip": 1.0489285, "balance_loss_mlp": 1.02817488, "epoch": 0.4601695424758011, "flos": 17238451224960.0, "grad_norm": 2.172581688422897, "language_loss": 0.69158506, "learning_rate": 2.3533465674729515e-06, "loss": 0.71327627, "num_input_tokens_seen": 82329020, "step": 3827, "time_per_iteration": 2.6563868522644043 }, { "auxiliary_loss_clip": 0.01157224, "auxiliary_loss_mlp": 0.0104277, "balance_loss_clip": 1.05369067, "balance_loss_mlp": 1.02404261, "epoch": 0.4602897853664402, "flos": 15888425529600.0, "grad_norm": 2.034440091706972, "language_loss": 0.72614491, "learning_rate": 2.352579822616895e-06, "loss": 0.74814481, "num_input_tokens_seen": 82346455, "step": 3828, "time_per_iteration": 2.571376085281372 }, { "auxiliary_loss_clip": 0.01126435, "auxiliary_loss_mlp": 0.0104545, "balance_loss_clip": 1.04839492, "balance_loss_mlp": 1.02759206, "epoch": 0.4604100282570793, "flos": 25412617370880.0, "grad_norm": 3.5019490543263476, "language_loss": 0.77879006, "learning_rate": 2.351813024274761e-06, "loss": 0.80050892, "num_input_tokens_seen": 82367810, "step": 3829, "time_per_iteration": 2.679673671722412 }, { "auxiliary_loss_clip": 0.01122483, "auxiliary_loss_mlp": 0.0104692, "balance_loss_clip": 1.05075169, "balance_loss_mlp": 1.02853823, "epoch": 0.4605302711477184, "flos": 27630711048960.0, "grad_norm": 1.8730659861351662, "language_loss": 0.73800445, "learning_rate": 2.3510461725628693e-06, "loss": 0.75969845, "num_input_tokens_seen": 82388275, "step": 3830, "time_per_iteration": 3.6947319507598877 }, { "auxiliary_loss_clip": 0.01119734, "auxiliary_loss_mlp": 0.01042118, "balance_loss_clip": 1.04848826, "balance_loss_mlp": 1.02418852, "epoch": 0.4606505140383575, "flos": 23839657914240.0, "grad_norm": 1.9723071232862912, "language_loss": 0.70685828, "learning_rate": 2.350279267597554e-06, "loss": 0.72847682, "num_input_tokens_seen": 82408915, "step": 3831, "time_per_iteration": 3.660460948944092 }, { "auxiliary_loss_clip": 0.01142394, "auxiliary_loss_mlp": 0.01040549, "balance_loss_clip": 1.05028641, "balance_loss_mlp": 1.02161896, "epoch": 0.46077075692899655, "flos": 16107013745280.0, "grad_norm": 2.9617094795742096, "language_loss": 0.83087206, "learning_rate": 2.3495123094951515e-06, "loss": 0.85270149, "num_input_tokens_seen": 82427260, "step": 3832, "time_per_iteration": 2.7756004333496094 }, { "auxiliary_loss_clip": 0.01123151, "auxiliary_loss_mlp": 0.01045175, "balance_loss_clip": 1.04916048, "balance_loss_mlp": 1.02656686, "epoch": 0.46089099981963566, "flos": 48798147634560.0, "grad_norm": 2.65665022386247, "language_loss": 0.76353961, "learning_rate": 2.34874529837201e-06, "loss": 0.78522289, "num_input_tokens_seen": 82450805, "step": 3833, "time_per_iteration": 2.943490743637085 }, { "auxiliary_loss_clip": 0.01082888, "auxiliary_loss_mlp": 0.01043227, "balance_loss_clip": 1.04028463, "balance_loss_mlp": 1.02508318, "epoch": 0.46101124271027477, "flos": 19099234362240.0, "grad_norm": 1.8255197396667047, "language_loss": 0.79165864, "learning_rate": 2.347978234344483e-06, "loss": 0.81291974, "num_input_tokens_seen": 82467010, "step": 3834, "time_per_iteration": 2.7498509883880615 }, { "auxiliary_loss_clip": 0.01145358, "auxiliary_loss_mlp": 0.01044017, "balance_loss_clip": 1.05103147, "balance_loss_mlp": 1.02539706, "epoch": 0.4611314856009138, "flos": 39347931853440.0, "grad_norm": 1.8080460714800413, "language_loss": 0.6911729, "learning_rate": 2.347211117528935e-06, "loss": 0.71306658, "num_input_tokens_seen": 82489310, "step": 3835, "time_per_iteration": 3.810774326324463 }, { "auxiliary_loss_clip": 0.01125754, "auxiliary_loss_mlp": 0.01040024, "balance_loss_clip": 1.05248809, "balance_loss_mlp": 1.02141583, "epoch": 0.46125172849155294, "flos": 20810772489600.0, "grad_norm": 1.7843969888486255, "language_loss": 0.71823311, "learning_rate": 2.3464439480417374e-06, "loss": 0.73989081, "num_input_tokens_seen": 82508830, "step": 3836, "time_per_iteration": 2.746903419494629 }, { "auxiliary_loss_clip": 0.01150126, "auxiliary_loss_mlp": 0.01043449, "balance_loss_clip": 1.05459857, "balance_loss_mlp": 1.02407742, "epoch": 0.46137197138219205, "flos": 17930808852480.0, "grad_norm": 44.39825477210053, "language_loss": 0.77121794, "learning_rate": 2.3456767259992676e-06, "loss": 0.7931537, "num_input_tokens_seen": 82526475, "step": 3837, "time_per_iteration": 2.6012113094329834 }, { "auxiliary_loss_clip": 0.01153497, "auxiliary_loss_mlp": 0.00773861, "balance_loss_clip": 1.04922032, "balance_loss_mlp": 1.0005933, "epoch": 0.4614922142728311, "flos": 16836610798080.0, "grad_norm": 4.461903930937634, "language_loss": 0.88632345, "learning_rate": 2.3449094515179135e-06, "loss": 0.90559703, "num_input_tokens_seen": 82543935, "step": 3838, "time_per_iteration": 2.683140516281128 }, { "auxiliary_loss_clip": 0.01132748, "auxiliary_loss_mlp": 0.01042476, "balance_loss_clip": 1.04987144, "balance_loss_mlp": 1.02541685, "epoch": 0.4616124571634702, "flos": 26614906427520.0, "grad_norm": 2.524864313171101, "language_loss": 0.81775826, "learning_rate": 2.34414212471407e-06, "loss": 0.83951056, "num_input_tokens_seen": 82563730, "step": 3839, "time_per_iteration": 2.741562604904175 }, { "auxiliary_loss_clip": 0.01153587, "auxiliary_loss_mlp": 0.01048414, "balance_loss_clip": 1.05400872, "balance_loss_mlp": 1.02936482, "epoch": 0.4617327000541093, "flos": 20340127560960.0, "grad_norm": 2.565491593855363, "language_loss": 0.73135209, "learning_rate": 2.3433747457041394e-06, "loss": 0.75337213, "num_input_tokens_seen": 82582435, "step": 3840, "time_per_iteration": 2.646458864212036 }, { "auxiliary_loss_clip": 0.01113633, "auxiliary_loss_mlp": 0.0104607, "balance_loss_clip": 1.04663634, "balance_loss_mlp": 1.02684116, "epoch": 0.4618529429447484, "flos": 29570749545600.0, "grad_norm": 1.8713240024256437, "language_loss": 0.84973812, "learning_rate": 2.342607314604533e-06, "loss": 0.87133515, "num_input_tokens_seen": 82602185, "step": 3841, "time_per_iteration": 3.6260197162628174 }, { "auxiliary_loss_clip": 0.01141544, "auxiliary_loss_mlp": 0.01045605, "balance_loss_clip": 1.0534606, "balance_loss_mlp": 1.02704394, "epoch": 0.4619731858353875, "flos": 19787030962560.0, "grad_norm": 1.8017022739916424, "language_loss": 0.84469235, "learning_rate": 2.3418398315316694e-06, "loss": 0.86656386, "num_input_tokens_seen": 82620005, "step": 3842, "time_per_iteration": 2.6406404972076416 }, { "auxiliary_loss_clip": 0.01153557, "auxiliary_loss_mlp": 0.01045037, "balance_loss_clip": 1.05125165, "balance_loss_mlp": 1.02708364, "epoch": 0.4620934287260266, "flos": 18951138587520.0, "grad_norm": 2.1706860255699025, "language_loss": 0.78068101, "learning_rate": 2.3410722966019755e-06, "loss": 0.8026669, "num_input_tokens_seen": 82635120, "step": 3843, "time_per_iteration": 2.5483431816101074 }, { "auxiliary_loss_clip": 0.01138999, "auxiliary_loss_mlp": 0.01044713, "balance_loss_clip": 1.04820538, "balance_loss_mlp": 1.02768993, "epoch": 0.46221367161666566, "flos": 37341674634240.0, "grad_norm": 1.8221899316712935, "language_loss": 0.65570688, "learning_rate": 2.3403047099318848e-06, "loss": 0.677544, "num_input_tokens_seen": 82659190, "step": 3844, "time_per_iteration": 2.750566005706787 }, { "auxiliary_loss_clip": 0.01096172, "auxiliary_loss_mlp": 0.01058841, "balance_loss_clip": 1.04333544, "balance_loss_mlp": 1.03957725, "epoch": 0.46233391450730477, "flos": 14428549065600.0, "grad_norm": 2.474228605923491, "language_loss": 0.75535548, "learning_rate": 2.3395370716378405e-06, "loss": 0.7769056, "num_input_tokens_seen": 82676635, "step": 3845, "time_per_iteration": 2.7054736614227295 }, { "auxiliary_loss_clip": 0.01145138, "auxiliary_loss_mlp": 0.01049336, "balance_loss_clip": 1.049438, "balance_loss_mlp": 1.03153801, "epoch": 0.4624541573979438, "flos": 22493044010880.0, "grad_norm": 2.1271629183593976, "language_loss": 0.72683048, "learning_rate": 2.338769381836292e-06, "loss": 0.7487753, "num_input_tokens_seen": 82696245, "step": 3846, "time_per_iteration": 2.6100854873657227 }, { "auxiliary_loss_clip": 0.01111054, "auxiliary_loss_mlp": 0.01053602, "balance_loss_clip": 1.04725957, "balance_loss_mlp": 1.0346359, "epoch": 0.46257440028858293, "flos": 14465070218880.0, "grad_norm": 4.234926789712704, "language_loss": 0.7303499, "learning_rate": 2.3380016406436984e-06, "loss": 0.75199646, "num_input_tokens_seen": 82713725, "step": 3847, "time_per_iteration": 2.734083414077759 }, { "auxiliary_loss_clip": 0.01098138, "auxiliary_loss_mlp": 0.01061456, "balance_loss_clip": 1.04361892, "balance_loss_mlp": 1.04164302, "epoch": 0.46269464317922204, "flos": 23332204523520.0, "grad_norm": 1.9426642880254121, "language_loss": 0.81591082, "learning_rate": 2.337233848176524e-06, "loss": 0.83750677, "num_input_tokens_seen": 82731495, "step": 3848, "time_per_iteration": 2.6879873275756836 }, { "auxiliary_loss_clip": 0.01098432, "auxiliary_loss_mlp": 0.0105491, "balance_loss_clip": 1.04505539, "balance_loss_mlp": 1.03470349, "epoch": 0.4628148860698611, "flos": 18552027594240.0, "grad_norm": 2.301235755730272, "language_loss": 0.83074665, "learning_rate": 2.3364660045512435e-06, "loss": 0.85228002, "num_input_tokens_seen": 82750255, "step": 3849, "time_per_iteration": 2.696420431137085 }, { "auxiliary_loss_clip": 0.0103551, "auxiliary_loss_mlp": 0.01007969, "balance_loss_clip": 1.02244639, "balance_loss_mlp": 1.00519168, "epoch": 0.4629351289605002, "flos": 70667569670400.0, "grad_norm": 0.7392972562380629, "language_loss": 0.58160895, "learning_rate": 2.335698109884337e-06, "loss": 0.60204375, "num_input_tokens_seen": 82815460, "step": 3850, "time_per_iteration": 3.4386067390441895 }, { "auxiliary_loss_clip": 0.01016749, "auxiliary_loss_mlp": 0.01004826, "balance_loss_clip": 1.02582383, "balance_loss_mlp": 1.00204802, "epoch": 0.4630553718511393, "flos": 59687200465920.0, "grad_norm": 0.7883027415297672, "language_loss": 0.59843922, "learning_rate": 2.334930164292294e-06, "loss": 0.61865497, "num_input_tokens_seen": 82878010, "step": 3851, "time_per_iteration": 3.3826382160186768 }, { "auxiliary_loss_clip": 0.01089946, "auxiliary_loss_mlp": 0.01051857, "balance_loss_clip": 1.03882921, "balance_loss_mlp": 1.03252089, "epoch": 0.4631756147417784, "flos": 15960605909760.0, "grad_norm": 2.103360327813809, "language_loss": 0.79679954, "learning_rate": 2.334162167891612e-06, "loss": 0.81821752, "num_input_tokens_seen": 82895275, "step": 3852, "time_per_iteration": 2.6910154819488525 }, { "auxiliary_loss_clip": 0.01129911, "auxiliary_loss_mlp": 0.01047664, "balance_loss_clip": 1.04655695, "balance_loss_mlp": 1.02942538, "epoch": 0.4632958576324175, "flos": 16472907636480.0, "grad_norm": 2.332669465117425, "language_loss": 0.75080562, "learning_rate": 2.333394120798795e-06, "loss": 0.77258134, "num_input_tokens_seen": 82914010, "step": 3853, "time_per_iteration": 2.593973398208618 }, { "auxiliary_loss_clip": 0.01127559, "auxiliary_loss_mlp": 0.01040204, "balance_loss_clip": 1.04583597, "balance_loss_mlp": 1.0218339, "epoch": 0.4634161005230566, "flos": 22346492520960.0, "grad_norm": 2.506460820435176, "language_loss": 0.71549642, "learning_rate": 2.3326260231303545e-06, "loss": 0.73717403, "num_input_tokens_seen": 82932610, "step": 3854, "time_per_iteration": 2.650879383087158 }, { "auxiliary_loss_clip": 0.01151638, "auxiliary_loss_mlp": 0.01041839, "balance_loss_clip": 1.0527935, "balance_loss_mlp": 1.02616251, "epoch": 0.46353634341369565, "flos": 15742233175680.0, "grad_norm": 1.8487183006013979, "language_loss": 0.86527431, "learning_rate": 2.331857875002811e-06, "loss": 0.887209, "num_input_tokens_seen": 82951210, "step": 3855, "time_per_iteration": 2.5705881118774414 }, { "auxiliary_loss_clip": 0.0112902, "auxiliary_loss_mlp": 0.01047767, "balance_loss_clip": 1.04966605, "balance_loss_mlp": 1.02936077, "epoch": 0.46365658630433476, "flos": 28329820433280.0, "grad_norm": 1.8078782646336418, "language_loss": 0.76554668, "learning_rate": 2.3310896765326916e-06, "loss": 0.78731453, "num_input_tokens_seen": 82972210, "step": 3856, "time_per_iteration": 3.622715950012207 }, { "auxiliary_loss_clip": 0.01111051, "auxiliary_loss_mlp": 0.01050307, "balance_loss_clip": 1.04771399, "balance_loss_mlp": 1.03111458, "epoch": 0.46377682919497387, "flos": 24608074590720.0, "grad_norm": 1.8648679330753928, "language_loss": 0.84107494, "learning_rate": 2.330321427836531e-06, "loss": 0.86268854, "num_input_tokens_seen": 82994080, "step": 3857, "time_per_iteration": 3.6527512073516846 }, { "auxiliary_loss_clip": 0.01134494, "auxiliary_loss_mlp": 0.0103926, "balance_loss_clip": 1.0473814, "balance_loss_mlp": 1.02128339, "epoch": 0.4638970720856129, "flos": 19060953442560.0, "grad_norm": 2.0370245072050652, "language_loss": 0.82653654, "learning_rate": 2.3295531290308733e-06, "loss": 0.84827411, "num_input_tokens_seen": 83012230, "step": 3858, "time_per_iteration": 2.621328115463257 }, { "auxiliary_loss_clip": 0.01158033, "auxiliary_loss_mlp": 0.00774354, "balance_loss_clip": 1.05348182, "balance_loss_mlp": 1.00053239, "epoch": 0.46401731497625204, "flos": 18471012468480.0, "grad_norm": 2.6449704072907427, "language_loss": 0.76014489, "learning_rate": 2.3287847802322678e-06, "loss": 0.77946877, "num_input_tokens_seen": 83027800, "step": 3859, "time_per_iteration": 2.5926010608673096 }, { "auxiliary_loss_clip": 0.01137437, "auxiliary_loss_mlp": 0.01054011, "balance_loss_clip": 1.05349016, "balance_loss_mlp": 1.03474689, "epoch": 0.4641375578668911, "flos": 26067053214720.0, "grad_norm": 2.480927810665166, "language_loss": 0.84312034, "learning_rate": 2.3280163815572723e-06, "loss": 0.86503482, "num_input_tokens_seen": 83048395, "step": 3860, "time_per_iteration": 3.704798936843872 }, { "auxiliary_loss_clip": 0.01120686, "auxiliary_loss_mlp": 0.01049777, "balance_loss_clip": 1.0463028, "balance_loss_mlp": 1.03057194, "epoch": 0.4642578007575302, "flos": 19570382081280.0, "grad_norm": 2.202250573262859, "language_loss": 0.77329099, "learning_rate": 2.3272479331224522e-06, "loss": 0.79499561, "num_input_tokens_seen": 83065825, "step": 3861, "time_per_iteration": 2.671156406402588 }, { "auxiliary_loss_clip": 0.01155142, "auxiliary_loss_mlp": 0.01048277, "balance_loss_clip": 1.05155575, "balance_loss_mlp": 1.03076482, "epoch": 0.4643780436481693, "flos": 28186249772160.0, "grad_norm": 2.003584817728215, "language_loss": 0.78172326, "learning_rate": 2.3264794350443817e-06, "loss": 0.80375749, "num_input_tokens_seen": 83087920, "step": 3862, "time_per_iteration": 2.6494152545928955 }, { "auxiliary_loss_clip": 0.01145029, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.05085504, "balance_loss_mlp": 1.02319193, "epoch": 0.46449828653880837, "flos": 25375270204800.0, "grad_norm": 1.973731472435407, "language_loss": 0.79284573, "learning_rate": 2.3257108874396396e-06, "loss": 0.8147102, "num_input_tokens_seen": 83109015, "step": 3863, "time_per_iteration": 2.654958963394165 }, { "auxiliary_loss_clip": 0.01130844, "auxiliary_loss_mlp": 0.01048334, "balance_loss_clip": 1.05038667, "balance_loss_mlp": 1.0303278, "epoch": 0.4646185294294475, "flos": 16034330574720.0, "grad_norm": 1.90651114243705, "language_loss": 0.73997736, "learning_rate": 2.3249422904248152e-06, "loss": 0.76176918, "num_input_tokens_seen": 83127450, "step": 3864, "time_per_iteration": 2.609380006790161 }, { "auxiliary_loss_clip": 0.01143584, "auxiliary_loss_mlp": 0.01040146, "balance_loss_clip": 1.05264938, "balance_loss_mlp": 1.0235883, "epoch": 0.4647387723200866, "flos": 26363101109760.0, "grad_norm": 1.422137130091704, "language_loss": 0.87149715, "learning_rate": 2.324173644116504e-06, "loss": 0.89333439, "num_input_tokens_seen": 83150300, "step": 3865, "time_per_iteration": 2.6718668937683105 }, { "auxiliary_loss_clip": 0.01141949, "auxiliary_loss_mlp": 0.01050167, "balance_loss_clip": 1.05017173, "balance_loss_mlp": 1.03010416, "epoch": 0.46485901521072565, "flos": 27160209774720.0, "grad_norm": 1.8624080080038898, "language_loss": 0.81566656, "learning_rate": 2.3234049486313087e-06, "loss": 0.83758771, "num_input_tokens_seen": 83171750, "step": 3866, "time_per_iteration": 3.5607926845550537 }, { "auxiliary_loss_clip": 0.0114015, "auxiliary_loss_mlp": 0.01040618, "balance_loss_clip": 1.0495491, "balance_loss_mlp": 1.02420306, "epoch": 0.46497925810136476, "flos": 24279851088000.0, "grad_norm": 1.9394837004443997, "language_loss": 0.75711, "learning_rate": 2.322636204085839e-06, "loss": 0.77891773, "num_input_tokens_seen": 83191820, "step": 3867, "time_per_iteration": 2.6285109519958496 }, { "auxiliary_loss_clip": 0.01120755, "auxiliary_loss_mlp": 0.01042898, "balance_loss_clip": 1.04733551, "balance_loss_mlp": 1.02433741, "epoch": 0.46509950099200387, "flos": 16253134272000.0, "grad_norm": 2.371471667418421, "language_loss": 0.78592563, "learning_rate": 2.3218674105967143e-06, "loss": 0.80756217, "num_input_tokens_seen": 83210085, "step": 3868, "time_per_iteration": 2.6550967693328857 }, { "auxiliary_loss_clip": 0.01121938, "auxiliary_loss_mlp": 0.01046759, "balance_loss_clip": 1.04631948, "balance_loss_mlp": 1.02732801, "epoch": 0.4652197438826429, "flos": 23442270773760.0, "grad_norm": 1.6114155074815653, "language_loss": 0.83575165, "learning_rate": 2.3210985682805593e-06, "loss": 0.85743862, "num_input_tokens_seen": 83231865, "step": 3869, "time_per_iteration": 2.776181697845459 }, { "auxiliary_loss_clip": 0.01160363, "auxiliary_loss_mlp": 0.01037593, "balance_loss_clip": 1.05631053, "balance_loss_mlp": 1.02004576, "epoch": 0.46533998677328203, "flos": 16216397637120.0, "grad_norm": 4.264996913026016, "language_loss": 0.67976397, "learning_rate": 2.320329677254007e-06, "loss": 0.70174354, "num_input_tokens_seen": 83249195, "step": 3870, "time_per_iteration": 2.643239974975586 }, { "auxiliary_loss_clip": 0.01152721, "auxiliary_loss_mlp": 0.01046954, "balance_loss_clip": 1.04985952, "balance_loss_mlp": 1.02728474, "epoch": 0.46546022966392114, "flos": 21141869080320.0, "grad_norm": 18.661012349682984, "language_loss": 0.72318053, "learning_rate": 2.319560737633697e-06, "loss": 0.74517733, "num_input_tokens_seen": 83267915, "step": 3871, "time_per_iteration": 2.568784475326538 }, { "auxiliary_loss_clip": 0.01122787, "auxiliary_loss_mlp": 0.01047526, "balance_loss_clip": 1.04620957, "balance_loss_mlp": 1.02896476, "epoch": 0.4655804725545602, "flos": 41171942442240.0, "grad_norm": 2.3742546662967174, "language_loss": 0.68067473, "learning_rate": 2.3187917495362775e-06, "loss": 0.70237786, "num_input_tokens_seen": 83292325, "step": 3872, "time_per_iteration": 2.8278841972351074 }, { "auxiliary_loss_clip": 0.01101319, "auxiliary_loss_mlp": 0.01046023, "balance_loss_clip": 1.04507279, "balance_loss_mlp": 1.02897596, "epoch": 0.4657007154451993, "flos": 19570956698880.0, "grad_norm": 2.399146503575341, "language_loss": 0.7670517, "learning_rate": 2.318022713078403e-06, "loss": 0.7885251, "num_input_tokens_seen": 83306905, "step": 3873, "time_per_iteration": 2.6788547039031982 }, { "auxiliary_loss_clip": 0.01126388, "auxiliary_loss_mlp": 0.01044726, "balance_loss_clip": 1.04819846, "balance_loss_mlp": 1.02598619, "epoch": 0.4658209583358384, "flos": 15517826956800.0, "grad_norm": 2.3909924151120046, "language_loss": 0.85422146, "learning_rate": 2.3172536283767354e-06, "loss": 0.87593263, "num_input_tokens_seen": 83320665, "step": 3874, "time_per_iteration": 2.610934019088745 }, { "auxiliary_loss_clip": 0.01107951, "auxiliary_loss_mlp": 0.01042692, "balance_loss_clip": 1.04479289, "balance_loss_mlp": 1.02422619, "epoch": 0.4659412012264775, "flos": 14903180403840.0, "grad_norm": 2.091449122665958, "language_loss": 0.81202799, "learning_rate": 2.3164844955479447e-06, "loss": 0.83353442, "num_input_tokens_seen": 83336475, "step": 3875, "time_per_iteration": 2.6294729709625244 }, { "auxiliary_loss_clip": 0.01107348, "auxiliary_loss_mlp": 0.01041979, "balance_loss_clip": 1.04357779, "balance_loss_mlp": 1.02413368, "epoch": 0.4660614441171166, "flos": 24425612478720.0, "grad_norm": 1.7954141647368336, "language_loss": 0.70902073, "learning_rate": 2.3157153147087082e-06, "loss": 0.73051405, "num_input_tokens_seen": 83358365, "step": 3876, "time_per_iteration": 2.7413434982299805 }, { "auxiliary_loss_clip": 0.01108631, "auxiliary_loss_mlp": 0.0104595, "balance_loss_clip": 1.04930186, "balance_loss_mlp": 1.02853358, "epoch": 0.46618168700775564, "flos": 22091095843200.0, "grad_norm": 1.7601096191686496, "language_loss": 0.83263743, "learning_rate": 2.314946085975709e-06, "loss": 0.8541832, "num_input_tokens_seen": 83377345, "step": 3877, "time_per_iteration": 2.7102835178375244 }, { "auxiliary_loss_clip": 0.01102169, "auxiliary_loss_mlp": 0.01047109, "balance_loss_clip": 1.04504216, "balance_loss_mlp": 1.02977538, "epoch": 0.46630192989839475, "flos": 26176975810560.0, "grad_norm": 1.8022082491871716, "language_loss": 0.82310337, "learning_rate": 2.3141768094656393e-06, "loss": 0.84459615, "num_input_tokens_seen": 83395920, "step": 3878, "time_per_iteration": 2.6831727027893066 }, { "auxiliary_loss_clip": 0.01087502, "auxiliary_loss_mlp": 0.01047011, "balance_loss_clip": 1.04258394, "balance_loss_mlp": 1.02848577, "epoch": 0.46642217278903386, "flos": 11509622150400.0, "grad_norm": 2.8093493880092346, "language_loss": 0.82726061, "learning_rate": 2.3134074852951966e-06, "loss": 0.84860575, "num_input_tokens_seen": 83412510, "step": 3879, "time_per_iteration": 2.7341468334198 }, { "auxiliary_loss_clip": 0.01093216, "auxiliary_loss_mlp": 0.01042925, "balance_loss_clip": 1.04000843, "balance_loss_mlp": 1.02516234, "epoch": 0.4665424156796729, "flos": 32306819299200.0, "grad_norm": 1.701550336963504, "language_loss": 0.77927768, "learning_rate": 2.312638113581088e-06, "loss": 0.80063915, "num_input_tokens_seen": 83432995, "step": 3880, "time_per_iteration": 2.747947931289673 }, { "auxiliary_loss_clip": 0.01137703, "auxiliary_loss_mlp": 0.0104988, "balance_loss_clip": 1.04678273, "balance_loss_mlp": 1.03210545, "epoch": 0.46666265857031203, "flos": 18436179254400.0, "grad_norm": 2.5415611366254947, "language_loss": 0.78187048, "learning_rate": 2.311868694440027e-06, "loss": 0.80374628, "num_input_tokens_seen": 83447415, "step": 3881, "time_per_iteration": 2.618788003921509 }, { "auxiliary_loss_clip": 0.01048775, "auxiliary_loss_mlp": 0.01002649, "balance_loss_clip": 1.01618767, "balance_loss_mlp": 1.0004077, "epoch": 0.46678290146095114, "flos": 68438989221120.0, "grad_norm": 0.7331392917615707, "language_loss": 0.62474728, "learning_rate": 2.3110992279887323e-06, "loss": 0.64526153, "num_input_tokens_seen": 83519340, "step": 3882, "time_per_iteration": 4.252048969268799 }, { "auxiliary_loss_clip": 0.0112333, "auxiliary_loss_mlp": 0.0104673, "balance_loss_clip": 1.0498929, "balance_loss_mlp": 1.02757323, "epoch": 0.4669031443515902, "flos": 17712507945600.0, "grad_norm": 2.5670067900526914, "language_loss": 0.84397268, "learning_rate": 2.310329714343932e-06, "loss": 0.8656733, "num_input_tokens_seen": 83535490, "step": 3883, "time_per_iteration": 3.5971922874450684 }, { "auxiliary_loss_clip": 0.01121252, "auxiliary_loss_mlp": 0.01040789, "balance_loss_clip": 1.04603112, "balance_loss_mlp": 1.02287185, "epoch": 0.4670233872422293, "flos": 23947748916480.0, "grad_norm": 2.0792540805038997, "language_loss": 0.81846356, "learning_rate": 2.309560153622361e-06, "loss": 0.84008396, "num_input_tokens_seen": 83552400, "step": 3884, "time_per_iteration": 2.681643486022949 }, { "auxiliary_loss_clip": 0.01118301, "auxiliary_loss_mlp": 0.01043254, "balance_loss_clip": 1.04884458, "balance_loss_mlp": 1.02551568, "epoch": 0.4671436301328684, "flos": 28111268131200.0, "grad_norm": 2.8835602664989626, "language_loss": 0.75121921, "learning_rate": 2.3087905459407602e-06, "loss": 0.77283478, "num_input_tokens_seen": 83571340, "step": 3885, "time_per_iteration": 2.789370536804199 }, { "auxiliary_loss_clip": 0.01038607, "auxiliary_loss_mlp": 0.01002925, "balance_loss_clip": 1.01511765, "balance_loss_mlp": 1.00062394, "epoch": 0.46726387302350747, "flos": 69369684566400.0, "grad_norm": 0.7911981848834398, "language_loss": 0.62870669, "learning_rate": 2.3080208914158795e-06, "loss": 0.649122, "num_input_tokens_seen": 83634340, "step": 3886, "time_per_iteration": 3.2179408073425293 }, { "auxiliary_loss_clip": 0.01122839, "auxiliary_loss_mlp": 0.01042395, "balance_loss_clip": 1.04712248, "balance_loss_mlp": 1.02441871, "epoch": 0.4673841159141466, "flos": 25519666878720.0, "grad_norm": 2.1081749935344383, "language_loss": 0.72100997, "learning_rate": 2.3072511901644753e-06, "loss": 0.74266231, "num_input_tokens_seen": 83653410, "step": 3887, "time_per_iteration": 3.9673759937286377 }, { "auxiliary_loss_clip": 0.01149593, "auxiliary_loss_mlp": 0.01040902, "balance_loss_clip": 1.05184913, "balance_loss_mlp": 1.02423632, "epoch": 0.4675043588047857, "flos": 24499265316480.0, "grad_norm": 2.0439711178839417, "language_loss": 0.80829006, "learning_rate": 2.306481442303309e-06, "loss": 0.83019495, "num_input_tokens_seen": 83672985, "step": 3888, "time_per_iteration": 2.581141233444214 }, { "auxiliary_loss_clip": 0.01136477, "auxiliary_loss_mlp": 0.01053605, "balance_loss_clip": 1.0449208, "balance_loss_mlp": 1.03624868, "epoch": 0.46762460169542475, "flos": 20960771685120.0, "grad_norm": 1.769212286518392, "language_loss": 0.73374057, "learning_rate": 2.3057116479491515e-06, "loss": 0.75564146, "num_input_tokens_seen": 83692395, "step": 3889, "time_per_iteration": 2.6072046756744385 }, { "auxiliary_loss_clip": 0.01135421, "auxiliary_loss_mlp": 0.01041851, "balance_loss_clip": 1.04583013, "balance_loss_mlp": 1.02368319, "epoch": 0.46774484458606386, "flos": 19171666137600.0, "grad_norm": 1.9305163218645363, "language_loss": 0.76122212, "learning_rate": 2.30494180721878e-06, "loss": 0.78299487, "num_input_tokens_seen": 83709735, "step": 3890, "time_per_iteration": 2.596437692642212 }, { "auxiliary_loss_clip": 0.01131999, "auxiliary_loss_mlp": 0.01045493, "balance_loss_clip": 1.04554033, "balance_loss_mlp": 1.02907825, "epoch": 0.4678650874767029, "flos": 17967689141760.0, "grad_norm": 2.233655767971074, "language_loss": 0.8944788, "learning_rate": 2.3041719202289794e-06, "loss": 0.91625375, "num_input_tokens_seen": 83725910, "step": 3891, "time_per_iteration": 2.557866334915161 }, { "auxiliary_loss_clip": 0.01137588, "auxiliary_loss_mlp": 0.01043582, "balance_loss_clip": 1.04996169, "balance_loss_mlp": 1.02658284, "epoch": 0.467985330367342, "flos": 21360816432000.0, "grad_norm": 1.8482383543914958, "language_loss": 0.80701894, "learning_rate": 2.30340198709654e-06, "loss": 0.8288306, "num_input_tokens_seen": 83745745, "step": 3892, "time_per_iteration": 2.6071624755859375 }, { "auxiliary_loss_clip": 0.01131501, "auxiliary_loss_mlp": 0.01053495, "balance_loss_clip": 1.04719353, "balance_loss_mlp": 1.03547049, "epoch": 0.46810557325798113, "flos": 20521835487360.0, "grad_norm": 2.2078609456304186, "language_loss": 0.74246854, "learning_rate": 2.3026320079382605e-06, "loss": 0.76431847, "num_input_tokens_seen": 83762680, "step": 3893, "time_per_iteration": 3.4733338356018066 }, { "auxiliary_loss_clip": 0.0114445, "auxiliary_loss_mlp": 0.0103936, "balance_loss_clip": 1.04660964, "balance_loss_mlp": 1.0235765, "epoch": 0.4682258161486202, "flos": 30117848572800.0, "grad_norm": 2.2089539452470635, "language_loss": 0.76687413, "learning_rate": 2.3018619828709454e-06, "loss": 0.78871214, "num_input_tokens_seen": 83784220, "step": 3894, "time_per_iteration": 2.609347343444824 }, { "auxiliary_loss_clip": 0.01132635, "auxiliary_loss_mlp": 0.0077299, "balance_loss_clip": 1.0465169, "balance_loss_mlp": 1.0006423, "epoch": 0.4683460590392593, "flos": 25293357239040.0, "grad_norm": 2.3585297957650275, "language_loss": 0.82189989, "learning_rate": 2.3010919120114084e-06, "loss": 0.84095615, "num_input_tokens_seen": 83800750, "step": 3895, "time_per_iteration": 2.7524359226226807 }, { "auxiliary_loss_clip": 0.01132618, "auxiliary_loss_mlp": 0.01043788, "balance_loss_clip": 1.04295135, "balance_loss_mlp": 1.02421403, "epoch": 0.4684663019298984, "flos": 15368330551680.0, "grad_norm": 2.2903472089985493, "language_loss": 0.65715444, "learning_rate": 2.3003217954764672e-06, "loss": 0.6789186, "num_input_tokens_seen": 83815455, "step": 3896, "time_per_iteration": 2.54962420463562 }, { "auxiliary_loss_clip": 0.01135612, "auxiliary_loss_mlp": 0.01047488, "balance_loss_clip": 1.04354668, "balance_loss_mlp": 1.0293448, "epoch": 0.46858654482053747, "flos": 27778842737280.0, "grad_norm": 1.8267411106295155, "language_loss": 0.79688716, "learning_rate": 2.299551633382949e-06, "loss": 0.81871819, "num_input_tokens_seen": 83835765, "step": 3897, "time_per_iteration": 2.6449995040893555 }, { "auxiliary_loss_clip": 0.01117369, "auxiliary_loss_mlp": 0.01053385, "balance_loss_clip": 1.04376459, "balance_loss_mlp": 1.03377497, "epoch": 0.4687067877111766, "flos": 18040623707520.0, "grad_norm": 2.7834742338908316, "language_loss": 0.85660875, "learning_rate": 2.2987814258476854e-06, "loss": 0.87831628, "num_input_tokens_seen": 83853565, "step": 3898, "time_per_iteration": 2.6095173358917236 }, { "auxiliary_loss_clip": 0.01103972, "auxiliary_loss_mlp": 0.01043688, "balance_loss_clip": 1.04286504, "balance_loss_mlp": 1.02555633, "epoch": 0.4688270306018157, "flos": 16977380198400.0, "grad_norm": 3.581715231789544, "language_loss": 0.68285888, "learning_rate": 2.2980111729875177e-06, "loss": 0.70433545, "num_input_tokens_seen": 83869815, "step": 3899, "time_per_iteration": 2.627295732498169 }, { "auxiliary_loss_clip": 0.01116876, "auxiliary_loss_mlp": 0.01047562, "balance_loss_clip": 1.0447185, "balance_loss_mlp": 1.03071773, "epoch": 0.46894727349245474, "flos": 17821640442240.0, "grad_norm": 1.6930527979659207, "language_loss": 0.82083452, "learning_rate": 2.2972408749192917e-06, "loss": 0.84247887, "num_input_tokens_seen": 83887545, "step": 3900, "time_per_iteration": 2.709285020828247 }, { "auxiliary_loss_clip": 0.01132924, "auxiliary_loss_mlp": 0.00773065, "balance_loss_clip": 1.04568648, "balance_loss_mlp": 1.00068581, "epoch": 0.46906751638309385, "flos": 21471349559040.0, "grad_norm": 1.7302324755458005, "language_loss": 0.66913581, "learning_rate": 2.296470531759861e-06, "loss": 0.68819577, "num_input_tokens_seen": 83905645, "step": 3901, "time_per_iteration": 2.6200430393218994 }, { "auxiliary_loss_clip": 0.01106058, "auxiliary_loss_mlp": 0.01040709, "balance_loss_clip": 1.04084527, "balance_loss_mlp": 1.02326834, "epoch": 0.46918775927373296, "flos": 20337829090560.0, "grad_norm": 2.4714094793466743, "language_loss": 0.79626584, "learning_rate": 2.2957001436260866e-06, "loss": 0.81773353, "num_input_tokens_seen": 83922705, "step": 3902, "time_per_iteration": 2.664107084274292 }, { "auxiliary_loss_clip": 0.01120774, "auxiliary_loss_mlp": 0.01051387, "balance_loss_clip": 1.04564965, "balance_loss_mlp": 1.03311265, "epoch": 0.469308002164372, "flos": 18403249461120.0, "grad_norm": 1.5696286478421198, "language_loss": 0.73310673, "learning_rate": 2.294929710634836e-06, "loss": 0.75482827, "num_input_tokens_seen": 83940795, "step": 3903, "time_per_iteration": 2.701509475708008 }, { "auxiliary_loss_clip": 0.01133966, "auxiliary_loss_mlp": 0.01042465, "balance_loss_clip": 1.0431807, "balance_loss_mlp": 1.02492952, "epoch": 0.46942824505501113, "flos": 37962067363200.0, "grad_norm": 2.3241179901018274, "language_loss": 0.61482382, "learning_rate": 2.2941592329029823e-06, "loss": 0.63658816, "num_input_tokens_seen": 83961900, "step": 3904, "time_per_iteration": 2.7421116828918457 }, { "auxiliary_loss_clip": 0.01127791, "auxiliary_loss_mlp": 0.01041041, "balance_loss_clip": 1.04266191, "balance_loss_mlp": 1.02342153, "epoch": 0.46954848794565024, "flos": 21872507627520.0, "grad_norm": 1.8577418261983294, "language_loss": 0.79427636, "learning_rate": 2.2933887105474067e-06, "loss": 0.8159647, "num_input_tokens_seen": 83980075, "step": 3905, "time_per_iteration": 2.632962226867676 }, { "auxiliary_loss_clip": 0.01133124, "auxiliary_loss_mlp": 0.01036384, "balance_loss_clip": 1.04755521, "balance_loss_mlp": 1.02048111, "epoch": 0.4696687308362893, "flos": 22016545165440.0, "grad_norm": 1.7126414032676247, "language_loss": 0.81518668, "learning_rate": 2.2926181436849974e-06, "loss": 0.83688176, "num_input_tokens_seen": 83999430, "step": 3906, "time_per_iteration": 2.6021859645843506 }, { "auxiliary_loss_clip": 0.0113544, "auxiliary_loss_mlp": 0.01040361, "balance_loss_clip": 1.04648614, "balance_loss_mlp": 1.02213371, "epoch": 0.4697889737269284, "flos": 21613663244160.0, "grad_norm": 1.8042670256999613, "language_loss": 0.72762817, "learning_rate": 2.2918475324326478e-06, "loss": 0.74938619, "num_input_tokens_seen": 84019150, "step": 3907, "time_per_iteration": 3.5530760288238525 }, { "auxiliary_loss_clip": 0.01143075, "auxiliary_loss_mlp": 0.00774015, "balance_loss_clip": 1.04813159, "balance_loss_mlp": 1.00071788, "epoch": 0.46990921661756746, "flos": 25228323665280.0, "grad_norm": 2.2109623487375596, "language_loss": 0.91400325, "learning_rate": 2.2910768769072603e-06, "loss": 0.93317419, "num_input_tokens_seen": 84037930, "step": 3908, "time_per_iteration": 2.6154704093933105 }, { "auxiliary_loss_clip": 0.01129953, "auxiliary_loss_mlp": 0.01045262, "balance_loss_clip": 1.0463953, "balance_loss_mlp": 1.02802432, "epoch": 0.47002945950820657, "flos": 13844031045120.0, "grad_norm": 1.8509827662030975, "language_loss": 0.75958419, "learning_rate": 2.2903061772257417e-06, "loss": 0.78133631, "num_input_tokens_seen": 84055915, "step": 3909, "time_per_iteration": 3.5471713542938232 }, { "auxiliary_loss_clip": 0.01133784, "auxiliary_loss_mlp": 0.01038688, "balance_loss_clip": 1.04453611, "balance_loss_mlp": 1.02165282, "epoch": 0.4701497023988457, "flos": 26247001374720.0, "grad_norm": 2.2993880677520186, "language_loss": 0.78819513, "learning_rate": 2.289535433505007e-06, "loss": 0.80991989, "num_input_tokens_seen": 84077270, "step": 3910, "time_per_iteration": 2.6238508224487305 }, { "auxiliary_loss_clip": 0.01125349, "auxiliary_loss_mlp": 0.01040942, "balance_loss_clip": 1.04550791, "balance_loss_mlp": 1.02447939, "epoch": 0.47026994528948474, "flos": 25629517647360.0, "grad_norm": 1.9777700472046993, "language_loss": 0.63666546, "learning_rate": 2.2887646458619767e-06, "loss": 0.65832841, "num_input_tokens_seen": 84098635, "step": 3911, "time_per_iteration": 2.658708333969116 }, { "auxiliary_loss_clip": 0.011213, "auxiliary_loss_mlp": 0.01049568, "balance_loss_clip": 1.04689014, "balance_loss_mlp": 1.03078032, "epoch": 0.47039018818012385, "flos": 20554406144640.0, "grad_norm": 1.9356007421606347, "language_loss": 0.76412791, "learning_rate": 2.2879938144135797e-06, "loss": 0.78583658, "num_input_tokens_seen": 84114740, "step": 3912, "time_per_iteration": 2.67437481880188 }, { "auxiliary_loss_clip": 0.01104985, "auxiliary_loss_mlp": 0.00773302, "balance_loss_clip": 1.04054558, "balance_loss_mlp": 1.00079679, "epoch": 0.47051043107076296, "flos": 21577249831680.0, "grad_norm": 2.0925539023651942, "language_loss": 0.75146782, "learning_rate": 2.2872229392767496e-06, "loss": 0.77025068, "num_input_tokens_seen": 84134845, "step": 3913, "time_per_iteration": 3.6570944786071777 }, { "auxiliary_loss_clip": 0.01142, "auxiliary_loss_mlp": 0.01044339, "balance_loss_clip": 1.04962933, "balance_loss_mlp": 1.02631426, "epoch": 0.470630673961402, "flos": 18953185662720.0, "grad_norm": 2.3086633696636647, "language_loss": 0.74960184, "learning_rate": 2.286452020568428e-06, "loss": 0.77146524, "num_input_tokens_seen": 84152920, "step": 3914, "time_per_iteration": 2.598562240600586 }, { "auxiliary_loss_clip": 0.01155218, "auxiliary_loss_mlp": 0.0105553, "balance_loss_clip": 1.04884565, "balance_loss_mlp": 1.03757715, "epoch": 0.4707509168520411, "flos": 19938969492480.0, "grad_norm": 3.0117538570836255, "language_loss": 0.73393828, "learning_rate": 2.2856810584055637e-06, "loss": 0.75604582, "num_input_tokens_seen": 84170455, "step": 3915, "time_per_iteration": 2.5611164569854736 }, { "auxiliary_loss_clip": 0.01137531, "auxiliary_loss_mlp": 0.01046078, "balance_loss_clip": 1.0472641, "balance_loss_mlp": 1.02888775, "epoch": 0.47087115974268023, "flos": 40118754741120.0, "grad_norm": 1.5315249039425058, "language_loss": 0.67974871, "learning_rate": 2.2849100529051085e-06, "loss": 0.70158482, "num_input_tokens_seen": 84197390, "step": 3916, "time_per_iteration": 2.760658025741577 }, { "auxiliary_loss_clip": 0.01148668, "auxiliary_loss_mlp": 0.01046587, "balance_loss_clip": 1.04897261, "balance_loss_mlp": 1.0300411, "epoch": 0.4709914026333193, "flos": 13552723745280.0, "grad_norm": 13.229363336443562, "language_loss": 0.80632472, "learning_rate": 2.284139004184026e-06, "loss": 0.82827735, "num_input_tokens_seen": 84214620, "step": 3917, "time_per_iteration": 2.5663962364196777 }, { "auxiliary_loss_clip": 0.01149402, "auxiliary_loss_mlp": 0.01040767, "balance_loss_clip": 1.04772663, "balance_loss_mlp": 1.02239716, "epoch": 0.4711116455239584, "flos": 19974628719360.0, "grad_norm": 2.284150768146321, "language_loss": 0.74914706, "learning_rate": 2.2833679123592814e-06, "loss": 0.77104867, "num_input_tokens_seen": 84231880, "step": 3918, "time_per_iteration": 2.5512990951538086 }, { "auxiliary_loss_clip": 0.0112655, "auxiliary_loss_mlp": 0.01042541, "balance_loss_clip": 1.04838753, "balance_loss_mlp": 1.02511251, "epoch": 0.4712318884145975, "flos": 32124824064000.0, "grad_norm": 1.909936643522893, "language_loss": 0.63719332, "learning_rate": 2.2825967775478508e-06, "loss": 0.65888417, "num_input_tokens_seen": 84252980, "step": 3919, "time_per_iteration": 3.5433409214019775 }, { "auxiliary_loss_clip": 0.01149197, "auxiliary_loss_mlp": 0.01042619, "balance_loss_clip": 1.04771817, "balance_loss_mlp": 1.02390337, "epoch": 0.47135213130523657, "flos": 20047850593920.0, "grad_norm": 2.3098867471868205, "language_loss": 0.83385307, "learning_rate": 2.2818255998667135e-06, "loss": 0.85577118, "num_input_tokens_seen": 84271490, "step": 3920, "time_per_iteration": 2.5490238666534424 }, { "auxiliary_loss_clip": 0.01135902, "auxiliary_loss_mlp": 0.01039225, "balance_loss_clip": 1.04725063, "balance_loss_mlp": 1.02275038, "epoch": 0.4714723741958757, "flos": 19426990988160.0, "grad_norm": 1.881707084451802, "language_loss": 0.79069215, "learning_rate": 2.2810543794328566e-06, "loss": 0.81244338, "num_input_tokens_seen": 84290525, "step": 3921, "time_per_iteration": 2.5761983394622803 }, { "auxiliary_loss_clip": 0.01141694, "auxiliary_loss_mlp": 0.0105001, "balance_loss_clip": 1.0479902, "balance_loss_mlp": 1.03136563, "epoch": 0.4715926170865148, "flos": 20373883367040.0, "grad_norm": 2.0333734712038654, "language_loss": 0.82714611, "learning_rate": 2.2802831163632735e-06, "loss": 0.84906316, "num_input_tokens_seen": 84309245, "step": 3922, "time_per_iteration": 2.6006557941436768 }, { "auxiliary_loss_clip": 0.0109352, "auxiliary_loss_mlp": 0.01046171, "balance_loss_clip": 1.04498386, "balance_loss_mlp": 1.02671576, "epoch": 0.47171285997715384, "flos": 22672884430080.0, "grad_norm": 1.6342349133479472, "language_loss": 0.74317622, "learning_rate": 2.279511810774965e-06, "loss": 0.7645731, "num_input_tokens_seen": 84330775, "step": 3923, "time_per_iteration": 2.7375893592834473 }, { "auxiliary_loss_clip": 0.01150013, "auxiliary_loss_mlp": 0.01047161, "balance_loss_clip": 1.04856253, "balance_loss_mlp": 1.02956569, "epoch": 0.47183310286779295, "flos": 21105419754240.0, "grad_norm": 2.173701961225674, "language_loss": 0.71694666, "learning_rate": 2.2787404627849364e-06, "loss": 0.73891842, "num_input_tokens_seen": 84349985, "step": 3924, "time_per_iteration": 2.5637896060943604 }, { "auxiliary_loss_clip": 0.0112025, "auxiliary_loss_mlp": 0.01038019, "balance_loss_clip": 1.04258275, "balance_loss_mlp": 1.0206145, "epoch": 0.471953345758432, "flos": 21726566668800.0, "grad_norm": 1.6971715252797603, "language_loss": 0.78764945, "learning_rate": 2.277969072510202e-06, "loss": 0.80923223, "num_input_tokens_seen": 84368965, "step": 3925, "time_per_iteration": 2.616659641265869 }, { "auxiliary_loss_clip": 0.01123101, "auxiliary_loss_mlp": 0.01042184, "balance_loss_clip": 1.04626703, "balance_loss_mlp": 1.02591217, "epoch": 0.4720735886490711, "flos": 19861078849920.0, "grad_norm": 1.6784875614509007, "language_loss": 0.81456923, "learning_rate": 2.2771976400677803e-06, "loss": 0.83622205, "num_input_tokens_seen": 84387795, "step": 3926, "time_per_iteration": 2.600771903991699 }, { "auxiliary_loss_clip": 0.01087293, "auxiliary_loss_mlp": 0.01036714, "balance_loss_clip": 1.03984308, "balance_loss_mlp": 1.01952994, "epoch": 0.47219383153971023, "flos": 19171809792000.0, "grad_norm": 1.6791661663197492, "language_loss": 0.78887588, "learning_rate": 2.2764261655746965e-06, "loss": 0.81011593, "num_input_tokens_seen": 84405290, "step": 3927, "time_per_iteration": 2.673062801361084 }, { "auxiliary_loss_clip": 0.01107669, "auxiliary_loss_mlp": 0.01047542, "balance_loss_clip": 1.04189062, "balance_loss_mlp": 1.02771759, "epoch": 0.4723140744303493, "flos": 23224005780480.0, "grad_norm": 1.6602570391908729, "language_loss": 0.75926793, "learning_rate": 2.2756546491479832e-06, "loss": 0.78082001, "num_input_tokens_seen": 84426205, "step": 3928, "time_per_iteration": 2.669642210006714 }, { "auxiliary_loss_clip": 0.01149485, "auxiliary_loss_mlp": 0.00773292, "balance_loss_clip": 1.04596663, "balance_loss_mlp": 1.00072587, "epoch": 0.4724343173209884, "flos": 18223265387520.0, "grad_norm": 2.7190098015736743, "language_loss": 0.80206001, "learning_rate": 2.274883090904679e-06, "loss": 0.82128781, "num_input_tokens_seen": 84443970, "step": 3929, "time_per_iteration": 2.5149688720703125 }, { "auxiliary_loss_clip": 0.01151764, "auxiliary_loss_mlp": 0.01049087, "balance_loss_clip": 1.04882491, "balance_loss_mlp": 1.03152704, "epoch": 0.4725545602116275, "flos": 21251037490560.0, "grad_norm": 3.9781293164140257, "language_loss": 0.67522985, "learning_rate": 2.2741114909618283e-06, "loss": 0.69723839, "num_input_tokens_seen": 84459865, "step": 3930, "time_per_iteration": 2.5508363246917725 }, { "auxiliary_loss_clip": 0.0111215, "auxiliary_loss_mlp": 0.01043687, "balance_loss_clip": 1.04404497, "balance_loss_mlp": 1.02661633, "epoch": 0.47267480310226656, "flos": 21434002392960.0, "grad_norm": 1.7801493687331902, "language_loss": 0.72094643, "learning_rate": 2.2733398494364828e-06, "loss": 0.74250484, "num_input_tokens_seen": 84479110, "step": 3931, "time_per_iteration": 2.6446902751922607 }, { "auxiliary_loss_clip": 0.01112718, "auxiliary_loss_mlp": 0.01039639, "balance_loss_clip": 1.0438323, "balance_loss_mlp": 1.02258062, "epoch": 0.47279504599290567, "flos": 18770508069120.0, "grad_norm": 2.789420438866209, "language_loss": 0.84553158, "learning_rate": 2.272568166445699e-06, "loss": 0.86705506, "num_input_tokens_seen": 84497675, "step": 3932, "time_per_iteration": 2.575514316558838 }, { "auxiliary_loss_clip": 0.01136853, "auxiliary_loss_mlp": 0.01045871, "balance_loss_clip": 1.04799366, "balance_loss_mlp": 1.02771544, "epoch": 0.4729152888835448, "flos": 21105742976640.0, "grad_norm": 1.839943752685134, "language_loss": 0.643076, "learning_rate": 2.271796442106541e-06, "loss": 0.66490328, "num_input_tokens_seen": 84517030, "step": 3933, "time_per_iteration": 3.560300588607788 }, { "auxiliary_loss_clip": 0.0101421, "auxiliary_loss_mlp": 0.01013629, "balance_loss_clip": 1.01218748, "balance_loss_mlp": 1.0116266, "epoch": 0.47303553177418384, "flos": 70201877840640.0, "grad_norm": 0.8113175989883751, "language_loss": 0.56492269, "learning_rate": 2.271024676536079e-06, "loss": 0.58520114, "num_input_tokens_seen": 84577290, "step": 3934, "time_per_iteration": 3.2008087635040283 }, { "auxiliary_loss_clip": 0.01131369, "auxiliary_loss_mlp": 0.01054755, "balance_loss_clip": 1.04943061, "balance_loss_mlp": 1.0361824, "epoch": 0.47315577466482295, "flos": 22455122227200.0, "grad_norm": 2.168367898638804, "language_loss": 0.73116165, "learning_rate": 2.2702528698513894e-06, "loss": 0.75302291, "num_input_tokens_seen": 84598415, "step": 3935, "time_per_iteration": 2.631251573562622 }, { "auxiliary_loss_clip": 0.01122179, "auxiliary_loss_mlp": 0.01043987, "balance_loss_clip": 1.04091334, "balance_loss_mlp": 1.02400732, "epoch": 0.47327601755546206, "flos": 24352857480960.0, "grad_norm": 1.9452517577327004, "language_loss": 0.78892398, "learning_rate": 2.269481022169554e-06, "loss": 0.81058562, "num_input_tokens_seen": 84617010, "step": 3936, "time_per_iteration": 3.497650384902954 }, { "auxiliary_loss_clip": 0.01132925, "auxiliary_loss_mlp": 0.01044807, "balance_loss_clip": 1.04785967, "balance_loss_mlp": 1.02575696, "epoch": 0.4733962604461011, "flos": 22926772736640.0, "grad_norm": 1.9850614036086003, "language_loss": 0.80857342, "learning_rate": 2.2687091336076614e-06, "loss": 0.83035076, "num_input_tokens_seen": 84636350, "step": 3937, "time_per_iteration": 2.670492172241211 }, { "auxiliary_loss_clip": 0.01134895, "auxiliary_loss_mlp": 0.0104272, "balance_loss_clip": 1.04724789, "balance_loss_mlp": 1.02536368, "epoch": 0.4735165033367402, "flos": 18327369980160.0, "grad_norm": 1.8604340250734248, "language_loss": 0.79862309, "learning_rate": 2.267937204282807e-06, "loss": 0.82039928, "num_input_tokens_seen": 84653490, "step": 3938, "time_per_iteration": 2.5596041679382324 }, { "auxiliary_loss_clip": 0.01148424, "auxiliary_loss_mlp": 0.01048286, "balance_loss_clip": 1.04963112, "balance_loss_mlp": 1.02922487, "epoch": 0.4736367462273793, "flos": 23037018554880.0, "grad_norm": 2.106170266368564, "language_loss": 0.78621292, "learning_rate": 2.2671652343120926e-06, "loss": 0.80818003, "num_input_tokens_seen": 84673965, "step": 3939, "time_per_iteration": 3.5994226932525635 }, { "auxiliary_loss_clip": 0.01149822, "auxiliary_loss_mlp": 0.01041532, "balance_loss_clip": 1.04958642, "balance_loss_mlp": 1.02424669, "epoch": 0.4737569891180184, "flos": 25374336451200.0, "grad_norm": 1.6510751846404446, "language_loss": 0.80655974, "learning_rate": 2.2663932238126236e-06, "loss": 0.82847333, "num_input_tokens_seen": 84692525, "step": 3940, "time_per_iteration": 2.5690858364105225 }, { "auxiliary_loss_clip": 0.01136289, "auxiliary_loss_mlp": 0.01042029, "balance_loss_clip": 1.04546881, "balance_loss_mlp": 1.02467179, "epoch": 0.4738772320086575, "flos": 25849326925440.0, "grad_norm": 2.014583409488887, "language_loss": 0.80454254, "learning_rate": 2.265621172901515e-06, "loss": 0.82632571, "num_input_tokens_seen": 84715640, "step": 3941, "time_per_iteration": 2.6323416233062744 }, { "auxiliary_loss_clip": 0.01155246, "auxiliary_loss_mlp": 0.01053482, "balance_loss_clip": 1.05155647, "balance_loss_mlp": 1.03567243, "epoch": 0.47399747489929656, "flos": 27564420499200.0, "grad_norm": 2.5024894214607594, "language_loss": 0.71471894, "learning_rate": 2.2648490816958854e-06, "loss": 0.73680627, "num_input_tokens_seen": 84736635, "step": 3942, "time_per_iteration": 2.6114702224731445 }, { "auxiliary_loss_clip": 0.01137112, "auxiliary_loss_mlp": 0.01052468, "balance_loss_clip": 1.04501963, "balance_loss_mlp": 1.03238165, "epoch": 0.47411771778993567, "flos": 24863650836480.0, "grad_norm": 3.1052725983977765, "language_loss": 0.73582238, "learning_rate": 2.264076950312861e-06, "loss": 0.75771821, "num_input_tokens_seen": 84755445, "step": 3943, "time_per_iteration": 2.621593713760376 }, { "auxiliary_loss_clip": 0.01133676, "auxiliary_loss_mlp": 0.01042468, "balance_loss_clip": 1.04704523, "balance_loss_mlp": 1.02430081, "epoch": 0.4742379606805748, "flos": 22748009725440.0, "grad_norm": 2.0786690012113946, "language_loss": 0.82550174, "learning_rate": 2.2633047788695727e-06, "loss": 0.84726316, "num_input_tokens_seen": 84775750, "step": 3944, "time_per_iteration": 2.8110506534576416 }, { "auxiliary_loss_clip": 0.01122855, "auxiliary_loss_mlp": 0.0104397, "balance_loss_clip": 1.04679847, "balance_loss_mlp": 1.02687562, "epoch": 0.47435820357121383, "flos": 19681130689920.0, "grad_norm": 2.1514118102061595, "language_loss": 0.64535546, "learning_rate": 2.262532567483159e-06, "loss": 0.66702366, "num_input_tokens_seen": 84794310, "step": 3945, "time_per_iteration": 3.5205280780792236 }, { "auxiliary_loss_clip": 0.01152083, "auxiliary_loss_mlp": 0.00773716, "balance_loss_clip": 1.0496825, "balance_loss_mlp": 1.00079083, "epoch": 0.47447844646185294, "flos": 25228718714880.0, "grad_norm": 2.123148976431767, "language_loss": 0.80017972, "learning_rate": 2.2617603162707635e-06, "loss": 0.81943774, "num_input_tokens_seen": 84814720, "step": 3946, "time_per_iteration": 2.5752549171447754 }, { "auxiliary_loss_clip": 0.0114941, "auxiliary_loss_mlp": 0.01050545, "balance_loss_clip": 1.04945421, "balance_loss_mlp": 1.0330689, "epoch": 0.47459868935249205, "flos": 24570619683840.0, "grad_norm": 1.7022083477037462, "language_loss": 0.82610941, "learning_rate": 2.2609880253495363e-06, "loss": 0.84810889, "num_input_tokens_seen": 84834355, "step": 3947, "time_per_iteration": 2.5670325756073 }, { "auxiliary_loss_clip": 0.0111926, "auxiliary_loss_mlp": 0.01045248, "balance_loss_clip": 1.04541993, "balance_loss_mlp": 1.02634156, "epoch": 0.4747189322431311, "flos": 20558500295040.0, "grad_norm": 2.107985903490038, "language_loss": 0.86711353, "learning_rate": 2.260215694836633e-06, "loss": 0.8887586, "num_input_tokens_seen": 84853530, "step": 3948, "time_per_iteration": 2.657517433166504 }, { "auxiliary_loss_clip": 0.0110275, "auxiliary_loss_mlp": 0.0077423, "balance_loss_clip": 1.04223704, "balance_loss_mlp": 1.00071812, "epoch": 0.4748391751337702, "flos": 25995231970560.0, "grad_norm": 1.871582132908591, "language_loss": 0.64848924, "learning_rate": 2.2594433248492157e-06, "loss": 0.66725898, "num_input_tokens_seen": 84872505, "step": 3949, "time_per_iteration": 2.7416110038757324 }, { "auxiliary_loss_clip": 0.01141749, "auxiliary_loss_mlp": 0.01045012, "balance_loss_clip": 1.04744279, "balance_loss_mlp": 1.02765512, "epoch": 0.47495941802440933, "flos": 22821052032000.0, "grad_norm": 2.4507897353431067, "language_loss": 0.80508804, "learning_rate": 2.2586709155044527e-06, "loss": 0.82695568, "num_input_tokens_seen": 84893105, "step": 3950, "time_per_iteration": 2.604870557785034 }, { "auxiliary_loss_clip": 0.01151425, "auxiliary_loss_mlp": 0.0105352, "balance_loss_clip": 1.04862952, "balance_loss_mlp": 1.03536487, "epoch": 0.4750796609150484, "flos": 27891782075520.0, "grad_norm": 1.520292579657794, "language_loss": 0.76072025, "learning_rate": 2.2578984669195167e-06, "loss": 0.78276974, "num_input_tokens_seen": 84914070, "step": 3951, "time_per_iteration": 2.6030972003936768 }, { "auxiliary_loss_clip": 0.0113438, "auxiliary_loss_mlp": 0.01043965, "balance_loss_clip": 1.04541636, "balance_loss_mlp": 1.02675092, "epoch": 0.4751999038056875, "flos": 35660085471360.0, "grad_norm": 1.8390904646559139, "language_loss": 0.67735052, "learning_rate": 2.2571259792115887e-06, "loss": 0.69913399, "num_input_tokens_seen": 84935290, "step": 3952, "time_per_iteration": 2.689383029937744 }, { "auxiliary_loss_clip": 0.01133725, "auxiliary_loss_mlp": 0.01042572, "balance_loss_clip": 1.04822552, "balance_loss_mlp": 1.02572823, "epoch": 0.4753201466963266, "flos": 22090880361600.0, "grad_norm": 1.7983457298249124, "language_loss": 0.79521263, "learning_rate": 2.2563534524978544e-06, "loss": 0.81697565, "num_input_tokens_seen": 84952760, "step": 3953, "time_per_iteration": 2.584559679031372 }, { "auxiliary_loss_clip": 0.01104565, "auxiliary_loss_mlp": 0.01050249, "balance_loss_clip": 1.04368424, "balance_loss_mlp": 1.03265381, "epoch": 0.47544038958696566, "flos": 30190854965760.0, "grad_norm": 1.5952855518858942, "language_loss": 0.70488632, "learning_rate": 2.2555808868955052e-06, "loss": 0.72643441, "num_input_tokens_seen": 84974890, "step": 3954, "time_per_iteration": 2.7257497310638428 }, { "auxiliary_loss_clip": 0.01100906, "auxiliary_loss_mlp": 0.01055936, "balance_loss_clip": 1.04363239, "balance_loss_mlp": 1.03627884, "epoch": 0.47556063247760477, "flos": 23472219738240.0, "grad_norm": 2.453697389000792, "language_loss": 0.73876452, "learning_rate": 2.254808282521738e-06, "loss": 0.76033294, "num_input_tokens_seen": 84993640, "step": 3955, "time_per_iteration": 2.6807196140289307 }, { "auxiliary_loss_clip": 0.0111673, "auxiliary_loss_mlp": 0.00772949, "balance_loss_clip": 1.04633772, "balance_loss_mlp": 1.00084722, "epoch": 0.4756808753682438, "flos": 25155209531520.0, "grad_norm": 2.65327553211622, "language_loss": 0.81069553, "learning_rate": 2.2540356394937573e-06, "loss": 0.82959223, "num_input_tokens_seen": 85012340, "step": 3956, "time_per_iteration": 2.673907995223999 }, { "auxiliary_loss_clip": 0.01119671, "auxiliary_loss_mlp": 0.0105006, "balance_loss_clip": 1.04623282, "balance_loss_mlp": 1.03052163, "epoch": 0.47580111825888294, "flos": 15669729573120.0, "grad_norm": 2.8343867322222764, "language_loss": 0.84162366, "learning_rate": 2.253262957928772e-06, "loss": 0.86332095, "num_input_tokens_seen": 85029225, "step": 3957, "time_per_iteration": 2.630631685256958 }, { "auxiliary_loss_clip": 0.01120409, "auxiliary_loss_mlp": 0.01042633, "balance_loss_clip": 1.04497445, "balance_loss_mlp": 1.02366662, "epoch": 0.47592136114952205, "flos": 17636556637440.0, "grad_norm": 1.8085307043496328, "language_loss": 0.71929777, "learning_rate": 2.2524902379439976e-06, "loss": 0.74092811, "num_input_tokens_seen": 85047895, "step": 3958, "time_per_iteration": 2.5922155380249023 }, { "auxiliary_loss_clip": 0.00990575, "auxiliary_loss_mlp": 0.01000309, "balance_loss_clip": 1.01124871, "balance_loss_mlp": 0.99798411, "epoch": 0.4760416040401611, "flos": 61417159292160.0, "grad_norm": 0.74132691204479, "language_loss": 0.63687563, "learning_rate": 2.251717479656655e-06, "loss": 0.65678453, "num_input_tokens_seen": 85112690, "step": 3959, "time_per_iteration": 4.485357284545898 }, { "auxiliary_loss_clip": 0.01152411, "auxiliary_loss_mlp": 0.01048163, "balance_loss_clip": 1.04758501, "balance_loss_mlp": 1.03060317, "epoch": 0.4761618469308002, "flos": 18405871153920.0, "grad_norm": 1.9008816358123546, "language_loss": 0.76208901, "learning_rate": 2.2509446831839704e-06, "loss": 0.78409475, "num_input_tokens_seen": 85132130, "step": 3960, "time_per_iteration": 2.8838844299316406 }, { "auxiliary_loss_clip": 0.01129122, "auxiliary_loss_mlp": 0.01048034, "balance_loss_clip": 1.04532373, "balance_loss_mlp": 1.02970004, "epoch": 0.4762820898214393, "flos": 18040911016320.0, "grad_norm": 5.558825032781662, "language_loss": 0.81856006, "learning_rate": 2.250171848643177e-06, "loss": 0.84033167, "num_input_tokens_seen": 85149420, "step": 3961, "time_per_iteration": 2.6509361267089844 }, { "auxiliary_loss_clip": 0.01121928, "auxiliary_loss_mlp": 0.0104621, "balance_loss_clip": 1.04533303, "balance_loss_mlp": 1.0289607, "epoch": 0.4764023327120784, "flos": 19318253541120.0, "grad_norm": 1.8561604092642032, "language_loss": 0.8603524, "learning_rate": 2.249398976151513e-06, "loss": 0.88203377, "num_input_tokens_seen": 85166970, "step": 3962, "time_per_iteration": 3.5857908725738525 }, { "auxiliary_loss_clip": 0.01146017, "auxiliary_loss_mlp": 0.01049435, "balance_loss_clip": 1.04546463, "balance_loss_mlp": 1.03191102, "epoch": 0.4765225756027175, "flos": 22747255539840.0, "grad_norm": 3.0898720415552305, "language_loss": 0.78022128, "learning_rate": 2.248626065826223e-06, "loss": 0.80217582, "num_input_tokens_seen": 85185175, "step": 3963, "time_per_iteration": 2.5326712131500244 }, { "auxiliary_loss_clip": 0.01043848, "auxiliary_loss_mlp": 0.01001613, "balance_loss_clip": 1.01142955, "balance_loss_mlp": 0.99933589, "epoch": 0.4766428184933566, "flos": 65933392106880.0, "grad_norm": 0.7609863347088228, "language_loss": 0.62533575, "learning_rate": 2.2478531177845564e-06, "loss": 0.6457904, "num_input_tokens_seen": 85246170, "step": 3964, "time_per_iteration": 3.1295135021209717 }, { "auxiliary_loss_clip": 0.01129538, "auxiliary_loss_mlp": 0.01042402, "balance_loss_clip": 1.04803729, "balance_loss_mlp": 1.0261066, "epoch": 0.47676306138399566, "flos": 24136495908480.0, "grad_norm": 2.1515866642903156, "language_loss": 0.85042304, "learning_rate": 2.247080132143769e-06, "loss": 0.87214243, "num_input_tokens_seen": 85268525, "step": 3965, "time_per_iteration": 3.779629707336426 }, { "auxiliary_loss_clip": 0.01112286, "auxiliary_loss_mlp": 0.01050178, "balance_loss_clip": 1.04230368, "balance_loss_mlp": 1.03111625, "epoch": 0.47688330427463477, "flos": 12604322995200.0, "grad_norm": 2.498675204784412, "language_loss": 0.69279122, "learning_rate": 2.246307109021121e-06, "loss": 0.71441579, "num_input_tokens_seen": 85285930, "step": 3966, "time_per_iteration": 2.592998504638672 }, { "auxiliary_loss_clip": 0.01116678, "auxiliary_loss_mlp": 0.01041964, "balance_loss_clip": 1.04278517, "balance_loss_mlp": 1.02455974, "epoch": 0.4770035471652739, "flos": 21390585828480.0, "grad_norm": 1.6796295212201366, "language_loss": 0.82161856, "learning_rate": 2.2455340485338817e-06, "loss": 0.84320498, "num_input_tokens_seen": 85303565, "step": 3967, "time_per_iteration": 2.6563374996185303 }, { "auxiliary_loss_clip": 0.01139783, "auxiliary_loss_mlp": 0.01042665, "balance_loss_clip": 1.04815364, "balance_loss_mlp": 1.02472436, "epoch": 0.47712379005591293, "flos": 25156251025920.0, "grad_norm": 2.3877811546488648, "language_loss": 0.67722058, "learning_rate": 2.244760950799322e-06, "loss": 0.69904506, "num_input_tokens_seen": 85321835, "step": 3968, "time_per_iteration": 2.633635997772217 }, { "auxiliary_loss_clip": 0.01097532, "auxiliary_loss_mlp": 0.01044535, "balance_loss_clip": 1.04191971, "balance_loss_mlp": 1.02697539, "epoch": 0.47724403294655204, "flos": 22054323294720.0, "grad_norm": 2.7616672929812256, "language_loss": 0.72704637, "learning_rate": 2.2439878159347203e-06, "loss": 0.74846709, "num_input_tokens_seen": 85341260, "step": 3969, "time_per_iteration": 2.6395115852355957 }, { "auxiliary_loss_clip": 0.01043846, "auxiliary_loss_mlp": 0.01003659, "balance_loss_clip": 1.01122665, "balance_loss_mlp": 1.0014534, "epoch": 0.4773642758371911, "flos": 70229387658240.0, "grad_norm": 0.7321266075093209, "language_loss": 0.55271226, "learning_rate": 2.2432146440573616e-06, "loss": 0.57318729, "num_input_tokens_seen": 85407220, "step": 3970, "time_per_iteration": 3.2520971298217773 }, { "auxiliary_loss_clip": 0.01121765, "auxiliary_loss_mlp": 0.01035681, "balance_loss_clip": 1.04563427, "balance_loss_mlp": 1.019063, "epoch": 0.4774845187278302, "flos": 23548602009600.0, "grad_norm": 2.463439664041334, "language_loss": 0.66709626, "learning_rate": 2.242441435284534e-06, "loss": 0.68867075, "num_input_tokens_seen": 85426095, "step": 3971, "time_per_iteration": 3.875798225402832 }, { "auxiliary_loss_clip": 0.01138567, "auxiliary_loss_mlp": 0.0105198, "balance_loss_clip": 1.04672337, "balance_loss_mlp": 1.03322792, "epoch": 0.4776047616184693, "flos": 23075371301760.0, "grad_norm": 2.4511154063524323, "language_loss": 0.85251111, "learning_rate": 2.2416681897335337e-06, "loss": 0.87441659, "num_input_tokens_seen": 85444245, "step": 3972, "time_per_iteration": 2.6110730171203613 }, { "auxiliary_loss_clip": 0.01098865, "auxiliary_loss_mlp": 0.01043372, "balance_loss_clip": 1.04299664, "balance_loss_mlp": 1.02534795, "epoch": 0.4777250045091084, "flos": 31898119374720.0, "grad_norm": 1.8681285733659976, "language_loss": 0.67050201, "learning_rate": 2.240894907521661e-06, "loss": 0.69192433, "num_input_tokens_seen": 85463325, "step": 3973, "time_per_iteration": 2.782430410385132 }, { "auxiliary_loss_clip": 0.01125486, "auxiliary_loss_mlp": 0.01043197, "balance_loss_clip": 1.04509878, "balance_loss_mlp": 1.02524364, "epoch": 0.4778452473997475, "flos": 24278163148800.0, "grad_norm": 2.0615078242836526, "language_loss": 0.63922071, "learning_rate": 2.240121588766223e-06, "loss": 0.66090751, "num_input_tokens_seen": 85483375, "step": 3974, "time_per_iteration": 2.6085574626922607 }, { "auxiliary_loss_clip": 0.01120959, "auxiliary_loss_mlp": 0.01043664, "balance_loss_clip": 1.04569817, "balance_loss_mlp": 1.02692676, "epoch": 0.4779654902903866, "flos": 31575031516800.0, "grad_norm": 2.1198983564311105, "language_loss": 0.71352345, "learning_rate": 2.239348233584531e-06, "loss": 0.73516965, "num_input_tokens_seen": 85504230, "step": 3975, "time_per_iteration": 2.7021753787994385 }, { "auxiliary_loss_clip": 0.01139488, "auxiliary_loss_mlp": 0.01047336, "balance_loss_clip": 1.04872501, "balance_loss_mlp": 1.0309689, "epoch": 0.47808573318102565, "flos": 19500428344320.0, "grad_norm": 1.87710531057277, "language_loss": 0.81042975, "learning_rate": 2.2385748420939013e-06, "loss": 0.83229798, "num_input_tokens_seen": 85523425, "step": 3976, "time_per_iteration": 2.6610069274902344 }, { "auxiliary_loss_clip": 0.01149255, "auxiliary_loss_mlp": 0.01038093, "balance_loss_clip": 1.05089962, "balance_loss_mlp": 1.02188015, "epoch": 0.47820597607166476, "flos": 22601135013120.0, "grad_norm": 2.4745804709785393, "language_loss": 0.72241402, "learning_rate": 2.2378014144116583e-06, "loss": 0.74428749, "num_input_tokens_seen": 85542235, "step": 3977, "time_per_iteration": 2.5691757202148438 }, { "auxiliary_loss_clip": 0.01153145, "auxiliary_loss_mlp": 0.01043328, "balance_loss_clip": 1.0490427, "balance_loss_mlp": 1.02563763, "epoch": 0.4783262189623039, "flos": 23003011353600.0, "grad_norm": 2.0059847110177036, "language_loss": 0.79813361, "learning_rate": 2.23702795065513e-06, "loss": 0.82009834, "num_input_tokens_seen": 85561815, "step": 3978, "time_per_iteration": 2.5715267658233643 }, { "auxiliary_loss_clip": 0.01037977, "auxiliary_loss_mlp": 0.01008047, "balance_loss_clip": 1.01467109, "balance_loss_mlp": 1.00579441, "epoch": 0.47844646185294293, "flos": 49772801226240.0, "grad_norm": 0.9744001015978875, "language_loss": 0.67435718, "learning_rate": 2.2362544509416493e-06, "loss": 0.69481742, "num_input_tokens_seen": 85613930, "step": 3979, "time_per_iteration": 3.043320655822754 }, { "auxiliary_loss_clip": 0.0111373, "auxiliary_loss_mlp": 0.01041464, "balance_loss_clip": 1.04057276, "balance_loss_mlp": 1.02291489, "epoch": 0.47856670474358204, "flos": 20229558520320.0, "grad_norm": 2.579665294774492, "language_loss": 0.82742548, "learning_rate": 2.2354809153885572e-06, "loss": 0.84897739, "num_input_tokens_seen": 85631000, "step": 3980, "time_per_iteration": 2.6914994716644287 }, { "auxiliary_loss_clip": 0.01137895, "auxiliary_loss_mlp": 0.01041274, "balance_loss_clip": 1.04754472, "balance_loss_mlp": 1.02363122, "epoch": 0.47868694763422115, "flos": 20990936131200.0, "grad_norm": 3.9963313757577734, "language_loss": 0.83292031, "learning_rate": 2.234707344113197e-06, "loss": 0.85471201, "num_input_tokens_seen": 85649095, "step": 3981, "time_per_iteration": 2.5997979640960693 }, { "auxiliary_loss_clip": 0.01148883, "auxiliary_loss_mlp": 0.01046128, "balance_loss_clip": 1.04915226, "balance_loss_mlp": 1.02812731, "epoch": 0.4788071905248602, "flos": 19026551191680.0, "grad_norm": 2.718078142821131, "language_loss": 0.77871561, "learning_rate": 2.233933737232919e-06, "loss": 0.80066568, "num_input_tokens_seen": 85666875, "step": 3982, "time_per_iteration": 2.529777765274048 }, { "auxiliary_loss_clip": 0.01090156, "auxiliary_loss_mlp": 0.00775429, "balance_loss_clip": 1.03949821, "balance_loss_mlp": 1.0007503, "epoch": 0.4789274334154993, "flos": 23002221254400.0, "grad_norm": 1.8259803204809817, "language_loss": 0.78441364, "learning_rate": 2.2331600948650793e-06, "loss": 0.80306947, "num_input_tokens_seen": 85687020, "step": 3983, "time_per_iteration": 2.7532482147216797 }, { "auxiliary_loss_clip": 0.01099488, "auxiliary_loss_mlp": 0.00775337, "balance_loss_clip": 1.04244041, "balance_loss_mlp": 1.00068092, "epoch": 0.4790476763061384, "flos": 23075586783360.0, "grad_norm": 1.4634262682579344, "language_loss": 0.80299723, "learning_rate": 2.2323864171270386e-06, "loss": 0.82174551, "num_input_tokens_seen": 85708290, "step": 3984, "time_per_iteration": 2.689760208129883 }, { "auxiliary_loss_clip": 0.01114923, "auxiliary_loss_mlp": 0.01049157, "balance_loss_clip": 1.04472518, "balance_loss_mlp": 1.03135943, "epoch": 0.4791679191967775, "flos": 21179288073600.0, "grad_norm": 2.655409064004873, "language_loss": 0.72400516, "learning_rate": 2.231612704136164e-06, "loss": 0.745646, "num_input_tokens_seen": 85728660, "step": 3985, "time_per_iteration": 3.6193509101867676 }, { "auxiliary_loss_clip": 0.01131731, "auxiliary_loss_mlp": 0.01044563, "balance_loss_clip": 1.04364288, "balance_loss_mlp": 1.02534676, "epoch": 0.4792881620874166, "flos": 22301495758080.0, "grad_norm": 3.1693085369113567, "language_loss": 0.75396073, "learning_rate": 2.2308389560098253e-06, "loss": 0.7757237, "num_input_tokens_seen": 85745035, "step": 3986, "time_per_iteration": 2.5843467712402344 }, { "auxiliary_loss_clip": 0.01113397, "auxiliary_loss_mlp": 0.01046494, "balance_loss_clip": 1.04728079, "balance_loss_mlp": 1.0288868, "epoch": 0.47940840497805565, "flos": 17420877423360.0, "grad_norm": 3.7668701512014766, "language_loss": 0.77031362, "learning_rate": 2.2300651728654008e-06, "loss": 0.79191256, "num_input_tokens_seen": 85760295, "step": 3987, "time_per_iteration": 3.6705384254455566 }, { "auxiliary_loss_clip": 0.01033075, "auxiliary_loss_mlp": 0.00756016, "balance_loss_clip": 1.01358294, "balance_loss_mlp": 1.00049007, "epoch": 0.47952864786869476, "flos": 65358175708800.0, "grad_norm": 0.7288042039832102, "language_loss": 0.60164666, "learning_rate": 2.229291354820272e-06, "loss": 0.61953747, "num_input_tokens_seen": 85821305, "step": 3988, "time_per_iteration": 3.225499153137207 }, { "auxiliary_loss_clip": 0.0114055, "auxiliary_loss_mlp": 0.01040083, "balance_loss_clip": 1.0500772, "balance_loss_mlp": 1.02128327, "epoch": 0.47964889075933387, "flos": 16799802336000.0, "grad_norm": 2.353788863826904, "language_loss": 0.76303363, "learning_rate": 2.228517501991828e-06, "loss": 0.78483999, "num_input_tokens_seen": 85840105, "step": 3989, "time_per_iteration": 2.6268813610076904 }, { "auxiliary_loss_clip": 0.0102177, "auxiliary_loss_mlp": 0.01002144, "balance_loss_clip": 1.01169086, "balance_loss_mlp": 1.00011718, "epoch": 0.4797691336499729, "flos": 70079244808320.0, "grad_norm": 0.8067765718822746, "language_loss": 0.60983217, "learning_rate": 2.22774361449746e-06, "loss": 0.6300714, "num_input_tokens_seen": 85896585, "step": 3990, "time_per_iteration": 3.2582168579101562 }, { "auxiliary_loss_clip": 0.01090023, "auxiliary_loss_mlp": 0.01045835, "balance_loss_clip": 1.04505312, "balance_loss_mlp": 1.02835894, "epoch": 0.47988937654061203, "flos": 18953329317120.0, "grad_norm": 2.2644451130368837, "language_loss": 0.69653845, "learning_rate": 2.2269696924545668e-06, "loss": 0.71789694, "num_input_tokens_seen": 85914415, "step": 3991, "time_per_iteration": 3.7545688152313232 }, { "auxiliary_loss_clip": 0.01112232, "auxiliary_loss_mlp": 0.01046554, "balance_loss_clip": 1.04571867, "balance_loss_mlp": 1.02852964, "epoch": 0.48000961943125114, "flos": 14461981649280.0, "grad_norm": 2.3084242116162974, "language_loss": 0.78216302, "learning_rate": 2.2261957359805523e-06, "loss": 0.80375093, "num_input_tokens_seen": 85931650, "step": 3992, "time_per_iteration": 2.619154691696167 }, { "auxiliary_loss_clip": 0.01149848, "auxiliary_loss_mlp": 0.01050151, "balance_loss_clip": 1.04813242, "balance_loss_mlp": 1.03330636, "epoch": 0.4801298623218902, "flos": 27051149105280.0, "grad_norm": 2.99763856895039, "language_loss": 0.74246395, "learning_rate": 2.225421745192823e-06, "loss": 0.7644639, "num_input_tokens_seen": 85951805, "step": 3993, "time_per_iteration": 2.661158323287964 }, { "auxiliary_loss_clip": 0.01135165, "auxiliary_loss_mlp": 0.01044178, "balance_loss_clip": 1.04543161, "balance_loss_mlp": 1.02551007, "epoch": 0.4802501052125293, "flos": 26355236031360.0, "grad_norm": 2.5672465599836483, "language_loss": 0.77956361, "learning_rate": 2.2246477202087955e-06, "loss": 0.80135703, "num_input_tokens_seen": 85972485, "step": 3994, "time_per_iteration": 2.671339511871338 }, { "auxiliary_loss_clip": 0.0112515, "auxiliary_loss_mlp": 0.01042685, "balance_loss_clip": 1.04629874, "balance_loss_mlp": 1.02567434, "epoch": 0.4803703481031684, "flos": 20993916960000.0, "grad_norm": 1.8359662924059985, "language_loss": 0.82912159, "learning_rate": 2.223873661145887e-06, "loss": 0.85079992, "num_input_tokens_seen": 85992540, "step": 3995, "time_per_iteration": 2.7024648189544678 }, { "auxiliary_loss_clip": 0.01121096, "auxiliary_loss_mlp": 0.00773656, "balance_loss_clip": 1.04823875, "balance_loss_mlp": 1.00065637, "epoch": 0.4804905909938075, "flos": 20703722981760.0, "grad_norm": 1.8994584608185137, "language_loss": 0.71380609, "learning_rate": 2.2230995681215226e-06, "loss": 0.73275363, "num_input_tokens_seen": 86012065, "step": 3996, "time_per_iteration": 2.636613130569458 }, { "auxiliary_loss_clip": 0.01110079, "auxiliary_loss_mlp": 0.01050628, "balance_loss_clip": 1.04301047, "balance_loss_mlp": 1.03178072, "epoch": 0.4806108338844466, "flos": 16654831044480.0, "grad_norm": 1.9660485236196057, "language_loss": 0.78514683, "learning_rate": 2.2223254412531305e-06, "loss": 0.80675387, "num_input_tokens_seen": 86029435, "step": 3997, "time_per_iteration": 3.5453007221221924 }, { "auxiliary_loss_clip": 0.01110935, "auxiliary_loss_mlp": 0.01039826, "balance_loss_clip": 1.04202378, "balance_loss_mlp": 1.02245736, "epoch": 0.4807310767750857, "flos": 20011329440640.0, "grad_norm": 1.9687654841456634, "language_loss": 0.82705516, "learning_rate": 2.221551280658146e-06, "loss": 0.84856272, "num_input_tokens_seen": 86048495, "step": 3998, "time_per_iteration": 2.714928388595581 }, { "auxiliary_loss_clip": 0.01094091, "auxiliary_loss_mlp": 0.01043244, "balance_loss_clip": 1.04313827, "balance_loss_mlp": 1.02672172, "epoch": 0.48085131966572475, "flos": 23185257984000.0, "grad_norm": 1.6372466265509125, "language_loss": 0.73977059, "learning_rate": 2.2207770864540085e-06, "loss": 0.76114392, "num_input_tokens_seen": 86067470, "step": 3999, "time_per_iteration": 2.7505900859832764 }, { "auxiliary_loss_clip": 0.01118452, "auxiliary_loss_mlp": 0.01047166, "balance_loss_clip": 1.04577541, "balance_loss_mlp": 1.02945173, "epoch": 0.48097156255636386, "flos": 20558643949440.0, "grad_norm": 2.1439852435356737, "language_loss": 0.7302072, "learning_rate": 2.220002858758162e-06, "loss": 0.75186336, "num_input_tokens_seen": 86085460, "step": 4000, "time_per_iteration": 2.659932851791382 }, { "auxiliary_loss_clip": 0.01037395, "auxiliary_loss_mlp": 0.01003295, "balance_loss_clip": 1.01439881, "balance_loss_mlp": 1.00118482, "epoch": 0.481091805447003, "flos": 70511608817280.0, "grad_norm": 0.8855712140218317, "language_loss": 0.60878587, "learning_rate": 2.2192285976880573e-06, "loss": 0.62919271, "num_input_tokens_seen": 86149715, "step": 4001, "time_per_iteration": 3.2179312705993652 }, { "auxiliary_loss_clip": 0.01111947, "auxiliary_loss_mlp": 0.00773842, "balance_loss_clip": 1.04232287, "balance_loss_mlp": 1.00066924, "epoch": 0.48121204833764203, "flos": 36428214839040.0, "grad_norm": 1.9856204912864415, "language_loss": 0.80599731, "learning_rate": 2.2184543033611485e-06, "loss": 0.82485521, "num_input_tokens_seen": 86170795, "step": 4002, "time_per_iteration": 2.804621934890747 }, { "auxiliary_loss_clip": 0.01140169, "auxiliary_loss_mlp": 0.01046761, "balance_loss_clip": 1.04837537, "balance_loss_mlp": 1.02897477, "epoch": 0.48133229122828114, "flos": 27490264871040.0, "grad_norm": 2.0731848643738324, "language_loss": 0.8194083, "learning_rate": 2.2176799758948957e-06, "loss": 0.8412776, "num_input_tokens_seen": 86190955, "step": 4003, "time_per_iteration": 2.6769587993621826 }, { "auxiliary_loss_clip": 0.01121968, "auxiliary_loss_mlp": 0.01052628, "balance_loss_clip": 1.04592395, "balance_loss_mlp": 1.03434086, "epoch": 0.4814525341189202, "flos": 43072802179200.0, "grad_norm": 1.910687384692897, "language_loss": 0.73764694, "learning_rate": 2.2169056154067635e-06, "loss": 0.75939286, "num_input_tokens_seen": 86214875, "step": 4004, "time_per_iteration": 2.894627571105957 }, { "auxiliary_loss_clip": 0.0113747, "auxiliary_loss_mlp": 0.00773565, "balance_loss_clip": 1.0470202, "balance_loss_mlp": 1.00060463, "epoch": 0.4815727770095593, "flos": 24236901400320.0, "grad_norm": 2.083332666282002, "language_loss": 0.82660729, "learning_rate": 2.216131222014222e-06, "loss": 0.84571767, "num_input_tokens_seen": 86232950, "step": 4005, "time_per_iteration": 2.659062623977661 }, { "auxiliary_loss_clip": 0.01102804, "auxiliary_loss_mlp": 0.01042708, "balance_loss_clip": 1.04317856, "balance_loss_mlp": 1.02525592, "epoch": 0.4816930199001984, "flos": 18113630100480.0, "grad_norm": 2.3641753778249774, "language_loss": 0.80258846, "learning_rate": 2.2153567958347455e-06, "loss": 0.82404363, "num_input_tokens_seen": 86249160, "step": 4006, "time_per_iteration": 2.6268928050994873 }, { "auxiliary_loss_clip": 0.01126053, "auxiliary_loss_mlp": 0.01041834, "balance_loss_clip": 1.04848516, "balance_loss_mlp": 1.02317774, "epoch": 0.48181326279083747, "flos": 17274720983040.0, "grad_norm": 2.2488282583080497, "language_loss": 0.79846966, "learning_rate": 2.214582336985815e-06, "loss": 0.82014859, "num_input_tokens_seen": 86267060, "step": 4007, "time_per_iteration": 2.6182544231414795 }, { "auxiliary_loss_clip": 0.01116171, "auxiliary_loss_mlp": 0.01057244, "balance_loss_clip": 1.04388225, "balance_loss_mlp": 1.0376575, "epoch": 0.4819335056814766, "flos": 14903252231040.0, "grad_norm": 3.1684608852756493, "language_loss": 0.66350543, "learning_rate": 2.2138078455849142e-06, "loss": 0.68523955, "num_input_tokens_seen": 86285055, "step": 4008, "time_per_iteration": 2.5880346298217773 }, { "auxiliary_loss_clip": 0.01142223, "auxiliary_loss_mlp": 0.01044231, "balance_loss_clip": 1.04920673, "balance_loss_mlp": 1.02654052, "epoch": 0.4820537485721157, "flos": 19244888012160.0, "grad_norm": 2.315900530681299, "language_loss": 0.78805959, "learning_rate": 2.2130333217495334e-06, "loss": 0.80992413, "num_input_tokens_seen": 86304225, "step": 4009, "time_per_iteration": 2.588364601135254 }, { "auxiliary_loss_clip": 0.01122885, "auxiliary_loss_mlp": 0.01040916, "balance_loss_clip": 1.0455277, "balance_loss_mlp": 1.02365422, "epoch": 0.48217399146275475, "flos": 16033791870720.0, "grad_norm": 2.539614635832082, "language_loss": 0.68150306, "learning_rate": 2.2122587655971665e-06, "loss": 0.70314103, "num_input_tokens_seen": 86319170, "step": 4010, "time_per_iteration": 2.678687334060669 }, { "auxiliary_loss_clip": 0.01126115, "auxiliary_loss_mlp": 0.01043109, "balance_loss_clip": 1.0460422, "balance_loss_mlp": 1.02556181, "epoch": 0.48229423435339386, "flos": 24134197438080.0, "grad_norm": 1.805834033555491, "language_loss": 0.64074397, "learning_rate": 2.211484177245314e-06, "loss": 0.66243625, "num_input_tokens_seen": 86338760, "step": 4011, "time_per_iteration": 3.6728479862213135 }, { "auxiliary_loss_clip": 0.0115156, "auxiliary_loss_mlp": 0.01051127, "balance_loss_clip": 1.04995155, "balance_loss_mlp": 1.0322448, "epoch": 0.48241447724403297, "flos": 23805435231360.0, "grad_norm": 2.5303301019431395, "language_loss": 0.7259872, "learning_rate": 2.21070955681148e-06, "loss": 0.74801403, "num_input_tokens_seen": 86357865, "step": 4012, "time_per_iteration": 2.5849978923797607 }, { "auxiliary_loss_clip": 0.01102443, "auxiliary_loss_mlp": 0.01047912, "balance_loss_clip": 1.04300714, "balance_loss_mlp": 1.03001833, "epoch": 0.482534720134672, "flos": 23110312256640.0, "grad_norm": 1.6837100419896875, "language_loss": 0.78390348, "learning_rate": 2.209934904413174e-06, "loss": 0.80540699, "num_input_tokens_seen": 86379470, "step": 4013, "time_per_iteration": 3.661430597305298 }, { "auxiliary_loss_clip": 0.01082874, "auxiliary_loss_mlp": 0.01045685, "balance_loss_clip": 1.03533912, "balance_loss_mlp": 1.02674222, "epoch": 0.48265496302531113, "flos": 20923819568640.0, "grad_norm": 2.516667356008769, "language_loss": 0.71619928, "learning_rate": 2.2091602201679095e-06, "loss": 0.73748487, "num_input_tokens_seen": 86399080, "step": 4014, "time_per_iteration": 2.7273669242858887 }, { "auxiliary_loss_clip": 0.01118537, "auxiliary_loss_mlp": 0.01035601, "balance_loss_clip": 1.04591584, "balance_loss_mlp": 1.01855397, "epoch": 0.48277520591595025, "flos": 15231152511360.0, "grad_norm": 2.176854786244461, "language_loss": 0.82712746, "learning_rate": 2.208385504193206e-06, "loss": 0.84866887, "num_input_tokens_seen": 86416580, "step": 4015, "time_per_iteration": 2.7673041820526123 }, { "auxiliary_loss_clip": 0.01149928, "auxiliary_loss_mlp": 0.01043959, "balance_loss_clip": 1.04687464, "balance_loss_mlp": 1.02656591, "epoch": 0.4828954488065893, "flos": 17858664385920.0, "grad_norm": 2.5707783719980997, "language_loss": 0.81010646, "learning_rate": 2.2076107566065873e-06, "loss": 0.83204532, "num_input_tokens_seen": 86434365, "step": 4016, "time_per_iteration": 2.5481297969818115 }, { "auxiliary_loss_clip": 0.0114267, "auxiliary_loss_mlp": 0.01047403, "balance_loss_clip": 1.04973292, "balance_loss_mlp": 1.03078532, "epoch": 0.4830156916972284, "flos": 32087405070720.0, "grad_norm": 2.3751584720939145, "language_loss": 0.75589848, "learning_rate": 2.2068359775255816e-06, "loss": 0.77779925, "num_input_tokens_seen": 86452675, "step": 4017, "time_per_iteration": 3.688441753387451 }, { "auxiliary_loss_clip": 0.01094878, "auxiliary_loss_mlp": 0.01048159, "balance_loss_clip": 1.0419457, "balance_loss_mlp": 1.03100455, "epoch": 0.48313593458786747, "flos": 21871717528320.0, "grad_norm": 2.460443433019046, "language_loss": 0.78139174, "learning_rate": 2.206061167067723e-06, "loss": 0.80282211, "num_input_tokens_seen": 86470785, "step": 4018, "time_per_iteration": 2.6947216987609863 }, { "auxiliary_loss_clip": 0.01109594, "auxiliary_loss_mlp": 0.01054393, "balance_loss_clip": 1.04356432, "balance_loss_mlp": 1.03487849, "epoch": 0.4832561774785066, "flos": 22601206840320.0, "grad_norm": 2.2383045486848183, "language_loss": 0.79643452, "learning_rate": 2.205286325350549e-06, "loss": 0.81807446, "num_input_tokens_seen": 86489850, "step": 4019, "time_per_iteration": 2.8252413272857666 }, { "auxiliary_loss_clip": 0.01100396, "auxiliary_loss_mlp": 0.01048192, "balance_loss_clip": 1.04393816, "balance_loss_mlp": 1.03031039, "epoch": 0.4833764203691457, "flos": 13437342282240.0, "grad_norm": 1.9517371387579066, "language_loss": 0.72501254, "learning_rate": 2.204511452491603e-06, "loss": 0.74649847, "num_input_tokens_seen": 86506475, "step": 4020, "time_per_iteration": 2.725646495819092 }, { "auxiliary_loss_clip": 0.0114609, "auxiliary_loss_mlp": 0.01044879, "balance_loss_clip": 1.04905665, "balance_loss_mlp": 1.02858281, "epoch": 0.48349666325978474, "flos": 44128036955520.0, "grad_norm": 1.8822369895140625, "language_loss": 0.74999416, "learning_rate": 2.2037365486084316e-06, "loss": 0.77190387, "num_input_tokens_seen": 86529715, "step": 4021, "time_per_iteration": 2.7534751892089844 }, { "auxiliary_loss_clip": 0.01118618, "auxiliary_loss_mlp": 0.01043251, "balance_loss_clip": 1.04374552, "balance_loss_mlp": 1.02475023, "epoch": 0.48361690615042385, "flos": 26028377245440.0, "grad_norm": 2.1031900962165313, "language_loss": 0.7831974, "learning_rate": 2.2029616138185886e-06, "loss": 0.80481607, "num_input_tokens_seen": 86548715, "step": 4022, "time_per_iteration": 2.67439341545105 }, { "auxiliary_loss_clip": 0.01109232, "auxiliary_loss_mlp": 0.01051041, "balance_loss_clip": 1.04574203, "balance_loss_mlp": 1.03366041, "epoch": 0.48373714904106296, "flos": 22273306560000.0, "grad_norm": 1.761192653613694, "language_loss": 0.82853508, "learning_rate": 2.202186648239629e-06, "loss": 0.85013783, "num_input_tokens_seen": 86568650, "step": 4023, "time_per_iteration": 3.583211660385132 }, { "auxiliary_loss_clip": 0.01136579, "auxiliary_loss_mlp": 0.01037545, "balance_loss_clip": 1.04886508, "balance_loss_mlp": 1.02116537, "epoch": 0.483857391931702, "flos": 28292293699200.0, "grad_norm": 2.024327864309086, "language_loss": 0.71847343, "learning_rate": 2.201411651989117e-06, "loss": 0.74021471, "num_input_tokens_seen": 86590630, "step": 4024, "time_per_iteration": 2.633538246154785 }, { "auxiliary_loss_clip": 0.0112207, "auxiliary_loss_mlp": 0.00772994, "balance_loss_clip": 1.04613125, "balance_loss_mlp": 1.00061989, "epoch": 0.48397763482234113, "flos": 27418048577280.0, "grad_norm": 1.9115630069679284, "language_loss": 0.78163439, "learning_rate": 2.2006366251846167e-06, "loss": 0.80058503, "num_input_tokens_seen": 86611270, "step": 4025, "time_per_iteration": 2.678978443145752 }, { "auxiliary_loss_clip": 0.01121625, "auxiliary_loss_mlp": 0.01043082, "balance_loss_clip": 1.04903436, "balance_loss_mlp": 1.02536798, "epoch": 0.48409787771298024, "flos": 16797252470400.0, "grad_norm": 2.4874692204935624, "language_loss": 0.75297147, "learning_rate": 2.1998615679436997e-06, "loss": 0.77461851, "num_input_tokens_seen": 86628810, "step": 4026, "time_per_iteration": 2.5975611209869385 }, { "auxiliary_loss_clip": 0.01130606, "auxiliary_loss_mlp": 0.01045502, "balance_loss_clip": 1.04530251, "balance_loss_mlp": 1.02728724, "epoch": 0.4842181206036193, "flos": 25083496028160.0, "grad_norm": 2.430746102737381, "language_loss": 0.77538019, "learning_rate": 2.199086480383942e-06, "loss": 0.79714131, "num_input_tokens_seen": 86648185, "step": 4027, "time_per_iteration": 2.64890456199646 }, { "auxiliary_loss_clip": 0.01138957, "auxiliary_loss_mlp": 0.01052574, "balance_loss_clip": 1.0475142, "balance_loss_mlp": 1.03279757, "epoch": 0.4843383634942584, "flos": 30372311496960.0, "grad_norm": 2.6386038215093794, "language_loss": 0.67984128, "learning_rate": 2.1983113626229234e-06, "loss": 0.7017566, "num_input_tokens_seen": 86667435, "step": 4028, "time_per_iteration": 2.6915013790130615 }, { "auxiliary_loss_clip": 0.01102254, "auxiliary_loss_mlp": 0.00775032, "balance_loss_clip": 1.04044986, "balance_loss_mlp": 1.00060749, "epoch": 0.4844586063848975, "flos": 20413564917120.0, "grad_norm": 1.7926614407624981, "language_loss": 0.78815854, "learning_rate": 2.1975362147782293e-06, "loss": 0.80693144, "num_input_tokens_seen": 86686630, "step": 4029, "time_per_iteration": 2.656649589538574 }, { "auxiliary_loss_clip": 0.01023455, "auxiliary_loss_mlp": 0.01005629, "balance_loss_clip": 1.01466632, "balance_loss_mlp": 1.00336361, "epoch": 0.48457884927553657, "flos": 70303722854400.0, "grad_norm": 0.6933597388787992, "language_loss": 0.54110116, "learning_rate": 2.196761036967448e-06, "loss": 0.56139195, "num_input_tokens_seen": 86754595, "step": 4030, "time_per_iteration": 3.3311684131622314 }, { "auxiliary_loss_clip": 0.01130662, "auxiliary_loss_mlp": 0.01036625, "balance_loss_clip": 1.04553187, "balance_loss_mlp": 1.020818, "epoch": 0.4846990921661757, "flos": 19934516206080.0, "grad_norm": 1.7664122374211348, "language_loss": 0.77174574, "learning_rate": 2.1959858293081743e-06, "loss": 0.79341865, "num_input_tokens_seen": 86773730, "step": 4031, "time_per_iteration": 2.562201976776123 }, { "auxiliary_loss_clip": 0.01109086, "auxiliary_loss_mlp": 0.01041152, "balance_loss_clip": 1.04718566, "balance_loss_mlp": 1.02467728, "epoch": 0.4848193350568148, "flos": 23075945919360.0, "grad_norm": 1.6106223490918137, "language_loss": 0.75927979, "learning_rate": 2.1952105919180056e-06, "loss": 0.7807821, "num_input_tokens_seen": 86792985, "step": 4032, "time_per_iteration": 2.648080348968506 }, { "auxiliary_loss_clip": 0.01124167, "auxiliary_loss_mlp": 0.01040495, "balance_loss_clip": 1.0474546, "balance_loss_mlp": 1.02310252, "epoch": 0.48493957794745385, "flos": 22455481363200.0, "grad_norm": 2.6881389246003686, "language_loss": 0.67932069, "learning_rate": 2.1944353249145456e-06, "loss": 0.70096731, "num_input_tokens_seen": 86812095, "step": 4033, "time_per_iteration": 2.6532299518585205 }, { "auxiliary_loss_clip": 0.01148601, "auxiliary_loss_mlp": 0.01038184, "balance_loss_clip": 1.0521332, "balance_loss_mlp": 1.02249646, "epoch": 0.48505982083809296, "flos": 25046112948480.0, "grad_norm": 1.6720715477286399, "language_loss": 0.74920493, "learning_rate": 2.193660028415401e-06, "loss": 0.77107275, "num_input_tokens_seen": 86832875, "step": 4034, "time_per_iteration": 2.5827107429504395 }, { "auxiliary_loss_clip": 0.01118346, "auxiliary_loss_mlp": 0.01042241, "balance_loss_clip": 1.04605436, "balance_loss_mlp": 1.02474141, "epoch": 0.485180063728732, "flos": 26761386090240.0, "grad_norm": 2.0150058953914973, "language_loss": 0.81514966, "learning_rate": 2.1928847025381852e-06, "loss": 0.83675551, "num_input_tokens_seen": 86853480, "step": 4035, "time_per_iteration": 2.708606004714966 }, { "auxiliary_loss_clip": 0.01137631, "auxiliary_loss_mlp": 0.01047745, "balance_loss_clip": 1.04605544, "balance_loss_mlp": 1.02961326, "epoch": 0.4853003066193711, "flos": 24059143969920.0, "grad_norm": 1.6233209721451511, "language_loss": 0.84089172, "learning_rate": 2.192109347400512e-06, "loss": 0.86274552, "num_input_tokens_seen": 86873695, "step": 4036, "time_per_iteration": 2.6288626194000244 }, { "auxiliary_loss_clip": 0.01127224, "auxiliary_loss_mlp": 0.01042104, "balance_loss_clip": 1.04788578, "balance_loss_mlp": 1.02391231, "epoch": 0.48542054951001024, "flos": 23076376882560.0, "grad_norm": 3.279870434407379, "language_loss": 0.78376335, "learning_rate": 2.191333963120004e-06, "loss": 0.80545664, "num_input_tokens_seen": 86892675, "step": 4037, "time_per_iteration": 3.4944863319396973 }, { "auxiliary_loss_clip": 0.01126264, "auxiliary_loss_mlp": 0.01045241, "balance_loss_clip": 1.04660594, "balance_loss_mlp": 1.02806342, "epoch": 0.4855407924006493, "flos": 25664889565440.0, "grad_norm": 2.765761704584298, "language_loss": 0.70976967, "learning_rate": 2.190558549814286e-06, "loss": 0.73148471, "num_input_tokens_seen": 86912835, "step": 4038, "time_per_iteration": 2.6686248779296875 }, { "auxiliary_loss_clip": 0.0112013, "auxiliary_loss_mlp": 0.01052268, "balance_loss_clip": 1.04407334, "balance_loss_mlp": 1.03293252, "epoch": 0.4856610352912884, "flos": 23987933256960.0, "grad_norm": 1.770040365560705, "language_loss": 0.79761344, "learning_rate": 2.1897831076009872e-06, "loss": 0.81933743, "num_input_tokens_seen": 86932475, "step": 4039, "time_per_iteration": 3.5006513595581055 }, { "auxiliary_loss_clip": 0.0113957, "auxiliary_loss_mlp": 0.01046377, "balance_loss_clip": 1.04716086, "balance_loss_mlp": 1.02909184, "epoch": 0.4857812781819275, "flos": 24096814358400.0, "grad_norm": 1.7939447942712365, "language_loss": 0.79902911, "learning_rate": 2.1890076365977426e-06, "loss": 0.82088852, "num_input_tokens_seen": 86952300, "step": 4040, "time_per_iteration": 2.6302764415740967 }, { "auxiliary_loss_clip": 0.01031594, "auxiliary_loss_mlp": 0.01018845, "balance_loss_clip": 1.0196383, "balance_loss_mlp": 1.0167706, "epoch": 0.48590152107256657, "flos": 56266635185280.0, "grad_norm": 0.8547919196457883, "language_loss": 0.52843761, "learning_rate": 2.188232136922189e-06, "loss": 0.54894197, "num_input_tokens_seen": 87010420, "step": 4041, "time_per_iteration": 3.1178455352783203 }, { "auxiliary_loss_clip": 0.01081523, "auxiliary_loss_mlp": 0.01041002, "balance_loss_clip": 1.04091549, "balance_loss_mlp": 1.02176154, "epoch": 0.4860217639632057, "flos": 20046988667520.0, "grad_norm": 2.0755744501679327, "language_loss": 0.76084816, "learning_rate": 2.187456608691971e-06, "loss": 0.78207338, "num_input_tokens_seen": 87029295, "step": 4042, "time_per_iteration": 2.7777106761932373 }, { "auxiliary_loss_clip": 0.01115745, "auxiliary_loss_mlp": 0.01043933, "balance_loss_clip": 1.04807186, "balance_loss_mlp": 1.02649236, "epoch": 0.4861420068538448, "flos": 17822143232640.0, "grad_norm": 2.36356170288355, "language_loss": 0.879466, "learning_rate": 2.1866810520247334e-06, "loss": 0.90106279, "num_input_tokens_seen": 87048165, "step": 4043, "time_per_iteration": 3.6295199394226074 }, { "auxiliary_loss_clip": 0.01142316, "auxiliary_loss_mlp": 0.01045858, "balance_loss_clip": 1.04721117, "balance_loss_mlp": 1.02736878, "epoch": 0.48626224974448384, "flos": 26250125857920.0, "grad_norm": 2.257303507062883, "language_loss": 0.65469581, "learning_rate": 2.185905467038129e-06, "loss": 0.67657757, "num_input_tokens_seen": 87067070, "step": 4044, "time_per_iteration": 2.6288695335388184 }, { "auxiliary_loss_clip": 0.01151111, "auxiliary_loss_mlp": 0.01049975, "balance_loss_clip": 1.05277586, "balance_loss_mlp": 1.03422785, "epoch": 0.48638249263512295, "flos": 22054502862720.0, "grad_norm": 1.717942509281016, "language_loss": 0.77729815, "learning_rate": 2.1851298538498127e-06, "loss": 0.79930902, "num_input_tokens_seen": 87086785, "step": 4045, "time_per_iteration": 2.581329822540283 }, { "auxiliary_loss_clip": 0.01146602, "auxiliary_loss_mlp": 0.00774177, "balance_loss_clip": 1.05151176, "balance_loss_mlp": 1.0005517, "epoch": 0.48650273552576206, "flos": 25119945354240.0, "grad_norm": 2.045585455089005, "language_loss": 0.80028492, "learning_rate": 2.184354212577446e-06, "loss": 0.8194927, "num_input_tokens_seen": 87107090, "step": 4046, "time_per_iteration": 2.643117904663086 }, { "auxiliary_loss_clip": 0.01153638, "auxiliary_loss_mlp": 0.01048143, "balance_loss_clip": 1.04930997, "balance_loss_mlp": 1.02968919, "epoch": 0.4866229784164011, "flos": 17456931699840.0, "grad_norm": 2.67972391146054, "language_loss": 0.63281, "learning_rate": 2.1835785433386907e-06, "loss": 0.65482777, "num_input_tokens_seen": 87125905, "step": 4047, "time_per_iteration": 2.561516761779785 }, { "auxiliary_loss_clip": 0.01103402, "auxiliary_loss_mlp": 0.0105513, "balance_loss_clip": 1.0448606, "balance_loss_mlp": 1.03503084, "epoch": 0.48674322130704023, "flos": 23331127115520.0, "grad_norm": 1.8357684953222269, "language_loss": 0.65387672, "learning_rate": 2.182802846251216e-06, "loss": 0.67546201, "num_input_tokens_seen": 87146175, "step": 4048, "time_per_iteration": 2.658066511154175 }, { "auxiliary_loss_clip": 0.01112658, "auxiliary_loss_mlp": 0.0104815, "balance_loss_clip": 1.04348731, "balance_loss_mlp": 1.02988744, "epoch": 0.4868634641976793, "flos": 28804344030720.0, "grad_norm": 1.9492760215030536, "language_loss": 0.72477734, "learning_rate": 2.182027121432696e-06, "loss": 0.74638546, "num_input_tokens_seen": 87166800, "step": 4049, "time_per_iteration": 3.6071393489837646 }, { "auxiliary_loss_clip": 0.01154341, "auxiliary_loss_mlp": 0.01049245, "balance_loss_clip": 1.05033922, "balance_loss_mlp": 1.02996874, "epoch": 0.4869837070883184, "flos": 19025976574080.0, "grad_norm": 2.236736902701921, "language_loss": 0.82496893, "learning_rate": 2.1812513690008054e-06, "loss": 0.84700483, "num_input_tokens_seen": 87185920, "step": 4050, "time_per_iteration": 2.5772221088409424 }, { "auxiliary_loss_clip": 0.01148579, "auxiliary_loss_mlp": 0.01050903, "balance_loss_clip": 1.05402493, "balance_loss_mlp": 1.03272343, "epoch": 0.4871039499789575, "flos": 15121409483520.0, "grad_norm": 2.2327876179771575, "language_loss": 0.80157709, "learning_rate": 2.180475589073227e-06, "loss": 0.82357192, "num_input_tokens_seen": 87203620, "step": 4051, "time_per_iteration": 2.588780164718628 }, { "auxiliary_loss_clip": 0.01127315, "auxiliary_loss_mlp": 0.01043224, "balance_loss_clip": 1.04560792, "balance_loss_mlp": 1.02634406, "epoch": 0.48722419286959656, "flos": 26174066808960.0, "grad_norm": 1.6256193790242055, "language_loss": 0.73484099, "learning_rate": 2.1796997817676456e-06, "loss": 0.75654638, "num_input_tokens_seen": 87224630, "step": 4052, "time_per_iteration": 2.6534459590911865 }, { "auxiliary_loss_clip": 0.01136941, "auxiliary_loss_mlp": 0.00773516, "balance_loss_clip": 1.04754293, "balance_loss_mlp": 1.00065565, "epoch": 0.4873444357602357, "flos": 24026142349440.0, "grad_norm": 1.6952894730941368, "language_loss": 0.67578506, "learning_rate": 2.1789239472017494e-06, "loss": 0.69488955, "num_input_tokens_seen": 87246280, "step": 4053, "time_per_iteration": 2.7030675411224365 }, { "auxiliary_loss_clip": 0.01104627, "auxiliary_loss_mlp": 0.01046021, "balance_loss_clip": 1.04099202, "balance_loss_mlp": 1.02755499, "epoch": 0.4874646786508748, "flos": 22820441500800.0, "grad_norm": 2.1989340249937968, "language_loss": 0.72831619, "learning_rate": 2.1781480854932326e-06, "loss": 0.74982262, "num_input_tokens_seen": 87266045, "step": 4054, "time_per_iteration": 2.720776081085205 }, { "auxiliary_loss_clip": 0.01097058, "auxiliary_loss_mlp": 0.01045812, "balance_loss_clip": 1.04603934, "balance_loss_mlp": 1.02857435, "epoch": 0.48758492154151384, "flos": 21287594557440.0, "grad_norm": 2.151955538723729, "language_loss": 0.79743016, "learning_rate": 2.1773721967597933e-06, "loss": 0.8188588, "num_input_tokens_seen": 87284495, "step": 4055, "time_per_iteration": 2.7998857498168945 }, { "auxiliary_loss_clip": 0.01029024, "auxiliary_loss_mlp": 0.01009214, "balance_loss_clip": 1.01947558, "balance_loss_mlp": 1.00712824, "epoch": 0.48770516443215295, "flos": 62244109180800.0, "grad_norm": 0.8482168980618031, "language_loss": 0.57306862, "learning_rate": 2.1765962811191322e-06, "loss": 0.59345102, "num_input_tokens_seen": 87338960, "step": 4056, "time_per_iteration": 3.1431210041046143 }, { "auxiliary_loss_clip": 0.01006106, "auxiliary_loss_mlp": 0.0100293, "balance_loss_clip": 1.01693416, "balance_loss_mlp": 1.00102258, "epoch": 0.48782540732279206, "flos": 66133451882880.0, "grad_norm": 0.8261529434078748, "language_loss": 0.62023014, "learning_rate": 2.1758203386889566e-06, "loss": 0.64032054, "num_input_tokens_seen": 87401730, "step": 4057, "time_per_iteration": 3.367177724838257 }, { "auxiliary_loss_clip": 0.01112385, "auxiliary_loss_mlp": 0.0077458, "balance_loss_clip": 1.04567623, "balance_loss_mlp": 1.00064576, "epoch": 0.4879456502134311, "flos": 14607922608000.0, "grad_norm": 1.8691899130915215, "language_loss": 0.84044862, "learning_rate": 2.1750443695869746e-06, "loss": 0.85931826, "num_input_tokens_seen": 87417300, "step": 4058, "time_per_iteration": 2.6394083499908447 }, { "auxiliary_loss_clip": 0.01138813, "auxiliary_loss_mlp": 0.01043979, "balance_loss_clip": 1.04804897, "balance_loss_mlp": 1.02731347, "epoch": 0.4880658931040702, "flos": 19500464257920.0, "grad_norm": 1.8312320084914844, "language_loss": 0.85958397, "learning_rate": 2.174268373930901e-06, "loss": 0.88141191, "num_input_tokens_seen": 87434815, "step": 4059, "time_per_iteration": 2.597055435180664 }, { "auxiliary_loss_clip": 0.01106793, "auxiliary_loss_mlp": 0.00773716, "balance_loss_clip": 1.04524279, "balance_loss_mlp": 1.00060952, "epoch": 0.48818613599470934, "flos": 16723060928640.0, "grad_norm": 6.449304276151907, "language_loss": 0.79822636, "learning_rate": 2.1734923518384537e-06, "loss": 0.81703144, "num_input_tokens_seen": 87451420, "step": 4060, "time_per_iteration": 2.6424551010131836 }, { "auxiliary_loss_clip": 0.01096845, "auxiliary_loss_mlp": 0.01045514, "balance_loss_clip": 1.04387593, "balance_loss_mlp": 1.02825236, "epoch": 0.4883063788853484, "flos": 26756932803840.0, "grad_norm": 3.1077585966839334, "language_loss": 0.82356381, "learning_rate": 2.1727163034273547e-06, "loss": 0.84498739, "num_input_tokens_seen": 87469585, "step": 4061, "time_per_iteration": 2.6885433197021484 }, { "auxiliary_loss_clip": 0.01134116, "auxiliary_loss_mlp": 0.01045646, "balance_loss_clip": 1.04494405, "balance_loss_mlp": 1.02821732, "epoch": 0.4884266217759875, "flos": 16763388923520.0, "grad_norm": 2.706172752558971, "language_loss": 0.79539084, "learning_rate": 2.17194022881533e-06, "loss": 0.81718856, "num_input_tokens_seen": 87485675, "step": 4062, "time_per_iteration": 2.6062278747558594 }, { "auxiliary_loss_clip": 0.01126665, "auxiliary_loss_mlp": 0.01051181, "balance_loss_clip": 1.04508865, "balance_loss_mlp": 1.03250051, "epoch": 0.4885468646666266, "flos": 24207132003840.0, "grad_norm": 2.4609978360617295, "language_loss": 0.67771488, "learning_rate": 2.1711641281201092e-06, "loss": 0.69949329, "num_input_tokens_seen": 87505605, "step": 4063, "time_per_iteration": 3.68888783454895 }, { "auxiliary_loss_clip": 0.0113546, "auxiliary_loss_mlp": 0.01046023, "balance_loss_clip": 1.04903102, "balance_loss_mlp": 1.02906001, "epoch": 0.48866710755726567, "flos": 14610795696000.0, "grad_norm": 2.0931874421106422, "language_loss": 0.79167044, "learning_rate": 2.1703880014594264e-06, "loss": 0.81348532, "num_input_tokens_seen": 87523195, "step": 4064, "time_per_iteration": 2.6431021690368652 }, { "auxiliary_loss_clip": 0.01091398, "auxiliary_loss_mlp": 0.01042426, "balance_loss_clip": 1.04513717, "balance_loss_mlp": 1.02504563, "epoch": 0.4887873504479048, "flos": 28804451771520.0, "grad_norm": 1.659294807220934, "language_loss": 0.73887628, "learning_rate": 2.1696118489510182e-06, "loss": 0.76021445, "num_input_tokens_seen": 87544125, "step": 4065, "time_per_iteration": 3.6859302520751953 }, { "auxiliary_loss_clip": 0.0111716, "auxiliary_loss_mlp": 0.00773468, "balance_loss_clip": 1.04599631, "balance_loss_mlp": 1.00059819, "epoch": 0.48890759333854383, "flos": 22784387224320.0, "grad_norm": 2.4036606646082186, "language_loss": 0.72599494, "learning_rate": 2.1688356707126286e-06, "loss": 0.74490118, "num_input_tokens_seen": 87563745, "step": 4066, "time_per_iteration": 2.6604859828948975 }, { "auxiliary_loss_clip": 0.0110624, "auxiliary_loss_mlp": 0.01052329, "balance_loss_clip": 1.0432775, "balance_loss_mlp": 1.03375602, "epoch": 0.48902783622918294, "flos": 17786088956160.0, "grad_norm": 1.9504481640313198, "language_loss": 0.7002297, "learning_rate": 2.168059466862001e-06, "loss": 0.72181535, "num_input_tokens_seen": 87581895, "step": 4067, "time_per_iteration": 2.656445264816284 }, { "auxiliary_loss_clip": 0.01122944, "auxiliary_loss_mlp": 0.01036807, "balance_loss_clip": 1.04361355, "balance_loss_mlp": 1.01973605, "epoch": 0.48914807911982205, "flos": 22310294590080.0, "grad_norm": 2.1574995044291327, "language_loss": 0.82057196, "learning_rate": 2.167283237516887e-06, "loss": 0.8421694, "num_input_tokens_seen": 87600170, "step": 4068, "time_per_iteration": 3.611546039581299 }, { "auxiliary_loss_clip": 0.01126541, "auxiliary_loss_mlp": 0.01051786, "balance_loss_clip": 1.04490805, "balance_loss_mlp": 1.03316605, "epoch": 0.4892683220104611, "flos": 16363020954240.0, "grad_norm": 1.7657418563144032, "language_loss": 0.74576408, "learning_rate": 2.1665069827950383e-06, "loss": 0.76754737, "num_input_tokens_seen": 87617455, "step": 4069, "time_per_iteration": 2.6278886795043945 }, { "auxiliary_loss_clip": 0.01116691, "auxiliary_loss_mlp": 0.01039644, "balance_loss_clip": 1.04264045, "balance_loss_mlp": 1.02396798, "epoch": 0.4893885649011002, "flos": 15739144606080.0, "grad_norm": 1.7949101213985226, "language_loss": 0.86409932, "learning_rate": 2.1657307028142126e-06, "loss": 0.88566267, "num_input_tokens_seen": 87634995, "step": 4070, "time_per_iteration": 2.607963800430298 }, { "auxiliary_loss_clip": 0.01124996, "auxiliary_loss_mlp": 0.01040507, "balance_loss_clip": 1.04617262, "balance_loss_mlp": 1.02311397, "epoch": 0.48950880779173933, "flos": 28581984887040.0, "grad_norm": 1.9020155353693955, "language_loss": 0.6745804, "learning_rate": 2.164954397692171e-06, "loss": 0.69623542, "num_input_tokens_seen": 87654420, "step": 4071, "time_per_iteration": 2.6701135635375977 }, { "auxiliary_loss_clip": 0.01023539, "auxiliary_loss_mlp": 0.01008558, "balance_loss_clip": 1.00981998, "balance_loss_mlp": 1.00667477, "epoch": 0.4896290506823784, "flos": 66186310746240.0, "grad_norm": 1.0723179417572355, "language_loss": 0.77330548, "learning_rate": 2.164178067546678e-06, "loss": 0.79362643, "num_input_tokens_seen": 87713585, "step": 4072, "time_per_iteration": 3.2717244625091553 }, { "auxiliary_loss_clip": 0.01131586, "auxiliary_loss_mlp": 0.01042739, "balance_loss_clip": 1.04797947, "balance_loss_mlp": 1.02591825, "epoch": 0.4897492935730175, "flos": 12531065207040.0, "grad_norm": 1.818673495165641, "language_loss": 0.90990627, "learning_rate": 2.163401712495504e-06, "loss": 0.93164957, "num_input_tokens_seen": 87731280, "step": 4073, "time_per_iteration": 2.6311028003692627 }, { "auxiliary_loss_clip": 0.01096595, "auxiliary_loss_mlp": 0.01048558, "balance_loss_clip": 1.04374766, "balance_loss_mlp": 1.03047419, "epoch": 0.4898695364636566, "flos": 23476816679040.0, "grad_norm": 1.6594632590293181, "language_loss": 0.79383218, "learning_rate": 2.1626253326564194e-06, "loss": 0.81528372, "num_input_tokens_seen": 87750230, "step": 4074, "time_per_iteration": 2.692255735397339 }, { "auxiliary_loss_clip": 0.01123954, "auxiliary_loss_mlp": 0.0105081, "balance_loss_clip": 1.04445112, "balance_loss_mlp": 1.03269029, "epoch": 0.48998977935429566, "flos": 27160209774720.0, "grad_norm": 1.7685792011182517, "language_loss": 0.76994383, "learning_rate": 2.161848928147201e-06, "loss": 0.79169154, "num_input_tokens_seen": 87770500, "step": 4075, "time_per_iteration": 3.6236355304718018 }, { "auxiliary_loss_clip": 0.01137909, "auxiliary_loss_mlp": 0.01047048, "balance_loss_clip": 1.04838812, "balance_loss_mlp": 1.02991724, "epoch": 0.4901100222449348, "flos": 20339588856960.0, "grad_norm": 1.871468997883178, "language_loss": 0.80466795, "learning_rate": 2.161072499085629e-06, "loss": 0.82651752, "num_input_tokens_seen": 87789495, "step": 4076, "time_per_iteration": 2.5942230224609375 }, { "auxiliary_loss_clip": 0.01112641, "auxiliary_loss_mlp": 0.01043899, "balance_loss_clip": 1.04504776, "balance_loss_mlp": 1.02668571, "epoch": 0.4902302651355739, "flos": 30446359384320.0, "grad_norm": 1.7961613358427952, "language_loss": 0.83166122, "learning_rate": 2.160296045589487e-06, "loss": 0.8532266, "num_input_tokens_seen": 87812955, "step": 4077, "time_per_iteration": 2.800736427307129 }, { "auxiliary_loss_clip": 0.01131156, "auxiliary_loss_mlp": 0.01041498, "balance_loss_clip": 1.04524183, "balance_loss_mlp": 1.02415276, "epoch": 0.49035050802621294, "flos": 19174180089600.0, "grad_norm": 1.996451943327185, "language_loss": 0.70143104, "learning_rate": 2.159519567776562e-06, "loss": 0.72315758, "num_input_tokens_seen": 87832605, "step": 4078, "time_per_iteration": 2.6018776893615723 }, { "auxiliary_loss_clip": 0.0109663, "auxiliary_loss_mlp": 0.01051223, "balance_loss_clip": 1.03921437, "balance_loss_mlp": 1.03267384, "epoch": 0.49047075091685205, "flos": 22228489365120.0, "grad_norm": 2.552136477163565, "language_loss": 0.70751286, "learning_rate": 2.1587430657646463e-06, "loss": 0.72899139, "num_input_tokens_seen": 87846040, "step": 4079, "time_per_iteration": 2.638035774230957 }, { "auxiliary_loss_clip": 0.01121399, "auxiliary_loss_mlp": 0.01039919, "balance_loss_clip": 1.04569399, "balance_loss_mlp": 1.021716, "epoch": 0.4905909938074911, "flos": 20156516213760.0, "grad_norm": 1.853290636741608, "language_loss": 0.77815908, "learning_rate": 2.157966539671533e-06, "loss": 0.79977226, "num_input_tokens_seen": 87865680, "step": 4080, "time_per_iteration": 2.6375765800476074 }, { "auxiliary_loss_clip": 0.01108016, "auxiliary_loss_mlp": 0.01040455, "balance_loss_clip": 1.04219818, "balance_loss_mlp": 1.02427793, "epoch": 0.4907112366981302, "flos": 17202217380480.0, "grad_norm": 1.9166186703014498, "language_loss": 0.67648315, "learning_rate": 2.157189989615021e-06, "loss": 0.69796789, "num_input_tokens_seen": 87884270, "step": 4081, "time_per_iteration": 2.6609129905700684 }, { "auxiliary_loss_clip": 0.01141428, "auxiliary_loss_mlp": 0.0077457, "balance_loss_clip": 1.04848409, "balance_loss_mlp": 1.00044084, "epoch": 0.4908314795887693, "flos": 21688968107520.0, "grad_norm": 1.7997796251195526, "language_loss": 0.75232518, "learning_rate": 2.156413415712913e-06, "loss": 0.77148515, "num_input_tokens_seen": 87906320, "step": 4082, "time_per_iteration": 2.6333065032958984 }, { "auxiliary_loss_clip": 0.01131, "auxiliary_loss_mlp": 0.00775019, "balance_loss_clip": 1.04760933, "balance_loss_mlp": 1.00048709, "epoch": 0.4909517224794084, "flos": 26213676531840.0, "grad_norm": 1.768299045461757, "language_loss": 0.78621244, "learning_rate": 2.155636818083014e-06, "loss": 0.8052727, "num_input_tokens_seen": 87927690, "step": 4083, "time_per_iteration": 2.681183338165283 }, { "auxiliary_loss_clip": 0.01120474, "auxiliary_loss_mlp": 0.0105339, "balance_loss_clip": 1.04596984, "balance_loss_mlp": 1.03702295, "epoch": 0.4910719653700475, "flos": 23148377694720.0, "grad_norm": 2.5437848069905202, "language_loss": 0.84080863, "learning_rate": 2.154860196843134e-06, "loss": 0.86254728, "num_input_tokens_seen": 87946885, "step": 4084, "time_per_iteration": 2.6094017028808594 }, { "auxiliary_loss_clip": 0.01149597, "auxiliary_loss_mlp": 0.01050109, "balance_loss_clip": 1.04762316, "balance_loss_mlp": 1.03077316, "epoch": 0.4911922082606866, "flos": 23331845387520.0, "grad_norm": 1.8829694539794581, "language_loss": 0.7675643, "learning_rate": 2.154083552111085e-06, "loss": 0.78956139, "num_input_tokens_seen": 87966055, "step": 4085, "time_per_iteration": 2.613588809967041 }, { "auxiliary_loss_clip": 0.01147692, "auxiliary_loss_mlp": 0.01046985, "balance_loss_clip": 1.0458225, "balance_loss_mlp": 1.0299257, "epoch": 0.49131245115132566, "flos": 29203239542400.0, "grad_norm": 1.976614753102556, "language_loss": 0.81757545, "learning_rate": 2.1533068840046834e-06, "loss": 0.83952224, "num_input_tokens_seen": 87986320, "step": 4086, "time_per_iteration": 2.6359786987304688 }, { "auxiliary_loss_clip": 0.01116168, "auxiliary_loss_mlp": 0.00776485, "balance_loss_clip": 1.04427576, "balance_loss_mlp": 1.00046444, "epoch": 0.49143269404196477, "flos": 20147465986560.0, "grad_norm": 2.3890264594008594, "language_loss": 0.61604416, "learning_rate": 2.152530192641749e-06, "loss": 0.63497072, "num_input_tokens_seen": 88001230, "step": 4087, "time_per_iteration": 2.5994322299957275 }, { "auxiliary_loss_clip": 0.01136023, "auxiliary_loss_mlp": 0.0104733, "balance_loss_clip": 1.04687023, "balance_loss_mlp": 1.0310576, "epoch": 0.4915529369326039, "flos": 24389809597440.0, "grad_norm": 1.8532774363501796, "language_loss": 0.72474253, "learning_rate": 2.1517534781401068e-06, "loss": 0.74657607, "num_input_tokens_seen": 88019110, "step": 4088, "time_per_iteration": 2.6094810962677 }, { "auxiliary_loss_clip": 0.01133133, "auxiliary_loss_mlp": 0.01042539, "balance_loss_clip": 1.04579329, "balance_loss_mlp": 1.02543223, "epoch": 0.49167317982324293, "flos": 10524305197440.0, "grad_norm": 2.531569108778035, "language_loss": 0.69418681, "learning_rate": 2.150976740617581e-06, "loss": 0.71594357, "num_input_tokens_seen": 88035670, "step": 4089, "time_per_iteration": 3.582258701324463 }, { "auxiliary_loss_clip": 0.01128986, "auxiliary_loss_mlp": 0.01048268, "balance_loss_clip": 1.04538453, "balance_loss_mlp": 1.03008866, "epoch": 0.49179342271388204, "flos": 25593427457280.0, "grad_norm": 1.8463931529561148, "language_loss": 0.71287668, "learning_rate": 2.150199980192006e-06, "loss": 0.73464924, "num_input_tokens_seen": 88054790, "step": 4090, "time_per_iteration": 2.7683591842651367 }, { "auxiliary_loss_clip": 0.01113486, "auxiliary_loss_mlp": 0.01045463, "balance_loss_clip": 1.04198503, "balance_loss_mlp": 1.02920282, "epoch": 0.49191366560452116, "flos": 21102043875840.0, "grad_norm": 1.955400799089085, "language_loss": 0.8055594, "learning_rate": 2.1494231969812114e-06, "loss": 0.82714891, "num_input_tokens_seen": 88073780, "step": 4091, "time_per_iteration": 3.600168466567993 }, { "auxiliary_loss_clip": 0.01114635, "auxiliary_loss_mlp": 0.01038455, "balance_loss_clip": 1.04668379, "balance_loss_mlp": 1.0218612, "epoch": 0.4920339084951602, "flos": 26067520091520.0, "grad_norm": 2.472950333763898, "language_loss": 0.81542748, "learning_rate": 2.1486463911030372e-06, "loss": 0.83695841, "num_input_tokens_seen": 88094430, "step": 4092, "time_per_iteration": 2.721928834915161 }, { "auxiliary_loss_clip": 0.01117096, "auxiliary_loss_mlp": 0.0104229, "balance_loss_clip": 1.04106343, "balance_loss_mlp": 1.02509952, "epoch": 0.4921541513857993, "flos": 25081269384960.0, "grad_norm": 1.7527408120102532, "language_loss": 0.74597126, "learning_rate": 2.147869562675324e-06, "loss": 0.76756507, "num_input_tokens_seen": 88113400, "step": 4093, "time_per_iteration": 2.645967483520508 }, { "auxiliary_loss_clip": 0.01135913, "auxiliary_loss_mlp": 0.01044278, "balance_loss_clip": 1.0459398, "balance_loss_mlp": 1.02606273, "epoch": 0.49227439427643843, "flos": 24389809597440.0, "grad_norm": 2.1072293141480976, "language_loss": 0.72399116, "learning_rate": 2.147092711815915e-06, "loss": 0.7457931, "num_input_tokens_seen": 88132750, "step": 4094, "time_per_iteration": 3.631182909011841 }, { "auxiliary_loss_clip": 0.01108602, "auxiliary_loss_mlp": 0.01047305, "balance_loss_clip": 1.04402769, "balance_loss_mlp": 1.02919745, "epoch": 0.4923946371670775, "flos": 11363753018880.0, "grad_norm": 2.444567595850432, "language_loss": 0.8647207, "learning_rate": 2.1463158386426593e-06, "loss": 0.88627982, "num_input_tokens_seen": 88150560, "step": 4095, "time_per_iteration": 2.60154390335083 }, { "auxiliary_loss_clip": 0.01129187, "auxiliary_loss_mlp": 0.01045392, "balance_loss_clip": 1.0465616, "balance_loss_mlp": 1.02598476, "epoch": 0.4925148800577166, "flos": 30445964334720.0, "grad_norm": 2.3132916136907533, "language_loss": 0.77472681, "learning_rate": 2.145538943273407e-06, "loss": 0.79647255, "num_input_tokens_seen": 88170835, "step": 4096, "time_per_iteration": 2.6636030673980713 }, { "auxiliary_loss_clip": 0.01150701, "auxiliary_loss_mlp": 0.01041849, "balance_loss_clip": 1.0493294, "balance_loss_mlp": 1.02363372, "epoch": 0.49263512294835565, "flos": 20850454039680.0, "grad_norm": 1.8390850647772272, "language_loss": 0.71653068, "learning_rate": 2.144762025826013e-06, "loss": 0.73845625, "num_input_tokens_seen": 88189925, "step": 4097, "time_per_iteration": 2.567826271057129 }, { "auxiliary_loss_clip": 0.01138364, "auxiliary_loss_mlp": 0.01045228, "balance_loss_clip": 1.04601967, "balance_loss_mlp": 1.02655971, "epoch": 0.49275536583899476, "flos": 23767477534080.0, "grad_norm": 2.5409743648677274, "language_loss": 0.86811697, "learning_rate": 2.143985086418334e-06, "loss": 0.8899529, "num_input_tokens_seen": 88205105, "step": 4098, "time_per_iteration": 2.5773770809173584 }, { "auxiliary_loss_clip": 0.01122914, "auxiliary_loss_mlp": 0.01042155, "balance_loss_clip": 1.04561305, "balance_loss_mlp": 1.02546585, "epoch": 0.4928756087296339, "flos": 22273522041600.0, "grad_norm": 1.3723065916147092, "language_loss": 0.76508099, "learning_rate": 2.1432081251682324e-06, "loss": 0.78673166, "num_input_tokens_seen": 88225475, "step": 4099, "time_per_iteration": 2.6419920921325684 }, { "auxiliary_loss_clip": 0.01129909, "auxiliary_loss_mlp": 0.010419, "balance_loss_clip": 1.04781568, "balance_loss_mlp": 1.0246861, "epoch": 0.49299585162027293, "flos": 19645471463040.0, "grad_norm": 1.7644801079016046, "language_loss": 0.87045634, "learning_rate": 2.142431142193572e-06, "loss": 0.89217448, "num_input_tokens_seen": 88243255, "step": 4100, "time_per_iteration": 2.5745716094970703 }, { "auxiliary_loss_clip": 0.01143756, "auxiliary_loss_mlp": 0.01041843, "balance_loss_clip": 1.04792237, "balance_loss_mlp": 1.02435482, "epoch": 0.49311609451091204, "flos": 38837138497920.0, "grad_norm": 2.179375872851569, "language_loss": 0.71483696, "learning_rate": 2.1416541376122207e-06, "loss": 0.73669291, "num_input_tokens_seen": 88263435, "step": 4101, "time_per_iteration": 3.579416275024414 }, { "auxiliary_loss_clip": 0.01144429, "auxiliary_loss_mlp": 0.01038177, "balance_loss_clip": 1.04583812, "balance_loss_mlp": 1.0205816, "epoch": 0.49323633740155115, "flos": 28329102161280.0, "grad_norm": 1.9269399465320896, "language_loss": 0.73213851, "learning_rate": 2.1408771115420496e-06, "loss": 0.7539646, "num_input_tokens_seen": 88283295, "step": 4102, "time_per_iteration": 2.6199166774749756 }, { "auxiliary_loss_clip": 0.01099573, "auxiliary_loss_mlp": 0.01040854, "balance_loss_clip": 1.04785347, "balance_loss_mlp": 1.02511799, "epoch": 0.4933565802921902, "flos": 21135584200320.0, "grad_norm": 1.7681182834776727, "language_loss": 0.65049106, "learning_rate": 2.140100064100932e-06, "loss": 0.67189533, "num_input_tokens_seen": 88299270, "step": 4103, "time_per_iteration": 2.6341018676757812 }, { "auxiliary_loss_clip": 0.011316, "auxiliary_loss_mlp": 0.01047294, "balance_loss_clip": 1.04481161, "balance_loss_mlp": 1.02925754, "epoch": 0.4934768231828293, "flos": 18039007595520.0, "grad_norm": 1.9411008433003265, "language_loss": 0.75490022, "learning_rate": 2.139322995406746e-06, "loss": 0.77668905, "num_input_tokens_seen": 88316905, "step": 4104, "time_per_iteration": 2.5083696842193604 }, { "auxiliary_loss_clip": 0.01145798, "auxiliary_loss_mlp": 0.01051465, "balance_loss_clip": 1.04777741, "balance_loss_mlp": 1.03301132, "epoch": 0.4935970660734684, "flos": 23469957181440.0, "grad_norm": 3.909888717479376, "language_loss": 0.79505849, "learning_rate": 2.1385459055773727e-06, "loss": 0.81703115, "num_input_tokens_seen": 88335095, "step": 4105, "time_per_iteration": 2.560673475265503 }, { "auxiliary_loss_clip": 0.01087242, "auxiliary_loss_mlp": 0.00773441, "balance_loss_clip": 1.04172349, "balance_loss_mlp": 1.00054336, "epoch": 0.4937173089641075, "flos": 64479258840960.0, "grad_norm": 1.916421412198048, "language_loss": 0.73677075, "learning_rate": 2.137768794730696e-06, "loss": 0.75537759, "num_input_tokens_seen": 88358545, "step": 4106, "time_per_iteration": 3.085226535797119 }, { "auxiliary_loss_clip": 0.01131399, "auxiliary_loss_mlp": 0.01044149, "balance_loss_clip": 1.04957008, "balance_loss_mlp": 1.0256598, "epoch": 0.4938375518547466, "flos": 22346025644160.0, "grad_norm": 1.839660476730873, "language_loss": 0.80130649, "learning_rate": 2.1369916629846026e-06, "loss": 0.82306194, "num_input_tokens_seen": 88378295, "step": 4107, "time_per_iteration": 2.68601655960083 }, { "auxiliary_loss_clip": 0.01124665, "auxiliary_loss_mlp": 0.010433, "balance_loss_clip": 1.04559803, "balance_loss_mlp": 1.02609813, "epoch": 0.4939577947453857, "flos": 17858700299520.0, "grad_norm": 2.250808967562326, "language_loss": 0.75278962, "learning_rate": 2.136214510456983e-06, "loss": 0.77446926, "num_input_tokens_seen": 88396750, "step": 4108, "time_per_iteration": 2.6801576614379883 }, { "auxiliary_loss_clip": 0.01013435, "auxiliary_loss_mlp": 0.00756645, "balance_loss_clip": 1.018242, "balance_loss_mlp": 1.00045574, "epoch": 0.49407803763602476, "flos": 70066746875520.0, "grad_norm": 0.887457768912538, "language_loss": 0.63143855, "learning_rate": 2.1354373372657296e-06, "loss": 0.64913934, "num_input_tokens_seen": 88455190, "step": 4109, "time_per_iteration": 3.283637762069702 }, { "auxiliary_loss_clip": 0.01148048, "auxiliary_loss_mlp": 0.0104842, "balance_loss_clip": 1.0500102, "balance_loss_mlp": 1.03171873, "epoch": 0.49419828052666387, "flos": 24317485562880.0, "grad_norm": 1.6734531064854214, "language_loss": 0.70922512, "learning_rate": 2.1346601435287404e-06, "loss": 0.73118979, "num_input_tokens_seen": 88477460, "step": 4110, "time_per_iteration": 2.6184113025665283 }, { "auxiliary_loss_clip": 0.01125169, "auxiliary_loss_mlp": 0.01045069, "balance_loss_clip": 1.04582167, "balance_loss_mlp": 1.02758121, "epoch": 0.494318523417303, "flos": 29386060790400.0, "grad_norm": 2.150440000103263, "language_loss": 0.80421054, "learning_rate": 2.1338829293639144e-06, "loss": 0.82591295, "num_input_tokens_seen": 88497820, "step": 4111, "time_per_iteration": 2.7114226818084717 }, { "auxiliary_loss_clip": 0.01093171, "auxiliary_loss_mlp": 0.01045751, "balance_loss_clip": 1.04201138, "balance_loss_mlp": 1.02810812, "epoch": 0.49443876630794203, "flos": 15268284195840.0, "grad_norm": 2.0839601704580812, "language_loss": 0.82946765, "learning_rate": 2.1331056948891547e-06, "loss": 0.85085696, "num_input_tokens_seen": 88514920, "step": 4112, "time_per_iteration": 2.656765937805176 }, { "auxiliary_loss_clip": 0.01117654, "auxiliary_loss_mlp": 0.01040671, "balance_loss_clip": 1.04454207, "balance_loss_mlp": 1.02369571, "epoch": 0.49455900919858115, "flos": 12347453859840.0, "grad_norm": 3.3705777853112626, "language_loss": 0.76386487, "learning_rate": 2.1323284402223666e-06, "loss": 0.78544813, "num_input_tokens_seen": 88530910, "step": 4113, "time_per_iteration": 2.63942813873291 }, { "auxiliary_loss_clip": 0.01145314, "auxiliary_loss_mlp": 0.00773319, "balance_loss_clip": 1.05174398, "balance_loss_mlp": 1.00048065, "epoch": 0.4946792520892202, "flos": 22779610715520.0, "grad_norm": 1.9470286098850909, "language_loss": 0.88620663, "learning_rate": 2.1315511654814597e-06, "loss": 0.90539294, "num_input_tokens_seen": 88549320, "step": 4114, "time_per_iteration": 2.5880846977233887 }, { "auxiliary_loss_clip": 0.01114078, "auxiliary_loss_mlp": 0.01035975, "balance_loss_clip": 1.04489589, "balance_loss_mlp": 1.01957178, "epoch": 0.4947994949798593, "flos": 23148126299520.0, "grad_norm": 1.9373064493615728, "language_loss": 0.78431463, "learning_rate": 2.1307738707843456e-06, "loss": 0.8058151, "num_input_tokens_seen": 88568985, "step": 4115, "time_per_iteration": 3.539062976837158 }, { "auxiliary_loss_clip": 0.01143211, "auxiliary_loss_mlp": 0.01047969, "balance_loss_clip": 1.04921651, "balance_loss_mlp": 1.02802515, "epoch": 0.4949197378704984, "flos": 23659997063040.0, "grad_norm": 2.0371161721896014, "language_loss": 0.69511902, "learning_rate": 2.1299965562489385e-06, "loss": 0.71703076, "num_input_tokens_seen": 88588790, "step": 4116, "time_per_iteration": 3.6285762786865234 }, { "auxiliary_loss_clip": 0.0113148, "auxiliary_loss_mlp": 0.01047914, "balance_loss_clip": 1.04448843, "balance_loss_mlp": 1.02969861, "epoch": 0.4950399807611375, "flos": 26911493026560.0, "grad_norm": 1.5691642706398945, "language_loss": 0.78843594, "learning_rate": 2.129219221993158e-06, "loss": 0.81022984, "num_input_tokens_seen": 88613575, "step": 4117, "time_per_iteration": 2.6890275478363037 }, { "auxiliary_loss_clip": 0.0101796, "auxiliary_loss_mlp": 0.01001238, "balance_loss_clip": 1.01714134, "balance_loss_mlp": 0.99954528, "epoch": 0.4951602236517766, "flos": 67315270187520.0, "grad_norm": 0.7840227048170236, "language_loss": 0.59916306, "learning_rate": 2.128441868134924e-06, "loss": 0.61935496, "num_input_tokens_seen": 88675510, "step": 4118, "time_per_iteration": 3.28926420211792 }, { "auxiliary_loss_clip": 0.01115057, "auxiliary_loss_mlp": 0.01042145, "balance_loss_clip": 1.04619265, "balance_loss_mlp": 1.0242877, "epoch": 0.4952804665424157, "flos": 19901442758400.0, "grad_norm": 2.0671767138854698, "language_loss": 0.83241463, "learning_rate": 2.1276644947921606e-06, "loss": 0.85398662, "num_input_tokens_seen": 88694425, "step": 4119, "time_per_iteration": 2.669706344604492 }, { "auxiliary_loss_clip": 0.01138973, "auxiliary_loss_mlp": 0.01048185, "balance_loss_clip": 1.04875529, "balance_loss_mlp": 1.02944541, "epoch": 0.49540070943305475, "flos": 18806813740800.0, "grad_norm": 1.9032968779103854, "language_loss": 0.82494211, "learning_rate": 2.126887102082795e-06, "loss": 0.8468138, "num_input_tokens_seen": 88714450, "step": 4120, "time_per_iteration": 3.589172601699829 }, { "auxiliary_loss_clip": 0.01107745, "auxiliary_loss_mlp": 0.01050086, "balance_loss_clip": 1.04189825, "balance_loss_mlp": 1.03247917, "epoch": 0.49552095232369386, "flos": 24934179191040.0, "grad_norm": 1.711275510895337, "language_loss": 0.70400405, "learning_rate": 2.126109690124757e-06, "loss": 0.72558236, "num_input_tokens_seen": 88735265, "step": 4121, "time_per_iteration": 2.7092180252075195 }, { "auxiliary_loss_clip": 0.01096376, "auxiliary_loss_mlp": 0.01039574, "balance_loss_clip": 1.03992105, "balance_loss_mlp": 1.02227628, "epoch": 0.495641195214333, "flos": 22857249962880.0, "grad_norm": 1.767271564368651, "language_loss": 0.70918304, "learning_rate": 2.1253322590359786e-06, "loss": 0.73054254, "num_input_tokens_seen": 88754600, "step": 4122, "time_per_iteration": 2.7587342262268066 }, { "auxiliary_loss_clip": 0.01132191, "auxiliary_loss_mlp": 0.0104138, "balance_loss_clip": 1.04472721, "balance_loss_mlp": 1.02199674, "epoch": 0.49576143810497203, "flos": 25769748343680.0, "grad_norm": 2.454400783595832, "language_loss": 0.73731434, "learning_rate": 2.124554808934397e-06, "loss": 0.75905001, "num_input_tokens_seen": 88775180, "step": 4123, "time_per_iteration": 2.795976161956787 }, { "auxiliary_loss_clip": 0.01089943, "auxiliary_loss_mlp": 0.01044189, "balance_loss_clip": 1.03980303, "balance_loss_mlp": 1.02614069, "epoch": 0.49588168099561114, "flos": 22128838058880.0, "grad_norm": 2.316116043587993, "language_loss": 0.72716969, "learning_rate": 2.1237773399379496e-06, "loss": 0.74851102, "num_input_tokens_seen": 88796145, "step": 4124, "time_per_iteration": 2.7120792865753174 }, { "auxiliary_loss_clip": 0.01124563, "auxiliary_loss_mlp": 0.01048165, "balance_loss_clip": 1.0420028, "balance_loss_mlp": 1.0298897, "epoch": 0.49600192388625025, "flos": 24387331559040.0, "grad_norm": 5.803134947742804, "language_loss": 0.86886668, "learning_rate": 2.122999852164578e-06, "loss": 0.89059401, "num_input_tokens_seen": 88816765, "step": 4125, "time_per_iteration": 2.696789264678955 }, { "auxiliary_loss_clip": 0.01096519, "auxiliary_loss_mlp": 0.0104725, "balance_loss_clip": 1.04393721, "balance_loss_mlp": 1.02926159, "epoch": 0.4961221667768893, "flos": 22857429530880.0, "grad_norm": 2.2078433639483293, "language_loss": 0.58269453, "learning_rate": 2.122222345732227e-06, "loss": 0.60413229, "num_input_tokens_seen": 88836680, "step": 4126, "time_per_iteration": 2.7125465869903564 }, { "auxiliary_loss_clip": 0.0111301, "auxiliary_loss_mlp": 0.01046091, "balance_loss_clip": 1.04563653, "balance_loss_mlp": 1.02805424, "epoch": 0.4962424096675284, "flos": 17858089768320.0, "grad_norm": 2.1839493223367064, "language_loss": 0.82930481, "learning_rate": 2.121444820758843e-06, "loss": 0.85089582, "num_input_tokens_seen": 88855320, "step": 4127, "time_per_iteration": 3.7118263244628906 }, { "auxiliary_loss_clip": 0.01103359, "auxiliary_loss_mlp": 0.01048021, "balance_loss_clip": 1.04619193, "balance_loss_mlp": 1.02838731, "epoch": 0.49636265255816747, "flos": 21793611404160.0, "grad_norm": 2.475536934630817, "language_loss": 0.78843236, "learning_rate": 2.120667277362376e-06, "loss": 0.80994618, "num_input_tokens_seen": 88874035, "step": 4128, "time_per_iteration": 2.728752374649048 }, { "auxiliary_loss_clip": 0.01148117, "auxiliary_loss_mlp": 0.01047427, "balance_loss_clip": 1.04886818, "balance_loss_mlp": 1.02909255, "epoch": 0.4964828954488066, "flos": 16358603581440.0, "grad_norm": 3.0925887495083204, "language_loss": 0.85168666, "learning_rate": 2.1198897156607796e-06, "loss": 0.87364215, "num_input_tokens_seen": 88891390, "step": 4129, "time_per_iteration": 2.548368215560913 }, { "auxiliary_loss_clip": 0.01137263, "auxiliary_loss_mlp": 0.01040654, "balance_loss_clip": 1.04485846, "balance_loss_mlp": 1.02233148, "epoch": 0.4966031383394457, "flos": 24711101775360.0, "grad_norm": 2.010167804181844, "language_loss": 0.74005127, "learning_rate": 2.1191121357720085e-06, "loss": 0.76183045, "num_input_tokens_seen": 88909450, "step": 4130, "time_per_iteration": 2.6680567264556885 }, { "auxiliary_loss_clip": 0.01089653, "auxiliary_loss_mlp": 0.01044016, "balance_loss_clip": 1.04309785, "balance_loss_mlp": 1.0262537, "epoch": 0.49672338123008475, "flos": 22930615491840.0, "grad_norm": 1.709590564811981, "language_loss": 0.74790943, "learning_rate": 2.1183345378140206e-06, "loss": 0.7692461, "num_input_tokens_seen": 88929195, "step": 4131, "time_per_iteration": 2.769972324371338 }, { "auxiliary_loss_clip": 0.01033424, "auxiliary_loss_mlp": 0.01012738, "balance_loss_clip": 1.01114154, "balance_loss_mlp": 1.01087809, "epoch": 0.49684362412072386, "flos": 65976736844160.0, "grad_norm": 0.8507267066040779, "language_loss": 0.61992657, "learning_rate": 2.1175569219047783e-06, "loss": 0.64038819, "num_input_tokens_seen": 88990635, "step": 4132, "time_per_iteration": 3.278165340423584 }, { "auxiliary_loss_clip": 0.01149234, "auxiliary_loss_mlp": 0.01043003, "balance_loss_clip": 1.04937744, "balance_loss_mlp": 1.02648103, "epoch": 0.49696386701136297, "flos": 19971288754560.0, "grad_norm": 1.665539771306828, "language_loss": 0.7342304, "learning_rate": 2.1167792881622437e-06, "loss": 0.75615275, "num_input_tokens_seen": 89009655, "step": 4133, "time_per_iteration": 2.5937342643737793 }, { "auxiliary_loss_clip": 0.01118485, "auxiliary_loss_mlp": 0.01038563, "balance_loss_clip": 1.04519534, "balance_loss_mlp": 1.02133727, "epoch": 0.497084109902002, "flos": 24750819239040.0, "grad_norm": 2.4608473822387125, "language_loss": 0.80958301, "learning_rate": 2.116001636704384e-06, "loss": 0.83115351, "num_input_tokens_seen": 89030040, "step": 4134, "time_per_iteration": 2.637683391571045 }, { "auxiliary_loss_clip": 0.0110637, "auxiliary_loss_mlp": 0.01041231, "balance_loss_clip": 1.04512215, "balance_loss_mlp": 1.02274728, "epoch": 0.49720435279264114, "flos": 21871825269120.0, "grad_norm": 2.0709447718232776, "language_loss": 0.80143189, "learning_rate": 2.1152239676491685e-06, "loss": 0.82290792, "num_input_tokens_seen": 89048145, "step": 4135, "time_per_iteration": 2.6683788299560547 }, { "auxiliary_loss_clip": 0.01123973, "auxiliary_loss_mlp": 0.01045359, "balance_loss_clip": 1.04238343, "balance_loss_mlp": 1.0285387, "epoch": 0.49732459568328025, "flos": 23805794367360.0, "grad_norm": 2.2367514844075904, "language_loss": 0.73440045, "learning_rate": 2.114446281114569e-06, "loss": 0.75609374, "num_input_tokens_seen": 89067165, "step": 4136, "time_per_iteration": 2.636681079864502 }, { "auxiliary_loss_clip": 0.01112684, "auxiliary_loss_mlp": 0.01048442, "balance_loss_clip": 1.0437994, "balance_loss_mlp": 1.03144312, "epoch": 0.4974448385739193, "flos": 20047742853120.0, "grad_norm": 1.990146661318278, "language_loss": 0.76525426, "learning_rate": 2.1136685772185587e-06, "loss": 0.78686547, "num_input_tokens_seen": 89086190, "step": 4137, "time_per_iteration": 2.6413588523864746 }, { "auxiliary_loss_clip": 0.01122038, "auxiliary_loss_mlp": 0.00773688, "balance_loss_clip": 1.04165483, "balance_loss_mlp": 1.00047779, "epoch": 0.4975650814645584, "flos": 24821347593600.0, "grad_norm": 1.6259485479954512, "language_loss": 0.77941972, "learning_rate": 2.1128908560791163e-06, "loss": 0.79837692, "num_input_tokens_seen": 89106020, "step": 4138, "time_per_iteration": 2.5916907787323 }, { "auxiliary_loss_clip": 0.01147582, "auxiliary_loss_mlp": 0.0104613, "balance_loss_clip": 1.04870772, "balance_loss_mlp": 1.02871346, "epoch": 0.4976853243551975, "flos": 19829477859840.0, "grad_norm": 2.1886938188061857, "language_loss": 0.78082514, "learning_rate": 2.1121131178142203e-06, "loss": 0.80276227, "num_input_tokens_seen": 89125385, "step": 4139, "time_per_iteration": 2.543056011199951 }, { "auxiliary_loss_clip": 0.0112161, "auxiliary_loss_mlp": 0.01039002, "balance_loss_clip": 1.0424459, "balance_loss_mlp": 1.02212167, "epoch": 0.4978055672458366, "flos": 23142990654720.0, "grad_norm": 1.5055264338661132, "language_loss": 0.82422346, "learning_rate": 2.1113353625418544e-06, "loss": 0.84582961, "num_input_tokens_seen": 89143935, "step": 4140, "time_per_iteration": 2.646989583969116 }, { "auxiliary_loss_clip": 0.01128624, "auxiliary_loss_mlp": 0.0103769, "balance_loss_clip": 1.04834032, "balance_loss_mlp": 1.02238345, "epoch": 0.4979258101364757, "flos": 15559914718080.0, "grad_norm": 1.5759414089572799, "language_loss": 0.79029882, "learning_rate": 2.1105575903800017e-06, "loss": 0.81196189, "num_input_tokens_seen": 89162655, "step": 4141, "time_per_iteration": 3.5006203651428223 }, { "auxiliary_loss_clip": 0.01136619, "auxiliary_loss_mlp": 0.0104112, "balance_loss_clip": 1.04402697, "balance_loss_mlp": 1.02283287, "epoch": 0.4980460530271148, "flos": 26356169784960.0, "grad_norm": 1.8452506279856127, "language_loss": 0.8550784, "learning_rate": 2.1097798014466502e-06, "loss": 0.87685573, "num_input_tokens_seen": 89182255, "step": 4142, "time_per_iteration": 3.5222012996673584 }, { "auxiliary_loss_clip": 0.01139656, "auxiliary_loss_mlp": 0.01047707, "balance_loss_clip": 1.04706204, "balance_loss_mlp": 1.02936029, "epoch": 0.49816629591775385, "flos": 17274541415040.0, "grad_norm": 3.815564537553233, "language_loss": 0.59394717, "learning_rate": 2.109001995859791e-06, "loss": 0.61582077, "num_input_tokens_seen": 89201155, "step": 4143, "time_per_iteration": 2.6003005504608154 }, { "auxiliary_loss_clip": 0.0102225, "auxiliary_loss_mlp": 0.010108, "balance_loss_clip": 1.00994015, "balance_loss_mlp": 1.00878513, "epoch": 0.49828653880839296, "flos": 64930947344640.0, "grad_norm": 0.8120905401414557, "language_loss": 0.60050023, "learning_rate": 2.108224173737415e-06, "loss": 0.62083071, "num_input_tokens_seen": 89264455, "step": 4144, "time_per_iteration": 3.203622579574585 }, { "auxiliary_loss_clip": 0.01120111, "auxiliary_loss_mlp": 0.01037929, "balance_loss_clip": 1.04408026, "balance_loss_mlp": 1.0187484, "epoch": 0.498406781699032, "flos": 27484806003840.0, "grad_norm": 3.8182872223351816, "language_loss": 0.76248741, "learning_rate": 2.1074463351975183e-06, "loss": 0.78406781, "num_input_tokens_seen": 89283340, "step": 4145, "time_per_iteration": 2.6657392978668213 }, { "auxiliary_loss_clip": 0.01113237, "auxiliary_loss_mlp": 0.01041344, "balance_loss_clip": 1.04528451, "balance_loss_mlp": 1.02482176, "epoch": 0.49852702458967113, "flos": 31499870307840.0, "grad_norm": 2.334374437288614, "language_loss": 0.71553755, "learning_rate": 2.106668480358098e-06, "loss": 0.73708332, "num_input_tokens_seen": 89303565, "step": 4146, "time_per_iteration": 3.664424419403076 }, { "auxiliary_loss_clip": 0.01120711, "auxiliary_loss_mlp": 0.01044666, "balance_loss_clip": 1.04418778, "balance_loss_mlp": 1.02538943, "epoch": 0.49864726748031024, "flos": 22852868503680.0, "grad_norm": 1.8984856062608602, "language_loss": 0.70885795, "learning_rate": 2.105890609337154e-06, "loss": 0.73051172, "num_input_tokens_seen": 89322080, "step": 4147, "time_per_iteration": 2.6810972690582275 }, { "auxiliary_loss_clip": 0.01040486, "auxiliary_loss_mlp": 0.0100246, "balance_loss_clip": 1.00906348, "balance_loss_mlp": 1.00059986, "epoch": 0.4987675103709493, "flos": 70405708544640.0, "grad_norm": 0.6887796419753511, "language_loss": 0.63844156, "learning_rate": 2.1051127222526883e-06, "loss": 0.65887105, "num_input_tokens_seen": 89394195, "step": 4148, "time_per_iteration": 3.2953765392303467 }, { "auxiliary_loss_clip": 0.01131244, "auxiliary_loss_mlp": 0.01040124, "balance_loss_clip": 1.04734516, "balance_loss_mlp": 1.02358985, "epoch": 0.4988877532615884, "flos": 28767571482240.0, "grad_norm": 2.0058762928624354, "language_loss": 0.80781484, "learning_rate": 2.1043348192227067e-06, "loss": 0.82952857, "num_input_tokens_seen": 89414565, "step": 4149, "time_per_iteration": 2.6560893058776855 }, { "auxiliary_loss_clip": 0.01101489, "auxiliary_loss_mlp": 0.0105123, "balance_loss_clip": 1.04482543, "balance_loss_mlp": 1.03508937, "epoch": 0.4990079961522275, "flos": 16872700988160.0, "grad_norm": 1.7929704696974629, "language_loss": 0.6156919, "learning_rate": 2.1035569003652156e-06, "loss": 0.63721907, "num_input_tokens_seen": 89433195, "step": 4150, "time_per_iteration": 2.692054033279419 }, { "auxiliary_loss_clip": 0.01097422, "auxiliary_loss_mlp": 0.01049701, "balance_loss_clip": 1.04176855, "balance_loss_mlp": 1.02844572, "epoch": 0.4991282390428666, "flos": 13291042187520.0, "grad_norm": 2.043443498066861, "language_loss": 0.81680268, "learning_rate": 2.1027789657982255e-06, "loss": 0.83827388, "num_input_tokens_seen": 89447410, "step": 4151, "time_per_iteration": 2.6913390159606934 }, { "auxiliary_loss_clip": 0.01099174, "auxiliary_loss_mlp": 0.01038506, "balance_loss_clip": 1.04202735, "balance_loss_mlp": 1.02045751, "epoch": 0.4992484819335057, "flos": 21537496454400.0, "grad_norm": 2.081998493408619, "language_loss": 0.77118641, "learning_rate": 2.1020010156397482e-06, "loss": 0.7925632, "num_input_tokens_seen": 89464630, "step": 4152, "time_per_iteration": 2.6674561500549316 }, { "auxiliary_loss_clip": 0.01131966, "auxiliary_loss_mlp": 0.01041209, "balance_loss_clip": 1.04583669, "balance_loss_mlp": 1.02507961, "epoch": 0.4993687248241448, "flos": 24860095390080.0, "grad_norm": 1.4813882707726953, "language_loss": 0.77527308, "learning_rate": 2.101223050007797e-06, "loss": 0.79700482, "num_input_tokens_seen": 89483180, "step": 4153, "time_per_iteration": 3.547968626022339 }, { "auxiliary_loss_clip": 0.0103927, "auxiliary_loss_mlp": 0.01002742, "balance_loss_clip": 1.00829744, "balance_loss_mlp": 1.00066781, "epoch": 0.49948896771478385, "flos": 62941602453120.0, "grad_norm": 0.8181553412982417, "language_loss": 0.53774202, "learning_rate": 2.1004450690203904e-06, "loss": 0.55816209, "num_input_tokens_seen": 89539260, "step": 4154, "time_per_iteration": 3.1674468517303467 }, { "auxiliary_loss_clip": 0.01040079, "auxiliary_loss_mlp": 0.01001582, "balance_loss_clip": 1.0089848, "balance_loss_mlp": 0.99942476, "epoch": 0.49960921060542296, "flos": 68284213516800.0, "grad_norm": 0.8490833629300365, "language_loss": 0.6333549, "learning_rate": 2.099667072795546e-06, "loss": 0.65377152, "num_input_tokens_seen": 89601380, "step": 4155, "time_per_iteration": 3.1816821098327637 }, { "auxiliary_loss_clip": 0.01135398, "auxiliary_loss_mlp": 0.01038163, "balance_loss_clip": 1.04552007, "balance_loss_mlp": 1.02138996, "epoch": 0.49972945349606207, "flos": 23659350618240.0, "grad_norm": 1.803571023574061, "language_loss": 0.79672325, "learning_rate": 2.0988890614512864e-06, "loss": 0.8184588, "num_input_tokens_seen": 89621270, "step": 4156, "time_per_iteration": 2.615459442138672 }, { "auxiliary_loss_clip": 0.01125035, "auxiliary_loss_mlp": 0.01043365, "balance_loss_clip": 1.04731917, "balance_loss_mlp": 1.0251615, "epoch": 0.4998496963867011, "flos": 19755825022080.0, "grad_norm": 2.110977476245585, "language_loss": 0.84399211, "learning_rate": 2.098111035105635e-06, "loss": 0.86567616, "num_input_tokens_seen": 89639695, "step": 4157, "time_per_iteration": 2.623483180999756 }, { "auxiliary_loss_clip": 0.01096498, "auxiliary_loss_mlp": 0.01049661, "balance_loss_clip": 1.04432189, "balance_loss_mlp": 1.03192258, "epoch": 0.49996993927734024, "flos": 22265728790400.0, "grad_norm": 2.292748166779981, "language_loss": 0.73373938, "learning_rate": 2.0973329938766176e-06, "loss": 0.75520098, "num_input_tokens_seen": 89657125, "step": 4158, "time_per_iteration": 2.6813900470733643 }, { "auxiliary_loss_clip": 0.01142036, "auxiliary_loss_mlp": 0.01045238, "balance_loss_clip": 1.04705405, "balance_loss_mlp": 1.02739215, "epoch": 0.5000901821679793, "flos": 23327212533120.0, "grad_norm": 1.9404518083128668, "language_loss": 0.78714418, "learning_rate": 2.0965549378822618e-06, "loss": 0.80901682, "num_input_tokens_seen": 89678415, "step": 4159, "time_per_iteration": 2.6235668659210205 }, { "auxiliary_loss_clip": 0.01055484, "auxiliary_loss_mlp": 0.01057965, "balance_loss_clip": 1.03797531, "balance_loss_mlp": 1.03790164, "epoch": 0.5002104250586185, "flos": 20339014239360.0, "grad_norm": 2.3837613340550794, "language_loss": 0.84184217, "learning_rate": 2.095776867240599e-06, "loss": 0.86297673, "num_input_tokens_seen": 89695405, "step": 4160, "time_per_iteration": 2.9960243701934814 }, { "auxiliary_loss_clip": 0.01105022, "auxiliary_loss_mlp": 0.01042827, "balance_loss_clip": 1.04286778, "balance_loss_mlp": 1.0260303, "epoch": 0.5003306679492575, "flos": 13991372634240.0, "grad_norm": 2.225463932766762, "language_loss": 0.82806182, "learning_rate": 2.094998782069661e-06, "loss": 0.84954029, "num_input_tokens_seen": 89713110, "step": 4161, "time_per_iteration": 2.8152706623077393 }, { "auxiliary_loss_clip": 0.0114618, "auxiliary_loss_mlp": 0.01049798, "balance_loss_clip": 1.04677916, "balance_loss_mlp": 1.03164291, "epoch": 0.5004509108398966, "flos": 27672762896640.0, "grad_norm": 2.437239626760176, "language_loss": 0.75698924, "learning_rate": 2.0942206824874845e-06, "loss": 0.77894902, "num_input_tokens_seen": 89735885, "step": 4162, "time_per_iteration": 2.6374244689941406 }, { "auxiliary_loss_clip": 0.01131268, "auxiliary_loss_mlp": 0.01042848, "balance_loss_clip": 1.04764426, "balance_loss_mlp": 1.0250504, "epoch": 0.5005711537305357, "flos": 14976186796800.0, "grad_norm": 2.758938049127138, "language_loss": 0.79185164, "learning_rate": 2.093442568612105e-06, "loss": 0.81359285, "num_input_tokens_seen": 89753690, "step": 4163, "time_per_iteration": 2.577838659286499 }, { "auxiliary_loss_clip": 0.01155308, "auxiliary_loss_mlp": 0.01045691, "balance_loss_clip": 1.05018234, "balance_loss_mlp": 1.02736866, "epoch": 0.5006913966211748, "flos": 26503259978880.0, "grad_norm": 1.601181758911783, "language_loss": 0.8477999, "learning_rate": 2.0926644405615613e-06, "loss": 0.86980987, "num_input_tokens_seen": 89774590, "step": 4164, "time_per_iteration": 2.5815255641937256 }, { "auxiliary_loss_clip": 0.01105979, "auxiliary_loss_mlp": 0.01039096, "balance_loss_clip": 1.04337656, "balance_loss_mlp": 1.02123809, "epoch": 0.5008116395118138, "flos": 20449295971200.0, "grad_norm": 1.9265195029298208, "language_loss": 0.80952299, "learning_rate": 2.091886298453897e-06, "loss": 0.83097374, "num_input_tokens_seen": 89792775, "step": 4165, "time_per_iteration": 2.6609692573547363 }, { "auxiliary_loss_clip": 0.01131169, "auxiliary_loss_mlp": 0.01045843, "balance_loss_clip": 1.04625785, "balance_loss_mlp": 1.0288558, "epoch": 0.500931882402453, "flos": 21579871524480.0, "grad_norm": 2.162953821432956, "language_loss": 0.73021567, "learning_rate": 2.091108142407153e-06, "loss": 0.75198579, "num_input_tokens_seen": 89811515, "step": 4166, "time_per_iteration": 2.5813112258911133 }, { "auxiliary_loss_clip": 0.01019908, "auxiliary_loss_mlp": 0.01002863, "balance_loss_clip": 1.0117023, "balance_loss_mlp": 1.00078857, "epoch": 0.5010521252930921, "flos": 57785011925760.0, "grad_norm": 0.8356970710820231, "language_loss": 0.62371296, "learning_rate": 2.090329972539377e-06, "loss": 0.64394063, "num_input_tokens_seen": 89870080, "step": 4167, "time_per_iteration": 3.2313334941864014 }, { "auxiliary_loss_clip": 0.01054219, "auxiliary_loss_mlp": 0.01049899, "balance_loss_clip": 1.03804529, "balance_loss_mlp": 1.031708, "epoch": 0.5011723681837311, "flos": 18625500864000.0, "grad_norm": 2.049945413686618, "language_loss": 0.6876651, "learning_rate": 2.089551788968616e-06, "loss": 0.70870632, "num_input_tokens_seen": 89888045, "step": 4168, "time_per_iteration": 3.807245969772339 }, { "auxiliary_loss_clip": 0.01040748, "auxiliary_loss_mlp": 0.01003068, "balance_loss_clip": 1.00933385, "balance_loss_mlp": 1.00119638, "epoch": 0.5012926110743702, "flos": 55883146608000.0, "grad_norm": 0.8394076093122644, "language_loss": 0.60744774, "learning_rate": 2.08877359181292e-06, "loss": 0.62788588, "num_input_tokens_seen": 89944610, "step": 4169, "time_per_iteration": 4.208900213241577 }, { "auxiliary_loss_clip": 0.0111549, "auxiliary_loss_mlp": 0.01044664, "balance_loss_clip": 1.04280519, "balance_loss_mlp": 1.02703321, "epoch": 0.5014128539650093, "flos": 24238266117120.0, "grad_norm": 6.566607468479848, "language_loss": 0.85341954, "learning_rate": 2.0879953811903396e-06, "loss": 0.8750211, "num_input_tokens_seen": 89959495, "step": 4170, "time_per_iteration": 2.6957030296325684 }, { "auxiliary_loss_clip": 0.01135273, "auxiliary_loss_mlp": 0.01047574, "balance_loss_clip": 1.04656172, "balance_loss_mlp": 1.02975178, "epoch": 0.5015330968556484, "flos": 27527468382720.0, "grad_norm": 1.8651140550004568, "language_loss": 0.78709579, "learning_rate": 2.08721715721893e-06, "loss": 0.8089242, "num_input_tokens_seen": 89978820, "step": 4171, "time_per_iteration": 2.628260612487793 }, { "auxiliary_loss_clip": 0.01135619, "auxiliary_loss_mlp": 0.01040309, "balance_loss_clip": 1.04534018, "balance_loss_mlp": 1.02222455, "epoch": 0.5016533397462875, "flos": 23800802376960.0, "grad_norm": 2.229399474716718, "language_loss": 0.77210283, "learning_rate": 2.0864389200167477e-06, "loss": 0.7938621, "num_input_tokens_seen": 89997075, "step": 4172, "time_per_iteration": 3.7098519802093506 }, { "auxiliary_loss_clip": 0.01138286, "auxiliary_loss_mlp": 0.00773972, "balance_loss_clip": 1.04669976, "balance_loss_mlp": 1.00051975, "epoch": 0.5017735826369266, "flos": 25295009264640.0, "grad_norm": 2.073904409331647, "language_loss": 0.79159629, "learning_rate": 2.0856606697018504e-06, "loss": 0.81071883, "num_input_tokens_seen": 90015085, "step": 4173, "time_per_iteration": 2.7801926136016846 }, { "auxiliary_loss_clip": 0.01114346, "auxiliary_loss_mlp": 0.01041465, "balance_loss_clip": 1.04072046, "balance_loss_mlp": 1.02156842, "epoch": 0.5018938255275657, "flos": 16873203778560.0, "grad_norm": 2.4846409941359884, "language_loss": 0.73165739, "learning_rate": 2.084882406392297e-06, "loss": 0.75321555, "num_input_tokens_seen": 90033045, "step": 4174, "time_per_iteration": 2.6039416790008545 }, { "auxiliary_loss_clip": 0.01133964, "auxiliary_loss_mlp": 0.01038218, "balance_loss_clip": 1.04552281, "balance_loss_mlp": 1.02076578, "epoch": 0.5020140684182047, "flos": 25515429073920.0, "grad_norm": 4.758476018568063, "language_loss": 0.71660048, "learning_rate": 2.0841041302061496e-06, "loss": 0.73832226, "num_input_tokens_seen": 90052505, "step": 4175, "time_per_iteration": 2.6371254920959473 }, { "auxiliary_loss_clip": 0.0111371, "auxiliary_loss_mlp": 0.01045147, "balance_loss_clip": 1.04261255, "balance_loss_mlp": 1.02644265, "epoch": 0.5021343113088439, "flos": 23659278791040.0, "grad_norm": 1.8772122057714604, "language_loss": 0.75500804, "learning_rate": 2.083325841261473e-06, "loss": 0.77659661, "num_input_tokens_seen": 90071565, "step": 4176, "time_per_iteration": 2.6394035816192627 }, { "auxiliary_loss_clip": 0.01113511, "auxiliary_loss_mlp": 0.010465, "balance_loss_clip": 1.04046345, "balance_loss_mlp": 1.02764153, "epoch": 0.502254554199483, "flos": 24534673148160.0, "grad_norm": 2.3433640268812335, "language_loss": 0.66370511, "learning_rate": 2.0825475396763322e-06, "loss": 0.68530524, "num_input_tokens_seen": 90092215, "step": 4177, "time_per_iteration": 2.634178638458252 }, { "auxiliary_loss_clip": 0.01051365, "auxiliary_loss_mlp": 0.01043583, "balance_loss_clip": 1.03733492, "balance_loss_mlp": 1.02492678, "epoch": 0.502374797090122, "flos": 34240285607040.0, "grad_norm": 1.7693015116091424, "language_loss": 0.65501446, "learning_rate": 2.081769225568796e-06, "loss": 0.67596394, "num_input_tokens_seen": 90114665, "step": 4178, "time_per_iteration": 2.9234490394592285 }, { "auxiliary_loss_clip": 0.0113696, "auxiliary_loss_mlp": 0.01044037, "balance_loss_clip": 1.04482782, "balance_loss_mlp": 1.02422428, "epoch": 0.5024950399807612, "flos": 26031106679040.0, "grad_norm": 1.6097067276364132, "language_loss": 0.76046985, "learning_rate": 2.0809908990569327e-06, "loss": 0.78227979, "num_input_tokens_seen": 90136445, "step": 4179, "time_per_iteration": 3.512691020965576 }, { "auxiliary_loss_clip": 0.0111806, "auxiliary_loss_mlp": 0.01044621, "balance_loss_clip": 1.04325676, "balance_loss_mlp": 1.02716911, "epoch": 0.5026152828714002, "flos": 21252438120960.0, "grad_norm": 1.7292040536194493, "language_loss": 0.79100561, "learning_rate": 2.0802125602588146e-06, "loss": 0.81263244, "num_input_tokens_seen": 90155710, "step": 4180, "time_per_iteration": 2.577754497528076 }, { "auxiliary_loss_clip": 0.01146929, "auxiliary_loss_mlp": 0.01037252, "balance_loss_clip": 1.04754758, "balance_loss_mlp": 1.0193584, "epoch": 0.5027355257620393, "flos": 30956111245440.0, "grad_norm": 5.496890940881115, "language_loss": 0.66197926, "learning_rate": 2.0794342092925146e-06, "loss": 0.68382108, "num_input_tokens_seen": 90176845, "step": 4181, "time_per_iteration": 2.620676040649414 }, { "auxiliary_loss_clip": 0.01134379, "auxiliary_loss_mlp": 0.010376, "balance_loss_clip": 1.04705572, "balance_loss_mlp": 1.0209465, "epoch": 0.5028557686526784, "flos": 24791147233920.0, "grad_norm": 2.160189028384986, "language_loss": 0.68007255, "learning_rate": 2.078655846276108e-06, "loss": 0.7017923, "num_input_tokens_seen": 90197175, "step": 4182, "time_per_iteration": 2.615729570388794 }, { "auxiliary_loss_clip": 0.01120292, "auxiliary_loss_mlp": 0.01044294, "balance_loss_clip": 1.0446136, "balance_loss_mlp": 1.02433813, "epoch": 0.5029760115433175, "flos": 22966992990720.0, "grad_norm": 1.9727345296306058, "language_loss": 0.68835676, "learning_rate": 2.0778774713276727e-06, "loss": 0.7100026, "num_input_tokens_seen": 90216650, "step": 4183, "time_per_iteration": 2.575065851211548 }, { "auxiliary_loss_clip": 0.01133588, "auxiliary_loss_mlp": 0.01042719, "balance_loss_clip": 1.04284561, "balance_loss_mlp": 1.02359748, "epoch": 0.5030962544339566, "flos": 15305164485120.0, "grad_norm": 2.776752063095021, "language_loss": 0.67940199, "learning_rate": 2.077099084565287e-06, "loss": 0.70116508, "num_input_tokens_seen": 90234055, "step": 4184, "time_per_iteration": 2.5368311405181885 }, { "auxiliary_loss_clip": 0.01115801, "auxiliary_loss_mlp": 0.01049791, "balance_loss_clip": 1.04155409, "balance_loss_mlp": 1.03062177, "epoch": 0.5032164973245957, "flos": 24494847943680.0, "grad_norm": 2.8751174959981607, "language_loss": 0.65340841, "learning_rate": 2.0763206861070313e-06, "loss": 0.67506433, "num_input_tokens_seen": 90253115, "step": 4185, "time_per_iteration": 2.6278574466705322 }, { "auxiliary_loss_clip": 0.01145833, "auxiliary_loss_mlp": 0.01040621, "balance_loss_clip": 1.04509258, "balance_loss_mlp": 1.02201247, "epoch": 0.5033367402152348, "flos": 16213452721920.0, "grad_norm": 1.934073084508032, "language_loss": 0.75295246, "learning_rate": 2.0755422760709876e-06, "loss": 0.77481699, "num_input_tokens_seen": 90270515, "step": 4186, "time_per_iteration": 2.5177881717681885 }, { "auxiliary_loss_clip": 0.01088033, "auxiliary_loss_mlp": 0.01055315, "balance_loss_clip": 1.03901148, "balance_loss_mlp": 1.03720689, "epoch": 0.5034569831058738, "flos": 21391375927680.0, "grad_norm": 2.9770506679247983, "language_loss": 0.76958942, "learning_rate": 2.0747638545752417e-06, "loss": 0.7910229, "num_input_tokens_seen": 90289075, "step": 4187, "time_per_iteration": 2.6579678058624268 }, { "auxiliary_loss_clip": 0.01119782, "auxiliary_loss_mlp": 0.01050062, "balance_loss_clip": 1.04469895, "balance_loss_mlp": 1.03128648, "epoch": 0.503577225996513, "flos": 20558751690240.0, "grad_norm": 2.0215310797850634, "language_loss": 0.83350062, "learning_rate": 2.073985421737878e-06, "loss": 0.8551991, "num_input_tokens_seen": 90306385, "step": 4188, "time_per_iteration": 2.5942418575286865 }, { "auxiliary_loss_clip": 0.01138564, "auxiliary_loss_mlp": 0.01051767, "balance_loss_clip": 1.04744112, "balance_loss_mlp": 1.03267014, "epoch": 0.5036974688871521, "flos": 27229157930880.0, "grad_norm": 2.0968667100653393, "language_loss": 0.7381472, "learning_rate": 2.0732069776769844e-06, "loss": 0.76005054, "num_input_tokens_seen": 90323795, "step": 4189, "time_per_iteration": 2.635875940322876 }, { "auxiliary_loss_clip": 0.01150319, "auxiliary_loss_mlp": 0.01051661, "balance_loss_clip": 1.05029655, "balance_loss_mlp": 1.03386259, "epoch": 0.5038177117777911, "flos": 20412164286720.0, "grad_norm": 1.8969424625877855, "language_loss": 0.73162633, "learning_rate": 2.072428522510651e-06, "loss": 0.75364614, "num_input_tokens_seen": 90340360, "step": 4190, "time_per_iteration": 2.557013750076294 }, { "auxiliary_loss_clip": 0.0110184, "auxiliary_loss_mlp": 0.01043463, "balance_loss_clip": 1.04252577, "balance_loss_mlp": 1.0259397, "epoch": 0.5039379546684303, "flos": 21907987286400.0, "grad_norm": 3.3503286611991143, "language_loss": 0.77078795, "learning_rate": 2.071650056356968e-06, "loss": 0.79224098, "num_input_tokens_seen": 90357900, "step": 4191, "time_per_iteration": 2.6316466331481934 }, { "auxiliary_loss_clip": 0.01148682, "auxiliary_loss_mlp": 0.0104296, "balance_loss_clip": 1.04973435, "balance_loss_mlp": 1.02537644, "epoch": 0.5040581975590693, "flos": 20010718909440.0, "grad_norm": 2.510382948601656, "language_loss": 0.80067927, "learning_rate": 2.070871579334028e-06, "loss": 0.82259566, "num_input_tokens_seen": 90377010, "step": 4192, "time_per_iteration": 2.544612169265747 }, { "auxiliary_loss_clip": 0.01147419, "auxiliary_loss_mlp": 0.01045402, "balance_loss_clip": 1.04679191, "balance_loss_mlp": 1.02746117, "epoch": 0.5041784404497084, "flos": 20959837931520.0, "grad_norm": 1.6839918289444131, "language_loss": 0.71668398, "learning_rate": 2.0700930915599264e-06, "loss": 0.73861217, "num_input_tokens_seen": 90396740, "step": 4193, "time_per_iteration": 2.562742233276367 }, { "auxiliary_loss_clip": 0.01146506, "auxiliary_loss_mlp": 0.01040456, "balance_loss_clip": 1.04777527, "balance_loss_mlp": 1.02338552, "epoch": 0.5042986833403476, "flos": 12495082757760.0, "grad_norm": 2.046062924635481, "language_loss": 0.780864, "learning_rate": 2.0693145931527583e-06, "loss": 0.80273366, "num_input_tokens_seen": 90413220, "step": 4194, "time_per_iteration": 3.473066568374634 }, { "auxiliary_loss_clip": 0.01114331, "auxiliary_loss_mlp": 0.01040447, "balance_loss_clip": 1.04149747, "balance_loss_mlp": 1.02304256, "epoch": 0.5044189262309866, "flos": 29202305788800.0, "grad_norm": 1.5179163136831628, "language_loss": 0.77787268, "learning_rate": 2.068536084230622e-06, "loss": 0.79942048, "num_input_tokens_seen": 90435085, "step": 4195, "time_per_iteration": 3.6358392238616943 }, { "auxiliary_loss_clip": 0.0113472, "auxiliary_loss_mlp": 0.01045889, "balance_loss_clip": 1.04597187, "balance_loss_mlp": 1.02796006, "epoch": 0.5045391691216257, "flos": 23873198238720.0, "grad_norm": 2.024748653700091, "language_loss": 0.88670599, "learning_rate": 2.067757564911616e-06, "loss": 0.90851212, "num_input_tokens_seen": 90453660, "step": 4196, "time_per_iteration": 2.586853504180908 }, { "auxiliary_loss_clip": 0.01130124, "auxiliary_loss_mlp": 0.00773957, "balance_loss_clip": 1.04736543, "balance_loss_mlp": 1.00053, "epoch": 0.5046594120122648, "flos": 24644990793600.0, "grad_norm": 2.5896291664215783, "language_loss": 0.92720449, "learning_rate": 2.0669790353138407e-06, "loss": 0.94624531, "num_input_tokens_seen": 90472625, "step": 4197, "time_per_iteration": 2.664715051651001 }, { "auxiliary_loss_clip": 0.01107503, "auxiliary_loss_mlp": 0.00774207, "balance_loss_clip": 1.04480946, "balance_loss_mlp": 1.00047743, "epoch": 0.5047796549029039, "flos": 23362835846400.0, "grad_norm": 2.3987444139614706, "language_loss": 0.73064303, "learning_rate": 2.0662004955553995e-06, "loss": 0.74946016, "num_input_tokens_seen": 90492325, "step": 4198, "time_per_iteration": 3.668653726577759 }, { "auxiliary_loss_clip": 0.0111685, "auxiliary_loss_mlp": 0.0105336, "balance_loss_clip": 1.04278803, "balance_loss_mlp": 1.03612232, "epoch": 0.5048998977935429, "flos": 17304095329920.0, "grad_norm": 1.9947477430384561, "language_loss": 0.77085418, "learning_rate": 2.065421945754395e-06, "loss": 0.79255629, "num_input_tokens_seen": 90510055, "step": 4199, "time_per_iteration": 2.599052906036377 }, { "auxiliary_loss_clip": 0.01098652, "auxiliary_loss_mlp": 0.01045392, "balance_loss_clip": 1.04536283, "balance_loss_mlp": 1.02766538, "epoch": 0.505020140684182, "flos": 34856979235200.0, "grad_norm": 1.7289640900528633, "language_loss": 0.78151619, "learning_rate": 2.0646433860289344e-06, "loss": 0.80295664, "num_input_tokens_seen": 90528980, "step": 4200, "time_per_iteration": 2.7793285846710205 }, { "auxiliary_loss_clip": 0.01137019, "auxiliary_loss_mlp": 0.00774424, "balance_loss_clip": 1.04571462, "balance_loss_mlp": 1.00051379, "epoch": 0.5051403835748212, "flos": 24863974058880.0, "grad_norm": 2.003496484189159, "language_loss": 0.82848042, "learning_rate": 2.0638648164971233e-06, "loss": 0.84759486, "num_input_tokens_seen": 90547445, "step": 4201, "time_per_iteration": 2.6039538383483887 }, { "auxiliary_loss_clip": 0.01120215, "auxiliary_loss_mlp": 0.01048092, "balance_loss_clip": 1.0483321, "balance_loss_mlp": 1.03233838, "epoch": 0.5052606264654602, "flos": 20959694277120.0, "grad_norm": 2.3132784657334837, "language_loss": 0.88350904, "learning_rate": 2.06308623727707e-06, "loss": 0.90519214, "num_input_tokens_seen": 90567545, "step": 4202, "time_per_iteration": 2.6253623962402344 }, { "auxiliary_loss_clip": 0.01132285, "auxiliary_loss_mlp": 0.01047983, "balance_loss_clip": 1.04635811, "balance_loss_mlp": 1.02915978, "epoch": 0.5053808693560993, "flos": 19642382893440.0, "grad_norm": 2.582318842254593, "language_loss": 0.76167667, "learning_rate": 2.0623076484868846e-06, "loss": 0.78347945, "num_input_tokens_seen": 90585000, "step": 4203, "time_per_iteration": 2.574629306793213 }, { "auxiliary_loss_clip": 0.0102368, "auxiliary_loss_mlp": 0.01006716, "balance_loss_clip": 1.01567304, "balance_loss_mlp": 1.00511861, "epoch": 0.5055011122467384, "flos": 67504915019520.0, "grad_norm": 0.8266073963206857, "language_loss": 0.606861, "learning_rate": 2.061529050244679e-06, "loss": 0.62716496, "num_input_tokens_seen": 90644745, "step": 4204, "time_per_iteration": 4.070957660675049 }, { "auxiliary_loss_clip": 0.01111475, "auxiliary_loss_mlp": 0.01049078, "balance_loss_clip": 1.0419457, "balance_loss_mlp": 1.03257966, "epoch": 0.5056213551373775, "flos": 16872952383360.0, "grad_norm": 2.3624791303955273, "language_loss": 0.74009556, "learning_rate": 2.060750442668565e-06, "loss": 0.76170111, "num_input_tokens_seen": 90662500, "step": 4205, "time_per_iteration": 2.630254030227661 }, { "auxiliary_loss_clip": 0.01135212, "auxiliary_loss_mlp": 0.0104257, "balance_loss_clip": 1.0479579, "balance_loss_mlp": 1.02526093, "epoch": 0.5057415980280165, "flos": 15334179696000.0, "grad_norm": 2.54251985593101, "language_loss": 0.64049411, "learning_rate": 2.059971825876657e-06, "loss": 0.66227198, "num_input_tokens_seen": 90677010, "step": 4206, "time_per_iteration": 2.5390191078186035 }, { "auxiliary_loss_clip": 0.01138115, "auxiliary_loss_mlp": 0.01045798, "balance_loss_clip": 1.04670393, "balance_loss_mlp": 1.02907312, "epoch": 0.5058618409186557, "flos": 19025976574080.0, "grad_norm": 1.9704301282245735, "language_loss": 0.76310664, "learning_rate": 2.0591931999870713e-06, "loss": 0.78494573, "num_input_tokens_seen": 90695935, "step": 4207, "time_per_iteration": 2.6382455825805664 }, { "auxiliary_loss_clip": 0.01031432, "auxiliary_loss_mlp": 0.00999792, "balance_loss_clip": 1.01396394, "balance_loss_mlp": 0.99808711, "epoch": 0.5059820838092948, "flos": 63453114080640.0, "grad_norm": 0.8216230041767848, "language_loss": 0.57545173, "learning_rate": 2.0584145651179234e-06, "loss": 0.59576398, "num_input_tokens_seen": 90751645, "step": 4208, "time_per_iteration": 3.190768003463745 }, { "auxiliary_loss_clip": 0.01124926, "auxiliary_loss_mlp": 0.00773153, "balance_loss_clip": 1.04788184, "balance_loss_mlp": 1.00048137, "epoch": 0.5061023266999338, "flos": 15441803821440.0, "grad_norm": 2.5096845865745046, "language_loss": 0.80286902, "learning_rate": 2.0576359213873327e-06, "loss": 0.82184982, "num_input_tokens_seen": 90766795, "step": 4209, "time_per_iteration": 2.5928897857666016 }, { "auxiliary_loss_clip": 0.01131503, "auxiliary_loss_mlp": 0.01039942, "balance_loss_clip": 1.04452658, "balance_loss_mlp": 1.02105904, "epoch": 0.506222569590573, "flos": 22451063990400.0, "grad_norm": 3.36436141572896, "language_loss": 0.70402396, "learning_rate": 2.056857268913419e-06, "loss": 0.72573835, "num_input_tokens_seen": 90786845, "step": 4210, "time_per_iteration": 2.709261894226074 }, { "auxiliary_loss_clip": 0.01132705, "auxiliary_loss_mlp": 0.01048021, "balance_loss_clip": 1.04510021, "balance_loss_mlp": 1.02975821, "epoch": 0.506342812481212, "flos": 17558665994880.0, "grad_norm": 2.2369675549055614, "language_loss": 0.84123921, "learning_rate": 2.056078607814303e-06, "loss": 0.86304641, "num_input_tokens_seen": 90802630, "step": 4211, "time_per_iteration": 2.562121868133545 }, { "auxiliary_loss_clip": 0.01135513, "auxiliary_loss_mlp": 0.01041183, "balance_loss_clip": 1.04819202, "balance_loss_mlp": 1.02344489, "epoch": 0.5064630553718511, "flos": 23402050519680.0, "grad_norm": 2.40206502180735, "language_loss": 0.78620243, "learning_rate": 2.055299938208106e-06, "loss": 0.80796939, "num_input_tokens_seen": 90823620, "step": 4212, "time_per_iteration": 2.618826389312744 }, { "auxiliary_loss_clip": 0.01137795, "auxiliary_loss_mlp": 0.01048949, "balance_loss_clip": 1.04781199, "balance_loss_mlp": 1.03056693, "epoch": 0.5065832982624903, "flos": 23987035416960.0, "grad_norm": 1.7191295597553817, "language_loss": 0.86300552, "learning_rate": 2.0545212602129526e-06, "loss": 0.88487291, "num_input_tokens_seen": 90843475, "step": 4213, "time_per_iteration": 2.596872329711914 }, { "auxiliary_loss_clip": 0.01115488, "auxiliary_loss_mlp": 0.01043173, "balance_loss_clip": 1.04220676, "balance_loss_mlp": 1.02465999, "epoch": 0.5067035411531293, "flos": 21503058289920.0, "grad_norm": 1.908490077062281, "language_loss": 0.66382736, "learning_rate": 2.0537425739469673e-06, "loss": 0.68541402, "num_input_tokens_seen": 90862410, "step": 4214, "time_per_iteration": 2.5952131748199463 }, { "auxiliary_loss_clip": 0.01034403, "auxiliary_loss_mlp": 0.01000876, "balance_loss_clip": 1.01221704, "balance_loss_mlp": 0.99887317, "epoch": 0.5068237840437684, "flos": 65934397687680.0, "grad_norm": 0.8757840552792043, "language_loss": 0.59451264, "learning_rate": 2.052963879528276e-06, "loss": 0.61486536, "num_input_tokens_seen": 90922280, "step": 4215, "time_per_iteration": 3.1415445804595947 }, { "auxiliary_loss_clip": 0.01134207, "auxiliary_loss_mlp": 0.01043467, "balance_loss_clip": 1.04595196, "balance_loss_mlp": 1.02622938, "epoch": 0.5069440269344075, "flos": 27264206626560.0, "grad_norm": 1.936339017327669, "language_loss": 0.76864612, "learning_rate": 2.052185177075007e-06, "loss": 0.79042292, "num_input_tokens_seen": 90941850, "step": 4216, "time_per_iteration": 2.629335403442383 }, { "auxiliary_loss_clip": 0.01136003, "auxiliary_loss_mlp": 0.01044291, "balance_loss_clip": 1.04557967, "balance_loss_mlp": 1.02673125, "epoch": 0.5070642698250466, "flos": 23366319465600.0, "grad_norm": 2.0447945315500613, "language_loss": 0.82727998, "learning_rate": 2.051406466705288e-06, "loss": 0.84908295, "num_input_tokens_seen": 90961390, "step": 4217, "time_per_iteration": 2.6310675144195557 }, { "auxiliary_loss_clip": 0.01145822, "auxiliary_loss_mlp": 0.01043925, "balance_loss_clip": 1.04711151, "balance_loss_mlp": 1.02647305, "epoch": 0.5071845127156857, "flos": 20340127560960.0, "grad_norm": 1.9549016282755014, "language_loss": 0.80957222, "learning_rate": 2.0506277485372486e-06, "loss": 0.83146971, "num_input_tokens_seen": 90980215, "step": 4218, "time_per_iteration": 2.5777249336242676 }, { "auxiliary_loss_clip": 0.011288, "auxiliary_loss_mlp": 0.01041778, "balance_loss_clip": 1.04506898, "balance_loss_mlp": 1.02350378, "epoch": 0.5073047556063248, "flos": 12092955022080.0, "grad_norm": 2.289819763426504, "language_loss": 0.67198396, "learning_rate": 2.04984902268902e-06, "loss": 0.6936897, "num_input_tokens_seen": 90997415, "step": 4219, "time_per_iteration": 3.445404291152954 }, { "auxiliary_loss_clip": 0.01143879, "auxiliary_loss_mlp": 0.01045731, "balance_loss_clip": 1.0479759, "balance_loss_mlp": 1.02784932, "epoch": 0.5074249984969639, "flos": 19682854542720.0, "grad_norm": 2.2336853825246727, "language_loss": 0.75780702, "learning_rate": 2.0490702892787345e-06, "loss": 0.77970302, "num_input_tokens_seen": 91016475, "step": 4220, "time_per_iteration": 3.610227108001709 }, { "auxiliary_loss_clip": 0.01123237, "auxiliary_loss_mlp": 0.0104178, "balance_loss_clip": 1.04231095, "balance_loss_mlp": 1.02554369, "epoch": 0.5075452413876029, "flos": 28765703975040.0, "grad_norm": 1.6078856852414318, "language_loss": 0.62338734, "learning_rate": 2.0482915484245246e-06, "loss": 0.64503747, "num_input_tokens_seen": 91038095, "step": 4221, "time_per_iteration": 2.6366865634918213 }, { "auxiliary_loss_clip": 0.01088447, "auxiliary_loss_mlp": 0.01047075, "balance_loss_clip": 1.03919673, "balance_loss_mlp": 1.0284071, "epoch": 0.5076654842782421, "flos": 20339445202560.0, "grad_norm": 2.379795554025596, "language_loss": 0.84305668, "learning_rate": 2.047512800244526e-06, "loss": 0.86441195, "num_input_tokens_seen": 91053360, "step": 4222, "time_per_iteration": 2.6604831218719482 }, { "auxiliary_loss_clip": 0.01136091, "auxiliary_loss_mlp": 0.01042963, "balance_loss_clip": 1.04652965, "balance_loss_mlp": 1.02596331, "epoch": 0.5077857271688812, "flos": 26359653404160.0, "grad_norm": 1.974206567570211, "language_loss": 0.79041898, "learning_rate": 2.046734044856873e-06, "loss": 0.81220949, "num_input_tokens_seen": 91072770, "step": 4223, "time_per_iteration": 2.616780996322632 }, { "auxiliary_loss_clip": 0.01133917, "auxiliary_loss_mlp": 0.01044357, "balance_loss_clip": 1.04601312, "balance_loss_mlp": 1.02685714, "epoch": 0.5079059700595202, "flos": 21798962530560.0, "grad_norm": 1.9607158122582704, "language_loss": 0.81260508, "learning_rate": 2.045955282379702e-06, "loss": 0.83438778, "num_input_tokens_seen": 91091430, "step": 4224, "time_per_iteration": 3.5608131885528564 }, { "auxiliary_loss_clip": 0.01133278, "auxiliary_loss_mlp": 0.01047494, "balance_loss_clip": 1.04412699, "balance_loss_mlp": 1.02831268, "epoch": 0.5080262129501594, "flos": 13187943175680.0, "grad_norm": 7.500932128488266, "language_loss": 0.75902665, "learning_rate": 2.045176512931152e-06, "loss": 0.78083432, "num_input_tokens_seen": 91106060, "step": 4225, "time_per_iteration": 2.5449421405792236 }, { "auxiliary_loss_clip": 0.01110004, "auxiliary_loss_mlp": 0.01046624, "balance_loss_clip": 1.04411149, "balance_loss_mlp": 1.03087664, "epoch": 0.5081464558407984, "flos": 25301473712640.0, "grad_norm": 2.386962419960464, "language_loss": 0.75961399, "learning_rate": 2.0443977366293604e-06, "loss": 0.78118026, "num_input_tokens_seen": 91124100, "step": 4226, "time_per_iteration": 2.6703670024871826 }, { "auxiliary_loss_clip": 0.01085379, "auxiliary_loss_mlp": 0.01049185, "balance_loss_clip": 1.04280162, "balance_loss_mlp": 1.0299449, "epoch": 0.5082666987314375, "flos": 30951226995840.0, "grad_norm": 1.636069402092036, "language_loss": 0.77256835, "learning_rate": 2.043618953592468e-06, "loss": 0.79391396, "num_input_tokens_seen": 91146555, "step": 4227, "time_per_iteration": 2.8143310546875 }, { "auxiliary_loss_clip": 0.01125891, "auxiliary_loss_mlp": 0.01053555, "balance_loss_clip": 1.04638207, "balance_loss_mlp": 1.03495836, "epoch": 0.5083869416220766, "flos": 19682495406720.0, "grad_norm": 1.5959435040092291, "language_loss": 0.81077147, "learning_rate": 2.0428401639386144e-06, "loss": 0.8325659, "num_input_tokens_seen": 91167120, "step": 4228, "time_per_iteration": 2.624312162399292 }, { "auxiliary_loss_clip": 0.01020399, "auxiliary_loss_mlp": 0.01003199, "balance_loss_clip": 1.0095526, "balance_loss_mlp": 1.00147021, "epoch": 0.5085071845127157, "flos": 71817535589760.0, "grad_norm": 0.8239753959921888, "language_loss": 0.58142626, "learning_rate": 2.042061367785943e-06, "loss": 0.60166228, "num_input_tokens_seen": 91220260, "step": 4229, "time_per_iteration": 3.1782419681549072 }, { "auxiliary_loss_clip": 0.01111284, "auxiliary_loss_mlp": 0.01040678, "balance_loss_clip": 1.04283214, "balance_loss_mlp": 1.02324986, "epoch": 0.5086274274033548, "flos": 35951608252800.0, "grad_norm": 2.5670232829467547, "language_loss": 0.75242203, "learning_rate": 2.041282565252594e-06, "loss": 0.77394164, "num_input_tokens_seen": 91240425, "step": 4230, "time_per_iteration": 3.6925032138824463 }, { "auxiliary_loss_clip": 0.01106504, "auxiliary_loss_mlp": 0.01050598, "balance_loss_clip": 1.04138029, "balance_loss_mlp": 1.03072572, "epoch": 0.5087476702939938, "flos": 23513732881920.0, "grad_norm": 1.8910659711770983, "language_loss": 0.77127063, "learning_rate": 2.040503756456714e-06, "loss": 0.79284167, "num_input_tokens_seen": 91259635, "step": 4231, "time_per_iteration": 2.7178263664245605 }, { "auxiliary_loss_clip": 0.01127593, "auxiliary_loss_mlp": 0.01053954, "balance_loss_clip": 1.04462099, "balance_loss_mlp": 1.03476143, "epoch": 0.508867913184633, "flos": 15122091841920.0, "grad_norm": 3.3301742092645097, "language_loss": 0.7855767, "learning_rate": 2.0397249415164456e-06, "loss": 0.80739218, "num_input_tokens_seen": 91276990, "step": 4232, "time_per_iteration": 2.556114673614502 }, { "auxiliary_loss_clip": 0.01113166, "auxiliary_loss_mlp": 0.01052083, "balance_loss_clip": 1.04124618, "balance_loss_mlp": 1.03358197, "epoch": 0.508988156075272, "flos": 25885309374720.0, "grad_norm": 1.7432185887319729, "language_loss": 0.79986018, "learning_rate": 2.0389461205499354e-06, "loss": 0.82151264, "num_input_tokens_seen": 91296125, "step": 4233, "time_per_iteration": 2.6399149894714355 }, { "auxiliary_loss_clip": 0.0110621, "auxiliary_loss_mlp": 0.01036849, "balance_loss_clip": 1.04144919, "balance_loss_mlp": 1.01973104, "epoch": 0.5091083989659111, "flos": 13844857057920.0, "grad_norm": 1.8630619634034464, "language_loss": 0.73935664, "learning_rate": 2.03816729367533e-06, "loss": 0.76078725, "num_input_tokens_seen": 91314280, "step": 4234, "time_per_iteration": 2.642101526260376 }, { "auxiliary_loss_clip": 0.0112486, "auxiliary_loss_mlp": 0.01041587, "balance_loss_clip": 1.04830968, "balance_loss_mlp": 1.02344286, "epoch": 0.5092286418565503, "flos": 21104881050240.0, "grad_norm": 1.9397400244845169, "language_loss": 0.71422184, "learning_rate": 2.0373884610107765e-06, "loss": 0.73588634, "num_input_tokens_seen": 91334595, "step": 4235, "time_per_iteration": 2.6421496868133545 }, { "auxiliary_loss_clip": 0.01133944, "auxiliary_loss_mlp": 0.01035438, "balance_loss_clip": 1.04378533, "balance_loss_mlp": 1.01859331, "epoch": 0.5093488847471893, "flos": 18621298972800.0, "grad_norm": 3.0287678531992777, "language_loss": 0.69609708, "learning_rate": 2.0366096226744225e-06, "loss": 0.71779084, "num_input_tokens_seen": 91349790, "step": 4236, "time_per_iteration": 2.772123098373413 }, { "auxiliary_loss_clip": 0.01125769, "auxiliary_loss_mlp": 0.01047195, "balance_loss_clip": 1.04402351, "balance_loss_mlp": 1.02970743, "epoch": 0.5094691276378284, "flos": 23803783205760.0, "grad_norm": 2.126454086312618, "language_loss": 0.76581514, "learning_rate": 2.035830778784418e-06, "loss": 0.78754479, "num_input_tokens_seen": 91370465, "step": 4237, "time_per_iteration": 2.6462225914001465 }, { "auxiliary_loss_clip": 0.01124, "auxiliary_loss_mlp": 0.01038459, "balance_loss_clip": 1.04714537, "balance_loss_mlp": 1.02110183, "epoch": 0.5095893705284675, "flos": 17420410546560.0, "grad_norm": 2.1913499519127413, "language_loss": 0.80255777, "learning_rate": 2.0350519294589134e-06, "loss": 0.82418239, "num_input_tokens_seen": 91388505, "step": 4238, "time_per_iteration": 2.5793051719665527 }, { "auxiliary_loss_clip": 0.01090081, "auxiliary_loss_mlp": 0.01046533, "balance_loss_clip": 1.03790736, "balance_loss_mlp": 1.02843678, "epoch": 0.5097096134191066, "flos": 25849362839040.0, "grad_norm": 1.8040972117663439, "language_loss": 0.82523799, "learning_rate": 2.0342730748160588e-06, "loss": 0.84660417, "num_input_tokens_seen": 91408970, "step": 4239, "time_per_iteration": 2.742931842803955 }, { "auxiliary_loss_clip": 0.01120275, "auxiliary_loss_mlp": 0.0104452, "balance_loss_clip": 1.0427835, "balance_loss_mlp": 1.02600694, "epoch": 0.5098298563097456, "flos": 27745122844800.0, "grad_norm": 2.405633721278703, "language_loss": 0.70785934, "learning_rate": 2.033494214974006e-06, "loss": 0.72950733, "num_input_tokens_seen": 91430115, "step": 4240, "time_per_iteration": 2.6484763622283936 }, { "auxiliary_loss_clip": 0.01107967, "auxiliary_loss_mlp": 0.01047485, "balance_loss_clip": 1.04162621, "balance_loss_mlp": 1.03080773, "epoch": 0.5099500992003848, "flos": 21358913011200.0, "grad_norm": 1.895992717553242, "language_loss": 0.8383413, "learning_rate": 2.0327153500509067e-06, "loss": 0.85989577, "num_input_tokens_seen": 91449140, "step": 4241, "time_per_iteration": 2.6517651081085205 }, { "auxiliary_loss_clip": 0.01124599, "auxiliary_loss_mlp": 0.0104633, "balance_loss_clip": 1.04505467, "balance_loss_mlp": 1.02914023, "epoch": 0.5100703420910239, "flos": 19865999013120.0, "grad_norm": 2.9049569958467325, "language_loss": 0.85046172, "learning_rate": 2.031936480164916e-06, "loss": 0.87217093, "num_input_tokens_seen": 91466880, "step": 4242, "time_per_iteration": 2.618293046951294 }, { "auxiliary_loss_clip": 0.01118625, "auxiliary_loss_mlp": 0.01042165, "balance_loss_clip": 1.04640126, "balance_loss_mlp": 1.02656043, "epoch": 0.5101905849816629, "flos": 24648797635200.0, "grad_norm": 2.054998853284297, "language_loss": 0.80364585, "learning_rate": 2.0311576054341857e-06, "loss": 0.82525373, "num_input_tokens_seen": 91487495, "step": 4243, "time_per_iteration": 2.64929461479187 }, { "auxiliary_loss_clip": 0.01147873, "auxiliary_loss_mlp": 0.01042884, "balance_loss_clip": 1.04851568, "balance_loss_mlp": 1.02568245, "epoch": 0.5103108278723021, "flos": 22930076787840.0, "grad_norm": 1.6491312026088936, "language_loss": 0.62501132, "learning_rate": 2.0303787259768715e-06, "loss": 0.64691901, "num_input_tokens_seen": 91508395, "step": 4244, "time_per_iteration": 2.5909013748168945 }, { "auxiliary_loss_clip": 0.01123747, "auxiliary_loss_mlp": 0.01041702, "balance_loss_clip": 1.04598093, "balance_loss_mlp": 1.02326083, "epoch": 0.5104310707629411, "flos": 21506613736320.0, "grad_norm": 2.880147173123664, "language_loss": 0.69282901, "learning_rate": 2.0295998419111294e-06, "loss": 0.7144835, "num_input_tokens_seen": 91525685, "step": 4245, "time_per_iteration": 3.636037826538086 }, { "auxiliary_loss_clip": 0.01087947, "auxiliary_loss_mlp": 0.01043859, "balance_loss_clip": 1.04050577, "balance_loss_mlp": 1.02483308, "epoch": 0.5105513136535802, "flos": 14903180403840.0, "grad_norm": 2.66820439981939, "language_loss": 0.7404871, "learning_rate": 2.028820953355115e-06, "loss": 0.76180518, "num_input_tokens_seen": 91543785, "step": 4246, "time_per_iteration": 2.669297695159912 }, { "auxiliary_loss_clip": 0.01126219, "auxiliary_loss_mlp": 0.01044108, "balance_loss_clip": 1.0439986, "balance_loss_mlp": 1.02661991, "epoch": 0.5106715565442194, "flos": 22602212421120.0, "grad_norm": 2.097969598302068, "language_loss": 0.78498507, "learning_rate": 2.0280420604269834e-06, "loss": 0.80668831, "num_input_tokens_seen": 91563325, "step": 4247, "time_per_iteration": 3.5472586154937744 }, { "auxiliary_loss_clip": 0.01029158, "auxiliary_loss_mlp": 0.01005925, "balance_loss_clip": 1.00745773, "balance_loss_mlp": 1.00407743, "epoch": 0.5107917994348584, "flos": 71027645558400.0, "grad_norm": 0.7047170434241795, "language_loss": 0.58894145, "learning_rate": 2.027263163244895e-06, "loss": 0.60929227, "num_input_tokens_seen": 91632450, "step": 4248, "time_per_iteration": 3.2996349334716797 }, { "auxiliary_loss_clip": 0.01129816, "auxiliary_loss_mlp": 0.01039587, "balance_loss_clip": 1.04446125, "balance_loss_mlp": 1.02155066, "epoch": 0.5109120423254975, "flos": 24827416992000.0, "grad_norm": 1.6582792324553755, "language_loss": 0.74624676, "learning_rate": 2.026484261927005e-06, "loss": 0.76794076, "num_input_tokens_seen": 91651945, "step": 4249, "time_per_iteration": 2.628527879714966 }, { "auxiliary_loss_clip": 0.01138115, "auxiliary_loss_mlp": 0.0104764, "balance_loss_clip": 1.04701853, "balance_loss_mlp": 1.0293411, "epoch": 0.5110322852161366, "flos": 21247661612160.0, "grad_norm": 2.163948541882336, "language_loss": 0.74087512, "learning_rate": 2.025705356591475e-06, "loss": 0.76273268, "num_input_tokens_seen": 91669635, "step": 4250, "time_per_iteration": 3.505896806716919 }, { "auxiliary_loss_clip": 0.01011201, "auxiliary_loss_mlp": 0.00756178, "balance_loss_clip": 1.00868416, "balance_loss_mlp": 1.00085485, "epoch": 0.5111525281067757, "flos": 66457114358400.0, "grad_norm": 0.7564756714029384, "language_loss": 0.57958758, "learning_rate": 2.024926447356462e-06, "loss": 0.59726131, "num_input_tokens_seen": 91731920, "step": 4251, "time_per_iteration": 3.129274845123291 }, { "auxiliary_loss_clip": 0.01131979, "auxiliary_loss_mlp": 0.01041446, "balance_loss_clip": 1.04478931, "balance_loss_mlp": 1.02275372, "epoch": 0.5112727709974147, "flos": 14866731077760.0, "grad_norm": 1.9286027223490494, "language_loss": 0.78792423, "learning_rate": 2.024147534340127e-06, "loss": 0.80965853, "num_input_tokens_seen": 91749780, "step": 4252, "time_per_iteration": 2.5423531532287598 }, { "auxiliary_loss_clip": 0.01113048, "auxiliary_loss_mlp": 0.01046006, "balance_loss_clip": 1.03882074, "balance_loss_mlp": 1.0278151, "epoch": 0.5113930138880539, "flos": 21177600134400.0, "grad_norm": 1.8136930475695163, "language_loss": 0.79631382, "learning_rate": 2.02336861766063e-06, "loss": 0.81790435, "num_input_tokens_seen": 91768840, "step": 4253, "time_per_iteration": 2.635108709335327 }, { "auxiliary_loss_clip": 0.01138593, "auxiliary_loss_mlp": 0.01047431, "balance_loss_clip": 1.04597068, "balance_loss_mlp": 1.02927518, "epoch": 0.511513256778693, "flos": 20409111630720.0, "grad_norm": 2.3294531872518993, "language_loss": 0.78685248, "learning_rate": 2.0225896974361327e-06, "loss": 0.80871278, "num_input_tokens_seen": 91788945, "step": 4254, "time_per_iteration": 2.581908702850342 }, { "auxiliary_loss_clip": 0.0101086, "auxiliary_loss_mlp": 0.01005248, "balance_loss_clip": 1.00829458, "balance_loss_mlp": 1.00325763, "epoch": 0.511633499669332, "flos": 69879975131520.0, "grad_norm": 0.8893311603714675, "language_loss": 0.59919316, "learning_rate": 2.0218107737847962e-06, "loss": 0.61935425, "num_input_tokens_seen": 91850990, "step": 4255, "time_per_iteration": 3.2351865768432617 }, { "auxiliary_loss_clip": 0.01143652, "auxiliary_loss_mlp": 0.01048374, "balance_loss_clip": 1.04505444, "balance_loss_mlp": 1.03028941, "epoch": 0.5117537425599712, "flos": 24097855852800.0, "grad_norm": 1.773146101591085, "language_loss": 0.74951661, "learning_rate": 2.0210318468247826e-06, "loss": 0.77143693, "num_input_tokens_seen": 91869960, "step": 4256, "time_per_iteration": 3.4808120727539062 }, { "auxiliary_loss_clip": 0.01119382, "auxiliary_loss_mlp": 0.01041448, "balance_loss_clip": 1.04276824, "balance_loss_mlp": 1.02437782, "epoch": 0.5118739854506102, "flos": 20959550622720.0, "grad_norm": 1.8108781933548825, "language_loss": 0.81888986, "learning_rate": 2.020252916674255e-06, "loss": 0.84049809, "num_input_tokens_seen": 91889075, "step": 4257, "time_per_iteration": 2.622850179672241 }, { "auxiliary_loss_clip": 0.01131682, "auxiliary_loss_mlp": 0.01044975, "balance_loss_clip": 1.04319108, "balance_loss_mlp": 1.02844024, "epoch": 0.5119942283412493, "flos": 17457326749440.0, "grad_norm": 1.7790842708451655, "language_loss": 0.81316024, "learning_rate": 2.019473983451375e-06, "loss": 0.83492684, "num_input_tokens_seen": 91907495, "step": 4258, "time_per_iteration": 2.5659327507019043 }, { "auxiliary_loss_clip": 0.01113144, "auxiliary_loss_mlp": 0.01049071, "balance_loss_clip": 1.0411706, "balance_loss_mlp": 1.03160667, "epoch": 0.5121144712318885, "flos": 21066743784960.0, "grad_norm": 1.8913775434749278, "language_loss": 0.71659046, "learning_rate": 2.0186950472743076e-06, "loss": 0.73821259, "num_input_tokens_seen": 91927400, "step": 4259, "time_per_iteration": 2.653977870941162 }, { "auxiliary_loss_clip": 0.01142745, "auxiliary_loss_mlp": 0.01047128, "balance_loss_clip": 1.04427588, "balance_loss_mlp": 1.02979529, "epoch": 0.5122347141225275, "flos": 19860791541120.0, "grad_norm": 1.6352957057401254, "language_loss": 0.73916531, "learning_rate": 2.0179161082612162e-06, "loss": 0.76106405, "num_input_tokens_seen": 91946790, "step": 4260, "time_per_iteration": 2.55370831489563 }, { "auxiliary_loss_clip": 0.01115511, "auxiliary_loss_mlp": 0.01035674, "balance_loss_clip": 1.03973639, "balance_loss_mlp": 1.01719701, "epoch": 0.5123549570131666, "flos": 22528487756160.0, "grad_norm": 2.15250041632961, "language_loss": 0.72372603, "learning_rate": 2.017137166530266e-06, "loss": 0.74523795, "num_input_tokens_seen": 91966325, "step": 4261, "time_per_iteration": 2.6136181354522705 }, { "auxiliary_loss_clip": 0.01123171, "auxiliary_loss_mlp": 0.01044029, "balance_loss_clip": 1.04329777, "balance_loss_mlp": 1.02704215, "epoch": 0.5124751999038056, "flos": 20333375804160.0, "grad_norm": 2.4198976362292495, "language_loss": 0.80310833, "learning_rate": 2.0163582221996213e-06, "loss": 0.82478034, "num_input_tokens_seen": 91984700, "step": 4262, "time_per_iteration": 2.602747917175293 }, { "auxiliary_loss_clip": 0.01121715, "auxiliary_loss_mlp": 0.01038212, "balance_loss_clip": 1.04444134, "balance_loss_mlp": 1.02129602, "epoch": 0.5125954427944448, "flos": 39785970211200.0, "grad_norm": 1.9110694897154106, "language_loss": 0.68141401, "learning_rate": 2.015579275387446e-06, "loss": 0.70301318, "num_input_tokens_seen": 92010020, "step": 4263, "time_per_iteration": 2.7850027084350586 }, { "auxiliary_loss_clip": 0.01111008, "auxiliary_loss_mlp": 0.01041063, "balance_loss_clip": 1.04166663, "balance_loss_mlp": 1.02308643, "epoch": 0.5127156856850839, "flos": 29205394358400.0, "grad_norm": 2.179792639197155, "language_loss": 0.68702805, "learning_rate": 2.0148003262119085e-06, "loss": 0.70854878, "num_input_tokens_seen": 92030990, "step": 4264, "time_per_iteration": 2.7551443576812744 }, { "auxiliary_loss_clip": 0.01106405, "auxiliary_loss_mlp": 0.01044726, "balance_loss_clip": 1.04150379, "balance_loss_mlp": 1.02584314, "epoch": 0.5128359285757229, "flos": 13553693412480.0, "grad_norm": 1.856125402721766, "language_loss": 0.76429093, "learning_rate": 2.0140213747911728e-06, "loss": 0.78580213, "num_input_tokens_seen": 92049525, "step": 4265, "time_per_iteration": 2.627523899078369 }, { "auxiliary_loss_clip": 0.01109661, "auxiliary_loss_mlp": 0.01044745, "balance_loss_clip": 1.04376161, "balance_loss_mlp": 1.02637458, "epoch": 0.5129561714663621, "flos": 25192089820800.0, "grad_norm": 2.0355914197208413, "language_loss": 0.80467832, "learning_rate": 2.013242421243406e-06, "loss": 0.82622242, "num_input_tokens_seen": 92068430, "step": 4266, "time_per_iteration": 2.7056725025177 }, { "auxiliary_loss_clip": 0.01095015, "auxiliary_loss_mlp": 0.01042542, "balance_loss_clip": 1.0411942, "balance_loss_mlp": 1.02640092, "epoch": 0.5130764143570011, "flos": 18150223080960.0, "grad_norm": 2.2478460338232247, "language_loss": 0.79032516, "learning_rate": 2.012463465686774e-06, "loss": 0.8117007, "num_input_tokens_seen": 92088180, "step": 4267, "time_per_iteration": 2.7312369346618652 }, { "auxiliary_loss_clip": 0.01004067, "auxiliary_loss_mlp": 0.01003995, "balance_loss_clip": 1.01516485, "balance_loss_mlp": 1.0019803, "epoch": 0.5131966572476402, "flos": 59794896418560.0, "grad_norm": 0.7802536892995398, "language_loss": 0.54719329, "learning_rate": 2.0116845082394446e-06, "loss": 0.56727386, "num_input_tokens_seen": 92153015, "step": 4268, "time_per_iteration": 3.2808260917663574 }, { "auxiliary_loss_clip": 0.01134591, "auxiliary_loss_mlp": 0.01047537, "balance_loss_clip": 1.04338419, "balance_loss_mlp": 1.0300014, "epoch": 0.5133169001382794, "flos": 18515219132160.0, "grad_norm": 1.8520659339982408, "language_loss": 0.78646225, "learning_rate": 2.0109055490195836e-06, "loss": 0.80828357, "num_input_tokens_seen": 92171470, "step": 4269, "time_per_iteration": 2.5383126735687256 }, { "auxiliary_loss_clip": 0.0108498, "auxiliary_loss_mlp": 0.01041948, "balance_loss_clip": 1.03506243, "balance_loss_mlp": 1.02313685, "epoch": 0.5134371430289184, "flos": 15523537219200.0, "grad_norm": 1.9791202933039909, "language_loss": 0.64483893, "learning_rate": 2.0101265881453605e-06, "loss": 0.66610819, "num_input_tokens_seen": 92189945, "step": 4270, "time_per_iteration": 2.699164628982544 }, { "auxiliary_loss_clip": 0.01114711, "auxiliary_loss_mlp": 0.01046511, "balance_loss_clip": 1.04506302, "balance_loss_mlp": 1.03076363, "epoch": 0.5135573859195575, "flos": 21433786911360.0, "grad_norm": 3.159782364888406, "language_loss": 0.78186548, "learning_rate": 2.009347625734941e-06, "loss": 0.80347764, "num_input_tokens_seen": 92209855, "step": 4271, "time_per_iteration": 3.622054100036621 }, { "auxiliary_loss_clip": 0.01147524, "auxiliary_loss_mlp": 0.01048568, "balance_loss_clip": 1.04781663, "balance_loss_mlp": 1.03007865, "epoch": 0.5136776288101966, "flos": 17712651600000.0, "grad_norm": 2.4597714405750755, "language_loss": 0.74819303, "learning_rate": 2.0085686619064954e-06, "loss": 0.77015394, "num_input_tokens_seen": 92226295, "step": 4272, "time_per_iteration": 2.511172294616699 }, { "auxiliary_loss_clip": 0.011372, "auxiliary_loss_mlp": 0.01041058, "balance_loss_clip": 1.04708147, "balance_loss_mlp": 1.02451134, "epoch": 0.5137978717008357, "flos": 16581680997120.0, "grad_norm": 2.1786607334386447, "language_loss": 0.83179873, "learning_rate": 2.00778969677819e-06, "loss": 0.85358131, "num_input_tokens_seen": 92243330, "step": 4273, "time_per_iteration": 3.517423629760742 }, { "auxiliary_loss_clip": 0.01117457, "auxiliary_loss_mlp": 0.01044208, "balance_loss_clip": 1.04247069, "balance_loss_mlp": 1.0254209, "epoch": 0.5139181145914747, "flos": 20668243322880.0, "grad_norm": 2.0001615982616316, "language_loss": 0.6438607, "learning_rate": 2.0070107304681934e-06, "loss": 0.6654774, "num_input_tokens_seen": 92262285, "step": 4274, "time_per_iteration": 2.6100122928619385 }, { "auxiliary_loss_clip": 0.01108081, "auxiliary_loss_mlp": 0.01037349, "balance_loss_clip": 1.04426408, "balance_loss_mlp": 1.02062404, "epoch": 0.5140383574821139, "flos": 32926996546560.0, "grad_norm": 1.6997410355616906, "language_loss": 0.7842201, "learning_rate": 2.006231763094675e-06, "loss": 0.80567437, "num_input_tokens_seen": 92283305, "step": 4275, "time_per_iteration": 2.754709005355835 }, { "auxiliary_loss_clip": 0.01115398, "auxiliary_loss_mlp": 0.01043939, "balance_loss_clip": 1.04401028, "balance_loss_mlp": 1.02667761, "epoch": 0.514158600372753, "flos": 19537093152000.0, "grad_norm": 1.9440374907946427, "language_loss": 0.87405914, "learning_rate": 2.0054527947758027e-06, "loss": 0.89565253, "num_input_tokens_seen": 92302105, "step": 4276, "time_per_iteration": 3.645841360092163 }, { "auxiliary_loss_clip": 0.01027334, "auxiliary_loss_mlp": 0.01001532, "balance_loss_clip": 1.00523615, "balance_loss_mlp": 0.99961233, "epoch": 0.514278843263392, "flos": 62523855279360.0, "grad_norm": 0.7193234371912425, "language_loss": 0.55930877, "learning_rate": 2.004673825629746e-06, "loss": 0.57959747, "num_input_tokens_seen": 92362885, "step": 4277, "time_per_iteration": 3.1488847732543945 }, { "auxiliary_loss_clip": 0.0111113, "auxiliary_loss_mlp": 0.01043104, "balance_loss_clip": 1.03987098, "balance_loss_mlp": 1.0246985, "epoch": 0.5143990861540312, "flos": 25882328545920.0, "grad_norm": 1.5390919766827857, "language_loss": 0.72508895, "learning_rate": 2.0038948557746744e-06, "loss": 0.74663126, "num_input_tokens_seen": 92384740, "step": 4278, "time_per_iteration": 2.6786997318267822 }, { "auxiliary_loss_clip": 0.01130501, "auxiliary_loss_mlp": 0.01044346, "balance_loss_clip": 1.04446518, "balance_loss_mlp": 1.02721596, "epoch": 0.5145193290446702, "flos": 23330660238720.0, "grad_norm": 1.610781097046771, "language_loss": 0.74966884, "learning_rate": 2.0031158853287558e-06, "loss": 0.77141738, "num_input_tokens_seen": 92405175, "step": 4279, "time_per_iteration": 2.6159534454345703 }, { "auxiliary_loss_clip": 0.01119836, "auxiliary_loss_mlp": 0.01036494, "balance_loss_clip": 1.04360676, "balance_loss_mlp": 1.02007878, "epoch": 0.5146395719353093, "flos": 22856603518080.0, "grad_norm": 2.5208558416749884, "language_loss": 0.70335531, "learning_rate": 2.0023369144101593e-06, "loss": 0.7249186, "num_input_tokens_seen": 92423345, "step": 4280, "time_per_iteration": 2.6558196544647217 }, { "auxiliary_loss_clip": 0.01109228, "auxiliary_loss_mlp": 0.01041551, "balance_loss_clip": 1.0397768, "balance_loss_mlp": 1.02396822, "epoch": 0.5147598148259485, "flos": 26391577616640.0, "grad_norm": 2.2597433776565423, "language_loss": 0.7678932, "learning_rate": 2.0015579431370555e-06, "loss": 0.78940094, "num_input_tokens_seen": 92445025, "step": 4281, "time_per_iteration": 2.68634295463562 }, { "auxiliary_loss_clip": 0.01128488, "auxiliary_loss_mlp": 0.0104914, "balance_loss_clip": 1.04423428, "balance_loss_mlp": 1.03173518, "epoch": 0.5148800577165875, "flos": 29965694561280.0, "grad_norm": 3.0813652549314803, "language_loss": 0.69664061, "learning_rate": 2.000778971627612e-06, "loss": 0.71841687, "num_input_tokens_seen": 92464490, "step": 4282, "time_per_iteration": 3.4484546184539795 }, { "auxiliary_loss_clip": 0.01114181, "auxiliary_loss_mlp": 0.01040914, "balance_loss_clip": 1.04286289, "balance_loss_mlp": 1.02504706, "epoch": 0.5150003006072266, "flos": 17931383470080.0, "grad_norm": 1.8242824283335464, "language_loss": 0.90137136, "learning_rate": 2e-06, "loss": 0.92292231, "num_input_tokens_seen": 92482085, "step": 4283, "time_per_iteration": 2.5632224082946777 }, { "auxiliary_loss_clip": 0.01143094, "auxiliary_loss_mlp": 0.01041161, "balance_loss_clip": 1.04483461, "balance_loss_mlp": 1.02523482, "epoch": 0.5151205434978657, "flos": 18478733892480.0, "grad_norm": 1.7955485168367666, "language_loss": 0.85870868, "learning_rate": 1.9992210283723878e-06, "loss": 0.88055128, "num_input_tokens_seen": 92499325, "step": 4284, "time_per_iteration": 2.5363404750823975 }, { "auxiliary_loss_clip": 0.01141281, "auxiliary_loss_mlp": 0.01035355, "balance_loss_clip": 1.04534042, "balance_loss_mlp": 1.0197742, "epoch": 0.5152407863885048, "flos": 25341263003520.0, "grad_norm": 1.6187916516509673, "language_loss": 0.79530811, "learning_rate": 1.9984420568629448e-06, "loss": 0.81707454, "num_input_tokens_seen": 92522090, "step": 4285, "time_per_iteration": 2.6009936332702637 }, { "auxiliary_loss_clip": 0.0113221, "auxiliary_loss_mlp": 0.01036811, "balance_loss_clip": 1.04698777, "balance_loss_mlp": 1.02147484, "epoch": 0.5153610292791438, "flos": 18329740277760.0, "grad_norm": 5.333397066412181, "language_loss": 0.7843442, "learning_rate": 1.9976630855898405e-06, "loss": 0.80603445, "num_input_tokens_seen": 92539845, "step": 4286, "time_per_iteration": 2.5474390983581543 }, { "auxiliary_loss_clip": 0.01114921, "auxiliary_loss_mlp": 0.01043996, "balance_loss_clip": 1.0395987, "balance_loss_mlp": 1.02742553, "epoch": 0.515481272169783, "flos": 30409945971840.0, "grad_norm": 2.3007643009317578, "language_loss": 0.74734092, "learning_rate": 1.9968841146712445e-06, "loss": 0.76893008, "num_input_tokens_seen": 92559460, "step": 4287, "time_per_iteration": 2.7082109451293945 }, { "auxiliary_loss_clip": 0.01082544, "auxiliary_loss_mlp": 0.00773418, "balance_loss_clip": 1.03994501, "balance_loss_mlp": 1.00048733, "epoch": 0.5156015150604221, "flos": 23037305863680.0, "grad_norm": 4.068608428881636, "language_loss": 0.71688652, "learning_rate": 1.996105144225326e-06, "loss": 0.73544616, "num_input_tokens_seen": 92579695, "step": 4288, "time_per_iteration": 2.7812182903289795 }, { "auxiliary_loss_clip": 0.0112986, "auxiliary_loss_mlp": 0.0103983, "balance_loss_clip": 1.04444456, "balance_loss_mlp": 1.02380824, "epoch": 0.5157217579510611, "flos": 17858556645120.0, "grad_norm": 2.70348542512634, "language_loss": 0.7871424, "learning_rate": 1.995326174370254e-06, "loss": 0.80883926, "num_input_tokens_seen": 92598795, "step": 4289, "time_per_iteration": 2.7291557788848877 }, { "auxiliary_loss_clip": 0.01128421, "auxiliary_loss_mlp": 0.00772268, "balance_loss_clip": 1.04435742, "balance_loss_mlp": 1.00039113, "epoch": 0.5158420008417003, "flos": 19171486569600.0, "grad_norm": 1.6025562081000215, "language_loss": 0.72991645, "learning_rate": 1.994547205224197e-06, "loss": 0.74892336, "num_input_tokens_seen": 92617700, "step": 4290, "time_per_iteration": 2.7352941036224365 }, { "auxiliary_loss_clip": 0.01115597, "auxiliary_loss_mlp": 0.01042026, "balance_loss_clip": 1.04335833, "balance_loss_mlp": 1.02449, "epoch": 0.5159622437323393, "flos": 22419534827520.0, "grad_norm": 2.2810307676429518, "language_loss": 0.6732685, "learning_rate": 1.993768236905325e-06, "loss": 0.69484472, "num_input_tokens_seen": 92638370, "step": 4291, "time_per_iteration": 2.6922731399536133 }, { "auxiliary_loss_clip": 0.01116497, "auxiliary_loss_mlp": 0.01042175, "balance_loss_clip": 1.03971958, "balance_loss_mlp": 1.02413869, "epoch": 0.5160824866229784, "flos": 24603010773120.0, "grad_norm": 2.56362778551315, "language_loss": 0.65868497, "learning_rate": 1.992989269531807e-06, "loss": 0.68027169, "num_input_tokens_seen": 92657180, "step": 4292, "time_per_iteration": 2.721982479095459 }, { "auxiliary_loss_clip": 0.01123369, "auxiliary_loss_mlp": 0.01043784, "balance_loss_clip": 1.04472852, "balance_loss_mlp": 1.02535462, "epoch": 0.5162027295136175, "flos": 18002737837440.0, "grad_norm": 2.4669728526884716, "language_loss": 0.67833096, "learning_rate": 1.99221030322181e-06, "loss": 0.70000249, "num_input_tokens_seen": 92673985, "step": 4293, "time_per_iteration": 2.5972740650177 }, { "auxiliary_loss_clip": 0.01120436, "auxiliary_loss_mlp": 0.01037744, "balance_loss_clip": 1.04329598, "balance_loss_mlp": 1.02123368, "epoch": 0.5163229724042566, "flos": 27344611221120.0, "grad_norm": 1.5240191959829212, "language_loss": 0.80663294, "learning_rate": 1.991431338093505e-06, "loss": 0.82821476, "num_input_tokens_seen": 92696340, "step": 4294, "time_per_iteration": 2.6633410453796387 }, { "auxiliary_loss_clip": 0.01119352, "auxiliary_loss_mlp": 0.0104111, "balance_loss_clip": 1.04566574, "balance_loss_mlp": 1.02552962, "epoch": 0.5164432152948957, "flos": 21762764599680.0, "grad_norm": 1.8971417761515297, "language_loss": 0.79585826, "learning_rate": 1.9906523742650587e-06, "loss": 0.8174628, "num_input_tokens_seen": 92715200, "step": 4295, "time_per_iteration": 2.647545337677002 }, { "auxiliary_loss_clip": 0.01146542, "auxiliary_loss_mlp": 0.01044354, "balance_loss_clip": 1.04581404, "balance_loss_mlp": 1.02679467, "epoch": 0.5165634581855347, "flos": 25550334115200.0, "grad_norm": 2.8948134967789265, "language_loss": 0.77242243, "learning_rate": 1.9898734118546397e-06, "loss": 0.79433143, "num_input_tokens_seen": 92735150, "step": 4296, "time_per_iteration": 2.61019229888916 }, { "auxiliary_loss_clip": 0.0107551, "auxiliary_loss_mlp": 0.01040185, "balance_loss_clip": 1.03987074, "balance_loss_mlp": 1.02317345, "epoch": 0.5166837010761739, "flos": 19901191363200.0, "grad_norm": 2.104419612809599, "language_loss": 0.80440897, "learning_rate": 1.989094450980416e-06, "loss": 0.82556593, "num_input_tokens_seen": 92755250, "step": 4297, "time_per_iteration": 3.707280158996582 }, { "auxiliary_loss_clip": 0.01129391, "auxiliary_loss_mlp": 0.01039561, "balance_loss_clip": 1.04305875, "balance_loss_mlp": 1.023718, "epoch": 0.516803943966813, "flos": 26646076454400.0, "grad_norm": 35.619907526051364, "language_loss": 0.77085799, "learning_rate": 1.9883154917605556e-06, "loss": 0.79254746, "num_input_tokens_seen": 92774460, "step": 4298, "time_per_iteration": 2.5984439849853516 }, { "auxiliary_loss_clip": 0.0114203, "auxiliary_loss_mlp": 0.01037845, "balance_loss_clip": 1.04383516, "balance_loss_mlp": 1.02088118, "epoch": 0.516924186857452, "flos": 19682854542720.0, "grad_norm": 1.6699222914503669, "language_loss": 0.83457637, "learning_rate": 1.9875365343132262e-06, "loss": 0.8563751, "num_input_tokens_seen": 92791580, "step": 4299, "time_per_iteration": 3.4567713737487793 }, { "auxiliary_loss_clip": 0.01130841, "auxiliary_loss_mlp": 0.0077296, "balance_loss_clip": 1.04482174, "balance_loss_mlp": 1.00051975, "epoch": 0.5170444297480912, "flos": 15956583586560.0, "grad_norm": 2.029423707174594, "language_loss": 0.84564453, "learning_rate": 1.9867575787565946e-06, "loss": 0.86468256, "num_input_tokens_seen": 92806240, "step": 4300, "time_per_iteration": 2.5550918579101562 }, { "auxiliary_loss_clip": 0.01133167, "auxiliary_loss_mlp": 0.0104496, "balance_loss_clip": 1.045223, "balance_loss_mlp": 1.02680457, "epoch": 0.5171646726387302, "flos": 14174157968640.0, "grad_norm": 1.9808252930508812, "language_loss": 0.86215168, "learning_rate": 1.9859786252088275e-06, "loss": 0.88393295, "num_input_tokens_seen": 92823420, "step": 4301, "time_per_iteration": 2.5558459758758545 }, { "auxiliary_loss_clip": 0.01110584, "auxiliary_loss_mlp": 0.01039345, "balance_loss_clip": 1.04209638, "balance_loss_mlp": 1.02220249, "epoch": 0.5172849155293693, "flos": 23578550974080.0, "grad_norm": 3.675561511163276, "language_loss": 0.66666245, "learning_rate": 1.9851996737880914e-06, "loss": 0.68816179, "num_input_tokens_seen": 92838605, "step": 4302, "time_per_iteration": 2.6300368309020996 }, { "auxiliary_loss_clip": 0.01136896, "auxiliary_loss_mlp": 0.01051906, "balance_loss_clip": 1.04706287, "balance_loss_mlp": 1.03378582, "epoch": 0.5174051584200084, "flos": 14283541860480.0, "grad_norm": 2.2076908267900333, "language_loss": 0.74421221, "learning_rate": 1.9844207246125537e-06, "loss": 0.76610023, "num_input_tokens_seen": 92855185, "step": 4303, "time_per_iteration": 3.506354331970215 }, { "auxiliary_loss_clip": 0.01112643, "auxiliary_loss_mlp": 0.01047062, "balance_loss_clip": 1.04260254, "balance_loss_mlp": 1.03173721, "epoch": 0.5175254013106475, "flos": 37889384192640.0, "grad_norm": 2.1057130869390526, "language_loss": 0.68555015, "learning_rate": 1.983641777800379e-06, "loss": 0.70714724, "num_input_tokens_seen": 92877830, "step": 4304, "time_per_iteration": 2.688586473464966 }, { "auxiliary_loss_clip": 0.0101989, "auxiliary_loss_mlp": 0.01011254, "balance_loss_clip": 1.00643396, "balance_loss_mlp": 1.00929892, "epoch": 0.5176456442012866, "flos": 68549737829760.0, "grad_norm": 0.7506909851785184, "language_loss": 0.58758342, "learning_rate": 1.9828628334697343e-06, "loss": 0.60789478, "num_input_tokens_seen": 92945040, "step": 4305, "time_per_iteration": 3.2891523838043213 }, { "auxiliary_loss_clip": 0.01022901, "auxiliary_loss_mlp": 0.01009793, "balance_loss_clip": 1.00895143, "balance_loss_mlp": 1.00777864, "epoch": 0.5177658870919257, "flos": 64084137235200.0, "grad_norm": 0.7700377207354043, "language_loss": 0.54688108, "learning_rate": 1.982083891738784e-06, "loss": 0.56720799, "num_input_tokens_seen": 93005910, "step": 4306, "time_per_iteration": 3.152766704559326 }, { "auxiliary_loss_clip": 0.01112785, "auxiliary_loss_mlp": 0.01039995, "balance_loss_clip": 1.04518104, "balance_loss_mlp": 1.02493882, "epoch": 0.5178861299825648, "flos": 26651248012800.0, "grad_norm": 1.5792526807510396, "language_loss": 0.82848698, "learning_rate": 1.9813049527256923e-06, "loss": 0.85001481, "num_input_tokens_seen": 93026305, "step": 4307, "time_per_iteration": 2.648771286010742 }, { "auxiliary_loss_clip": 0.01104459, "auxiliary_loss_mlp": 0.01047478, "balance_loss_clip": 1.04106295, "balance_loss_mlp": 1.02969205, "epoch": 0.5180063728732038, "flos": 17931886260480.0, "grad_norm": 3.151823121972761, "language_loss": 0.82292974, "learning_rate": 1.9805260165486252e-06, "loss": 0.8444491, "num_input_tokens_seen": 93045675, "step": 4308, "time_per_iteration": 3.521190881729126 }, { "auxiliary_loss_clip": 0.01130199, "auxiliary_loss_mlp": 0.01038726, "balance_loss_clip": 1.04464078, "balance_loss_mlp": 1.02182174, "epoch": 0.518126615763843, "flos": 19500895221120.0, "grad_norm": 2.0269752586904444, "language_loss": 0.86583567, "learning_rate": 1.9797470833257457e-06, "loss": 0.8875249, "num_input_tokens_seen": 93065375, "step": 4309, "time_per_iteration": 2.672312021255493 }, { "auxiliary_loss_clip": 0.01136014, "auxiliary_loss_mlp": 0.01048768, "balance_loss_clip": 1.04874015, "balance_loss_mlp": 1.03166199, "epoch": 0.5182468586544821, "flos": 20704082117760.0, "grad_norm": 2.1123952213570387, "language_loss": 0.77203041, "learning_rate": 1.9789681531752177e-06, "loss": 0.7938782, "num_input_tokens_seen": 93085595, "step": 4310, "time_per_iteration": 2.6259491443634033 }, { "auxiliary_loss_clip": 0.01093293, "auxiliary_loss_mlp": 0.01039087, "balance_loss_clip": 1.04064405, "balance_loss_mlp": 1.02357781, "epoch": 0.5183671015451211, "flos": 23112107936640.0, "grad_norm": 3.856308311668397, "language_loss": 0.72670114, "learning_rate": 1.978189226215204e-06, "loss": 0.74802494, "num_input_tokens_seen": 93106140, "step": 4311, "time_per_iteration": 2.732538938522339 }, { "auxiliary_loss_clip": 0.01143627, "auxiliary_loss_mlp": 0.01044329, "balance_loss_clip": 1.04601622, "balance_loss_mlp": 1.02700245, "epoch": 0.5184873444357603, "flos": 17597090568960.0, "grad_norm": 2.0474289245120874, "language_loss": 0.77024853, "learning_rate": 1.9774103025638675e-06, "loss": 0.79212809, "num_input_tokens_seen": 93124265, "step": 4312, "time_per_iteration": 2.5307400226593018 }, { "auxiliary_loss_clip": 0.01097657, "auxiliary_loss_mlp": 0.01047497, "balance_loss_clip": 1.04568851, "balance_loss_mlp": 1.02974701, "epoch": 0.5186075873263993, "flos": 24936800883840.0, "grad_norm": 1.607365254283523, "language_loss": 0.76563132, "learning_rate": 1.9766313823393696e-06, "loss": 0.78708285, "num_input_tokens_seen": 93145130, "step": 4313, "time_per_iteration": 2.7991929054260254 }, { "auxiliary_loss_clip": 0.01087604, "auxiliary_loss_mlp": 0.01045034, "balance_loss_clip": 1.03520072, "balance_loss_mlp": 1.02666342, "epoch": 0.5187278302170384, "flos": 15190106244480.0, "grad_norm": 2.336777169377147, "language_loss": 0.69143832, "learning_rate": 1.975852465659873e-06, "loss": 0.71276468, "num_input_tokens_seen": 93161110, "step": 4314, "time_per_iteration": 2.6718311309814453 }, { "auxiliary_loss_clip": 0.01136801, "auxiliary_loss_mlp": 0.01044288, "balance_loss_clip": 1.04822135, "balance_loss_mlp": 1.02707434, "epoch": 0.5188480731076776, "flos": 25009412227200.0, "grad_norm": 2.3805362259659075, "language_loss": 0.70347703, "learning_rate": 1.9750735526435377e-06, "loss": 0.72528785, "num_input_tokens_seen": 93178055, "step": 4315, "time_per_iteration": 2.617175579071045 }, { "auxiliary_loss_clip": 0.01114071, "auxiliary_loss_mlp": 0.01045347, "balance_loss_clip": 1.04234505, "balance_loss_mlp": 1.02803802, "epoch": 0.5189683159983166, "flos": 24790141653120.0, "grad_norm": 2.793505514160254, "language_loss": 0.79377979, "learning_rate": 1.974294643408525e-06, "loss": 0.8153739, "num_input_tokens_seen": 93195850, "step": 4316, "time_per_iteration": 2.67836856842041 }, { "auxiliary_loss_clip": 0.01135362, "auxiliary_loss_mlp": 0.01045202, "balance_loss_clip": 1.04381084, "balance_loss_mlp": 1.02678418, "epoch": 0.5190885588889557, "flos": 24754266944640.0, "grad_norm": 2.0526932395601856, "language_loss": 0.67345673, "learning_rate": 1.9735157380729947e-06, "loss": 0.69526237, "num_input_tokens_seen": 93216260, "step": 4317, "time_per_iteration": 2.64056134223938 }, { "auxiliary_loss_clip": 0.01119447, "auxiliary_loss_mlp": 0.01039938, "balance_loss_clip": 1.0432744, "balance_loss_mlp": 1.02291512, "epoch": 0.5192088017795948, "flos": 24712646060160.0, "grad_norm": 1.6959689480496762, "language_loss": 0.84131855, "learning_rate": 1.9727368367551053e-06, "loss": 0.86291236, "num_input_tokens_seen": 93234810, "step": 4318, "time_per_iteration": 2.6529502868652344 }, { "auxiliary_loss_clip": 0.01107228, "auxiliary_loss_mlp": 0.01045063, "balance_loss_clip": 1.04077566, "balance_loss_mlp": 1.02777743, "epoch": 0.5193290446702339, "flos": 27229588894080.0, "grad_norm": 3.2317516190808, "language_loss": 0.68192112, "learning_rate": 1.9719579395730164e-06, "loss": 0.703444, "num_input_tokens_seen": 93254185, "step": 4319, "time_per_iteration": 2.705246925354004 }, { "auxiliary_loss_clip": 0.01147884, "auxiliary_loss_mlp": 0.01043818, "balance_loss_clip": 1.04964399, "balance_loss_mlp": 1.0264256, "epoch": 0.5194492875608729, "flos": 11473352392320.0, "grad_norm": 3.548654303097043, "language_loss": 0.93780291, "learning_rate": 1.9711790466448854e-06, "loss": 0.9597199, "num_input_tokens_seen": 93268205, "step": 4320, "time_per_iteration": 2.636927843093872 }, { "auxiliary_loss_clip": 0.01096741, "auxiliary_loss_mlp": 0.01045222, "balance_loss_clip": 1.04110432, "balance_loss_mlp": 1.02823496, "epoch": 0.5195695304515121, "flos": 20338906498560.0, "grad_norm": 2.7974375201913313, "language_loss": 0.71490139, "learning_rate": 1.9704001580888704e-06, "loss": 0.73632097, "num_input_tokens_seen": 93286945, "step": 4321, "time_per_iteration": 2.721904754638672 }, { "auxiliary_loss_clip": 0.01113115, "auxiliary_loss_mlp": 0.00772795, "balance_loss_clip": 1.04161561, "balance_loss_mlp": 1.00041556, "epoch": 0.5196897733421512, "flos": 20048317470720.0, "grad_norm": 1.8478762331851029, "language_loss": 0.87259328, "learning_rate": 1.9696212740231283e-06, "loss": 0.89145243, "num_input_tokens_seen": 93305595, "step": 4322, "time_per_iteration": 2.6512229442596436 }, { "auxiliary_loss_clip": 0.01140437, "auxiliary_loss_mlp": 0.01045337, "balance_loss_clip": 1.04654336, "balance_loss_mlp": 1.02533388, "epoch": 0.5198100162327902, "flos": 23805507058560.0, "grad_norm": 2.1342245968316793, "language_loss": 0.82337534, "learning_rate": 1.9688423945658146e-06, "loss": 0.84523302, "num_input_tokens_seen": 93326460, "step": 4323, "time_per_iteration": 3.539513111114502 }, { "auxiliary_loss_clip": 0.01083408, "auxiliary_loss_mlp": 0.01051625, "balance_loss_clip": 1.03544784, "balance_loss_mlp": 1.0322659, "epoch": 0.5199302591234293, "flos": 24023951619840.0, "grad_norm": 3.116125017769786, "language_loss": 0.7210356, "learning_rate": 1.9680635198350845e-06, "loss": 0.74238592, "num_input_tokens_seen": 93346170, "step": 4324, "time_per_iteration": 3.6981403827667236 }, { "auxiliary_loss_clip": 0.01135147, "auxiliary_loss_mlp": 0.01040659, "balance_loss_clip": 1.04597878, "balance_loss_mlp": 1.02331424, "epoch": 0.5200505020140684, "flos": 26359366095360.0, "grad_norm": 1.928436795585319, "language_loss": 0.72922373, "learning_rate": 1.967284649949093e-06, "loss": 0.75098181, "num_input_tokens_seen": 93365380, "step": 4325, "time_per_iteration": 2.617961883544922 }, { "auxiliary_loss_clip": 0.01099716, "auxiliary_loss_mlp": 0.01044251, "balance_loss_clip": 1.04037857, "balance_loss_mlp": 1.02812171, "epoch": 0.5201707449047075, "flos": 39604262284800.0, "grad_norm": 1.9973532580630282, "language_loss": 0.72564375, "learning_rate": 1.966505785025994e-06, "loss": 0.74708343, "num_input_tokens_seen": 93387285, "step": 4326, "time_per_iteration": 2.818466901779175 }, { "auxiliary_loss_clip": 0.01108135, "auxiliary_loss_mlp": 0.01041135, "balance_loss_clip": 1.04491448, "balance_loss_mlp": 1.0239327, "epoch": 0.5202909877953465, "flos": 53682788292480.0, "grad_norm": 1.884646612581379, "language_loss": 0.76317096, "learning_rate": 1.965726925183941e-06, "loss": 0.78466368, "num_input_tokens_seen": 93410390, "step": 4327, "time_per_iteration": 2.923367738723755 }, { "auxiliary_loss_clip": 0.01142416, "auxiliary_loss_mlp": 0.0104279, "balance_loss_clip": 1.04684615, "balance_loss_mlp": 1.02484906, "epoch": 0.5204112306859857, "flos": 19537021324800.0, "grad_norm": 1.9268798739392412, "language_loss": 0.84888136, "learning_rate": 1.964948070541087e-06, "loss": 0.87073338, "num_input_tokens_seen": 93429050, "step": 4328, "time_per_iteration": 3.917184352874756 }, { "auxiliary_loss_clip": 0.0112429, "auxiliary_loss_mlp": 0.01038656, "balance_loss_clip": 1.04412103, "balance_loss_mlp": 1.02146626, "epoch": 0.5205314735766248, "flos": 15304697608320.0, "grad_norm": 2.1538994385919192, "language_loss": 0.6931777, "learning_rate": 1.9641692212155816e-06, "loss": 0.71480715, "num_input_tokens_seen": 93446815, "step": 4329, "time_per_iteration": 2.570091485977173 }, { "auxiliary_loss_clip": 0.01093079, "auxiliary_loss_mlp": 0.01042883, "balance_loss_clip": 1.04464865, "balance_loss_mlp": 1.02622986, "epoch": 0.5206517164672638, "flos": 59263701160320.0, "grad_norm": 2.1789016704918946, "language_loss": 0.72738647, "learning_rate": 1.9633903773255777e-06, "loss": 0.7487461, "num_input_tokens_seen": 93469130, "step": 4330, "time_per_iteration": 3.084620237350464 }, { "auxiliary_loss_clip": 0.01144566, "auxiliary_loss_mlp": 0.01038244, "balance_loss_clip": 1.04650283, "balance_loss_mlp": 1.02198339, "epoch": 0.520771959357903, "flos": 26871129118080.0, "grad_norm": 1.629766989582584, "language_loss": 0.74745548, "learning_rate": 1.9626115389892237e-06, "loss": 0.76928359, "num_input_tokens_seen": 93489920, "step": 4331, "time_per_iteration": 2.6202874183654785 }, { "auxiliary_loss_clip": 0.0111406, "auxiliary_loss_mlp": 0.01038013, "balance_loss_clip": 1.04494834, "balance_loss_mlp": 1.02070343, "epoch": 0.520892202248542, "flos": 26907075653760.0, "grad_norm": 3.0106156485888995, "language_loss": 0.84957492, "learning_rate": 1.96183270632467e-06, "loss": 0.87109566, "num_input_tokens_seen": 93509770, "step": 4332, "time_per_iteration": 2.715334177017212 }, { "auxiliary_loss_clip": 0.01098761, "auxiliary_loss_mlp": 0.00775431, "balance_loss_clip": 1.03986263, "balance_loss_mlp": 1.00041485, "epoch": 0.5210124451391811, "flos": 25849434666240.0, "grad_norm": 1.6785467887552894, "language_loss": 0.78943467, "learning_rate": 1.9610538794500644e-06, "loss": 0.80817658, "num_input_tokens_seen": 93529320, "step": 4333, "time_per_iteration": 2.727844715118408 }, { "auxiliary_loss_clip": 0.0101678, "auxiliary_loss_mlp": 0.01006269, "balance_loss_clip": 1.01266026, "balance_loss_mlp": 1.00434971, "epoch": 0.5211326880298203, "flos": 70553804319360.0, "grad_norm": 0.7652159650709545, "language_loss": 0.59423041, "learning_rate": 1.9602750584835542e-06, "loss": 0.61446089, "num_input_tokens_seen": 93595255, "step": 4334, "time_per_iteration": 4.237839460372925 }, { "auxiliary_loss_clip": 0.01114524, "auxiliary_loss_mlp": 0.01046552, "balance_loss_clip": 1.04142427, "balance_loss_mlp": 1.02811027, "epoch": 0.5212529309204593, "flos": 15628898787840.0, "grad_norm": 2.702737434027704, "language_loss": 0.82717586, "learning_rate": 1.959496243543286e-06, "loss": 0.84878671, "num_input_tokens_seen": 93613135, "step": 4335, "time_per_iteration": 2.642749786376953 }, { "auxiliary_loss_clip": 0.01140747, "auxiliary_loss_mlp": 0.01058254, "balance_loss_clip": 1.05054128, "balance_loss_mlp": 1.04121947, "epoch": 0.5213731738110984, "flos": 26242655829120.0, "grad_norm": 2.1068656693970094, "language_loss": 0.79647446, "learning_rate": 1.9587174347474057e-06, "loss": 0.81846452, "num_input_tokens_seen": 93629645, "step": 4336, "time_per_iteration": 2.6281540393829346 }, { "auxiliary_loss_clip": 0.01084443, "auxiliary_loss_mlp": 0.01049017, "balance_loss_clip": 1.03881192, "balance_loss_mlp": 1.03021812, "epoch": 0.5214934167017375, "flos": 19418407637760.0, "grad_norm": 2.420754102349941, "language_loss": 0.82252234, "learning_rate": 1.9579386322140574e-06, "loss": 0.84385693, "num_input_tokens_seen": 93645325, "step": 4337, "time_per_iteration": 2.7106566429138184 }, { "auxiliary_loss_clip": 0.01146362, "auxiliary_loss_mlp": 0.00773609, "balance_loss_clip": 1.04659414, "balance_loss_mlp": 1.00056982, "epoch": 0.5216136595923766, "flos": 30955788023040.0, "grad_norm": 2.1014950905071434, "language_loss": 0.81049418, "learning_rate": 1.9571598360613854e-06, "loss": 0.82969391, "num_input_tokens_seen": 93668200, "step": 4338, "time_per_iteration": 2.661876678466797 }, { "auxiliary_loss_clip": 0.01103471, "auxiliary_loss_mlp": 0.01044001, "balance_loss_clip": 1.03745317, "balance_loss_mlp": 1.02452266, "epoch": 0.5217339024830157, "flos": 21945047143680.0, "grad_norm": 2.3531177083611405, "language_loss": 0.69741362, "learning_rate": 1.956381046407532e-06, "loss": 0.71888828, "num_input_tokens_seen": 93688495, "step": 4339, "time_per_iteration": 2.5844006538391113 }, { "auxiliary_loss_clip": 0.01103935, "auxiliary_loss_mlp": 0.0104466, "balance_loss_clip": 1.04332447, "balance_loss_mlp": 1.02754188, "epoch": 0.5218541453736548, "flos": 20923209037440.0, "grad_norm": 2.7477053252118506, "language_loss": 0.86462533, "learning_rate": 1.9556022633706394e-06, "loss": 0.88611132, "num_input_tokens_seen": 93707285, "step": 4340, "time_per_iteration": 2.659428119659424 }, { "auxiliary_loss_clip": 0.01111426, "auxiliary_loss_mlp": 0.01050458, "balance_loss_clip": 1.04232693, "balance_loss_mlp": 1.03243303, "epoch": 0.5219743882642939, "flos": 23951663498880.0, "grad_norm": 2.453249281945144, "language_loss": 0.80119419, "learning_rate": 1.954823487068848e-06, "loss": 0.82281297, "num_input_tokens_seen": 93727495, "step": 4341, "time_per_iteration": 2.671416759490967 }, { "auxiliary_loss_clip": 0.01133022, "auxiliary_loss_mlp": 0.01037688, "balance_loss_clip": 1.04660439, "balance_loss_mlp": 1.02204764, "epoch": 0.5220946311549329, "flos": 28799280213120.0, "grad_norm": 2.0485881245909874, "language_loss": 0.80952662, "learning_rate": 1.9540447176202976e-06, "loss": 0.83123374, "num_input_tokens_seen": 93748740, "step": 4342, "time_per_iteration": 2.6454994678497314 }, { "auxiliary_loss_clip": 0.0103206, "auxiliary_loss_mlp": 0.01001565, "balance_loss_clip": 1.01036191, "balance_loss_mlp": 0.99975318, "epoch": 0.5222148740455721, "flos": 67189369017600.0, "grad_norm": 0.8691461039070355, "language_loss": 0.606866, "learning_rate": 1.9532659551431272e-06, "loss": 0.62720227, "num_input_tokens_seen": 93815770, "step": 4343, "time_per_iteration": 3.3665482997894287 }, { "auxiliary_loss_clip": 0.01128976, "auxiliary_loss_mlp": 0.01034538, "balance_loss_clip": 1.04370642, "balance_loss_mlp": 1.01743126, "epoch": 0.5223351169362112, "flos": 61856164339200.0, "grad_norm": 1.691921903080628, "language_loss": 0.67758018, "learning_rate": 1.9524871997554744e-06, "loss": 0.69921529, "num_input_tokens_seen": 93843530, "step": 4344, "time_per_iteration": 3.0560200214385986 }, { "auxiliary_loss_clip": 0.01133425, "auxiliary_loss_mlp": 0.01044573, "balance_loss_clip": 1.04684401, "balance_loss_mlp": 1.02720439, "epoch": 0.5224553598268502, "flos": 14647388676480.0, "grad_norm": 2.4684419772492094, "language_loss": 0.81074303, "learning_rate": 1.951708451575475e-06, "loss": 0.83252299, "num_input_tokens_seen": 93860595, "step": 4345, "time_per_iteration": 2.809213161468506 }, { "auxiliary_loss_clip": 0.01113161, "auxiliary_loss_mlp": 0.01049348, "balance_loss_clip": 1.04155612, "balance_loss_mlp": 1.03216958, "epoch": 0.5225756027174894, "flos": 14826043946880.0, "grad_norm": 2.956737740510326, "language_loss": 0.82297939, "learning_rate": 1.9509297107212657e-06, "loss": 0.84460455, "num_input_tokens_seen": 93877365, "step": 4346, "time_per_iteration": 2.783022880554199 }, { "auxiliary_loss_clip": 0.01144827, "auxiliary_loss_mlp": 0.01043949, "balance_loss_clip": 1.04827332, "balance_loss_mlp": 1.02853549, "epoch": 0.5226958456081284, "flos": 23512009029120.0, "grad_norm": 2.077450197934366, "language_loss": 0.79123372, "learning_rate": 1.95015097731098e-06, "loss": 0.8131215, "num_input_tokens_seen": 93896855, "step": 4347, "time_per_iteration": 2.601506233215332 }, { "auxiliary_loss_clip": 0.01140708, "auxiliary_loss_mlp": 0.0104369, "balance_loss_clip": 1.04358792, "balance_loss_mlp": 1.02677393, "epoch": 0.5228160884987675, "flos": 19062928690560.0, "grad_norm": 2.5502619802487083, "language_loss": 0.81844687, "learning_rate": 1.949372251462751e-06, "loss": 0.8402909, "num_input_tokens_seen": 93914270, "step": 4348, "time_per_iteration": 2.594684362411499 }, { "auxiliary_loss_clip": 0.01106817, "auxiliary_loss_mlp": 0.00771674, "balance_loss_clip": 1.04500103, "balance_loss_mlp": 1.00040507, "epoch": 0.5229363313894067, "flos": 21063224252160.0, "grad_norm": 5.012660819468746, "language_loss": 0.82838392, "learning_rate": 1.9485935332947124e-06, "loss": 0.84716892, "num_input_tokens_seen": 93932180, "step": 4349, "time_per_iteration": 3.678948163986206 }, { "auxiliary_loss_clip": 0.01110721, "auxiliary_loss_mlp": 0.01040699, "balance_loss_clip": 1.04241168, "balance_loss_mlp": 1.02520216, "epoch": 0.5230565742800457, "flos": 14830389492480.0, "grad_norm": 2.3109643564025624, "language_loss": 0.83351153, "learning_rate": 1.947814822924993e-06, "loss": 0.85502571, "num_input_tokens_seen": 93949690, "step": 4350, "time_per_iteration": 3.5853521823883057 }, { "auxiliary_loss_clip": 0.0114067, "auxiliary_loss_mlp": 0.01035972, "balance_loss_clip": 1.04726756, "balance_loss_mlp": 1.02079666, "epoch": 0.5231768171706848, "flos": 25813021253760.0, "grad_norm": 2.2597634018396553, "language_loss": 0.82995808, "learning_rate": 1.9470361204717236e-06, "loss": 0.85172445, "num_input_tokens_seen": 93968830, "step": 4351, "time_per_iteration": 2.6361582279205322 }, { "auxiliary_loss_clip": 0.01110256, "auxiliary_loss_mlp": 0.00773367, "balance_loss_clip": 1.04441381, "balance_loss_mlp": 1.00043905, "epoch": 0.5232970600613239, "flos": 22743807834240.0, "grad_norm": 1.5607918629305546, "language_loss": 0.80622256, "learning_rate": 1.9462574260530326e-06, "loss": 0.82505876, "num_input_tokens_seen": 93989110, "step": 4352, "time_per_iteration": 2.7363877296447754 }, { "auxiliary_loss_clip": 0.0112325, "auxiliary_loss_mlp": 0.01046451, "balance_loss_clip": 1.04372633, "balance_loss_mlp": 1.02903438, "epoch": 0.523417302951963, "flos": 17310703432320.0, "grad_norm": 2.30129261522331, "language_loss": 0.80839586, "learning_rate": 1.9454787397870472e-06, "loss": 0.83009291, "num_input_tokens_seen": 94006430, "step": 4353, "time_per_iteration": 2.6150622367858887 }, { "auxiliary_loss_clip": 0.01070491, "auxiliary_loss_mlp": 0.01040868, "balance_loss_clip": 1.04016137, "balance_loss_mlp": 1.02457237, "epoch": 0.523537545842602, "flos": 18551740285440.0, "grad_norm": 2.1292931577609564, "language_loss": 0.71508193, "learning_rate": 1.944700061791894e-06, "loss": 0.7361955, "num_input_tokens_seen": 94024825, "step": 4354, "time_per_iteration": 2.681832790374756 }, { "auxiliary_loss_clip": 0.01131043, "auxiliary_loss_mlp": 0.01039306, "balance_loss_clip": 1.04720545, "balance_loss_mlp": 1.02308178, "epoch": 0.5236577887332411, "flos": 19719267955200.0, "grad_norm": 2.594505731759452, "language_loss": 0.65138495, "learning_rate": 1.943921392185698e-06, "loss": 0.67308843, "num_input_tokens_seen": 94043450, "step": 4355, "time_per_iteration": 3.6397180557250977 }, { "auxiliary_loss_clip": 0.01119548, "auxiliary_loss_mlp": 0.01046012, "balance_loss_clip": 1.0444622, "balance_loss_mlp": 1.03077674, "epoch": 0.5237780316238803, "flos": 23550218121600.0, "grad_norm": 2.1178428783681027, "language_loss": 0.77112627, "learning_rate": 1.9431427310865814e-06, "loss": 0.79278189, "num_input_tokens_seen": 94063055, "step": 4356, "time_per_iteration": 2.6251182556152344 }, { "auxiliary_loss_clip": 0.01084785, "auxiliary_loss_mlp": 0.01044143, "balance_loss_clip": 1.03918195, "balance_loss_mlp": 1.02707219, "epoch": 0.5238982745145193, "flos": 22491894775680.0, "grad_norm": 1.7366481404130032, "language_loss": 0.78948224, "learning_rate": 1.942364078612667e-06, "loss": 0.81077147, "num_input_tokens_seen": 94081785, "step": 4357, "time_per_iteration": 2.6765663623809814 }, { "auxiliary_loss_clip": 0.01115303, "auxiliary_loss_mlp": 0.01050108, "balance_loss_clip": 1.04612458, "balance_loss_mlp": 1.03236938, "epoch": 0.5240185174051584, "flos": 27088927234560.0, "grad_norm": 2.029120908901185, "language_loss": 0.7542479, "learning_rate": 1.9415854348820765e-06, "loss": 0.77590209, "num_input_tokens_seen": 94101635, "step": 4358, "time_per_iteration": 2.728670120239258 }, { "auxiliary_loss_clip": 0.01136099, "auxiliary_loss_mlp": 0.01045286, "balance_loss_clip": 1.04530668, "balance_loss_mlp": 1.02857256, "epoch": 0.5241387602957975, "flos": 22674680110080.0, "grad_norm": 2.455032195925325, "language_loss": 0.6833142, "learning_rate": 1.940806800012929e-06, "loss": 0.70512807, "num_input_tokens_seen": 94121705, "step": 4359, "time_per_iteration": 3.5303966999053955 }, { "auxiliary_loss_clip": 0.01090941, "auxiliary_loss_mlp": 0.00773587, "balance_loss_clip": 1.04256225, "balance_loss_mlp": 1.00053215, "epoch": 0.5242590031864366, "flos": 40553453134080.0, "grad_norm": 2.004112905975297, "language_loss": 0.63853252, "learning_rate": 1.9400281741233432e-06, "loss": 0.65717775, "num_input_tokens_seen": 94146595, "step": 4360, "time_per_iteration": 2.9171626567840576 }, { "auxiliary_loss_clip": 0.01016418, "auxiliary_loss_mlp": 0.01006021, "balance_loss_clip": 1.0185492, "balance_loss_mlp": 1.00405455, "epoch": 0.5243792460770756, "flos": 66676313105280.0, "grad_norm": 0.6558627211817338, "language_loss": 0.524948, "learning_rate": 1.939249557331435e-06, "loss": 0.54517245, "num_input_tokens_seen": 94212410, "step": 4361, "time_per_iteration": 3.2976272106170654 }, { "auxiliary_loss_clip": 0.01115355, "auxiliary_loss_mlp": 0.01040321, "balance_loss_clip": 1.04504681, "balance_loss_mlp": 1.02401936, "epoch": 0.5244994889677148, "flos": 28183663992960.0, "grad_norm": 2.95507200299761, "language_loss": 0.7250365, "learning_rate": 1.938470949755321e-06, "loss": 0.74659324, "num_input_tokens_seen": 94232290, "step": 4362, "time_per_iteration": 2.7639167308807373 }, { "auxiliary_loss_clip": 0.01014544, "auxiliary_loss_mlp": 0.01001915, "balance_loss_clip": 1.01041782, "balance_loss_mlp": 0.99986452, "epoch": 0.5246197318583539, "flos": 65950379239680.0, "grad_norm": 0.8088478634231896, "language_loss": 0.55689788, "learning_rate": 1.937692351513115e-06, "loss": 0.57706249, "num_input_tokens_seen": 94291285, "step": 4363, "time_per_iteration": 3.215040683746338 }, { "auxiliary_loss_clip": 0.01134179, "auxiliary_loss_mlp": 0.01038411, "balance_loss_clip": 1.04490209, "balance_loss_mlp": 1.02225816, "epoch": 0.5247399747489929, "flos": 21033490769280.0, "grad_norm": 2.152866257084933, "language_loss": 0.80577666, "learning_rate": 1.9369137627229297e-06, "loss": 0.82750249, "num_input_tokens_seen": 94309685, "step": 4364, "time_per_iteration": 2.6334784030914307 }, { "auxiliary_loss_clip": 0.0113427, "auxiliary_loss_mlp": 0.01039657, "balance_loss_clip": 1.04911852, "balance_loss_mlp": 1.02238381, "epoch": 0.5248602176396321, "flos": 19025940660480.0, "grad_norm": 2.645166383753813, "language_loss": 0.88005203, "learning_rate": 1.936135183502877e-06, "loss": 0.90179133, "num_input_tokens_seen": 94326985, "step": 4365, "time_per_iteration": 2.606201171875 }, { "auxiliary_loss_clip": 0.01105548, "auxiliary_loss_mlp": 0.01047858, "balance_loss_clip": 1.04061294, "balance_loss_mlp": 1.03079867, "epoch": 0.5249804605302711, "flos": 22200084685440.0, "grad_norm": 2.306043141801339, "language_loss": 0.80473828, "learning_rate": 1.935356613971066e-06, "loss": 0.82627237, "num_input_tokens_seen": 94347645, "step": 4366, "time_per_iteration": 2.699315309524536 }, { "auxiliary_loss_clip": 0.01115502, "auxiliary_loss_mlp": 0.00772591, "balance_loss_clip": 1.04319382, "balance_loss_mlp": 1.00039124, "epoch": 0.5251007034209102, "flos": 23805686626560.0, "grad_norm": 1.7885149064820485, "language_loss": 0.77354658, "learning_rate": 1.9345780542456047e-06, "loss": 0.79242754, "num_input_tokens_seen": 94367020, "step": 4367, "time_per_iteration": 2.6994681358337402 }, { "auxiliary_loss_clip": 0.01120906, "auxiliary_loss_mlp": 0.01040404, "balance_loss_clip": 1.04340076, "balance_loss_mlp": 1.0244422, "epoch": 0.5252209463115494, "flos": 23294605962240.0, "grad_norm": 2.918504064716654, "language_loss": 0.71737623, "learning_rate": 1.9337995044446007e-06, "loss": 0.73898935, "num_input_tokens_seen": 94385860, "step": 4368, "time_per_iteration": 2.617412567138672 }, { "auxiliary_loss_clip": 0.01133529, "auxiliary_loss_mlp": 0.01043082, "balance_loss_clip": 1.04546344, "balance_loss_mlp": 1.02646422, "epoch": 0.5253411892021884, "flos": 19828687760640.0, "grad_norm": 2.2572679004985923, "language_loss": 0.79988128, "learning_rate": 1.9330209646861596e-06, "loss": 0.82164741, "num_input_tokens_seen": 94405010, "step": 4369, "time_per_iteration": 2.5871026515960693 }, { "auxiliary_loss_clip": 0.01114626, "auxiliary_loss_mlp": 0.01042104, "balance_loss_clip": 1.04293728, "balance_loss_mlp": 1.02611756, "epoch": 0.5254614320928275, "flos": 24133730561280.0, "grad_norm": 1.721136362174143, "language_loss": 0.77881414, "learning_rate": 1.9322424350883843e-06, "loss": 0.80038142, "num_input_tokens_seen": 94426845, "step": 4370, "time_per_iteration": 2.692749500274658 }, { "auxiliary_loss_clip": 0.01116781, "auxiliary_loss_mlp": 0.01040247, "balance_loss_clip": 1.04332995, "balance_loss_mlp": 1.02412963, "epoch": 0.5255816749834666, "flos": 24644954880000.0, "grad_norm": 1.7041167264418673, "language_loss": 0.78681201, "learning_rate": 1.931463915769379e-06, "loss": 0.80838227, "num_input_tokens_seen": 94446960, "step": 4371, "time_per_iteration": 2.6624534130096436 }, { "auxiliary_loss_clip": 0.01092565, "auxiliary_loss_mlp": 0.01041681, "balance_loss_clip": 1.0410912, "balance_loss_mlp": 1.02571881, "epoch": 0.5257019178741057, "flos": 14136595320960.0, "grad_norm": 2.2555028657472396, "language_loss": 0.74334335, "learning_rate": 1.930685406847242e-06, "loss": 0.76468575, "num_input_tokens_seen": 94461535, "step": 4372, "time_per_iteration": 2.6921799182891846 }, { "auxiliary_loss_clip": 0.0111672, "auxiliary_loss_mlp": 0.0103894, "balance_loss_clip": 1.04428744, "balance_loss_mlp": 1.02344894, "epoch": 0.5258221607647448, "flos": 23548961145600.0, "grad_norm": 1.4123029848173343, "language_loss": 0.81697941, "learning_rate": 1.9299069084400734e-06, "loss": 0.83853602, "num_input_tokens_seen": 94482395, "step": 4373, "time_per_iteration": 2.6615145206451416 }, { "auxiliary_loss_clip": 0.01106041, "auxiliary_loss_mlp": 0.01040956, "balance_loss_clip": 1.04359245, "balance_loss_mlp": 1.02311015, "epoch": 0.5259424036553839, "flos": 24966103403520.0, "grad_norm": 2.2261080671118565, "language_loss": 0.69729626, "learning_rate": 1.9291284206659717e-06, "loss": 0.71876615, "num_input_tokens_seen": 94500580, "step": 4374, "time_per_iteration": 2.753234624862671 }, { "auxiliary_loss_clip": 0.01143952, "auxiliary_loss_mlp": 0.01042343, "balance_loss_clip": 1.04759169, "balance_loss_mlp": 1.0259639, "epoch": 0.526062646546023, "flos": 28763908295040.0, "grad_norm": 2.2539951889485197, "language_loss": 0.71287477, "learning_rate": 1.928349943643032e-06, "loss": 0.73473775, "num_input_tokens_seen": 94519680, "step": 4375, "time_per_iteration": 4.587484836578369 }, { "auxiliary_loss_clip": 0.01125027, "auxiliary_loss_mlp": 0.0104085, "balance_loss_clip": 1.04611993, "balance_loss_mlp": 1.02518582, "epoch": 0.526182889436662, "flos": 22821375254400.0, "grad_norm": 1.9289047214833983, "language_loss": 0.81990469, "learning_rate": 1.9275714774893493e-06, "loss": 0.8415634, "num_input_tokens_seen": 94539135, "step": 4376, "time_per_iteration": 2.6132256984710693 }, { "auxiliary_loss_clip": 0.01094128, "auxiliary_loss_mlp": 0.01058295, "balance_loss_clip": 1.03945959, "balance_loss_mlp": 1.04069996, "epoch": 0.5263031323273012, "flos": 22929466256640.0, "grad_norm": 2.591327895576922, "language_loss": 0.72428143, "learning_rate": 1.9267930223230154e-06, "loss": 0.74580562, "num_input_tokens_seen": 94557610, "step": 4377, "time_per_iteration": 2.6903774738311768 }, { "auxiliary_loss_clip": 0.01121735, "auxiliary_loss_mlp": 0.01042815, "balance_loss_clip": 1.04620576, "balance_loss_mlp": 1.02554131, "epoch": 0.5264233752179402, "flos": 17748634049280.0, "grad_norm": 2.070325154628623, "language_loss": 0.78079009, "learning_rate": 1.9260145782621224e-06, "loss": 0.80243564, "num_input_tokens_seen": 94575390, "step": 4378, "time_per_iteration": 2.660308837890625 }, { "auxiliary_loss_clip": 0.01114113, "auxiliary_loss_mlp": 0.01043666, "balance_loss_clip": 1.04343367, "balance_loss_mlp": 1.02810931, "epoch": 0.5265436181085793, "flos": 24421626069120.0, "grad_norm": 1.825919391037612, "language_loss": 0.88124532, "learning_rate": 1.925236145424758e-06, "loss": 0.90282321, "num_input_tokens_seen": 94594210, "step": 4379, "time_per_iteration": 2.695815324783325 }, { "auxiliary_loss_clip": 0.01037812, "auxiliary_loss_mlp": 0.01032356, "balance_loss_clip": 1.01433969, "balance_loss_mlp": 1.03060329, "epoch": 0.5266638609992185, "flos": 69207298156800.0, "grad_norm": 0.7050127906749277, "language_loss": 0.57576978, "learning_rate": 1.924457723929012e-06, "loss": 0.59647149, "num_input_tokens_seen": 94665020, "step": 4380, "time_per_iteration": 4.310207366943359 }, { "auxiliary_loss_clip": 0.01133785, "auxiliary_loss_mlp": 0.01039906, "balance_loss_clip": 1.04734147, "balance_loss_mlp": 1.02275169, "epoch": 0.5267841038898575, "flos": 20738699850240.0, "grad_norm": 1.7491362902829226, "language_loss": 0.82857811, "learning_rate": 1.9236793138929685e-06, "loss": 0.85031503, "num_input_tokens_seen": 94684290, "step": 4381, "time_per_iteration": 2.6522250175476074 }, { "auxiliary_loss_clip": 0.01131804, "auxiliary_loss_mlp": 0.01042845, "balance_loss_clip": 1.04464006, "balance_loss_mlp": 1.02638173, "epoch": 0.5269043467804966, "flos": 17234392988160.0, "grad_norm": 1.9891850243272755, "language_loss": 0.81160218, "learning_rate": 1.9229009154347133e-06, "loss": 0.83334869, "num_input_tokens_seen": 94701880, "step": 4382, "time_per_iteration": 2.588228940963745 }, { "auxiliary_loss_clip": 0.01076278, "auxiliary_loss_mlp": 0.00772191, "balance_loss_clip": 1.03720987, "balance_loss_mlp": 1.00050998, "epoch": 0.5270245896711357, "flos": 18223157646720.0, "grad_norm": 2.2353458637032584, "language_loss": 0.80563933, "learning_rate": 1.922122528672327e-06, "loss": 0.82412398, "num_input_tokens_seen": 94720545, "step": 4383, "time_per_iteration": 2.6729438304901123 }, { "auxiliary_loss_clip": 0.01141686, "auxiliary_loss_mlp": 0.010362, "balance_loss_clip": 1.04781628, "balance_loss_mlp": 1.02069116, "epoch": 0.5271448325617748, "flos": 21287558643840.0, "grad_norm": 2.8176687378845706, "language_loss": 0.78530169, "learning_rate": 1.9213441537238914e-06, "loss": 0.80708057, "num_input_tokens_seen": 94737420, "step": 4384, "time_per_iteration": 2.6038525104522705 }, { "auxiliary_loss_clip": 0.00998704, "auxiliary_loss_mlp": 0.01030824, "balance_loss_clip": 1.01476359, "balance_loss_mlp": 1.0292027, "epoch": 0.5272650754524139, "flos": 65495497403520.0, "grad_norm": 0.8294231277240873, "language_loss": 0.5733614, "learning_rate": 1.920565790707485e-06, "loss": 0.59365678, "num_input_tokens_seen": 94802810, "step": 4385, "time_per_iteration": 4.3595311641693115 }, { "auxiliary_loss_clip": 0.01099562, "auxiliary_loss_mlp": 0.01050741, "balance_loss_clip": 1.04136992, "balance_loss_mlp": 1.03405213, "epoch": 0.527385318343053, "flos": 19676426008320.0, "grad_norm": 2.5076172019300533, "language_loss": 0.65698636, "learning_rate": 1.9197874397411853e-06, "loss": 0.67848939, "num_input_tokens_seen": 94819440, "step": 4386, "time_per_iteration": 2.7093772888183594 }, { "auxiliary_loss_clip": 0.01106159, "auxiliary_loss_mlp": 0.01049092, "balance_loss_clip": 1.04026115, "balance_loss_mlp": 1.03200912, "epoch": 0.5275055612336921, "flos": 12712018947840.0, "grad_norm": 3.837510503958401, "language_loss": 0.6607337, "learning_rate": 1.919009100943067e-06, "loss": 0.68228614, "num_input_tokens_seen": 94835130, "step": 4387, "time_per_iteration": 2.6245617866516113 }, { "auxiliary_loss_clip": 0.01107085, "auxiliary_loss_mlp": 0.01049713, "balance_loss_clip": 1.04759097, "balance_loss_mlp": 1.033131, "epoch": 0.5276258041243311, "flos": 17749029098880.0, "grad_norm": 2.4665719813483706, "language_loss": 0.65530121, "learning_rate": 1.9182307744312043e-06, "loss": 0.67686921, "num_input_tokens_seen": 94852235, "step": 4388, "time_per_iteration": 2.705178737640381 }, { "auxiliary_loss_clip": 0.01118336, "auxiliary_loss_mlp": 0.01037354, "balance_loss_clip": 1.04414654, "balance_loss_mlp": 1.01997304, "epoch": 0.5277460470149702, "flos": 22710447077760.0, "grad_norm": 1.8633470597951738, "language_loss": 0.76446211, "learning_rate": 1.9174524603236676e-06, "loss": 0.78601903, "num_input_tokens_seen": 94871185, "step": 4389, "time_per_iteration": 2.674651622772217 }, { "auxiliary_loss_clip": 0.0111989, "auxiliary_loss_mlp": 0.01047238, "balance_loss_clip": 1.04325914, "balance_loss_mlp": 1.03057253, "epoch": 0.5278662899056094, "flos": 19902699734400.0, "grad_norm": 3.9803493892239055, "language_loss": 0.76200259, "learning_rate": 1.916674158738527e-06, "loss": 0.78367388, "num_input_tokens_seen": 94890090, "step": 4390, "time_per_iteration": 2.6455674171447754 }, { "auxiliary_loss_clip": 0.01101312, "auxiliary_loss_mlp": 0.00773895, "balance_loss_clip": 1.04337668, "balance_loss_mlp": 1.00047326, "epoch": 0.5279865327962484, "flos": 18005215875840.0, "grad_norm": 3.1520496090422965, "language_loss": 0.60462397, "learning_rate": 1.9158958697938506e-06, "loss": 0.62337613, "num_input_tokens_seen": 94908470, "step": 4391, "time_per_iteration": 2.6983349323272705 }, { "auxiliary_loss_clip": 0.01112678, "auxiliary_loss_mlp": 0.01053417, "balance_loss_clip": 1.04217482, "balance_loss_mlp": 1.03520203, "epoch": 0.5281067756868875, "flos": 15924443892480.0, "grad_norm": 2.216805513849491, "language_loss": 0.85931635, "learning_rate": 1.9151175936077032e-06, "loss": 0.88097733, "num_input_tokens_seen": 94923440, "step": 4392, "time_per_iteration": 2.57619571685791 }, { "auxiliary_loss_clip": 0.0112753, "auxiliary_loss_mlp": 0.01046738, "balance_loss_clip": 1.04657984, "balance_loss_mlp": 1.03081191, "epoch": 0.5282270185775266, "flos": 19426488197760.0, "grad_norm": 1.615902156848014, "language_loss": 0.79296148, "learning_rate": 1.9143393302981507e-06, "loss": 0.81470418, "num_input_tokens_seen": 94941125, "step": 4393, "time_per_iteration": 2.5843312740325928 }, { "auxiliary_loss_clip": 0.01121607, "auxiliary_loss_mlp": 0.0104499, "balance_loss_clip": 1.04513645, "balance_loss_mlp": 1.02938509, "epoch": 0.5283472614681657, "flos": 16399613934720.0, "grad_norm": 1.8075947295825971, "language_loss": 0.8331148, "learning_rate": 1.913561079983252e-06, "loss": 0.85478079, "num_input_tokens_seen": 94959950, "step": 4394, "time_per_iteration": 2.6210641860961914 }, { "auxiliary_loss_clip": 0.01125754, "auxiliary_loss_mlp": 0.01057545, "balance_loss_clip": 1.04534924, "balance_loss_mlp": 1.03846002, "epoch": 0.5284675043588047, "flos": 26760524163840.0, "grad_norm": 2.2421030425009576, "language_loss": 0.75150317, "learning_rate": 1.9127828427810693e-06, "loss": 0.77333617, "num_input_tokens_seen": 94980515, "step": 4395, "time_per_iteration": 2.648608922958374 }, { "auxiliary_loss_clip": 0.0111117, "auxiliary_loss_mlp": 0.01034173, "balance_loss_clip": 1.04235315, "balance_loss_mlp": 1.01810372, "epoch": 0.5285877472494439, "flos": 19899898473600.0, "grad_norm": 3.2260111015577726, "language_loss": 0.81041932, "learning_rate": 1.9120046188096607e-06, "loss": 0.83187276, "num_input_tokens_seen": 94998560, "step": 4396, "time_per_iteration": 2.660600423812866 }, { "auxiliary_loss_clip": 0.0111796, "auxiliary_loss_mlp": 0.01043027, "balance_loss_clip": 1.04876781, "balance_loss_mlp": 1.02514577, "epoch": 0.528707990140083, "flos": 20011257613440.0, "grad_norm": 1.7784150018811007, "language_loss": 0.74072999, "learning_rate": 1.9112264081870804e-06, "loss": 0.76233983, "num_input_tokens_seen": 95016950, "step": 4397, "time_per_iteration": 2.665235757827759 }, { "auxiliary_loss_clip": 0.01106108, "auxiliary_loss_mlp": 0.01038868, "balance_loss_clip": 1.04475176, "balance_loss_mlp": 1.02105844, "epoch": 0.528828233030722, "flos": 20667956014080.0, "grad_norm": 2.478505189026907, "language_loss": 0.75561774, "learning_rate": 1.9104482110313843e-06, "loss": 0.77706754, "num_input_tokens_seen": 95036540, "step": 4398, "time_per_iteration": 2.6981184482574463 }, { "auxiliary_loss_clip": 0.01130583, "auxiliary_loss_mlp": 0.01041424, "balance_loss_clip": 1.04698384, "balance_loss_mlp": 1.02453196, "epoch": 0.5289484759213612, "flos": 25192448956800.0, "grad_norm": 2.327270500474242, "language_loss": 0.74172461, "learning_rate": 1.909670027460623e-06, "loss": 0.76344466, "num_input_tokens_seen": 95053840, "step": 4399, "time_per_iteration": 2.6325161457061768 }, { "auxiliary_loss_clip": 0.01133375, "auxiliary_loss_mlp": 0.01040072, "balance_loss_clip": 1.04786777, "balance_loss_mlp": 1.02446759, "epoch": 0.5290687188120002, "flos": 31139255715840.0, "grad_norm": 1.890177254482091, "language_loss": 0.71939564, "learning_rate": 1.908891857592847e-06, "loss": 0.74113005, "num_input_tokens_seen": 95074910, "step": 4400, "time_per_iteration": 2.7011423110961914 }, { "auxiliary_loss_clip": 0.01102962, "auxiliary_loss_mlp": 0.01042811, "balance_loss_clip": 1.04632854, "balance_loss_mlp": 1.02674174, "epoch": 0.5291889617026393, "flos": 20119851406080.0, "grad_norm": 2.4235792411683126, "language_loss": 0.89724141, "learning_rate": 1.9081137015461034e-06, "loss": 0.91869915, "num_input_tokens_seen": 95090985, "step": 4401, "time_per_iteration": 3.6103193759918213 }, { "auxiliary_loss_clip": 0.01089175, "auxiliary_loss_mlp": 0.01041443, "balance_loss_clip": 1.04441988, "balance_loss_mlp": 1.02353799, "epoch": 0.5293092045932785, "flos": 19643747610240.0, "grad_norm": 1.9040938052807892, "language_loss": 0.90717149, "learning_rate": 1.9073355594384383e-06, "loss": 0.9284777, "num_input_tokens_seen": 95109225, "step": 4402, "time_per_iteration": 2.67734432220459 }, { "auxiliary_loss_clip": 0.01099573, "auxiliary_loss_mlp": 0.01046165, "balance_loss_clip": 1.04600644, "balance_loss_mlp": 1.02879596, "epoch": 0.5294294474839175, "flos": 24317736958080.0, "grad_norm": 1.969698797908576, "language_loss": 0.8057937, "learning_rate": 1.906557431387895e-06, "loss": 0.82725114, "num_input_tokens_seen": 95128215, "step": 4403, "time_per_iteration": 2.7164134979248047 }, { "auxiliary_loss_clip": 0.01102829, "auxiliary_loss_mlp": 0.01045763, "balance_loss_clip": 1.04635966, "balance_loss_mlp": 1.02797127, "epoch": 0.5295496903745566, "flos": 18875941464960.0, "grad_norm": 2.4415211994023216, "language_loss": 0.78790295, "learning_rate": 1.905779317512516e-06, "loss": 0.80938888, "num_input_tokens_seen": 95145760, "step": 4404, "time_per_iteration": 2.6619679927825928 }, { "auxiliary_loss_clip": 0.01134778, "auxiliary_loss_mlp": 0.01047223, "balance_loss_clip": 1.0489192, "balance_loss_mlp": 1.03054559, "epoch": 0.5296699332651957, "flos": 20923101296640.0, "grad_norm": 1.9613456090692565, "language_loss": 0.81024766, "learning_rate": 1.9050012179303385e-06, "loss": 0.83206773, "num_input_tokens_seen": 95164270, "step": 4405, "time_per_iteration": 2.6393020153045654 }, { "auxiliary_loss_clip": 0.01134549, "auxiliary_loss_mlp": 0.01046868, "balance_loss_clip": 1.04432225, "balance_loss_mlp": 1.02933192, "epoch": 0.5297901761558348, "flos": 22046745525120.0, "grad_norm": 2.060101522902241, "language_loss": 0.68886489, "learning_rate": 1.904223132759401e-06, "loss": 0.71067917, "num_input_tokens_seen": 95182870, "step": 4406, "time_per_iteration": 2.601064682006836 }, { "auxiliary_loss_clip": 0.01134204, "auxiliary_loss_mlp": 0.01037064, "balance_loss_clip": 1.04692578, "balance_loss_mlp": 1.0200882, "epoch": 0.5299104190464738, "flos": 21798495653760.0, "grad_norm": 2.8569368177412735, "language_loss": 0.6897378, "learning_rate": 1.9034450621177383e-06, "loss": 0.71145046, "num_input_tokens_seen": 95201190, "step": 4407, "time_per_iteration": 3.6892611980438232 }, { "auxiliary_loss_clip": 0.01134228, "auxiliary_loss_mlp": 0.01055106, "balance_loss_clip": 1.04698348, "balance_loss_mlp": 1.03693867, "epoch": 0.530030661937113, "flos": 14720790119040.0, "grad_norm": 2.40619307684879, "language_loss": 0.7028017, "learning_rate": 1.9026670061233824e-06, "loss": 0.72469509, "num_input_tokens_seen": 95218625, "step": 4408, "time_per_iteration": 2.5703845024108887 }, { "auxiliary_loss_clip": 0.01115455, "auxiliary_loss_mlp": 0.01037989, "balance_loss_clip": 1.04619205, "balance_loss_mlp": 1.02207422, "epoch": 0.5301509048277521, "flos": 21251504367360.0, "grad_norm": 2.4221000215781894, "language_loss": 0.80682975, "learning_rate": 1.901888964894365e-06, "loss": 0.82836419, "num_input_tokens_seen": 95237665, "step": 4409, "time_per_iteration": 2.6423068046569824 }, { "auxiliary_loss_clip": 0.01147064, "auxiliary_loss_mlp": 0.01044233, "balance_loss_clip": 1.04779935, "balance_loss_mlp": 1.02747202, "epoch": 0.5302711477183911, "flos": 25957058791680.0, "grad_norm": 2.3572021801612206, "language_loss": 0.67600989, "learning_rate": 1.9011109385487134e-06, "loss": 0.69792283, "num_input_tokens_seen": 95258915, "step": 4410, "time_per_iteration": 2.610365867614746 }, { "auxiliary_loss_clip": 0.01146727, "auxiliary_loss_mlp": 0.01041917, "balance_loss_clip": 1.04688668, "balance_loss_mlp": 1.02378559, "epoch": 0.5303913906090303, "flos": 22273126992000.0, "grad_norm": 3.3831309721947824, "language_loss": 0.66756916, "learning_rate": 1.900332927204454e-06, "loss": 0.68945557, "num_input_tokens_seen": 95277365, "step": 4411, "time_per_iteration": 3.462996006011963 }, { "auxiliary_loss_clip": 0.01124149, "auxiliary_loss_mlp": 0.01049235, "balance_loss_clip": 1.04332781, "balance_loss_mlp": 1.03169894, "epoch": 0.5305116334996693, "flos": 24936010784640.0, "grad_norm": 2.599597956554867, "language_loss": 0.76730275, "learning_rate": 1.8995549309796097e-06, "loss": 0.78903663, "num_input_tokens_seen": 95296670, "step": 4412, "time_per_iteration": 2.6773006916046143 }, { "auxiliary_loss_clip": 0.01134576, "auxiliary_loss_mlp": 0.01048834, "balance_loss_clip": 1.04760695, "balance_loss_mlp": 1.03041661, "epoch": 0.5306318763903084, "flos": 20189338266240.0, "grad_norm": 1.789783822688699, "language_loss": 0.76819682, "learning_rate": 1.8987769499922028e-06, "loss": 0.79003096, "num_input_tokens_seen": 95315640, "step": 4413, "time_per_iteration": 2.575085163116455 }, { "auxiliary_loss_clip": 0.01129208, "auxiliary_loss_mlp": 0.00772388, "balance_loss_clip": 1.04570353, "balance_loss_mlp": 1.00055385, "epoch": 0.5307521192809476, "flos": 20266366982400.0, "grad_norm": 2.3569560099853395, "language_loss": 0.70794868, "learning_rate": 1.897998984360252e-06, "loss": 0.72696471, "num_input_tokens_seen": 95334610, "step": 4414, "time_per_iteration": 2.58282732963562 }, { "auxiliary_loss_clip": 0.01113994, "auxiliary_loss_mlp": 0.01046057, "balance_loss_clip": 1.04228461, "balance_loss_mlp": 1.02985668, "epoch": 0.5308723621715866, "flos": 28844276976000.0, "grad_norm": 1.3894635702413884, "language_loss": 0.78463805, "learning_rate": 1.897221034201775e-06, "loss": 0.80623859, "num_input_tokens_seen": 95358350, "step": 4415, "time_per_iteration": 2.74627685546875 }, { "auxiliary_loss_clip": 0.01106574, "auxiliary_loss_mlp": 0.01036437, "balance_loss_clip": 1.0433526, "balance_loss_mlp": 1.02030802, "epoch": 0.5309926050622257, "flos": 27457766040960.0, "grad_norm": 8.034588644801596, "language_loss": 0.66973275, "learning_rate": 1.8964430996347842e-06, "loss": 0.69116288, "num_input_tokens_seen": 95379900, "step": 4416, "time_per_iteration": 2.7231476306915283 }, { "auxiliary_loss_clip": 0.01120894, "auxiliary_loss_mlp": 0.01042034, "balance_loss_clip": 1.04453087, "balance_loss_mlp": 1.02391386, "epoch": 0.5311128479528648, "flos": 20514545026560.0, "grad_norm": 1.7466386005957222, "language_loss": 0.82567692, "learning_rate": 1.8956651807772931e-06, "loss": 0.84730619, "num_input_tokens_seen": 95397935, "step": 4417, "time_per_iteration": 2.647920608520508 }, { "auxiliary_loss_clip": 0.01125298, "auxiliary_loss_mlp": 0.010399, "balance_loss_clip": 1.04312479, "balance_loss_mlp": 1.02493942, "epoch": 0.5312330908435039, "flos": 21397660807680.0, "grad_norm": 1.668756798182449, "language_loss": 0.84278905, "learning_rate": 1.8948872777473115e-06, "loss": 0.86444104, "num_input_tokens_seen": 95415890, "step": 4418, "time_per_iteration": 2.615198850631714 }, { "auxiliary_loss_clip": 0.01124294, "auxiliary_loss_mlp": 0.01049707, "balance_loss_clip": 1.04837561, "balance_loss_mlp": 1.03171885, "epoch": 0.531353333734143, "flos": 24717350741760.0, "grad_norm": 2.018156114681779, "language_loss": 0.63153154, "learning_rate": 1.8941093906628458e-06, "loss": 0.65327156, "num_input_tokens_seen": 95433675, "step": 4419, "time_per_iteration": 2.6711859703063965 }, { "auxiliary_loss_clip": 0.01112612, "auxiliary_loss_mlp": 0.01038221, "balance_loss_clip": 1.04110169, "balance_loss_mlp": 1.02069747, "epoch": 0.531473576624782, "flos": 30480689808000.0, "grad_norm": 2.254470302661999, "language_loss": 0.70819074, "learning_rate": 1.893331519641902e-06, "loss": 0.72969913, "num_input_tokens_seen": 95455820, "step": 4420, "time_per_iteration": 2.744053840637207 }, { "auxiliary_loss_clip": 0.01094496, "auxiliary_loss_mlp": 0.01045022, "balance_loss_clip": 1.03917432, "balance_loss_mlp": 1.02607918, "epoch": 0.5315938195154212, "flos": 23002975440000.0, "grad_norm": 2.470855673701127, "language_loss": 0.74106097, "learning_rate": 1.8925536648024815e-06, "loss": 0.76245618, "num_input_tokens_seen": 95473240, "step": 4421, "time_per_iteration": 2.677570104598999 }, { "auxiliary_loss_clip": 0.0114498, "auxiliary_loss_mlp": 0.01037369, "balance_loss_clip": 1.04694772, "balance_loss_mlp": 1.02091825, "epoch": 0.5317140624060602, "flos": 22748584343040.0, "grad_norm": 2.0053837607942926, "language_loss": 0.76273072, "learning_rate": 1.8917758262625849e-06, "loss": 0.78455424, "num_input_tokens_seen": 95493480, "step": 4422, "time_per_iteration": 2.5808637142181396 }, { "auxiliary_loss_clip": 0.01115265, "auxiliary_loss_mlp": 0.01042195, "balance_loss_clip": 1.04472649, "balance_loss_mlp": 1.02554107, "epoch": 0.5318343052966993, "flos": 22821087945600.0, "grad_norm": 2.0071173138095997, "language_loss": 0.80709529, "learning_rate": 1.8909980041402089e-06, "loss": 0.82866991, "num_input_tokens_seen": 95512075, "step": 4423, "time_per_iteration": 2.642582416534424 }, { "auxiliary_loss_clip": 0.01122637, "auxiliary_loss_mlp": 0.01056228, "balance_loss_clip": 1.04265094, "balance_loss_mlp": 1.03707075, "epoch": 0.5319545481873384, "flos": 13626089274240.0, "grad_norm": 2.5967987267623895, "language_loss": 0.6594193, "learning_rate": 1.8902201985533494e-06, "loss": 0.68120795, "num_input_tokens_seen": 95529340, "step": 4424, "time_per_iteration": 2.576049566268921 }, { "auxiliary_loss_clip": 0.01117527, "auxiliary_loss_mlp": 0.01037, "balance_loss_clip": 1.04299092, "balance_loss_mlp": 1.01978636, "epoch": 0.5320747910779775, "flos": 22162522037760.0, "grad_norm": 1.7479982246305747, "language_loss": 0.74719179, "learning_rate": 1.8894424096199983e-06, "loss": 0.76873702, "num_input_tokens_seen": 95548545, "step": 4425, "time_per_iteration": 2.660628318786621 }, { "auxiliary_loss_clip": 0.01134405, "auxiliary_loss_mlp": 0.01044755, "balance_loss_clip": 1.04889262, "balance_loss_mlp": 1.02733874, "epoch": 0.5321950339686166, "flos": 18588081870720.0, "grad_norm": 2.4140629945406524, "language_loss": 0.86050099, "learning_rate": 1.8886646374581463e-06, "loss": 0.88229263, "num_input_tokens_seen": 95567770, "step": 4426, "time_per_iteration": 2.5802979469299316 }, { "auxiliary_loss_clip": 0.01130705, "auxiliary_loss_mlp": 0.01050461, "balance_loss_clip": 1.04346991, "balance_loss_mlp": 1.03076816, "epoch": 0.5323152768592557, "flos": 22856818999680.0, "grad_norm": 2.2141350440555514, "language_loss": 0.71397305, "learning_rate": 1.8878868821857795e-06, "loss": 0.73578471, "num_input_tokens_seen": 95587420, "step": 4427, "time_per_iteration": 3.5757648944854736 }, { "auxiliary_loss_clip": 0.01083293, "auxiliary_loss_mlp": 0.01051059, "balance_loss_clip": 1.03768742, "balance_loss_mlp": 1.03210425, "epoch": 0.5324355197498948, "flos": 33948690998400.0, "grad_norm": 2.141892931949487, "language_loss": 0.75375688, "learning_rate": 1.8871091439208838e-06, "loss": 0.77510035, "num_input_tokens_seen": 95609030, "step": 4428, "time_per_iteration": 2.8032355308532715 }, { "auxiliary_loss_clip": 0.01091783, "auxiliary_loss_mlp": 0.01057027, "balance_loss_clip": 1.04215133, "balance_loss_mlp": 1.036165, "epoch": 0.5325557626405338, "flos": 23256720092160.0, "grad_norm": 2.12219840593113, "language_loss": 0.76935935, "learning_rate": 1.8863314227814414e-06, "loss": 0.79084742, "num_input_tokens_seen": 95627340, "step": 4429, "time_per_iteration": 2.7057785987854004 }, { "auxiliary_loss_clip": 0.01134357, "auxiliary_loss_mlp": 0.01037376, "balance_loss_clip": 1.04467225, "balance_loss_mlp": 1.01957822, "epoch": 0.532676005531173, "flos": 26718687797760.0, "grad_norm": 6.544048569252134, "language_loss": 0.48612931, "learning_rate": 1.8855537188854313e-06, "loss": 0.50784665, "num_input_tokens_seen": 95646315, "step": 4430, "time_per_iteration": 2.654104471206665 }, { "auxiliary_loss_clip": 0.01128467, "auxiliary_loss_mlp": 0.01044239, "balance_loss_clip": 1.0407598, "balance_loss_mlp": 1.02685833, "epoch": 0.5327962484218121, "flos": 17894610921600.0, "grad_norm": 1.9218208722986132, "language_loss": 0.78290021, "learning_rate": 1.8847760323508315e-06, "loss": 0.8046273, "num_input_tokens_seen": 95665220, "step": 4431, "time_per_iteration": 2.5865609645843506 }, { "auxiliary_loss_clip": 0.01112195, "auxiliary_loss_mlp": 0.01043449, "balance_loss_clip": 1.04214668, "balance_loss_mlp": 1.0261879, "epoch": 0.5329164913124511, "flos": 17925385898880.0, "grad_norm": 1.7895339015366394, "language_loss": 0.75806916, "learning_rate": 1.883998363295616e-06, "loss": 0.77962565, "num_input_tokens_seen": 95682700, "step": 4432, "time_per_iteration": 2.6349191665649414 }, { "auxiliary_loss_clip": 0.01028389, "auxiliary_loss_mlp": 0.01009549, "balance_loss_clip": 1.01585829, "balance_loss_mlp": 1.00758207, "epoch": 0.5330367342030903, "flos": 57254178781440.0, "grad_norm": 0.8815112388881771, "language_loss": 0.62678558, "learning_rate": 1.8832207118377565e-06, "loss": 0.64716494, "num_input_tokens_seen": 95738070, "step": 4433, "time_per_iteration": 4.119545221328735 }, { "auxiliary_loss_clip": 0.01140237, "auxiliary_loss_mlp": 0.01039327, "balance_loss_clip": 1.04557586, "balance_loss_mlp": 1.02305543, "epoch": 0.5331569770937293, "flos": 17420518287360.0, "grad_norm": 1.9816007756272496, "language_loss": 0.69546819, "learning_rate": 1.882443078095222e-06, "loss": 0.71726388, "num_input_tokens_seen": 95756950, "step": 4434, "time_per_iteration": 2.5618128776550293 }, { "auxiliary_loss_clip": 0.01014443, "auxiliary_loss_mlp": 0.01012517, "balance_loss_clip": 1.01881862, "balance_loss_mlp": 1.01037121, "epoch": 0.5332772199843684, "flos": 56750783627520.0, "grad_norm": 0.8525062352702416, "language_loss": 0.667539, "learning_rate": 1.8816654621859794e-06, "loss": 0.68780857, "num_input_tokens_seen": 95816615, "step": 4435, "time_per_iteration": 3.180402994155884 }, { "auxiliary_loss_clip": 0.01140222, "auxiliary_loss_mlp": 0.01044832, "balance_loss_clip": 1.04601693, "balance_loss_mlp": 1.02809453, "epoch": 0.5333974628750076, "flos": 18697753071360.0, "grad_norm": 2.396227531048155, "language_loss": 0.7242403, "learning_rate": 1.8808878642279915e-06, "loss": 0.74609077, "num_input_tokens_seen": 95832020, "step": 4436, "time_per_iteration": 2.594454288482666 }, { "auxiliary_loss_clip": 0.01108091, "auxiliary_loss_mlp": 0.01045438, "balance_loss_clip": 1.03871799, "balance_loss_mlp": 1.02680576, "epoch": 0.5335177057656466, "flos": 23805507058560.0, "grad_norm": 2.5989621100434044, "language_loss": 0.65451634, "learning_rate": 1.8801102843392209e-06, "loss": 0.67605162, "num_input_tokens_seen": 95851425, "step": 4437, "time_per_iteration": 3.443607807159424 }, { "auxiliary_loss_clip": 0.01104656, "auxiliary_loss_mlp": 0.01039421, "balance_loss_clip": 1.04206276, "balance_loss_mlp": 1.02213597, "epoch": 0.5336379486562857, "flos": 25078683605760.0, "grad_norm": 1.5715033294714504, "language_loss": 0.85209596, "learning_rate": 1.8793327226376238e-06, "loss": 0.87353683, "num_input_tokens_seen": 95870745, "step": 4438, "time_per_iteration": 2.6829941272735596 }, { "auxiliary_loss_clip": 0.01124294, "auxiliary_loss_mlp": 0.01042484, "balance_loss_clip": 1.04378176, "balance_loss_mlp": 1.02424538, "epoch": 0.5337581915469248, "flos": 21396691140480.0, "grad_norm": 1.700932097631027, "language_loss": 0.79921472, "learning_rate": 1.8785551792411569e-06, "loss": 0.82088244, "num_input_tokens_seen": 95889755, "step": 4439, "time_per_iteration": 2.6200923919677734 }, { "auxiliary_loss_clip": 0.01116836, "auxiliary_loss_mlp": 0.01043717, "balance_loss_clip": 1.04302359, "balance_loss_mlp": 1.0269444, "epoch": 0.5338784344375639, "flos": 14865905064960.0, "grad_norm": 1.940711842399282, "language_loss": 0.82540768, "learning_rate": 1.8777776542677733e-06, "loss": 0.84701324, "num_input_tokens_seen": 95907805, "step": 4440, "time_per_iteration": 2.597813844680786 }, { "auxiliary_loss_clip": 0.01103497, "auxiliary_loss_mlp": 0.01042287, "balance_loss_clip": 1.03638005, "balance_loss_mlp": 1.0233562, "epoch": 0.5339986773282029, "flos": 20813501923200.0, "grad_norm": 1.920901682776688, "language_loss": 0.73017097, "learning_rate": 1.8770001478354216e-06, "loss": 0.75162888, "num_input_tokens_seen": 95927480, "step": 4441, "time_per_iteration": 2.644284963607788 }, { "auxiliary_loss_clip": 0.01128363, "auxiliary_loss_mlp": 0.0105698, "balance_loss_clip": 1.04412413, "balance_loss_mlp": 1.03614211, "epoch": 0.5341189202188421, "flos": 17969089772160.0, "grad_norm": 2.4177513033717326, "language_loss": 0.83693337, "learning_rate": 1.8762226600620504e-06, "loss": 0.85878682, "num_input_tokens_seen": 95946095, "step": 4442, "time_per_iteration": 2.5749423503875732 }, { "auxiliary_loss_clip": 0.01127531, "auxiliary_loss_mlp": 0.010426, "balance_loss_clip": 1.04318726, "balance_loss_mlp": 1.02303791, "epoch": 0.5342391631094812, "flos": 11031866328960.0, "grad_norm": 3.010160541167978, "language_loss": 0.59305757, "learning_rate": 1.8754451910656031e-06, "loss": 0.61475891, "num_input_tokens_seen": 95959995, "step": 4443, "time_per_iteration": 2.5961203575134277 }, { "auxiliary_loss_clip": 0.01101402, "auxiliary_loss_mlp": 0.01041769, "balance_loss_clip": 1.04106581, "balance_loss_mlp": 1.0242573, "epoch": 0.5343594060001202, "flos": 15339135772800.0, "grad_norm": 24.193516766613637, "language_loss": 0.8289181, "learning_rate": 1.8746677409640212e-06, "loss": 0.85034978, "num_input_tokens_seen": 95977095, "step": 4444, "time_per_iteration": 2.64473032951355 }, { "auxiliary_loss_clip": 0.01132837, "auxiliary_loss_mlp": 0.01044395, "balance_loss_clip": 1.04579401, "balance_loss_mlp": 1.02702618, "epoch": 0.5344796488907594, "flos": 26900898514560.0, "grad_norm": 2.645808709962168, "language_loss": 0.84672564, "learning_rate": 1.8738903098752432e-06, "loss": 0.86849797, "num_input_tokens_seen": 95996225, "step": 4445, "time_per_iteration": 2.632075309753418 }, { "auxiliary_loss_clip": 0.0112099, "auxiliary_loss_mlp": 0.01042168, "balance_loss_clip": 1.04473114, "balance_loss_mlp": 1.02471602, "epoch": 0.5345998917813984, "flos": 25411216740480.0, "grad_norm": 2.803233510975072, "language_loss": 0.74026406, "learning_rate": 1.8731128979172052e-06, "loss": 0.76189572, "num_input_tokens_seen": 96015425, "step": 4446, "time_per_iteration": 2.69093656539917 }, { "auxiliary_loss_clip": 0.01117268, "auxiliary_loss_mlp": 0.01038128, "balance_loss_clip": 1.04274237, "balance_loss_mlp": 1.0211283, "epoch": 0.5347201346720375, "flos": 32853379622400.0, "grad_norm": 2.3488229814256347, "language_loss": 0.67347544, "learning_rate": 1.8723355052078394e-06, "loss": 0.69502944, "num_input_tokens_seen": 96035460, "step": 4447, "time_per_iteration": 2.708045482635498 }, { "auxiliary_loss_clip": 0.01129605, "auxiliary_loss_mlp": 0.01043289, "balance_loss_clip": 1.0428319, "balance_loss_mlp": 1.02440643, "epoch": 0.5348403775626767, "flos": 17967940536960.0, "grad_norm": 2.4806527790630604, "language_loss": 0.77558029, "learning_rate": 1.8715581318650765e-06, "loss": 0.79730928, "num_input_tokens_seen": 96054515, "step": 4448, "time_per_iteration": 2.62526273727417 }, { "auxiliary_loss_clip": 0.01118636, "auxiliary_loss_mlp": 0.01039536, "balance_loss_clip": 1.04453516, "balance_loss_mlp": 1.02000999, "epoch": 0.5349606204533157, "flos": 17603339535360.0, "grad_norm": 2.352353263482549, "language_loss": 0.81734878, "learning_rate": 1.8707807780068422e-06, "loss": 0.83893049, "num_input_tokens_seen": 96072330, "step": 4449, "time_per_iteration": 2.651245594024658 }, { "auxiliary_loss_clip": 0.01117703, "auxiliary_loss_mlp": 0.01044065, "balance_loss_clip": 1.04505658, "balance_loss_mlp": 1.02712572, "epoch": 0.5350808633439548, "flos": 29167831710720.0, "grad_norm": 2.2766658762133196, "language_loss": 0.659325, "learning_rate": 1.8700034437510611e-06, "loss": 0.68094271, "num_input_tokens_seen": 96092425, "step": 4450, "time_per_iteration": 2.678135395050049 }, { "auxiliary_loss_clip": 0.01097775, "auxiliary_loss_mlp": 0.01044963, "balance_loss_clip": 1.04189301, "balance_loss_mlp": 1.02637804, "epoch": 0.5352011062345938, "flos": 19499997381120.0, "grad_norm": 2.1692607246564948, "language_loss": 0.81206357, "learning_rate": 1.8692261292156549e-06, "loss": 0.83349097, "num_input_tokens_seen": 96111660, "step": 4451, "time_per_iteration": 2.686811923980713 }, { "auxiliary_loss_clip": 0.01142405, "auxiliary_loss_mlp": 0.01039017, "balance_loss_clip": 1.04878211, "balance_loss_mlp": 1.02316236, "epoch": 0.535321349125233, "flos": 23477642691840.0, "grad_norm": 3.3054276303454033, "language_loss": 0.80984616, "learning_rate": 1.8684488345185401e-06, "loss": 0.83166039, "num_input_tokens_seen": 96131835, "step": 4452, "time_per_iteration": 2.6047565937042236 }, { "auxiliary_loss_clip": 0.01146095, "auxiliary_loss_mlp": 0.0104293, "balance_loss_clip": 1.0492785, "balance_loss_mlp": 1.02560902, "epoch": 0.535441592015872, "flos": 20478059786880.0, "grad_norm": 2.6391610022919574, "language_loss": 0.78486812, "learning_rate": 1.8676715597776332e-06, "loss": 0.8067584, "num_input_tokens_seen": 96150180, "step": 4453, "time_per_iteration": 3.5261471271514893 }, { "auxiliary_loss_clip": 0.01083445, "auxiliary_loss_mlp": 0.01041565, "balance_loss_clip": 1.03860641, "balance_loss_mlp": 1.02517402, "epoch": 0.5355618349065111, "flos": 19573147428480.0, "grad_norm": 2.2027228093916777, "language_loss": 0.76247555, "learning_rate": 1.8668943051108455e-06, "loss": 0.78372568, "num_input_tokens_seen": 96167485, "step": 4454, "time_per_iteration": 2.7715866565704346 }, { "auxiliary_loss_clip": 0.01117634, "auxiliary_loss_mlp": 0.01039472, "balance_loss_clip": 1.04256272, "balance_loss_mlp": 1.02267575, "epoch": 0.5356820777971503, "flos": 24024633978240.0, "grad_norm": 1.8923642727954526, "language_loss": 0.76535147, "learning_rate": 1.8661170706360856e-06, "loss": 0.78692257, "num_input_tokens_seen": 96186650, "step": 4455, "time_per_iteration": 2.6267294883728027 }, { "auxiliary_loss_clip": 0.01129802, "auxiliary_loss_mlp": 0.01039943, "balance_loss_clip": 1.04632354, "balance_loss_mlp": 1.02455282, "epoch": 0.5358023206877893, "flos": 20884676722560.0, "grad_norm": 1.6855454842787256, "language_loss": 0.81626892, "learning_rate": 1.8653398564712594e-06, "loss": 0.83796632, "num_input_tokens_seen": 96205595, "step": 4456, "time_per_iteration": 2.5828745365142822 }, { "auxiliary_loss_clip": 0.01127564, "auxiliary_loss_mlp": 0.0104037, "balance_loss_clip": 1.04496539, "balance_loss_mlp": 1.02391958, "epoch": 0.5359225635784284, "flos": 22418996123520.0, "grad_norm": 1.7147989664313479, "language_loss": 0.82244825, "learning_rate": 1.8645626627342704e-06, "loss": 0.84412754, "num_input_tokens_seen": 96226360, "step": 4457, "time_per_iteration": 2.6278421878814697 }, { "auxiliary_loss_clip": 0.01136011, "auxiliary_loss_mlp": 0.010483, "balance_loss_clip": 1.04649544, "balance_loss_mlp": 1.03150296, "epoch": 0.5360428064690675, "flos": 24097784025600.0, "grad_norm": 2.767578403418945, "language_loss": 0.80786812, "learning_rate": 1.8637854895430172e-06, "loss": 0.82971126, "num_input_tokens_seen": 96245625, "step": 4458, "time_per_iteration": 2.6229233741760254 }, { "auxiliary_loss_clip": 0.01097204, "auxiliary_loss_mlp": 0.01043205, "balance_loss_clip": 1.04041767, "balance_loss_mlp": 1.02327323, "epoch": 0.5361630493597066, "flos": 21434505183360.0, "grad_norm": 2.007989526743794, "language_loss": 0.69401681, "learning_rate": 1.8630083370153978e-06, "loss": 0.7154209, "num_input_tokens_seen": 96265265, "step": 4459, "time_per_iteration": 3.638807535171509 }, { "auxiliary_loss_clip": 0.01000269, "auxiliary_loss_mlp": 0.01005524, "balance_loss_clip": 1.01494503, "balance_loss_mlp": 1.00343752, "epoch": 0.5362832922503457, "flos": 68888696520960.0, "grad_norm": 0.7444492717996299, "language_loss": 0.55357528, "learning_rate": 1.8622312052693041e-06, "loss": 0.57363319, "num_input_tokens_seen": 96326445, "step": 4460, "time_per_iteration": 3.3785300254821777 }, { "auxiliary_loss_clip": 0.01120885, "auxiliary_loss_mlp": 0.01039936, "balance_loss_clip": 1.04132879, "balance_loss_mlp": 1.02254343, "epoch": 0.5364035351409848, "flos": 9793702563840.0, "grad_norm": 2.5447571519059093, "language_loss": 0.72163379, "learning_rate": 1.8614540944226267e-06, "loss": 0.74324197, "num_input_tokens_seen": 96343115, "step": 4461, "time_per_iteration": 2.5836381912231445 }, { "auxiliary_loss_clip": 0.0111522, "auxiliary_loss_mlp": 0.01041817, "balance_loss_clip": 1.04676116, "balance_loss_mlp": 1.02609968, "epoch": 0.5365237780316239, "flos": 23290080848640.0, "grad_norm": 1.9412596355983844, "language_loss": 0.68202698, "learning_rate": 1.8606770045932537e-06, "loss": 0.70359737, "num_input_tokens_seen": 96362230, "step": 4462, "time_per_iteration": 2.6250576972961426 }, { "auxiliary_loss_clip": 0.01098558, "auxiliary_loss_mlp": 0.01036079, "balance_loss_clip": 1.04006052, "balance_loss_mlp": 1.01896071, "epoch": 0.5366440209222629, "flos": 26578133879040.0, "grad_norm": 2.386043232447244, "language_loss": 0.81740415, "learning_rate": 1.859899935899068e-06, "loss": 0.83875054, "num_input_tokens_seen": 96382085, "step": 4463, "time_per_iteration": 3.6010754108428955 }, { "auxiliary_loss_clip": 0.01114, "auxiliary_loss_mlp": 0.01042855, "balance_loss_clip": 1.04570603, "balance_loss_mlp": 1.02596295, "epoch": 0.5367642638129021, "flos": 19608052469760.0, "grad_norm": 1.4994537748357293, "language_loss": 0.79134786, "learning_rate": 1.8591228884579506e-06, "loss": 0.8129164, "num_input_tokens_seen": 96400580, "step": 4464, "time_per_iteration": 2.619462251663208 }, { "auxiliary_loss_clip": 0.01111134, "auxiliary_loss_mlp": 0.01045284, "balance_loss_clip": 1.04527187, "balance_loss_mlp": 1.02803469, "epoch": 0.5368845067035412, "flos": 23915214172800.0, "grad_norm": 2.228686391517203, "language_loss": 0.82612211, "learning_rate": 1.8583458623877795e-06, "loss": 0.84768629, "num_input_tokens_seen": 96419680, "step": 4465, "time_per_iteration": 2.7092819213867188 }, { "auxiliary_loss_clip": 0.01132606, "auxiliary_loss_mlp": 0.01038965, "balance_loss_clip": 1.04640579, "balance_loss_mlp": 1.02258539, "epoch": 0.5370047495941802, "flos": 16873131951360.0, "grad_norm": 2.0143888646254737, "language_loss": 0.74231768, "learning_rate": 1.8575688578064281e-06, "loss": 0.76403344, "num_input_tokens_seen": 96437805, "step": 4466, "time_per_iteration": 2.6391098499298096 }, { "auxiliary_loss_clip": 0.01132746, "auxiliary_loss_mlp": 0.01044807, "balance_loss_clip": 1.04684091, "balance_loss_mlp": 1.02786767, "epoch": 0.5371249924848194, "flos": 20740926493440.0, "grad_norm": 1.664370890725903, "language_loss": 0.7697795, "learning_rate": 1.8567918748317674e-06, "loss": 0.79155505, "num_input_tokens_seen": 96457155, "step": 4467, "time_per_iteration": 2.599562644958496 }, { "auxiliary_loss_clip": 0.01105588, "auxiliary_loss_mlp": 0.01037709, "balance_loss_clip": 1.0420289, "balance_loss_mlp": 1.01912403, "epoch": 0.5372452353754584, "flos": 17968120104960.0, "grad_norm": 6.613945724148552, "language_loss": 0.8332094, "learning_rate": 1.8560149135816659e-06, "loss": 0.85464233, "num_input_tokens_seen": 96473990, "step": 4468, "time_per_iteration": 2.638697385787964 }, { "auxiliary_loss_clip": 0.01125213, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.0431844, "balance_loss_mlp": 1.02046609, "epoch": 0.5373654782660975, "flos": 15377021642880.0, "grad_norm": 2.458949426006014, "language_loss": 0.84146196, "learning_rate": 1.8552379741739873e-06, "loss": 0.86308384, "num_input_tokens_seen": 96491335, "step": 4469, "time_per_iteration": 2.5702197551727295 }, { "auxiliary_loss_clip": 0.01020453, "auxiliary_loss_mlp": 0.00755731, "balance_loss_clip": 1.01743317, "balance_loss_mlp": 1.00084329, "epoch": 0.5374857211567367, "flos": 69000091574400.0, "grad_norm": 0.8884772573618337, "language_loss": 0.55680156, "learning_rate": 1.8544610567265935e-06, "loss": 0.5745635, "num_input_tokens_seen": 96545275, "step": 4470, "time_per_iteration": 3.218740463256836 }, { "auxiliary_loss_clip": 0.01118332, "auxiliary_loss_mlp": 0.00772993, "balance_loss_clip": 1.04554641, "balance_loss_mlp": 1.00051236, "epoch": 0.5376059640473757, "flos": 15085355207040.0, "grad_norm": 2.1429900331230693, "language_loss": 0.83128452, "learning_rate": 1.853684161357341e-06, "loss": 0.85019779, "num_input_tokens_seen": 96562935, "step": 4471, "time_per_iteration": 2.602679967880249 }, { "auxiliary_loss_clip": 0.0112683, "auxiliary_loss_mlp": 0.007731, "balance_loss_clip": 1.04409373, "balance_loss_mlp": 1.00053358, "epoch": 0.5377262069380148, "flos": 19792597570560.0, "grad_norm": 2.5671937788879, "language_loss": 0.76594067, "learning_rate": 1.852907288184085e-06, "loss": 0.78493994, "num_input_tokens_seen": 96581820, "step": 4472, "time_per_iteration": 2.579746961593628 }, { "auxiliary_loss_clip": 0.01096248, "auxiliary_loss_mlp": 0.01040032, "balance_loss_clip": 1.04243302, "balance_loss_mlp": 1.02256739, "epoch": 0.5378464498286539, "flos": 30003077640960.0, "grad_norm": 3.674019060803244, "language_loss": 0.69948536, "learning_rate": 1.8521304373246762e-06, "loss": 0.72084808, "num_input_tokens_seen": 96602865, "step": 4473, "time_per_iteration": 2.754965305328369 }, { "auxiliary_loss_clip": 0.01131501, "auxiliary_loss_mlp": 0.01043495, "balance_loss_clip": 1.04353762, "balance_loss_mlp": 1.02541125, "epoch": 0.537966692719293, "flos": 21251217058560.0, "grad_norm": 2.706749178216425, "language_loss": 0.8855049, "learning_rate": 1.8513536088969626e-06, "loss": 0.90725482, "num_input_tokens_seen": 96620530, "step": 4474, "time_per_iteration": 2.609438419342041 }, { "auxiliary_loss_clip": 0.01138466, "auxiliary_loss_mlp": 0.0104482, "balance_loss_clip": 1.04994261, "balance_loss_mlp": 1.0265218, "epoch": 0.538086935609932, "flos": 21543170803200.0, "grad_norm": 1.6438134122514265, "language_loss": 0.80440497, "learning_rate": 1.8505768030187884e-06, "loss": 0.8262378, "num_input_tokens_seen": 96640660, "step": 4475, "time_per_iteration": 2.610105276107788 }, { "auxiliary_loss_clip": 0.0111647, "auxiliary_loss_mlp": 0.01043063, "balance_loss_clip": 1.0468446, "balance_loss_mlp": 1.0278517, "epoch": 0.5382071785005712, "flos": 22747219626240.0, "grad_norm": 1.5135952666568093, "language_loss": 0.79690301, "learning_rate": 1.849800019807995e-06, "loss": 0.81849837, "num_input_tokens_seen": 96661885, "step": 4476, "time_per_iteration": 2.681823253631592 }, { "auxiliary_loss_clip": 0.01104123, "auxiliary_loss_mlp": 0.01038451, "balance_loss_clip": 1.04510784, "balance_loss_mlp": 1.02136827, "epoch": 0.5383274213912103, "flos": 24934574240640.0, "grad_norm": 2.1154186077303514, "language_loss": 0.71080524, "learning_rate": 1.8490232593824186e-06, "loss": 0.73223096, "num_input_tokens_seen": 96678340, "step": 4477, "time_per_iteration": 2.6508848667144775 }, { "auxiliary_loss_clip": 0.01110641, "auxiliary_loss_mlp": 0.01033587, "balance_loss_clip": 1.041044, "balance_loss_mlp": 1.01860285, "epoch": 0.5384476642818493, "flos": 22310186849280.0, "grad_norm": 1.8253031801216235, "language_loss": 0.84612161, "learning_rate": 1.8482465218598935e-06, "loss": 0.86756396, "num_input_tokens_seen": 96698285, "step": 4478, "time_per_iteration": 2.6419434547424316 }, { "auxiliary_loss_clip": 0.0110568, "auxiliary_loss_mlp": 0.01059232, "balance_loss_clip": 1.04264641, "balance_loss_mlp": 1.0404681, "epoch": 0.5385679071724885, "flos": 22711021695360.0, "grad_norm": 1.8449815769019429, "language_loss": 0.83368289, "learning_rate": 1.8474698073582508e-06, "loss": 0.85533202, "num_input_tokens_seen": 96719655, "step": 4479, "time_per_iteration": 3.655686616897583 }, { "auxiliary_loss_clip": 0.01111733, "auxiliary_loss_mlp": 0.01050832, "balance_loss_clip": 1.04404306, "balance_loss_mlp": 1.03295016, "epoch": 0.5386881500631275, "flos": 15953746412160.0, "grad_norm": 2.2826536378930107, "language_loss": 0.87388235, "learning_rate": 1.8466931159953166e-06, "loss": 0.89550799, "num_input_tokens_seen": 96736290, "step": 4480, "time_per_iteration": 2.623875141143799 }, { "auxiliary_loss_clip": 0.01121123, "auxiliary_loss_mlp": 0.01042685, "balance_loss_clip": 1.04628265, "balance_loss_mlp": 1.02567387, "epoch": 0.5388083929537666, "flos": 24060041809920.0, "grad_norm": 2.057684067680607, "language_loss": 0.84135163, "learning_rate": 1.8459164478889158e-06, "loss": 0.86298978, "num_input_tokens_seen": 96757685, "step": 4481, "time_per_iteration": 2.670466423034668 }, { "auxiliary_loss_clip": 0.0109847, "auxiliary_loss_mlp": 0.01037992, "balance_loss_clip": 1.04230642, "balance_loss_mlp": 1.02161229, "epoch": 0.5389286358444056, "flos": 22236893147520.0, "grad_norm": 1.841469289097482, "language_loss": 0.7593509, "learning_rate": 1.8451398031568663e-06, "loss": 0.78071547, "num_input_tokens_seen": 96777310, "step": 4482, "time_per_iteration": 2.661184549331665 }, { "auxiliary_loss_clip": 0.01104794, "auxiliary_loss_mlp": 0.01046838, "balance_loss_clip": 1.04223633, "balance_loss_mlp": 1.02912319, "epoch": 0.5390488787350448, "flos": 24281718595200.0, "grad_norm": 1.7256307714307588, "language_loss": 0.74477804, "learning_rate": 1.844363181916986e-06, "loss": 0.76629436, "num_input_tokens_seen": 96798035, "step": 4483, "time_per_iteration": 2.6886181831359863 }, { "auxiliary_loss_clip": 0.01127356, "auxiliary_loss_mlp": 0.01049172, "balance_loss_clip": 1.04338682, "balance_loss_mlp": 1.029634, "epoch": 0.5391691216256839, "flos": 16581393688320.0, "grad_norm": 2.4199288051676113, "language_loss": 0.83494544, "learning_rate": 1.8435865842870868e-06, "loss": 0.85671079, "num_input_tokens_seen": 96815975, "step": 4484, "time_per_iteration": 2.568524122238159 }, { "auxiliary_loss_clip": 0.0110446, "auxiliary_loss_mlp": 0.007746, "balance_loss_clip": 1.03753138, "balance_loss_mlp": 1.0005393, "epoch": 0.5392893645163229, "flos": 23330049707520.0, "grad_norm": 1.7148610472913701, "language_loss": 0.72106421, "learning_rate": 1.8428100103849787e-06, "loss": 0.73985481, "num_input_tokens_seen": 96835770, "step": 4485, "time_per_iteration": 3.6069812774658203 }, { "auxiliary_loss_clip": 0.01115769, "auxiliary_loss_mlp": 0.01037368, "balance_loss_clip": 1.04483962, "balance_loss_mlp": 1.02075028, "epoch": 0.5394096074069621, "flos": 15669801400320.0, "grad_norm": 2.274992250149813, "language_loss": 0.73388803, "learning_rate": 1.842033460328467e-06, "loss": 0.75541937, "num_input_tokens_seen": 96854490, "step": 4486, "time_per_iteration": 2.606860399246216 }, { "auxiliary_loss_clip": 0.01120086, "auxiliary_loss_mlp": 0.00772425, "balance_loss_clip": 1.0448575, "balance_loss_mlp": 1.0004648, "epoch": 0.5395298502976011, "flos": 22893447893760.0, "grad_norm": 1.7755461373793964, "language_loss": 0.75249624, "learning_rate": 1.8412569342353541e-06, "loss": 0.77142131, "num_input_tokens_seen": 96874645, "step": 4487, "time_per_iteration": 2.656081438064575 }, { "auxiliary_loss_clip": 0.01124392, "auxiliary_loss_mlp": 0.01048257, "balance_loss_clip": 1.04608691, "balance_loss_mlp": 1.02994609, "epoch": 0.5396500931882402, "flos": 23842135952640.0, "grad_norm": 3.105068416406894, "language_loss": 0.84329784, "learning_rate": 1.840480432223438e-06, "loss": 0.86502433, "num_input_tokens_seen": 96893650, "step": 4488, "time_per_iteration": 2.686790704727173 }, { "auxiliary_loss_clip": 0.01116078, "auxiliary_loss_mlp": 0.01041755, "balance_loss_clip": 1.04246378, "balance_loss_mlp": 1.02616227, "epoch": 0.5397703360788794, "flos": 26322988596480.0, "grad_norm": 2.2691497274510937, "language_loss": 0.77803874, "learning_rate": 1.8397039544105131e-06, "loss": 0.79961711, "num_input_tokens_seen": 96912735, "step": 4489, "time_per_iteration": 3.5355653762817383 }, { "auxiliary_loss_clip": 0.01109316, "auxiliary_loss_mlp": 0.01040206, "balance_loss_clip": 1.03883386, "balance_loss_mlp": 1.02160931, "epoch": 0.5398905789695184, "flos": 21214588164480.0, "grad_norm": 1.7903595815540374, "language_loss": 0.69884038, "learning_rate": 1.8389275009143711e-06, "loss": 0.7203356, "num_input_tokens_seen": 96932475, "step": 4490, "time_per_iteration": 2.661525249481201 }, { "auxiliary_loss_clip": 0.01136062, "auxiliary_loss_mlp": 0.01034185, "balance_loss_clip": 1.04336202, "balance_loss_mlp": 1.01819348, "epoch": 0.5400108218601575, "flos": 25080335631360.0, "grad_norm": 1.8222300541917673, "language_loss": 0.73699051, "learning_rate": 1.8381510718527988e-06, "loss": 0.75869298, "num_input_tokens_seen": 96952085, "step": 4491, "time_per_iteration": 2.579002618789673 }, { "auxiliary_loss_clip": 0.01115188, "auxiliary_loss_mlp": 0.01047878, "balance_loss_clip": 1.0388763, "balance_loss_mlp": 1.03010428, "epoch": 0.5401310647507966, "flos": 26357498588160.0, "grad_norm": 3.074009148964676, "language_loss": 0.63267624, "learning_rate": 1.8373746673435812e-06, "loss": 0.65430695, "num_input_tokens_seen": 96973110, "step": 4492, "time_per_iteration": 2.653866767883301 }, { "auxiliary_loss_clip": 0.01144938, "auxiliary_loss_mlp": 0.01044573, "balance_loss_clip": 1.04779029, "balance_loss_mlp": 1.02695334, "epoch": 0.5402513076414357, "flos": 27855332749440.0, "grad_norm": 2.236605073089364, "language_loss": 0.79307914, "learning_rate": 1.8365982875044964e-06, "loss": 0.81497419, "num_input_tokens_seen": 96993420, "step": 4493, "time_per_iteration": 2.6285693645477295 }, { "auxiliary_loss_clip": 0.01136458, "auxiliary_loss_mlp": 0.00774302, "balance_loss_clip": 1.0477587, "balance_loss_mlp": 1.00042057, "epoch": 0.5403715505320748, "flos": 22893771116160.0, "grad_norm": 2.484703512400831, "language_loss": 0.75999486, "learning_rate": 1.8358219324533217e-06, "loss": 0.77910244, "num_input_tokens_seen": 97013685, "step": 4494, "time_per_iteration": 2.595914602279663 }, { "auxiliary_loss_clip": 0.01110731, "auxiliary_loss_mlp": 0.01034999, "balance_loss_clip": 1.04187953, "balance_loss_mlp": 1.020491, "epoch": 0.5404917934227139, "flos": 30224143895040.0, "grad_norm": 1.7592771156055724, "language_loss": 0.70156848, "learning_rate": 1.8350456023078292e-06, "loss": 0.7230258, "num_input_tokens_seen": 97036060, "step": 4495, "time_per_iteration": 2.6865994930267334 }, { "auxiliary_loss_clip": 0.01148919, "auxiliary_loss_mlp": 0.01045716, "balance_loss_clip": 1.04925442, "balance_loss_mlp": 1.02711928, "epoch": 0.540612036313353, "flos": 19938502615680.0, "grad_norm": 2.856584400208746, "language_loss": 0.78051877, "learning_rate": 1.8342692971857874e-06, "loss": 0.80246508, "num_input_tokens_seen": 97055260, "step": 4496, "time_per_iteration": 2.5566201210021973 }, { "auxiliary_loss_clip": 0.01114768, "auxiliary_loss_mlp": 0.01035666, "balance_loss_clip": 1.04239178, "balance_loss_mlp": 1.01985919, "epoch": 0.540732279203992, "flos": 24279599692800.0, "grad_norm": 2.307342014298447, "language_loss": 0.7092855, "learning_rate": 1.833493017204962e-06, "loss": 0.73078978, "num_input_tokens_seen": 97075365, "step": 4497, "time_per_iteration": 2.6580893993377686 }, { "auxiliary_loss_clip": 0.01142884, "auxiliary_loss_mlp": 0.01043078, "balance_loss_clip": 1.04713607, "balance_loss_mlp": 1.02594805, "epoch": 0.5408525220946312, "flos": 20193216935040.0, "grad_norm": 2.226315919948544, "language_loss": 0.7816447, "learning_rate": 1.8327167624831134e-06, "loss": 0.80350435, "num_input_tokens_seen": 97093095, "step": 4498, "time_per_iteration": 2.551609516143799 }, { "auxiliary_loss_clip": 0.01140564, "auxiliary_loss_mlp": 0.01039485, "balance_loss_clip": 1.04702163, "balance_loss_mlp": 1.02254581, "epoch": 0.5409727649852702, "flos": 24134448833280.0, "grad_norm": 1.6419957514155568, "language_loss": 0.70796502, "learning_rate": 1.831940533137999e-06, "loss": 0.72976553, "num_input_tokens_seen": 97112000, "step": 4499, "time_per_iteration": 2.5711829662323 }, { "auxiliary_loss_clip": 0.01123973, "auxiliary_loss_mlp": 0.01042412, "balance_loss_clip": 1.04702234, "balance_loss_mlp": 1.02664065, "epoch": 0.5410930078759093, "flos": 23912700220800.0, "grad_norm": 1.9376836609539076, "language_loss": 0.72309178, "learning_rate": 1.8311643292873718e-06, "loss": 0.74475563, "num_input_tokens_seen": 97130820, "step": 4500, "time_per_iteration": 2.607691526412964 }, { "auxiliary_loss_clip": 0.01124319, "auxiliary_loss_mlp": 0.01039896, "balance_loss_clip": 1.04458284, "balance_loss_mlp": 1.0246377, "epoch": 0.5412132507665485, "flos": 21105132445440.0, "grad_norm": 1.9169339937933123, "language_loss": 0.8795656, "learning_rate": 1.8303881510489818e-06, "loss": 0.90120769, "num_input_tokens_seen": 97149210, "step": 4501, "time_per_iteration": 2.6194965839385986 }, { "auxiliary_loss_clip": 0.01121044, "auxiliary_loss_mlp": 0.01037954, "balance_loss_clip": 1.04506016, "balance_loss_mlp": 1.02106225, "epoch": 0.5413334936571875, "flos": 30227340205440.0, "grad_norm": 1.920126481809583, "language_loss": 0.69529653, "learning_rate": 1.829611998540574e-06, "loss": 0.71688646, "num_input_tokens_seen": 97170415, "step": 4502, "time_per_iteration": 2.7236788272857666 }, { "auxiliary_loss_clip": 0.01132313, "auxiliary_loss_mlp": 0.00774155, "balance_loss_clip": 1.04533279, "balance_loss_mlp": 1.00053573, "epoch": 0.5414537365478266, "flos": 24279635606400.0, "grad_norm": 1.9273844651513885, "language_loss": 0.80227757, "learning_rate": 1.8288358718798914e-06, "loss": 0.82134229, "num_input_tokens_seen": 97189605, "step": 4503, "time_per_iteration": 2.6699295043945312 }, { "auxiliary_loss_clip": 0.01123712, "auxiliary_loss_mlp": 0.00773073, "balance_loss_clip": 1.04442239, "balance_loss_mlp": 1.00056946, "epoch": 0.5415739794384657, "flos": 16654543735680.0, "grad_norm": 4.311823295305531, "language_loss": 0.72738075, "learning_rate": 1.8280597711846703e-06, "loss": 0.74634862, "num_input_tokens_seen": 97207845, "step": 4504, "time_per_iteration": 2.5672192573547363 }, { "auxiliary_loss_clip": 0.01125791, "auxiliary_loss_mlp": 0.01036772, "balance_loss_clip": 1.04729295, "balance_loss_mlp": 1.02162027, "epoch": 0.5416942223291048, "flos": 23185724860800.0, "grad_norm": 1.8236570987631933, "language_loss": 0.83179009, "learning_rate": 1.8272836965726455e-06, "loss": 0.85341573, "num_input_tokens_seen": 97226780, "step": 4505, "time_per_iteration": 4.54194188117981 }, { "auxiliary_loss_clip": 0.01078632, "auxiliary_loss_mlp": 0.01048281, "balance_loss_clip": 1.03853905, "balance_loss_mlp": 1.0293622, "epoch": 0.5418144652197439, "flos": 20303247271680.0, "grad_norm": 2.4572952084080373, "language_loss": 0.78584623, "learning_rate": 1.8265076481615461e-06, "loss": 0.80711532, "num_input_tokens_seen": 97246695, "step": 4506, "time_per_iteration": 2.7429585456848145 }, { "auxiliary_loss_clip": 0.01117506, "auxiliary_loss_mlp": 0.01039634, "balance_loss_clip": 1.04730415, "balance_loss_mlp": 1.02332568, "epoch": 0.541934708110383, "flos": 12458633431680.0, "grad_norm": 2.1268843171243574, "language_loss": 0.87499326, "learning_rate": 1.8257316260690987e-06, "loss": 0.8965646, "num_input_tokens_seen": 97264480, "step": 4507, "time_per_iteration": 2.5892581939697266 }, { "auxiliary_loss_clip": 0.01128046, "auxiliary_loss_mlp": 0.0104027, "balance_loss_clip": 1.04577649, "balance_loss_mlp": 1.02495217, "epoch": 0.5420549510010221, "flos": 21253802837760.0, "grad_norm": 1.5628330420514747, "language_loss": 0.76062089, "learning_rate": 1.8249556304130254e-06, "loss": 0.78230405, "num_input_tokens_seen": 97285760, "step": 4508, "time_per_iteration": 2.6252481937408447 }, { "auxiliary_loss_clip": 0.01107954, "auxiliary_loss_mlp": 0.01046588, "balance_loss_clip": 1.04151499, "balance_loss_mlp": 1.02946389, "epoch": 0.5421751938916611, "flos": 29490524519040.0, "grad_norm": 4.618001902278473, "language_loss": 0.68786919, "learning_rate": 1.824179661311044e-06, "loss": 0.70941466, "num_input_tokens_seen": 97304510, "step": 4509, "time_per_iteration": 2.6875603199005127 }, { "auxiliary_loss_clip": 0.01094587, "auxiliary_loss_mlp": 0.01040915, "balance_loss_clip": 1.03999877, "balance_loss_mlp": 1.02309334, "epoch": 0.5422954367823003, "flos": 18734238311040.0, "grad_norm": 2.0413549586912136, "language_loss": 0.80154109, "learning_rate": 1.823403718880868e-06, "loss": 0.82289612, "num_input_tokens_seen": 97323270, "step": 4510, "time_per_iteration": 2.700087070465088 }, { "auxiliary_loss_clip": 0.01118584, "auxiliary_loss_mlp": 0.01044047, "balance_loss_clip": 1.04354024, "balance_loss_mlp": 1.02509236, "epoch": 0.5424156796729394, "flos": 39969006940800.0, "grad_norm": 1.7457094240290583, "language_loss": 0.66499794, "learning_rate": 1.822627803240207e-06, "loss": 0.68662429, "num_input_tokens_seen": 97345600, "step": 4511, "time_per_iteration": 3.7563278675079346 }, { "auxiliary_loss_clip": 0.01109298, "auxiliary_loss_mlp": 0.01038346, "balance_loss_clip": 1.04523933, "balance_loss_mlp": 1.02151358, "epoch": 0.5425359225635784, "flos": 11546538353280.0, "grad_norm": 2.307809342521181, "language_loss": 0.85256571, "learning_rate": 1.8218519145067675e-06, "loss": 0.87404215, "num_input_tokens_seen": 97361220, "step": 4512, "time_per_iteration": 2.632758378982544 }, { "auxiliary_loss_clip": 0.011001, "auxiliary_loss_mlp": 0.01049512, "balance_loss_clip": 1.04365182, "balance_loss_mlp": 1.03133225, "epoch": 0.5426561654542175, "flos": 20229702174720.0, "grad_norm": 2.3549487178617303, "language_loss": 0.89542514, "learning_rate": 1.8210760527982508e-06, "loss": 0.91692126, "num_input_tokens_seen": 97381505, "step": 4513, "time_per_iteration": 2.723273515701294 }, { "auxiliary_loss_clip": 0.01116871, "auxiliary_loss_mlp": 0.00772035, "balance_loss_clip": 1.04556906, "balance_loss_mlp": 1.00042391, "epoch": 0.5427764083448566, "flos": 21871681614720.0, "grad_norm": 2.0794222943411063, "language_loss": 0.75605243, "learning_rate": 1.8203002182323552e-06, "loss": 0.7749415, "num_input_tokens_seen": 97399060, "step": 4514, "time_per_iteration": 2.624135971069336 }, { "auxiliary_loss_clip": 0.01116587, "auxiliary_loss_mlp": 0.01049854, "balance_loss_clip": 1.04497576, "balance_loss_mlp": 1.03249764, "epoch": 0.5428966512354957, "flos": 19640946349440.0, "grad_norm": 2.239556486073374, "language_loss": 0.75933206, "learning_rate": 1.819524410926773e-06, "loss": 0.7809965, "num_input_tokens_seen": 97416740, "step": 4515, "time_per_iteration": 3.5797085762023926 }, { "auxiliary_loss_clip": 0.01075273, "auxiliary_loss_mlp": 0.01043815, "balance_loss_clip": 1.03998756, "balance_loss_mlp": 1.02606511, "epoch": 0.5430168941261347, "flos": 22382187661440.0, "grad_norm": 1.7304753115797487, "language_loss": 0.77135605, "learning_rate": 1.8187486309991944e-06, "loss": 0.79254687, "num_input_tokens_seen": 97437620, "step": 4516, "time_per_iteration": 2.727207899093628 }, { "auxiliary_loss_clip": 0.01133911, "auxiliary_loss_mlp": 0.01041344, "balance_loss_clip": 1.04824722, "balance_loss_mlp": 1.02538204, "epoch": 0.5431371370167739, "flos": 18764187275520.0, "grad_norm": 2.0529359017283544, "language_loss": 0.77401125, "learning_rate": 1.817972878567304e-06, "loss": 0.79576379, "num_input_tokens_seen": 97456275, "step": 4517, "time_per_iteration": 2.5988268852233887 }, { "auxiliary_loss_clip": 0.01121626, "auxiliary_loss_mlp": 0.01039141, "balance_loss_clip": 1.04544044, "balance_loss_mlp": 1.02330983, "epoch": 0.543257379907413, "flos": 18806023641600.0, "grad_norm": 2.379230935070701, "language_loss": 0.76489377, "learning_rate": 1.8171971537487834e-06, "loss": 0.78650141, "num_input_tokens_seen": 97474925, "step": 4518, "time_per_iteration": 2.645256519317627 }, { "auxiliary_loss_clip": 0.01143018, "auxiliary_loss_mlp": 0.01040644, "balance_loss_clip": 1.04591238, "balance_loss_mlp": 1.02367425, "epoch": 0.543377622798052, "flos": 17493381025920.0, "grad_norm": 1.9519128006940083, "language_loss": 0.80554104, "learning_rate": 1.8164214566613093e-06, "loss": 0.82737768, "num_input_tokens_seen": 97493550, "step": 4519, "time_per_iteration": 2.5449984073638916 }, { "auxiliary_loss_clip": 0.01140094, "auxiliary_loss_mlp": 0.010377, "balance_loss_clip": 1.04567647, "balance_loss_mlp": 1.02163696, "epoch": 0.5434978656886912, "flos": 18989311766400.0, "grad_norm": 3.1887854275991603, "language_loss": 0.66395783, "learning_rate": 1.8156457874225547e-06, "loss": 0.68573576, "num_input_tokens_seen": 97512010, "step": 4520, "time_per_iteration": 2.5557801723480225 }, { "auxiliary_loss_clip": 0.01110343, "auxiliary_loss_mlp": 0.01037382, "balance_loss_clip": 1.04571342, "balance_loss_mlp": 1.01936913, "epoch": 0.5436181085793302, "flos": 17274936464640.0, "grad_norm": 2.438502680562519, "language_loss": 0.80873328, "learning_rate": 1.814870146150187e-06, "loss": 0.83021057, "num_input_tokens_seen": 97530120, "step": 4521, "time_per_iteration": 2.5845696926116943 }, { "auxiliary_loss_clip": 0.01122413, "auxiliary_loss_mlp": 0.01046287, "balance_loss_clip": 1.04399848, "balance_loss_mlp": 1.02785754, "epoch": 0.5437383514699693, "flos": 19098587917440.0, "grad_norm": 1.9736910704075166, "language_loss": 0.78652084, "learning_rate": 1.814094532961871e-06, "loss": 0.80820787, "num_input_tokens_seen": 97548695, "step": 4522, "time_per_iteration": 2.6271042823791504 }, { "auxiliary_loss_clip": 0.01096938, "auxiliary_loss_mlp": 0.0104402, "balance_loss_clip": 1.04097784, "balance_loss_mlp": 1.02590013, "epoch": 0.5438585943606085, "flos": 22602715211520.0, "grad_norm": 2.258836200776803, "language_loss": 0.83731294, "learning_rate": 1.8133189479752666e-06, "loss": 0.85872245, "num_input_tokens_seen": 97567625, "step": 4523, "time_per_iteration": 2.7269527912139893 }, { "auxiliary_loss_clip": 0.01140825, "auxiliary_loss_mlp": 0.01041198, "balance_loss_clip": 1.04635549, "balance_loss_mlp": 1.02564144, "epoch": 0.5439788372512475, "flos": 21798495653760.0, "grad_norm": 2.0435519941116596, "language_loss": 0.81787372, "learning_rate": 1.8125433913080292e-06, "loss": 0.83969396, "num_input_tokens_seen": 97585325, "step": 4524, "time_per_iteration": 2.5967798233032227 }, { "auxiliary_loss_clip": 0.01032319, "auxiliary_loss_mlp": 0.01037191, "balance_loss_clip": 1.03555977, "balance_loss_mlp": 1.02157474, "epoch": 0.5440990801418866, "flos": 16399362539520.0, "grad_norm": 1.959134487601283, "language_loss": 0.8264069, "learning_rate": 1.811767863077811e-06, "loss": 0.84710205, "num_input_tokens_seen": 97604275, "step": 4525, "time_per_iteration": 2.966970920562744 }, { "auxiliary_loss_clip": 0.01073177, "auxiliary_loss_mlp": 0.01045747, "balance_loss_clip": 1.0422523, "balance_loss_mlp": 1.03061914, "epoch": 0.5442193230325257, "flos": 21615638492160.0, "grad_norm": 1.5666193739418026, "language_loss": 0.78423274, "learning_rate": 1.8109923634022577e-06, "loss": 0.80542201, "num_input_tokens_seen": 97624300, "step": 4526, "time_per_iteration": 3.033663511276245 }, { "auxiliary_loss_clip": 0.01144662, "auxiliary_loss_mlp": 0.01039052, "balance_loss_clip": 1.04605746, "balance_loss_mlp": 1.02235103, "epoch": 0.5443395659231648, "flos": 15481198062720.0, "grad_norm": 2.07858587056486, "language_loss": 0.86487961, "learning_rate": 1.8102168923990128e-06, "loss": 0.88671672, "num_input_tokens_seen": 97637845, "step": 4527, "time_per_iteration": 2.5412256717681885 }, { "auxiliary_loss_clip": 0.01132572, "auxiliary_loss_mlp": 0.00773265, "balance_loss_clip": 1.04658175, "balance_loss_mlp": 1.00035596, "epoch": 0.5444598088138038, "flos": 18770436241920.0, "grad_norm": 3.4818792807575982, "language_loss": 0.80384576, "learning_rate": 1.809441450185714e-06, "loss": 0.82290411, "num_input_tokens_seen": 97656330, "step": 4528, "time_per_iteration": 2.6184329986572266 }, { "auxiliary_loss_clip": 0.01122749, "auxiliary_loss_mlp": 0.0103739, "balance_loss_clip": 1.04359043, "balance_loss_mlp": 1.02214289, "epoch": 0.544580051704443, "flos": 21142335957120.0, "grad_norm": 2.1197411379635893, "language_loss": 0.73616898, "learning_rate": 1.8086660368799958e-06, "loss": 0.75777036, "num_input_tokens_seen": 97674380, "step": 4529, "time_per_iteration": 2.658392906188965 }, { "auxiliary_loss_clip": 0.01119028, "auxiliary_loss_mlp": 0.01044374, "balance_loss_clip": 1.04458141, "balance_loss_mlp": 1.02676654, "epoch": 0.5447002945950821, "flos": 32491508054400.0, "grad_norm": 1.5746276772299121, "language_loss": 0.77190351, "learning_rate": 1.807890652599488e-06, "loss": 0.7935375, "num_input_tokens_seen": 97698765, "step": 4530, "time_per_iteration": 2.734752893447876 }, { "auxiliary_loss_clip": 0.01137784, "auxiliary_loss_mlp": 0.01043918, "balance_loss_clip": 1.04742014, "balance_loss_mlp": 1.02825403, "epoch": 0.5448205374857211, "flos": 11798307757440.0, "grad_norm": 2.2186908670759173, "language_loss": 0.82345402, "learning_rate": 1.8071152974618156e-06, "loss": 0.84527111, "num_input_tokens_seen": 97716565, "step": 4531, "time_per_iteration": 5.33666467666626 }, { "auxiliary_loss_clip": 0.01105541, "auxiliary_loss_mlp": 0.00772819, "balance_loss_clip": 1.03974771, "balance_loss_mlp": 1.00029051, "epoch": 0.5449407803763603, "flos": 24133766474880.0, "grad_norm": 2.4507370021696153, "language_loss": 0.78367102, "learning_rate": 1.806339971584599e-06, "loss": 0.80245465, "num_input_tokens_seen": 97733225, "step": 4532, "time_per_iteration": 2.7007832527160645 }, { "auxiliary_loss_clip": 0.01140003, "auxiliary_loss_mlp": 0.01038884, "balance_loss_clip": 1.04432929, "balance_loss_mlp": 1.02230215, "epoch": 0.5450610232669993, "flos": 23258551685760.0, "grad_norm": 2.523236759624545, "language_loss": 0.8560524, "learning_rate": 1.8055646750854546e-06, "loss": 0.87784123, "num_input_tokens_seen": 97752735, "step": 4533, "time_per_iteration": 2.5624382495880127 }, { "auxiliary_loss_clip": 0.0112145, "auxiliary_loss_mlp": 0.0103868, "balance_loss_clip": 1.04550862, "balance_loss_mlp": 1.02137041, "epoch": 0.5451812661576384, "flos": 17785083375360.0, "grad_norm": 2.415846082632018, "language_loss": 0.82169783, "learning_rate": 1.8047894080819945e-06, "loss": 0.84329915, "num_input_tokens_seen": 97769985, "step": 4534, "time_per_iteration": 2.635000228881836 }, { "auxiliary_loss_clip": 0.01041893, "auxiliary_loss_mlp": 0.01003057, "balance_loss_clip": 1.01030421, "balance_loss_mlp": 1.00135255, "epoch": 0.5453015090482776, "flos": 71062586513280.0, "grad_norm": 0.7204649222675001, "language_loss": 0.63129497, "learning_rate": 1.8040141706918258e-06, "loss": 0.65174448, "num_input_tokens_seen": 97831225, "step": 4535, "time_per_iteration": 3.2840945720672607 }, { "auxiliary_loss_clip": 0.01114979, "auxiliary_loss_mlp": 0.01041183, "balance_loss_clip": 1.04456043, "balance_loss_mlp": 1.02491093, "epoch": 0.5454217519389166, "flos": 25552201622400.0, "grad_norm": 2.608497726023029, "language_loss": 0.77025068, "learning_rate": 1.8032389630325525e-06, "loss": 0.7918123, "num_input_tokens_seen": 97849975, "step": 4536, "time_per_iteration": 2.688163995742798 }, { "auxiliary_loss_clip": 0.01114274, "auxiliary_loss_mlp": 0.01039103, "balance_loss_clip": 1.04068589, "balance_loss_mlp": 1.02320695, "epoch": 0.5455419948295557, "flos": 23658345037440.0, "grad_norm": 1.8036914765580818, "language_loss": 0.75603324, "learning_rate": 1.8024637852217707e-06, "loss": 0.77756703, "num_input_tokens_seen": 97869700, "step": 4537, "time_per_iteration": 3.525153636932373 }, { "auxiliary_loss_clip": 0.01119299, "auxiliary_loss_mlp": 0.01046505, "balance_loss_clip": 1.04680347, "balance_loss_mlp": 1.03094792, "epoch": 0.5456622377201948, "flos": 23403989854080.0, "grad_norm": 1.9217759284173561, "language_loss": 0.84852397, "learning_rate": 1.8016886373770766e-06, "loss": 0.87018204, "num_input_tokens_seen": 97888215, "step": 4538, "time_per_iteration": 2.655074119567871 }, { "auxiliary_loss_clip": 0.01114376, "auxiliary_loss_mlp": 0.01041787, "balance_loss_clip": 1.04320264, "balance_loss_mlp": 1.02591991, "epoch": 0.5457824806108339, "flos": 23988040997760.0, "grad_norm": 1.8722021396327393, "language_loss": 0.78962123, "learning_rate": 1.8009135196160579e-06, "loss": 0.81118286, "num_input_tokens_seen": 97907090, "step": 4539, "time_per_iteration": 2.672163963317871 }, { "auxiliary_loss_clip": 0.01099778, "auxiliary_loss_mlp": 0.01044786, "balance_loss_clip": 1.04126287, "balance_loss_mlp": 1.02852607, "epoch": 0.545902723501473, "flos": 22565870835840.0, "grad_norm": 1.8381394170071257, "language_loss": 0.8421998, "learning_rate": 1.8001384320563e-06, "loss": 0.86364543, "num_input_tokens_seen": 97927345, "step": 4540, "time_per_iteration": 2.6899397373199463 }, { "auxiliary_loss_clip": 0.01041629, "auxiliary_loss_mlp": 0.01001703, "balance_loss_clip": 1.01010442, "balance_loss_mlp": 1.00018871, "epoch": 0.5460229663921121, "flos": 55198399685760.0, "grad_norm": 0.7719328173537563, "language_loss": 0.5786134, "learning_rate": 1.7993633748153833e-06, "loss": 0.59904671, "num_input_tokens_seen": 97981950, "step": 4541, "time_per_iteration": 4.197952508926392 }, { "auxiliary_loss_clip": 0.01131839, "auxiliary_loss_mlp": 0.0105221, "balance_loss_clip": 1.0455935, "balance_loss_mlp": 1.03501987, "epoch": 0.5461432092827512, "flos": 15413866018560.0, "grad_norm": 1.920871069387107, "language_loss": 0.72701114, "learning_rate": 1.7985883480108834e-06, "loss": 0.7488516, "num_input_tokens_seen": 97999585, "step": 4542, "time_per_iteration": 2.648012161254883 }, { "auxiliary_loss_clip": 0.01122401, "auxiliary_loss_mlp": 0.01045548, "balance_loss_clip": 1.04192698, "balance_loss_mlp": 1.02786922, "epoch": 0.5462634521733902, "flos": 24024921287040.0, "grad_norm": 1.6717229673789304, "language_loss": 0.72093165, "learning_rate": 1.797813351760371e-06, "loss": 0.74261105, "num_input_tokens_seen": 98021290, "step": 4543, "time_per_iteration": 2.624782085418701 }, { "auxiliary_loss_clip": 0.01141128, "auxiliary_loss_mlp": 0.01043148, "balance_loss_clip": 1.04503274, "balance_loss_mlp": 1.02571976, "epoch": 0.5463836950640293, "flos": 22820944291200.0, "grad_norm": 1.8266416501247988, "language_loss": 0.78152013, "learning_rate": 1.7970383861814116e-06, "loss": 0.80336291, "num_input_tokens_seen": 98041060, "step": 4544, "time_per_iteration": 2.5918331146240234 }, { "auxiliary_loss_clip": 0.01131799, "auxiliary_loss_mlp": 0.01038027, "balance_loss_clip": 1.04773235, "balance_loss_mlp": 1.02081323, "epoch": 0.5465039379546685, "flos": 20448290390400.0, "grad_norm": 2.1645743845522922, "language_loss": 0.74345285, "learning_rate": 1.7962634513915684e-06, "loss": 0.76515114, "num_input_tokens_seen": 98058410, "step": 4545, "time_per_iteration": 2.617934226989746 }, { "auxiliary_loss_clip": 0.01135387, "auxiliary_loss_mlp": 0.01039245, "balance_loss_clip": 1.04246676, "balance_loss_mlp": 1.02336669, "epoch": 0.5466241808453075, "flos": 17343310003200.0, "grad_norm": 1.983678105531919, "language_loss": 0.79276431, "learning_rate": 1.7954885475083969e-06, "loss": 0.81451058, "num_input_tokens_seen": 98076080, "step": 4546, "time_per_iteration": 2.505807876586914 }, { "auxiliary_loss_clip": 0.01142751, "auxiliary_loss_mlp": 0.01043878, "balance_loss_clip": 1.04634953, "balance_loss_mlp": 1.02633071, "epoch": 0.5467444237359466, "flos": 21617039122560.0, "grad_norm": 2.44013426531985, "language_loss": 0.72792721, "learning_rate": 1.7947136746494513e-06, "loss": 0.74979353, "num_input_tokens_seen": 98096995, "step": 4547, "time_per_iteration": 2.593989372253418 }, { "auxiliary_loss_clip": 0.01128337, "auxiliary_loss_mlp": 0.01039466, "balance_loss_clip": 1.04371476, "balance_loss_mlp": 1.0225029, "epoch": 0.5468646666265857, "flos": 24170467196160.0, "grad_norm": 2.0408055368016345, "language_loss": 0.88102287, "learning_rate": 1.793938832932277e-06, "loss": 0.9027009, "num_input_tokens_seen": 98115105, "step": 4548, "time_per_iteration": 2.635354518890381 }, { "auxiliary_loss_clip": 0.01139501, "auxiliary_loss_mlp": 0.01040458, "balance_loss_clip": 1.044047, "balance_loss_mlp": 1.02333999, "epoch": 0.5469849095172248, "flos": 27527001505920.0, "grad_norm": 5.118094440923828, "language_loss": 0.70539552, "learning_rate": 1.7931640224744185e-06, "loss": 0.72719508, "num_input_tokens_seen": 98135655, "step": 4549, "time_per_iteration": 2.5815582275390625 }, { "auxiliary_loss_clip": 0.0109098, "auxiliary_loss_mlp": 0.01038128, "balance_loss_clip": 1.03683519, "balance_loss_mlp": 1.0212245, "epoch": 0.5471051524078638, "flos": 27964680727680.0, "grad_norm": 1.8860398049389617, "language_loss": 0.73736739, "learning_rate": 1.7923892433934127e-06, "loss": 0.75865847, "num_input_tokens_seen": 98156730, "step": 4550, "time_per_iteration": 2.754974126815796 }, { "auxiliary_loss_clip": 0.01122625, "auxiliary_loss_mlp": 0.00773927, "balance_loss_clip": 1.04531002, "balance_loss_mlp": 1.00033867, "epoch": 0.547225395298503, "flos": 18150510389760.0, "grad_norm": 2.022973034994444, "language_loss": 0.7866599, "learning_rate": 1.7916144958067939e-06, "loss": 0.80562544, "num_input_tokens_seen": 98174590, "step": 4551, "time_per_iteration": 2.674433946609497 }, { "auxiliary_loss_clip": 0.01132574, "auxiliary_loss_mlp": 0.01044927, "balance_loss_clip": 1.04689693, "balance_loss_mlp": 1.02890515, "epoch": 0.5473456381891421, "flos": 21361498790400.0, "grad_norm": 2.3827106534181666, "language_loss": 0.79231584, "learning_rate": 1.7908397798320905e-06, "loss": 0.81409085, "num_input_tokens_seen": 98194325, "step": 4552, "time_per_iteration": 2.6174209117889404 }, { "auxiliary_loss_clip": 0.01127798, "auxiliary_loss_mlp": 0.0077323, "balance_loss_clip": 1.0438993, "balance_loss_mlp": 1.00034273, "epoch": 0.5474658810797811, "flos": 19932145908480.0, "grad_norm": 1.882776566481058, "language_loss": 0.75023472, "learning_rate": 1.7900650955868265e-06, "loss": 0.76924503, "num_input_tokens_seen": 98213970, "step": 4553, "time_per_iteration": 2.61214280128479 }, { "auxiliary_loss_clip": 0.01128919, "auxiliary_loss_mlp": 0.00772408, "balance_loss_clip": 1.04479623, "balance_loss_mlp": 1.00034153, "epoch": 0.5475861239704203, "flos": 50476217264640.0, "grad_norm": 1.3850425682633438, "language_loss": 0.76410758, "learning_rate": 1.7892904431885202e-06, "loss": 0.78312081, "num_input_tokens_seen": 98241145, "step": 4554, "time_per_iteration": 2.8455820083618164 }, { "auxiliary_loss_clip": 0.01086497, "auxiliary_loss_mlp": 0.01038784, "balance_loss_clip": 1.0369308, "balance_loss_mlp": 1.02385902, "epoch": 0.5477063668610593, "flos": 20705123612160.0, "grad_norm": 7.854894717311218, "language_loss": 0.75357044, "learning_rate": 1.788515822754686e-06, "loss": 0.77482325, "num_input_tokens_seen": 98261565, "step": 4555, "time_per_iteration": 2.7000746726989746 }, { "auxiliary_loss_clip": 0.01102775, "auxiliary_loss_mlp": 0.01049351, "balance_loss_clip": 1.03926492, "balance_loss_mlp": 1.03042006, "epoch": 0.5478266097516984, "flos": 19609740408960.0, "grad_norm": 2.9020577404211116, "language_loss": 0.78420734, "learning_rate": 1.7877412344028335e-06, "loss": 0.80572855, "num_input_tokens_seen": 98281370, "step": 4556, "time_per_iteration": 3.6125288009643555 }, { "auxiliary_loss_clip": 0.01128663, "auxiliary_loss_mlp": 0.01041881, "balance_loss_clip": 1.0433408, "balance_loss_mlp": 1.02484655, "epoch": 0.5479468526423376, "flos": 12896599962240.0, "grad_norm": 2.2101865965718592, "language_loss": 0.77496189, "learning_rate": 1.7869666782504668e-06, "loss": 0.79666734, "num_input_tokens_seen": 98297950, "step": 4557, "time_per_iteration": 3.571918249130249 }, { "auxiliary_loss_clip": 0.01104532, "auxiliary_loss_mlp": 0.01037191, "balance_loss_clip": 1.04075503, "balance_loss_mlp": 1.02147937, "epoch": 0.5480670955329766, "flos": 18588800142720.0, "grad_norm": 2.11571528603019, "language_loss": 0.69115633, "learning_rate": 1.7861921544150867e-06, "loss": 0.71257359, "num_input_tokens_seen": 98316800, "step": 4558, "time_per_iteration": 2.7362866401672363 }, { "auxiliary_loss_clip": 0.01071085, "auxiliary_loss_mlp": 0.00772377, "balance_loss_clip": 1.03899145, "balance_loss_mlp": 1.00031471, "epoch": 0.5481873384236157, "flos": 15954608338560.0, "grad_norm": 1.7552207780314582, "language_loss": 0.76841027, "learning_rate": 1.7854176630141856e-06, "loss": 0.78684485, "num_input_tokens_seen": 98333935, "step": 4559, "time_per_iteration": 2.751774787902832 }, { "auxiliary_loss_clip": 0.01144132, "auxiliary_loss_mlp": 0.01045142, "balance_loss_clip": 1.04563808, "balance_loss_mlp": 1.02727294, "epoch": 0.5483075813142548, "flos": 22783812606720.0, "grad_norm": 2.8304464118571446, "language_loss": 0.8443203, "learning_rate": 1.784643204165255e-06, "loss": 0.86621308, "num_input_tokens_seen": 98353255, "step": 4560, "time_per_iteration": 2.541722297668457 }, { "auxiliary_loss_clip": 0.01121623, "auxiliary_loss_mlp": 0.01034398, "balance_loss_clip": 1.04392982, "balance_loss_mlp": 1.01843631, "epoch": 0.5484278242048939, "flos": 19317212046720.0, "grad_norm": 2.0168786912415264, "language_loss": 0.77412212, "learning_rate": 1.7838687779857783e-06, "loss": 0.79568231, "num_input_tokens_seen": 98371130, "step": 4561, "time_per_iteration": 2.565159559249878 }, { "auxiliary_loss_clip": 0.01107544, "auxiliary_loss_mlp": 0.01047619, "balance_loss_clip": 1.04049075, "balance_loss_mlp": 1.02999973, "epoch": 0.5485480670955329, "flos": 22816024128000.0, "grad_norm": 2.5203195442635247, "language_loss": 0.63788652, "learning_rate": 1.7830943845932366e-06, "loss": 0.65943813, "num_input_tokens_seen": 98390455, "step": 4562, "time_per_iteration": 2.647812604904175 }, { "auxiliary_loss_clip": 0.01122134, "auxiliary_loss_mlp": 0.01040514, "balance_loss_clip": 1.04661965, "balance_loss_mlp": 1.02458739, "epoch": 0.5486683099861721, "flos": 22671304231680.0, "grad_norm": 3.393381375576572, "language_loss": 0.75070739, "learning_rate": 1.7823200241051044e-06, "loss": 0.77233386, "num_input_tokens_seen": 98409370, "step": 4563, "time_per_iteration": 3.593658208847046 }, { "auxiliary_loss_clip": 0.01140275, "auxiliary_loss_mlp": 0.01041373, "balance_loss_clip": 1.0468334, "balance_loss_mlp": 1.02538693, "epoch": 0.5487885528768112, "flos": 23149383275520.0, "grad_norm": 2.3884689342250227, "language_loss": 0.80323613, "learning_rate": 1.7815456966388513e-06, "loss": 0.82505262, "num_input_tokens_seen": 98428465, "step": 4564, "time_per_iteration": 2.603907346725464 }, { "auxiliary_loss_clip": 0.01104519, "auxiliary_loss_mlp": 0.0104001, "balance_loss_clip": 1.04079974, "balance_loss_mlp": 1.02191401, "epoch": 0.5489087957674502, "flos": 22053928245120.0, "grad_norm": 2.163066685298218, "language_loss": 0.80906498, "learning_rate": 1.780771402311943e-06, "loss": 0.8305102, "num_input_tokens_seen": 98447300, "step": 4565, "time_per_iteration": 2.647514581680298 }, { "auxiliary_loss_clip": 0.01114079, "auxiliary_loss_mlp": 0.01046456, "balance_loss_clip": 1.04323792, "balance_loss_mlp": 1.02905118, "epoch": 0.5490290386580894, "flos": 24315977191680.0, "grad_norm": 3.816234948784631, "language_loss": 0.78727144, "learning_rate": 1.7799971412418374e-06, "loss": 0.80887675, "num_input_tokens_seen": 98468695, "step": 4566, "time_per_iteration": 2.628408432006836 }, { "auxiliary_loss_clip": 0.01102848, "auxiliary_loss_mlp": 0.0105137, "balance_loss_clip": 1.0408591, "balance_loss_mlp": 1.03320265, "epoch": 0.5491492815487284, "flos": 18294942977280.0, "grad_norm": 2.3011113127692724, "language_loss": 0.73632997, "learning_rate": 1.7792229135459918e-06, "loss": 0.7578721, "num_input_tokens_seen": 98485345, "step": 4567, "time_per_iteration": 3.5311765670776367 }, { "auxiliary_loss_clip": 0.01008134, "auxiliary_loss_mlp": 0.0100851, "balance_loss_clip": 1.02107954, "balance_loss_mlp": 1.00682914, "epoch": 0.5492695244393675, "flos": 64550257050240.0, "grad_norm": 0.739353914359054, "language_loss": 0.61540395, "learning_rate": 1.7784487193418538e-06, "loss": 0.63557041, "num_input_tokens_seen": 98543195, "step": 4568, "time_per_iteration": 3.1901533603668213 }, { "auxiliary_loss_clip": 0.01090149, "auxiliary_loss_mlp": 0.01046562, "balance_loss_clip": 1.03532732, "balance_loss_mlp": 1.02844226, "epoch": 0.5493897673300067, "flos": 17379579761280.0, "grad_norm": 1.857615861188927, "language_loss": 0.60695124, "learning_rate": 1.7776745587468698e-06, "loss": 0.62831837, "num_input_tokens_seen": 98560620, "step": 4569, "time_per_iteration": 2.6437509059906006 }, { "auxiliary_loss_clip": 0.01140349, "auxiliary_loss_mlp": 0.01037449, "balance_loss_clip": 1.04471231, "balance_loss_mlp": 1.01985383, "epoch": 0.5495100102206457, "flos": 19901765980800.0, "grad_norm": 3.1618224694417436, "language_loss": 0.81627667, "learning_rate": 1.7769004318784776e-06, "loss": 0.8380546, "num_input_tokens_seen": 98578265, "step": 4570, "time_per_iteration": 2.5822768211364746 }, { "auxiliary_loss_clip": 0.01130796, "auxiliary_loss_mlp": 0.01034347, "balance_loss_clip": 1.04580653, "balance_loss_mlp": 1.01820588, "epoch": 0.5496302531112848, "flos": 16727190992640.0, "grad_norm": 2.66809245456302, "language_loss": 0.80893999, "learning_rate": 1.776126338854113e-06, "loss": 0.83059138, "num_input_tokens_seen": 98596055, "step": 4571, "time_per_iteration": 2.6767733097076416 }, { "auxiliary_loss_clip": 0.01119461, "auxiliary_loss_mlp": 0.01037161, "balance_loss_clip": 1.04345179, "balance_loss_mlp": 1.02117538, "epoch": 0.5497504960019239, "flos": 24572343536640.0, "grad_norm": 1.6760721274414934, "language_loss": 0.84355938, "learning_rate": 1.7753522797912044e-06, "loss": 0.86512554, "num_input_tokens_seen": 98616140, "step": 4572, "time_per_iteration": 2.633612871170044 }, { "auxiliary_loss_clip": 0.01123249, "auxiliary_loss_mlp": 0.01044921, "balance_loss_clip": 1.04454017, "balance_loss_mlp": 1.0285418, "epoch": 0.549870738892563, "flos": 15450494912640.0, "grad_norm": 3.5518713611482085, "language_loss": 0.69879043, "learning_rate": 1.7745782548071765e-06, "loss": 0.7204721, "num_input_tokens_seen": 98633035, "step": 4573, "time_per_iteration": 2.614537000656128 }, { "auxiliary_loss_clip": 0.01098368, "auxiliary_loss_mlp": 0.01038849, "balance_loss_clip": 1.043396, "balance_loss_mlp": 1.02283895, "epoch": 0.549990981783202, "flos": 21069114082560.0, "grad_norm": 1.747736304295719, "language_loss": 0.74551064, "learning_rate": 1.7738042640194482e-06, "loss": 0.76688278, "num_input_tokens_seen": 98652700, "step": 4574, "time_per_iteration": 2.693773031234741 }, { "auxiliary_loss_clip": 0.01141773, "auxiliary_loss_mlp": 0.01043869, "balance_loss_clip": 1.04621971, "balance_loss_mlp": 1.02528453, "epoch": 0.5501112246738411, "flos": 21395901041280.0, "grad_norm": 1.7694270138650232, "language_loss": 0.70563722, "learning_rate": 1.7730303075454335e-06, "loss": 0.72749364, "num_input_tokens_seen": 98671590, "step": 4575, "time_per_iteration": 2.6013920307159424 }, { "auxiliary_loss_clip": 0.01112233, "auxiliary_loss_mlp": 0.01043363, "balance_loss_clip": 1.04426181, "balance_loss_mlp": 1.02660227, "epoch": 0.5502314675644803, "flos": 17456931699840.0, "grad_norm": 2.0565156490256853, "language_loss": 0.84637719, "learning_rate": 1.7722563855025402e-06, "loss": 0.86793315, "num_input_tokens_seen": 98689620, "step": 4576, "time_per_iteration": 2.667262315750122 }, { "auxiliary_loss_clip": 0.01117453, "auxiliary_loss_mlp": 0.0104238, "balance_loss_clip": 1.04032683, "balance_loss_mlp": 1.02510631, "epoch": 0.5503517104551193, "flos": 24310410583680.0, "grad_norm": 4.92942505390751, "language_loss": 0.71199989, "learning_rate": 1.7714824980081721e-06, "loss": 0.73359823, "num_input_tokens_seen": 98708915, "step": 4577, "time_per_iteration": 2.6568827629089355 }, { "auxiliary_loss_clip": 0.01123311, "auxiliary_loss_mlp": 0.01033436, "balance_loss_clip": 1.04434824, "balance_loss_mlp": 1.01743793, "epoch": 0.5504719533457584, "flos": 22419427086720.0, "grad_norm": 1.7712941971464171, "language_loss": 0.73741412, "learning_rate": 1.7707086451797276e-06, "loss": 0.75898159, "num_input_tokens_seen": 98729790, "step": 4578, "time_per_iteration": 2.6449475288391113 }, { "auxiliary_loss_clip": 0.01019113, "auxiliary_loss_mlp": 0.01004375, "balance_loss_clip": 1.01647067, "balance_loss_mlp": 1.00236082, "epoch": 0.5505921962363975, "flos": 67294155968640.0, "grad_norm": 0.7199773773190891, "language_loss": 0.52272975, "learning_rate": 1.7699348271345993e-06, "loss": 0.54296464, "num_input_tokens_seen": 98792415, "step": 4579, "time_per_iteration": 3.210376262664795 }, { "auxiliary_loss_clip": 0.01007283, "auxiliary_loss_mlp": 0.01012967, "balance_loss_clip": 1.01192546, "balance_loss_mlp": 1.01065469, "epoch": 0.5507124391270366, "flos": 45685125578880.0, "grad_norm": 0.7070050334199504, "language_loss": 0.54456699, "learning_rate": 1.7691610439901753e-06, "loss": 0.56476951, "num_input_tokens_seen": 98855350, "step": 4580, "time_per_iteration": 3.3229517936706543 }, { "auxiliary_loss_clip": 0.01126833, "auxiliary_loss_mlp": 0.01037098, "balance_loss_clip": 1.04250336, "balance_loss_mlp": 1.02038538, "epoch": 0.5508326820176757, "flos": 22273845264000.0, "grad_norm": 2.063509077265466, "language_loss": 0.75488168, "learning_rate": 1.7683872958638367e-06, "loss": 0.77652097, "num_input_tokens_seen": 98874230, "step": 4581, "time_per_iteration": 2.6803793907165527 }, { "auxiliary_loss_clip": 0.01114977, "auxiliary_loss_mlp": 0.01039682, "balance_loss_clip": 1.04174829, "balance_loss_mlp": 1.02320683, "epoch": 0.5509529249083148, "flos": 20012442762240.0, "grad_norm": 2.2250381266106483, "language_loss": 0.84303057, "learning_rate": 1.7676135828729614e-06, "loss": 0.86457717, "num_input_tokens_seen": 98893940, "step": 4582, "time_per_iteration": 3.660299777984619 }, { "auxiliary_loss_clip": 0.01130302, "auxiliary_loss_mlp": 0.01038034, "balance_loss_clip": 1.04473352, "balance_loss_mlp": 1.02164245, "epoch": 0.5510731677989539, "flos": 21834801325440.0, "grad_norm": 2.337055921340879, "language_loss": 0.82939792, "learning_rate": 1.7668399051349205e-06, "loss": 0.85108125, "num_input_tokens_seen": 98913620, "step": 4583, "time_per_iteration": 3.6224491596221924 }, { "auxiliary_loss_clip": 0.01102801, "auxiliary_loss_mlp": 0.0104655, "balance_loss_clip": 1.04233241, "balance_loss_mlp": 1.02902603, "epoch": 0.5511934106895929, "flos": 21467901853440.0, "grad_norm": 1.8754921619101943, "language_loss": 0.8348757, "learning_rate": 1.766066262767081e-06, "loss": 0.85636926, "num_input_tokens_seen": 98931460, "step": 4584, "time_per_iteration": 2.664950132369995 }, { "auxiliary_loss_clip": 0.0111514, "auxiliary_loss_mlp": 0.01038379, "balance_loss_clip": 1.04413354, "balance_loss_mlp": 1.02221417, "epoch": 0.5513136535802321, "flos": 21068934514560.0, "grad_norm": 2.4775936013285484, "language_loss": 0.77380878, "learning_rate": 1.765292655886803e-06, "loss": 0.795344, "num_input_tokens_seen": 98950105, "step": 4585, "time_per_iteration": 2.6509690284729004 }, { "auxiliary_loss_clip": 0.01108865, "auxiliary_loss_mlp": 0.01038607, "balance_loss_clip": 1.04177284, "balance_loss_mlp": 1.02240682, "epoch": 0.5514338964708712, "flos": 27815004754560.0, "grad_norm": 2.0551720443756367, "language_loss": 0.71068525, "learning_rate": 1.764519084611443e-06, "loss": 0.73215997, "num_input_tokens_seen": 98970560, "step": 4586, "time_per_iteration": 2.7064361572265625 }, { "auxiliary_loss_clip": 0.01119211, "auxiliary_loss_mlp": 0.01040915, "balance_loss_clip": 1.04288757, "balance_loss_mlp": 1.02250957, "epoch": 0.5515541393615102, "flos": 21908525990400.0, "grad_norm": 1.748048416421784, "language_loss": 0.77774602, "learning_rate": 1.7637455490583505e-06, "loss": 0.79934728, "num_input_tokens_seen": 98989885, "step": 4587, "time_per_iteration": 2.691331624984741 }, { "auxiliary_loss_clip": 0.01128182, "auxiliary_loss_mlp": 0.01042566, "balance_loss_clip": 1.04621577, "balance_loss_mlp": 1.02716374, "epoch": 0.5516743822521494, "flos": 20485422074880.0, "grad_norm": 2.177925003376497, "language_loss": 0.77497125, "learning_rate": 1.7629720493448701e-06, "loss": 0.79667878, "num_input_tokens_seen": 99007180, "step": 4588, "time_per_iteration": 2.66965651512146 }, { "auxiliary_loss_clip": 0.01122511, "auxiliary_loss_mlp": 0.0104029, "balance_loss_clip": 1.04388452, "balance_loss_mlp": 1.02201557, "epoch": 0.5517946251427884, "flos": 14940383915520.0, "grad_norm": 1.8312506088361071, "language_loss": 0.85484421, "learning_rate": 1.7621985855883418e-06, "loss": 0.87647223, "num_input_tokens_seen": 99023880, "step": 4589, "time_per_iteration": 3.763529062271118 }, { "auxiliary_loss_clip": 0.01111118, "auxiliary_loss_mlp": 0.01036168, "balance_loss_clip": 1.04282975, "balance_loss_mlp": 1.02011013, "epoch": 0.5519148680334275, "flos": 18404865573120.0, "grad_norm": 1.8321680906795652, "language_loss": 0.72413373, "learning_rate": 1.7614251579060983e-06, "loss": 0.74560654, "num_input_tokens_seen": 99042475, "step": 4590, "time_per_iteration": 2.693535089492798 }, { "auxiliary_loss_clip": 0.01104694, "auxiliary_loss_mlp": 0.01039314, "balance_loss_clip": 1.04166925, "balance_loss_mlp": 1.02313733, "epoch": 0.5520351109240667, "flos": 25113337251840.0, "grad_norm": 4.0210307456309025, "language_loss": 0.84805971, "learning_rate": 1.76065176641547e-06, "loss": 0.8694998, "num_input_tokens_seen": 99065185, "step": 4591, "time_per_iteration": 2.7620394229888916 }, { "auxiliary_loss_clip": 0.0112774, "auxiliary_loss_mlp": 0.01038924, "balance_loss_clip": 1.04286957, "balance_loss_mlp": 1.02324796, "epoch": 0.5521553538147057, "flos": 21069545045760.0, "grad_norm": 2.00001477097589, "language_loss": 0.77859795, "learning_rate": 1.759878411233777e-06, "loss": 0.8002646, "num_input_tokens_seen": 99083645, "step": 4592, "time_per_iteration": 2.7348408699035645 }, { "auxiliary_loss_clip": 0.01127095, "auxiliary_loss_mlp": 0.01046888, "balance_loss_clip": 1.04440689, "balance_loss_mlp": 1.03029406, "epoch": 0.5522755967053448, "flos": 18879999701760.0, "grad_norm": 2.21977815489917, "language_loss": 0.76041728, "learning_rate": 1.7591050924783388e-06, "loss": 0.78215706, "num_input_tokens_seen": 99100835, "step": 4593, "time_per_iteration": 2.6800005435943604 }, { "auxiliary_loss_clip": 0.01003286, "auxiliary_loss_mlp": 0.01003302, "balance_loss_clip": 1.00995207, "balance_loss_mlp": 1.00144255, "epoch": 0.5523958395959839, "flos": 64675622494080.0, "grad_norm": 0.8356952959035595, "language_loss": 0.57901227, "learning_rate": 1.7583318102664661e-06, "loss": 0.59907812, "num_input_tokens_seen": 99168400, "step": 4594, "time_per_iteration": 4.256865501403809 }, { "auxiliary_loss_clip": 0.01130533, "auxiliary_loss_mlp": 0.01039275, "balance_loss_clip": 1.04348326, "balance_loss_mlp": 1.02300334, "epoch": 0.552516082486623, "flos": 10889732211840.0, "grad_norm": 1.88441321174686, "language_loss": 0.78900838, "learning_rate": 1.757558564715466e-06, "loss": 0.8107065, "num_input_tokens_seen": 99186475, "step": 4595, "time_per_iteration": 2.7090513706207275 }, { "auxiliary_loss_clip": 0.01130087, "auxiliary_loss_mlp": 0.01045002, "balance_loss_clip": 1.04142189, "balance_loss_mlp": 1.02787197, "epoch": 0.552636325377262, "flos": 22199797376640.0, "grad_norm": 3.9995014451478537, "language_loss": 0.73728371, "learning_rate": 1.7567853559426386e-06, "loss": 0.75903457, "num_input_tokens_seen": 99203525, "step": 4596, "time_per_iteration": 2.7005836963653564 }, { "auxiliary_loss_clip": 0.01128983, "auxiliary_loss_mlp": 0.01041248, "balance_loss_clip": 1.04551911, "balance_loss_mlp": 1.02579808, "epoch": 0.5527565682679012, "flos": 23988184652160.0, "grad_norm": 2.041382765757546, "language_loss": 0.75270265, "learning_rate": 1.7560121840652797e-06, "loss": 0.774405, "num_input_tokens_seen": 99222910, "step": 4597, "time_per_iteration": 2.7712059020996094 }, { "auxiliary_loss_clip": 0.01095614, "auxiliary_loss_mlp": 0.01044822, "balance_loss_clip": 1.04141736, "balance_loss_mlp": 1.02857423, "epoch": 0.5528768111585403, "flos": 19719267955200.0, "grad_norm": 1.8513949369216145, "language_loss": 0.69337928, "learning_rate": 1.7552390492006782e-06, "loss": 0.71478367, "num_input_tokens_seen": 99241230, "step": 4598, "time_per_iteration": 2.7382349967956543 }, { "auxiliary_loss_clip": 0.01100956, "auxiliary_loss_mlp": 0.00774105, "balance_loss_clip": 1.04019976, "balance_loss_mlp": 1.00026202, "epoch": 0.5529970540491793, "flos": 26215975002240.0, "grad_norm": 1.8635365173229583, "language_loss": 0.65411073, "learning_rate": 1.7544659514661184e-06, "loss": 0.67286134, "num_input_tokens_seen": 99264320, "step": 4599, "time_per_iteration": 2.959944248199463 }, { "auxiliary_loss_clip": 0.01110347, "auxiliary_loss_mlp": 0.01040249, "balance_loss_clip": 1.03917551, "balance_loss_mlp": 1.02426279, "epoch": 0.5531172969398185, "flos": 24425971614720.0, "grad_norm": 2.7369032058614216, "language_loss": 0.79793924, "learning_rate": 1.7536928909788786e-06, "loss": 0.81944519, "num_input_tokens_seen": 99283625, "step": 4600, "time_per_iteration": 2.7211334705352783 }, { "auxiliary_loss_clip": 0.01009593, "auxiliary_loss_mlp": 0.01001126, "balance_loss_clip": 1.01443338, "balance_loss_mlp": 0.99932557, "epoch": 0.5532375398304575, "flos": 64907316195840.0, "grad_norm": 0.8830852853599479, "language_loss": 0.61989659, "learning_rate": 1.752919867856231e-06, "loss": 0.6400038, "num_input_tokens_seen": 99335270, "step": 4601, "time_per_iteration": 3.1343605518341064 }, { "auxiliary_loss_clip": 0.01107489, "auxiliary_loss_mlp": 0.01038795, "balance_loss_clip": 1.0393889, "balance_loss_mlp": 1.02276134, "epoch": 0.5533577827210966, "flos": 19683105937920.0, "grad_norm": 1.660625256932496, "language_loss": 0.78766513, "learning_rate": 1.7521468822154436e-06, "loss": 0.80912793, "num_input_tokens_seen": 99354185, "step": 4602, "time_per_iteration": 2.7952916622161865 }, { "auxiliary_loss_clip": 0.01109811, "auxiliary_loss_mlp": 0.01038388, "balance_loss_clip": 1.0430162, "balance_loss_mlp": 1.02162719, "epoch": 0.5534780256117358, "flos": 32306496076800.0, "grad_norm": 2.1688372222091066, "language_loss": 0.75308168, "learning_rate": 1.751373934173777e-06, "loss": 0.77456361, "num_input_tokens_seen": 99376930, "step": 4603, "time_per_iteration": 2.78555965423584 }, { "auxiliary_loss_clip": 0.01140612, "auxiliary_loss_mlp": 0.01038642, "balance_loss_clip": 1.04336405, "balance_loss_mlp": 1.02271545, "epoch": 0.5535982685023748, "flos": 23222425582080.0, "grad_norm": 1.6598885886192298, "language_loss": 0.73047143, "learning_rate": 1.750601023848487e-06, "loss": 0.75226402, "num_input_tokens_seen": 99397655, "step": 4604, "time_per_iteration": 2.7661263942718506 }, { "auxiliary_loss_clip": 0.01141858, "auxiliary_loss_mlp": 0.00772049, "balance_loss_clip": 1.04809082, "balance_loss_mlp": 1.00021899, "epoch": 0.5537185113930139, "flos": 24352534258560.0, "grad_norm": 1.881517907246107, "language_loss": 0.73703456, "learning_rate": 1.749828151356823e-06, "loss": 0.75617361, "num_input_tokens_seen": 99417850, "step": 4605, "time_per_iteration": 2.6687333583831787 }, { "auxiliary_loss_clip": 0.01114504, "auxiliary_loss_mlp": 0.01034175, "balance_loss_clip": 1.04229808, "balance_loss_mlp": 1.01888108, "epoch": 0.553838754283653, "flos": 23549068886400.0, "grad_norm": 1.7159935248720712, "language_loss": 0.75405455, "learning_rate": 1.7490553168160297e-06, "loss": 0.77554137, "num_input_tokens_seen": 99438920, "step": 4606, "time_per_iteration": 2.7003538608551025 }, { "auxiliary_loss_clip": 0.01111115, "auxiliary_loss_mlp": 0.0104062, "balance_loss_clip": 1.04131901, "balance_loss_mlp": 1.02476501, "epoch": 0.5539589971742921, "flos": 17275044205440.0, "grad_norm": 1.999362935493774, "language_loss": 0.76542628, "learning_rate": 1.748282520343345e-06, "loss": 0.78694367, "num_input_tokens_seen": 99457950, "step": 4607, "time_per_iteration": 2.6264920234680176 }, { "auxiliary_loss_clip": 0.01135575, "auxiliary_loss_mlp": 0.01040857, "balance_loss_clip": 1.04554236, "balance_loss_mlp": 1.02414346, "epoch": 0.5540792400649311, "flos": 27564169104000.0, "grad_norm": 2.1226671716595087, "language_loss": 0.78740579, "learning_rate": 1.7475097620560023e-06, "loss": 0.80917013, "num_input_tokens_seen": 99478015, "step": 4608, "time_per_iteration": 2.676070213317871 }, { "auxiliary_loss_clip": 0.01139448, "auxiliary_loss_mlp": 0.0104084, "balance_loss_clip": 1.04627144, "balance_loss_mlp": 1.02562892, "epoch": 0.5541994829555702, "flos": 23878657105920.0, "grad_norm": 2.479520934162244, "language_loss": 0.70910937, "learning_rate": 1.746737042071228e-06, "loss": 0.73091221, "num_input_tokens_seen": 99496520, "step": 4609, "time_per_iteration": 4.4692981243133545 }, { "auxiliary_loss_clip": 0.01111571, "auxiliary_loss_mlp": 0.01037377, "balance_loss_clip": 1.04274321, "balance_loss_mlp": 1.020437, "epoch": 0.5543197258462094, "flos": 20115721342080.0, "grad_norm": 3.476138835951536, "language_loss": 0.79409713, "learning_rate": 1.7459643605062424e-06, "loss": 0.81558657, "num_input_tokens_seen": 99513780, "step": 4610, "time_per_iteration": 2.631166934967041 }, { "auxiliary_loss_clip": 0.0109036, "auxiliary_loss_mlp": 0.01043963, "balance_loss_clip": 1.04363501, "balance_loss_mlp": 1.02612972, "epoch": 0.5544399687368484, "flos": 20916565021440.0, "grad_norm": 1.6446919525312975, "language_loss": 0.80644733, "learning_rate": 1.745191717478262e-06, "loss": 0.82779056, "num_input_tokens_seen": 99532360, "step": 4611, "time_per_iteration": 2.72038197517395 }, { "auxiliary_loss_clip": 0.01109121, "auxiliary_loss_mlp": 0.01037085, "balance_loss_clip": 1.04445672, "balance_loss_mlp": 1.02145672, "epoch": 0.5545602116274875, "flos": 25518661297920.0, "grad_norm": 2.2560046945804584, "language_loss": 0.7976625, "learning_rate": 1.7444191131044948e-06, "loss": 0.81912458, "num_input_tokens_seen": 99552635, "step": 4612, "time_per_iteration": 2.6966540813446045 }, { "auxiliary_loss_clip": 0.01120595, "auxiliary_loss_mlp": 0.01044776, "balance_loss_clip": 1.0477854, "balance_loss_mlp": 1.02865851, "epoch": 0.5546804545181266, "flos": 20995568985600.0, "grad_norm": 1.886141107013765, "language_loss": 0.73282754, "learning_rate": 1.7436465475021456e-06, "loss": 0.7544812, "num_input_tokens_seen": 99572685, "step": 4613, "time_per_iteration": 2.6358048915863037 }, { "auxiliary_loss_clip": 0.01096059, "auxiliary_loss_mlp": 0.01052218, "balance_loss_clip": 1.04098511, "balance_loss_mlp": 1.03470576, "epoch": 0.5548006974087657, "flos": 26833638297600.0, "grad_norm": 2.151580775457448, "language_loss": 0.71291411, "learning_rate": 1.7428740207884111e-06, "loss": 0.73439693, "num_input_tokens_seen": 99593565, "step": 4614, "time_per_iteration": 2.7028820514678955 }, { "auxiliary_loss_clip": 0.0109375, "auxiliary_loss_mlp": 0.01043259, "balance_loss_clip": 1.04264832, "balance_loss_mlp": 1.02746356, "epoch": 0.5549209402994048, "flos": 33656414031360.0, "grad_norm": 2.1718152971698528, "language_loss": 0.61250842, "learning_rate": 1.7421015330804833e-06, "loss": 0.63387847, "num_input_tokens_seen": 99613485, "step": 4615, "time_per_iteration": 3.851170301437378 }, { "auxiliary_loss_clip": 0.01140018, "auxiliary_loss_mlp": 0.0104177, "balance_loss_clip": 1.04508734, "balance_loss_mlp": 1.02438879, "epoch": 0.5550411831900439, "flos": 23769524609280.0, "grad_norm": 15.638968751474668, "language_loss": 0.72627372, "learning_rate": 1.7413290844955475e-06, "loss": 0.74809158, "num_input_tokens_seen": 99633515, "step": 4616, "time_per_iteration": 2.6022865772247314 }, { "auxiliary_loss_clip": 0.0112035, "auxiliary_loss_mlp": 0.01050233, "balance_loss_clip": 1.04510474, "balance_loss_mlp": 1.03315079, "epoch": 0.555161426080683, "flos": 21651189978240.0, "grad_norm": 1.8628392930192037, "language_loss": 0.78542328, "learning_rate": 1.7405566751507843e-06, "loss": 0.8071292, "num_input_tokens_seen": 99651560, "step": 4617, "time_per_iteration": 2.602823257446289 }, { "auxiliary_loss_clip": 0.01100038, "auxiliary_loss_mlp": 0.01042198, "balance_loss_clip": 1.0394963, "balance_loss_mlp": 1.02587867, "epoch": 0.555281668971322, "flos": 49563116605440.0, "grad_norm": 1.584215277187556, "language_loss": 0.6749531, "learning_rate": 1.7397843051633668e-06, "loss": 0.69637543, "num_input_tokens_seen": 99674255, "step": 4618, "time_per_iteration": 2.9161136150360107 }, { "auxiliary_loss_clip": 0.01125458, "auxiliary_loss_mlp": 0.0104077, "balance_loss_clip": 1.04493058, "balance_loss_mlp": 1.02379429, "epoch": 0.5554019118619612, "flos": 20741608851840.0, "grad_norm": 1.7702745941074267, "language_loss": 0.71576273, "learning_rate": 1.739011974650464e-06, "loss": 0.73742509, "num_input_tokens_seen": 99693585, "step": 4619, "time_per_iteration": 3.540837049484253 }, { "auxiliary_loss_clip": 0.01094202, "auxiliary_loss_mlp": 0.01040805, "balance_loss_clip": 1.04116142, "balance_loss_mlp": 1.02254176, "epoch": 0.5555221547526003, "flos": 25483217552640.0, "grad_norm": 3.864976454898832, "language_loss": 0.76912141, "learning_rate": 1.7382396837292365e-06, "loss": 0.79047149, "num_input_tokens_seen": 99714045, "step": 4620, "time_per_iteration": 2.72096586227417 }, { "auxiliary_loss_clip": 0.01140283, "auxiliary_loss_mlp": 0.01041816, "balance_loss_clip": 1.04548788, "balance_loss_mlp": 1.02411354, "epoch": 0.5556423976432393, "flos": 21762513204480.0, "grad_norm": 2.0981176949629754, "language_loss": 0.73678279, "learning_rate": 1.737467432516841e-06, "loss": 0.75860375, "num_input_tokens_seen": 99734145, "step": 4621, "time_per_iteration": 2.564387321472168 }, { "auxiliary_loss_clip": 0.01116495, "auxiliary_loss_mlp": 0.01039138, "balance_loss_clip": 1.04114008, "balance_loss_mlp": 1.02199531, "epoch": 0.5557626405338785, "flos": 24900171989760.0, "grad_norm": 2.320747230374674, "language_loss": 0.74007982, "learning_rate": 1.7366952211304274e-06, "loss": 0.76163614, "num_input_tokens_seen": 99751990, "step": 4622, "time_per_iteration": 2.7327306270599365 }, { "auxiliary_loss_clip": 0.01110544, "auxiliary_loss_mlp": 0.01051881, "balance_loss_clip": 1.04336548, "balance_loss_mlp": 1.03527498, "epoch": 0.5558828834245175, "flos": 18697501676160.0, "grad_norm": 2.084549340990908, "language_loss": 0.83626413, "learning_rate": 1.735923049687139e-06, "loss": 0.85788834, "num_input_tokens_seen": 99768565, "step": 4623, "time_per_iteration": 2.656249761581421 }, { "auxiliary_loss_clip": 0.01105878, "auxiliary_loss_mlp": 0.01036138, "balance_loss_clip": 1.03919744, "balance_loss_mlp": 1.01993704, "epoch": 0.5560031263151566, "flos": 27272179445760.0, "grad_norm": 1.4804188284030007, "language_loss": 0.74071264, "learning_rate": 1.7351509183041144e-06, "loss": 0.76213276, "num_input_tokens_seen": 99788895, "step": 4624, "time_per_iteration": 2.7145895957946777 }, { "auxiliary_loss_clip": 0.01139353, "auxiliary_loss_mlp": 0.01047995, "balance_loss_clip": 1.04403853, "balance_loss_mlp": 1.03149629, "epoch": 0.5561233692057957, "flos": 23403738458880.0, "grad_norm": 6.192301551116807, "language_loss": 0.71633095, "learning_rate": 1.7343788270984852e-06, "loss": 0.73820448, "num_input_tokens_seen": 99808035, "step": 4625, "time_per_iteration": 2.635175943374634 }, { "auxiliary_loss_clip": 0.01114107, "auxiliary_loss_mlp": 0.01049809, "balance_loss_clip": 1.04369569, "balance_loss_mlp": 1.0323329, "epoch": 0.5562436120964348, "flos": 37670867804160.0, "grad_norm": 1.7989016994140212, "language_loss": 0.74587637, "learning_rate": 1.7336067761873764e-06, "loss": 0.76751554, "num_input_tokens_seen": 99830460, "step": 4626, "time_per_iteration": 2.779275894165039 }, { "auxiliary_loss_clip": 0.01134317, "auxiliary_loss_mlp": 0.01042592, "balance_loss_clip": 1.04347563, "balance_loss_mlp": 1.02533078, "epoch": 0.5563638549870739, "flos": 25155245445120.0, "grad_norm": 2.077713984361728, "language_loss": 0.76233697, "learning_rate": 1.7328347656879076e-06, "loss": 0.78410608, "num_input_tokens_seen": 99850320, "step": 4627, "time_per_iteration": 2.6761271953582764 }, { "auxiliary_loss_clip": 0.01102465, "auxiliary_loss_mlp": 0.01047282, "balance_loss_clip": 1.03993928, "balance_loss_mlp": 1.03065276, "epoch": 0.556484097877713, "flos": 13581810783360.0, "grad_norm": 2.4772643435754165, "language_loss": 0.68344599, "learning_rate": 1.7320627957171927e-06, "loss": 0.70494348, "num_input_tokens_seen": 99864980, "step": 4628, "time_per_iteration": 2.6541500091552734 }, { "auxiliary_loss_clip": 0.01138816, "auxiliary_loss_mlp": 0.01041835, "balance_loss_clip": 1.04496789, "balance_loss_mlp": 1.02553952, "epoch": 0.5566043407683521, "flos": 24681368292480.0, "grad_norm": 2.4311770622311624, "language_loss": 0.81716925, "learning_rate": 1.7312908663923382e-06, "loss": 0.83897579, "num_input_tokens_seen": 99881155, "step": 4629, "time_per_iteration": 2.640730857849121 }, { "auxiliary_loss_clip": 0.01123531, "auxiliary_loss_mlp": 0.01042679, "balance_loss_clip": 1.04247475, "balance_loss_mlp": 1.02438056, "epoch": 0.5567245836589911, "flos": 20588161950720.0, "grad_norm": 2.216782296429693, "language_loss": 0.67349887, "learning_rate": 1.7305189778304463e-06, "loss": 0.69516098, "num_input_tokens_seen": 99899330, "step": 4630, "time_per_iteration": 2.6200850009918213 }, { "auxiliary_loss_clip": 0.01115042, "auxiliary_loss_mlp": 0.01036351, "balance_loss_clip": 1.04538739, "balance_loss_mlp": 1.02044833, "epoch": 0.5568448265496303, "flos": 20704189858560.0, "grad_norm": 1.9304084682586677, "language_loss": 0.8011986, "learning_rate": 1.729747130148611e-06, "loss": 0.82271254, "num_input_tokens_seen": 99918525, "step": 4631, "time_per_iteration": 2.5840327739715576 }, { "auxiliary_loss_clip": 0.01111491, "auxiliary_loss_mlp": 0.01040404, "balance_loss_clip": 1.04195571, "balance_loss_mlp": 1.02301168, "epoch": 0.5569650694402694, "flos": 25302910256640.0, "grad_norm": 2.8920217549885447, "language_loss": 0.768538, "learning_rate": 1.7289753234639208e-06, "loss": 0.79005688, "num_input_tokens_seen": 99937500, "step": 4632, "time_per_iteration": 2.7301862239837646 }, { "auxiliary_loss_clip": 0.01131901, "auxiliary_loss_mlp": 0.01047458, "balance_loss_clip": 1.0432868, "balance_loss_mlp": 1.02997017, "epoch": 0.5570853123309084, "flos": 19712623939200.0, "grad_norm": 1.976312225021108, "language_loss": 0.76455021, "learning_rate": 1.7282035578934592e-06, "loss": 0.78634381, "num_input_tokens_seen": 99955665, "step": 4633, "time_per_iteration": 2.672252655029297 }, { "auxiliary_loss_clip": 0.01108303, "auxiliary_loss_mlp": 0.01044358, "balance_loss_clip": 1.04320538, "balance_loss_mlp": 1.02712059, "epoch": 0.5572055552215476, "flos": 16108091153280.0, "grad_norm": 1.6165809794031414, "language_loss": 0.78709334, "learning_rate": 1.727431833554301e-06, "loss": 0.80861998, "num_input_tokens_seen": 99974140, "step": 4634, "time_per_iteration": 2.61183500289917 }, { "auxiliary_loss_clip": 0.01081785, "auxiliary_loss_mlp": 0.0105215, "balance_loss_clip": 1.03888023, "balance_loss_mlp": 1.03315997, "epoch": 0.5573257981121866, "flos": 17128815937920.0, "grad_norm": 2.027544832434437, "language_loss": 0.7725454, "learning_rate": 1.7266601505635175e-06, "loss": 0.79388475, "num_input_tokens_seen": 99991480, "step": 4635, "time_per_iteration": 4.592777729034424 }, { "auxiliary_loss_clip": 0.01130524, "auxiliary_loss_mlp": 0.01040831, "balance_loss_clip": 1.04419136, "balance_loss_mlp": 1.02345037, "epoch": 0.5574460410028257, "flos": 18807029222400.0, "grad_norm": 15.540953335506947, "language_loss": 0.75981855, "learning_rate": 1.7258885090381717e-06, "loss": 0.78153217, "num_input_tokens_seen": 100009520, "step": 4636, "time_per_iteration": 2.622769832611084 }, { "auxiliary_loss_clip": 0.01119023, "auxiliary_loss_mlp": 0.01039055, "balance_loss_clip": 1.04499996, "balance_loss_mlp": 1.02313495, "epoch": 0.5575662838934649, "flos": 29642678530560.0, "grad_norm": 1.9664918303300056, "language_loss": 0.78226733, "learning_rate": 1.7251169090953213e-06, "loss": 0.80384809, "num_input_tokens_seen": 100029995, "step": 4637, "time_per_iteration": 2.7136805057525635 }, { "auxiliary_loss_clip": 0.01130103, "auxiliary_loss_mlp": 0.01043639, "balance_loss_clip": 1.04512787, "balance_loss_mlp": 1.02602029, "epoch": 0.5576865267841039, "flos": 22054466949120.0, "grad_norm": 6.686707660069957, "language_loss": 0.76096582, "learning_rate": 1.7243453508520168e-06, "loss": 0.78270328, "num_input_tokens_seen": 100046980, "step": 4638, "time_per_iteration": 2.6346800327301025 }, { "auxiliary_loss_clip": 0.01115955, "auxiliary_loss_mlp": 0.01038325, "balance_loss_clip": 1.04129112, "balance_loss_mlp": 1.02144527, "epoch": 0.557806769674743, "flos": 17196040241280.0, "grad_norm": 2.1912418721070757, "language_loss": 0.84435093, "learning_rate": 1.7235738344253038e-06, "loss": 0.86589372, "num_input_tokens_seen": 100060610, "step": 4639, "time_per_iteration": 2.5644586086273193 }, { "auxiliary_loss_clip": 0.01126664, "auxiliary_loss_mlp": 0.01044226, "balance_loss_clip": 1.04631448, "balance_loss_mlp": 1.02659488, "epoch": 0.557927012565382, "flos": 24712717887360.0, "grad_norm": 1.9096409745486018, "language_loss": 0.82572782, "learning_rate": 1.72280235993222e-06, "loss": 0.84743667, "num_input_tokens_seen": 100078915, "step": 4640, "time_per_iteration": 2.6242105960845947 }, { "auxiliary_loss_clip": 0.01126963, "auxiliary_loss_mlp": 0.0077375, "balance_loss_clip": 1.04431129, "balance_loss_mlp": 1.00029361, "epoch": 0.5580472554560212, "flos": 16983090460800.0, "grad_norm": 3.143111334557507, "language_loss": 0.69595486, "learning_rate": 1.722030927489798e-06, "loss": 0.71496201, "num_input_tokens_seen": 100096195, "step": 4641, "time_per_iteration": 3.5939254760742188 }, { "auxiliary_loss_clip": 0.01105694, "auxiliary_loss_mlp": 0.01044586, "balance_loss_clip": 1.04383254, "balance_loss_mlp": 1.02701509, "epoch": 0.5581674983466602, "flos": 23509100027520.0, "grad_norm": 1.7350382524677457, "language_loss": 0.74232543, "learning_rate": 1.7212595372150634e-06, "loss": 0.76382828, "num_input_tokens_seen": 100116175, "step": 4642, "time_per_iteration": 2.6979312896728516 }, { "auxiliary_loss_clip": 0.01141139, "auxiliary_loss_mlp": 0.0104218, "balance_loss_clip": 1.04663789, "balance_loss_mlp": 1.02478719, "epoch": 0.5582877412372993, "flos": 13480291969920.0, "grad_norm": 2.5419061484652534, "language_loss": 0.7296133, "learning_rate": 1.720488189225035e-06, "loss": 0.75144649, "num_input_tokens_seen": 100133875, "step": 4643, "time_per_iteration": 2.57407808303833 }, { "auxiliary_loss_clip": 0.01135014, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.04684389, "balance_loss_mlp": 1.02698565, "epoch": 0.5584079841279385, "flos": 21903605827200.0, "grad_norm": 2.425428444049804, "language_loss": 0.79397118, "learning_rate": 1.7197168836367265e-06, "loss": 0.81576324, "num_input_tokens_seen": 100150685, "step": 4644, "time_per_iteration": 2.657947540283203 }, { "auxiliary_loss_clip": 0.01126222, "auxiliary_loss_mlp": 0.00772691, "balance_loss_clip": 1.0428499, "balance_loss_mlp": 1.00024855, "epoch": 0.5585282270185775, "flos": 18843550375680.0, "grad_norm": 1.83959661233867, "language_loss": 0.8215428, "learning_rate": 1.7189456205671433e-06, "loss": 0.84053195, "num_input_tokens_seen": 100169530, "step": 4645, "time_per_iteration": 2.6288318634033203 }, { "auxiliary_loss_clip": 0.01138004, "auxiliary_loss_mlp": 0.01043736, "balance_loss_clip": 1.04847026, "balance_loss_mlp": 1.02538967, "epoch": 0.5586484699092166, "flos": 21868449390720.0, "grad_norm": 2.2131420475795776, "language_loss": 0.82699966, "learning_rate": 1.7181744001332866e-06, "loss": 0.84881705, "num_input_tokens_seen": 100188140, "step": 4646, "time_per_iteration": 3.8546433448791504 }, { "auxiliary_loss_clip": 0.01139585, "auxiliary_loss_mlp": 0.01045648, "balance_loss_clip": 1.046417, "balance_loss_mlp": 1.03006744, "epoch": 0.5587687127998557, "flos": 22893232412160.0, "grad_norm": 2.8806158334574326, "language_loss": 0.63146836, "learning_rate": 1.7174032224521493e-06, "loss": 0.65332067, "num_input_tokens_seen": 100206850, "step": 4647, "time_per_iteration": 2.635680913925171 }, { "auxiliary_loss_clip": 0.01123604, "auxiliary_loss_mlp": 0.01037802, "balance_loss_clip": 1.04301476, "balance_loss_mlp": 1.0213865, "epoch": 0.5588889556904948, "flos": 20303067703680.0, "grad_norm": 2.148220056981461, "language_loss": 0.69656545, "learning_rate": 1.7166320876407184e-06, "loss": 0.71817946, "num_input_tokens_seen": 100226270, "step": 4648, "time_per_iteration": 2.6043334007263184 }, { "auxiliary_loss_clip": 0.01146598, "auxiliary_loss_mlp": 0.00774662, "balance_loss_clip": 1.04926991, "balance_loss_mlp": 1.00027394, "epoch": 0.5590091985811338, "flos": 16472153450880.0, "grad_norm": 2.029456677855964, "language_loss": 0.67734998, "learning_rate": 1.7158609958159742e-06, "loss": 0.69656259, "num_input_tokens_seen": 100243675, "step": 4649, "time_per_iteration": 2.5555286407470703 }, { "auxiliary_loss_clip": 0.01087074, "auxiliary_loss_mlp": 0.01044305, "balance_loss_clip": 1.03998744, "balance_loss_mlp": 1.02542257, "epoch": 0.559129441471773, "flos": 14532186781440.0, "grad_norm": 2.2720140677468805, "language_loss": 0.78082967, "learning_rate": 1.7150899470948911e-06, "loss": 0.80214345, "num_input_tokens_seen": 100258940, "step": 4650, "time_per_iteration": 2.7170674800872803 }, { "auxiliary_loss_clip": 0.0102188, "auxiliary_loss_mlp": 0.01005143, "balance_loss_clip": 1.01269531, "balance_loss_mlp": 1.00308025, "epoch": 0.5592496843624121, "flos": 60521009852160.0, "grad_norm": 0.7938356876933765, "language_loss": 0.56681347, "learning_rate": 1.7143189415944365e-06, "loss": 0.5870837, "num_input_tokens_seen": 100323400, "step": 4651, "time_per_iteration": 3.2738380432128906 }, { "auxiliary_loss_clip": 0.01128169, "auxiliary_loss_mlp": 0.01042229, "balance_loss_clip": 1.04657531, "balance_loss_mlp": 1.02524185, "epoch": 0.5593699272530511, "flos": 20886256920960.0, "grad_norm": 1.7375981400018494, "language_loss": 0.76437068, "learning_rate": 1.7135479794315714e-06, "loss": 0.78607464, "num_input_tokens_seen": 100340355, "step": 4652, "time_per_iteration": 2.648588180541992 }, { "auxiliary_loss_clip": 0.01103403, "auxiliary_loss_mlp": 0.01038638, "balance_loss_clip": 1.04180086, "balance_loss_mlp": 1.02192473, "epoch": 0.5594901701436903, "flos": 12896743616640.0, "grad_norm": 2.7536998440652454, "language_loss": 0.7877925, "learning_rate": 1.7127770607232502e-06, "loss": 0.80921292, "num_input_tokens_seen": 100358900, "step": 4653, "time_per_iteration": 2.611191749572754 }, { "auxiliary_loss_clip": 0.01112206, "auxiliary_loss_mlp": 0.01048183, "balance_loss_clip": 1.04353809, "balance_loss_mlp": 1.03117156, "epoch": 0.5596104130343293, "flos": 23112107936640.0, "grad_norm": 2.6035030739584113, "language_loss": 0.7984339, "learning_rate": 1.7120061855864204e-06, "loss": 0.82003778, "num_input_tokens_seen": 100378910, "step": 4654, "time_per_iteration": 2.662505626678467 }, { "auxiliary_loss_clip": 0.01130318, "auxiliary_loss_mlp": 0.01043802, "balance_loss_clip": 1.04566503, "balance_loss_mlp": 1.02556276, "epoch": 0.5597306559249684, "flos": 25957812977280.0, "grad_norm": 1.9850297092492513, "language_loss": 0.70969987, "learning_rate": 1.7112353541380233e-06, "loss": 0.73144102, "num_input_tokens_seen": 100398770, "step": 4655, "time_per_iteration": 2.664783000946045 }, { "auxiliary_loss_clip": 0.0111557, "auxiliary_loss_mlp": 0.01047941, "balance_loss_clip": 1.04316115, "balance_loss_mlp": 1.02935624, "epoch": 0.5598508988156076, "flos": 22492289825280.0, "grad_norm": 1.4723335032730371, "language_loss": 0.71864176, "learning_rate": 1.7104645664949931e-06, "loss": 0.74027681, "num_input_tokens_seen": 100421240, "step": 4656, "time_per_iteration": 2.6654675006866455 }, { "auxiliary_loss_clip": 0.01116665, "auxiliary_loss_mlp": 0.0103861, "balance_loss_clip": 1.04242742, "balance_loss_mlp": 1.02225423, "epoch": 0.5599711417062466, "flos": 23112538899840.0, "grad_norm": 2.090413809244542, "language_loss": 0.71426618, "learning_rate": 1.7096938227742584e-06, "loss": 0.73581892, "num_input_tokens_seen": 100442370, "step": 4657, "time_per_iteration": 2.687753438949585 }, { "auxiliary_loss_clip": 0.01141638, "auxiliary_loss_mlp": 0.01044381, "balance_loss_clip": 1.04434049, "balance_loss_mlp": 1.02709568, "epoch": 0.5600913845968857, "flos": 22339345714560.0, "grad_norm": 2.081365545686936, "language_loss": 0.8459726, "learning_rate": 1.70892312309274e-06, "loss": 0.86783284, "num_input_tokens_seen": 100460260, "step": 4658, "time_per_iteration": 2.5857722759246826 }, { "auxiliary_loss_clip": 0.01115379, "auxiliary_loss_mlp": 0.01044338, "balance_loss_clip": 1.04049611, "balance_loss_mlp": 1.02659929, "epoch": 0.5602116274875248, "flos": 17633791290240.0, "grad_norm": 2.226796130164086, "language_loss": 0.68070865, "learning_rate": 1.7081524675673523e-06, "loss": 0.70230579, "num_input_tokens_seen": 100475750, "step": 4659, "time_per_iteration": 2.5768325328826904 }, { "auxiliary_loss_clip": 0.01025303, "auxiliary_loss_mlp": 0.01001768, "balance_loss_clip": 1.01308775, "balance_loss_mlp": 1.00002754, "epoch": 0.5603318703781639, "flos": 70115945529600.0, "grad_norm": 0.7686441835064777, "language_loss": 0.59555042, "learning_rate": 1.7073818563150026e-06, "loss": 0.61582112, "num_input_tokens_seen": 100537830, "step": 4660, "time_per_iteration": 4.290245294570923 }, { "auxiliary_loss_clip": 0.0112507, "auxiliary_loss_mlp": 0.01042264, "balance_loss_clip": 1.04382324, "balance_loss_mlp": 1.02413201, "epoch": 0.560452113268803, "flos": 18545850455040.0, "grad_norm": 2.4941831573601427, "language_loss": 0.86244649, "learning_rate": 1.7066112894525935e-06, "loss": 0.88411975, "num_input_tokens_seen": 100555910, "step": 4661, "time_per_iteration": 2.5915749073028564 }, { "auxiliary_loss_clip": 0.01108566, "auxiliary_loss_mlp": 0.01041385, "balance_loss_clip": 1.04153466, "balance_loss_mlp": 1.02247834, "epoch": 0.5605723561594421, "flos": 25264665250560.0, "grad_norm": 1.6584022961673965, "language_loss": 0.7292853, "learning_rate": 1.7058407670970177e-06, "loss": 0.75078475, "num_input_tokens_seen": 100577385, "step": 4662, "time_per_iteration": 3.653578758239746 }, { "auxiliary_loss_clip": 0.01134209, "auxiliary_loss_mlp": 0.01046937, "balance_loss_clip": 1.0454185, "balance_loss_mlp": 1.02663565, "epoch": 0.5606925990500812, "flos": 20594949621120.0, "grad_norm": 1.8850257007442615, "language_loss": 0.61390907, "learning_rate": 1.7050702893651643e-06, "loss": 0.63572055, "num_input_tokens_seen": 100596965, "step": 4663, "time_per_iteration": 2.62353777885437 }, { "auxiliary_loss_clip": 0.0113757, "auxiliary_loss_mlp": 0.01048103, "balance_loss_clip": 1.04992914, "balance_loss_mlp": 1.02995932, "epoch": 0.5608128419407202, "flos": 35006044677120.0, "grad_norm": 2.138740596807917, "language_loss": 0.7511822, "learning_rate": 1.7042998563739134e-06, "loss": 0.77303892, "num_input_tokens_seen": 100615315, "step": 4664, "time_per_iteration": 2.7359659671783447 }, { "auxiliary_loss_clip": 0.01123136, "auxiliary_loss_mlp": 0.01036617, "balance_loss_clip": 1.04280674, "balance_loss_mlp": 1.01872373, "epoch": 0.5609330848313594, "flos": 24639819235200.0, "grad_norm": 2.353895175940466, "language_loss": 0.7157737, "learning_rate": 1.703529468240139e-06, "loss": 0.73737121, "num_input_tokens_seen": 100634185, "step": 4665, "time_per_iteration": 2.702979564666748 }, { "auxiliary_loss_clip": 0.01111873, "auxiliary_loss_mlp": 0.0104421, "balance_loss_clip": 1.04465175, "balance_loss_mlp": 1.02804494, "epoch": 0.5610533277219985, "flos": 18762894385920.0, "grad_norm": 2.547978629431916, "language_loss": 0.73574543, "learning_rate": 1.7027591250807088e-06, "loss": 0.75730628, "num_input_tokens_seen": 100651360, "step": 4666, "time_per_iteration": 2.6098408699035645 }, { "auxiliary_loss_clip": 0.01143805, "auxiliary_loss_mlp": 0.0103933, "balance_loss_clip": 1.04687607, "balance_loss_mlp": 1.02169871, "epoch": 0.5611735706126375, "flos": 15012384727680.0, "grad_norm": 2.7926633164356702, "language_loss": 0.84528553, "learning_rate": 1.7019888270124825e-06, "loss": 0.86711687, "num_input_tokens_seen": 100668525, "step": 4667, "time_per_iteration": 3.5695154666900635 }, { "auxiliary_loss_clip": 0.0113058, "auxiliary_loss_mlp": 0.01046433, "balance_loss_clip": 1.04531109, "balance_loss_mlp": 1.02935052, "epoch": 0.5612938135032767, "flos": 16468167041280.0, "grad_norm": 1.7876075984934214, "language_loss": 0.81702626, "learning_rate": 1.7012185741523147e-06, "loss": 0.83879638, "num_input_tokens_seen": 100684850, "step": 4668, "time_per_iteration": 2.562394857406616 }, { "auxiliary_loss_clip": 0.01141376, "auxiliary_loss_mlp": 0.01050944, "balance_loss_clip": 1.04653478, "balance_loss_mlp": 1.03362238, "epoch": 0.5614140563939157, "flos": 25666433850240.0, "grad_norm": 2.7640929411137583, "language_loss": 0.62391055, "learning_rate": 1.7004483666170514e-06, "loss": 0.64583379, "num_input_tokens_seen": 100705345, "step": 4669, "time_per_iteration": 2.618288040161133 }, { "auxiliary_loss_clip": 0.01125055, "auxiliary_loss_mlp": 0.01041731, "balance_loss_clip": 1.04210114, "balance_loss_mlp": 1.02464795, "epoch": 0.5615342992845548, "flos": 24717566223360.0, "grad_norm": 1.9657732477089005, "language_loss": 0.80260146, "learning_rate": 1.699678204523533e-06, "loss": 0.82426929, "num_input_tokens_seen": 100725210, "step": 4670, "time_per_iteration": 2.6541991233825684 }, { "auxiliary_loss_clip": 0.01121325, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.04688036, "balance_loss_mlp": 1.02447915, "epoch": 0.5616545421751938, "flos": 22015934634240.0, "grad_norm": 2.9401787710148097, "language_loss": 0.68661565, "learning_rate": 1.6989080879885918e-06, "loss": 0.70824313, "num_input_tokens_seen": 100743070, "step": 4671, "time_per_iteration": 3.570889472961426 }, { "auxiliary_loss_clip": 0.01015783, "auxiliary_loss_mlp": 0.01002563, "balance_loss_clip": 1.01390982, "balance_loss_mlp": 1.00066721, "epoch": 0.561774785065833, "flos": 53760358690560.0, "grad_norm": 0.9554661955133884, "language_loss": 0.61126947, "learning_rate": 1.6981380171290544e-06, "loss": 0.63145292, "num_input_tokens_seen": 100804095, "step": 4672, "time_per_iteration": 3.2239060401916504 }, { "auxiliary_loss_clip": 0.01110039, "auxiliary_loss_mlp": 0.01043578, "balance_loss_clip": 1.04097342, "balance_loss_mlp": 1.02612567, "epoch": 0.5618950279564721, "flos": 19750007018880.0, "grad_norm": 2.0305485797911316, "language_loss": 0.743572, "learning_rate": 1.6973679920617396e-06, "loss": 0.76510823, "num_input_tokens_seen": 100821630, "step": 4673, "time_per_iteration": 2.6300101280212402 }, { "auxiliary_loss_clip": 0.01117204, "auxiliary_loss_mlp": 0.01043342, "balance_loss_clip": 1.04324484, "balance_loss_mlp": 1.02543736, "epoch": 0.5620152708471111, "flos": 16800592435200.0, "grad_norm": 2.072255395457027, "language_loss": 0.84928459, "learning_rate": 1.6965980129034603e-06, "loss": 0.87089002, "num_input_tokens_seen": 100839015, "step": 4674, "time_per_iteration": 2.624687433242798 }, { "auxiliary_loss_clip": 0.0111885, "auxiliary_loss_mlp": 0.01046045, "balance_loss_clip": 1.04495943, "balance_loss_mlp": 1.02875948, "epoch": 0.5621355137377503, "flos": 26797799502720.0, "grad_norm": 1.5819211114762661, "language_loss": 0.7647354, "learning_rate": 1.6958280797710209e-06, "loss": 0.78638434, "num_input_tokens_seen": 100860940, "step": 4675, "time_per_iteration": 2.698788642883301 }, { "auxiliary_loss_clip": 0.01027119, "auxiliary_loss_mlp": 0.01000317, "balance_loss_clip": 1.01548946, "balance_loss_mlp": 0.9985528, "epoch": 0.5622557566283893, "flos": 61207046686080.0, "grad_norm": 0.7192358844911644, "language_loss": 0.54691446, "learning_rate": 1.6950581927812198e-06, "loss": 0.56718886, "num_input_tokens_seen": 100920510, "step": 4676, "time_per_iteration": 3.1240530014038086 }, { "auxiliary_loss_clip": 0.01131679, "auxiliary_loss_mlp": 0.01042524, "balance_loss_clip": 1.04766321, "balance_loss_mlp": 1.02601385, "epoch": 0.5623759995190284, "flos": 26468534505600.0, "grad_norm": 2.4696256631836357, "language_loss": 0.79049927, "learning_rate": 1.6942883520508486e-06, "loss": 0.81224132, "num_input_tokens_seen": 100939245, "step": 4677, "time_per_iteration": 2.63027286529541 }, { "auxiliary_loss_clip": 0.01131533, "auxiliary_loss_mlp": 0.01042124, "balance_loss_clip": 1.04622781, "balance_loss_mlp": 1.02566707, "epoch": 0.5624962424096676, "flos": 19390900798080.0, "grad_norm": 2.6424494199861073, "language_loss": 0.76605541, "learning_rate": 1.693518557696691e-06, "loss": 0.78779197, "num_input_tokens_seen": 100958385, "step": 4678, "time_per_iteration": 2.586625814437866 }, { "auxiliary_loss_clip": 0.01124972, "auxiliary_loss_mlp": 0.01041723, "balance_loss_clip": 1.04212272, "balance_loss_mlp": 1.02443743, "epoch": 0.5626164853003066, "flos": 20667345482880.0, "grad_norm": 2.7143737548831823, "language_loss": 0.89440393, "learning_rate": 1.6927488098355252e-06, "loss": 0.91607082, "num_input_tokens_seen": 100976015, "step": 4679, "time_per_iteration": 2.75842022895813 }, { "auxiliary_loss_clip": 0.01011625, "auxiliary_loss_mlp": 0.0100282, "balance_loss_clip": 1.0155766, "balance_loss_mlp": 1.00117481, "epoch": 0.5627367281909457, "flos": 62766071665920.0, "grad_norm": 0.9087264108276619, "language_loss": 0.63261032, "learning_rate": 1.6919791085841201e-06, "loss": 0.65275478, "num_input_tokens_seen": 101033425, "step": 4680, "time_per_iteration": 3.225668430328369 }, { "auxiliary_loss_clip": 0.0112251, "auxiliary_loss_mlp": 0.01057941, "balance_loss_clip": 1.0417974, "balance_loss_mlp": 1.03806853, "epoch": 0.5628569710815848, "flos": 12787144243200.0, "grad_norm": 2.37017252136621, "language_loss": 0.78696841, "learning_rate": 1.6912094540592396e-06, "loss": 0.80877292, "num_input_tokens_seen": 101048945, "step": 4681, "time_per_iteration": 2.5888521671295166 }, { "auxiliary_loss_clip": 0.01133718, "auxiliary_loss_mlp": 0.01049711, "balance_loss_clip": 1.04712903, "balance_loss_mlp": 1.03205609, "epoch": 0.5629772139722239, "flos": 13762082165760.0, "grad_norm": 2.6410951675207737, "language_loss": 0.82038593, "learning_rate": 1.6904398463776393e-06, "loss": 0.84222025, "num_input_tokens_seen": 101062745, "step": 4682, "time_per_iteration": 2.580671548843384 }, { "auxiliary_loss_clip": 0.01131098, "auxiliary_loss_mlp": 0.01044087, "balance_loss_clip": 1.04624069, "balance_loss_mlp": 1.02738583, "epoch": 0.5630974568628629, "flos": 21467830026240.0, "grad_norm": 1.7494522432269044, "language_loss": 0.72460377, "learning_rate": 1.6896702856560683e-06, "loss": 0.74635553, "num_input_tokens_seen": 101081840, "step": 4683, "time_per_iteration": 2.6222779750823975 }, { "auxiliary_loss_clip": 0.0109933, "auxiliary_loss_mlp": 0.01041001, "balance_loss_clip": 1.03797174, "balance_loss_mlp": 1.02468157, "epoch": 0.5632176997535021, "flos": 14245907385600.0, "grad_norm": 14.129072509919292, "language_loss": 0.69269288, "learning_rate": 1.6889007720112677e-06, "loss": 0.71409619, "num_input_tokens_seen": 101099585, "step": 4684, "time_per_iteration": 2.6410768032073975 }, { "auxiliary_loss_clip": 0.01133629, "auxiliary_loss_mlp": 0.01040576, "balance_loss_clip": 1.04726577, "balance_loss_mlp": 1.0236361, "epoch": 0.5633379426441412, "flos": 20812244947200.0, "grad_norm": 1.6288103329513413, "language_loss": 0.77061635, "learning_rate": 1.6881313055599734e-06, "loss": 0.7923584, "num_input_tokens_seen": 101119515, "step": 4685, "time_per_iteration": 2.618035316467285 }, { "auxiliary_loss_clip": 0.01107239, "auxiliary_loss_mlp": 0.01052923, "balance_loss_clip": 1.04054427, "balance_loss_mlp": 1.03311014, "epoch": 0.5634581855347802, "flos": 22600883617920.0, "grad_norm": 2.657057047048434, "language_loss": 0.82030374, "learning_rate": 1.6873618864189117e-06, "loss": 0.84190536, "num_input_tokens_seen": 101135285, "step": 4686, "time_per_iteration": 2.642174482345581 }, { "auxiliary_loss_clip": 0.01132066, "auxiliary_loss_mlp": 0.01040907, "balance_loss_clip": 1.04466105, "balance_loss_mlp": 1.02329946, "epoch": 0.5635784284254194, "flos": 21506972872320.0, "grad_norm": 2.408955924750475, "language_loss": 0.78306901, "learning_rate": 1.686592514704803e-06, "loss": 0.80479872, "num_input_tokens_seen": 101152680, "step": 4687, "time_per_iteration": 4.532764434814453 }, { "auxiliary_loss_clip": 0.0111897, "auxiliary_loss_mlp": 0.01039769, "balance_loss_clip": 1.04690468, "balance_loss_mlp": 1.02431929, "epoch": 0.5636986713160584, "flos": 19827466698240.0, "grad_norm": 3.1682629045193, "language_loss": 0.70646179, "learning_rate": 1.685823190534361e-06, "loss": 0.72804916, "num_input_tokens_seen": 101170920, "step": 4688, "time_per_iteration": 2.6660690307617188 }, { "auxiliary_loss_clip": 0.01148244, "auxiliary_loss_mlp": 0.01041326, "balance_loss_clip": 1.04832077, "balance_loss_mlp": 1.02345657, "epoch": 0.5638189142066975, "flos": 19792453916160.0, "grad_norm": 1.773171741535132, "language_loss": 0.84229064, "learning_rate": 1.6850539140242907e-06, "loss": 0.86418641, "num_input_tokens_seen": 101190180, "step": 4689, "time_per_iteration": 2.551103353500366 }, { "auxiliary_loss_clip": 0.01131911, "auxiliary_loss_mlp": 0.01044432, "balance_loss_clip": 1.04492068, "balance_loss_mlp": 1.02678943, "epoch": 0.5639391570973367, "flos": 22893771116160.0, "grad_norm": 2.7205963911523456, "language_loss": 0.82299685, "learning_rate": 1.684284685291292e-06, "loss": 0.84476036, "num_input_tokens_seen": 101211825, "step": 4690, "time_per_iteration": 2.700096607208252 }, { "auxiliary_loss_clip": 0.01143993, "auxiliary_loss_mlp": 0.01044122, "balance_loss_clip": 1.04724622, "balance_loss_mlp": 1.02589452, "epoch": 0.5640593999879757, "flos": 23727077712000.0, "grad_norm": 2.194195914606469, "language_loss": 0.81079179, "learning_rate": 1.683515504452055e-06, "loss": 0.83267295, "num_input_tokens_seen": 101229200, "step": 4691, "time_per_iteration": 2.606811761856079 }, { "auxiliary_loss_clip": 0.01095039, "auxiliary_loss_mlp": 0.01050253, "balance_loss_clip": 1.03824234, "balance_loss_mlp": 1.03246713, "epoch": 0.5641796428786148, "flos": 22710123855360.0, "grad_norm": 1.6368826037829374, "language_loss": 0.666067, "learning_rate": 1.6827463716232648e-06, "loss": 0.68751991, "num_input_tokens_seen": 101249860, "step": 4692, "time_per_iteration": 2.7292020320892334 }, { "auxiliary_loss_clip": 0.01130817, "auxiliary_loss_mlp": 0.00773697, "balance_loss_clip": 1.04465842, "balance_loss_mlp": 1.00037444, "epoch": 0.5642998857692539, "flos": 19791987039360.0, "grad_norm": 1.682964643703322, "language_loss": 0.75507122, "learning_rate": 1.6819772869215972e-06, "loss": 0.7741164, "num_input_tokens_seen": 101268940, "step": 4693, "time_per_iteration": 3.568588972091675 }, { "auxiliary_loss_clip": 0.01124572, "auxiliary_loss_mlp": 0.01038607, "balance_loss_clip": 1.04662287, "balance_loss_mlp": 1.02129793, "epoch": 0.564420128659893, "flos": 23185904428800.0, "grad_norm": 1.700412398554855, "language_loss": 0.8207705, "learning_rate": 1.6812082504637228e-06, "loss": 0.84240234, "num_input_tokens_seen": 101290260, "step": 4694, "time_per_iteration": 2.6972124576568604 }, { "auxiliary_loss_clip": 0.01127612, "auxiliary_loss_mlp": 0.01047043, "balance_loss_clip": 1.04425359, "balance_loss_mlp": 1.03005004, "epoch": 0.564540371550532, "flos": 23258264376960.0, "grad_norm": 2.0331875709069487, "language_loss": 0.74366224, "learning_rate": 1.6804392623663025e-06, "loss": 0.76540881, "num_input_tokens_seen": 101311465, "step": 4695, "time_per_iteration": 2.6485893726348877 }, { "auxiliary_loss_clip": 0.01125597, "auxiliary_loss_mlp": 0.0103363, "balance_loss_clip": 1.04523325, "balance_loss_mlp": 1.01803732, "epoch": 0.5646606144411712, "flos": 25010058672000.0, "grad_norm": 2.5829468690473267, "language_loss": 0.78271341, "learning_rate": 1.6796703227459935e-06, "loss": 0.80430567, "num_input_tokens_seen": 101329420, "step": 4696, "time_per_iteration": 2.6383090019226074 }, { "auxiliary_loss_clip": 0.01084037, "auxiliary_loss_mlp": 0.01053771, "balance_loss_clip": 1.03742731, "balance_loss_mlp": 1.03550792, "epoch": 0.5647808573318103, "flos": 36539645806080.0, "grad_norm": 3.08717216917619, "language_loss": 0.76164341, "learning_rate": 1.6789014317194407e-06, "loss": 0.78302145, "num_input_tokens_seen": 101350900, "step": 4697, "time_per_iteration": 3.7462120056152344 }, { "auxiliary_loss_clip": 0.0112356, "auxiliary_loss_mlp": 0.01051788, "balance_loss_clip": 1.04530871, "balance_loss_mlp": 1.03358436, "epoch": 0.5649011002224493, "flos": 22528451842560.0, "grad_norm": 2.7679825924085932, "language_loss": 0.73097968, "learning_rate": 1.6781325894032853e-06, "loss": 0.75273317, "num_input_tokens_seen": 101369860, "step": 4698, "time_per_iteration": 2.6706125736236572 }, { "auxiliary_loss_clip": 0.01114099, "auxiliary_loss_mlp": 0.01045007, "balance_loss_clip": 1.04607892, "balance_loss_mlp": 1.02763855, "epoch": 0.5650213431130885, "flos": 18515147304960.0, "grad_norm": 2.2515947773453933, "language_loss": 0.91939795, "learning_rate": 1.6773637959141608e-06, "loss": 0.94098896, "num_input_tokens_seen": 101386835, "step": 4699, "time_per_iteration": 2.6371042728424072 }, { "auxiliary_loss_clip": 0.01109625, "auxiliary_loss_mlp": 0.01055264, "balance_loss_clip": 1.04335797, "balance_loss_mlp": 1.03684592, "epoch": 0.5651415860037275, "flos": 17526310819200.0, "grad_norm": 2.9761818945859013, "language_loss": 0.66581613, "learning_rate": 1.6765950513686915e-06, "loss": 0.68746501, "num_input_tokens_seen": 101404945, "step": 4700, "time_per_iteration": 2.6583313941955566 }, { "auxiliary_loss_clip": 0.01093769, "auxiliary_loss_mlp": 0.0104352, "balance_loss_clip": 1.0395472, "balance_loss_mlp": 1.02495944, "epoch": 0.5652618288943666, "flos": 25520026014720.0, "grad_norm": 1.8992858933859502, "language_loss": 0.7638067, "learning_rate": 1.675826355883496e-06, "loss": 0.78517956, "num_input_tokens_seen": 101424160, "step": 4701, "time_per_iteration": 2.7200875282287598 }, { "auxiliary_loss_clip": 0.0111291, "auxiliary_loss_mlp": 0.01045686, "balance_loss_clip": 1.04528141, "balance_loss_mlp": 1.02835321, "epoch": 0.5653820717850057, "flos": 19683105937920.0, "grad_norm": 1.8877230811917491, "language_loss": 0.79201579, "learning_rate": 1.6750577095751848e-06, "loss": 0.81360173, "num_input_tokens_seen": 101443270, "step": 4702, "time_per_iteration": 2.6332943439483643 }, { "auxiliary_loss_clip": 0.01140023, "auxiliary_loss_mlp": 0.01039352, "balance_loss_clip": 1.0467155, "balance_loss_mlp": 1.02253187, "epoch": 0.5655023146756448, "flos": 26979722910720.0, "grad_norm": 1.6832474345455657, "language_loss": 0.72839332, "learning_rate": 1.6742891125603605e-06, "loss": 0.75018704, "num_input_tokens_seen": 101464175, "step": 4703, "time_per_iteration": 2.6281352043151855 }, { "auxiliary_loss_clip": 0.01130546, "auxiliary_loss_mlp": 0.01039986, "balance_loss_clip": 1.04536784, "balance_loss_mlp": 1.02177095, "epoch": 0.5656225575662839, "flos": 27669351104640.0, "grad_norm": 1.7169502004068464, "language_loss": 0.71923405, "learning_rate": 1.6735205649556185e-06, "loss": 0.74093938, "num_input_tokens_seen": 101484045, "step": 4704, "time_per_iteration": 2.665210008621216 }, { "auxiliary_loss_clip": 0.01107291, "auxiliary_loss_mlp": 0.01052065, "balance_loss_clip": 1.04142499, "balance_loss_mlp": 1.03396881, "epoch": 0.5657428004569229, "flos": 24349732997760.0, "grad_norm": 1.710542607948951, "language_loss": 0.84926462, "learning_rate": 1.6727520668775476e-06, "loss": 0.87085819, "num_input_tokens_seen": 101504330, "step": 4705, "time_per_iteration": 2.736081600189209 }, { "auxiliary_loss_clip": 0.01144442, "auxiliary_loss_mlp": 0.01049612, "balance_loss_clip": 1.04669392, "balance_loss_mlp": 1.03175402, "epoch": 0.5658630433475621, "flos": 21944041562880.0, "grad_norm": 2.5353356641935774, "language_loss": 0.75412703, "learning_rate": 1.6719836184427275e-06, "loss": 0.77606761, "num_input_tokens_seen": 101524635, "step": 4706, "time_per_iteration": 2.623143196105957 }, { "auxiliary_loss_clip": 0.0111218, "auxiliary_loss_mlp": 0.01044707, "balance_loss_clip": 1.0404079, "balance_loss_mlp": 1.02730215, "epoch": 0.5659832862382012, "flos": 30409012218240.0, "grad_norm": 2.039682545336722, "language_loss": 0.64541936, "learning_rate": 1.671215219767733e-06, "loss": 0.66698825, "num_input_tokens_seen": 101544095, "step": 4707, "time_per_iteration": 2.7110941410064697 }, { "auxiliary_loss_clip": 0.01094937, "auxiliary_loss_mlp": 0.01048676, "balance_loss_clip": 1.04377174, "balance_loss_mlp": 1.03110492, "epoch": 0.5661035291288402, "flos": 13188194570880.0, "grad_norm": 11.312264063371744, "language_loss": 0.76776797, "learning_rate": 1.670446870969127e-06, "loss": 0.78920412, "num_input_tokens_seen": 101561760, "step": 4708, "time_per_iteration": 2.724350690841675 }, { "auxiliary_loss_clip": 0.01117526, "auxiliary_loss_mlp": 0.01048489, "balance_loss_clip": 1.04236162, "balance_loss_mlp": 1.03097701, "epoch": 0.5662237720194794, "flos": 16143032108160.0, "grad_norm": 2.5222193370896493, "language_loss": 0.8020798, "learning_rate": 1.6696785721634685e-06, "loss": 0.82374001, "num_input_tokens_seen": 101576245, "step": 4709, "time_per_iteration": 2.631150007247925 }, { "auxiliary_loss_clip": 0.01131105, "auxiliary_loss_mlp": 0.01048888, "balance_loss_clip": 1.0446173, "balance_loss_mlp": 1.03043461, "epoch": 0.5663440149101184, "flos": 17676848718720.0, "grad_norm": 2.687359339861887, "language_loss": 0.737378, "learning_rate": 1.6689103234673086e-06, "loss": 0.75917792, "num_input_tokens_seen": 101594565, "step": 4710, "time_per_iteration": 2.584243059158325 }, { "auxiliary_loss_clip": 0.01118619, "auxiliary_loss_mlp": 0.01041614, "balance_loss_clip": 1.04617429, "balance_loss_mlp": 1.02429342, "epoch": 0.5664642578007575, "flos": 23368330627200.0, "grad_norm": 2.02036945939648, "language_loss": 0.77092564, "learning_rate": 1.668142124997189e-06, "loss": 0.79252797, "num_input_tokens_seen": 101614225, "step": 4711, "time_per_iteration": 2.673837184906006 }, { "auxiliary_loss_clip": 0.01023927, "auxiliary_loss_mlp": 0.01004967, "balance_loss_clip": 1.0179143, "balance_loss_mlp": 1.00314355, "epoch": 0.5665845006913967, "flos": 65516470945920.0, "grad_norm": 0.7194410912301893, "language_loss": 0.59723085, "learning_rate": 1.6673739768696453e-06, "loss": 0.61751986, "num_input_tokens_seen": 101680795, "step": 4712, "time_per_iteration": 4.241714954376221 }, { "auxiliary_loss_clip": 0.01121251, "auxiliary_loss_mlp": 0.01050116, "balance_loss_clip": 1.04154134, "balance_loss_mlp": 1.03178132, "epoch": 0.5667047435820357, "flos": 26140885620480.0, "grad_norm": 1.7219894001648184, "language_loss": 0.77673602, "learning_rate": 1.6666058792012052e-06, "loss": 0.7984497, "num_input_tokens_seen": 101701680, "step": 4713, "time_per_iteration": 3.6554341316223145 }, { "auxiliary_loss_clip": 0.0103506, "auxiliary_loss_mlp": 0.0100473, "balance_loss_clip": 1.01328766, "balance_loss_mlp": 1.00288188, "epoch": 0.5668249864726748, "flos": 71866949725440.0, "grad_norm": 0.8958943768191113, "language_loss": 0.68777871, "learning_rate": 1.6658378321083878e-06, "loss": 0.70817661, "num_input_tokens_seen": 101766010, "step": 4714, "time_per_iteration": 3.2244040966033936 }, { "auxiliary_loss_clip": 0.0107856, "auxiliary_loss_mlp": 0.01043115, "balance_loss_clip": 1.03764677, "balance_loss_mlp": 1.02654505, "epoch": 0.5669452293633139, "flos": 22195667312640.0, "grad_norm": 1.668375748757426, "language_loss": 0.8267343, "learning_rate": 1.6650698357077055e-06, "loss": 0.84795105, "num_input_tokens_seen": 101783055, "step": 4715, "time_per_iteration": 2.6895503997802734 }, { "auxiliary_loss_clip": 0.01121528, "auxiliary_loss_mlp": 0.01043664, "balance_loss_clip": 1.04405904, "balance_loss_mlp": 1.0264976, "epoch": 0.567065472253953, "flos": 18223193560320.0, "grad_norm": 2.62175226909418, "language_loss": 0.80983156, "learning_rate": 1.6643018901156632e-06, "loss": 0.83148348, "num_input_tokens_seen": 101802150, "step": 4716, "time_per_iteration": 2.633458375930786 }, { "auxiliary_loss_clip": 0.01120215, "auxiliary_loss_mlp": 0.01047336, "balance_loss_clip": 1.0437274, "balance_loss_mlp": 1.03058767, "epoch": 0.567185715144592, "flos": 20371548983040.0, "grad_norm": 3.088922227861594, "language_loss": 0.79537243, "learning_rate": 1.6635339954487566e-06, "loss": 0.81704795, "num_input_tokens_seen": 101818025, "step": 4717, "time_per_iteration": 2.6267807483673096 }, { "auxiliary_loss_clip": 0.01123849, "auxiliary_loss_mlp": 0.01046205, "balance_loss_clip": 1.04469049, "balance_loss_mlp": 1.02639246, "epoch": 0.5673059580352312, "flos": 23221348174080.0, "grad_norm": 2.0234379025099374, "language_loss": 0.81980884, "learning_rate": 1.6627661518234765e-06, "loss": 0.84150934, "num_input_tokens_seen": 101837280, "step": 4718, "time_per_iteration": 2.659724473953247 }, { "auxiliary_loss_clip": 0.01095141, "auxiliary_loss_mlp": 0.01047155, "balance_loss_clip": 1.04396892, "balance_loss_mlp": 1.03042984, "epoch": 0.5674262009258703, "flos": 21719599430400.0, "grad_norm": 1.9892129142278723, "language_loss": 0.85629249, "learning_rate": 1.661998359356302e-06, "loss": 0.87771547, "num_input_tokens_seen": 101856310, "step": 4719, "time_per_iteration": 2.7299787998199463 }, { "auxiliary_loss_clip": 0.01042745, "auxiliary_loss_mlp": 0.01000298, "balance_loss_clip": 1.01262903, "balance_loss_mlp": 0.99849796, "epoch": 0.5675464438165093, "flos": 67470369114240.0, "grad_norm": 0.7530305218083386, "language_loss": 0.55784279, "learning_rate": 1.6612306181637077e-06, "loss": 0.57827324, "num_input_tokens_seen": 101915635, "step": 4720, "time_per_iteration": 4.1683759689331055 }, { "auxiliary_loss_clip": 0.01105708, "auxiliary_loss_mlp": 0.0104822, "balance_loss_clip": 1.04313266, "balance_loss_mlp": 1.03240108, "epoch": 0.5676666867071485, "flos": 18879173688960.0, "grad_norm": 2.280926421848017, "language_loss": 0.6573351, "learning_rate": 1.6604629283621598e-06, "loss": 0.67887437, "num_input_tokens_seen": 101933565, "step": 4721, "time_per_iteration": 2.7113149166107178 }, { "auxiliary_loss_clip": 0.01145809, "auxiliary_loss_mlp": 0.01044505, "balance_loss_clip": 1.04823303, "balance_loss_mlp": 1.02593195, "epoch": 0.5677869295977875, "flos": 33546778744320.0, "grad_norm": 1.660941414251342, "language_loss": 0.74892998, "learning_rate": 1.6596952900681152e-06, "loss": 0.77083302, "num_input_tokens_seen": 101954325, "step": 4722, "time_per_iteration": 2.6678173542022705 }, { "auxiliary_loss_clip": 0.01082944, "auxiliary_loss_mlp": 0.01048538, "balance_loss_clip": 1.04278851, "balance_loss_mlp": 1.02921426, "epoch": 0.5679071724884266, "flos": 28037256157440.0, "grad_norm": 2.2524700158396285, "language_loss": 0.81843269, "learning_rate": 1.658927703398025e-06, "loss": 0.83974755, "num_input_tokens_seen": 101974390, "step": 4723, "time_per_iteration": 3.697638511657715 }, { "auxiliary_loss_clip": 0.01093328, "auxiliary_loss_mlp": 0.01043508, "balance_loss_clip": 1.0377543, "balance_loss_mlp": 1.02593684, "epoch": 0.5680274153790658, "flos": 23550110380800.0, "grad_norm": 2.1520878596770205, "language_loss": 0.77660298, "learning_rate": 1.6581601684683309e-06, "loss": 0.79797137, "num_input_tokens_seen": 101994815, "step": 4724, "time_per_iteration": 2.7505571842193604 }, { "auxiliary_loss_clip": 0.01135066, "auxiliary_loss_mlp": 0.0104766, "balance_loss_clip": 1.04699397, "balance_loss_mlp": 1.03008819, "epoch": 0.5681476582697048, "flos": 22455158140800.0, "grad_norm": 2.806638759107687, "language_loss": 0.68577743, "learning_rate": 1.6573926853954674e-06, "loss": 0.70760471, "num_input_tokens_seen": 102012400, "step": 4725, "time_per_iteration": 2.621429681777954 }, { "auxiliary_loss_clip": 0.01110255, "auxiliary_loss_mlp": 0.01041505, "balance_loss_clip": 1.04120278, "balance_loss_mlp": 1.02518511, "epoch": 0.5682679011603439, "flos": 19536913584000.0, "grad_norm": 2.030762708411655, "language_loss": 0.82738256, "learning_rate": 1.6566252542958608e-06, "loss": 0.84890014, "num_input_tokens_seen": 102031900, "step": 4726, "time_per_iteration": 2.651181936264038 }, { "auxiliary_loss_clip": 0.01102121, "auxiliary_loss_mlp": 0.01052529, "balance_loss_clip": 1.04206753, "balance_loss_mlp": 1.03498185, "epoch": 0.568388144050983, "flos": 28765488493440.0, "grad_norm": 1.8483953389295098, "language_loss": 0.78462219, "learning_rate": 1.6558578752859305e-06, "loss": 0.80616874, "num_input_tokens_seen": 102050860, "step": 4727, "time_per_iteration": 2.7327964305877686 }, { "auxiliary_loss_clip": 0.01103936, "auxiliary_loss_mlp": 0.01045002, "balance_loss_clip": 1.04172349, "balance_loss_mlp": 1.02599978, "epoch": 0.5685083869416221, "flos": 21209452519680.0, "grad_norm": 2.054527466286241, "language_loss": 0.78689045, "learning_rate": 1.6550905484820865e-06, "loss": 0.80837977, "num_input_tokens_seen": 102069320, "step": 4728, "time_per_iteration": 2.6977055072784424 }, { "auxiliary_loss_clip": 0.01143336, "auxiliary_loss_mlp": 0.01046566, "balance_loss_clip": 1.04528689, "balance_loss_mlp": 1.02808857, "epoch": 0.5686286298322611, "flos": 24827021942400.0, "grad_norm": 2.7775731905345764, "language_loss": 0.78721511, "learning_rate": 1.6543232740007328e-06, "loss": 0.8091141, "num_input_tokens_seen": 102086435, "step": 4729, "time_per_iteration": 2.6264023780822754 }, { "auxiliary_loss_clip": 0.01133304, "auxiliary_loss_mlp": 0.01041553, "balance_loss_clip": 1.04678535, "balance_loss_mlp": 1.02326679, "epoch": 0.5687488727229003, "flos": 26615121909120.0, "grad_norm": 2.512098245932955, "language_loss": 0.66715395, "learning_rate": 1.653556051958263e-06, "loss": 0.6889025, "num_input_tokens_seen": 102106115, "step": 4730, "time_per_iteration": 2.635820150375366 }, { "auxiliary_loss_clip": 0.01066719, "auxiliary_loss_mlp": 0.01057585, "balance_loss_clip": 1.03858161, "balance_loss_mlp": 1.03787935, "epoch": 0.5688691156135394, "flos": 20808725414400.0, "grad_norm": 2.176195647527071, "language_loss": 0.73689151, "learning_rate": 1.6527888824710642e-06, "loss": 0.7581346, "num_input_tokens_seen": 102125715, "step": 4731, "time_per_iteration": 2.727269172668457 }, { "auxiliary_loss_clip": 0.01102792, "auxiliary_loss_mlp": 0.01039795, "balance_loss_clip": 1.04192448, "balance_loss_mlp": 1.02067435, "epoch": 0.5689893585041784, "flos": 25880963829120.0, "grad_norm": 2.35948708041387, "language_loss": 0.77027017, "learning_rate": 1.6520217656555166e-06, "loss": 0.79169601, "num_input_tokens_seen": 102145005, "step": 4732, "time_per_iteration": 2.7306571006774902 }, { "auxiliary_loss_clip": 0.0110643, "auxiliary_loss_mlp": 0.01047071, "balance_loss_clip": 1.04309964, "balance_loss_mlp": 1.0285579, "epoch": 0.5691096013948175, "flos": 23477463123840.0, "grad_norm": 1.8744488416392904, "language_loss": 0.70649314, "learning_rate": 1.65125470162799e-06, "loss": 0.72802818, "num_input_tokens_seen": 102165360, "step": 4733, "time_per_iteration": 2.666562080383301 }, { "auxiliary_loss_clip": 0.01108925, "auxiliary_loss_mlp": 0.01038663, "balance_loss_clip": 1.04084587, "balance_loss_mlp": 1.0215925, "epoch": 0.5692298442854566, "flos": 18075600576000.0, "grad_norm": 3.323506776931414, "language_loss": 0.69714421, "learning_rate": 1.6504876905048485e-06, "loss": 0.71862012, "num_input_tokens_seen": 102182320, "step": 4734, "time_per_iteration": 2.7025885581970215 }, { "auxiliary_loss_clip": 0.01139179, "auxiliary_loss_mlp": 0.01042086, "balance_loss_clip": 1.04632664, "balance_loss_mlp": 1.02580154, "epoch": 0.5693500871760957, "flos": 23039317025280.0, "grad_norm": 2.0056908937538775, "language_loss": 0.7189163, "learning_rate": 1.6497207324024464e-06, "loss": 0.74072897, "num_input_tokens_seen": 102201220, "step": 4735, "time_per_iteration": 2.615292549133301 }, { "auxiliary_loss_clip": 0.01122233, "auxiliary_loss_mlp": 0.01046452, "balance_loss_clip": 1.04407096, "balance_loss_mlp": 1.02920294, "epoch": 0.5694703300667348, "flos": 18989670902400.0, "grad_norm": 1.9617997963786296, "language_loss": 0.82671338, "learning_rate": 1.6489538274371305e-06, "loss": 0.84840024, "num_input_tokens_seen": 102219825, "step": 4736, "time_per_iteration": 2.622025728225708 }, { "auxiliary_loss_clip": 0.01123702, "auxiliary_loss_mlp": 0.0103851, "balance_loss_clip": 1.04565132, "balance_loss_mlp": 1.02244079, "epoch": 0.5695905729573739, "flos": 21908705558400.0, "grad_norm": 3.237944358003051, "language_loss": 0.83294731, "learning_rate": 1.6481869757252396e-06, "loss": 0.85456944, "num_input_tokens_seen": 102238160, "step": 4737, "time_per_iteration": 2.6255288124084473 }, { "auxiliary_loss_clip": 0.01130937, "auxiliary_loss_mlp": 0.01038747, "balance_loss_clip": 1.04683781, "balance_loss_mlp": 1.02235603, "epoch": 0.569710815848013, "flos": 28476659232000.0, "grad_norm": 1.559585865292168, "language_loss": 0.71894789, "learning_rate": 1.647420177383105e-06, "loss": 0.74064475, "num_input_tokens_seen": 102261030, "step": 4738, "time_per_iteration": 3.554711103439331 }, { "auxiliary_loss_clip": 0.01126211, "auxiliary_loss_mlp": 0.01055389, "balance_loss_clip": 1.04670572, "balance_loss_mlp": 1.03787756, "epoch": 0.569831058738652, "flos": 28366162018560.0, "grad_norm": 1.7948041536551038, "language_loss": 0.72676504, "learning_rate": 1.646653432527049e-06, "loss": 0.74858099, "num_input_tokens_seen": 102281670, "step": 4739, "time_per_iteration": 2.6803131103515625 }, { "auxiliary_loss_clip": 0.0111017, "auxiliary_loss_mlp": 0.01043111, "balance_loss_clip": 1.04631877, "balance_loss_mlp": 1.02515829, "epoch": 0.5699513016292912, "flos": 25849973370240.0, "grad_norm": 1.6798516794881342, "language_loss": 0.7460494, "learning_rate": 1.645886741273387e-06, "loss": 0.76758218, "num_input_tokens_seen": 102303485, "step": 4740, "time_per_iteration": 3.686410903930664 }, { "auxiliary_loss_clip": 0.01099549, "auxiliary_loss_mlp": 0.01042237, "balance_loss_clip": 1.04478312, "balance_loss_mlp": 1.02487993, "epoch": 0.5700715445199303, "flos": 18037858360320.0, "grad_norm": 1.8561753309707567, "language_loss": 0.7346859, "learning_rate": 1.645120103738424e-06, "loss": 0.75610375, "num_input_tokens_seen": 102320995, "step": 4741, "time_per_iteration": 2.6468753814697266 }, { "auxiliary_loss_clip": 0.01117458, "auxiliary_loss_mlp": 0.00773314, "balance_loss_clip": 1.04229975, "balance_loss_mlp": 1.0006845, "epoch": 0.5701917874105693, "flos": 11473352392320.0, "grad_norm": 2.0158302383462043, "language_loss": 0.84329802, "learning_rate": 1.6443535200384591e-06, "loss": 0.86220574, "num_input_tokens_seen": 102339170, "step": 4742, "time_per_iteration": 2.5999832153320312 }, { "auxiliary_loss_clip": 0.01145854, "auxiliary_loss_mlp": 0.01047229, "balance_loss_clip": 1.05039215, "balance_loss_mlp": 1.02881157, "epoch": 0.5703120303012085, "flos": 21761759018880.0, "grad_norm": 1.6404235549585418, "language_loss": 0.70832026, "learning_rate": 1.6435869902897827e-06, "loss": 0.73025107, "num_input_tokens_seen": 102357750, "step": 4743, "time_per_iteration": 2.5713987350463867 }, { "auxiliary_loss_clip": 0.01016883, "auxiliary_loss_mlp": 0.01009762, "balance_loss_clip": 1.01691031, "balance_loss_mlp": 1.00769949, "epoch": 0.5704322731918475, "flos": 56746258513920.0, "grad_norm": 0.8004240427815665, "language_loss": 0.6204223, "learning_rate": 1.6428205146086764e-06, "loss": 0.64068884, "num_input_tokens_seen": 102419730, "step": 4744, "time_per_iteration": 3.329540252685547 }, { "auxiliary_loss_clip": 0.01125191, "auxiliary_loss_mlp": 0.01040738, "balance_loss_clip": 1.04549026, "balance_loss_mlp": 1.02307141, "epoch": 0.5705525160824866, "flos": 20741141975040.0, "grad_norm": 12.389259940031206, "language_loss": 0.70723438, "learning_rate": 1.6420540931114142e-06, "loss": 0.72889364, "num_input_tokens_seen": 102440320, "step": 4745, "time_per_iteration": 2.65702486038208 }, { "auxiliary_loss_clip": 0.01122644, "auxiliary_loss_mlp": 0.01045913, "balance_loss_clip": 1.04582047, "balance_loss_mlp": 1.02810311, "epoch": 0.5706727589731257, "flos": 18771262254720.0, "grad_norm": 2.2606718967326924, "language_loss": 0.79102123, "learning_rate": 1.6412877259142616e-06, "loss": 0.81270683, "num_input_tokens_seen": 102460240, "step": 4746, "time_per_iteration": 3.732936382293701 }, { "auxiliary_loss_clip": 0.01117136, "auxiliary_loss_mlp": 0.01061718, "balance_loss_clip": 1.04444647, "balance_loss_mlp": 1.04159522, "epoch": 0.5707930018637648, "flos": 27634733372160.0, "grad_norm": 2.3582337060314096, "language_loss": 0.73646349, "learning_rate": 1.6405214131334757e-06, "loss": 0.75825202, "num_input_tokens_seen": 102478765, "step": 4747, "time_per_iteration": 2.8338706493377686 }, { "auxiliary_loss_clip": 0.01090722, "auxiliary_loss_mlp": 0.01043091, "balance_loss_clip": 1.04326582, "balance_loss_mlp": 1.02653337, "epoch": 0.5709132447544039, "flos": 27597673514880.0, "grad_norm": 1.8122648632639842, "language_loss": 0.79382706, "learning_rate": 1.6397551548853052e-06, "loss": 0.81516516, "num_input_tokens_seen": 102496930, "step": 4748, "time_per_iteration": 3.6514415740966797 }, { "auxiliary_loss_clip": 0.01127433, "auxiliary_loss_mlp": 0.01053788, "balance_loss_clip": 1.04989195, "balance_loss_mlp": 1.03448796, "epoch": 0.571033487645043, "flos": 21686095019520.0, "grad_norm": 1.7800983392247207, "language_loss": 0.70737922, "learning_rate": 1.6389889512859917e-06, "loss": 0.72919142, "num_input_tokens_seen": 102516590, "step": 4749, "time_per_iteration": 2.668447732925415 }, { "auxiliary_loss_clip": 0.01024199, "auxiliary_loss_mlp": 0.01002126, "balance_loss_clip": 1.01260328, "balance_loss_mlp": 1.00051677, "epoch": 0.5711537305356821, "flos": 70181445980160.0, "grad_norm": 0.8093708152239402, "language_loss": 0.60351902, "learning_rate": 1.638222802451767e-06, "loss": 0.62378228, "num_input_tokens_seen": 102578070, "step": 4750, "time_per_iteration": 3.212177038192749 }, { "auxiliary_loss_clip": 0.01121701, "auxiliary_loss_mlp": 0.01037816, "balance_loss_clip": 1.0451777, "balance_loss_mlp": 1.02168679, "epoch": 0.5712739734263211, "flos": 24717494396160.0, "grad_norm": 1.6972376412033838, "language_loss": 0.75122422, "learning_rate": 1.6374567084988561e-06, "loss": 0.7728194, "num_input_tokens_seen": 102599255, "step": 4751, "time_per_iteration": 2.653740882873535 }, { "auxiliary_loss_clip": 0.0112709, "auxiliary_loss_mlp": 0.0105005, "balance_loss_clip": 1.04873955, "balance_loss_mlp": 1.03235888, "epoch": 0.5713942163169603, "flos": 26578169792640.0, "grad_norm": 2.0004526829936493, "language_loss": 0.76618469, "learning_rate": 1.6366906695434738e-06, "loss": 0.78795606, "num_input_tokens_seen": 102621775, "step": 4752, "time_per_iteration": 2.7009823322296143 }, { "auxiliary_loss_clip": 0.01131154, "auxiliary_loss_mlp": 0.01048489, "balance_loss_clip": 1.04729772, "balance_loss_mlp": 1.03143072, "epoch": 0.5715144592075994, "flos": 21142443697920.0, "grad_norm": 2.1036929962055826, "language_loss": 0.86173725, "learning_rate": 1.6359246857018275e-06, "loss": 0.88353372, "num_input_tokens_seen": 102639305, "step": 4753, "time_per_iteration": 2.6011483669281006 }, { "auxiliary_loss_clip": 0.0109164, "auxiliary_loss_mlp": 0.01040633, "balance_loss_clip": 1.0413847, "balance_loss_mlp": 1.0226562, "epoch": 0.5716347020982384, "flos": 23330265189120.0, "grad_norm": 3.2193159248852186, "language_loss": 0.78296494, "learning_rate": 1.6351587570901178e-06, "loss": 0.80428767, "num_input_tokens_seen": 102659430, "step": 4754, "time_per_iteration": 2.7047111988067627 }, { "auxiliary_loss_clip": 0.01105764, "auxiliary_loss_mlp": 0.01045108, "balance_loss_clip": 1.04574895, "balance_loss_mlp": 1.02776289, "epoch": 0.5717549449888776, "flos": 17009555806080.0, "grad_norm": 2.892430284435028, "language_loss": 0.75470054, "learning_rate": 1.634392883824534e-06, "loss": 0.77620924, "num_input_tokens_seen": 102671430, "step": 4755, "time_per_iteration": 2.6287615299224854 }, { "auxiliary_loss_clip": 0.01097039, "auxiliary_loss_mlp": 0.01041748, "balance_loss_clip": 1.04397607, "balance_loss_mlp": 1.0237956, "epoch": 0.5718751878795166, "flos": 35518130922240.0, "grad_norm": 1.944304565698218, "language_loss": 0.6796248, "learning_rate": 1.6336270660212595e-06, "loss": 0.70101261, "num_input_tokens_seen": 102693025, "step": 4756, "time_per_iteration": 2.784376621246338 }, { "auxiliary_loss_clip": 0.0111434, "auxiliary_loss_mlp": 0.01049084, "balance_loss_clip": 1.04658592, "balance_loss_mlp": 1.03113055, "epoch": 0.5719954307701557, "flos": 38613989255040.0, "grad_norm": 2.1917990610716487, "language_loss": 0.65898913, "learning_rate": 1.6328613037964676e-06, "loss": 0.68062335, "num_input_tokens_seen": 102716090, "step": 4757, "time_per_iteration": 2.7914247512817383 }, { "auxiliary_loss_clip": 0.01130696, "auxiliary_loss_mlp": 0.01040802, "balance_loss_clip": 1.04721904, "balance_loss_mlp": 1.02325439, "epoch": 0.5721156736607949, "flos": 20631111638400.0, "grad_norm": 2.4297055817233146, "language_loss": 0.67908961, "learning_rate": 1.6320955972663241e-06, "loss": 0.70080459, "num_input_tokens_seen": 102735685, "step": 4758, "time_per_iteration": 2.611924648284912 }, { "auxiliary_loss_clip": 0.01131235, "auxiliary_loss_mlp": 0.0104364, "balance_loss_clip": 1.0465045, "balance_loss_mlp": 1.02660501, "epoch": 0.5722359165514339, "flos": 37415076076800.0, "grad_norm": 2.871064883893405, "language_loss": 0.65959489, "learning_rate": 1.6313299465469857e-06, "loss": 0.68134362, "num_input_tokens_seen": 102758415, "step": 4759, "time_per_iteration": 2.782526731491089 }, { "auxiliary_loss_clip": 0.01128845, "auxiliary_loss_mlp": 0.01051157, "balance_loss_clip": 1.04612517, "balance_loss_mlp": 1.03108203, "epoch": 0.572356159442073, "flos": 21972877205760.0, "grad_norm": 2.6225922154508927, "language_loss": 0.79746515, "learning_rate": 1.6305643517546014e-06, "loss": 0.81926513, "num_input_tokens_seen": 102773795, "step": 4760, "time_per_iteration": 2.699932336807251 }, { "auxiliary_loss_clip": 0.01140507, "auxiliary_loss_mlp": 0.01042792, "balance_loss_clip": 1.04829574, "balance_loss_mlp": 1.02617466, "epoch": 0.5724764023327121, "flos": 19135540033920.0, "grad_norm": 1.9052760397531907, "language_loss": 0.84809589, "learning_rate": 1.629798813005311e-06, "loss": 0.86992896, "num_input_tokens_seen": 102793515, "step": 4761, "time_per_iteration": 2.5898776054382324 }, { "auxiliary_loss_clip": 0.01092569, "auxiliary_loss_mlp": 0.01042856, "balance_loss_clip": 1.04102349, "balance_loss_mlp": 1.02567768, "epoch": 0.5725966452233512, "flos": 22819759142400.0, "grad_norm": 2.1197058870803684, "language_loss": 0.71040261, "learning_rate": 1.6290333304152473e-06, "loss": 0.73175681, "num_input_tokens_seen": 102813390, "step": 4762, "time_per_iteration": 2.7033255100250244 }, { "auxiliary_loss_clip": 0.01114965, "auxiliary_loss_mlp": 0.01045022, "balance_loss_clip": 1.04756331, "balance_loss_mlp": 1.02774858, "epoch": 0.5727168881139902, "flos": 41496610498560.0, "grad_norm": 1.882632586403673, "language_loss": 0.56506562, "learning_rate": 1.6282679041005314e-06, "loss": 0.58666551, "num_input_tokens_seen": 102838980, "step": 4763, "time_per_iteration": 2.8272762298583984 }, { "auxiliary_loss_clip": 0.01109154, "auxiliary_loss_mlp": 0.01034985, "balance_loss_clip": 1.04052937, "balance_loss_mlp": 1.01762867, "epoch": 0.5728371310046293, "flos": 14647675985280.0, "grad_norm": 2.1877465326342453, "language_loss": 0.87536347, "learning_rate": 1.6275025341772789e-06, "loss": 0.89680487, "num_input_tokens_seen": 102855285, "step": 4764, "time_per_iteration": 3.5311496257781982 }, { "auxiliary_loss_clip": 0.0111675, "auxiliary_loss_mlp": 0.01045942, "balance_loss_clip": 1.04235768, "balance_loss_mlp": 1.0283823, "epoch": 0.5729573738952685, "flos": 21506613736320.0, "grad_norm": 2.2109299373660916, "language_loss": 0.81748688, "learning_rate": 1.626737220761596e-06, "loss": 0.83911383, "num_input_tokens_seen": 102872750, "step": 4765, "time_per_iteration": 2.660752058029175 }, { "auxiliary_loss_clip": 0.01129362, "auxiliary_loss_mlp": 0.01041166, "balance_loss_clip": 1.04779828, "balance_loss_mlp": 1.02417827, "epoch": 0.5730776167859075, "flos": 23621680229760.0, "grad_norm": 2.908223635300858, "language_loss": 0.79197431, "learning_rate": 1.62597196396958e-06, "loss": 0.81367958, "num_input_tokens_seen": 102890920, "step": 4766, "time_per_iteration": 3.624852418899536 }, { "auxiliary_loss_clip": 0.01127246, "auxiliary_loss_mlp": 0.01050859, "balance_loss_clip": 1.04508734, "balance_loss_mlp": 1.03347838, "epoch": 0.5731978596765466, "flos": 25739224761600.0, "grad_norm": 2.0296010525886063, "language_loss": 0.85695577, "learning_rate": 1.6252067639173197e-06, "loss": 0.87873685, "num_input_tokens_seen": 102912830, "step": 4767, "time_per_iteration": 2.6765592098236084 }, { "auxiliary_loss_clip": 0.0113216, "auxiliary_loss_mlp": 0.01039867, "balance_loss_clip": 1.04731679, "balance_loss_mlp": 1.02383327, "epoch": 0.5733181025671857, "flos": 26359509749760.0, "grad_norm": 1.7039385255257895, "language_loss": 0.69625926, "learning_rate": 1.6244416207208956e-06, "loss": 0.71797955, "num_input_tokens_seen": 102933765, "step": 4768, "time_per_iteration": 2.640599250793457 }, { "auxiliary_loss_clip": 0.01104204, "auxiliary_loss_mlp": 0.01040538, "balance_loss_clip": 1.04286134, "balance_loss_mlp": 1.02309775, "epoch": 0.5734383454578248, "flos": 29423874833280.0, "grad_norm": 2.0199747314395147, "language_loss": 0.73631471, "learning_rate": 1.6236765344963787e-06, "loss": 0.75776213, "num_input_tokens_seen": 102955025, "step": 4769, "time_per_iteration": 2.721374988555908 }, { "auxiliary_loss_clip": 0.01115619, "auxiliary_loss_mlp": 0.01043783, "balance_loss_clip": 1.0420264, "balance_loss_mlp": 1.02631903, "epoch": 0.5735585883484638, "flos": 34969954487040.0, "grad_norm": 2.6429676928765478, "language_loss": 0.69163561, "learning_rate": 1.6229115053598322e-06, "loss": 0.71322966, "num_input_tokens_seen": 102976780, "step": 4770, "time_per_iteration": 2.7253427505493164 }, { "auxiliary_loss_clip": 0.01135298, "auxiliary_loss_mlp": 0.01044901, "balance_loss_clip": 1.04936945, "balance_loss_mlp": 1.02723384, "epoch": 0.573678831239103, "flos": 18770759464320.0, "grad_norm": 1.9513136554372768, "language_loss": 0.72454011, "learning_rate": 1.6221465334273108e-06, "loss": 0.74634212, "num_input_tokens_seen": 102995990, "step": 4771, "time_per_iteration": 2.7069435119628906 }, { "auxiliary_loss_clip": 0.01114018, "auxiliary_loss_mlp": 0.01043943, "balance_loss_clip": 1.04711556, "balance_loss_mlp": 1.02598977, "epoch": 0.5737990741297421, "flos": 25702883176320.0, "grad_norm": 2.1966416840658662, "language_loss": 0.61142987, "learning_rate": 1.6213816188148593e-06, "loss": 0.63300943, "num_input_tokens_seen": 103014695, "step": 4772, "time_per_iteration": 3.683948278427124 }, { "auxiliary_loss_clip": 0.01107563, "auxiliary_loss_mlp": 0.01050965, "balance_loss_clip": 1.04640031, "balance_loss_mlp": 1.03425205, "epoch": 0.5739193170203811, "flos": 27269234530560.0, "grad_norm": 2.0269127055199774, "language_loss": 0.773664, "learning_rate": 1.6206167616385162e-06, "loss": 0.79524934, "num_input_tokens_seen": 103035760, "step": 4773, "time_per_iteration": 2.6622109413146973 }, { "auxiliary_loss_clip": 0.01124821, "auxiliary_loss_mlp": 0.01044989, "balance_loss_clip": 1.04645824, "balance_loss_mlp": 1.02732205, "epoch": 0.5740395599110203, "flos": 12239721993600.0, "grad_norm": 2.0791924755921016, "language_loss": 0.73809361, "learning_rate": 1.6198519620143078e-06, "loss": 0.75979173, "num_input_tokens_seen": 103052915, "step": 4774, "time_per_iteration": 3.58967924118042 }, { "auxiliary_loss_clip": 0.01111146, "auxiliary_loss_mlp": 0.01047169, "balance_loss_clip": 1.04725027, "balance_loss_mlp": 1.02956176, "epoch": 0.5741598028016593, "flos": 25921399564800.0, "grad_norm": 2.023116998193456, "language_loss": 0.78189504, "learning_rate": 1.6190872200582546e-06, "loss": 0.80347824, "num_input_tokens_seen": 103074655, "step": 4775, "time_per_iteration": 2.704430103302002 }, { "auxiliary_loss_clip": 0.01112289, "auxiliary_loss_mlp": 0.00773063, "balance_loss_clip": 1.04361606, "balance_loss_mlp": 1.00049961, "epoch": 0.5742800456922984, "flos": 19244133826560.0, "grad_norm": 2.0920461715190246, "language_loss": 0.78005278, "learning_rate": 1.6183225358863676e-06, "loss": 0.79890627, "num_input_tokens_seen": 103091550, "step": 4776, "time_per_iteration": 2.6211202144622803 }, { "auxiliary_loss_clip": 0.01110127, "auxiliary_loss_mlp": 0.01053694, "balance_loss_clip": 1.04350591, "balance_loss_mlp": 1.03448915, "epoch": 0.5744002885829376, "flos": 30920487932160.0, "grad_norm": 2.756481522098876, "language_loss": 0.71626937, "learning_rate": 1.617557909614648e-06, "loss": 0.73790759, "num_input_tokens_seen": 103110985, "step": 4777, "time_per_iteration": 2.7558062076568604 }, { "auxiliary_loss_clip": 0.01102632, "auxiliary_loss_mlp": 0.01040553, "balance_loss_clip": 1.04077435, "balance_loss_mlp": 1.02426863, "epoch": 0.5745205314735766, "flos": 23840017050240.0, "grad_norm": 2.67253748263706, "language_loss": 0.8565855, "learning_rate": 1.6167933413590899e-06, "loss": 0.87801731, "num_input_tokens_seen": 103129890, "step": 4778, "time_per_iteration": 2.72578763961792 }, { "auxiliary_loss_clip": 0.01128003, "auxiliary_loss_mlp": 0.01047555, "balance_loss_clip": 1.04535496, "balance_loss_mlp": 1.029948, "epoch": 0.5746407743642157, "flos": 12311902373760.0, "grad_norm": 2.1383158300738483, "language_loss": 0.90531576, "learning_rate": 1.6160288312356773e-06, "loss": 0.92707133, "num_input_tokens_seen": 103147020, "step": 4779, "time_per_iteration": 2.6078531742095947 }, { "auxiliary_loss_clip": 0.01133809, "auxiliary_loss_mlp": 0.01041757, "balance_loss_clip": 1.04686785, "balance_loss_mlp": 1.02509141, "epoch": 0.5747610172548548, "flos": 24133658734080.0, "grad_norm": 1.7342681408185567, "language_loss": 0.81664026, "learning_rate": 1.6152643793603857e-06, "loss": 0.83839595, "num_input_tokens_seen": 103167370, "step": 4780, "time_per_iteration": 2.646365165710449 }, { "auxiliary_loss_clip": 0.01141651, "auxiliary_loss_mlp": 0.01043774, "balance_loss_clip": 1.04741073, "balance_loss_mlp": 1.02645302, "epoch": 0.5748812601454939, "flos": 25408451393280.0, "grad_norm": 1.8146821831202034, "language_loss": 0.87785149, "learning_rate": 1.6144999858491815e-06, "loss": 0.89970565, "num_input_tokens_seen": 103186000, "step": 4781, "time_per_iteration": 2.6148579120635986 }, { "auxiliary_loss_clip": 0.01120253, "auxiliary_loss_mlp": 0.01042891, "balance_loss_clip": 1.0429697, "balance_loss_mlp": 1.02551067, "epoch": 0.575001503036133, "flos": 30624942827520.0, "grad_norm": 2.152549828510422, "language_loss": 0.85828102, "learning_rate": 1.6137356508180232e-06, "loss": 0.8799125, "num_input_tokens_seen": 103207710, "step": 4782, "time_per_iteration": 2.693652629852295 }, { "auxiliary_loss_clip": 0.01143802, "auxiliary_loss_mlp": 0.00774514, "balance_loss_clip": 1.04764128, "balance_loss_mlp": 1.0006218, "epoch": 0.5751217459267721, "flos": 21726566668800.0, "grad_norm": 2.0799461419395886, "language_loss": 0.8148039, "learning_rate": 1.6129713743828593e-06, "loss": 0.83398706, "num_input_tokens_seen": 103226720, "step": 4783, "time_per_iteration": 2.587766647338867 }, { "auxiliary_loss_clip": 0.01113942, "auxiliary_loss_mlp": 0.01049477, "balance_loss_clip": 1.04063725, "balance_loss_mlp": 1.03215623, "epoch": 0.5752419888174112, "flos": 21651620941440.0, "grad_norm": 1.8614432161483, "language_loss": 0.75588393, "learning_rate": 1.6122071566596306e-06, "loss": 0.77751809, "num_input_tokens_seen": 103246995, "step": 4784, "time_per_iteration": 2.6692514419555664 }, { "auxiliary_loss_clip": 0.01129926, "auxiliary_loss_mlp": 0.01037753, "balance_loss_clip": 1.04360783, "balance_loss_mlp": 1.02166009, "epoch": 0.5753622317080502, "flos": 17775997234560.0, "grad_norm": 3.3765903349329722, "language_loss": 0.83616787, "learning_rate": 1.6114429977642674e-06, "loss": 0.85784465, "num_input_tokens_seen": 103261500, "step": 4785, "time_per_iteration": 2.5516459941864014 }, { "auxiliary_loss_clip": 0.01131767, "auxiliary_loss_mlp": 0.0104925, "balance_loss_clip": 1.04818702, "balance_loss_mlp": 1.03234601, "epoch": 0.5754824745986894, "flos": 19789616741760.0, "grad_norm": 1.9296177133311518, "language_loss": 0.74037373, "learning_rate": 1.6106788978126926e-06, "loss": 0.76218379, "num_input_tokens_seen": 103280475, "step": 4786, "time_per_iteration": 2.5790891647338867 }, { "auxiliary_loss_clip": 0.01090795, "auxiliary_loss_mlp": 0.01041987, "balance_loss_clip": 1.04126334, "balance_loss_mlp": 1.02410567, "epoch": 0.5756027174893285, "flos": 30985665160320.0, "grad_norm": 2.0813713897677113, "language_loss": 0.78743702, "learning_rate": 1.6099148569208196e-06, "loss": 0.80876482, "num_input_tokens_seen": 103297695, "step": 4787, "time_per_iteration": 2.7570993900299072 }, { "auxiliary_loss_clip": 0.01121596, "auxiliary_loss_mlp": 0.0103935, "balance_loss_clip": 1.04919875, "balance_loss_mlp": 1.02031279, "epoch": 0.5757229603799675, "flos": 28546864364160.0, "grad_norm": 1.767099287372578, "language_loss": 0.6289252, "learning_rate": 1.6091508752045523e-06, "loss": 0.65053469, "num_input_tokens_seen": 103318575, "step": 4788, "time_per_iteration": 2.7258589267730713 }, { "auxiliary_loss_clip": 0.01094865, "auxiliary_loss_mlp": 0.01048305, "balance_loss_clip": 1.03840911, "balance_loss_mlp": 1.03165126, "epoch": 0.5758432032706067, "flos": 22999024944000.0, "grad_norm": 1.6365233662664913, "language_loss": 0.86273646, "learning_rate": 1.608386952779787e-06, "loss": 0.88416815, "num_input_tokens_seen": 103337945, "step": 4789, "time_per_iteration": 2.677581310272217 }, { "auxiliary_loss_clip": 0.01121411, "auxiliary_loss_mlp": 0.0103846, "balance_loss_clip": 1.04485703, "balance_loss_mlp": 1.02055502, "epoch": 0.5759634461612457, "flos": 25739727552000.0, "grad_norm": 1.5496773722435453, "language_loss": 0.7473762, "learning_rate": 1.6076230897624098e-06, "loss": 0.76897496, "num_input_tokens_seen": 103360150, "step": 4790, "time_per_iteration": 3.6724934577941895 }, { "auxiliary_loss_clip": 0.01133027, "auxiliary_loss_mlp": 0.01042485, "balance_loss_clip": 1.04567969, "balance_loss_mlp": 1.02496099, "epoch": 0.5760836890518848, "flos": 30591761639040.0, "grad_norm": 2.2567269399213057, "language_loss": 0.77483702, "learning_rate": 1.6068592862682974e-06, "loss": 0.79659212, "num_input_tokens_seen": 103378305, "step": 4791, "time_per_iteration": 2.65775990486145 }, { "auxiliary_loss_clip": 0.01116742, "auxiliary_loss_mlp": 0.01047586, "balance_loss_clip": 1.04298949, "balance_loss_mlp": 1.0300386, "epoch": 0.576203931942524, "flos": 36538963447680.0, "grad_norm": 4.963103374951624, "language_loss": 0.73799151, "learning_rate": 1.6060955424133187e-06, "loss": 0.75963479, "num_input_tokens_seen": 103399230, "step": 4792, "time_per_iteration": 3.730722427368164 }, { "auxiliary_loss_clip": 0.01129163, "auxiliary_loss_mlp": 0.01038461, "balance_loss_clip": 1.04546189, "balance_loss_mlp": 1.02127099, "epoch": 0.576324174833163, "flos": 25516937445120.0, "grad_norm": 1.9843575599127112, "language_loss": 0.89878058, "learning_rate": 1.6053318583133332e-06, "loss": 0.92045689, "num_input_tokens_seen": 103420100, "step": 4793, "time_per_iteration": 2.6306536197662354 }, { "auxiliary_loss_clip": 0.01128313, "auxiliary_loss_mlp": 0.01039355, "balance_loss_clip": 1.04569983, "balance_loss_mlp": 1.02172375, "epoch": 0.5764444177238021, "flos": 25119262995840.0, "grad_norm": 2.229714786857362, "language_loss": 0.75304943, "learning_rate": 1.6045682340841907e-06, "loss": 0.77472603, "num_input_tokens_seen": 103439025, "step": 4794, "time_per_iteration": 2.6428489685058594 }, { "auxiliary_loss_clip": 0.01012906, "auxiliary_loss_mlp": 0.00755524, "balance_loss_clip": 1.01114821, "balance_loss_mlp": 1.00032568, "epoch": 0.5765646606144411, "flos": 62212687758720.0, "grad_norm": 0.7511505640027759, "language_loss": 0.58000964, "learning_rate": 1.6038046698417336e-06, "loss": 0.59769392, "num_input_tokens_seen": 103499920, "step": 4795, "time_per_iteration": 3.228139638900757 }, { "auxiliary_loss_clip": 0.01127262, "auxiliary_loss_mlp": 0.01042747, "balance_loss_clip": 1.04367161, "balance_loss_mlp": 1.02566481, "epoch": 0.5766849035050803, "flos": 25118760205440.0, "grad_norm": 2.151937512597283, "language_loss": 0.69142818, "learning_rate": 1.6030411657017919e-06, "loss": 0.71312833, "num_input_tokens_seen": 103519575, "step": 4796, "time_per_iteration": 2.6422553062438965 }, { "auxiliary_loss_clip": 0.0112214, "auxiliary_loss_mlp": 0.01038855, "balance_loss_clip": 1.04456699, "balance_loss_mlp": 1.02186823, "epoch": 0.5768051463957193, "flos": 15991093578240.0, "grad_norm": 1.8485629432449326, "language_loss": 0.84484947, "learning_rate": 1.6022777217801903e-06, "loss": 0.86645937, "num_input_tokens_seen": 103536530, "step": 4797, "time_per_iteration": 3.501408100128174 }, { "auxiliary_loss_clip": 0.01104933, "auxiliary_loss_mlp": 0.01041322, "balance_loss_clip": 1.04468203, "balance_loss_mlp": 1.02364361, "epoch": 0.5769253892863584, "flos": 22163635359360.0, "grad_norm": 2.2716761568835877, "language_loss": 0.74044091, "learning_rate": 1.601514338192742e-06, "loss": 0.76190341, "num_input_tokens_seen": 103556460, "step": 4798, "time_per_iteration": 2.659741163253784 }, { "auxiliary_loss_clip": 0.01135559, "auxiliary_loss_mlp": 0.01034596, "balance_loss_clip": 1.04448128, "balance_loss_mlp": 1.01912296, "epoch": 0.5770456321769976, "flos": 22856388036480.0, "grad_norm": 2.243626003492805, "language_loss": 0.71560973, "learning_rate": 1.6007510150552514e-06, "loss": 0.73731124, "num_input_tokens_seen": 103574520, "step": 4799, "time_per_iteration": 2.582019567489624 }, { "auxiliary_loss_clip": 0.01133928, "auxiliary_loss_mlp": 0.01046417, "balance_loss_clip": 1.0451858, "balance_loss_mlp": 1.028476, "epoch": 0.5771658750676366, "flos": 46353672489600.0, "grad_norm": 1.5368886862543831, "language_loss": 0.6197139, "learning_rate": 1.599987752483515e-06, "loss": 0.64151728, "num_input_tokens_seen": 103598965, "step": 4800, "time_per_iteration": 3.7344417572021484 }, { "auxiliary_loss_clip": 0.01099656, "auxiliary_loss_mlp": 0.01041908, "balance_loss_clip": 1.04019403, "balance_loss_mlp": 1.02489662, "epoch": 0.5772861179582757, "flos": 22159972172160.0, "grad_norm": 1.7372709456083404, "language_loss": 0.67714238, "learning_rate": 1.5992245505933184e-06, "loss": 0.69855809, "num_input_tokens_seen": 103618665, "step": 4801, "time_per_iteration": 2.678412675857544 }, { "auxiliary_loss_clip": 0.01145423, "auxiliary_loss_mlp": 0.01032188, "balance_loss_clip": 1.04934096, "balance_loss_mlp": 1.01583254, "epoch": 0.5774063608489148, "flos": 31248926916480.0, "grad_norm": 2.0415089962584223, "language_loss": 0.7134437, "learning_rate": 1.5984614095004388e-06, "loss": 0.73521984, "num_input_tokens_seen": 103639800, "step": 4802, "time_per_iteration": 2.647507429122925 }, { "auxiliary_loss_clip": 0.01124883, "auxiliary_loss_mlp": 0.01048142, "balance_loss_clip": 1.04508471, "balance_loss_mlp": 1.03051138, "epoch": 0.5775266037395539, "flos": 22527123039360.0, "grad_norm": 2.3904769617111215, "language_loss": 0.81136763, "learning_rate": 1.5976983293206438e-06, "loss": 0.83309782, "num_input_tokens_seen": 103655605, "step": 4803, "time_per_iteration": 2.620509147644043 }, { "auxiliary_loss_clip": 0.0111098, "auxiliary_loss_mlp": 0.01045644, "balance_loss_clip": 1.04173541, "balance_loss_mlp": 1.02699971, "epoch": 0.577646846630193, "flos": 21068790860160.0, "grad_norm": 1.7528412229729184, "language_loss": 0.71477491, "learning_rate": 1.5969353101696928e-06, "loss": 0.73634112, "num_input_tokens_seen": 103674045, "step": 4804, "time_per_iteration": 2.67374324798584 }, { "auxiliary_loss_clip": 0.01125343, "auxiliary_loss_mlp": 0.01033822, "balance_loss_clip": 1.04350567, "balance_loss_mlp": 1.01761007, "epoch": 0.5777670895208321, "flos": 29714284293120.0, "grad_norm": 1.7263432898751885, "language_loss": 0.79851484, "learning_rate": 1.5961723521633341e-06, "loss": 0.82010651, "num_input_tokens_seen": 103695285, "step": 4805, "time_per_iteration": 2.6581859588623047 }, { "auxiliary_loss_clip": 0.01111103, "auxiliary_loss_mlp": 0.01044741, "balance_loss_clip": 1.04224253, "balance_loss_mlp": 1.02746737, "epoch": 0.5778873324114712, "flos": 19500428344320.0, "grad_norm": 2.0457924572698434, "language_loss": 0.90681112, "learning_rate": 1.5954094554173097e-06, "loss": 0.92836952, "num_input_tokens_seen": 103713275, "step": 4806, "time_per_iteration": 2.6663894653320312 }, { "auxiliary_loss_clip": 0.011238, "auxiliary_loss_mlp": 0.01041394, "balance_loss_clip": 1.04849792, "balance_loss_mlp": 1.0250504, "epoch": 0.5780075753021102, "flos": 14136846716160.0, "grad_norm": 2.0249212986292786, "language_loss": 0.7908873, "learning_rate": 1.5946466200473482e-06, "loss": 0.81253922, "num_input_tokens_seen": 103731185, "step": 4807, "time_per_iteration": 2.6234681606292725 }, { "auxiliary_loss_clip": 0.01121149, "auxiliary_loss_mlp": 0.01037457, "balance_loss_clip": 1.04268193, "balance_loss_mlp": 1.02074432, "epoch": 0.5781278181927494, "flos": 15262178883840.0, "grad_norm": 1.8196618859902747, "language_loss": 0.83080912, "learning_rate": 1.5938838461691723e-06, "loss": 0.85239512, "num_input_tokens_seen": 103748095, "step": 4808, "time_per_iteration": 2.635878324508667 }, { "auxiliary_loss_clip": 0.01145625, "auxiliary_loss_mlp": 0.01047742, "balance_loss_clip": 1.04906237, "balance_loss_mlp": 1.02955019, "epoch": 0.5782480610833884, "flos": 16726831856640.0, "grad_norm": 2.456814698462623, "language_loss": 0.82787228, "learning_rate": 1.593121133898494e-06, "loss": 0.84980595, "num_input_tokens_seen": 103765300, "step": 4809, "time_per_iteration": 2.5613181591033936 }, { "auxiliary_loss_clip": 0.01137832, "auxiliary_loss_mlp": 0.01037019, "balance_loss_clip": 1.0499115, "balance_loss_mlp": 1.02077091, "epoch": 0.5783683039740275, "flos": 25482140144640.0, "grad_norm": 2.2220783879067976, "language_loss": 0.79606497, "learning_rate": 1.592358483351016e-06, "loss": 0.81781346, "num_input_tokens_seen": 103785475, "step": 4810, "time_per_iteration": 2.685089111328125 }, { "auxiliary_loss_clip": 0.01125747, "auxiliary_loss_mlp": 0.01040467, "balance_loss_clip": 1.04436803, "balance_loss_mlp": 1.02548206, "epoch": 0.5784885468646667, "flos": 18405835240320.0, "grad_norm": 1.8996486387501919, "language_loss": 0.72202367, "learning_rate": 1.5915958946424326e-06, "loss": 0.74368584, "num_input_tokens_seen": 103804160, "step": 4811, "time_per_iteration": 2.5747995376586914 }, { "auxiliary_loss_clip": 0.01101963, "auxiliary_loss_mlp": 0.00773795, "balance_loss_clip": 1.04235363, "balance_loss_mlp": 1.00063443, "epoch": 0.5786087897553057, "flos": 46100717936640.0, "grad_norm": 2.4363321605070265, "language_loss": 0.74056584, "learning_rate": 1.5908333678884271e-06, "loss": 0.75932348, "num_input_tokens_seen": 103830580, "step": 4812, "time_per_iteration": 2.945420503616333 }, { "auxiliary_loss_clip": 0.0112773, "auxiliary_loss_mlp": 0.01041631, "balance_loss_clip": 1.04468691, "balance_loss_mlp": 1.0236659, "epoch": 0.5787290326459448, "flos": 12385950261120.0, "grad_norm": 2.4499385518485233, "language_loss": 0.73686522, "learning_rate": 1.5900709032046743e-06, "loss": 0.75855887, "num_input_tokens_seen": 103848655, "step": 4813, "time_per_iteration": 2.602236270904541 }, { "auxiliary_loss_clip": 0.01115279, "auxiliary_loss_mlp": 0.01044497, "balance_loss_clip": 1.04780042, "balance_loss_mlp": 1.02795053, "epoch": 0.5788492755365839, "flos": 23290332243840.0, "grad_norm": 3.4877260222202286, "language_loss": 0.78524482, "learning_rate": 1.5893085007068391e-06, "loss": 0.80684257, "num_input_tokens_seen": 103866215, "step": 4814, "time_per_iteration": 2.6787045001983643 }, { "auxiliary_loss_clip": 0.01102746, "auxiliary_loss_mlp": 0.01052932, "balance_loss_clip": 1.03916144, "balance_loss_mlp": 1.03437114, "epoch": 0.578969518427223, "flos": 24061047390720.0, "grad_norm": 1.8205381817113382, "language_loss": 0.70917791, "learning_rate": 1.5885461605105786e-06, "loss": 0.73073471, "num_input_tokens_seen": 103887815, "step": 4815, "time_per_iteration": 2.7752091884613037 }, { "auxiliary_loss_clip": 0.01114841, "auxiliary_loss_mlp": 0.01045229, "balance_loss_clip": 1.04350042, "balance_loss_mlp": 1.02785993, "epoch": 0.579089761317862, "flos": 21871825269120.0, "grad_norm": 2.9296563490418435, "language_loss": 0.76773155, "learning_rate": 1.5877838827315375e-06, "loss": 0.78933227, "num_input_tokens_seen": 103906360, "step": 4816, "time_per_iteration": 2.6774449348449707 }, { "auxiliary_loss_clip": 0.01144366, "auxiliary_loss_mlp": 0.01039176, "balance_loss_clip": 1.04973817, "balance_loss_mlp": 1.02273655, "epoch": 0.5792100042085012, "flos": 22929681738240.0, "grad_norm": 2.334503163379984, "language_loss": 0.70081699, "learning_rate": 1.587021667485355e-06, "loss": 0.72265244, "num_input_tokens_seen": 103925730, "step": 4817, "time_per_iteration": 3.5382721424102783 }, { "auxiliary_loss_clip": 0.01118503, "auxiliary_loss_mlp": 0.01040057, "balance_loss_clip": 1.04362631, "balance_loss_mlp": 1.02432108, "epoch": 0.5793302470991403, "flos": 21470056669440.0, "grad_norm": 1.82069334981145, "language_loss": 0.78560162, "learning_rate": 1.5862595148876559e-06, "loss": 0.8071872, "num_input_tokens_seen": 103945835, "step": 4818, "time_per_iteration": 3.661442995071411 }, { "auxiliary_loss_clip": 0.01088564, "auxiliary_loss_mlp": 0.01047943, "balance_loss_clip": 1.04080069, "balance_loss_mlp": 1.02952552, "epoch": 0.5794504899897793, "flos": 12711013367040.0, "grad_norm": 1.9705235055542196, "language_loss": 0.76404738, "learning_rate": 1.58549742505406e-06, "loss": 0.78541243, "num_input_tokens_seen": 103960580, "step": 4819, "time_per_iteration": 2.6468875408172607 }, { "auxiliary_loss_clip": 0.0114439, "auxiliary_loss_mlp": 0.01043054, "balance_loss_clip": 1.04818344, "balance_loss_mlp": 1.02556574, "epoch": 0.5795707328804185, "flos": 14867054300160.0, "grad_norm": 2.44666017767372, "language_loss": 0.75765896, "learning_rate": 1.5847353981001747e-06, "loss": 0.77953339, "num_input_tokens_seen": 103977760, "step": 4820, "time_per_iteration": 2.515507936477661 }, { "auxiliary_loss_clip": 0.01111097, "auxiliary_loss_mlp": 0.01049638, "balance_loss_clip": 1.04095888, "balance_loss_mlp": 1.03191137, "epoch": 0.5796909757710575, "flos": 36430046432640.0, "grad_norm": 3.1694104000571417, "language_loss": 0.69645548, "learning_rate": 1.5839734341415993e-06, "loss": 0.71806282, "num_input_tokens_seen": 103999960, "step": 4821, "time_per_iteration": 2.744671583175659 }, { "auxiliary_loss_clip": 0.01122595, "auxiliary_loss_mlp": 0.01043747, "balance_loss_clip": 1.04712415, "balance_loss_mlp": 1.02891159, "epoch": 0.5798112186616966, "flos": 23039891642880.0, "grad_norm": 1.6664733802546097, "language_loss": 0.76667893, "learning_rate": 1.5832115332939238e-06, "loss": 0.78834236, "num_input_tokens_seen": 104018400, "step": 4822, "time_per_iteration": 2.594012498855591 }, { "auxiliary_loss_clip": 0.01126719, "auxiliary_loss_mlp": 0.01045547, "balance_loss_clip": 1.04415941, "balance_loss_mlp": 1.02851188, "epoch": 0.5799314615523358, "flos": 16652604401280.0, "grad_norm": 1.73793021690061, "language_loss": 0.74717546, "learning_rate": 1.5824496956727272e-06, "loss": 0.76889807, "num_input_tokens_seen": 104035605, "step": 4823, "time_per_iteration": 3.685960054397583 }, { "auxiliary_loss_clip": 0.01113674, "auxiliary_loss_mlp": 0.0104157, "balance_loss_clip": 1.04269648, "balance_loss_mlp": 1.02538157, "epoch": 0.5800517044429748, "flos": 20485673470080.0, "grad_norm": 1.705117932503328, "language_loss": 0.73551154, "learning_rate": 1.5816879213935797e-06, "loss": 0.75706398, "num_input_tokens_seen": 104054415, "step": 4824, "time_per_iteration": 2.640859603881836 }, { "auxiliary_loss_clip": 0.01126658, "auxiliary_loss_mlp": 0.01040923, "balance_loss_clip": 1.04563057, "balance_loss_mlp": 1.02577186, "epoch": 0.5801719473336139, "flos": 31538258968320.0, "grad_norm": 1.5375206775738206, "language_loss": 0.79913473, "learning_rate": 1.5809262105720416e-06, "loss": 0.8208105, "num_input_tokens_seen": 104075455, "step": 4825, "time_per_iteration": 2.707042932510376 }, { "auxiliary_loss_clip": 0.01137873, "auxiliary_loss_mlp": 0.01042344, "balance_loss_clip": 1.04518723, "balance_loss_mlp": 1.0265373, "epoch": 0.580292190224253, "flos": 20375966355840.0, "grad_norm": 1.4583854096056896, "language_loss": 0.79341531, "learning_rate": 1.5801645633236644e-06, "loss": 0.81521749, "num_input_tokens_seen": 104096440, "step": 4826, "time_per_iteration": 3.4614241123199463 }, { "auxiliary_loss_clip": 0.01111117, "auxiliary_loss_mlp": 0.01048589, "balance_loss_clip": 1.04415715, "balance_loss_mlp": 1.03079128, "epoch": 0.5804124331148921, "flos": 26615373304320.0, "grad_norm": 1.9155648258910227, "language_loss": 0.77178943, "learning_rate": 1.579402979763989e-06, "loss": 0.79338646, "num_input_tokens_seen": 104116775, "step": 4827, "time_per_iteration": 2.686455249786377 }, { "auxiliary_loss_clip": 0.01091792, "auxiliary_loss_mlp": 0.0104008, "balance_loss_clip": 1.04358053, "balance_loss_mlp": 1.02352142, "epoch": 0.5805326760055312, "flos": 13478496289920.0, "grad_norm": 2.1586115335081195, "language_loss": 0.80922699, "learning_rate": 1.578641460008548e-06, "loss": 0.83054578, "num_input_tokens_seen": 104134510, "step": 4828, "time_per_iteration": 2.7157366275787354 }, { "auxiliary_loss_clip": 0.01126489, "auxiliary_loss_mlp": 0.01033299, "balance_loss_clip": 1.04549026, "balance_loss_mlp": 1.01807618, "epoch": 0.5806529188961702, "flos": 12091374823680.0, "grad_norm": 1.9752325862811206, "language_loss": 0.68008351, "learning_rate": 1.5778800041728613e-06, "loss": 0.70168144, "num_input_tokens_seen": 104150800, "step": 4829, "time_per_iteration": 2.594832420349121 }, { "auxiliary_loss_clip": 0.01123728, "auxiliary_loss_mlp": 0.01041823, "balance_loss_clip": 1.0448916, "balance_loss_mlp": 1.02489519, "epoch": 0.5807731617868094, "flos": 26214107495040.0, "grad_norm": 1.6109674090774737, "language_loss": 0.66424412, "learning_rate": 1.577118612372443e-06, "loss": 0.68589967, "num_input_tokens_seen": 104172640, "step": 4830, "time_per_iteration": 2.667512893676758 }, { "auxiliary_loss_clip": 0.01112078, "auxiliary_loss_mlp": 0.00774631, "balance_loss_clip": 1.04200947, "balance_loss_mlp": 1.00067794, "epoch": 0.5808934046774484, "flos": 37962139190400.0, "grad_norm": 1.633459751128761, "language_loss": 0.70800501, "learning_rate": 1.5763572847227943e-06, "loss": 0.72687209, "num_input_tokens_seen": 104193525, "step": 4831, "time_per_iteration": 2.790353298187256 }, { "auxiliary_loss_clip": 0.01125359, "auxiliary_loss_mlp": 0.01036656, "balance_loss_clip": 1.04352343, "balance_loss_mlp": 1.02030063, "epoch": 0.5810136475680875, "flos": 20485853038080.0, "grad_norm": 2.116064839214306, "language_loss": 0.81231463, "learning_rate": 1.5755960213394091e-06, "loss": 0.83393472, "num_input_tokens_seen": 104210625, "step": 4832, "time_per_iteration": 2.622602939605713 }, { "auxiliary_loss_clip": 0.01104626, "auxiliary_loss_mlp": 0.01038829, "balance_loss_clip": 1.04285121, "balance_loss_mlp": 1.02315879, "epoch": 0.5811338904587267, "flos": 17530153574400.0, "grad_norm": 1.829643373649415, "language_loss": 0.78626525, "learning_rate": 1.5748348223377703e-06, "loss": 0.8076998, "num_input_tokens_seen": 104228180, "step": 4833, "time_per_iteration": 2.646822929382324 }, { "auxiliary_loss_clip": 0.01109926, "auxiliary_loss_mlp": 0.0104518, "balance_loss_clip": 1.04358876, "balance_loss_mlp": 1.02850223, "epoch": 0.5812541333493657, "flos": 19458017360640.0, "grad_norm": 1.4884325567173633, "language_loss": 0.77988869, "learning_rate": 1.5740736878333507e-06, "loss": 0.8014397, "num_input_tokens_seen": 104246020, "step": 4834, "time_per_iteration": 2.627126455307007 }, { "auxiliary_loss_clip": 0.01119156, "auxiliary_loss_mlp": 0.01040992, "balance_loss_clip": 1.04231548, "balance_loss_mlp": 1.02362275, "epoch": 0.5813743762400048, "flos": 20594949621120.0, "grad_norm": 2.411653847302556, "language_loss": 0.77578795, "learning_rate": 1.5733126179416143e-06, "loss": 0.79738939, "num_input_tokens_seen": 104260505, "step": 4835, "time_per_iteration": 2.688196897506714 }, { "auxiliary_loss_clip": 0.01128676, "auxiliary_loss_mlp": 0.01047343, "balance_loss_clip": 1.0461688, "balance_loss_mlp": 1.0302248, "epoch": 0.5814946191306439, "flos": 33178227246720.0, "grad_norm": 2.193228533848993, "language_loss": 0.72656053, "learning_rate": 1.5725516127780137e-06, "loss": 0.74832076, "num_input_tokens_seen": 104282640, "step": 4836, "time_per_iteration": 2.731562852859497 }, { "auxiliary_loss_clip": 0.01135894, "auxiliary_loss_mlp": 0.01042828, "balance_loss_clip": 1.04796839, "balance_loss_mlp": 1.02600789, "epoch": 0.581614862021283, "flos": 16143283503360.0, "grad_norm": 2.303977010988013, "language_loss": 0.88385862, "learning_rate": 1.5717906724579943e-06, "loss": 0.90564585, "num_input_tokens_seen": 104299700, "step": 4837, "time_per_iteration": 2.5947725772857666 }, { "auxiliary_loss_clip": 0.01110939, "auxiliary_loss_mlp": 0.0104394, "balance_loss_clip": 1.04446316, "balance_loss_mlp": 1.02784669, "epoch": 0.581735104911922, "flos": 33802642298880.0, "grad_norm": 2.0964479033779186, "language_loss": 0.6848616, "learning_rate": 1.571029797096989e-06, "loss": 0.70641041, "num_input_tokens_seen": 104320805, "step": 4838, "time_per_iteration": 2.764989137649536 }, { "auxiliary_loss_clip": 0.01138113, "auxiliary_loss_mlp": 0.01037735, "balance_loss_clip": 1.04476118, "balance_loss_mlp": 1.02229714, "epoch": 0.5818553478025612, "flos": 23331163029120.0, "grad_norm": 2.162692934175319, "language_loss": 0.78953981, "learning_rate": 1.570268986810423e-06, "loss": 0.81129825, "num_input_tokens_seen": 104340700, "step": 4839, "time_per_iteration": 2.5662693977355957 }, { "auxiliary_loss_clip": 0.01109557, "auxiliary_loss_mlp": 0.01038201, "balance_loss_clip": 1.04146194, "balance_loss_mlp": 1.02166641, "epoch": 0.5819755906932003, "flos": 20996143603200.0, "grad_norm": 1.7515609827719207, "language_loss": 0.75156349, "learning_rate": 1.5695082417137096e-06, "loss": 0.77304113, "num_input_tokens_seen": 104358575, "step": 4840, "time_per_iteration": 2.629542589187622 }, { "auxiliary_loss_clip": 0.01113877, "auxiliary_loss_mlp": 0.01043019, "balance_loss_clip": 1.04342222, "balance_loss_mlp": 1.0263294, "epoch": 0.5820958335838393, "flos": 21431668008960.0, "grad_norm": 1.771701862883361, "language_loss": 0.75300336, "learning_rate": 1.5687475619222539e-06, "loss": 0.77457237, "num_input_tokens_seen": 104378530, "step": 4841, "time_per_iteration": 2.669390916824341 }, { "auxiliary_loss_clip": 0.01110715, "auxiliary_loss_mlp": 0.01042593, "balance_loss_clip": 1.04240215, "balance_loss_mlp": 1.02456236, "epoch": 0.5822160764744785, "flos": 17967473660160.0, "grad_norm": 2.118469396442554, "language_loss": 0.73304725, "learning_rate": 1.5679869475514496e-06, "loss": 0.75458032, "num_input_tokens_seen": 104395465, "step": 4842, "time_per_iteration": 3.5413482189178467 }, { "auxiliary_loss_clip": 0.0112661, "auxiliary_loss_mlp": 0.01038417, "balance_loss_clip": 1.04298925, "balance_loss_mlp": 1.0219065, "epoch": 0.5823363193651175, "flos": 23033858158080.0, "grad_norm": 2.7028760852726386, "language_loss": 0.81303561, "learning_rate": 1.567226398716682e-06, "loss": 0.83468586, "num_input_tokens_seen": 104415380, "step": 4843, "time_per_iteration": 2.6397383213043213 }, { "auxiliary_loss_clip": 0.01125108, "auxiliary_loss_mlp": 0.01051348, "balance_loss_clip": 1.04444575, "balance_loss_mlp": 1.03369308, "epoch": 0.5824565622557566, "flos": 32891840110080.0, "grad_norm": 1.8642201747270173, "language_loss": 0.61704767, "learning_rate": 1.566465915533326e-06, "loss": 0.63881218, "num_input_tokens_seen": 104437410, "step": 4844, "time_per_iteration": 3.706270933151245 }, { "auxiliary_loss_clip": 0.01126772, "auxiliary_loss_mlp": 0.01037807, "balance_loss_clip": 1.04504967, "balance_loss_mlp": 1.02084374, "epoch": 0.5825768051463958, "flos": 22229674513920.0, "grad_norm": 2.162776002174921, "language_loss": 0.87869394, "learning_rate": 1.5657054981167458e-06, "loss": 0.90033972, "num_input_tokens_seen": 104456305, "step": 4845, "time_per_iteration": 2.6222050189971924 }, { "auxiliary_loss_clip": 0.01121007, "auxiliary_loss_mlp": 0.01043708, "balance_loss_clip": 1.04248405, "balance_loss_mlp": 1.02817488, "epoch": 0.5826970480370348, "flos": 28001561016960.0, "grad_norm": 1.8732535510257062, "language_loss": 0.67933637, "learning_rate": 1.5649451465822965e-06, "loss": 0.70098352, "num_input_tokens_seen": 104477695, "step": 4846, "time_per_iteration": 2.6780829429626465 }, { "auxiliary_loss_clip": 0.01088775, "auxiliary_loss_mlp": 0.01044455, "balance_loss_clip": 1.04231644, "balance_loss_mlp": 1.02836204, "epoch": 0.5828172909276739, "flos": 17858053854720.0, "grad_norm": 1.9887735320222133, "language_loss": 0.83347231, "learning_rate": 1.5641848610453218e-06, "loss": 0.85480464, "num_input_tokens_seen": 104496355, "step": 4847, "time_per_iteration": 2.674901247024536 }, { "auxiliary_loss_clip": 0.01121903, "auxiliary_loss_mlp": 0.01039915, "balance_loss_clip": 1.04234076, "balance_loss_mlp": 1.02338099, "epoch": 0.582937533818313, "flos": 19865244827520.0, "grad_norm": 2.144443806433351, "language_loss": 0.85932982, "learning_rate": 1.563424641621158e-06, "loss": 0.88094801, "num_input_tokens_seen": 104515535, "step": 4848, "time_per_iteration": 2.6215105056762695 }, { "auxiliary_loss_clip": 0.01122367, "auxiliary_loss_mlp": 0.01044416, "balance_loss_clip": 1.04637361, "balance_loss_mlp": 1.02723825, "epoch": 0.5830577767089521, "flos": 26870734068480.0, "grad_norm": 2.2625885130672367, "language_loss": 0.70120966, "learning_rate": 1.5626644884251282e-06, "loss": 0.7228775, "num_input_tokens_seen": 104535055, "step": 4849, "time_per_iteration": 3.6837761402130127 }, { "auxiliary_loss_clip": 0.01134653, "auxiliary_loss_mlp": 0.01038735, "balance_loss_clip": 1.04306245, "balance_loss_mlp": 1.02197433, "epoch": 0.5831780195995911, "flos": 25298205575040.0, "grad_norm": 1.625005202090107, "language_loss": 0.88186389, "learning_rate": 1.5619044015725488e-06, "loss": 0.90359777, "num_input_tokens_seen": 104554745, "step": 4850, "time_per_iteration": 2.610868215560913 }, { "auxiliary_loss_clip": 0.01148623, "auxiliary_loss_mlp": 0.01046978, "balance_loss_clip": 1.0493536, "balance_loss_mlp": 1.02974033, "epoch": 0.5832982624902303, "flos": 14756988049920.0, "grad_norm": 2.3389059389969797, "language_loss": 0.86839265, "learning_rate": 1.5611443811787224e-06, "loss": 0.89034861, "num_input_tokens_seen": 104568870, "step": 4851, "time_per_iteration": 2.555962324142456 }, { "auxiliary_loss_clip": 0.01127002, "auxiliary_loss_mlp": 0.01041628, "balance_loss_clip": 1.04597795, "balance_loss_mlp": 1.02652991, "epoch": 0.5834185053808694, "flos": 20444555376000.0, "grad_norm": 3.092053417561433, "language_loss": 0.688658, "learning_rate": 1.560384427358945e-06, "loss": 0.71034431, "num_input_tokens_seen": 104588415, "step": 4852, "time_per_iteration": 3.5094680786132812 }, { "auxiliary_loss_clip": 0.01110884, "auxiliary_loss_mlp": 0.01044162, "balance_loss_clip": 1.04098558, "balance_loss_mlp": 1.0271275, "epoch": 0.5835387482715084, "flos": 27200394115200.0, "grad_norm": 1.5264931850173278, "language_loss": 0.73414755, "learning_rate": 1.5596245402284998e-06, "loss": 0.75569797, "num_input_tokens_seen": 104611940, "step": 4853, "time_per_iteration": 2.708369255065918 }, { "auxiliary_loss_clip": 0.01131012, "auxiliary_loss_mlp": 0.01047574, "balance_loss_clip": 1.04596806, "balance_loss_mlp": 1.03190947, "epoch": 0.5836589911621476, "flos": 16654615562880.0, "grad_norm": 2.0085000849873706, "language_loss": 0.82111347, "learning_rate": 1.5588647199026619e-06, "loss": 0.84289932, "num_input_tokens_seen": 104629675, "step": 4854, "time_per_iteration": 2.607403039932251 }, { "auxiliary_loss_clip": 0.01146774, "auxiliary_loss_mlp": 0.01039506, "balance_loss_clip": 1.0500443, "balance_loss_mlp": 1.02312708, "epoch": 0.5837792340527866, "flos": 20446817932800.0, "grad_norm": 2.3297303816261445, "language_loss": 0.8745659, "learning_rate": 1.5581049664966956e-06, "loss": 0.89642876, "num_input_tokens_seen": 104647435, "step": 4855, "time_per_iteration": 2.576406955718994 }, { "auxiliary_loss_clip": 0.00994731, "auxiliary_loss_mlp": 0.01020513, "balance_loss_clip": 1.00937223, "balance_loss_mlp": 1.01839066, "epoch": 0.5838994769434257, "flos": 65995480765440.0, "grad_norm": 1.0161834154503333, "language_loss": 0.65091074, "learning_rate": 1.5573452801258545e-06, "loss": 0.67106318, "num_input_tokens_seen": 104694605, "step": 4856, "time_per_iteration": 3.1717045307159424 }, { "auxiliary_loss_clip": 0.01135439, "auxiliary_loss_mlp": 0.01040308, "balance_loss_clip": 1.046561, "balance_loss_mlp": 1.02317727, "epoch": 0.5840197198340649, "flos": 21470523546240.0, "grad_norm": 3.4307606047561947, "language_loss": 0.63849109, "learning_rate": 1.5565856609053824e-06, "loss": 0.66024852, "num_input_tokens_seen": 104713400, "step": 4857, "time_per_iteration": 2.8810689449310303 }, { "auxiliary_loss_clip": 0.01143408, "auxiliary_loss_mlp": 0.01044974, "balance_loss_clip": 1.04738545, "balance_loss_mlp": 1.02758181, "epoch": 0.5841399627247039, "flos": 19135144984320.0, "grad_norm": 1.9794095542137042, "language_loss": 0.79927498, "learning_rate": 1.5558261089505127e-06, "loss": 0.82115883, "num_input_tokens_seen": 104732130, "step": 4858, "time_per_iteration": 2.552652597427368 }, { "auxiliary_loss_clip": 0.01130675, "auxiliary_loss_mlp": 0.01042335, "balance_loss_clip": 1.04634929, "balance_loss_mlp": 1.02540708, "epoch": 0.584260205615343, "flos": 26425692558720.0, "grad_norm": 1.9249092890848636, "language_loss": 0.80270326, "learning_rate": 1.5550666243764697e-06, "loss": 0.82443339, "num_input_tokens_seen": 104750290, "step": 4859, "time_per_iteration": 2.67374324798584 }, { "auxiliary_loss_clip": 0.01130622, "auxiliary_loss_mlp": 0.01035231, "balance_loss_clip": 1.04592156, "balance_loss_mlp": 1.01911438, "epoch": 0.584380448505982, "flos": 13881809174400.0, "grad_norm": 2.2092235717311577, "language_loss": 0.77023888, "learning_rate": 1.554307207298465e-06, "loss": 0.79189742, "num_input_tokens_seen": 104768550, "step": 4860, "time_per_iteration": 2.585594892501831 }, { "auxiliary_loss_clip": 0.01145248, "auxiliary_loss_mlp": 0.01041959, "balance_loss_clip": 1.04770291, "balance_loss_mlp": 1.02578855, "epoch": 0.5845006913966212, "flos": 21543709507200.0, "grad_norm": 2.6241900314100497, "language_loss": 0.78379095, "learning_rate": 1.553547857831704e-06, "loss": 0.80566299, "num_input_tokens_seen": 104785060, "step": 4861, "time_per_iteration": 2.5760350227355957 }, { "auxiliary_loss_clip": 0.01042221, "auxiliary_loss_mlp": 0.01005732, "balance_loss_clip": 1.01201653, "balance_loss_mlp": 1.00407469, "epoch": 0.5846209342872603, "flos": 58375452712320.0, "grad_norm": 0.8775921422177664, "language_loss": 0.6417861, "learning_rate": 1.5527885760913771e-06, "loss": 0.66226566, "num_input_tokens_seen": 104834950, "step": 4862, "time_per_iteration": 2.9959933757781982 }, { "auxiliary_loss_clip": 0.01114023, "auxiliary_loss_mlp": 0.01036805, "balance_loss_clip": 1.04393983, "balance_loss_mlp": 1.02103364, "epoch": 0.5847411771778993, "flos": 18588045957120.0, "grad_norm": 1.542041661483348, "language_loss": 0.76591003, "learning_rate": 1.552029362192668e-06, "loss": 0.78741831, "num_input_tokens_seen": 104854210, "step": 4863, "time_per_iteration": 2.606381416320801 }, { "auxiliary_loss_clip": 0.01101151, "auxiliary_loss_mlp": 0.01044132, "balance_loss_clip": 1.04178929, "balance_loss_mlp": 1.02635765, "epoch": 0.5848614200685385, "flos": 24240780069120.0, "grad_norm": 2.4498699639156705, "language_loss": 0.73351371, "learning_rate": 1.5512702162507478e-06, "loss": 0.75496662, "num_input_tokens_seen": 104874525, "step": 4864, "time_per_iteration": 2.7007288932800293 }, { "auxiliary_loss_clip": 0.01024822, "auxiliary_loss_mlp": 0.01001649, "balance_loss_clip": 1.01216245, "balance_loss_mlp": 0.99983722, "epoch": 0.5849816629591775, "flos": 71660245933440.0, "grad_norm": 1.097257654827365, "language_loss": 0.55774826, "learning_rate": 1.5505111383807792e-06, "loss": 0.578013, "num_input_tokens_seen": 104937195, "step": 4865, "time_per_iteration": 3.2672457695007324 }, { "auxiliary_loss_clip": 0.01095775, "auxiliary_loss_mlp": 0.01042503, "balance_loss_clip": 1.0420804, "balance_loss_mlp": 1.026088, "epoch": 0.5851019058498166, "flos": 23802095266560.0, "grad_norm": 1.9358322957070717, "language_loss": 0.80760944, "learning_rate": 1.5497521286979138e-06, "loss": 0.82899225, "num_input_tokens_seen": 104957435, "step": 4866, "time_per_iteration": 2.7218902111053467 }, { "auxiliary_loss_clip": 0.01105505, "auxiliary_loss_mlp": 0.0104388, "balance_loss_clip": 1.04068744, "balance_loss_mlp": 1.02555788, "epoch": 0.5852221487404557, "flos": 24388516707840.0, "grad_norm": 2.169092058263571, "language_loss": 0.74495631, "learning_rate": 1.5489931873172927e-06, "loss": 0.76645017, "num_input_tokens_seen": 104978755, "step": 4867, "time_per_iteration": 2.723672866821289 }, { "auxiliary_loss_clip": 0.01060062, "auxiliary_loss_mlp": 0.0104409, "balance_loss_clip": 1.03449893, "balance_loss_mlp": 1.02619684, "epoch": 0.5853423916310948, "flos": 27271425260160.0, "grad_norm": 1.76341882976863, "language_loss": 0.79088128, "learning_rate": 1.5482343143540467e-06, "loss": 0.81192279, "num_input_tokens_seen": 105000020, "step": 4868, "time_per_iteration": 3.7008583545684814 }, { "auxiliary_loss_clip": 0.01104593, "auxiliary_loss_mlp": 0.00772914, "balance_loss_clip": 1.04273808, "balance_loss_mlp": 1.00061464, "epoch": 0.5854626345217339, "flos": 11983786611840.0, "grad_norm": 2.370159548713836, "language_loss": 0.8301813, "learning_rate": 1.547475509923295e-06, "loss": 0.84895635, "num_input_tokens_seen": 105017060, "step": 4869, "time_per_iteration": 2.6807327270507812 }, { "auxiliary_loss_clip": 0.01004702, "auxiliary_loss_mlp": 0.01004924, "balance_loss_clip": 1.00982881, "balance_loss_mlp": 1.00295734, "epoch": 0.585582877412373, "flos": 64342335173760.0, "grad_norm": 0.7225648345871838, "language_loss": 0.55992633, "learning_rate": 1.5467167741401495e-06, "loss": 0.58002257, "num_input_tokens_seen": 105078540, "step": 4870, "time_per_iteration": 4.246856212615967 }, { "auxiliary_loss_clip": 0.01114915, "auxiliary_loss_mlp": 0.01036548, "balance_loss_clip": 1.04263055, "balance_loss_mlp": 1.01975131, "epoch": 0.5857031203030121, "flos": 17011926103680.0, "grad_norm": 2.0875291977175277, "language_loss": 0.71317983, "learning_rate": 1.5459581071197083e-06, "loss": 0.73469448, "num_input_tokens_seen": 105094200, "step": 4871, "time_per_iteration": 2.6397933959960938 }, { "auxiliary_loss_clip": 0.01134676, "auxiliary_loss_mlp": 0.01047743, "balance_loss_clip": 1.04885364, "balance_loss_mlp": 1.03033876, "epoch": 0.5858233631936511, "flos": 20885682303360.0, "grad_norm": 2.2650698959952815, "language_loss": 0.83471203, "learning_rate": 1.5451995089770624e-06, "loss": 0.85653621, "num_input_tokens_seen": 105113985, "step": 4872, "time_per_iteration": 2.5847461223602295 }, { "auxiliary_loss_clip": 0.01139857, "auxiliary_loss_mlp": 0.01036544, "balance_loss_clip": 1.04676867, "balance_loss_mlp": 1.0204922, "epoch": 0.5859436060842903, "flos": 23191902000000.0, "grad_norm": 1.3110526716842383, "language_loss": 0.71876872, "learning_rate": 1.5444409798272885e-06, "loss": 0.74053264, "num_input_tokens_seen": 105138075, "step": 4873, "time_per_iteration": 2.6905648708343506 }, { "auxiliary_loss_clip": 0.01103517, "auxiliary_loss_mlp": 0.010477, "balance_loss_clip": 1.04093158, "balance_loss_mlp": 1.02973485, "epoch": 0.5860638489749294, "flos": 22492648961280.0, "grad_norm": 1.8504026373604467, "language_loss": 0.8048842, "learning_rate": 1.543682519785456e-06, "loss": 0.82639635, "num_input_tokens_seen": 105156555, "step": 4874, "time_per_iteration": 2.6727066040039062 }, { "auxiliary_loss_clip": 0.01113162, "auxiliary_loss_mlp": 0.01040069, "balance_loss_clip": 1.04312587, "balance_loss_mlp": 1.02260518, "epoch": 0.5861840918655684, "flos": 17566243764480.0, "grad_norm": 2.8038980449797877, "language_loss": 0.80169773, "learning_rate": 1.5429241289666219e-06, "loss": 0.82322997, "num_input_tokens_seen": 105174055, "step": 4875, "time_per_iteration": 3.8037662506103516 }, { "auxiliary_loss_clip": 0.01106017, "auxiliary_loss_mlp": 0.01042353, "balance_loss_clip": 1.04042006, "balance_loss_mlp": 1.02478194, "epoch": 0.5863043347562076, "flos": 25556152118400.0, "grad_norm": 2.560064555894422, "language_loss": 0.69645143, "learning_rate": 1.5421658074858342e-06, "loss": 0.71793509, "num_input_tokens_seen": 105192160, "step": 4876, "time_per_iteration": 2.6795897483825684 }, { "auxiliary_loss_clip": 0.01113891, "auxiliary_loss_mlp": 0.01047008, "balance_loss_clip": 1.04386401, "balance_loss_mlp": 1.0301038, "epoch": 0.5864245776468466, "flos": 20667525050880.0, "grad_norm": 2.2212014185164968, "language_loss": 0.66251826, "learning_rate": 1.5414075554581298e-06, "loss": 0.68412727, "num_input_tokens_seen": 105210205, "step": 4877, "time_per_iteration": 2.595027446746826 }, { "auxiliary_loss_clip": 0.01143316, "auxiliary_loss_mlp": 0.01040924, "balance_loss_clip": 1.04742873, "balance_loss_mlp": 1.02417505, "epoch": 0.5865448205374857, "flos": 28913907490560.0, "grad_norm": 2.6948981925089486, "language_loss": 0.7834543, "learning_rate": 1.5406493729985348e-06, "loss": 0.80529672, "num_input_tokens_seen": 105229400, "step": 4878, "time_per_iteration": 3.522690773010254 }, { "auxiliary_loss_clip": 0.01097681, "auxiliary_loss_mlp": 0.00774107, "balance_loss_clip": 1.04684699, "balance_loss_mlp": 1.00077653, "epoch": 0.5866650634281249, "flos": 25842575168640.0, "grad_norm": 3.519574622480554, "language_loss": 0.72236073, "learning_rate": 1.5398912602220644e-06, "loss": 0.74107862, "num_input_tokens_seen": 105248675, "step": 4879, "time_per_iteration": 2.726813316345215 }, { "auxiliary_loss_clip": 0.01105012, "auxiliary_loss_mlp": 0.01044459, "balance_loss_clip": 1.04597259, "balance_loss_mlp": 1.02767467, "epoch": 0.5867853063187639, "flos": 17052325925760.0, "grad_norm": 2.062803122225902, "language_loss": 0.78690183, "learning_rate": 1.539133217243724e-06, "loss": 0.80839652, "num_input_tokens_seen": 105265695, "step": 4880, "time_per_iteration": 2.696218729019165 }, { "auxiliary_loss_clip": 0.01110667, "auxiliary_loss_mlp": 0.01049159, "balance_loss_clip": 1.04361546, "balance_loss_mlp": 1.02999043, "epoch": 0.586905549209403, "flos": 24645026707200.0, "grad_norm": 2.135276629006151, "language_loss": 0.76247251, "learning_rate": 1.5383752441785081e-06, "loss": 0.78407073, "num_input_tokens_seen": 105284920, "step": 4881, "time_per_iteration": 2.7426540851593018 }, { "auxiliary_loss_clip": 0.01135546, "auxiliary_loss_mlp": 0.01045495, "balance_loss_clip": 1.04589093, "balance_loss_mlp": 1.02786374, "epoch": 0.5870257921000421, "flos": 14720538723840.0, "grad_norm": 2.1175855751746475, "language_loss": 0.85364318, "learning_rate": 1.5376173411414003e-06, "loss": 0.87545359, "num_input_tokens_seen": 105302960, "step": 4882, "time_per_iteration": 2.5890355110168457 }, { "auxiliary_loss_clip": 0.01113246, "auxiliary_loss_mlp": 0.01047284, "balance_loss_clip": 1.04069662, "balance_loss_mlp": 1.03115535, "epoch": 0.5871460349906812, "flos": 23914998691200.0, "grad_norm": 1.9781572827832787, "language_loss": 0.7877692, "learning_rate": 1.5368595082473753e-06, "loss": 0.80937457, "num_input_tokens_seen": 105321260, "step": 4883, "time_per_iteration": 2.7093734741210938 }, { "auxiliary_loss_clip": 0.01129647, "auxiliary_loss_mlp": 0.0103939, "balance_loss_clip": 1.0454179, "balance_loss_mlp": 1.02253401, "epoch": 0.5872662778813202, "flos": 22164174063360.0, "grad_norm": 1.822121890019935, "language_loss": 0.78083229, "learning_rate": 1.5361017456113935e-06, "loss": 0.80252266, "num_input_tokens_seen": 105341610, "step": 4884, "time_per_iteration": 2.6275970935821533 }, { "auxiliary_loss_clip": 0.01131049, "auxiliary_loss_mlp": 0.01043465, "balance_loss_clip": 1.04470742, "balance_loss_mlp": 1.0265255, "epoch": 0.5873865207719594, "flos": 18441925430400.0, "grad_norm": 2.4519742345869253, "language_loss": 0.85919368, "learning_rate": 1.5353440533484085e-06, "loss": 0.88093877, "num_input_tokens_seen": 105360465, "step": 4885, "time_per_iteration": 2.594869375228882 }, { "auxiliary_loss_clip": 0.01123187, "auxiliary_loss_mlp": 0.01051174, "balance_loss_clip": 1.0477457, "balance_loss_mlp": 1.03308964, "epoch": 0.5875067636625985, "flos": 54015321427200.0, "grad_norm": 1.768038335670267, "language_loss": 0.66398519, "learning_rate": 1.534586431573361e-06, "loss": 0.68572879, "num_input_tokens_seen": 105385405, "step": 4886, "time_per_iteration": 2.956080436706543 }, { "auxiliary_loss_clip": 0.01079374, "auxiliary_loss_mlp": 0.01051555, "balance_loss_clip": 1.03740609, "balance_loss_mlp": 1.03386497, "epoch": 0.5876270065532375, "flos": 27995707100160.0, "grad_norm": 1.8967925420780354, "language_loss": 0.79026026, "learning_rate": 1.5338288804011817e-06, "loss": 0.81156957, "num_input_tokens_seen": 105404905, "step": 4887, "time_per_iteration": 2.7865419387817383 }, { "auxiliary_loss_clip": 0.01114038, "auxiliary_loss_mlp": 0.01036936, "balance_loss_clip": 1.04270339, "balance_loss_mlp": 1.01955557, "epoch": 0.5877472494438767, "flos": 21361462876800.0, "grad_norm": 2.145727656358421, "language_loss": 0.70964277, "learning_rate": 1.533071399946791e-06, "loss": 0.73115253, "num_input_tokens_seen": 105423650, "step": 4888, "time_per_iteration": 2.6793689727783203 }, { "auxiliary_loss_clip": 0.01116668, "auxiliary_loss_mlp": 0.0103978, "balance_loss_clip": 1.04247546, "balance_loss_mlp": 1.02324522, "epoch": 0.5878674923345157, "flos": 22383013674240.0, "grad_norm": 1.8701528091692876, "language_loss": 0.57238472, "learning_rate": 1.5323139903250977e-06, "loss": 0.5939492, "num_input_tokens_seen": 105444255, "step": 4889, "time_per_iteration": 2.6715030670166016 }, { "auxiliary_loss_clip": 0.01122648, "auxiliary_loss_mlp": 0.01040124, "balance_loss_clip": 1.04927373, "balance_loss_mlp": 1.02332711, "epoch": 0.5879877352251548, "flos": 21868664872320.0, "grad_norm": 1.6858043991892304, "language_loss": 0.76980406, "learning_rate": 1.5315566516510002e-06, "loss": 0.79143178, "num_input_tokens_seen": 105462425, "step": 4890, "time_per_iteration": 2.6603636741638184 }, { "auxiliary_loss_clip": 0.01140425, "auxiliary_loss_mlp": 0.01048953, "balance_loss_clip": 1.04714489, "balance_loss_mlp": 1.03247833, "epoch": 0.5881079781157939, "flos": 17493811989120.0, "grad_norm": 1.9134821740517618, "language_loss": 0.67801207, "learning_rate": 1.5307993840393857e-06, "loss": 0.69990587, "num_input_tokens_seen": 105480505, "step": 4891, "time_per_iteration": 2.5852432250976562 }, { "auxiliary_loss_clip": 0.01139551, "auxiliary_loss_mlp": 0.01034749, "balance_loss_clip": 1.04523778, "balance_loss_mlp": 1.01952565, "epoch": 0.588228221006433, "flos": 22601853285120.0, "grad_norm": 1.9352612280972934, "language_loss": 0.80539024, "learning_rate": 1.530042187605132e-06, "loss": 0.8271333, "num_input_tokens_seen": 105499760, "step": 4892, "time_per_iteration": 2.5883493423461914 }, { "auxiliary_loss_clip": 0.01127889, "auxiliary_loss_mlp": 0.00772136, "balance_loss_clip": 1.04568326, "balance_loss_mlp": 1.00085425, "epoch": 0.5883484638970721, "flos": 26176939896960.0, "grad_norm": 5.9900278136752245, "language_loss": 0.84525859, "learning_rate": 1.5292850624631044e-06, "loss": 0.86425877, "num_input_tokens_seen": 105521955, "step": 4893, "time_per_iteration": 2.6775739192962646 }, { "auxiliary_loss_clip": 0.01124296, "auxiliary_loss_mlp": 0.01049891, "balance_loss_clip": 1.04699993, "balance_loss_mlp": 1.0337615, "epoch": 0.5884687067877111, "flos": 30443737691520.0, "grad_norm": 2.4928640835997644, "language_loss": 0.80492842, "learning_rate": 1.5285280087281593e-06, "loss": 0.82667029, "num_input_tokens_seen": 105542685, "step": 4894, "time_per_iteration": 3.6115529537200928 }, { "auxiliary_loss_clip": 0.01027448, "auxiliary_loss_mlp": 0.01013335, "balance_loss_clip": 1.01600456, "balance_loss_mlp": 1.01172543, "epoch": 0.5885889496783503, "flos": 70507550580480.0, "grad_norm": 0.6435253398813304, "language_loss": 0.56571412, "learning_rate": 1.5277710265151398e-06, "loss": 0.58612192, "num_input_tokens_seen": 105612165, "step": 4895, "time_per_iteration": 3.3761110305786133 }, { "auxiliary_loss_clip": 0.011303, "auxiliary_loss_mlp": 0.01044701, "balance_loss_clip": 1.04466176, "balance_loss_mlp": 1.02677178, "epoch": 0.5887091925689893, "flos": 19098767485440.0, "grad_norm": 3.621692471730713, "language_loss": 0.77774888, "learning_rate": 1.5270141159388803e-06, "loss": 0.79949892, "num_input_tokens_seen": 105629185, "step": 4896, "time_per_iteration": 3.5162229537963867 }, { "auxiliary_loss_clip": 0.01139156, "auxiliary_loss_mlp": 0.01042848, "balance_loss_clip": 1.04340219, "balance_loss_mlp": 1.02671862, "epoch": 0.5888294354596284, "flos": 23294282739840.0, "grad_norm": 1.6491581306914787, "language_loss": 0.80496156, "learning_rate": 1.526257277114203e-06, "loss": 0.82678157, "num_input_tokens_seen": 105650260, "step": 4897, "time_per_iteration": 2.5952866077423096 }, { "auxiliary_loss_clip": 0.01113797, "auxiliary_loss_mlp": 0.0103797, "balance_loss_clip": 1.04424119, "balance_loss_mlp": 1.02193666, "epoch": 0.5889496783502676, "flos": 21981532383360.0, "grad_norm": 1.9744134319586082, "language_loss": 0.79827255, "learning_rate": 1.5255005101559201e-06, "loss": 0.81979018, "num_input_tokens_seen": 105667870, "step": 4898, "time_per_iteration": 2.6559853553771973 }, { "auxiliary_loss_clip": 0.0112972, "auxiliary_loss_mlp": 0.01037701, "balance_loss_clip": 1.04556322, "balance_loss_mlp": 1.02192998, "epoch": 0.5890699212409066, "flos": 21685233093120.0, "grad_norm": 2.830167570918618, "language_loss": 0.76806808, "learning_rate": 1.524743815178833e-06, "loss": 0.78974223, "num_input_tokens_seen": 105685830, "step": 4899, "time_per_iteration": 2.607457160949707 }, { "auxiliary_loss_clip": 0.0111662, "auxiliary_loss_mlp": 0.01039819, "balance_loss_clip": 1.04405808, "balance_loss_mlp": 1.02428579, "epoch": 0.5891901641315457, "flos": 19464553635840.0, "grad_norm": 1.7233621257079623, "language_loss": 0.8045733, "learning_rate": 1.5239871922977315e-06, "loss": 0.82613766, "num_input_tokens_seen": 105705745, "step": 4900, "time_per_iteration": 2.632934808731079 }, { "auxiliary_loss_clip": 0.01112398, "auxiliary_loss_mlp": 0.01045858, "balance_loss_clip": 1.04122758, "balance_loss_mlp": 1.02873993, "epoch": 0.5893104070221848, "flos": 19609884063360.0, "grad_norm": 2.0633115408264473, "language_loss": 0.89969301, "learning_rate": 1.523230641627394e-06, "loss": 0.92127562, "num_input_tokens_seen": 105724730, "step": 4901, "time_per_iteration": 3.6449644565582275 }, { "auxiliary_loss_clip": 0.01090734, "auxiliary_loss_mlp": 0.01041635, "balance_loss_clip": 1.03862548, "balance_loss_mlp": 1.02519619, "epoch": 0.5894306499128239, "flos": 29060063930880.0, "grad_norm": 2.189807975044674, "language_loss": 0.72919571, "learning_rate": 1.5224741632825888e-06, "loss": 0.75051945, "num_input_tokens_seen": 105744920, "step": 4902, "time_per_iteration": 2.7742109298706055 }, { "auxiliary_loss_clip": 0.01146283, "auxiliary_loss_mlp": 0.01047065, "balance_loss_clip": 1.04923701, "balance_loss_mlp": 1.02935028, "epoch": 0.589550892803463, "flos": 42298890721920.0, "grad_norm": 3.3735657610315615, "language_loss": 0.69400764, "learning_rate": 1.521717757378074e-06, "loss": 0.71594107, "num_input_tokens_seen": 105765465, "step": 4903, "time_per_iteration": 2.764451503753662 }, { "auxiliary_loss_clip": 0.01129747, "auxiliary_loss_mlp": 0.01037118, "balance_loss_clip": 1.04374623, "balance_loss_mlp": 1.02017879, "epoch": 0.5896711356941021, "flos": 14137062197760.0, "grad_norm": 1.8737315428172672, "language_loss": 0.68799508, "learning_rate": 1.5209614240285943e-06, "loss": 0.70966375, "num_input_tokens_seen": 105783120, "step": 4904, "time_per_iteration": 3.5103931427001953 }, { "auxiliary_loss_clip": 0.0113836, "auxiliary_loss_mlp": 0.0077283, "balance_loss_clip": 1.04437971, "balance_loss_mlp": 1.00075698, "epoch": 0.5897913785847412, "flos": 17201355454080.0, "grad_norm": 2.1301700975127003, "language_loss": 0.84987998, "learning_rate": 1.520205163348887e-06, "loss": 0.86899185, "num_input_tokens_seen": 105801055, "step": 4905, "time_per_iteration": 2.5533671379089355 }, { "auxiliary_loss_clip": 0.01021445, "auxiliary_loss_mlp": 0.01000795, "balance_loss_clip": 1.01649833, "balance_loss_mlp": 0.99905413, "epoch": 0.5899116214753802, "flos": 48794164202880.0, "grad_norm": 0.7233486833439121, "language_loss": 0.56967229, "learning_rate": 1.519448975453674e-06, "loss": 0.58989471, "num_input_tokens_seen": 105856155, "step": 4906, "time_per_iteration": 3.0708699226379395 }, { "auxiliary_loss_clip": 0.01131813, "auxiliary_loss_mlp": 0.00773319, "balance_loss_clip": 1.0449245, "balance_loss_mlp": 1.00078821, "epoch": 0.5900318643660194, "flos": 21103659987840.0, "grad_norm": 2.0903789313944183, "language_loss": 0.75969076, "learning_rate": 1.5186928604576696e-06, "loss": 0.77874207, "num_input_tokens_seen": 105873350, "step": 4907, "time_per_iteration": 2.617318630218506 }, { "auxiliary_loss_clip": 0.01116617, "auxiliary_loss_mlp": 0.01041368, "balance_loss_clip": 1.04342115, "balance_loss_mlp": 1.02413011, "epoch": 0.5901521072566585, "flos": 21178390233600.0, "grad_norm": 2.144200340727848, "language_loss": 0.77074116, "learning_rate": 1.5179368184755752e-06, "loss": 0.79232103, "num_input_tokens_seen": 105891435, "step": 4908, "time_per_iteration": 2.649034261703491 }, { "auxiliary_loss_clip": 0.01115546, "auxiliary_loss_mlp": 0.01039987, "balance_loss_clip": 1.04435015, "balance_loss_mlp": 1.02411985, "epoch": 0.5902723501472975, "flos": 20225967160320.0, "grad_norm": 1.9416791433371359, "language_loss": 0.82679677, "learning_rate": 1.5171808496220821e-06, "loss": 0.84835213, "num_input_tokens_seen": 105910190, "step": 4909, "time_per_iteration": 2.6620285511016846 }, { "auxiliary_loss_clip": 0.01120467, "auxiliary_loss_mlp": 0.01040213, "balance_loss_clip": 1.04452312, "balance_loss_mlp": 1.02407169, "epoch": 0.5903925930379367, "flos": 22964407211520.0, "grad_norm": 1.678166915146803, "language_loss": 0.81146276, "learning_rate": 1.5164249540118708e-06, "loss": 0.83306956, "num_input_tokens_seen": 105929315, "step": 4910, "time_per_iteration": 2.639591932296753 }, { "auxiliary_loss_clip": 0.01082331, "auxiliary_loss_mlp": 0.01036259, "balance_loss_clip": 1.0408839, "balance_loss_mlp": 1.02011776, "epoch": 0.5905128359285757, "flos": 23367720096000.0, "grad_norm": 2.088182233872284, "language_loss": 0.83333516, "learning_rate": 1.5156691317596093e-06, "loss": 0.8545211, "num_input_tokens_seen": 105950740, "step": 4911, "time_per_iteration": 2.844374656677246 }, { "auxiliary_loss_clip": 0.01129755, "auxiliary_loss_mlp": 0.00772609, "balance_loss_clip": 1.04502201, "balance_loss_mlp": 1.00071061, "epoch": 0.5906330788192148, "flos": 28032335994240.0, "grad_norm": 2.010503752099926, "language_loss": 0.66403121, "learning_rate": 1.5149133829799556e-06, "loss": 0.68305486, "num_input_tokens_seen": 105968735, "step": 4912, "time_per_iteration": 2.691091299057007 }, { "auxiliary_loss_clip": 0.01122909, "auxiliary_loss_mlp": 0.01041427, "balance_loss_clip": 1.046031, "balance_loss_mlp": 1.02554822, "epoch": 0.590753321709854, "flos": 18477943793280.0, "grad_norm": 4.672023560381534, "language_loss": 0.80968064, "learning_rate": 1.5141577077875556e-06, "loss": 0.83132398, "num_input_tokens_seen": 105986060, "step": 4913, "time_per_iteration": 2.6694648265838623 }, { "auxiliary_loss_clip": 0.01130574, "auxiliary_loss_mlp": 0.01044363, "balance_loss_clip": 1.04585171, "balance_loss_mlp": 1.0281744, "epoch": 0.590873564600493, "flos": 16873706568960.0, "grad_norm": 2.0183847223204556, "language_loss": 0.72621977, "learning_rate": 1.5134021062970451e-06, "loss": 0.74796915, "num_input_tokens_seen": 106004440, "step": 4914, "time_per_iteration": 2.5738348960876465 }, { "auxiliary_loss_clip": 0.01091051, "auxiliary_loss_mlp": 0.01038997, "balance_loss_clip": 1.04142714, "balance_loss_mlp": 1.02342784, "epoch": 0.5909938074911321, "flos": 13516166678400.0, "grad_norm": 2.5426967720297324, "language_loss": 0.81198978, "learning_rate": 1.5126465786230483e-06, "loss": 0.83329028, "num_input_tokens_seen": 106021215, "step": 4915, "time_per_iteration": 2.6682028770446777 }, { "auxiliary_loss_clip": 0.01140277, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.04537201, "balance_loss_mlp": 1.02500987, "epoch": 0.5911140503817712, "flos": 26024067613440.0, "grad_norm": 2.018623616998467, "language_loss": 0.82181537, "learning_rate": 1.5118911248801787e-06, "loss": 0.84362924, "num_input_tokens_seen": 106039225, "step": 4916, "time_per_iteration": 2.594034433364868 }, { "auxiliary_loss_clip": 0.01124766, "auxiliary_loss_mlp": 0.01036946, "balance_loss_clip": 1.04433072, "balance_loss_mlp": 1.02113867, "epoch": 0.5912342932724103, "flos": 23258731253760.0, "grad_norm": 2.07407058582315, "language_loss": 0.7985602, "learning_rate": 1.5111357451830364e-06, "loss": 0.82017732, "num_input_tokens_seen": 106057920, "step": 4917, "time_per_iteration": 2.6347720623016357 }, { "auxiliary_loss_clip": 0.01127923, "auxiliary_loss_mlp": 0.01043907, "balance_loss_clip": 1.0453403, "balance_loss_mlp": 1.02720547, "epoch": 0.5913545361630493, "flos": 19573039687680.0, "grad_norm": 1.8631798231251921, "language_loss": 0.7126056, "learning_rate": 1.5103804396462131e-06, "loss": 0.73432386, "num_input_tokens_seen": 106077855, "step": 4918, "time_per_iteration": 2.610779285430908 }, { "auxiliary_loss_clip": 0.0113154, "auxiliary_loss_mlp": 0.01050094, "balance_loss_clip": 1.04307318, "balance_loss_mlp": 1.03277302, "epoch": 0.5914747790536885, "flos": 26213532877440.0, "grad_norm": 2.1544360574600954, "language_loss": 0.80626798, "learning_rate": 1.5096252083842877e-06, "loss": 0.82808429, "num_input_tokens_seen": 106097065, "step": 4919, "time_per_iteration": 2.7169134616851807 }, { "auxiliary_loss_clip": 0.01127667, "auxiliary_loss_mlp": 0.01045089, "balance_loss_clip": 1.04263508, "balance_loss_mlp": 1.02646875, "epoch": 0.5915950219443276, "flos": 27417545786880.0, "grad_norm": 16.062418431093448, "language_loss": 0.85336065, "learning_rate": 1.5088700515118285e-06, "loss": 0.87508821, "num_input_tokens_seen": 106116385, "step": 4920, "time_per_iteration": 3.5320045948028564 }, { "auxiliary_loss_clip": 0.0110047, "auxiliary_loss_mlp": 0.01052629, "balance_loss_clip": 1.04310846, "balance_loss_mlp": 1.034235, "epoch": 0.5917152648349666, "flos": 21907879545600.0, "grad_norm": 1.5142834539128074, "language_loss": 0.6636399, "learning_rate": 1.508114969143392e-06, "loss": 0.68517089, "num_input_tokens_seen": 106136370, "step": 4921, "time_per_iteration": 2.7131500244140625 }, { "auxiliary_loss_clip": 0.01113176, "auxiliary_loss_mlp": 0.01036744, "balance_loss_clip": 1.04112411, "balance_loss_mlp": 1.02127671, "epoch": 0.5918355077256057, "flos": 28109185142400.0, "grad_norm": 2.5897836181750122, "language_loss": 0.77430582, "learning_rate": 1.5073599613935238e-06, "loss": 0.7958051, "num_input_tokens_seen": 106158490, "step": 4922, "time_per_iteration": 3.701831817626953 }, { "auxiliary_loss_clip": 0.01114651, "auxiliary_loss_mlp": 0.01038813, "balance_loss_clip": 1.04141366, "balance_loss_mlp": 1.02255321, "epoch": 0.5919557506162448, "flos": 28183807647360.0, "grad_norm": 2.1425035166665616, "language_loss": 0.57172358, "learning_rate": 1.5066050283767574e-06, "loss": 0.59325826, "num_input_tokens_seen": 106179170, "step": 4923, "time_per_iteration": 2.686781644821167 }, { "auxiliary_loss_clip": 0.01107649, "auxiliary_loss_mlp": 0.01052959, "balance_loss_clip": 1.04266644, "balance_loss_mlp": 1.03758073, "epoch": 0.5920759935068839, "flos": 12094355652480.0, "grad_norm": 2.394921970558122, "language_loss": 0.82737833, "learning_rate": 1.505850170207616e-06, "loss": 0.84898448, "num_input_tokens_seen": 106196035, "step": 4924, "time_per_iteration": 2.6512248516082764 }, { "auxiliary_loss_clip": 0.01112918, "auxiliary_loss_mlp": 0.01039698, "balance_loss_clip": 1.0416441, "balance_loss_mlp": 1.0237056, "epoch": 0.592196236397523, "flos": 29424772673280.0, "grad_norm": 2.124093669537776, "language_loss": 0.78240311, "learning_rate": 1.505095387000611e-06, "loss": 0.80392927, "num_input_tokens_seen": 106218335, "step": 4925, "time_per_iteration": 2.7020697593688965 }, { "auxiliary_loss_clip": 0.01109081, "auxiliary_loss_mlp": 0.01049568, "balance_loss_clip": 1.0437746, "balance_loss_mlp": 1.03283072, "epoch": 0.5923164792881621, "flos": 24384709866240.0, "grad_norm": 2.221088125810156, "language_loss": 0.74824208, "learning_rate": 1.504340678870242e-06, "loss": 0.76982856, "num_input_tokens_seen": 106236550, "step": 4926, "time_per_iteration": 2.6830387115478516 }, { "auxiliary_loss_clip": 0.01126268, "auxiliary_loss_mlp": 0.01042306, "balance_loss_clip": 1.04385138, "balance_loss_mlp": 1.02585483, "epoch": 0.5924367221788012, "flos": 24024238928640.0, "grad_norm": 2.590580385639599, "language_loss": 0.89799219, "learning_rate": 1.5035860459309989e-06, "loss": 0.91967797, "num_input_tokens_seen": 106254265, "step": 4927, "time_per_iteration": 3.582547664642334 }, { "auxiliary_loss_clip": 0.01112608, "auxiliary_loss_mlp": 0.01047093, "balance_loss_clip": 1.04359984, "balance_loss_mlp": 1.02853251, "epoch": 0.5925569650694402, "flos": 26870590414080.0, "grad_norm": 2.421901455992474, "language_loss": 0.63322204, "learning_rate": 1.5028314882973568e-06, "loss": 0.65481901, "num_input_tokens_seen": 106274670, "step": 4928, "time_per_iteration": 2.693781614303589 }, { "auxiliary_loss_clip": 0.01116935, "auxiliary_loss_mlp": 0.01046036, "balance_loss_clip": 1.04477286, "balance_loss_mlp": 1.02839255, "epoch": 0.5926772079600794, "flos": 22302788647680.0, "grad_norm": 1.8884532550445041, "language_loss": 0.84593368, "learning_rate": 1.502077006083783e-06, "loss": 0.86756337, "num_input_tokens_seen": 106293330, "step": 4929, "time_per_iteration": 2.664766311645508 }, { "auxiliary_loss_clip": 0.01132156, "auxiliary_loss_mlp": 0.00772498, "balance_loss_clip": 1.0459578, "balance_loss_mlp": 1.00072777, "epoch": 0.5927974508507184, "flos": 19865244827520.0, "grad_norm": 2.3134045253196778, "language_loss": 0.76466608, "learning_rate": 1.5013225994047315e-06, "loss": 0.78371263, "num_input_tokens_seen": 106310960, "step": 4930, "time_per_iteration": 3.5182745456695557 }, { "auxiliary_loss_clip": 0.01132443, "auxiliary_loss_mlp": 0.0077353, "balance_loss_clip": 1.04819608, "balance_loss_mlp": 1.00075793, "epoch": 0.5929176937413575, "flos": 15776743167360.0, "grad_norm": 1.5782874537351648, "language_loss": 0.80749691, "learning_rate": 1.5005682683746452e-06, "loss": 0.82655668, "num_input_tokens_seen": 106329475, "step": 4931, "time_per_iteration": 2.6226327419281006 }, { "auxiliary_loss_clip": 0.01130652, "auxiliary_loss_mlp": 0.01040617, "balance_loss_clip": 1.04606283, "balance_loss_mlp": 1.02420187, "epoch": 0.5930379366319967, "flos": 17601472028160.0, "grad_norm": 2.144064808912914, "language_loss": 0.72911018, "learning_rate": 1.4998140131079553e-06, "loss": 0.75082284, "num_input_tokens_seen": 106345565, "step": 4932, "time_per_iteration": 2.596184492111206 }, { "auxiliary_loss_clip": 0.01075952, "auxiliary_loss_mlp": 0.00773316, "balance_loss_clip": 1.03812158, "balance_loss_mlp": 1.00066745, "epoch": 0.5931581795226357, "flos": 17704283731200.0, "grad_norm": 1.7525055149377609, "language_loss": 0.72991657, "learning_rate": 1.4990598337190821e-06, "loss": 0.74840927, "num_input_tokens_seen": 106361920, "step": 4933, "time_per_iteration": 2.756279468536377 }, { "auxiliary_loss_clip": 0.01137884, "auxiliary_loss_mlp": 0.00773351, "balance_loss_clip": 1.0447787, "balance_loss_mlp": 1.00077605, "epoch": 0.5932784224132748, "flos": 24280102483200.0, "grad_norm": 1.7602252121073552, "language_loss": 0.67729735, "learning_rate": 1.4983057303224338e-06, "loss": 0.6964097, "num_input_tokens_seen": 106381735, "step": 4934, "time_per_iteration": 2.590391159057617 }, { "auxiliary_loss_clip": 0.01091676, "auxiliary_loss_mlp": 0.01042852, "balance_loss_clip": 1.0400157, "balance_loss_mlp": 1.02595973, "epoch": 0.5933986653039139, "flos": 22926700909440.0, "grad_norm": 1.734478379268213, "language_loss": 0.87531388, "learning_rate": 1.4975517030324072e-06, "loss": 0.89665914, "num_input_tokens_seen": 106399745, "step": 4935, "time_per_iteration": 2.7167961597442627 }, { "auxiliary_loss_clip": 0.01041894, "auxiliary_loss_mlp": 0.00755928, "balance_loss_clip": 1.01146173, "balance_loss_mlp": 1.00077403, "epoch": 0.593518908194553, "flos": 71121730256640.0, "grad_norm": 0.7843324435871406, "language_loss": 0.61793613, "learning_rate": 1.4967977519633882e-06, "loss": 0.63591433, "num_input_tokens_seen": 106457205, "step": 4936, "time_per_iteration": 3.3151259422302246 }, { "auxiliary_loss_clip": 0.01101949, "auxiliary_loss_mlp": 0.01039493, "balance_loss_clip": 1.03954577, "balance_loss_mlp": 1.0234592, "epoch": 0.593639151085192, "flos": 20448649526400.0, "grad_norm": 2.2646203621917786, "language_loss": 0.78159916, "learning_rate": 1.4960438772297494e-06, "loss": 0.80301362, "num_input_tokens_seen": 106474250, "step": 4937, "time_per_iteration": 2.669635057449341 }, { "auxiliary_loss_clip": 0.01118935, "auxiliary_loss_mlp": 0.01040849, "balance_loss_clip": 1.0431478, "balance_loss_mlp": 1.02505326, "epoch": 0.5937593939758312, "flos": 30883428074880.0, "grad_norm": 2.5080837770378133, "language_loss": 0.7377007, "learning_rate": 1.495290078945855e-06, "loss": 0.75929856, "num_input_tokens_seen": 106494015, "step": 4938, "time_per_iteration": 2.7203638553619385 }, { "auxiliary_loss_clip": 0.01139882, "auxiliary_loss_mlp": 0.01041811, "balance_loss_clip": 1.04653776, "balance_loss_mlp": 1.02567041, "epoch": 0.5938796368664703, "flos": 36898069668480.0, "grad_norm": 3.506687525000573, "language_loss": 0.74374598, "learning_rate": 1.4945363572260529e-06, "loss": 0.76556289, "num_input_tokens_seen": 106515010, "step": 4939, "time_per_iteration": 2.700119972229004 }, { "auxiliary_loss_clip": 0.01129943, "auxiliary_loss_mlp": 0.01035893, "balance_loss_clip": 1.04502487, "balance_loss_mlp": 1.01912045, "epoch": 0.5939998797571093, "flos": 23842926051840.0, "grad_norm": 2.211548963540382, "language_loss": 0.67943347, "learning_rate": 1.4937827121846845e-06, "loss": 0.70109189, "num_input_tokens_seen": 106535265, "step": 4940, "time_per_iteration": 2.6233277320861816 }, { "auxiliary_loss_clip": 0.01096217, "auxiliary_loss_mlp": 0.01042189, "balance_loss_clip": 1.04267991, "balance_loss_mlp": 1.02561903, "epoch": 0.5941201226477485, "flos": 25191407462400.0, "grad_norm": 2.2207721748634763, "language_loss": 0.73752666, "learning_rate": 1.4930291439360755e-06, "loss": 0.75891072, "num_input_tokens_seen": 106557830, "step": 4941, "time_per_iteration": 2.7519991397857666 }, { "auxiliary_loss_clip": 0.01132949, "auxiliary_loss_mlp": 0.01048255, "balance_loss_clip": 1.04613602, "balance_loss_mlp": 1.03110111, "epoch": 0.5942403655383875, "flos": 22418996123520.0, "grad_norm": 2.1016790228981965, "language_loss": 0.79273522, "learning_rate": 1.4922756525945427e-06, "loss": 0.81454724, "num_input_tokens_seen": 106577140, "step": 4942, "time_per_iteration": 2.5873708724975586 }, { "auxiliary_loss_clip": 0.01032137, "auxiliary_loss_mlp": 0.01002594, "balance_loss_clip": 1.01064825, "balance_loss_mlp": 1.00092506, "epoch": 0.5943606084290266, "flos": 67629310796160.0, "grad_norm": 0.7712957803040968, "language_loss": 0.59556186, "learning_rate": 1.4915222382743894e-06, "loss": 0.61590922, "num_input_tokens_seen": 106635975, "step": 4943, "time_per_iteration": 3.338538408279419 }, { "auxiliary_loss_clip": 0.01132314, "auxiliary_loss_mlp": 0.01042703, "balance_loss_clip": 1.0482192, "balance_loss_mlp": 1.02501202, "epoch": 0.5944808513196658, "flos": 18223157646720.0, "grad_norm": 2.2897136173346158, "language_loss": 0.72243804, "learning_rate": 1.4907689010899085e-06, "loss": 0.74418819, "num_input_tokens_seen": 106653555, "step": 4944, "time_per_iteration": 2.5858306884765625 }, { "auxiliary_loss_clip": 0.01116113, "auxiliary_loss_mlp": 0.010335, "balance_loss_clip": 1.04272938, "balance_loss_mlp": 1.01721621, "epoch": 0.5946010942103048, "flos": 24790824011520.0, "grad_norm": 4.303631948339196, "language_loss": 0.6258446, "learning_rate": 1.4900156411553804e-06, "loss": 0.64734077, "num_input_tokens_seen": 106673385, "step": 4945, "time_per_iteration": 2.815586805343628 }, { "auxiliary_loss_clip": 0.01116312, "auxiliary_loss_mlp": 0.01043486, "balance_loss_clip": 1.04319239, "balance_loss_mlp": 1.02690363, "epoch": 0.5947213371009439, "flos": 15231619388160.0, "grad_norm": 2.572969448648799, "language_loss": 0.85266209, "learning_rate": 1.4892624585850739e-06, "loss": 0.87426007, "num_input_tokens_seen": 106691740, "step": 4946, "time_per_iteration": 3.651031494140625 }, { "auxiliary_loss_clip": 0.01142489, "auxiliary_loss_mlp": 0.01044162, "balance_loss_clip": 1.04672837, "balance_loss_mlp": 1.02692389, "epoch": 0.594841579991583, "flos": 25848069949440.0, "grad_norm": 1.9751974442885694, "language_loss": 0.79195601, "learning_rate": 1.4885093534932465e-06, "loss": 0.81382251, "num_input_tokens_seen": 106709705, "step": 4947, "time_per_iteration": 2.6407341957092285 }, { "auxiliary_loss_clip": 0.01114314, "auxiliary_loss_mlp": 0.01042339, "balance_loss_clip": 1.04251099, "balance_loss_mlp": 1.02507782, "epoch": 0.5949618228822221, "flos": 23981109672960.0, "grad_norm": 2.5991053440578122, "language_loss": 0.71236205, "learning_rate": 1.4877563259941433e-06, "loss": 0.73392856, "num_input_tokens_seen": 106727560, "step": 4948, "time_per_iteration": 3.718554735183716 }, { "auxiliary_loss_clip": 0.01135328, "auxiliary_loss_mlp": 0.0103938, "balance_loss_clip": 1.04788661, "balance_loss_mlp": 1.02326274, "epoch": 0.5950820657728612, "flos": 40547491476480.0, "grad_norm": 2.3915944986499884, "language_loss": 0.6780743, "learning_rate": 1.4870033762019988e-06, "loss": 0.69982135, "num_input_tokens_seen": 106747725, "step": 4949, "time_per_iteration": 2.7695302963256836 }, { "auxiliary_loss_clip": 0.01112425, "auxiliary_loss_mlp": 0.01053371, "balance_loss_clip": 1.04328203, "balance_loss_mlp": 1.03413057, "epoch": 0.5952023086635003, "flos": 23184467884800.0, "grad_norm": 3.074042685526469, "language_loss": 0.7338627, "learning_rate": 1.4862505042310334e-06, "loss": 0.75552058, "num_input_tokens_seen": 106767010, "step": 4950, "time_per_iteration": 2.655147075653076 }, { "auxiliary_loss_clip": 0.01108187, "auxiliary_loss_mlp": 0.01041877, "balance_loss_clip": 1.04328132, "balance_loss_mlp": 1.02648711, "epoch": 0.5953225515541394, "flos": 33653289548160.0, "grad_norm": 1.580520561045449, "language_loss": 0.69741774, "learning_rate": 1.4854977101954587e-06, "loss": 0.71891844, "num_input_tokens_seen": 106789230, "step": 4951, "time_per_iteration": 2.7383363246917725 }, { "auxiliary_loss_clip": 0.01128496, "auxiliary_loss_mlp": 0.01038828, "balance_loss_clip": 1.04250824, "balance_loss_mlp": 1.02259135, "epoch": 0.5954427944447784, "flos": 24459619680000.0, "grad_norm": 4.272470921337727, "language_loss": 0.8627907, "learning_rate": 1.4847449942094716e-06, "loss": 0.88446391, "num_input_tokens_seen": 106808110, "step": 4952, "time_per_iteration": 2.600903272628784 }, { "auxiliary_loss_clip": 0.0111021, "auxiliary_loss_mlp": 0.01041408, "balance_loss_clip": 1.04197359, "balance_loss_mlp": 1.02490973, "epoch": 0.5955630373354175, "flos": 18551848026240.0, "grad_norm": 2.198086682728047, "language_loss": 0.86073917, "learning_rate": 1.4839923563872598e-06, "loss": 0.88225532, "num_input_tokens_seen": 106826650, "step": 4953, "time_per_iteration": 2.6415493488311768 }, { "auxiliary_loss_clip": 0.01102322, "auxiliary_loss_mlp": 0.0104161, "balance_loss_clip": 1.04511118, "balance_loss_mlp": 1.02447963, "epoch": 0.5956832802260567, "flos": 19791699730560.0, "grad_norm": 1.7342466715187015, "language_loss": 0.76022798, "learning_rate": 1.483239796842997e-06, "loss": 0.78166723, "num_input_tokens_seen": 106844680, "step": 4954, "time_per_iteration": 3.623141050338745 }, { "auxiliary_loss_clip": 0.01101023, "auxiliary_loss_mlp": 0.01046779, "balance_loss_clip": 1.04183555, "balance_loss_mlp": 1.03050721, "epoch": 0.5958035231166957, "flos": 19750868945280.0, "grad_norm": 2.9593057737670443, "language_loss": 0.8384968, "learning_rate": 1.4824873156908462e-06, "loss": 0.8599748, "num_input_tokens_seen": 106862605, "step": 4955, "time_per_iteration": 2.6646416187286377 }, { "auxiliary_loss_clip": 0.01130689, "auxiliary_loss_mlp": 0.00773489, "balance_loss_clip": 1.04507422, "balance_loss_mlp": 1.00073171, "epoch": 0.5959237660073348, "flos": 21652806090240.0, "grad_norm": 1.5469707134948698, "language_loss": 0.75425047, "learning_rate": 1.4817349130449584e-06, "loss": 0.77329224, "num_input_tokens_seen": 106882325, "step": 4956, "time_per_iteration": 3.552190065383911 }, { "auxiliary_loss_clip": 0.01125006, "auxiliary_loss_mlp": 0.01042392, "balance_loss_clip": 1.04522967, "balance_loss_mlp": 1.02566719, "epoch": 0.5960440088979739, "flos": 21171207513600.0, "grad_norm": 2.704437462148499, "language_loss": 0.8292309, "learning_rate": 1.4809825890194717e-06, "loss": 0.85090494, "num_input_tokens_seen": 106900995, "step": 4957, "time_per_iteration": 2.614795207977295 }, { "auxiliary_loss_clip": 0.01107896, "auxiliary_loss_mlp": 0.01051409, "balance_loss_clip": 1.04183912, "balance_loss_mlp": 1.03471994, "epoch": 0.596164251788613, "flos": 14757526753920.0, "grad_norm": 1.8202970937749663, "language_loss": 0.77314049, "learning_rate": 1.4802303437285139e-06, "loss": 0.79473352, "num_input_tokens_seen": 106918265, "step": 4958, "time_per_iteration": 2.613260507583618 }, { "auxiliary_loss_clip": 0.01111738, "auxiliary_loss_mlp": 0.0104017, "balance_loss_clip": 1.0416193, "balance_loss_mlp": 1.02390993, "epoch": 0.596284494679252, "flos": 20485924865280.0, "grad_norm": 2.340468568828166, "language_loss": 0.80828857, "learning_rate": 1.4794781772861994e-06, "loss": 0.82980764, "num_input_tokens_seen": 106934760, "step": 4959, "time_per_iteration": 2.644012451171875 }, { "auxiliary_loss_clip": 0.01111914, "auxiliary_loss_mlp": 0.00772344, "balance_loss_clip": 1.04237986, "balance_loss_mlp": 1.00058305, "epoch": 0.5964047375698912, "flos": 31212262108800.0, "grad_norm": 4.658636131237198, "language_loss": 0.66342103, "learning_rate": 1.4787260898066324e-06, "loss": 0.68226361, "num_input_tokens_seen": 106954760, "step": 4960, "time_per_iteration": 2.7144508361816406 }, { "auxiliary_loss_clip": 0.01134924, "auxiliary_loss_mlp": 0.01035577, "balance_loss_clip": 1.04444039, "balance_loss_mlp": 1.01816022, "epoch": 0.5965249804605303, "flos": 27483620855040.0, "grad_norm": 2.0283084462003913, "language_loss": 0.85652936, "learning_rate": 1.4779740814039023e-06, "loss": 0.87823433, "num_input_tokens_seen": 106974845, "step": 4961, "time_per_iteration": 2.63877534866333 }, { "auxiliary_loss_clip": 0.01140674, "auxiliary_loss_mlp": 0.01043522, "balance_loss_clip": 1.04651523, "balance_loss_mlp": 1.02663565, "epoch": 0.5966452233511693, "flos": 30773936442240.0, "grad_norm": 1.8474485026017295, "language_loss": 0.68543541, "learning_rate": 1.4772221521920894e-06, "loss": 0.70727742, "num_input_tokens_seen": 106994870, "step": 4962, "time_per_iteration": 2.6546735763549805 }, { "auxiliary_loss_clip": 0.01111248, "auxiliary_loss_mlp": 0.01039891, "balance_loss_clip": 1.04392457, "balance_loss_mlp": 1.02406049, "epoch": 0.5967654662418085, "flos": 25481170477440.0, "grad_norm": 2.5092279486552784, "language_loss": 0.74097919, "learning_rate": 1.4764703022852598e-06, "loss": 0.76249063, "num_input_tokens_seen": 107015390, "step": 4963, "time_per_iteration": 2.669663190841675 }, { "auxiliary_loss_clip": 0.01061618, "auxiliary_loss_mlp": 0.01042, "balance_loss_clip": 1.03627312, "balance_loss_mlp": 1.02535856, "epoch": 0.5968857091324475, "flos": 19099126621440.0, "grad_norm": 2.570993081555532, "language_loss": 0.76675355, "learning_rate": 1.4757185317974696e-06, "loss": 0.78778976, "num_input_tokens_seen": 107033775, "step": 4964, "time_per_iteration": 2.715272903442383 }, { "auxiliary_loss_clip": 0.01132102, "auxiliary_loss_mlp": 0.01045965, "balance_loss_clip": 1.04579818, "balance_loss_mlp": 1.02834606, "epoch": 0.5970059520230866, "flos": 23692711374720.0, "grad_norm": 2.8813523931575387, "language_loss": 0.70711565, "learning_rate": 1.474966840842761e-06, "loss": 0.72889632, "num_input_tokens_seen": 107053355, "step": 4965, "time_per_iteration": 2.6380090713500977 }, { "auxiliary_loss_clip": 0.01127487, "auxiliary_loss_mlp": 0.01035682, "balance_loss_clip": 1.04351926, "balance_loss_mlp": 1.01907659, "epoch": 0.5971261949137258, "flos": 23185545292800.0, "grad_norm": 1.7813879283182426, "language_loss": 0.86934423, "learning_rate": 1.4742152295351655e-06, "loss": 0.89097601, "num_input_tokens_seen": 107072510, "step": 4966, "time_per_iteration": 2.6108620166778564 }, { "auxiliary_loss_clip": 0.01129172, "auxiliary_loss_mlp": 0.00773573, "balance_loss_clip": 1.04404402, "balance_loss_mlp": 1.00061369, "epoch": 0.5972464378043648, "flos": 20557710195840.0, "grad_norm": 4.906975440472979, "language_loss": 0.64215469, "learning_rate": 1.4734636979887016e-06, "loss": 0.66118211, "num_input_tokens_seen": 107089970, "step": 4967, "time_per_iteration": 2.623419761657715 }, { "auxiliary_loss_clip": 0.01111028, "auxiliary_loss_mlp": 0.01037063, "balance_loss_clip": 1.04386449, "balance_loss_mlp": 1.02068329, "epoch": 0.5973666806950039, "flos": 29387030457600.0, "grad_norm": 2.062482018170652, "language_loss": 0.89766347, "learning_rate": 1.4727122463173755e-06, "loss": 0.91914433, "num_input_tokens_seen": 107108500, "step": 4968, "time_per_iteration": 2.735989809036255 }, { "auxiliary_loss_clip": 0.01116914, "auxiliary_loss_mlp": 0.01039521, "balance_loss_clip": 1.04626846, "balance_loss_mlp": 1.02333283, "epoch": 0.597486923585643, "flos": 22273522041600.0, "grad_norm": 2.7066951097014087, "language_loss": 0.64356315, "learning_rate": 1.471960874635183e-06, "loss": 0.66512752, "num_input_tokens_seen": 107128060, "step": 4969, "time_per_iteration": 2.6354684829711914 }, { "auxiliary_loss_clip": 0.01110422, "auxiliary_loss_mlp": 0.01038391, "balance_loss_clip": 1.04180503, "balance_loss_mlp": 1.02089143, "epoch": 0.5976071664762821, "flos": 13772461196160.0, "grad_norm": 2.153689173533952, "language_loss": 0.70378488, "learning_rate": 1.4712095830561055e-06, "loss": 0.72527301, "num_input_tokens_seen": 107146550, "step": 4970, "time_per_iteration": 2.6142921447753906 }, { "auxiliary_loss_clip": 0.01112266, "auxiliary_loss_mlp": 0.01043936, "balance_loss_clip": 1.04113102, "balance_loss_mlp": 1.02843857, "epoch": 0.5977274093669211, "flos": 19098623831040.0, "grad_norm": 14.604623027118734, "language_loss": 0.81244099, "learning_rate": 1.4704583716941147e-06, "loss": 0.83400297, "num_input_tokens_seen": 107165415, "step": 4971, "time_per_iteration": 2.6321184635162354 }, { "auxiliary_loss_clip": 0.01123358, "auxiliary_loss_mlp": 0.01038617, "balance_loss_clip": 1.04753757, "balance_loss_mlp": 1.02145052, "epoch": 0.5978476522575603, "flos": 20376002269440.0, "grad_norm": 1.977510376920837, "language_loss": 0.72186875, "learning_rate": 1.4697072406631672e-06, "loss": 0.74348849, "num_input_tokens_seen": 107185320, "step": 4972, "time_per_iteration": 3.4607393741607666 }, { "auxiliary_loss_clip": 0.01097076, "auxiliary_loss_mlp": 0.01048349, "balance_loss_clip": 1.04498196, "balance_loss_mlp": 1.03062248, "epoch": 0.5979678951481994, "flos": 29023147728000.0, "grad_norm": 1.7455080609278661, "language_loss": 0.72653794, "learning_rate": 1.4689561900772097e-06, "loss": 0.74799216, "num_input_tokens_seen": 107205380, "step": 4973, "time_per_iteration": 2.7521262168884277 }, { "auxiliary_loss_clip": 0.01113781, "auxiliary_loss_mlp": 0.01045076, "balance_loss_clip": 1.04184091, "balance_loss_mlp": 1.02775455, "epoch": 0.5980881380388384, "flos": 17967689141760.0, "grad_norm": 2.3543219487504126, "language_loss": 0.72316879, "learning_rate": 1.4682052200501758e-06, "loss": 0.74475735, "num_input_tokens_seen": 107222585, "step": 4974, "time_per_iteration": 2.648902654647827 }, { "auxiliary_loss_clip": 0.01138622, "auxiliary_loss_mlp": 0.01037295, "balance_loss_clip": 1.04551339, "balance_loss_mlp": 1.02127337, "epoch": 0.5982083809294776, "flos": 22962827013120.0, "grad_norm": 1.954008455322104, "language_loss": 0.80417579, "learning_rate": 1.4674543306959876e-06, "loss": 0.82593495, "num_input_tokens_seen": 107242055, "step": 4975, "time_per_iteration": 3.580237627029419 }, { "auxiliary_loss_clip": 0.01123391, "auxiliary_loss_mlp": 0.01045839, "balance_loss_clip": 1.04565251, "balance_loss_mlp": 1.02864957, "epoch": 0.5983286238201166, "flos": 20991941712000.0, "grad_norm": 2.465723119474972, "language_loss": 0.84271836, "learning_rate": 1.4667035221285535e-06, "loss": 0.86441064, "num_input_tokens_seen": 107259695, "step": 4976, "time_per_iteration": 2.6347572803497314 }, { "auxiliary_loss_clip": 0.01120736, "auxiliary_loss_mlp": 0.01040444, "balance_loss_clip": 1.04280639, "balance_loss_mlp": 1.02364731, "epoch": 0.5984488667107557, "flos": 28183448511360.0, "grad_norm": 3.3866373225200546, "language_loss": 0.74025166, "learning_rate": 1.4659527944617715e-06, "loss": 0.76186347, "num_input_tokens_seen": 107279640, "step": 4977, "time_per_iteration": 2.648500919342041 }, { "auxiliary_loss_clip": 0.01075752, "auxiliary_loss_mlp": 0.01062847, "balance_loss_clip": 1.03877223, "balance_loss_mlp": 1.04453635, "epoch": 0.5985691096013949, "flos": 16471794314880.0, "grad_norm": 2.282942647416678, "language_loss": 0.75905132, "learning_rate": 1.465202147809526e-06, "loss": 0.78043723, "num_input_tokens_seen": 107298135, "step": 4978, "time_per_iteration": 2.7186107635498047 }, { "auxiliary_loss_clip": 0.0113987, "auxiliary_loss_mlp": 0.01039089, "balance_loss_clip": 1.04651225, "balance_loss_mlp": 1.02401531, "epoch": 0.5986893524920339, "flos": 26719046933760.0, "grad_norm": 5.243809699130933, "language_loss": 0.76321805, "learning_rate": 1.4644515822856888e-06, "loss": 0.7850076, "num_input_tokens_seen": 107316570, "step": 4979, "time_per_iteration": 3.533529281616211 }, { "auxiliary_loss_clip": 0.01018713, "auxiliary_loss_mlp": 0.01003809, "balance_loss_clip": 1.0150764, "balance_loss_mlp": 1.0018425, "epoch": 0.598809595382673, "flos": 61608061100160.0, "grad_norm": 0.7549476748757277, "language_loss": 0.56532866, "learning_rate": 1.4637010980041215e-06, "loss": 0.58555388, "num_input_tokens_seen": 107378680, "step": 4980, "time_per_iteration": 3.217824697494507 }, { "auxiliary_loss_clip": 0.01141227, "auxiliary_loss_mlp": 0.01048804, "balance_loss_clip": 1.04651165, "balance_loss_mlp": 1.03134036, "epoch": 0.5989298382733121, "flos": 11801719549440.0, "grad_norm": 2.401054489700388, "language_loss": 0.89751929, "learning_rate": 1.4629506950786707e-06, "loss": 0.91941959, "num_input_tokens_seen": 107394860, "step": 4981, "time_per_iteration": 2.4997923374176025 }, { "auxiliary_loss_clip": 0.01045461, "auxiliary_loss_mlp": 0.01006505, "balance_loss_clip": 1.01505017, "balance_loss_mlp": 1.00465703, "epoch": 0.5990500811639512, "flos": 60025800021120.0, "grad_norm": 0.8075967376380597, "language_loss": 0.5605104, "learning_rate": 1.4622003736231733e-06, "loss": 0.58103001, "num_input_tokens_seen": 107453850, "step": 4982, "time_per_iteration": 3.982708692550659 }, { "auxiliary_loss_clip": 0.01125096, "auxiliary_loss_mlp": 0.0104722, "balance_loss_clip": 1.04537296, "balance_loss_mlp": 1.03057814, "epoch": 0.5991703240545903, "flos": 18222726683520.0, "grad_norm": 1.8757706860066201, "language_loss": 0.80531877, "learning_rate": 1.461450133751451e-06, "loss": 0.82704186, "num_input_tokens_seen": 107471920, "step": 4983, "time_per_iteration": 2.4875328540802 }, { "auxiliary_loss_clip": 0.01127662, "auxiliary_loss_mlp": 0.01044621, "balance_loss_clip": 1.04447222, "balance_loss_mlp": 1.02842021, "epoch": 0.5992905669452293, "flos": 27709894581120.0, "grad_norm": 2.1043456258312268, "language_loss": 0.76179153, "learning_rate": 1.4606999755773153e-06, "loss": 0.78351432, "num_input_tokens_seen": 107493125, "step": 4984, "time_per_iteration": 2.6991090774536133 }, { "auxiliary_loss_clip": 0.01136754, "auxiliary_loss_mlp": 0.01039313, "balance_loss_clip": 1.04628825, "balance_loss_mlp": 1.02331543, "epoch": 0.5994108098358685, "flos": 20449008662400.0, "grad_norm": 1.5820360869536811, "language_loss": 0.82201099, "learning_rate": 1.4599498992145643e-06, "loss": 0.8437717, "num_input_tokens_seen": 107513150, "step": 4985, "time_per_iteration": 2.58170485496521 }, { "auxiliary_loss_clip": 0.0111944, "auxiliary_loss_mlp": 0.00772929, "balance_loss_clip": 1.04609597, "balance_loss_mlp": 1.00050724, "epoch": 0.5995310527265075, "flos": 22269966595200.0, "grad_norm": 1.7444491080774875, "language_loss": 0.70890725, "learning_rate": 1.4591999047769846e-06, "loss": 0.72783101, "num_input_tokens_seen": 107532005, "step": 4986, "time_per_iteration": 2.726724147796631 }, { "auxiliary_loss_clip": 0.01074793, "auxiliary_loss_mlp": 0.01042319, "balance_loss_clip": 1.03843355, "balance_loss_mlp": 1.0238055, "epoch": 0.5996512956171466, "flos": 18916951818240.0, "grad_norm": 3.431531310365734, "language_loss": 0.75520134, "learning_rate": 1.4584499923783486e-06, "loss": 0.77637243, "num_input_tokens_seen": 107550585, "step": 4987, "time_per_iteration": 2.811758279800415 }, { "auxiliary_loss_clip": 0.0111578, "auxiliary_loss_mlp": 0.01035959, "balance_loss_clip": 1.04457355, "balance_loss_mlp": 1.02017021, "epoch": 0.5997715385077858, "flos": 15370916330880.0, "grad_norm": 1.862808556639238, "language_loss": 0.76138043, "learning_rate": 1.457700162132419e-06, "loss": 0.78289783, "num_input_tokens_seen": 107567575, "step": 4988, "time_per_iteration": 2.624643325805664 }, { "auxiliary_loss_clip": 0.0108939, "auxiliary_loss_mlp": 0.01032946, "balance_loss_clip": 1.04351234, "balance_loss_mlp": 1.01696014, "epoch": 0.5998917813984248, "flos": 25264844818560.0, "grad_norm": 2.3500912498099624, "language_loss": 0.71938533, "learning_rate": 1.4569504141529433e-06, "loss": 0.74060869, "num_input_tokens_seen": 107585410, "step": 4989, "time_per_iteration": 2.731527328491211 }, { "auxiliary_loss_clip": 0.01124602, "auxiliary_loss_mlp": 0.01041965, "balance_loss_clip": 1.0444057, "balance_loss_mlp": 1.02330852, "epoch": 0.6000120242890639, "flos": 22054502862720.0, "grad_norm": 2.1912597182928004, "language_loss": 0.71722734, "learning_rate": 1.456200748553658e-06, "loss": 0.73889303, "num_input_tokens_seen": 107603405, "step": 4990, "time_per_iteration": 2.639026165008545 }, { "auxiliary_loss_clip": 0.01139891, "auxiliary_loss_mlp": 0.01045355, "balance_loss_clip": 1.04643416, "balance_loss_mlp": 1.02908278, "epoch": 0.600132267179703, "flos": 29863421562240.0, "grad_norm": 4.229654636099543, "language_loss": 0.78759599, "learning_rate": 1.455451165448287e-06, "loss": 0.80944848, "num_input_tokens_seen": 107626060, "step": 4991, "time_per_iteration": 2.6529948711395264 }, { "auxiliary_loss_clip": 0.01114111, "auxiliary_loss_mlp": 0.01039776, "balance_loss_clip": 1.04421139, "balance_loss_mlp": 1.02364731, "epoch": 0.6002525100703421, "flos": 25045358762880.0, "grad_norm": 2.9274772712423447, "language_loss": 0.73754656, "learning_rate": 1.4547016649505407e-06, "loss": 0.75908542, "num_input_tokens_seen": 107644070, "step": 4992, "time_per_iteration": 2.6865429878234863 }, { "auxiliary_loss_clip": 0.01099368, "auxiliary_loss_mlp": 0.01052137, "balance_loss_clip": 1.04008174, "balance_loss_mlp": 1.03480387, "epoch": 0.6003727529609811, "flos": 20849592113280.0, "grad_norm": 2.02449947681025, "language_loss": 0.84967208, "learning_rate": 1.4539522471741193e-06, "loss": 0.87118709, "num_input_tokens_seen": 107661495, "step": 4993, "time_per_iteration": 2.8760781288146973 }, { "auxiliary_loss_clip": 0.01131761, "auxiliary_loss_mlp": 0.01046592, "balance_loss_clip": 1.04498529, "balance_loss_mlp": 1.02852011, "epoch": 0.6004929958516203, "flos": 15594604277760.0, "grad_norm": 2.3612611217535484, "language_loss": 0.71032333, "learning_rate": 1.4532029122327067e-06, "loss": 0.73210686, "num_input_tokens_seen": 107678280, "step": 4994, "time_per_iteration": 2.704041004180908 }, { "auxiliary_loss_clip": 0.0109545, "auxiliary_loss_mlp": 0.01038172, "balance_loss_clip": 1.04466271, "balance_loss_mlp": 1.02246046, "epoch": 0.6006132387422594, "flos": 21763267390080.0, "grad_norm": 2.187543355123, "language_loss": 0.75545794, "learning_rate": 1.4524536602399783e-06, "loss": 0.7767942, "num_input_tokens_seen": 107697370, "step": 4995, "time_per_iteration": 2.791388511657715 }, { "auxiliary_loss_clip": 0.01114437, "auxiliary_loss_mlp": 0.01042731, "balance_loss_clip": 1.04670095, "balance_loss_mlp": 1.02651834, "epoch": 0.6007334816328984, "flos": 22858542852480.0, "grad_norm": 2.03232260693602, "language_loss": 0.77372563, "learning_rate": 1.4517044913095938e-06, "loss": 0.79529727, "num_input_tokens_seen": 107717790, "step": 4996, "time_per_iteration": 2.8731749057769775 }, { "auxiliary_loss_clip": 0.01128088, "auxiliary_loss_mlp": 0.01036002, "balance_loss_clip": 1.04535508, "balance_loss_mlp": 1.01925373, "epoch": 0.6008537245235376, "flos": 28324577047680.0, "grad_norm": 1.7548202254181005, "language_loss": 0.8164674, "learning_rate": 1.4509554055552022e-06, "loss": 0.8381083, "num_input_tokens_seen": 107738020, "step": 4997, "time_per_iteration": 2.7303006649017334 }, { "auxiliary_loss_clip": 0.01111879, "auxiliary_loss_mlp": 0.01041659, "balance_loss_clip": 1.04158354, "balance_loss_mlp": 1.02401602, "epoch": 0.6009739674141766, "flos": 20886113266560.0, "grad_norm": 2.6873339982686466, "language_loss": 0.83893651, "learning_rate": 1.450206403090439e-06, "loss": 0.8604719, "num_input_tokens_seen": 107756215, "step": 4998, "time_per_iteration": 3.629624128341675 }, { "auxiliary_loss_clip": 0.011268, "auxiliary_loss_mlp": 0.01035295, "balance_loss_clip": 1.04627001, "balance_loss_mlp": 1.02059603, "epoch": 0.6010942103048157, "flos": 20481004702080.0, "grad_norm": 2.2245010330412605, "language_loss": 0.86277282, "learning_rate": 1.4494574840289274e-06, "loss": 0.88439381, "num_input_tokens_seen": 107773330, "step": 4999, "time_per_iteration": 2.6910171508789062 }, { "auxiliary_loss_clip": 0.01137389, "auxiliary_loss_mlp": 0.01047678, "balance_loss_clip": 1.04746628, "balance_loss_mlp": 1.03020215, "epoch": 0.6012144531954549, "flos": 23805973935360.0, "grad_norm": 2.2571313778273034, "language_loss": 0.73935324, "learning_rate": 1.4487086484842782e-06, "loss": 0.76120389, "num_input_tokens_seen": 107791975, "step": 5000, "time_per_iteration": 3.7051024436950684 }, { "auxiliary_loss_clip": 0.01138315, "auxiliary_loss_mlp": 0.01044698, "balance_loss_clip": 1.0468967, "balance_loss_mlp": 1.02860451, "epoch": 0.6013346960860939, "flos": 18988378012800.0, "grad_norm": 3.9958972406989917, "language_loss": 0.60275239, "learning_rate": 1.4479598965700878e-06, "loss": 0.62458253, "num_input_tokens_seen": 107809240, "step": 5001, "time_per_iteration": 2.7084875106811523 }, { "auxiliary_loss_clip": 0.01101427, "auxiliary_loss_mlp": 0.01047236, "balance_loss_clip": 1.04184151, "balance_loss_mlp": 1.03001046, "epoch": 0.601454938976733, "flos": 24025316336640.0, "grad_norm": 5.697667988833208, "language_loss": 0.69568795, "learning_rate": 1.4472112283999427e-06, "loss": 0.71717459, "num_input_tokens_seen": 107827895, "step": 5002, "time_per_iteration": 2.734179973602295 }, { "auxiliary_loss_clip": 0.01118305, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.04421759, "balance_loss_mlp": 1.01737738, "epoch": 0.6015751818673721, "flos": 26427129102720.0, "grad_norm": 2.6194271395714956, "language_loss": 0.6968354, "learning_rate": 1.4464626440874143e-06, "loss": 0.71834761, "num_input_tokens_seen": 107847010, "step": 5003, "time_per_iteration": 2.6909244060516357 }, { "auxiliary_loss_clip": 0.01099743, "auxiliary_loss_mlp": 0.01038151, "balance_loss_clip": 1.04293084, "balance_loss_mlp": 1.02029335, "epoch": 0.6016954247580112, "flos": 13115260005120.0, "grad_norm": 3.669390464655994, "language_loss": 0.743101, "learning_rate": 1.4457141437460636e-06, "loss": 0.76447999, "num_input_tokens_seen": 107864235, "step": 5004, "time_per_iteration": 2.748420000076294 }, { "auxiliary_loss_clip": 0.01119682, "auxiliary_loss_mlp": 0.01037115, "balance_loss_clip": 1.04419696, "balance_loss_mlp": 1.01948977, "epoch": 0.6018156676486502, "flos": 23768447201280.0, "grad_norm": 1.7984842501914708, "language_loss": 0.73005378, "learning_rate": 1.444965727489436e-06, "loss": 0.75162178, "num_input_tokens_seen": 107883680, "step": 5005, "time_per_iteration": 3.935483932495117 }, { "auxiliary_loss_clip": 0.01101208, "auxiliary_loss_mlp": 0.01046158, "balance_loss_clip": 1.0423975, "balance_loss_mlp": 1.02909946, "epoch": 0.6019359105392894, "flos": 26469360518400.0, "grad_norm": 2.257656970447882, "language_loss": 0.63018364, "learning_rate": 1.444217395431066e-06, "loss": 0.65165728, "num_input_tokens_seen": 107906220, "step": 5006, "time_per_iteration": 2.7819695472717285 }, { "auxiliary_loss_clip": 0.01016835, "auxiliary_loss_mlp": 0.01001471, "balance_loss_clip": 1.01887321, "balance_loss_mlp": 0.99968308, "epoch": 0.6020561534299285, "flos": 69190849728000.0, "grad_norm": 0.7885630750112366, "language_loss": 0.55754352, "learning_rate": 1.4434691476844755e-06, "loss": 0.5777266, "num_input_tokens_seen": 107967195, "step": 5007, "time_per_iteration": 3.211216449737549 }, { "auxiliary_loss_clip": 0.01113988, "auxiliary_loss_mlp": 0.01045586, "balance_loss_clip": 1.04812241, "balance_loss_mlp": 1.02979112, "epoch": 0.6021763963205675, "flos": 21835304115840.0, "grad_norm": 2.3075510075220516, "language_loss": 0.67287755, "learning_rate": 1.4427209843631729e-06, "loss": 0.69447327, "num_input_tokens_seen": 107984245, "step": 5008, "time_per_iteration": 3.615405321121216 }, { "auxiliary_loss_clip": 0.01139751, "auxiliary_loss_mlp": 0.00773112, "balance_loss_clip": 1.04807305, "balance_loss_mlp": 1.00039291, "epoch": 0.6022966392112067, "flos": 26578636669440.0, "grad_norm": 1.7981903131322445, "language_loss": 0.81017578, "learning_rate": 1.4419729055806534e-06, "loss": 0.82930446, "num_input_tokens_seen": 108003680, "step": 5009, "time_per_iteration": 2.719660520553589 }, { "auxiliary_loss_clip": 0.01110766, "auxiliary_loss_mlp": 0.00772358, "balance_loss_clip": 1.04456651, "balance_loss_mlp": 1.00052488, "epoch": 0.6024168821018457, "flos": 20703722981760.0, "grad_norm": 1.6665778075312188, "language_loss": 0.82264227, "learning_rate": 1.441224911450401e-06, "loss": 0.84147352, "num_input_tokens_seen": 108019635, "step": 5010, "time_per_iteration": 2.7993407249450684 }, { "auxiliary_loss_clip": 0.01128022, "auxiliary_loss_mlp": 0.01044039, "balance_loss_clip": 1.04432666, "balance_loss_mlp": 1.0272063, "epoch": 0.6025371249924848, "flos": 24680973242880.0, "grad_norm": 1.810787552875531, "language_loss": 0.82283199, "learning_rate": 1.4404770020858851e-06, "loss": 0.84455258, "num_input_tokens_seen": 108039120, "step": 5011, "time_per_iteration": 2.6987385749816895 }, { "auxiliary_loss_clip": 0.01121966, "auxiliary_loss_mlp": 0.01042287, "balance_loss_clip": 1.0451138, "balance_loss_mlp": 1.0254308, "epoch": 0.602657367883124, "flos": 25955801815680.0, "grad_norm": 1.5702355476296161, "language_loss": 0.85782897, "learning_rate": 1.439729177600563e-06, "loss": 0.87947154, "num_input_tokens_seen": 108059615, "step": 5012, "time_per_iteration": 2.711493492126465 }, { "auxiliary_loss_clip": 0.01129931, "auxiliary_loss_mlp": 0.01043386, "balance_loss_clip": 1.04945183, "balance_loss_mlp": 1.02700651, "epoch": 0.602777610773763, "flos": 16690633925760.0, "grad_norm": 2.369400665785397, "language_loss": 0.72754645, "learning_rate": 1.4389814381078793e-06, "loss": 0.74927962, "num_input_tokens_seen": 108078855, "step": 5013, "time_per_iteration": 2.640047550201416 }, { "auxiliary_loss_clip": 0.01044903, "auxiliary_loss_mlp": 0.01039475, "balance_loss_clip": 1.03726327, "balance_loss_mlp": 1.02265429, "epoch": 0.6028978536644021, "flos": 13334243270400.0, "grad_norm": 1.9852264734655616, "language_loss": 0.80093515, "learning_rate": 1.438233783721265e-06, "loss": 0.82177889, "num_input_tokens_seen": 108095020, "step": 5014, "time_per_iteration": 3.1604115962982178 }, { "auxiliary_loss_clip": 0.01107419, "auxiliary_loss_mlp": 0.01037334, "balance_loss_clip": 1.04435086, "balance_loss_mlp": 1.01991796, "epoch": 0.6030180965550412, "flos": 19644825018240.0, "grad_norm": 2.321909705684541, "language_loss": 0.77999067, "learning_rate": 1.43748621455414e-06, "loss": 0.80143821, "num_input_tokens_seen": 108111455, "step": 5015, "time_per_iteration": 2.9149794578552246 }, { "auxiliary_loss_clip": 0.0111235, "auxiliary_loss_mlp": 0.01037893, "balance_loss_clip": 1.04353571, "balance_loss_mlp": 1.021263, "epoch": 0.6031383394456803, "flos": 14458390289280.0, "grad_norm": 77.16811548338441, "language_loss": 0.80589253, "learning_rate": 1.4367387307199082e-06, "loss": 0.82739496, "num_input_tokens_seen": 108128305, "step": 5016, "time_per_iteration": 2.7256410121917725 }, { "auxiliary_loss_clip": 0.01122427, "auxiliary_loss_mlp": 0.01043868, "balance_loss_clip": 1.04303718, "balance_loss_mlp": 1.02722657, "epoch": 0.6032585823363193, "flos": 13917791623680.0, "grad_norm": 2.0702789128940027, "language_loss": 0.82550108, "learning_rate": 1.4359913323319632e-06, "loss": 0.84716409, "num_input_tokens_seen": 108145475, "step": 5017, "time_per_iteration": 2.637511730194092 }, { "auxiliary_loss_clip": 0.01065609, "auxiliary_loss_mlp": 0.01049978, "balance_loss_clip": 1.03723335, "balance_loss_mlp": 1.03282428, "epoch": 0.6033788252269584, "flos": 24353252530560.0, "grad_norm": 1.8589038242275486, "language_loss": 0.78050178, "learning_rate": 1.4352440195036847e-06, "loss": 0.80165768, "num_input_tokens_seen": 108165650, "step": 5018, "time_per_iteration": 2.790059804916382 }, { "auxiliary_loss_clip": 0.01067863, "auxiliary_loss_mlp": 0.01047501, "balance_loss_clip": 1.03565574, "balance_loss_mlp": 1.03094268, "epoch": 0.6034990681175976, "flos": 25521247077120.0, "grad_norm": 1.5651411629628431, "language_loss": 0.80225295, "learning_rate": 1.4344967923484395e-06, "loss": 0.82340658, "num_input_tokens_seen": 108187620, "step": 5019, "time_per_iteration": 2.8633618354797363 }, { "auxiliary_loss_clip": 0.01127253, "auxiliary_loss_mlp": 0.01043127, "balance_loss_clip": 1.04471743, "balance_loss_mlp": 1.02648568, "epoch": 0.6036193110082366, "flos": 25958387594880.0, "grad_norm": 2.0927324743141718, "language_loss": 0.71655327, "learning_rate": 1.433749650979581e-06, "loss": 0.73825705, "num_input_tokens_seen": 108207605, "step": 5020, "time_per_iteration": 2.673067808151245 }, { "auxiliary_loss_clip": 0.01104241, "auxiliary_loss_mlp": 0.01042189, "balance_loss_clip": 1.04160857, "balance_loss_mlp": 1.02687049, "epoch": 0.6037395538988757, "flos": 25593427457280.0, "grad_norm": 1.8347527458079544, "language_loss": 0.67724991, "learning_rate": 1.433002595510451e-06, "loss": 0.69871414, "num_input_tokens_seen": 108226385, "step": 5021, "time_per_iteration": 2.7736213207244873 }, { "auxiliary_loss_clip": 0.01112429, "auxiliary_loss_mlp": 0.007746, "balance_loss_clip": 1.04336047, "balance_loss_mlp": 1.00047183, "epoch": 0.6038597967895148, "flos": 17816253402240.0, "grad_norm": 3.3750227379474524, "language_loss": 0.7211917, "learning_rate": 1.4322556260543757e-06, "loss": 0.740062, "num_input_tokens_seen": 108242960, "step": 5022, "time_per_iteration": 2.6779589653015137 }, { "auxiliary_loss_clip": 0.01018001, "auxiliary_loss_mlp": 0.01005477, "balance_loss_clip": 1.01519072, "balance_loss_mlp": 1.00374818, "epoch": 0.6039800396801539, "flos": 65169213235200.0, "grad_norm": 0.8913786340655441, "language_loss": 0.62696898, "learning_rate": 1.4315087427246703e-06, "loss": 0.64720368, "num_input_tokens_seen": 108296785, "step": 5023, "time_per_iteration": 3.154139995574951 }, { "auxiliary_loss_clip": 0.01044062, "auxiliary_loss_mlp": 0.01006937, "balance_loss_clip": 1.01359844, "balance_loss_mlp": 1.00526834, "epoch": 0.604100282570793, "flos": 67386409073280.0, "grad_norm": 0.8676447624017545, "language_loss": 0.58429873, "learning_rate": 1.4307619456346372e-06, "loss": 0.60480869, "num_input_tokens_seen": 108341090, "step": 5024, "time_per_iteration": 4.848861217498779 }, { "auxiliary_loss_clip": 0.01129975, "auxiliary_loss_mlp": 0.01044183, "balance_loss_clip": 1.04424405, "balance_loss_mlp": 1.0280776, "epoch": 0.6042205254614321, "flos": 35297495631360.0, "grad_norm": 3.7146337182090026, "language_loss": 0.74246353, "learning_rate": 1.430015234897564e-06, "loss": 0.7642051, "num_input_tokens_seen": 108364370, "step": 5025, "time_per_iteration": 2.7740414142608643 }, { "auxiliary_loss_clip": 0.01138549, "auxiliary_loss_mlp": 0.0077341, "balance_loss_clip": 1.04402304, "balance_loss_mlp": 1.00044668, "epoch": 0.6043407683520712, "flos": 45658262206080.0, "grad_norm": 1.629423266242002, "language_loss": 0.66753685, "learning_rate": 1.4292686106267274e-06, "loss": 0.68665648, "num_input_tokens_seen": 108387220, "step": 5026, "time_per_iteration": 3.7598695755004883 }, { "auxiliary_loss_clip": 0.01130683, "auxiliary_loss_mlp": 0.01040701, "balance_loss_clip": 1.04570198, "balance_loss_mlp": 1.02447629, "epoch": 0.6044610112427102, "flos": 16180020138240.0, "grad_norm": 15.189765293758104, "language_loss": 0.76978397, "learning_rate": 1.4285220729353876e-06, "loss": 0.79149783, "num_input_tokens_seen": 108405760, "step": 5027, "time_per_iteration": 2.569502115249634 }, { "auxiliary_loss_clip": 0.01115814, "auxiliary_loss_mlp": 0.01042371, "balance_loss_clip": 1.0440259, "balance_loss_mlp": 1.02638555, "epoch": 0.6045812541333494, "flos": 13804062186240.0, "grad_norm": 2.3360819683213214, "language_loss": 0.78203475, "learning_rate": 1.4277756219367957e-06, "loss": 0.80361652, "num_input_tokens_seen": 108422785, "step": 5028, "time_per_iteration": 2.6298532485961914 }, { "auxiliary_loss_clip": 0.01111693, "auxiliary_loss_mlp": 0.0104263, "balance_loss_clip": 1.04416573, "balance_loss_mlp": 1.02498698, "epoch": 0.6047014970239885, "flos": 19975059682560.0, "grad_norm": 1.9106931686485298, "language_loss": 0.79619026, "learning_rate": 1.4270292577441864e-06, "loss": 0.81773353, "num_input_tokens_seen": 108442290, "step": 5029, "time_per_iteration": 2.7025980949401855 }, { "auxiliary_loss_clip": 0.01128983, "auxiliary_loss_mlp": 0.01039305, "balance_loss_clip": 1.04349625, "balance_loss_mlp": 1.02283001, "epoch": 0.6048217399146275, "flos": 25337097025920.0, "grad_norm": 1.966051375496515, "language_loss": 0.71835685, "learning_rate": 1.4262829804707836e-06, "loss": 0.74003977, "num_input_tokens_seen": 108464280, "step": 5030, "time_per_iteration": 2.652083396911621 }, { "auxiliary_loss_clip": 0.01126341, "auxiliary_loss_mlp": 0.01044773, "balance_loss_clip": 1.04229105, "balance_loss_mlp": 1.02758288, "epoch": 0.6049419828052667, "flos": 26030819370240.0, "grad_norm": 2.67420854799143, "language_loss": 0.6988799, "learning_rate": 1.4255367902297958e-06, "loss": 0.72059107, "num_input_tokens_seen": 108485610, "step": 5031, "time_per_iteration": 3.6235404014587402 }, { "auxiliary_loss_clip": 0.01134976, "auxiliary_loss_mlp": 0.01037464, "balance_loss_clip": 1.04360318, "balance_loss_mlp": 1.02233648, "epoch": 0.6050622256959057, "flos": 14648106948480.0, "grad_norm": 2.2739022915904883, "language_loss": 0.7889992, "learning_rate": 1.4247906871344215e-06, "loss": 0.81072366, "num_input_tokens_seen": 108501005, "step": 5032, "time_per_iteration": 2.5224547386169434 }, { "auxiliary_loss_clip": 0.01111107, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.04147029, "balance_loss_mlp": 1.0179801, "epoch": 0.6051824685865448, "flos": 23331450337920.0, "grad_norm": 2.108018715413084, "language_loss": 0.75698888, "learning_rate": 1.4240446712978415e-06, "loss": 0.77844149, "num_input_tokens_seen": 108519990, "step": 5033, "time_per_iteration": 2.6296730041503906 }, { "auxiliary_loss_clip": 0.01131296, "auxiliary_loss_mlp": 0.01048527, "balance_loss_clip": 1.04527259, "balance_loss_mlp": 1.03012145, "epoch": 0.605302711477184, "flos": 27563307177600.0, "grad_norm": 2.1666844563306085, "language_loss": 0.74414659, "learning_rate": 1.423298742833227e-06, "loss": 0.76594484, "num_input_tokens_seen": 108538650, "step": 5034, "time_per_iteration": 3.563405990600586 }, { "auxiliary_loss_clip": 0.01106701, "auxiliary_loss_mlp": 0.01040765, "balance_loss_clip": 1.04076254, "balance_loss_mlp": 1.0250293, "epoch": 0.605422954367823, "flos": 15154698412800.0, "grad_norm": 2.6318182966707533, "language_loss": 0.71949375, "learning_rate": 1.4225529018537352e-06, "loss": 0.74096841, "num_input_tokens_seen": 108554155, "step": 5035, "time_per_iteration": 2.6561450958251953 }, { "auxiliary_loss_clip": 0.01138127, "auxiliary_loss_mlp": 0.01038788, "balance_loss_clip": 1.04517484, "balance_loss_mlp": 1.02288568, "epoch": 0.6055431972584621, "flos": 27673912131840.0, "grad_norm": 2.211480378546144, "language_loss": 0.77911747, "learning_rate": 1.4218071484725082e-06, "loss": 0.80088657, "num_input_tokens_seen": 108576275, "step": 5036, "time_per_iteration": 2.665950298309326 }, { "auxiliary_loss_clip": 0.01109353, "auxiliary_loss_mlp": 0.01045504, "balance_loss_clip": 1.04316998, "balance_loss_mlp": 1.0294224, "epoch": 0.6056634401491012, "flos": 19387489006080.0, "grad_norm": 2.11510540307123, "language_loss": 0.76133752, "learning_rate": 1.4210614828026786e-06, "loss": 0.78288609, "num_input_tokens_seen": 108594125, "step": 5037, "time_per_iteration": 2.6314773559570312 }, { "auxiliary_loss_clip": 0.01139318, "auxiliary_loss_mlp": 0.01036784, "balance_loss_clip": 1.0463326, "balance_loss_mlp": 1.02033257, "epoch": 0.6057836830397403, "flos": 24789459294720.0, "grad_norm": 1.4921298176241438, "language_loss": 0.74725366, "learning_rate": 1.4203159049573605e-06, "loss": 0.76901472, "num_input_tokens_seen": 108615360, "step": 5038, "time_per_iteration": 2.63401198387146 }, { "auxiliary_loss_clip": 0.01119155, "auxiliary_loss_mlp": 0.01038984, "balance_loss_clip": 1.0421648, "balance_loss_mlp": 1.02290273, "epoch": 0.6059039259303793, "flos": 20558248899840.0, "grad_norm": 2.8945963799320156, "language_loss": 0.87583423, "learning_rate": 1.4195704150496593e-06, "loss": 0.89741564, "num_input_tokens_seen": 108633075, "step": 5039, "time_per_iteration": 2.6435463428497314 }, { "auxiliary_loss_clip": 0.01115736, "auxiliary_loss_mlp": 0.01036801, "balance_loss_clip": 1.04537487, "balance_loss_mlp": 1.01962256, "epoch": 0.6060241688210185, "flos": 21069724613760.0, "grad_norm": 1.7913077889384426, "language_loss": 0.74137658, "learning_rate": 1.4188250131926639e-06, "loss": 0.76290196, "num_input_tokens_seen": 108651875, "step": 5040, "time_per_iteration": 2.643130302429199 }, { "auxiliary_loss_clip": 0.01114948, "auxiliary_loss_mlp": 0.01042329, "balance_loss_clip": 1.04237592, "balance_loss_mlp": 1.02443528, "epoch": 0.6061444117116576, "flos": 16361081619840.0, "grad_norm": 2.117314533554951, "language_loss": 0.80562699, "learning_rate": 1.4180796994994525e-06, "loss": 0.82719982, "num_input_tokens_seen": 108669290, "step": 5041, "time_per_iteration": 2.6244959831237793 }, { "auxiliary_loss_clip": 0.01111609, "auxiliary_loss_mlp": 0.01042055, "balance_loss_clip": 1.04111862, "balance_loss_mlp": 1.02592587, "epoch": 0.6062646546022966, "flos": 21507296094720.0, "grad_norm": 1.7568001909894237, "language_loss": 0.71640611, "learning_rate": 1.4173344740830877e-06, "loss": 0.73794276, "num_input_tokens_seen": 108688420, "step": 5042, "time_per_iteration": 2.6562960147857666 }, { "auxiliary_loss_clip": 0.01106764, "auxiliary_loss_mlp": 0.01057323, "balance_loss_clip": 1.04467392, "balance_loss_mlp": 1.04022837, "epoch": 0.6063848974929358, "flos": 38983151283840.0, "grad_norm": 1.5511025347762548, "language_loss": 0.70597041, "learning_rate": 1.4165893370566206e-06, "loss": 0.72761136, "num_input_tokens_seen": 108712175, "step": 5043, "time_per_iteration": 2.7566847801208496 }, { "auxiliary_loss_clip": 0.01120853, "auxiliary_loss_mlp": 0.01051173, "balance_loss_clip": 1.04183686, "balance_loss_mlp": 1.03255272, "epoch": 0.6065051403835748, "flos": 19646584784640.0, "grad_norm": 1.7255453523565887, "language_loss": 0.77517772, "learning_rate": 1.4158442885330865e-06, "loss": 0.79689801, "num_input_tokens_seen": 108730745, "step": 5044, "time_per_iteration": 2.620529890060425 }, { "auxiliary_loss_clip": 0.01118663, "auxiliary_loss_mlp": 0.0103927, "balance_loss_clip": 1.0411377, "balance_loss_mlp": 1.0229739, "epoch": 0.6066253832742139, "flos": 23513086437120.0, "grad_norm": 2.0382526409712103, "language_loss": 0.78909624, "learning_rate": 1.4150993286255094e-06, "loss": 0.81067556, "num_input_tokens_seen": 108749995, "step": 5045, "time_per_iteration": 2.638287305831909 }, { "auxiliary_loss_clip": 0.0113744, "auxiliary_loss_mlp": 0.0103967, "balance_loss_clip": 1.0452776, "balance_loss_mlp": 1.02336788, "epoch": 0.6067456261648531, "flos": 19133708440320.0, "grad_norm": 2.5710327409576017, "language_loss": 0.797611, "learning_rate": 1.4143544574468993e-06, "loss": 0.81938207, "num_input_tokens_seen": 108768355, "step": 5046, "time_per_iteration": 2.566650390625 }, { "auxiliary_loss_clip": 0.01121258, "auxiliary_loss_mlp": 0.01040082, "balance_loss_clip": 1.04310977, "balance_loss_mlp": 1.02413154, "epoch": 0.6068658690554921, "flos": 20520614424960.0, "grad_norm": 5.150627055818386, "language_loss": 0.82268608, "learning_rate": 1.4136096751102523e-06, "loss": 0.84429955, "num_input_tokens_seen": 108786685, "step": 5047, "time_per_iteration": 2.6198716163635254 }, { "auxiliary_loss_clip": 0.01114581, "auxiliary_loss_mlp": 0.01042821, "balance_loss_clip": 1.04395723, "balance_loss_mlp": 1.02793181, "epoch": 0.6069861119461312, "flos": 27374560185600.0, "grad_norm": 2.0711446638451867, "language_loss": 0.83108294, "learning_rate": 1.4128649817285516e-06, "loss": 0.85265696, "num_input_tokens_seen": 108804820, "step": 5048, "time_per_iteration": 2.6865437030792236 }, { "auxiliary_loss_clip": 0.01118263, "auxiliary_loss_mlp": 0.01042961, "balance_loss_clip": 1.04410636, "balance_loss_mlp": 1.02679634, "epoch": 0.6071063548367702, "flos": 25626500904960.0, "grad_norm": 2.41737183087741, "language_loss": 0.63237292, "learning_rate": 1.412120377414766e-06, "loss": 0.65398514, "num_input_tokens_seen": 108825010, "step": 5049, "time_per_iteration": 2.698371171951294 }, { "auxiliary_loss_clip": 0.01137955, "auxiliary_loss_mlp": 0.01044045, "balance_loss_clip": 1.04601252, "balance_loss_mlp": 1.02898836, "epoch": 0.6072265977274094, "flos": 24460517520000.0, "grad_norm": 3.748252977863052, "language_loss": 0.71461964, "learning_rate": 1.4113758622818522e-06, "loss": 0.73643959, "num_input_tokens_seen": 108845075, "step": 5050, "time_per_iteration": 3.501347064971924 }, { "auxiliary_loss_clip": 0.01117801, "auxiliary_loss_mlp": 0.00772272, "balance_loss_clip": 1.04534459, "balance_loss_mlp": 1.00044572, "epoch": 0.6073468406180484, "flos": 18149253413760.0, "grad_norm": 2.2485826341259307, "language_loss": 0.83252192, "learning_rate": 1.410631436442751e-06, "loss": 0.85142267, "num_input_tokens_seen": 108863870, "step": 5051, "time_per_iteration": 2.6825435161590576 }, { "auxiliary_loss_clip": 0.01130175, "auxiliary_loss_mlp": 0.0103986, "balance_loss_clip": 1.04413855, "balance_loss_mlp": 1.02296758, "epoch": 0.6074670835086875, "flos": 20697617669760.0, "grad_norm": 2.4018603566510057, "language_loss": 0.86649919, "learning_rate": 1.4098871000103936e-06, "loss": 0.88819951, "num_input_tokens_seen": 108882470, "step": 5052, "time_per_iteration": 3.5717053413391113 }, { "auxiliary_loss_clip": 0.01112515, "auxiliary_loss_mlp": 0.01039212, "balance_loss_clip": 1.04336691, "balance_loss_mlp": 1.02367949, "epoch": 0.6075873263993267, "flos": 23769955572480.0, "grad_norm": 1.7831972525951305, "language_loss": 0.82262135, "learning_rate": 1.409142853097693e-06, "loss": 0.84413856, "num_input_tokens_seen": 108902710, "step": 5053, "time_per_iteration": 2.699390411376953 }, { "auxiliary_loss_clip": 0.01118222, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.04410279, "balance_loss_mlp": 1.01676309, "epoch": 0.6077075692899657, "flos": 24454484035200.0, "grad_norm": 1.8984913590382635, "language_loss": 0.79455829, "learning_rate": 1.408398695817553e-06, "loss": 0.8160671, "num_input_tokens_seen": 108919935, "step": 5054, "time_per_iteration": 2.625053882598877 }, { "auxiliary_loss_clip": 0.01116202, "auxiliary_loss_mlp": 0.01043945, "balance_loss_clip": 1.04297519, "balance_loss_mlp": 1.02564633, "epoch": 0.6078278121806048, "flos": 27382102041600.0, "grad_norm": 1.639790354660382, "language_loss": 0.70244277, "learning_rate": 1.4076546282828593e-06, "loss": 0.7240442, "num_input_tokens_seen": 108942790, "step": 5055, "time_per_iteration": 2.715393543243408 }, { "auxiliary_loss_clip": 0.01116741, "auxiliary_loss_mlp": 0.01042787, "balance_loss_clip": 1.04073882, "balance_loss_mlp": 1.02621651, "epoch": 0.6079480550712439, "flos": 38436447306240.0, "grad_norm": 2.6649398668603523, "language_loss": 0.66489071, "learning_rate": 1.4069106506064874e-06, "loss": 0.68648601, "num_input_tokens_seen": 108964215, "step": 5056, "time_per_iteration": 2.7873988151550293 }, { "auxiliary_loss_clip": 0.01111415, "auxiliary_loss_mlp": 0.01040522, "balance_loss_clip": 1.04331851, "balance_loss_mlp": 1.02445245, "epoch": 0.608068297961883, "flos": 25336271013120.0, "grad_norm": 1.6687918673495559, "language_loss": 0.78134835, "learning_rate": 1.4061667629012989e-06, "loss": 0.80286771, "num_input_tokens_seen": 108984885, "step": 5057, "time_per_iteration": 3.870926856994629 }, { "auxiliary_loss_clip": 0.0110653, "auxiliary_loss_mlp": 0.01034022, "balance_loss_clip": 1.04273415, "balance_loss_mlp": 1.01726663, "epoch": 0.608188540852522, "flos": 24202463235840.0, "grad_norm": 1.8565567645732504, "language_loss": 0.83341157, "learning_rate": 1.40542296528014e-06, "loss": 0.85481709, "num_input_tokens_seen": 109004545, "step": 5058, "time_per_iteration": 2.6995131969451904 }, { "auxiliary_loss_clip": 0.01125458, "auxiliary_loss_mlp": 0.01043396, "balance_loss_clip": 1.04297817, "balance_loss_mlp": 1.02744555, "epoch": 0.6083087837431612, "flos": 21284146851840.0, "grad_norm": 2.0045869156002385, "language_loss": 0.75923759, "learning_rate": 1.4046792578558452e-06, "loss": 0.78092611, "num_input_tokens_seen": 109022440, "step": 5059, "time_per_iteration": 2.6020302772521973 }, { "auxiliary_loss_clip": 0.01108195, "auxiliary_loss_mlp": 0.0104986, "balance_loss_clip": 1.04133594, "balance_loss_mlp": 1.0328604, "epoch": 0.6084290266338003, "flos": 16471435178880.0, "grad_norm": 2.3110988644444297, "language_loss": 0.76100791, "learning_rate": 1.4039356407412325e-06, "loss": 0.78258848, "num_input_tokens_seen": 109035680, "step": 5060, "time_per_iteration": 3.508790969848633 }, { "auxiliary_loss_clip": 0.01039655, "auxiliary_loss_mlp": 0.0100333, "balance_loss_clip": 1.01876473, "balance_loss_mlp": 1.00172031, "epoch": 0.6085492695244393, "flos": 66443574931200.0, "grad_norm": 0.7883581523103378, "language_loss": 0.57143438, "learning_rate": 1.40319211404911e-06, "loss": 0.59186423, "num_input_tokens_seen": 109090680, "step": 5061, "time_per_iteration": 3.1426737308502197 }, { "auxiliary_loss_clip": 0.01140415, "auxiliary_loss_mlp": 0.01045488, "balance_loss_clip": 1.04684258, "balance_loss_mlp": 1.02934742, "epoch": 0.6086695124150785, "flos": 23618986709760.0, "grad_norm": 1.927593433995584, "language_loss": 0.90316713, "learning_rate": 1.4024486778922691e-06, "loss": 0.92502618, "num_input_tokens_seen": 109108995, "step": 5062, "time_per_iteration": 2.606961250305176 }, { "auxiliary_loss_clip": 0.0111952, "auxiliary_loss_mlp": 0.01040072, "balance_loss_clip": 1.04509246, "balance_loss_mlp": 1.02394319, "epoch": 0.6087897553057176, "flos": 20157054917760.0, "grad_norm": 2.0990645163891464, "language_loss": 0.77678639, "learning_rate": 1.4017053323834884e-06, "loss": 0.79838228, "num_input_tokens_seen": 109128825, "step": 5063, "time_per_iteration": 2.6749963760375977 }, { "auxiliary_loss_clip": 0.01113674, "auxiliary_loss_mlp": 0.01044374, "balance_loss_clip": 1.0415709, "balance_loss_mlp": 1.02968764, "epoch": 0.6089099981963566, "flos": 25482535194240.0, "grad_norm": 2.067763397601098, "language_loss": 0.76233357, "learning_rate": 1.4009620776355333e-06, "loss": 0.78391409, "num_input_tokens_seen": 109150425, "step": 5064, "time_per_iteration": 2.6923656463623047 }, { "auxiliary_loss_clip": 0.01119325, "auxiliary_loss_mlp": 0.01048269, "balance_loss_clip": 1.04167747, "balance_loss_mlp": 1.03095973, "epoch": 0.6090302410869958, "flos": 25332895134720.0, "grad_norm": 13.18061714298145, "language_loss": 0.79056543, "learning_rate": 1.4002189137611553e-06, "loss": 0.81224138, "num_input_tokens_seen": 109169765, "step": 5065, "time_per_iteration": 2.661489725112915 }, { "auxiliary_loss_clip": 0.01124169, "auxiliary_loss_mlp": 0.01039765, "balance_loss_clip": 1.04317999, "balance_loss_mlp": 1.02480412, "epoch": 0.6091504839776348, "flos": 23987358639360.0, "grad_norm": 1.6333639552338388, "language_loss": 0.69656444, "learning_rate": 1.3994758408730901e-06, "loss": 0.71820384, "num_input_tokens_seen": 109188950, "step": 5066, "time_per_iteration": 2.6050233840942383 }, { "auxiliary_loss_clip": 0.01117453, "auxiliary_loss_mlp": 0.01040725, "balance_loss_clip": 1.04357016, "balance_loss_mlp": 1.02499497, "epoch": 0.6092707268682739, "flos": 29643037666560.0, "grad_norm": 2.468530411830295, "language_loss": 0.76757318, "learning_rate": 1.3987328590840629e-06, "loss": 0.78915495, "num_input_tokens_seen": 109209895, "step": 5067, "time_per_iteration": 2.730109453201294 }, { "auxiliary_loss_clip": 0.01121525, "auxiliary_loss_mlp": 0.0105262, "balance_loss_clip": 1.04235077, "balance_loss_mlp": 1.03512049, "epoch": 0.609390969758913, "flos": 24024957200640.0, "grad_norm": 1.9077456883014483, "language_loss": 0.86435926, "learning_rate": 1.397989968506783e-06, "loss": 0.88610077, "num_input_tokens_seen": 109228905, "step": 5068, "time_per_iteration": 2.666919708251953 }, { "auxiliary_loss_clip": 0.01141308, "auxiliary_loss_mlp": 0.01046334, "balance_loss_clip": 1.04605508, "balance_loss_mlp": 1.03108752, "epoch": 0.6095112126495521, "flos": 11102143288320.0, "grad_norm": 4.043903310384227, "language_loss": 0.72410607, "learning_rate": 1.3972471692539458e-06, "loss": 0.74598253, "num_input_tokens_seen": 109243620, "step": 5069, "time_per_iteration": 2.538600206375122 }, { "auxiliary_loss_clip": 0.01110346, "auxiliary_loss_mlp": 0.01049604, "balance_loss_clip": 1.04397988, "balance_loss_mlp": 1.03364217, "epoch": 0.6096314555401912, "flos": 17265491187840.0, "grad_norm": 2.3299596896838195, "language_loss": 0.75525987, "learning_rate": 1.3965044614382348e-06, "loss": 0.77685934, "num_input_tokens_seen": 109259070, "step": 5070, "time_per_iteration": 2.6293981075286865 }, { "auxiliary_loss_clip": 0.01144149, "auxiliary_loss_mlp": 0.01044976, "balance_loss_clip": 1.04866362, "balance_loss_mlp": 1.02634323, "epoch": 0.6097516984308303, "flos": 21645910679040.0, "grad_norm": 2.813975676072755, "language_loss": 0.75942421, "learning_rate": 1.3957618451723162e-06, "loss": 0.78131545, "num_input_tokens_seen": 109275100, "step": 5071, "time_per_iteration": 2.5835630893707275 }, { "auxiliary_loss_clip": 0.01118338, "auxiliary_loss_mlp": 0.01040942, "balance_loss_clip": 1.04402983, "balance_loss_mlp": 1.02418113, "epoch": 0.6098719413214694, "flos": 27199208966400.0, "grad_norm": 52.14652928702791, "language_loss": 0.71397233, "learning_rate": 1.3950193205688457e-06, "loss": 0.73556513, "num_input_tokens_seen": 109294825, "step": 5072, "time_per_iteration": 2.6597511768341064 }, { "auxiliary_loss_clip": 0.01110347, "auxiliary_loss_mlp": 0.01034806, "balance_loss_clip": 1.04253435, "balance_loss_mlp": 1.01964211, "epoch": 0.6099921842121084, "flos": 20412954385920.0, "grad_norm": 2.343921491523796, "language_loss": 0.83601618, "learning_rate": 1.3942768877404627e-06, "loss": 0.85746765, "num_input_tokens_seen": 109313790, "step": 5073, "time_per_iteration": 2.650568962097168 }, { "auxiliary_loss_clip": 0.01135376, "auxiliary_loss_mlp": 0.01036104, "balance_loss_clip": 1.04248583, "balance_loss_mlp": 1.02170908, "epoch": 0.6101124271027476, "flos": 23366139897600.0, "grad_norm": 1.6176954137435087, "language_loss": 0.73722124, "learning_rate": 1.393534546799795e-06, "loss": 0.75893599, "num_input_tokens_seen": 109333490, "step": 5074, "time_per_iteration": 2.588107109069824 }, { "auxiliary_loss_clip": 0.01106241, "auxiliary_loss_mlp": 0.01046693, "balance_loss_clip": 1.04329515, "balance_loss_mlp": 1.03046846, "epoch": 0.6102326699933867, "flos": 26687840993280.0, "grad_norm": 1.7244714126453398, "language_loss": 0.67542863, "learning_rate": 1.3927922978594536e-06, "loss": 0.69695801, "num_input_tokens_seen": 109354575, "step": 5075, "time_per_iteration": 2.695570468902588 }, { "auxiliary_loss_clip": 0.0103552, "auxiliary_loss_mlp": 0.01002816, "balance_loss_clip": 1.01517057, "balance_loss_mlp": 1.00120664, "epoch": 0.6103529128840257, "flos": 60644612551680.0, "grad_norm": 0.7860554734481631, "language_loss": 0.57360613, "learning_rate": 1.3920501410320387e-06, "loss": 0.59398949, "num_input_tokens_seen": 109410690, "step": 5076, "time_per_iteration": 4.08814263343811 }, { "auxiliary_loss_clip": 0.01112459, "auxiliary_loss_mlp": 0.01036709, "balance_loss_clip": 1.04226315, "balance_loss_mlp": 1.02117586, "epoch": 0.6104731557746649, "flos": 19021307806080.0, "grad_norm": 2.050523113041855, "language_loss": 0.76455778, "learning_rate": 1.3913080764301333e-06, "loss": 0.78604949, "num_input_tokens_seen": 109427650, "step": 5077, "time_per_iteration": 2.7651114463806152 }, { "auxiliary_loss_clip": 0.01095853, "auxiliary_loss_mlp": 0.01038962, "balance_loss_clip": 1.03932703, "balance_loss_mlp": 1.02214158, "epoch": 0.6105933986653039, "flos": 23366894083200.0, "grad_norm": 2.3357537216947413, "language_loss": 0.71162963, "learning_rate": 1.3905661041663085e-06, "loss": 0.73297775, "num_input_tokens_seen": 109448835, "step": 5078, "time_per_iteration": 3.965045213699341 }, { "auxiliary_loss_clip": 0.0112473, "auxiliary_loss_mlp": 0.01039115, "balance_loss_clip": 1.04373097, "balance_loss_mlp": 1.02138901, "epoch": 0.610713641555943, "flos": 34637565006720.0, "grad_norm": 2.914895299753122, "language_loss": 0.64840865, "learning_rate": 1.389824224353122e-06, "loss": 0.6700471, "num_input_tokens_seen": 109470425, "step": 5079, "time_per_iteration": 2.7299880981445312 }, { "auxiliary_loss_clip": 0.01127351, "auxiliary_loss_mlp": 0.01037027, "balance_loss_clip": 1.04622376, "balance_loss_mlp": 1.02112484, "epoch": 0.610833884446582, "flos": 26646471504000.0, "grad_norm": 6.890756941808569, "language_loss": 0.77160394, "learning_rate": 1.389082437103115e-06, "loss": 0.7932477, "num_input_tokens_seen": 109489695, "step": 5080, "time_per_iteration": 2.7976789474487305 }, { "auxiliary_loss_clip": 0.01098558, "auxiliary_loss_mlp": 0.01038949, "balance_loss_clip": 1.03907835, "balance_loss_mlp": 1.0220449, "epoch": 0.6109541273372212, "flos": 21215126868480.0, "grad_norm": 2.470374490152264, "language_loss": 0.77915072, "learning_rate": 1.3883407425288172e-06, "loss": 0.80052578, "num_input_tokens_seen": 109510030, "step": 5081, "time_per_iteration": 2.7035014629364014 }, { "auxiliary_loss_clip": 0.01107905, "auxiliary_loss_mlp": 0.01053413, "balance_loss_clip": 1.04027748, "balance_loss_mlp": 1.03450596, "epoch": 0.6110743702278603, "flos": 20084084438400.0, "grad_norm": 2.114841036163113, "language_loss": 0.79608583, "learning_rate": 1.3875991407427417e-06, "loss": 0.81769896, "num_input_tokens_seen": 109528255, "step": 5082, "time_per_iteration": 2.6509740352630615 }, { "auxiliary_loss_clip": 0.01017242, "auxiliary_loss_mlp": 0.01001812, "balance_loss_clip": 1.01347423, "balance_loss_mlp": 1.00009501, "epoch": 0.6111946131184993, "flos": 68302957438080.0, "grad_norm": 0.8075977403099642, "language_loss": 0.58145928, "learning_rate": 1.38685763185739e-06, "loss": 0.60164988, "num_input_tokens_seen": 109581915, "step": 5083, "time_per_iteration": 4.295028448104858 }, { "auxiliary_loss_clip": 0.01138393, "auxiliary_loss_mlp": 0.01033135, "balance_loss_clip": 1.04614496, "balance_loss_mlp": 1.01765013, "epoch": 0.6113148560091385, "flos": 19937676602880.0, "grad_norm": 2.4632074518266593, "language_loss": 0.67470545, "learning_rate": 1.3861162159852476e-06, "loss": 0.69642067, "num_input_tokens_seen": 109600050, "step": 5084, "time_per_iteration": 2.6035280227661133 }, { "auxiliary_loss_clip": 0.01117262, "auxiliary_loss_mlp": 0.01050399, "balance_loss_clip": 1.04407096, "balance_loss_mlp": 1.03428149, "epoch": 0.6114350988997775, "flos": 23731854220800.0, "grad_norm": 1.701274918765062, "language_loss": 0.79810387, "learning_rate": 1.3853748932387875e-06, "loss": 0.81978047, "num_input_tokens_seen": 109620690, "step": 5085, "time_per_iteration": 2.6607015132904053 }, { "auxiliary_loss_clip": 0.01101266, "auxiliary_loss_mlp": 0.01037672, "balance_loss_clip": 1.03864074, "balance_loss_mlp": 1.02035093, "epoch": 0.6115553417904166, "flos": 24023700224640.0, "grad_norm": 2.316425213228158, "language_loss": 0.75223845, "learning_rate": 1.3846336637304671e-06, "loss": 0.77362788, "num_input_tokens_seen": 109638960, "step": 5086, "time_per_iteration": 3.552353858947754 }, { "auxiliary_loss_clip": 0.01102191, "auxiliary_loss_mlp": 0.01038976, "balance_loss_clip": 1.0406667, "balance_loss_mlp": 1.02222729, "epoch": 0.6116755846810558, "flos": 23733542160000.0, "grad_norm": 4.814944939037008, "language_loss": 0.83221412, "learning_rate": 1.3838925275727316e-06, "loss": 0.85362577, "num_input_tokens_seen": 109659700, "step": 5087, "time_per_iteration": 2.6818110942840576 }, { "auxiliary_loss_clip": 0.01141007, "auxiliary_loss_mlp": 0.01042398, "balance_loss_clip": 1.04859555, "balance_loss_mlp": 1.02687681, "epoch": 0.6117958275716948, "flos": 18661626967680.0, "grad_norm": 2.015534546585557, "language_loss": 0.79294765, "learning_rate": 1.3831514848780089e-06, "loss": 0.81478167, "num_input_tokens_seen": 109679275, "step": 5088, "time_per_iteration": 2.573115825653076 }, { "auxiliary_loss_clip": 0.01121523, "auxiliary_loss_mlp": 0.0104721, "balance_loss_clip": 1.04504204, "balance_loss_mlp": 1.02942443, "epoch": 0.6119160704623339, "flos": 16471183783680.0, "grad_norm": 2.677281753092589, "language_loss": 0.92399782, "learning_rate": 1.3824105357587152e-06, "loss": 0.94568515, "num_input_tokens_seen": 109696380, "step": 5089, "time_per_iteration": 2.694692611694336 }, { "auxiliary_loss_clip": 0.01109319, "auxiliary_loss_mlp": 0.01035732, "balance_loss_clip": 1.04043174, "balance_loss_mlp": 1.02004397, "epoch": 0.612036313352973, "flos": 23915465568000.0, "grad_norm": 1.6161996032351122, "language_loss": 0.82541817, "learning_rate": 1.381669680327253e-06, "loss": 0.84686863, "num_input_tokens_seen": 109718060, "step": 5090, "time_per_iteration": 2.6743357181549072 }, { "auxiliary_loss_clip": 0.01106722, "auxiliary_loss_mlp": 0.01040846, "balance_loss_clip": 1.04397774, "balance_loss_mlp": 1.02518177, "epoch": 0.6121565562436121, "flos": 26974766833920.0, "grad_norm": 2.074483854720602, "language_loss": 0.71098727, "learning_rate": 1.380928918696008e-06, "loss": 0.73246288, "num_input_tokens_seen": 109736830, "step": 5091, "time_per_iteration": 2.6961476802825928 }, { "auxiliary_loss_clip": 0.01126272, "auxiliary_loss_mlp": 0.01033782, "balance_loss_clip": 1.0455687, "balance_loss_mlp": 1.01715207, "epoch": 0.6122767991342511, "flos": 15668867646720.0, "grad_norm": 2.4084586051185597, "language_loss": 0.72323626, "learning_rate": 1.3801882509773548e-06, "loss": 0.74483681, "num_input_tokens_seen": 109754690, "step": 5092, "time_per_iteration": 2.624690532684326 }, { "auxiliary_loss_clip": 0.01121141, "auxiliary_loss_mlp": 0.01045072, "balance_loss_clip": 1.04180419, "balance_loss_mlp": 1.02775085, "epoch": 0.6123970420248903, "flos": 27964321591680.0, "grad_norm": 2.247437877626167, "language_loss": 0.81862533, "learning_rate": 1.3794476772836503e-06, "loss": 0.84028745, "num_input_tokens_seen": 109775790, "step": 5093, "time_per_iteration": 2.813305616378784 }, { "auxiliary_loss_clip": 0.01092148, "auxiliary_loss_mlp": 0.01038165, "balance_loss_clip": 1.03991508, "balance_loss_mlp": 1.02090359, "epoch": 0.6125172849155294, "flos": 21468727866240.0, "grad_norm": 1.6468135121584155, "language_loss": 0.84575307, "learning_rate": 1.3787071977272402e-06, "loss": 0.86705619, "num_input_tokens_seen": 109795050, "step": 5094, "time_per_iteration": 2.681117534637451 }, { "auxiliary_loss_clip": 0.01085889, "auxiliary_loss_mlp": 0.01047687, "balance_loss_clip": 1.04098916, "balance_loss_mlp": 1.03112888, "epoch": 0.6126375278061684, "flos": 16248321849600.0, "grad_norm": 3.1186246788197107, "language_loss": 0.72063267, "learning_rate": 1.3779668124204535e-06, "loss": 0.74196839, "num_input_tokens_seen": 109811465, "step": 5095, "time_per_iteration": 2.7366013526916504 }, { "auxiliary_loss_clip": 0.01103803, "auxiliary_loss_mlp": 0.0103955, "balance_loss_clip": 1.04140282, "balance_loss_mlp": 1.02381408, "epoch": 0.6127577706968076, "flos": 20448865008000.0, "grad_norm": 1.6418615510150465, "language_loss": 0.80705225, "learning_rate": 1.3772265214756074e-06, "loss": 0.82848579, "num_input_tokens_seen": 109831225, "step": 5096, "time_per_iteration": 2.6158127784729004 }, { "auxiliary_loss_clip": 0.01130908, "auxiliary_loss_mlp": 0.01041362, "balance_loss_clip": 1.043679, "balance_loss_mlp": 1.02344465, "epoch": 0.6128780135874466, "flos": 18260397072000.0, "grad_norm": 2.1800892854311775, "language_loss": 0.74803698, "learning_rate": 1.3764863250050025e-06, "loss": 0.76975971, "num_input_tokens_seen": 109849465, "step": 5097, "time_per_iteration": 2.6013410091400146 }, { "auxiliary_loss_clip": 0.01100743, "auxiliary_loss_mlp": 0.01036883, "balance_loss_clip": 1.04211354, "balance_loss_mlp": 1.02071822, "epoch": 0.6129982564780857, "flos": 24937088192640.0, "grad_norm": 1.6038611567605305, "language_loss": 0.80588657, "learning_rate": 1.3757462231209272e-06, "loss": 0.82726288, "num_input_tokens_seen": 109869770, "step": 5098, "time_per_iteration": 2.728696823120117 }, { "auxiliary_loss_clip": 0.01106989, "auxiliary_loss_mlp": 0.01041902, "balance_loss_clip": 1.04143333, "balance_loss_mlp": 1.02553463, "epoch": 0.6131184993687249, "flos": 22492038430080.0, "grad_norm": 2.527179499383733, "language_loss": 0.88986182, "learning_rate": 1.3750062159356525e-06, "loss": 0.91135073, "num_input_tokens_seen": 109889120, "step": 5099, "time_per_iteration": 2.691903829574585 }, { "auxiliary_loss_clip": 0.01092914, "auxiliary_loss_mlp": 0.01039032, "balance_loss_clip": 1.04152274, "balance_loss_mlp": 1.0230341, "epoch": 0.6132387422593639, "flos": 15885839750400.0, "grad_norm": 1.833830574834583, "language_loss": 0.83014667, "learning_rate": 1.3742663035614382e-06, "loss": 0.85146612, "num_input_tokens_seen": 109906490, "step": 5100, "time_per_iteration": 2.734654188156128 }, { "auxiliary_loss_clip": 0.01140621, "auxiliary_loss_mlp": 0.01043073, "balance_loss_clip": 1.04691792, "balance_loss_mlp": 1.02755165, "epoch": 0.613358985150003, "flos": 25411539962880.0, "grad_norm": 2.2115921143679005, "language_loss": 0.79876375, "learning_rate": 1.3735264861105283e-06, "loss": 0.82060069, "num_input_tokens_seen": 109927130, "step": 5101, "time_per_iteration": 3.4835543632507324 }, { "auxiliary_loss_clip": 0.01101719, "auxiliary_loss_mlp": 0.01039809, "balance_loss_clip": 1.04164445, "balance_loss_mlp": 1.02407336, "epoch": 0.6134792280406421, "flos": 21361283308800.0, "grad_norm": 2.1021592451285493, "language_loss": 0.78861046, "learning_rate": 1.372786763695152e-06, "loss": 0.81002569, "num_input_tokens_seen": 109945890, "step": 5102, "time_per_iteration": 2.6952216625213623 }, { "auxiliary_loss_clip": 0.01125172, "auxiliary_loss_mlp": 0.0104052, "balance_loss_clip": 1.04295707, "balance_loss_mlp": 1.02354455, "epoch": 0.6135994709312812, "flos": 21211248199680.0, "grad_norm": 1.8061685726202306, "language_loss": 0.77498215, "learning_rate": 1.3720471364275257e-06, "loss": 0.79663908, "num_input_tokens_seen": 109965535, "step": 5103, "time_per_iteration": 2.625657081604004 }, { "auxiliary_loss_clip": 0.01099388, "auxiliary_loss_mlp": 0.00772838, "balance_loss_clip": 1.04175925, "balance_loss_mlp": 1.00052059, "epoch": 0.6137197138219203, "flos": 14794047907200.0, "grad_norm": 2.2413880857500343, "language_loss": 0.78301787, "learning_rate": 1.3713076044198486e-06, "loss": 0.80174017, "num_input_tokens_seen": 109982345, "step": 5104, "time_per_iteration": 3.7140865325927734 }, { "auxiliary_loss_clip": 0.01106781, "auxiliary_loss_mlp": 0.01039634, "balance_loss_clip": 1.04115009, "balance_loss_mlp": 1.0233624, "epoch": 0.6138399567125594, "flos": 20084515401600.0, "grad_norm": 2.4515966862860177, "language_loss": 0.80885148, "learning_rate": 1.3705681677843086e-06, "loss": 0.83031571, "num_input_tokens_seen": 110000940, "step": 5105, "time_per_iteration": 2.6721315383911133 }, { "auxiliary_loss_clip": 0.01044421, "auxiliary_loss_mlp": 0.01005103, "balance_loss_clip": 1.01401925, "balance_loss_mlp": 1.00361252, "epoch": 0.6139601996031985, "flos": 60123838193280.0, "grad_norm": 0.7667498171426471, "language_loss": 0.60494161, "learning_rate": 1.3698288266330768e-06, "loss": 0.62543684, "num_input_tokens_seen": 110061565, "step": 5106, "time_per_iteration": 3.2653775215148926 }, { "auxiliary_loss_clip": 0.01104087, "auxiliary_loss_mlp": 0.01043946, "balance_loss_clip": 1.04142189, "balance_loss_mlp": 1.02893806, "epoch": 0.6140804424938375, "flos": 23586703361280.0, "grad_norm": 2.3580102710367234, "language_loss": 0.73322189, "learning_rate": 1.3690895810783113e-06, "loss": 0.75470221, "num_input_tokens_seen": 110080360, "step": 5107, "time_per_iteration": 2.663503408432007 }, { "auxiliary_loss_clip": 0.01077239, "auxiliary_loss_mlp": 0.00772849, "balance_loss_clip": 1.03773344, "balance_loss_mlp": 1.00066137, "epoch": 0.6142006853844767, "flos": 21398199511680.0, "grad_norm": 2.072217727210164, "language_loss": 0.71171874, "learning_rate": 1.3683504312321543e-06, "loss": 0.73021966, "num_input_tokens_seen": 110100695, "step": 5108, "time_per_iteration": 2.784343719482422 }, { "auxiliary_loss_clip": 0.01129928, "auxiliary_loss_mlp": 0.01038097, "balance_loss_clip": 1.04446876, "balance_loss_mlp": 1.02214694, "epoch": 0.6143209282751158, "flos": 12057367622400.0, "grad_norm": 2.2320024927291784, "language_loss": 0.80124354, "learning_rate": 1.3676113772067355e-06, "loss": 0.82292384, "num_input_tokens_seen": 110117750, "step": 5109, "time_per_iteration": 3.633758783340454 }, { "auxiliary_loss_clip": 0.01095914, "auxiliary_loss_mlp": 0.01043549, "balance_loss_clip": 1.04069686, "balance_loss_mlp": 1.0265615, "epoch": 0.6144411711657548, "flos": 25082274965760.0, "grad_norm": 1.9981985514050873, "language_loss": 0.72688478, "learning_rate": 1.3668724191141671e-06, "loss": 0.74827939, "num_input_tokens_seen": 110137020, "step": 5110, "time_per_iteration": 2.7472639083862305 }, { "auxiliary_loss_clip": 0.01093706, "auxiliary_loss_mlp": 0.01044555, "balance_loss_clip": 1.04332173, "balance_loss_mlp": 1.02899861, "epoch": 0.6145614140563939, "flos": 20114069316480.0, "grad_norm": 5.858460521998835, "language_loss": 0.66480172, "learning_rate": 1.3661335570665493e-06, "loss": 0.68618441, "num_input_tokens_seen": 110154930, "step": 5111, "time_per_iteration": 2.6940042972564697 }, { "auxiliary_loss_clip": 0.01119954, "auxiliary_loss_mlp": 0.01038214, "balance_loss_clip": 1.04781294, "balance_loss_mlp": 1.02200186, "epoch": 0.614681656947033, "flos": 16800376953600.0, "grad_norm": 2.2633453994921107, "language_loss": 0.70078731, "learning_rate": 1.3653947911759676e-06, "loss": 0.72236896, "num_input_tokens_seen": 110172480, "step": 5112, "time_per_iteration": 3.566758155822754 }, { "auxiliary_loss_clip": 0.01082078, "auxiliary_loss_mlp": 0.01043524, "balance_loss_clip": 1.03790939, "balance_loss_mlp": 1.02785969, "epoch": 0.6148018998376721, "flos": 38801587011840.0, "grad_norm": 1.804912068158888, "language_loss": 0.74582994, "learning_rate": 1.3646561215544904e-06, "loss": 0.76708597, "num_input_tokens_seen": 110197120, "step": 5113, "time_per_iteration": 2.9990665912628174 }, { "auxiliary_loss_clip": 0.01127032, "auxiliary_loss_mlp": 0.0104102, "balance_loss_clip": 1.04466569, "balance_loss_mlp": 1.02486753, "epoch": 0.6149221427283111, "flos": 23327032965120.0, "grad_norm": 2.247526477248546, "language_loss": 0.79641354, "learning_rate": 1.363917548314176e-06, "loss": 0.81809402, "num_input_tokens_seen": 110216385, "step": 5114, "time_per_iteration": 2.627572536468506 }, { "auxiliary_loss_clip": 0.01130351, "auxiliary_loss_mlp": 0.01043303, "balance_loss_clip": 1.04507613, "balance_loss_mlp": 1.02598238, "epoch": 0.6150423856189503, "flos": 22379494141440.0, "grad_norm": 1.8418190151775478, "language_loss": 0.73129845, "learning_rate": 1.3631790715670626e-06, "loss": 0.75303495, "num_input_tokens_seen": 110234790, "step": 5115, "time_per_iteration": 2.6223878860473633 }, { "auxiliary_loss_clip": 0.01049582, "auxiliary_loss_mlp": 0.01039478, "balance_loss_clip": 1.03555369, "balance_loss_mlp": 1.02530444, "epoch": 0.6151626285095894, "flos": 18692078722560.0, "grad_norm": 1.742820127764419, "language_loss": 0.85527802, "learning_rate": 1.3624406914251783e-06, "loss": 0.87616861, "num_input_tokens_seen": 110251910, "step": 5116, "time_per_iteration": 2.8619813919067383 }, { "auxiliary_loss_clip": 0.01123061, "auxiliary_loss_mlp": 0.01039059, "balance_loss_clip": 1.04258764, "balance_loss_mlp": 1.0241816, "epoch": 0.6152828714002284, "flos": 15851688894720.0, "grad_norm": 2.1712322125780816, "language_loss": 0.88430083, "learning_rate": 1.3617024080005335e-06, "loss": 0.905922, "num_input_tokens_seen": 110268810, "step": 5117, "time_per_iteration": 2.7927663326263428 }, { "auxiliary_loss_clip": 0.01117437, "auxiliary_loss_mlp": 0.00772534, "balance_loss_clip": 1.04368436, "balance_loss_mlp": 1.00059366, "epoch": 0.6154031142908676, "flos": 24869792062080.0, "grad_norm": 2.1226737565539078, "language_loss": 0.74199122, "learning_rate": 1.3609642214051266e-06, "loss": 0.7608909, "num_input_tokens_seen": 110293035, "step": 5118, "time_per_iteration": 2.739464521408081 }, { "auxiliary_loss_clip": 0.01102308, "auxiliary_loss_mlp": 0.01037009, "balance_loss_clip": 1.04120231, "balance_loss_mlp": 1.02048612, "epoch": 0.6155233571815066, "flos": 19244744357760.0, "grad_norm": 7.279059881623801, "language_loss": 0.66271704, "learning_rate": 1.3602261317509385e-06, "loss": 0.68411022, "num_input_tokens_seen": 110309695, "step": 5119, "time_per_iteration": 2.6491987705230713 }, { "auxiliary_loss_clip": 0.01127922, "auxiliary_loss_mlp": 0.01038357, "balance_loss_clip": 1.04358733, "balance_loss_mlp": 1.02159595, "epoch": 0.6156436000721457, "flos": 18770077105920.0, "grad_norm": 4.181594986578309, "language_loss": 0.83080572, "learning_rate": 1.3594881391499387e-06, "loss": 0.85246849, "num_input_tokens_seen": 110328610, "step": 5120, "time_per_iteration": 2.5987436771392822 }, { "auxiliary_loss_clip": 0.01115837, "auxiliary_loss_mlp": 0.0104652, "balance_loss_clip": 1.04596615, "balance_loss_mlp": 1.03046274, "epoch": 0.6157638429627849, "flos": 18041198325120.0, "grad_norm": 1.791457635568787, "language_loss": 0.79562783, "learning_rate": 1.3587502437140778e-06, "loss": 0.81725144, "num_input_tokens_seen": 110346775, "step": 5121, "time_per_iteration": 2.6993675231933594 }, { "auxiliary_loss_clip": 0.01119396, "auxiliary_loss_mlp": 0.01040148, "balance_loss_clip": 1.04518759, "balance_loss_mlp": 1.02386415, "epoch": 0.6158840858534239, "flos": 25556726736000.0, "grad_norm": 2.439953724994625, "language_loss": 0.84837878, "learning_rate": 1.3580124455552952e-06, "loss": 0.86997426, "num_input_tokens_seen": 110366140, "step": 5122, "time_per_iteration": 2.6832377910614014 }, { "auxiliary_loss_clip": 0.01127842, "auxiliary_loss_mlp": 0.00772492, "balance_loss_clip": 1.04632521, "balance_loss_mlp": 1.00061226, "epoch": 0.616004328744063, "flos": 24640788902400.0, "grad_norm": 1.7047844196040034, "language_loss": 0.87546015, "learning_rate": 1.3572747447855148e-06, "loss": 0.89446348, "num_input_tokens_seen": 110386550, "step": 5123, "time_per_iteration": 2.6350293159484863 }, { "auxiliary_loss_clip": 0.01143088, "auxiliary_loss_mlp": 0.01042875, "balance_loss_clip": 1.04745626, "balance_loss_mlp": 1.02666283, "epoch": 0.6161245716347021, "flos": 21689686379520.0, "grad_norm": 1.967369653896155, "language_loss": 0.69441319, "learning_rate": 1.356537141516644e-06, "loss": 0.71627283, "num_input_tokens_seen": 110403970, "step": 5124, "time_per_iteration": 2.614234209060669 }, { "auxiliary_loss_clip": 0.01127399, "auxiliary_loss_mlp": 0.01036398, "balance_loss_clip": 1.04655838, "balance_loss_mlp": 1.02091324, "epoch": 0.6162448145253412, "flos": 35189225061120.0, "grad_norm": 2.0843872533444516, "language_loss": 0.61726367, "learning_rate": 1.3557996358605775e-06, "loss": 0.63890165, "num_input_tokens_seen": 110423890, "step": 5125, "time_per_iteration": 2.7199935913085938 }, { "auxiliary_loss_clip": 0.01123924, "auxiliary_loss_mlp": 0.01040763, "balance_loss_clip": 1.04392958, "balance_loss_mlp": 1.02641606, "epoch": 0.6163650574159802, "flos": 21615279356160.0, "grad_norm": 3.025215088503159, "language_loss": 0.70500445, "learning_rate": 1.3550622279291941e-06, "loss": 0.72665131, "num_input_tokens_seen": 110442035, "step": 5126, "time_per_iteration": 2.6539113521575928 }, { "auxiliary_loss_clip": 0.01078067, "auxiliary_loss_mlp": 0.01043417, "balance_loss_clip": 1.03693271, "balance_loss_mlp": 1.02766919, "epoch": 0.6164853003066194, "flos": 24572163968640.0, "grad_norm": 1.3848821330256893, "language_loss": 0.83218926, "learning_rate": 1.354324917834358e-06, "loss": 0.8534041, "num_input_tokens_seen": 110463280, "step": 5127, "time_per_iteration": 3.7543535232543945 }, { "auxiliary_loss_clip": 0.01068381, "auxiliary_loss_mlp": 0.00772975, "balance_loss_clip": 1.03762007, "balance_loss_mlp": 1.00058746, "epoch": 0.6166055431972585, "flos": 21835986474240.0, "grad_norm": 1.8852598766301447, "language_loss": 0.77213895, "learning_rate": 1.353587705687918e-06, "loss": 0.79055262, "num_input_tokens_seen": 110481455, "step": 5128, "time_per_iteration": 2.7335567474365234 }, { "auxiliary_loss_clip": 0.01116958, "auxiliary_loss_mlp": 0.01044823, "balance_loss_clip": 1.04247952, "balance_loss_mlp": 1.02675068, "epoch": 0.6167257860878975, "flos": 17785262943360.0, "grad_norm": 2.629418680002442, "language_loss": 0.7231636, "learning_rate": 1.3528505916017096e-06, "loss": 0.74478137, "num_input_tokens_seen": 110499155, "step": 5129, "time_per_iteration": 2.6308178901672363 }, { "auxiliary_loss_clip": 0.01128837, "auxiliary_loss_mlp": 0.01040893, "balance_loss_clip": 1.04460669, "balance_loss_mlp": 1.02477574, "epoch": 0.6168460289785367, "flos": 23214811898880.0, "grad_norm": 4.059820046601659, "language_loss": 0.88745373, "learning_rate": 1.3521135756875514e-06, "loss": 0.90915102, "num_input_tokens_seen": 110515470, "step": 5130, "time_per_iteration": 3.5728137493133545 }, { "auxiliary_loss_clip": 0.01064864, "auxiliary_loss_mlp": 0.01039543, "balance_loss_clip": 1.03803468, "balance_loss_mlp": 1.02483261, "epoch": 0.6169662718691757, "flos": 26213281482240.0, "grad_norm": 1.4337084662537207, "language_loss": 0.86274666, "learning_rate": 1.3513766580572496e-06, "loss": 0.88379073, "num_input_tokens_seen": 110538290, "step": 5131, "time_per_iteration": 2.797391653060913 }, { "auxiliary_loss_clip": 0.0112422, "auxiliary_loss_mlp": 0.01041474, "balance_loss_clip": 1.04500413, "balance_loss_mlp": 1.02620316, "epoch": 0.6170865147598148, "flos": 19026120228480.0, "grad_norm": 2.294108345379095, "language_loss": 0.77361542, "learning_rate": 1.3506398388225924e-06, "loss": 0.79527235, "num_input_tokens_seen": 110555610, "step": 5132, "time_per_iteration": 2.598785400390625 }, { "auxiliary_loss_clip": 0.01139399, "auxiliary_loss_mlp": 0.01043814, "balance_loss_clip": 1.04842794, "balance_loss_mlp": 1.0271368, "epoch": 0.617206757650454, "flos": 18260361158400.0, "grad_norm": 1.7772946364759794, "language_loss": 0.71917671, "learning_rate": 1.349903118095355e-06, "loss": 0.74100882, "num_input_tokens_seen": 110574745, "step": 5133, "time_per_iteration": 2.628007411956787 }, { "auxiliary_loss_clip": 0.01127696, "auxiliary_loss_mlp": 0.01038055, "balance_loss_clip": 1.04313231, "balance_loss_mlp": 1.02192569, "epoch": 0.617327000541093, "flos": 18186959715840.0, "grad_norm": 6.334353793996886, "language_loss": 0.73571229, "learning_rate": 1.349166495987298e-06, "loss": 0.75736976, "num_input_tokens_seen": 110593310, "step": 5134, "time_per_iteration": 2.6153805255889893 }, { "auxiliary_loss_clip": 0.01015473, "auxiliary_loss_mlp": 0.01000896, "balance_loss_clip": 1.00982666, "balance_loss_mlp": 0.99902457, "epoch": 0.6174472434317321, "flos": 61833796122240.0, "grad_norm": 0.8124641721534498, "language_loss": 0.60870969, "learning_rate": 1.348429972610166e-06, "loss": 0.62887335, "num_input_tokens_seen": 110657615, "step": 5135, "time_per_iteration": 4.338713645935059 }, { "auxiliary_loss_clip": 0.00995332, "auxiliary_loss_mlp": 0.01004665, "balance_loss_clip": 1.01147926, "balance_loss_mlp": 1.00309169, "epoch": 0.6175674863223712, "flos": 71230970494080.0, "grad_norm": 0.8839894170485539, "language_loss": 0.57838583, "learning_rate": 1.3476935480756897e-06, "loss": 0.59838581, "num_input_tokens_seen": 110714365, "step": 5136, "time_per_iteration": 3.2297093868255615 }, { "auxiliary_loss_clip": 0.01091532, "auxiliary_loss_mlp": 0.01050585, "balance_loss_clip": 1.04044628, "balance_loss_mlp": 1.03124905, "epoch": 0.6176877292130103, "flos": 21835447770240.0, "grad_norm": 2.164790293140089, "language_loss": 0.75194472, "learning_rate": 1.346957222495583e-06, "loss": 0.77336591, "num_input_tokens_seen": 110732160, "step": 5137, "time_per_iteration": 2.658196449279785 }, { "auxiliary_loss_clip": 0.01121661, "auxiliary_loss_mlp": 0.00773247, "balance_loss_clip": 1.04672956, "balance_loss_mlp": 1.00060964, "epoch": 0.6178079721036493, "flos": 17741738638080.0, "grad_norm": 2.794797515769164, "language_loss": 0.71213448, "learning_rate": 1.3462209959815466e-06, "loss": 0.73108357, "num_input_tokens_seen": 110746900, "step": 5138, "time_per_iteration": 3.514491081237793 }, { "auxiliary_loss_clip": 0.01112993, "auxiliary_loss_mlp": 0.01036903, "balance_loss_clip": 1.04257524, "balance_loss_mlp": 1.02173984, "epoch": 0.6179282149942885, "flos": 22633131052800.0, "grad_norm": 1.836272400060904, "language_loss": 0.74580121, "learning_rate": 1.345484868645265e-06, "loss": 0.76730013, "num_input_tokens_seen": 110765710, "step": 5139, "time_per_iteration": 2.6263251304626465 }, { "auxiliary_loss_clip": 0.01107167, "auxiliary_loss_mlp": 0.01034733, "balance_loss_clip": 1.04335606, "balance_loss_mlp": 1.01841378, "epoch": 0.6180484578849276, "flos": 22310330503680.0, "grad_norm": 2.011490580358348, "language_loss": 0.78531539, "learning_rate": 1.3447488405984088e-06, "loss": 0.80673438, "num_input_tokens_seen": 110783970, "step": 5140, "time_per_iteration": 2.6993937492370605 }, { "auxiliary_loss_clip": 0.01113937, "auxiliary_loss_mlp": 0.01048255, "balance_loss_clip": 1.04255867, "balance_loss_mlp": 1.03150618, "epoch": 0.6181687007755666, "flos": 35225458905600.0, "grad_norm": 2.6111819676248187, "language_loss": 0.70402312, "learning_rate": 1.3440129119526322e-06, "loss": 0.72564507, "num_input_tokens_seen": 110806395, "step": 5141, "time_per_iteration": 2.791656017303467 }, { "auxiliary_loss_clip": 0.0104465, "auxiliary_loss_mlp": 0.00999694, "balance_loss_clip": 1.0142808, "balance_loss_mlp": 0.99828702, "epoch": 0.6182889436662057, "flos": 61547370094080.0, "grad_norm": 0.8068220238114084, "language_loss": 0.51178885, "learning_rate": 1.3432770828195762e-06, "loss": 0.53223228, "num_input_tokens_seen": 110867380, "step": 5142, "time_per_iteration": 3.2966959476470947 }, { "auxiliary_loss_clip": 0.01089059, "auxiliary_loss_mlp": 0.01038561, "balance_loss_clip": 1.03689539, "balance_loss_mlp": 1.02180064, "epoch": 0.6184091865568448, "flos": 19609991804160.0, "grad_norm": 2.6139080466115545, "language_loss": 0.70621705, "learning_rate": 1.3425413533108635e-06, "loss": 0.72749329, "num_input_tokens_seen": 110885980, "step": 5143, "time_per_iteration": 2.6508731842041016 }, { "auxiliary_loss_clip": 0.01095368, "auxiliary_loss_mlp": 0.01048381, "balance_loss_clip": 1.04590857, "balance_loss_mlp": 1.03121531, "epoch": 0.6185294294474839, "flos": 23586882929280.0, "grad_norm": 2.1244912878269155, "language_loss": 0.70944488, "learning_rate": 1.341805723538105e-06, "loss": 0.73088235, "num_input_tokens_seen": 110906085, "step": 5144, "time_per_iteration": 2.7264318466186523 }, { "auxiliary_loss_clip": 0.01119671, "auxiliary_loss_mlp": 0.01043641, "balance_loss_clip": 1.04473019, "balance_loss_mlp": 1.02776265, "epoch": 0.618649672338123, "flos": 26762032535040.0, "grad_norm": 1.7039022172562355, "language_loss": 0.77580428, "learning_rate": 1.3410701936128948e-06, "loss": 0.79743743, "num_input_tokens_seen": 110928865, "step": 5145, "time_per_iteration": 2.688196897506714 }, { "auxiliary_loss_clip": 0.01129262, "auxiliary_loss_mlp": 0.01050348, "balance_loss_clip": 1.0476265, "balance_loss_mlp": 1.03480268, "epoch": 0.6187699152287621, "flos": 14456630522880.0, "grad_norm": 2.765736198162867, "language_loss": 0.84840155, "learning_rate": 1.340334763646812e-06, "loss": 0.87019765, "num_input_tokens_seen": 110943000, "step": 5146, "time_per_iteration": 2.565692663192749 }, { "auxiliary_loss_clip": 0.01140902, "auxiliary_loss_mlp": 0.01039342, "balance_loss_clip": 1.04537165, "balance_loss_mlp": 1.02315354, "epoch": 0.6188901581194012, "flos": 20084766796800.0, "grad_norm": 1.6340519529694377, "language_loss": 0.74530387, "learning_rate": 1.3395994337514218e-06, "loss": 0.76710629, "num_input_tokens_seen": 110963170, "step": 5147, "time_per_iteration": 2.5889039039611816 }, { "auxiliary_loss_clip": 0.01118395, "auxiliary_loss_mlp": 0.01048191, "balance_loss_clip": 1.04168439, "balance_loss_mlp": 1.03239536, "epoch": 0.6190104010100402, "flos": 25700728360320.0, "grad_norm": 1.529920542370817, "language_loss": 0.78658628, "learning_rate": 1.3388642040382725e-06, "loss": 0.8082521, "num_input_tokens_seen": 110983595, "step": 5148, "time_per_iteration": 2.6148552894592285 }, { "auxiliary_loss_clip": 0.01101201, "auxiliary_loss_mlp": 0.01041052, "balance_loss_clip": 1.03953123, "balance_loss_mlp": 1.02373075, "epoch": 0.6191306439006794, "flos": 30442372974720.0, "grad_norm": 1.7375165451156218, "language_loss": 0.84398651, "learning_rate": 1.3381290746188975e-06, "loss": 0.86540902, "num_input_tokens_seen": 111002965, "step": 5149, "time_per_iteration": 2.7277746200561523 }, { "auxiliary_loss_clip": 0.01126227, "auxiliary_loss_mlp": 0.01039523, "balance_loss_clip": 1.04481614, "balance_loss_mlp": 1.02468109, "epoch": 0.6192508867913185, "flos": 26685793918080.0, "grad_norm": 1.749905312408953, "language_loss": 0.67259049, "learning_rate": 1.3373940456048152e-06, "loss": 0.69424808, "num_input_tokens_seen": 111022990, "step": 5150, "time_per_iteration": 2.665835380554199 }, { "auxiliary_loss_clip": 0.01138225, "auxiliary_loss_mlp": 0.01040122, "balance_loss_clip": 1.04778552, "balance_loss_mlp": 1.02497029, "epoch": 0.6193711296819575, "flos": 36722036090880.0, "grad_norm": 1.6402904331495125, "language_loss": 0.5935232, "learning_rate": 1.3366591171075299e-06, "loss": 0.61530668, "num_input_tokens_seen": 111046495, "step": 5151, "time_per_iteration": 2.7075107097625732 }, { "auxiliary_loss_clip": 0.01113121, "auxiliary_loss_mlp": 0.01035524, "balance_loss_clip": 1.04395556, "balance_loss_mlp": 1.02049148, "epoch": 0.6194913725725967, "flos": 25192556697600.0, "grad_norm": 2.6294893076510975, "language_loss": 0.91024578, "learning_rate": 1.335924289238529e-06, "loss": 0.93173218, "num_input_tokens_seen": 111065705, "step": 5152, "time_per_iteration": 2.712000846862793 }, { "auxiliary_loss_clip": 0.01122178, "auxiliary_loss_mlp": 0.00772465, "balance_loss_clip": 1.04622436, "balance_loss_mlp": 1.00072098, "epoch": 0.6196116154632357, "flos": 21178821196800.0, "grad_norm": 1.7399651592349807, "language_loss": 0.76951528, "learning_rate": 1.3351895621092859e-06, "loss": 0.78846174, "num_input_tokens_seen": 111086050, "step": 5153, "time_per_iteration": 3.469853162765503 }, { "auxiliary_loss_clip": 0.01033568, "auxiliary_loss_mlp": 0.01045617, "balance_loss_clip": 1.03055573, "balance_loss_mlp": 1.0286659, "epoch": 0.6197318583538748, "flos": 16253744803200.0, "grad_norm": 2.034695429680172, "language_loss": 0.76254022, "learning_rate": 1.3344549358312567e-06, "loss": 0.78333199, "num_input_tokens_seen": 111104450, "step": 5154, "time_per_iteration": 2.9533700942993164 }, { "auxiliary_loss_clip": 0.01130427, "auxiliary_loss_mlp": 0.01035928, "balance_loss_clip": 1.04520202, "balance_loss_mlp": 1.01985884, "epoch": 0.619852101244514, "flos": 24425612478720.0, "grad_norm": 1.9096074572533064, "language_loss": 0.78426182, "learning_rate": 1.3337204105158852e-06, "loss": 0.80592537, "num_input_tokens_seen": 111123320, "step": 5155, "time_per_iteration": 2.860733985900879 }, { "auxiliary_loss_clip": 0.01087607, "auxiliary_loss_mlp": 0.01037969, "balance_loss_clip": 1.03531706, "balance_loss_mlp": 1.0218761, "epoch": 0.619972344135153, "flos": 16727298733440.0, "grad_norm": 2.0605920865436165, "language_loss": 0.72508192, "learning_rate": 1.332985986274597e-06, "loss": 0.74633771, "num_input_tokens_seen": 111140950, "step": 5156, "time_per_iteration": 3.5870795249938965 }, { "auxiliary_loss_clip": 0.01066134, "auxiliary_loss_mlp": 0.00771356, "balance_loss_clip": 1.0390147, "balance_loss_mlp": 1.00066328, "epoch": 0.6200925870257921, "flos": 12495190498560.0, "grad_norm": 2.009854656072832, "language_loss": 0.75531, "learning_rate": 1.3322516632188047e-06, "loss": 0.77368492, "num_input_tokens_seen": 111157845, "step": 5157, "time_per_iteration": 2.714803695678711 }, { "auxiliary_loss_clip": 0.01099286, "auxiliary_loss_mlp": 0.01043861, "balance_loss_clip": 1.03922951, "balance_loss_mlp": 1.02633762, "epoch": 0.6202128299164312, "flos": 26539350168960.0, "grad_norm": 2.0442204324936117, "language_loss": 0.6693151, "learning_rate": 1.3315174414599045e-06, "loss": 0.69074661, "num_input_tokens_seen": 111179165, "step": 5158, "time_per_iteration": 2.7332890033721924 }, { "auxiliary_loss_clip": 0.01121566, "auxiliary_loss_mlp": 0.01047518, "balance_loss_clip": 1.04305625, "balance_loss_mlp": 1.02957654, "epoch": 0.6203330728070703, "flos": 18770508069120.0, "grad_norm": 2.0521969417994192, "language_loss": 0.7531969, "learning_rate": 1.3307833211092768e-06, "loss": 0.77488774, "num_input_tokens_seen": 111197830, "step": 5159, "time_per_iteration": 2.6328232288360596 }, { "auxiliary_loss_clip": 0.01136302, "auxiliary_loss_mlp": 0.01043547, "balance_loss_clip": 1.04527116, "balance_loss_mlp": 1.02797818, "epoch": 0.6204533156977093, "flos": 20629782835200.0, "grad_norm": 1.5662004367642166, "language_loss": 0.75102782, "learning_rate": 1.3300493022782873e-06, "loss": 0.77282631, "num_input_tokens_seen": 111218400, "step": 5160, "time_per_iteration": 2.587661027908325 }, { "auxiliary_loss_clip": 0.01076715, "auxiliary_loss_mlp": 0.00775332, "balance_loss_clip": 1.03633404, "balance_loss_mlp": 1.00061953, "epoch": 0.6205735585883485, "flos": 17348050598400.0, "grad_norm": 1.9395737693283643, "language_loss": 0.72369444, "learning_rate": 1.3293153850782855e-06, "loss": 0.74221498, "num_input_tokens_seen": 111236720, "step": 5161, "time_per_iteration": 3.694838523864746 }, { "auxiliary_loss_clip": 0.01092183, "auxiliary_loss_mlp": 0.01043291, "balance_loss_clip": 1.03907835, "balance_loss_mlp": 1.02554059, "epoch": 0.6206938014789876, "flos": 22965017742720.0, "grad_norm": 3.03186368330571, "language_loss": 0.71475875, "learning_rate": 1.3285815696206069e-06, "loss": 0.73611355, "num_input_tokens_seen": 111258265, "step": 5162, "time_per_iteration": 2.7175753116607666 }, { "auxiliary_loss_clip": 0.0110208, "auxiliary_loss_mlp": 0.0103703, "balance_loss_clip": 1.03991199, "balance_loss_mlp": 1.02074575, "epoch": 0.6208140443696266, "flos": 23983192661760.0, "grad_norm": 3.6413074597544095, "language_loss": 0.76731431, "learning_rate": 1.32784785601657e-06, "loss": 0.78870541, "num_input_tokens_seen": 111277675, "step": 5163, "time_per_iteration": 3.588961362838745 }, { "auxiliary_loss_clip": 0.0111247, "auxiliary_loss_mlp": 0.01042623, "balance_loss_clip": 1.04087973, "balance_loss_mlp": 1.02598178, "epoch": 0.6209342872602658, "flos": 35077291303680.0, "grad_norm": 1.8690048211763688, "language_loss": 0.73849916, "learning_rate": 1.3271142443774798e-06, "loss": 0.76005006, "num_input_tokens_seen": 111299910, "step": 5164, "time_per_iteration": 2.75592041015625 }, { "auxiliary_loss_clip": 0.01108737, "auxiliary_loss_mlp": 0.01028595, "balance_loss_clip": 1.04202127, "balance_loss_mlp": 1.0147543, "epoch": 0.6210545301509048, "flos": 26979327861120.0, "grad_norm": 1.9320107909496271, "language_loss": 0.81669247, "learning_rate": 1.3263807348146228e-06, "loss": 0.83806574, "num_input_tokens_seen": 111319765, "step": 5165, "time_per_iteration": 2.7603538036346436 }, { "auxiliary_loss_clip": 0.01108441, "auxiliary_loss_mlp": 0.01042596, "balance_loss_clip": 1.03874946, "balance_loss_mlp": 1.02613294, "epoch": 0.6211747730415439, "flos": 33618240852480.0, "grad_norm": 5.471627734935179, "language_loss": 0.73535389, "learning_rate": 1.3256473274392733e-06, "loss": 0.75686425, "num_input_tokens_seen": 111341110, "step": 5166, "time_per_iteration": 2.7635576725006104 }, { "auxiliary_loss_clip": 0.01136928, "auxiliary_loss_mlp": 0.01040891, "balance_loss_clip": 1.04444575, "balance_loss_mlp": 1.02387977, "epoch": 0.6212950159321831, "flos": 34167099646080.0, "grad_norm": 1.7763235541075217, "language_loss": 0.69982219, "learning_rate": 1.3249140223626873e-06, "loss": 0.72160041, "num_input_tokens_seen": 111362730, "step": 5167, "time_per_iteration": 2.6514415740966797 }, { "auxiliary_loss_clip": 0.0112189, "auxiliary_loss_mlp": 0.01048359, "balance_loss_clip": 1.04393172, "balance_loss_mlp": 1.0324204, "epoch": 0.6214152588228221, "flos": 27965758135680.0, "grad_norm": 1.8414394135506509, "language_loss": 0.75373983, "learning_rate": 1.3241808196961077e-06, "loss": 0.77544224, "num_input_tokens_seen": 111383855, "step": 5168, "time_per_iteration": 2.674501895904541 }, { "auxiliary_loss_clip": 0.01103015, "auxiliary_loss_mlp": 0.01038341, "balance_loss_clip": 1.0420959, "balance_loss_mlp": 1.02274835, "epoch": 0.6215355017134612, "flos": 20230204965120.0, "grad_norm": 1.7028828502300049, "language_loss": 0.70894909, "learning_rate": 1.3234477195507608e-06, "loss": 0.73036259, "num_input_tokens_seen": 111402685, "step": 5169, "time_per_iteration": 2.66593861579895 }, { "auxiliary_loss_clip": 0.0110177, "auxiliary_loss_mlp": 0.01038382, "balance_loss_clip": 1.04301667, "balance_loss_mlp": 1.02176452, "epoch": 0.6216557446041003, "flos": 41428129219200.0, "grad_norm": 2.1404167959869347, "language_loss": 0.63064116, "learning_rate": 1.322714722037857e-06, "loss": 0.65204275, "num_input_tokens_seen": 111424130, "step": 5170, "time_per_iteration": 2.846234083175659 }, { "auxiliary_loss_clip": 0.01112428, "auxiliary_loss_mlp": 0.01049446, "balance_loss_clip": 1.04368997, "balance_loss_mlp": 1.03145766, "epoch": 0.6217759874947394, "flos": 27928770105600.0, "grad_norm": 2.376019041874857, "language_loss": 0.77855802, "learning_rate": 1.321981827268591e-06, "loss": 0.80017674, "num_input_tokens_seen": 111444785, "step": 5171, "time_per_iteration": 2.737314224243164 }, { "auxiliary_loss_clip": 0.01112322, "auxiliary_loss_mlp": 0.01035915, "balance_loss_clip": 1.03938556, "balance_loss_mlp": 1.02063286, "epoch": 0.6218962303853784, "flos": 21765673601280.0, "grad_norm": 1.6798285764839824, "language_loss": 0.8142221, "learning_rate": 1.3212490353541426e-06, "loss": 0.83570451, "num_input_tokens_seen": 111467045, "step": 5172, "time_per_iteration": 2.6694862842559814 }, { "auxiliary_loss_clip": 0.01139067, "auxiliary_loss_mlp": 0.01043034, "balance_loss_clip": 1.04571998, "balance_loss_mlp": 1.02652311, "epoch": 0.6220164732760175, "flos": 21246260981760.0, "grad_norm": 1.9317045018619137, "language_loss": 0.80578583, "learning_rate": 1.3205163464056762e-06, "loss": 0.82760686, "num_input_tokens_seen": 111483650, "step": 5173, "time_per_iteration": 2.583378791809082 }, { "auxiliary_loss_clip": 0.01123986, "auxiliary_loss_mlp": 0.01035447, "balance_loss_clip": 1.04301858, "balance_loss_mlp": 1.01973486, "epoch": 0.6221367161666567, "flos": 26136360506880.0, "grad_norm": 1.7927306227058541, "language_loss": 0.72650254, "learning_rate": 1.319783760534339e-06, "loss": 0.74809682, "num_input_tokens_seen": 111502895, "step": 5174, "time_per_iteration": 2.651792287826538 }, { "auxiliary_loss_clip": 0.01125851, "auxiliary_loss_mlp": 0.0103843, "balance_loss_clip": 1.04324067, "balance_loss_mlp": 1.02166963, "epoch": 0.6222569590572957, "flos": 16284196558080.0, "grad_norm": 2.37888746925432, "language_loss": 0.75433505, "learning_rate": 1.319051277851266e-06, "loss": 0.77597791, "num_input_tokens_seen": 111519180, "step": 5175, "time_per_iteration": 2.5912535190582275 }, { "auxiliary_loss_clip": 0.01126732, "auxiliary_loss_mlp": 0.01034402, "balance_loss_clip": 1.04482889, "balance_loss_mlp": 1.01888072, "epoch": 0.6223772019479348, "flos": 18223840005120.0, "grad_norm": 3.11549284148937, "language_loss": 0.84235537, "learning_rate": 1.3183188984675716e-06, "loss": 0.8639667, "num_input_tokens_seen": 111537545, "step": 5176, "time_per_iteration": 2.6124086380004883 }, { "auxiliary_loss_clip": 0.01114391, "auxiliary_loss_mlp": 0.0103971, "balance_loss_clip": 1.04421043, "balance_loss_mlp": 1.02260315, "epoch": 0.6224974448385739, "flos": 27489797994240.0, "grad_norm": 2.2774996488322703, "language_loss": 0.71453774, "learning_rate": 1.3175866224943586e-06, "loss": 0.73607868, "num_input_tokens_seen": 111556265, "step": 5177, "time_per_iteration": 2.6875176429748535 }, { "auxiliary_loss_clip": 0.0111856, "auxiliary_loss_mlp": 0.01039154, "balance_loss_clip": 1.04483521, "balance_loss_mlp": 1.02272081, "epoch": 0.622617687729213, "flos": 19791951125760.0, "grad_norm": 2.2935739557135815, "language_loss": 0.73575234, "learning_rate": 1.316854450042712e-06, "loss": 0.75732952, "num_input_tokens_seen": 111574205, "step": 5178, "time_per_iteration": 2.6583988666534424 }, { "auxiliary_loss_clip": 0.01128739, "auxiliary_loss_mlp": 0.01038496, "balance_loss_clip": 1.04411674, "balance_loss_mlp": 1.02349985, "epoch": 0.622737930619852, "flos": 23038886062080.0, "grad_norm": 3.182040564684381, "language_loss": 0.74774766, "learning_rate": 1.3161223812237024e-06, "loss": 0.76941991, "num_input_tokens_seen": 111593560, "step": 5179, "time_per_iteration": 3.5539650917053223 }, { "auxiliary_loss_clip": 0.01140308, "auxiliary_loss_mlp": 0.01044297, "balance_loss_clip": 1.04651642, "balance_loss_mlp": 1.02877581, "epoch": 0.6228581735104912, "flos": 12634271959680.0, "grad_norm": 3.0005752959394525, "language_loss": 0.85367244, "learning_rate": 1.3153904161483842e-06, "loss": 0.8755185, "num_input_tokens_seen": 111608860, "step": 5180, "time_per_iteration": 2.56467342376709 }, { "auxiliary_loss_clip": 0.01098909, "auxiliary_loss_mlp": 0.01038011, "balance_loss_clip": 1.03991413, "balance_loss_mlp": 1.02045178, "epoch": 0.6229784164011303, "flos": 23802813538560.0, "grad_norm": 2.243242676219412, "language_loss": 0.85740042, "learning_rate": 1.3146585549277953e-06, "loss": 0.87876964, "num_input_tokens_seen": 111627500, "step": 5181, "time_per_iteration": 2.6975040435791016 }, { "auxiliary_loss_clip": 0.01122421, "auxiliary_loss_mlp": 0.01042309, "balance_loss_clip": 1.04641032, "balance_loss_mlp": 1.0255003, "epoch": 0.6230986592917693, "flos": 22414219614720.0, "grad_norm": 9.247387724674045, "language_loss": 0.78195381, "learning_rate": 1.3139267976729591e-06, "loss": 0.80360115, "num_input_tokens_seen": 111647690, "step": 5182, "time_per_iteration": 3.951742172241211 }, { "auxiliary_loss_clip": 0.01128272, "auxiliary_loss_mlp": 0.01042665, "balance_loss_clip": 1.04507267, "balance_loss_mlp": 1.02589226, "epoch": 0.6232189021824085, "flos": 34528217028480.0, "grad_norm": 1.6583735197257417, "language_loss": 0.72243273, "learning_rate": 1.3131951444948815e-06, "loss": 0.74414206, "num_input_tokens_seen": 111667090, "step": 5183, "time_per_iteration": 2.7547714710235596 }, { "auxiliary_loss_clip": 0.01118375, "auxiliary_loss_mlp": 0.01039762, "balance_loss_clip": 1.04618943, "balance_loss_mlp": 1.02201152, "epoch": 0.6233391450730476, "flos": 22237000888320.0, "grad_norm": 4.190132173146591, "language_loss": 0.76554942, "learning_rate": 1.3124635955045546e-06, "loss": 0.78713071, "num_input_tokens_seen": 111686905, "step": 5184, "time_per_iteration": 2.6898694038391113 }, { "auxiliary_loss_clip": 0.0107529, "auxiliary_loss_mlp": 0.00773298, "balance_loss_clip": 1.03547597, "balance_loss_mlp": 1.00051236, "epoch": 0.6234593879636866, "flos": 20332693445760.0, "grad_norm": 1.994218007628383, "language_loss": 0.83992767, "learning_rate": 1.3117321508129537e-06, "loss": 0.85841358, "num_input_tokens_seen": 111704985, "step": 5185, "time_per_iteration": 2.711043119430542 }, { "auxiliary_loss_clip": 0.01116758, "auxiliary_loss_mlp": 0.01035348, "balance_loss_clip": 1.04397821, "balance_loss_mlp": 1.01839614, "epoch": 0.6235796308543258, "flos": 20664903358080.0, "grad_norm": 1.9005468026557275, "language_loss": 0.76618493, "learning_rate": 1.3110008105310388e-06, "loss": 0.78770602, "num_input_tokens_seen": 111724805, "step": 5186, "time_per_iteration": 2.666736602783203 }, { "auxiliary_loss_clip": 0.0113901, "auxiliary_loss_mlp": 0.0103775, "balance_loss_clip": 1.04388511, "balance_loss_mlp": 1.02095389, "epoch": 0.6236998737449648, "flos": 26618641441920.0, "grad_norm": 1.771005034214594, "language_loss": 0.78107429, "learning_rate": 1.3102695747697526e-06, "loss": 0.8028419, "num_input_tokens_seen": 111747675, "step": 5187, "time_per_iteration": 3.6458168029785156 }, { "auxiliary_loss_clip": 0.01075518, "auxiliary_loss_mlp": 0.01038805, "balance_loss_clip": 1.04007363, "balance_loss_mlp": 1.02229428, "epoch": 0.6238201166356039, "flos": 12674599954560.0, "grad_norm": 2.7088572973403355, "language_loss": 0.90530407, "learning_rate": 1.3095384436400237e-06, "loss": 0.92644727, "num_input_tokens_seen": 111759205, "step": 5188, "time_per_iteration": 2.687962532043457 }, { "auxiliary_loss_clip": 0.01122689, "auxiliary_loss_mlp": 0.01043481, "balance_loss_clip": 1.04670382, "balance_loss_mlp": 1.0269351, "epoch": 0.623940359526243, "flos": 10452160730880.0, "grad_norm": 2.0058514486323453, "language_loss": 0.82053125, "learning_rate": 1.3088074172527633e-06, "loss": 0.84219295, "num_input_tokens_seen": 111776335, "step": 5189, "time_per_iteration": 3.57544207572937 }, { "auxiliary_loss_clip": 0.01118554, "auxiliary_loss_mlp": 0.01035996, "balance_loss_clip": 1.04178011, "balance_loss_mlp": 1.01946139, "epoch": 0.6240606024168821, "flos": 29059525226880.0, "grad_norm": 3.945909912686173, "language_loss": 0.71731031, "learning_rate": 1.3080764957188684e-06, "loss": 0.73885572, "num_input_tokens_seen": 111796580, "step": 5190, "time_per_iteration": 2.681781053543091 }, { "auxiliary_loss_clip": 0.01088082, "auxiliary_loss_mlp": 0.01039136, "balance_loss_clip": 1.03898919, "balance_loss_mlp": 1.02223206, "epoch": 0.6241808453075212, "flos": 22018089450240.0, "grad_norm": 2.004821671559425, "language_loss": 0.70971966, "learning_rate": 1.3073456791492192e-06, "loss": 0.73099184, "num_input_tokens_seen": 111816290, "step": 5191, "time_per_iteration": 2.727843999862671 }, { "auxiliary_loss_clip": 0.0111528, "auxiliary_loss_mlp": 0.01045896, "balance_loss_clip": 1.04081869, "balance_loss_mlp": 1.02938533, "epoch": 0.6243010881981603, "flos": 21138708683520.0, "grad_norm": 1.8310746303882564, "language_loss": 0.78195685, "learning_rate": 1.3066149676546801e-06, "loss": 0.80356854, "num_input_tokens_seen": 111834470, "step": 5192, "time_per_iteration": 2.654707431793213 }, { "auxiliary_loss_clip": 0.01109318, "auxiliary_loss_mlp": 0.01042017, "balance_loss_clip": 1.04374361, "balance_loss_mlp": 1.02733076, "epoch": 0.6244213310887994, "flos": 22344948236160.0, "grad_norm": 1.9017762714149624, "language_loss": 0.66275793, "learning_rate": 1.3058843613460985e-06, "loss": 0.68427134, "num_input_tokens_seen": 111852410, "step": 5193, "time_per_iteration": 2.652968645095825 }, { "auxiliary_loss_clip": 0.01105208, "auxiliary_loss_mlp": 0.01035892, "balance_loss_clip": 1.04052842, "balance_loss_mlp": 1.01925063, "epoch": 0.6245415739794384, "flos": 15231978524160.0, "grad_norm": 1.8981056914053211, "language_loss": 0.74679607, "learning_rate": 1.3051538603343075e-06, "loss": 0.76820707, "num_input_tokens_seen": 111870340, "step": 5194, "time_per_iteration": 2.6582915782928467 }, { "auxiliary_loss_clip": 0.01127121, "auxiliary_loss_mlp": 0.01034964, "balance_loss_clip": 1.04754329, "balance_loss_mlp": 1.01954985, "epoch": 0.6246618168700776, "flos": 18879891960960.0, "grad_norm": 2.034305520023983, "language_loss": 0.67927992, "learning_rate": 1.3044234647301235e-06, "loss": 0.70090079, "num_input_tokens_seen": 111888365, "step": 5195, "time_per_iteration": 2.6129353046417236 }, { "auxiliary_loss_clip": 0.01119295, "auxiliary_loss_mlp": 0.01054466, "balance_loss_clip": 1.04141998, "balance_loss_mlp": 1.0387665, "epoch": 0.6247820597607167, "flos": 14319201087360.0, "grad_norm": 1.8020982624361792, "language_loss": 0.7263974, "learning_rate": 1.303693174644347e-06, "loss": 0.74813497, "num_input_tokens_seen": 111905840, "step": 5196, "time_per_iteration": 2.5933480262756348 }, { "auxiliary_loss_clip": 0.01108548, "auxiliary_loss_mlp": 0.01046612, "balance_loss_clip": 1.04051638, "balance_loss_mlp": 1.03042316, "epoch": 0.6249023026513557, "flos": 22637979388800.0, "grad_norm": 3.946102275067225, "language_loss": 0.80410624, "learning_rate": 1.3029629901877625e-06, "loss": 0.82565778, "num_input_tokens_seen": 111925215, "step": 5197, "time_per_iteration": 2.6797943115234375 }, { "auxiliary_loss_clip": 0.01132204, "auxiliary_loss_mlp": 0.01042599, "balance_loss_clip": 1.04580927, "balance_loss_mlp": 1.0271672, "epoch": 0.6250225455419949, "flos": 20266690204800.0, "grad_norm": 3.028181627079213, "language_loss": 0.77234513, "learning_rate": 1.3022329114711376e-06, "loss": 0.79409319, "num_input_tokens_seen": 111943925, "step": 5198, "time_per_iteration": 2.593273162841797 }, { "auxiliary_loss_clip": 0.01110943, "auxiliary_loss_mlp": 0.01048891, "balance_loss_clip": 1.04175305, "balance_loss_mlp": 1.03306043, "epoch": 0.6251427884326339, "flos": 23437853400960.0, "grad_norm": 1.9524016243143982, "language_loss": 0.69657165, "learning_rate": 1.3015029386052256e-06, "loss": 0.71816999, "num_input_tokens_seen": 111964095, "step": 5199, "time_per_iteration": 2.7066009044647217 }, { "auxiliary_loss_clip": 0.0110697, "auxiliary_loss_mlp": 0.01043291, "balance_loss_clip": 1.0419699, "balance_loss_mlp": 1.02684069, "epoch": 0.625263031323273, "flos": 31723055464320.0, "grad_norm": 2.0898194722166243, "language_loss": 0.73191255, "learning_rate": 1.3007730717007622e-06, "loss": 0.75341523, "num_input_tokens_seen": 111984910, "step": 5200, "time_per_iteration": 2.7156410217285156 }, { "auxiliary_loss_clip": 0.01144175, "auxiliary_loss_mlp": 0.0104112, "balance_loss_clip": 1.04875576, "balance_loss_mlp": 1.02402568, "epoch": 0.6253832742139122, "flos": 24134341092480.0, "grad_norm": 1.6456838234834414, "language_loss": 0.75782633, "learning_rate": 1.3000433108684676e-06, "loss": 0.7796793, "num_input_tokens_seen": 112005410, "step": 5201, "time_per_iteration": 2.592106819152832 }, { "auxiliary_loss_clip": 0.01121564, "auxiliary_loss_mlp": 0.01038924, "balance_loss_clip": 1.04320025, "balance_loss_mlp": 1.02323592, "epoch": 0.6255035171045512, "flos": 27668812400640.0, "grad_norm": 2.5560418187357996, "language_loss": 0.79659569, "learning_rate": 1.2993136562190467e-06, "loss": 0.81820059, "num_input_tokens_seen": 112024530, "step": 5202, "time_per_iteration": 2.6616852283477783 }, { "auxiliary_loss_clip": 0.01115249, "auxiliary_loss_mlp": 0.01048217, "balance_loss_clip": 1.04184341, "balance_loss_mlp": 1.03133678, "epoch": 0.6256237599951903, "flos": 20227798753920.0, "grad_norm": 1.5860928468049524, "language_loss": 0.70633239, "learning_rate": 1.2985841078631871e-06, "loss": 0.72796708, "num_input_tokens_seen": 112043850, "step": 5203, "time_per_iteration": 2.5916337966918945 }, { "auxiliary_loss_clip": 0.01073234, "auxiliary_loss_mlp": 0.01038483, "balance_loss_clip": 1.03603804, "balance_loss_mlp": 1.02092314, "epoch": 0.6257440028858293, "flos": 24170574936960.0, "grad_norm": 1.940997323775643, "language_loss": 0.78133404, "learning_rate": 1.2978546659115608e-06, "loss": 0.80245125, "num_input_tokens_seen": 112061930, "step": 5204, "time_per_iteration": 2.7498600482940674 }, { "auxiliary_loss_clip": 0.01116683, "auxiliary_loss_mlp": 0.01041398, "balance_loss_clip": 1.0427314, "balance_loss_mlp": 1.0251379, "epoch": 0.6258642457764685, "flos": 15851940289920.0, "grad_norm": 1.922987537779398, "language_loss": 0.85437608, "learning_rate": 1.2971253304748228e-06, "loss": 0.87595683, "num_input_tokens_seen": 112079645, "step": 5205, "time_per_iteration": 3.49120831489563 }, { "auxiliary_loss_clip": 0.01132959, "auxiliary_loss_mlp": 0.01046372, "balance_loss_clip": 1.04589403, "balance_loss_mlp": 1.02961731, "epoch": 0.6259844886671075, "flos": 11911354836480.0, "grad_norm": 1.7244930871860105, "language_loss": 0.74831188, "learning_rate": 1.296396101663614e-06, "loss": 0.77010512, "num_input_tokens_seen": 112096205, "step": 5206, "time_per_iteration": 2.5456666946411133 }, { "auxiliary_loss_clip": 0.01128162, "auxiliary_loss_mlp": 0.01045529, "balance_loss_clip": 1.04549384, "balance_loss_mlp": 1.02945948, "epoch": 0.6261047315577466, "flos": 15887958652800.0, "grad_norm": 2.1434837423650484, "language_loss": 0.84377652, "learning_rate": 1.2956669795885565e-06, "loss": 0.86551344, "num_input_tokens_seen": 112112835, "step": 5207, "time_per_iteration": 3.612670421600342 }, { "auxiliary_loss_clip": 0.01096271, "auxiliary_loss_mlp": 0.01048561, "balance_loss_clip": 1.04377031, "balance_loss_mlp": 1.0318594, "epoch": 0.6262249744483858, "flos": 31248926916480.0, "grad_norm": 1.8917335911026891, "language_loss": 0.68182623, "learning_rate": 1.294937964360259e-06, "loss": 0.70327455, "num_input_tokens_seen": 112133105, "step": 5208, "time_per_iteration": 2.7669172286987305 }, { "auxiliary_loss_clip": 0.01117206, "auxiliary_loss_mlp": 0.01048465, "balance_loss_clip": 1.04203248, "balance_loss_mlp": 1.03201365, "epoch": 0.6263452173390248, "flos": 27198598435200.0, "grad_norm": 2.1993364551728773, "language_loss": 0.71008837, "learning_rate": 1.2942090560893108e-06, "loss": 0.73174506, "num_input_tokens_seen": 112152510, "step": 5209, "time_per_iteration": 2.7070517539978027 }, { "auxiliary_loss_clip": 0.01134954, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.04398429, "balance_loss_mlp": 1.0221498, "epoch": 0.6264654602296639, "flos": 37342069683840.0, "grad_norm": 1.7498891315356002, "language_loss": 0.60340756, "learning_rate": 1.2934802548862882e-06, "loss": 0.62512815, "num_input_tokens_seen": 112175295, "step": 5210, "time_per_iteration": 2.709517002105713 }, { "auxiliary_loss_clip": 0.01110875, "auxiliary_loss_mlp": 0.01037412, "balance_loss_clip": 1.04058552, "balance_loss_mlp": 1.02143848, "epoch": 0.626585703120303, "flos": 14756952136320.0, "grad_norm": 1.8847712023052867, "language_loss": 0.82671207, "learning_rate": 1.292751560861749e-06, "loss": 0.84819496, "num_input_tokens_seen": 112190200, "step": 5211, "time_per_iteration": 2.577814817428589 }, { "auxiliary_loss_clip": 0.01142081, "auxiliary_loss_mlp": 0.0104165, "balance_loss_clip": 1.04603457, "balance_loss_mlp": 1.0244962, "epoch": 0.6267059460109421, "flos": 22347318533760.0, "grad_norm": 1.8200436406847327, "language_loss": 0.79563433, "learning_rate": 1.2920229741262354e-06, "loss": 0.81747168, "num_input_tokens_seen": 112208205, "step": 5212, "time_per_iteration": 2.582235097885132 }, { "auxiliary_loss_clip": 0.0111645, "auxiliary_loss_mlp": 0.0103721, "balance_loss_clip": 1.04368281, "balance_loss_mlp": 1.02106905, "epoch": 0.6268261889015811, "flos": 17748813617280.0, "grad_norm": 2.4300165009453645, "language_loss": 0.7537967, "learning_rate": 1.2912944947902739e-06, "loss": 0.77533334, "num_input_tokens_seen": 112224690, "step": 5213, "time_per_iteration": 3.6465368270874023 }, { "auxiliary_loss_clip": 0.01119689, "auxiliary_loss_mlp": 0.01038073, "balance_loss_clip": 1.04331458, "balance_loss_mlp": 1.02103758, "epoch": 0.6269464317922203, "flos": 32846484211200.0, "grad_norm": 2.0375893033374277, "language_loss": 0.71538484, "learning_rate": 1.2905661229643742e-06, "loss": 0.73696244, "num_input_tokens_seen": 112244450, "step": 5214, "time_per_iteration": 2.7052001953125 }, { "auxiliary_loss_clip": 0.01138443, "auxiliary_loss_mlp": 0.01043286, "balance_loss_clip": 1.0443958, "balance_loss_mlp": 1.02613199, "epoch": 0.6270666746828594, "flos": 17929192740480.0, "grad_norm": 2.1387127904590635, "language_loss": 0.83697903, "learning_rate": 1.2898378587590299e-06, "loss": 0.85879636, "num_input_tokens_seen": 112261050, "step": 5215, "time_per_iteration": 3.464637279510498 }, { "auxiliary_loss_clip": 0.01120207, "auxiliary_loss_mlp": 0.01036663, "balance_loss_clip": 1.04302025, "balance_loss_mlp": 1.02154768, "epoch": 0.6271869175734984, "flos": 17457326749440.0, "grad_norm": 1.7246577227705744, "language_loss": 0.87495518, "learning_rate": 1.2891097022847173e-06, "loss": 0.89652383, "num_input_tokens_seen": 112278395, "step": 5216, "time_per_iteration": 2.5920209884643555 }, { "auxiliary_loss_clip": 0.01120463, "auxiliary_loss_mlp": 0.01043569, "balance_loss_clip": 1.04539371, "balance_loss_mlp": 1.02643871, "epoch": 0.6273071604641376, "flos": 26868615166080.0, "grad_norm": 2.335676548626349, "language_loss": 0.67067724, "learning_rate": 1.2883816536518978e-06, "loss": 0.69231755, "num_input_tokens_seen": 112299535, "step": 5217, "time_per_iteration": 2.698643445968628 }, { "auxiliary_loss_clip": 0.01122536, "auxiliary_loss_mlp": 0.01045135, "balance_loss_clip": 1.04302335, "balance_loss_mlp": 1.02908933, "epoch": 0.6274274033547766, "flos": 26062384446720.0, "grad_norm": 1.990349729523523, "language_loss": 0.82212281, "learning_rate": 1.2876537129710155e-06, "loss": 0.84379947, "num_input_tokens_seen": 112317265, "step": 5218, "time_per_iteration": 2.629326105117798 }, { "auxiliary_loss_clip": 0.01108525, "auxiliary_loss_mlp": 0.01041826, "balance_loss_clip": 1.04470503, "balance_loss_mlp": 1.0244925, "epoch": 0.6275476462454157, "flos": 20266259241600.0, "grad_norm": 2.0191073649203237, "language_loss": 0.75469816, "learning_rate": 1.286925880352499e-06, "loss": 0.77620161, "num_input_tokens_seen": 112336125, "step": 5219, "time_per_iteration": 2.6325182914733887 }, { "auxiliary_loss_clip": 0.01107666, "auxiliary_loss_mlp": 0.01036015, "balance_loss_clip": 1.04162538, "balance_loss_mlp": 1.01974308, "epoch": 0.6276678891360549, "flos": 26320402817280.0, "grad_norm": 1.8153274357318567, "language_loss": 0.71709132, "learning_rate": 1.2861981559067592e-06, "loss": 0.73852813, "num_input_tokens_seen": 112356730, "step": 5220, "time_per_iteration": 2.7457709312438965 }, { "auxiliary_loss_clip": 0.01077978, "auxiliary_loss_mlp": 0.01040896, "balance_loss_clip": 1.03811133, "balance_loss_mlp": 1.02453446, "epoch": 0.6277881320266939, "flos": 13912512324480.0, "grad_norm": 4.123936097366629, "language_loss": 0.80498219, "learning_rate": 1.2854705397441917e-06, "loss": 0.82617092, "num_input_tokens_seen": 112372270, "step": 5221, "time_per_iteration": 2.6948275566101074 }, { "auxiliary_loss_clip": 0.01100301, "auxiliary_loss_mlp": 0.01038813, "balance_loss_clip": 1.04030502, "balance_loss_mlp": 1.02230263, "epoch": 0.627908374917333, "flos": 27048922462080.0, "grad_norm": 2.2122677566492643, "language_loss": 0.77558869, "learning_rate": 1.2847430319751747e-06, "loss": 0.79697984, "num_input_tokens_seen": 112390365, "step": 5222, "time_per_iteration": 2.721808671951294 }, { "auxiliary_loss_clip": 0.01120618, "auxiliary_loss_mlp": 0.01047395, "balance_loss_clip": 1.04417944, "balance_loss_mlp": 1.03229666, "epoch": 0.6280286178079721, "flos": 23769201386880.0, "grad_norm": 2.158839215522361, "language_loss": 0.6770891, "learning_rate": 1.2840156327100712e-06, "loss": 0.69876921, "num_input_tokens_seen": 112407490, "step": 5223, "time_per_iteration": 2.6818172931671143 }, { "auxiliary_loss_clip": 0.01138052, "auxiliary_loss_mlp": 0.01034899, "balance_loss_clip": 1.04767132, "balance_loss_mlp": 1.01956892, "epoch": 0.6281488606986112, "flos": 26359150613760.0, "grad_norm": 1.859338566228063, "language_loss": 0.72164524, "learning_rate": 1.2832883420592272e-06, "loss": 0.74337476, "num_input_tokens_seen": 112426385, "step": 5224, "time_per_iteration": 2.5808684825897217 }, { "auxiliary_loss_clip": 0.01112981, "auxiliary_loss_mlp": 0.01040526, "balance_loss_clip": 1.04375172, "balance_loss_mlp": 1.02456379, "epoch": 0.6282691035892503, "flos": 36137194848000.0, "grad_norm": 2.192636380413062, "language_loss": 0.64383352, "learning_rate": 1.282561160132972e-06, "loss": 0.66536856, "num_input_tokens_seen": 112446905, "step": 5225, "time_per_iteration": 2.753645658493042 }, { "auxiliary_loss_clip": 0.01120743, "auxiliary_loss_mlp": 0.01047442, "balance_loss_clip": 1.04242277, "balance_loss_mlp": 1.03082395, "epoch": 0.6283893464798894, "flos": 26537231266560.0, "grad_norm": 1.6811782073681356, "language_loss": 0.81001025, "learning_rate": 1.2818340870416186e-06, "loss": 0.8316921, "num_input_tokens_seen": 112468040, "step": 5226, "time_per_iteration": 2.6939847469329834 }, { "auxiliary_loss_clip": 0.01109852, "auxiliary_loss_mlp": 0.01037821, "balance_loss_clip": 1.04087353, "balance_loss_mlp": 1.02120352, "epoch": 0.6285095893705285, "flos": 22237216369920.0, "grad_norm": 2.038347388682996, "language_loss": 0.75728863, "learning_rate": 1.2811071228954626e-06, "loss": 0.77876532, "num_input_tokens_seen": 112486675, "step": 5227, "time_per_iteration": 2.660411834716797 }, { "auxiliary_loss_clip": 0.01113572, "auxiliary_loss_mlp": 0.01040346, "balance_loss_clip": 1.04403615, "balance_loss_mlp": 1.02492034, "epoch": 0.6286298322611675, "flos": 26542259170560.0, "grad_norm": 2.095268240457331, "language_loss": 0.8094489, "learning_rate": 1.2803802678047846e-06, "loss": 0.83098805, "num_input_tokens_seen": 112506825, "step": 5228, "time_per_iteration": 2.6749985218048096 }, { "auxiliary_loss_clip": 0.0112285, "auxiliary_loss_mlp": 0.01047792, "balance_loss_clip": 1.04713976, "balance_loss_mlp": 1.03044713, "epoch": 0.6287500751518067, "flos": 21795227516160.0, "grad_norm": 1.8152996032922626, "language_loss": 0.74121702, "learning_rate": 1.279653521879848e-06, "loss": 0.76292348, "num_input_tokens_seen": 112526890, "step": 5229, "time_per_iteration": 2.6493284702301025 }, { "auxiliary_loss_clip": 0.01053801, "auxiliary_loss_mlp": 0.01041, "balance_loss_clip": 1.0346601, "balance_loss_mlp": 1.02583623, "epoch": 0.6288703180424458, "flos": 20009605587840.0, "grad_norm": 2.0023659493677646, "language_loss": 0.83743495, "learning_rate": 1.2789268852308997e-06, "loss": 0.85838294, "num_input_tokens_seen": 112542100, "step": 5230, "time_per_iteration": 2.7058026790618896 }, { "auxiliary_loss_clip": 0.01120059, "auxiliary_loss_mlp": 0.01036396, "balance_loss_clip": 1.04457664, "balance_loss_mlp": 1.02070808, "epoch": 0.6289905609330848, "flos": 22124923476480.0, "grad_norm": 2.058600255917797, "language_loss": 0.70586848, "learning_rate": 1.2782003579681688e-06, "loss": 0.72743309, "num_input_tokens_seen": 112561630, "step": 5231, "time_per_iteration": 3.579319477081299 }, { "auxiliary_loss_clip": 0.01139207, "auxiliary_loss_mlp": 0.01042517, "balance_loss_clip": 1.0447979, "balance_loss_mlp": 1.02672219, "epoch": 0.629110803823724, "flos": 25518481729920.0, "grad_norm": 1.9815988151567772, "language_loss": 0.741023, "learning_rate": 1.2774739402018701e-06, "loss": 0.76284027, "num_input_tokens_seen": 112582465, "step": 5232, "time_per_iteration": 2.600430965423584 }, { "auxiliary_loss_clip": 0.0112465, "auxiliary_loss_mlp": 0.01045154, "balance_loss_clip": 1.04726124, "balance_loss_mlp": 1.02802384, "epoch": 0.629231046714363, "flos": 20886616056960.0, "grad_norm": 1.933681473398445, "language_loss": 0.73165166, "learning_rate": 1.2767476320422002e-06, "loss": 0.75334978, "num_input_tokens_seen": 112602390, "step": 5233, "time_per_iteration": 3.6259868144989014 }, { "auxiliary_loss_clip": 0.01023193, "auxiliary_loss_mlp": 0.0100611, "balance_loss_clip": 1.01952446, "balance_loss_mlp": 1.00447702, "epoch": 0.6293512896050021, "flos": 65050027908480.0, "grad_norm": 0.681526795923473, "language_loss": 0.57216585, "learning_rate": 1.2760214335993392e-06, "loss": 0.5924589, "num_input_tokens_seen": 112669035, "step": 5234, "time_per_iteration": 3.3287360668182373 }, { "auxiliary_loss_clip": 0.01116815, "auxiliary_loss_mlp": 0.010449, "balance_loss_clip": 1.04133701, "balance_loss_mlp": 1.02835345, "epoch": 0.6294715324956413, "flos": 34677857088000.0, "grad_norm": 1.8656204110380556, "language_loss": 0.58683038, "learning_rate": 1.2752953449834514e-06, "loss": 0.60844743, "num_input_tokens_seen": 112691485, "step": 5235, "time_per_iteration": 2.7428762912750244 }, { "auxiliary_loss_clip": 0.01136859, "auxiliary_loss_mlp": 0.01034044, "balance_loss_clip": 1.04491115, "balance_loss_mlp": 1.01959562, "epoch": 0.6295917753862803, "flos": 22784207656320.0, "grad_norm": 1.9398155212456587, "language_loss": 0.80665576, "learning_rate": 1.2745693663046836e-06, "loss": 0.82836473, "num_input_tokens_seen": 112710555, "step": 5236, "time_per_iteration": 2.5846974849700928 }, { "auxiliary_loss_clip": 0.01122084, "auxiliary_loss_mlp": 0.01037358, "balance_loss_clip": 1.04350805, "balance_loss_mlp": 1.02282095, "epoch": 0.6297120182769194, "flos": 20850454039680.0, "grad_norm": 2.2693828720464584, "language_loss": 0.80596364, "learning_rate": 1.2738434976731662e-06, "loss": 0.8275581, "num_input_tokens_seen": 112728740, "step": 5237, "time_per_iteration": 2.602992057800293 }, { "auxiliary_loss_clip": 0.01115235, "auxiliary_loss_mlp": 0.01042345, "balance_loss_clip": 1.0435195, "balance_loss_mlp": 1.02721095, "epoch": 0.6298322611675584, "flos": 19497662997120.0, "grad_norm": 1.601163702905886, "language_loss": 0.75306916, "learning_rate": 1.2731177391990125e-06, "loss": 0.77464497, "num_input_tokens_seen": 112748665, "step": 5238, "time_per_iteration": 2.631810426712036 }, { "auxiliary_loss_clip": 0.01114382, "auxiliary_loss_mlp": 0.01037062, "balance_loss_clip": 1.04140258, "balance_loss_mlp": 1.02019405, "epoch": 0.6299525040581976, "flos": 12604466649600.0, "grad_norm": 3.0360839717232455, "language_loss": 0.81694824, "learning_rate": 1.2723920909923203e-06, "loss": 0.83846271, "num_input_tokens_seen": 112764410, "step": 5239, "time_per_iteration": 3.5740952491760254 }, { "auxiliary_loss_clip": 0.01047792, "auxiliary_loss_mlp": 0.01005933, "balance_loss_clip": 1.01685333, "balance_loss_mlp": 1.00443685, "epoch": 0.6300727469488366, "flos": 57725685636480.0, "grad_norm": 0.8627000557643144, "language_loss": 0.60450077, "learning_rate": 1.2716665531631688e-06, "loss": 0.62503803, "num_input_tokens_seen": 112818695, "step": 5240, "time_per_iteration": 3.1120266914367676 }, { "auxiliary_loss_clip": 0.01130914, "auxiliary_loss_mlp": 0.0103896, "balance_loss_clip": 1.04501486, "balance_loss_mlp": 1.02221072, "epoch": 0.6301929898394757, "flos": 22527302607360.0, "grad_norm": 1.7205239662510605, "language_loss": 0.77086306, "learning_rate": 1.270941125821623e-06, "loss": 0.79256177, "num_input_tokens_seen": 112839120, "step": 5241, "time_per_iteration": 3.4683730602264404 }, { "auxiliary_loss_clip": 0.01119343, "auxiliary_loss_mlp": 0.01042053, "balance_loss_clip": 1.0402739, "balance_loss_mlp": 1.0268414, "epoch": 0.6303132327301149, "flos": 28293550675200.0, "grad_norm": 1.7259268661120657, "language_loss": 0.75593942, "learning_rate": 1.2702158090777278e-06, "loss": 0.77755344, "num_input_tokens_seen": 112860210, "step": 5242, "time_per_iteration": 2.6478068828582764 }, { "auxiliary_loss_clip": 0.0109386, "auxiliary_loss_mlp": 0.01056795, "balance_loss_clip": 1.03899288, "balance_loss_mlp": 1.03916395, "epoch": 0.6304334756207539, "flos": 25264521596160.0, "grad_norm": 3.1931642050338582, "language_loss": 0.74807161, "learning_rate": 1.2694906030415148e-06, "loss": 0.76957816, "num_input_tokens_seen": 112877955, "step": 5243, "time_per_iteration": 2.6909427642822266 }, { "auxiliary_loss_clip": 0.01119285, "auxiliary_loss_mlp": 0.01041787, "balance_loss_clip": 1.04120493, "balance_loss_mlp": 1.02433491, "epoch": 0.630553718511393, "flos": 18033548728320.0, "grad_norm": 5.77390003690187, "language_loss": 0.82383651, "learning_rate": 1.2687655078229958e-06, "loss": 0.84544718, "num_input_tokens_seen": 112892285, "step": 5244, "time_per_iteration": 2.588473081588745 }, { "auxiliary_loss_clip": 0.0111501, "auxiliary_loss_mlp": 0.01037531, "balance_loss_clip": 1.04445839, "balance_loss_mlp": 1.02116346, "epoch": 0.6306739614020321, "flos": 27304103658240.0, "grad_norm": 2.1458323189453314, "language_loss": 0.69336021, "learning_rate": 1.2680405235321678e-06, "loss": 0.71488559, "num_input_tokens_seen": 112913620, "step": 5245, "time_per_iteration": 2.6939737796783447 }, { "auxiliary_loss_clip": 0.01115066, "auxiliary_loss_mlp": 0.00773076, "balance_loss_clip": 1.04390669, "balance_loss_mlp": 1.00039685, "epoch": 0.6307942042926712, "flos": 15341434243200.0, "grad_norm": 1.9383959855959079, "language_loss": 0.78566146, "learning_rate": 1.267315650279011e-06, "loss": 0.8045429, "num_input_tokens_seen": 112932090, "step": 5246, "time_per_iteration": 2.6303486824035645 }, { "auxiliary_loss_clip": 0.01093695, "auxiliary_loss_mlp": 0.01035937, "balance_loss_clip": 1.03871596, "balance_loss_mlp": 1.01964152, "epoch": 0.6309144471833102, "flos": 19606400444160.0, "grad_norm": 2.391025980187531, "language_loss": 0.73997712, "learning_rate": 1.2665908881734874e-06, "loss": 0.7612735, "num_input_tokens_seen": 112950925, "step": 5247, "time_per_iteration": 2.678494930267334 }, { "auxiliary_loss_clip": 0.01126943, "auxiliary_loss_mlp": 0.01037313, "balance_loss_clip": 1.04576659, "balance_loss_mlp": 1.02119613, "epoch": 0.6310346900739494, "flos": 17493345112320.0, "grad_norm": 2.4755779068254893, "language_loss": 0.84905875, "learning_rate": 1.2658662373255432e-06, "loss": 0.87070131, "num_input_tokens_seen": 112969315, "step": 5248, "time_per_iteration": 2.5922343730926514 }, { "auxiliary_loss_clip": 0.01026427, "auxiliary_loss_mlp": 0.0100206, "balance_loss_clip": 1.01505804, "balance_loss_mlp": 1.00058138, "epoch": 0.6311549329645885, "flos": 55070164131840.0, "grad_norm": 0.7268990184399778, "language_loss": 0.52254182, "learning_rate": 1.2651416978451063e-06, "loss": 0.54282665, "num_input_tokens_seen": 113034700, "step": 5249, "time_per_iteration": 3.3226938247680664 }, { "auxiliary_loss_clip": 0.01144744, "auxiliary_loss_mlp": 0.01045993, "balance_loss_clip": 1.04767585, "balance_loss_mlp": 1.02885103, "epoch": 0.6312751758552275, "flos": 41902545075840.0, "grad_norm": 2.2351767017697775, "language_loss": 0.65171194, "learning_rate": 1.2644172698420903e-06, "loss": 0.67361927, "num_input_tokens_seen": 113056805, "step": 5250, "time_per_iteration": 2.735736131668091 }, { "auxiliary_loss_clip": 0.01101964, "auxiliary_loss_mlp": 0.01040086, "balance_loss_clip": 1.04134893, "balance_loss_mlp": 1.02393317, "epoch": 0.6313954187458667, "flos": 19646800266240.0, "grad_norm": 2.1875365441351793, "language_loss": 0.84821934, "learning_rate": 1.2636929534263892e-06, "loss": 0.86963981, "num_input_tokens_seen": 113075790, "step": 5251, "time_per_iteration": 2.645653247833252 }, { "auxiliary_loss_clip": 0.011041, "auxiliary_loss_mlp": 0.010406, "balance_loss_clip": 1.04040289, "balance_loss_mlp": 1.02403033, "epoch": 0.6315156616365057, "flos": 22894273906560.0, "grad_norm": 1.6987919717726059, "language_loss": 0.77600014, "learning_rate": 1.2629687487078821e-06, "loss": 0.7974472, "num_input_tokens_seen": 113094600, "step": 5252, "time_per_iteration": 2.6914799213409424 }, { "auxiliary_loss_clip": 0.01131781, "auxiliary_loss_mlp": 0.01039068, "balance_loss_clip": 1.04585433, "balance_loss_mlp": 1.02223539, "epoch": 0.6316359045271448, "flos": 23726251699200.0, "grad_norm": 2.0830513247107056, "language_loss": 0.76419818, "learning_rate": 1.2622446557964293e-06, "loss": 0.78590667, "num_input_tokens_seen": 113112605, "step": 5253, "time_per_iteration": 2.6985983848571777 }, { "auxiliary_loss_clip": 0.011118, "auxiliary_loss_mlp": 0.01039648, "balance_loss_clip": 1.0394125, "balance_loss_mlp": 1.02249348, "epoch": 0.631756147417784, "flos": 33108417164160.0, "grad_norm": 1.650331604964623, "language_loss": 0.71453923, "learning_rate": 1.261520674801876e-06, "loss": 0.73605371, "num_input_tokens_seen": 113133200, "step": 5254, "time_per_iteration": 2.71622633934021 }, { "auxiliary_loss_clip": 0.01113313, "auxiliary_loss_mlp": 0.01036099, "balance_loss_clip": 1.04513907, "balance_loss_mlp": 1.0197072, "epoch": 0.631876390308423, "flos": 31248424126080.0, "grad_norm": 3.127007041823102, "language_loss": 0.72657239, "learning_rate": 1.2607968058340488e-06, "loss": 0.74806654, "num_input_tokens_seen": 113152895, "step": 5255, "time_per_iteration": 2.7007052898406982 }, { "auxiliary_loss_clip": 0.01110304, "auxiliary_loss_mlp": 0.0104418, "balance_loss_clip": 1.04112124, "balance_loss_mlp": 1.02854013, "epoch": 0.6319966331990621, "flos": 24681152810880.0, "grad_norm": 1.746037641813806, "language_loss": 0.73249519, "learning_rate": 1.2600730490027583e-06, "loss": 0.75404, "num_input_tokens_seen": 113173135, "step": 5256, "time_per_iteration": 2.691152811050415 }, { "auxiliary_loss_clip": 0.01098404, "auxiliary_loss_mlp": 0.01037746, "balance_loss_clip": 1.0406878, "balance_loss_mlp": 1.02193904, "epoch": 0.6321168760897012, "flos": 17491764913920.0, "grad_norm": 1.6702335749098896, "language_loss": 0.80027604, "learning_rate": 1.2593494044177984e-06, "loss": 0.82163763, "num_input_tokens_seen": 113191440, "step": 5257, "time_per_iteration": 3.6548337936401367 }, { "auxiliary_loss_clip": 0.01140973, "auxiliary_loss_mlp": 0.0104005, "balance_loss_clip": 1.04369724, "balance_loss_mlp": 1.02270508, "epoch": 0.6322371189803403, "flos": 18295373940480.0, "grad_norm": 3.1836022162181346, "language_loss": 0.80538929, "learning_rate": 1.2586258721889448e-06, "loss": 0.82719946, "num_input_tokens_seen": 113208790, "step": 5258, "time_per_iteration": 2.530245542526245 }, { "auxiliary_loss_clip": 0.0108061, "auxiliary_loss_mlp": 0.01047015, "balance_loss_clip": 1.04038095, "balance_loss_mlp": 1.02951539, "epoch": 0.6323573618709794, "flos": 20157270399360.0, "grad_norm": 1.8698395958546314, "language_loss": 0.81994432, "learning_rate": 1.2579024524259573e-06, "loss": 0.84122062, "num_input_tokens_seen": 113225050, "step": 5259, "time_per_iteration": 2.68249249458313 }, { "auxiliary_loss_clip": 0.01109249, "auxiliary_loss_mlp": 0.0103746, "balance_loss_clip": 1.03936458, "balance_loss_mlp": 1.01981711, "epoch": 0.6324776047616185, "flos": 20042391726720.0, "grad_norm": 1.7817460771891138, "language_loss": 0.91347718, "learning_rate": 1.2571791452385768e-06, "loss": 0.93494427, "num_input_tokens_seen": 113242315, "step": 5260, "time_per_iteration": 3.592184543609619 }, { "auxiliary_loss_clip": 0.01112028, "auxiliary_loss_mlp": 0.01037276, "balance_loss_clip": 1.04371786, "balance_loss_mlp": 1.02155185, "epoch": 0.6325978476522576, "flos": 30848235724800.0, "grad_norm": 1.8257072724099808, "language_loss": 0.77141428, "learning_rate": 1.2564559507365301e-06, "loss": 0.7929073, "num_input_tokens_seen": 113264720, "step": 5261, "time_per_iteration": 2.715902328491211 }, { "auxiliary_loss_clip": 0.01113236, "auxiliary_loss_mlp": 0.01036612, "balance_loss_clip": 1.04237175, "balance_loss_mlp": 1.01958895, "epoch": 0.6327180905428966, "flos": 24535104111360.0, "grad_norm": 2.3096316553460783, "language_loss": 0.79000002, "learning_rate": 1.2557328690295244e-06, "loss": 0.81149858, "num_input_tokens_seen": 113282910, "step": 5262, "time_per_iteration": 2.6362802982330322 }, { "auxiliary_loss_clip": 0.01102004, "auxiliary_loss_mlp": 0.01044182, "balance_loss_clip": 1.04113221, "balance_loss_mlp": 1.02904224, "epoch": 0.6328383334335358, "flos": 21575274583680.0, "grad_norm": 1.8612974677453686, "language_loss": 0.76205218, "learning_rate": 1.255009900227251e-06, "loss": 0.78351408, "num_input_tokens_seen": 113301935, "step": 5263, "time_per_iteration": 2.6733193397521973 }, { "auxiliary_loss_clip": 0.01136347, "auxiliary_loss_mlp": 0.0103533, "balance_loss_clip": 1.04687512, "balance_loss_mlp": 1.01978564, "epoch": 0.6329585763241748, "flos": 22929861306240.0, "grad_norm": 1.8552779435562508, "language_loss": 0.79443872, "learning_rate": 1.254287044439383e-06, "loss": 0.81615555, "num_input_tokens_seen": 113321540, "step": 5264, "time_per_iteration": 3.489171266555786 }, { "auxiliary_loss_clip": 0.010439, "auxiliary_loss_mlp": 0.01000732, "balance_loss_clip": 1.01327729, "balance_loss_mlp": 0.99930173, "epoch": 0.6330788192148139, "flos": 70936897847040.0, "grad_norm": 0.8057742662468466, "language_loss": 0.54383564, "learning_rate": 1.2535643017755776e-06, "loss": 0.56428194, "num_input_tokens_seen": 113383730, "step": 5265, "time_per_iteration": 3.25018310546875 }, { "auxiliary_loss_clip": 0.01101882, "auxiliary_loss_mlp": 0.01038584, "balance_loss_clip": 1.04137039, "balance_loss_mlp": 1.02303934, "epoch": 0.6331990621054531, "flos": 21244501215360.0, "grad_norm": 2.424935947534231, "language_loss": 0.7217108, "learning_rate": 1.2528416723454737e-06, "loss": 0.74311543, "num_input_tokens_seen": 113400400, "step": 5266, "time_per_iteration": 2.606194019317627 }, { "auxiliary_loss_clip": 0.01132895, "auxiliary_loss_mlp": 0.01040317, "balance_loss_clip": 1.04499459, "balance_loss_mlp": 1.02549899, "epoch": 0.6333193049960921, "flos": 34459412526720.0, "grad_norm": 1.6885776024841297, "language_loss": 0.71257323, "learning_rate": 1.2521191562586945e-06, "loss": 0.73430538, "num_input_tokens_seen": 113424050, "step": 5267, "time_per_iteration": 3.597163200378418 }, { "auxiliary_loss_clip": 0.01137443, "auxiliary_loss_mlp": 0.00772078, "balance_loss_clip": 1.04569101, "balance_loss_mlp": 1.00038064, "epoch": 0.6334395478867312, "flos": 18329883932160.0, "grad_norm": 2.010693563621386, "language_loss": 0.76468146, "learning_rate": 1.2513967536248445e-06, "loss": 0.78377664, "num_input_tokens_seen": 113440370, "step": 5268, "time_per_iteration": 2.5486321449279785 }, { "auxiliary_loss_clip": 0.01119734, "auxiliary_loss_mlp": 0.01040391, "balance_loss_clip": 1.04228747, "balance_loss_mlp": 1.02415419, "epoch": 0.6335597907773702, "flos": 23623152687360.0, "grad_norm": 2.0371459372465885, "language_loss": 0.8106519, "learning_rate": 1.2506744645535117e-06, "loss": 0.8322531, "num_input_tokens_seen": 113460800, "step": 5269, "time_per_iteration": 2.591064929962158 }, { "auxiliary_loss_clip": 0.01103731, "auxiliary_loss_mlp": 0.01047485, "balance_loss_clip": 1.03816998, "balance_loss_mlp": 1.02919793, "epoch": 0.6336800336680094, "flos": 22710913954560.0, "grad_norm": 4.422414556043852, "language_loss": 0.60431612, "learning_rate": 1.249952289154267e-06, "loss": 0.62582827, "num_input_tokens_seen": 113480840, "step": 5270, "time_per_iteration": 2.666240930557251 }, { "auxiliary_loss_clip": 0.01063401, "auxiliary_loss_mlp": 0.0105027, "balance_loss_clip": 1.03669298, "balance_loss_mlp": 1.03368807, "epoch": 0.6338002765586485, "flos": 23622757637760.0, "grad_norm": 2.119994644419572, "language_loss": 0.76449502, "learning_rate": 1.2492302275366635e-06, "loss": 0.78563178, "num_input_tokens_seen": 113500515, "step": 5271, "time_per_iteration": 2.7400310039520264 }, { "auxiliary_loss_clip": 0.01122501, "auxiliary_loss_mlp": 0.01038476, "balance_loss_clip": 1.0437789, "balance_loss_mlp": 1.02126241, "epoch": 0.6339205194492875, "flos": 26505450708480.0, "grad_norm": 2.33716274199518, "language_loss": 0.65079057, "learning_rate": 1.2485082798102377e-06, "loss": 0.67240036, "num_input_tokens_seen": 113520930, "step": 5272, "time_per_iteration": 2.634413480758667 }, { "auxiliary_loss_clip": 0.01105309, "auxiliary_loss_mlp": 0.01042899, "balance_loss_clip": 1.0424521, "balance_loss_mlp": 1.02607894, "epoch": 0.6340407623399267, "flos": 18544306170240.0, "grad_norm": 2.102979725337797, "language_loss": 0.68361628, "learning_rate": 1.2477864460845084e-06, "loss": 0.70509839, "num_input_tokens_seen": 113537330, "step": 5273, "time_per_iteration": 2.665025234222412 }, { "auxiliary_loss_clip": 0.01112921, "auxiliary_loss_mlp": 0.01040826, "balance_loss_clip": 1.04042649, "balance_loss_mlp": 1.02226472, "epoch": 0.6341610052305657, "flos": 17712579772800.0, "grad_norm": 2.8498716568003295, "language_loss": 0.73354775, "learning_rate": 1.2470647264689776e-06, "loss": 0.75508523, "num_input_tokens_seen": 113555810, "step": 5274, "time_per_iteration": 2.624173164367676 }, { "auxiliary_loss_clip": 0.0108177, "auxiliary_loss_mlp": 0.01037121, "balance_loss_clip": 1.03854561, "balance_loss_mlp": 1.02058625, "epoch": 0.6342812481212048, "flos": 23587026583680.0, "grad_norm": 2.096123655609714, "language_loss": 0.71253753, "learning_rate": 1.2463431210731282e-06, "loss": 0.73372638, "num_input_tokens_seen": 113575395, "step": 5275, "time_per_iteration": 2.772526979446411 }, { "auxiliary_loss_clip": 0.01094468, "auxiliary_loss_mlp": 0.0104083, "balance_loss_clip": 1.03899419, "balance_loss_mlp": 1.023664, "epoch": 0.634401491011844, "flos": 17821927751040.0, "grad_norm": 2.1700651609679285, "language_loss": 0.76485205, "learning_rate": 1.2456216300064289e-06, "loss": 0.78620499, "num_input_tokens_seen": 113592945, "step": 5276, "time_per_iteration": 2.695521593093872 }, { "auxiliary_loss_clip": 0.01105931, "auxiliary_loss_mlp": 0.01033103, "balance_loss_clip": 1.04016232, "balance_loss_mlp": 1.01586509, "epoch": 0.634521733902483, "flos": 21358158825600.0, "grad_norm": 1.8642854616946944, "language_loss": 0.78196704, "learning_rate": 1.244900253378328e-06, "loss": 0.80335736, "num_input_tokens_seen": 113613000, "step": 5277, "time_per_iteration": 2.6916775703430176 }, { "auxiliary_loss_clip": 0.01049158, "auxiliary_loss_mlp": 0.0103656, "balance_loss_clip": 1.03561926, "balance_loss_mlp": 1.0214206, "epoch": 0.6346419767931221, "flos": 16545052103040.0, "grad_norm": 1.9058130355350937, "language_loss": 0.6894601, "learning_rate": 1.2441789912982583e-06, "loss": 0.71031725, "num_input_tokens_seen": 113630085, "step": 5278, "time_per_iteration": 2.855834722518921 }, { "auxiliary_loss_clip": 0.01131487, "auxiliary_loss_mlp": 0.01043631, "balance_loss_clip": 1.04674637, "balance_loss_mlp": 1.02638137, "epoch": 0.6347622196837612, "flos": 24350989973760.0, "grad_norm": 3.0070430280725318, "language_loss": 0.6466341, "learning_rate": 1.2434578438756346e-06, "loss": 0.66838533, "num_input_tokens_seen": 113650515, "step": 5279, "time_per_iteration": 2.916215419769287 }, { "auxiliary_loss_clip": 0.01124558, "auxiliary_loss_mlp": 0.01034911, "balance_loss_clip": 1.04239023, "balance_loss_mlp": 1.01946175, "epoch": 0.6348824625744003, "flos": 64523178195840.0, "grad_norm": 2.3813236866862684, "language_loss": 0.78177595, "learning_rate": 1.242736811219855e-06, "loss": 0.80337065, "num_input_tokens_seen": 113676475, "step": 5280, "time_per_iteration": 3.038888692855835 }, { "auxiliary_loss_clip": 0.01119323, "auxiliary_loss_mlp": 0.01039941, "balance_loss_clip": 1.04263091, "balance_loss_mlp": 1.02273929, "epoch": 0.6350027054650393, "flos": 28622133313920.0, "grad_norm": 2.161534289552255, "language_loss": 0.82090735, "learning_rate": 1.2420158934402988e-06, "loss": 0.84249997, "num_input_tokens_seen": 113697090, "step": 5281, "time_per_iteration": 2.672656774520874 }, { "auxiliary_loss_clip": 0.01087098, "auxiliary_loss_mlp": 0.01051051, "balance_loss_clip": 1.03680885, "balance_loss_mlp": 1.03293157, "epoch": 0.6351229483556785, "flos": 23002544476800.0, "grad_norm": 2.1606651405687467, "language_loss": 0.84887052, "learning_rate": 1.2412950906463286e-06, "loss": 0.87025201, "num_input_tokens_seen": 113714395, "step": 5282, "time_per_iteration": 2.6891918182373047 }, { "auxiliary_loss_clip": 0.01087288, "auxiliary_loss_mlp": 0.01033666, "balance_loss_clip": 1.03966272, "balance_loss_mlp": 1.01846123, "epoch": 0.6352431912463176, "flos": 21939300967680.0, "grad_norm": 2.626363096456449, "language_loss": 0.89911968, "learning_rate": 1.2405744029472902e-06, "loss": 0.92032921, "num_input_tokens_seen": 113733880, "step": 5283, "time_per_iteration": 3.6645500659942627 }, { "auxiliary_loss_clip": 0.0111061, "auxiliary_loss_mlp": 0.01043027, "balance_loss_clip": 1.04124951, "balance_loss_mlp": 1.02735078, "epoch": 0.6353634341369566, "flos": 13735257684480.0, "grad_norm": 2.1705403248017445, "language_loss": 0.7629683, "learning_rate": 1.2398538304525108e-06, "loss": 0.78450471, "num_input_tokens_seen": 113752505, "step": 5284, "time_per_iteration": 2.6259241104125977 }, { "auxiliary_loss_clip": 0.01095363, "auxiliary_loss_mlp": 0.01056289, "balance_loss_clip": 1.04131806, "balance_loss_mlp": 1.03725147, "epoch": 0.6354836770275958, "flos": 19316170552320.0, "grad_norm": 2.776114753053492, "language_loss": 0.75679034, "learning_rate": 1.2391333732713016e-06, "loss": 0.77830684, "num_input_tokens_seen": 113770310, "step": 5285, "time_per_iteration": 3.581106662750244 }, { "auxiliary_loss_clip": 0.0109823, "auxiliary_loss_mlp": 0.01047328, "balance_loss_clip": 1.04039311, "balance_loss_mlp": 1.03016198, "epoch": 0.6356039199182348, "flos": 21613375935360.0, "grad_norm": 2.950520075405681, "language_loss": 0.78687418, "learning_rate": 1.2384130315129543e-06, "loss": 0.80832982, "num_input_tokens_seen": 113788635, "step": 5286, "time_per_iteration": 2.6560637950897217 }, { "auxiliary_loss_clip": 0.01053202, "auxiliary_loss_mlp": 0.01043541, "balance_loss_clip": 1.0359776, "balance_loss_mlp": 1.0253973, "epoch": 0.6357241628088739, "flos": 18111978074880.0, "grad_norm": 16.024838487394902, "language_loss": 0.73271143, "learning_rate": 1.2376928052867447e-06, "loss": 0.75367892, "num_input_tokens_seen": 113807755, "step": 5287, "time_per_iteration": 3.064030170440674 }, { "auxiliary_loss_clip": 0.01114873, "auxiliary_loss_mlp": 0.01039763, "balance_loss_clip": 1.0443269, "balance_loss_mlp": 1.02442682, "epoch": 0.6358444056995131, "flos": 24935256599040.0, "grad_norm": 2.167710572576348, "language_loss": 0.78004473, "learning_rate": 1.2369726947019299e-06, "loss": 0.8015911, "num_input_tokens_seen": 113828230, "step": 5288, "time_per_iteration": 3.625706195831299 }, { "auxiliary_loss_clip": 0.01126282, "auxiliary_loss_mlp": 0.01033654, "balance_loss_clip": 1.04579425, "balance_loss_mlp": 1.0189321, "epoch": 0.6359646485901521, "flos": 23293348986240.0, "grad_norm": 2.6057627802667676, "language_loss": 0.67481393, "learning_rate": 1.2362526998677511e-06, "loss": 0.69641328, "num_input_tokens_seen": 113844595, "step": 5289, "time_per_iteration": 2.6371428966522217 }, { "auxiliary_loss_clip": 0.01113065, "auxiliary_loss_mlp": 0.01041767, "balance_loss_clip": 1.04243433, "balance_loss_mlp": 1.02580488, "epoch": 0.6360848914807912, "flos": 20887442069760.0, "grad_norm": 3.329150407568459, "language_loss": 0.8439728, "learning_rate": 1.2355328208934301e-06, "loss": 0.86552113, "num_input_tokens_seen": 113863470, "step": 5290, "time_per_iteration": 2.6486878395080566 }, { "auxiliary_loss_clip": 0.01125184, "auxiliary_loss_mlp": 0.00772432, "balance_loss_clip": 1.0410763, "balance_loss_mlp": 1.00043988, "epoch": 0.6362051343714303, "flos": 18479775386880.0, "grad_norm": 1.8558269227643716, "language_loss": 0.72220218, "learning_rate": 1.2348130578881728e-06, "loss": 0.74117839, "num_input_tokens_seen": 113881690, "step": 5291, "time_per_iteration": 3.9391415119171143 }, { "auxiliary_loss_clip": 0.01138908, "auxiliary_loss_mlp": 0.01038824, "balance_loss_clip": 1.04377639, "balance_loss_mlp": 1.02185488, "epoch": 0.6363253772620694, "flos": 24389594115840.0, "grad_norm": 2.8898835476810723, "language_loss": 0.7639966, "learning_rate": 1.2340934109611664e-06, "loss": 0.78577387, "num_input_tokens_seen": 113902450, "step": 5292, "time_per_iteration": 3.5127763748168945 }, { "auxiliary_loss_clip": 0.01120495, "auxiliary_loss_mlp": 0.01042791, "balance_loss_clip": 1.04506361, "balance_loss_mlp": 1.02417088, "epoch": 0.6364456201527084, "flos": 25958243940480.0, "grad_norm": 2.2963755784205837, "language_loss": 0.68717402, "learning_rate": 1.2333738802215798e-06, "loss": 0.70880693, "num_input_tokens_seen": 113922670, "step": 5293, "time_per_iteration": 2.665973424911499 }, { "auxiliary_loss_clip": 0.0108273, "auxiliary_loss_mlp": 0.01036587, "balance_loss_clip": 1.03929019, "balance_loss_mlp": 1.02064896, "epoch": 0.6365658630433476, "flos": 20740711011840.0, "grad_norm": 1.85426672652407, "language_loss": 0.80864829, "learning_rate": 1.2326544657785668e-06, "loss": 0.82984143, "num_input_tokens_seen": 113942360, "step": 5294, "time_per_iteration": 2.7253403663635254 }, { "auxiliary_loss_clip": 0.01091224, "auxiliary_loss_mlp": 0.01046905, "balance_loss_clip": 1.04035783, "balance_loss_mlp": 1.02805805, "epoch": 0.6366861059339867, "flos": 21434146047360.0, "grad_norm": 2.6950596891256153, "language_loss": 0.74268317, "learning_rate": 1.2319351677412608e-06, "loss": 0.76406443, "num_input_tokens_seen": 113959405, "step": 5295, "time_per_iteration": 2.665539264678955 }, { "auxiliary_loss_clip": 0.01110028, "auxiliary_loss_mlp": 0.01039128, "balance_loss_clip": 1.04428256, "balance_loss_mlp": 1.02160406, "epoch": 0.6368063488246257, "flos": 22267093507200.0, "grad_norm": 2.0723784307095676, "language_loss": 0.74348694, "learning_rate": 1.2312159862187796e-06, "loss": 0.76497853, "num_input_tokens_seen": 113977815, "step": 5296, "time_per_iteration": 2.679022789001465 }, { "auxiliary_loss_clip": 0.01140795, "auxiliary_loss_mlp": 0.01038078, "balance_loss_clip": 1.04638529, "balance_loss_mlp": 1.02184176, "epoch": 0.6369265917152649, "flos": 22420719976320.0, "grad_norm": 6.29357887299589, "language_loss": 0.76178896, "learning_rate": 1.2304969213202217e-06, "loss": 0.78357768, "num_input_tokens_seen": 113999075, "step": 5297, "time_per_iteration": 2.587160110473633 }, { "auxiliary_loss_clip": 0.01109643, "auxiliary_loss_mlp": 0.01040496, "balance_loss_clip": 1.04270339, "balance_loss_mlp": 1.02524912, "epoch": 0.6370468346059039, "flos": 24718176754560.0, "grad_norm": 2.6769037654089, "language_loss": 0.79264098, "learning_rate": 1.2297779731546692e-06, "loss": 0.81414235, "num_input_tokens_seen": 114018170, "step": 5298, "time_per_iteration": 2.6799662113189697 }, { "auxiliary_loss_clip": 0.01113664, "auxiliary_loss_mlp": 0.01040764, "balance_loss_clip": 1.04446149, "balance_loss_mlp": 1.02469444, "epoch": 0.637167077496543, "flos": 25296589463040.0, "grad_norm": 2.1438643349567603, "language_loss": 0.78234005, "learning_rate": 1.2290591418311853e-06, "loss": 0.80388433, "num_input_tokens_seen": 114035565, "step": 5299, "time_per_iteration": 2.6400742530822754 }, { "auxiliary_loss_clip": 0.01123853, "auxiliary_loss_mlp": 0.01040406, "balance_loss_clip": 1.04511571, "balance_loss_mlp": 1.02428865, "epoch": 0.637287320387182, "flos": 27671110871040.0, "grad_norm": 1.6387320803329657, "language_loss": 0.72407466, "learning_rate": 1.2283404274588172e-06, "loss": 0.74571729, "num_input_tokens_seen": 114054510, "step": 5300, "time_per_iteration": 2.6310956478118896 }, { "auxiliary_loss_clip": 0.00978898, "auxiliary_loss_mlp": 0.01004516, "balance_loss_clip": 1.01241422, "balance_loss_mlp": 1.00269246, "epoch": 0.6374075632778212, "flos": 63173406873600.0, "grad_norm": 0.743788571868164, "language_loss": 0.52813053, "learning_rate": 1.227621830146592e-06, "loss": 0.54796469, "num_input_tokens_seen": 114109875, "step": 5301, "time_per_iteration": 3.2599759101867676 }, { "auxiliary_loss_clip": 0.01103055, "auxiliary_loss_mlp": 0.01040217, "balance_loss_clip": 1.04308248, "balance_loss_mlp": 1.02327693, "epoch": 0.6375278061684603, "flos": 25558127366400.0, "grad_norm": 1.7088089513734896, "language_loss": 0.79117215, "learning_rate": 1.2269033500035217e-06, "loss": 0.8126049, "num_input_tokens_seen": 114130010, "step": 5302, "time_per_iteration": 2.880465269088745 }, { "auxiliary_loss_clip": 0.0110137, "auxiliary_loss_mlp": 0.01036612, "balance_loss_clip": 1.0434866, "balance_loss_mlp": 1.02138901, "epoch": 0.6376480490590993, "flos": 25666362023040.0, "grad_norm": 2.0889837785618677, "language_loss": 0.73493814, "learning_rate": 1.2261849871385988e-06, "loss": 0.75631797, "num_input_tokens_seen": 114151115, "step": 5303, "time_per_iteration": 2.6912708282470703 }, { "auxiliary_loss_clip": 0.011386, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.04442549, "balance_loss_mlp": 1.02646852, "epoch": 0.6377682919497385, "flos": 31537684350720.0, "grad_norm": 2.021444750156723, "language_loss": 0.62293416, "learning_rate": 1.2254667416607972e-06, "loss": 0.64476305, "num_input_tokens_seen": 114172715, "step": 5304, "time_per_iteration": 2.6539878845214844 }, { "auxiliary_loss_clip": 0.01128185, "auxiliary_loss_mlp": 0.01043836, "balance_loss_clip": 1.04607391, "balance_loss_mlp": 1.02751589, "epoch": 0.6378885348403776, "flos": 23039209284480.0, "grad_norm": 1.8990960698288704, "language_loss": 0.82701212, "learning_rate": 1.2247486136790756e-06, "loss": 0.84873229, "num_input_tokens_seen": 114192195, "step": 5305, "time_per_iteration": 2.6292271614074707 }, { "auxiliary_loss_clip": 0.01128582, "auxiliary_loss_mlp": 0.01045343, "balance_loss_clip": 1.04667795, "balance_loss_mlp": 1.02976215, "epoch": 0.6380087777310166, "flos": 18697070712960.0, "grad_norm": 2.6199711538695993, "language_loss": 0.80668855, "learning_rate": 1.2240306033023726e-06, "loss": 0.82842779, "num_input_tokens_seen": 114210020, "step": 5306, "time_per_iteration": 2.6019680500030518 }, { "auxiliary_loss_clip": 0.01097986, "auxiliary_loss_mlp": 0.01038594, "balance_loss_clip": 1.03668737, "balance_loss_mlp": 1.0225246, "epoch": 0.6381290206216558, "flos": 23331558078720.0, "grad_norm": 4.0075500282431875, "language_loss": 0.72174966, "learning_rate": 1.223312710639611e-06, "loss": 0.74311543, "num_input_tokens_seen": 114228740, "step": 5307, "time_per_iteration": 2.690885543823242 }, { "auxiliary_loss_clip": 0.01115931, "auxiliary_loss_mlp": 0.01045613, "balance_loss_clip": 1.04473948, "balance_loss_mlp": 1.02773166, "epoch": 0.6382492635122948, "flos": 18880466578560.0, "grad_norm": 2.0668206869585464, "language_loss": 0.86921203, "learning_rate": 1.2225949357996928e-06, "loss": 0.89082754, "num_input_tokens_seen": 114246865, "step": 5308, "time_per_iteration": 2.6011838912963867 }, { "auxiliary_loss_clip": 0.01120658, "auxiliary_loss_mlp": 0.0103255, "balance_loss_clip": 1.04400527, "balance_loss_mlp": 1.01739907, "epoch": 0.6383695064029339, "flos": 27819134818560.0, "grad_norm": 1.7257988772356356, "language_loss": 0.80284965, "learning_rate": 1.221877278891505e-06, "loss": 0.82438171, "num_input_tokens_seen": 114266120, "step": 5309, "time_per_iteration": 3.63928484916687 }, { "auxiliary_loss_clip": 0.01134072, "auxiliary_loss_mlp": 0.01041169, "balance_loss_clip": 1.04568136, "balance_loss_mlp": 1.02314448, "epoch": 0.638489749293573, "flos": 26395635853440.0, "grad_norm": 2.986037686801646, "language_loss": 0.71286261, "learning_rate": 1.221159740023915e-06, "loss": 0.73461503, "num_input_tokens_seen": 114285950, "step": 5310, "time_per_iteration": 2.6471831798553467 }, { "auxiliary_loss_clip": 0.01111804, "auxiliary_loss_mlp": 0.00773704, "balance_loss_clip": 1.04273427, "balance_loss_mlp": 1.00039279, "epoch": 0.6386099921842121, "flos": 23988328306560.0, "grad_norm": 2.051288965773832, "language_loss": 0.72190464, "learning_rate": 1.2204423193057735e-06, "loss": 0.74075973, "num_input_tokens_seen": 114304780, "step": 5311, "time_per_iteration": 3.6738481521606445 }, { "auxiliary_loss_clip": 0.01026449, "auxiliary_loss_mlp": 0.01002036, "balance_loss_clip": 1.014727, "balance_loss_mlp": 1.00041437, "epoch": 0.6387302350748512, "flos": 71731169337600.0, "grad_norm": 0.8504952308447036, "language_loss": 0.63237101, "learning_rate": 1.2197250168459122e-06, "loss": 0.65265584, "num_input_tokens_seen": 114361180, "step": 5312, "time_per_iteration": 3.2551817893981934 }, { "auxiliary_loss_clip": 0.01123559, "auxiliary_loss_mlp": 0.01041773, "balance_loss_clip": 1.04283929, "balance_loss_mlp": 1.0255847, "epoch": 0.6388504779654903, "flos": 14535778141440.0, "grad_norm": 2.226615283763116, "language_loss": 0.74453521, "learning_rate": 1.2190078327531454e-06, "loss": 0.7661885, "num_input_tokens_seen": 114377425, "step": 5313, "time_per_iteration": 2.5844173431396484 }, { "auxiliary_loss_clip": 0.01129763, "auxiliary_loss_mlp": 0.01037424, "balance_loss_clip": 1.04646564, "balance_loss_mlp": 1.02124751, "epoch": 0.6389707208561294, "flos": 22346133384960.0, "grad_norm": 1.4715866592334477, "language_loss": 0.7290433, "learning_rate": 1.2182907671362697e-06, "loss": 0.75071514, "num_input_tokens_seen": 114398120, "step": 5314, "time_per_iteration": 2.5595602989196777 }, { "auxiliary_loss_clip": 0.01128319, "auxiliary_loss_mlp": 0.01040928, "balance_loss_clip": 1.04583764, "balance_loss_mlp": 1.02448916, "epoch": 0.6390909637467684, "flos": 19426883247360.0, "grad_norm": 2.9206140803944773, "language_loss": 0.7874105, "learning_rate": 1.2175738201040626e-06, "loss": 0.80910301, "num_input_tokens_seen": 114415160, "step": 5315, "time_per_iteration": 2.584430456161499 }, { "auxiliary_loss_clip": 0.01129016, "auxiliary_loss_mlp": 0.01048109, "balance_loss_clip": 1.0451678, "balance_loss_mlp": 1.03068054, "epoch": 0.6392112066374076, "flos": 24090852700800.0, "grad_norm": 2.385456243917893, "language_loss": 0.78269625, "learning_rate": 1.2168569917652855e-06, "loss": 0.80446756, "num_input_tokens_seen": 114435015, "step": 5316, "time_per_iteration": 2.6544950008392334 }, { "auxiliary_loss_clip": 0.01126834, "auxiliary_loss_mlp": 0.01041771, "balance_loss_clip": 1.04443419, "balance_loss_mlp": 1.02548718, "epoch": 0.6393314495280467, "flos": 26795141896320.0, "grad_norm": 1.6971392979649516, "language_loss": 0.63894171, "learning_rate": 1.2161402822286797e-06, "loss": 0.66062772, "num_input_tokens_seen": 114455700, "step": 5317, "time_per_iteration": 3.559821367263794 }, { "auxiliary_loss_clip": 0.01099598, "auxiliary_loss_mlp": 0.01040206, "balance_loss_clip": 1.04214716, "balance_loss_mlp": 1.02214622, "epoch": 0.6394516924186857, "flos": 20260692633600.0, "grad_norm": 2.3993820931025023, "language_loss": 0.78650844, "learning_rate": 1.2154236916029703e-06, "loss": 0.80790651, "num_input_tokens_seen": 114473675, "step": 5318, "time_per_iteration": 3.559798002243042 }, { "auxiliary_loss_clip": 0.01088291, "auxiliary_loss_mlp": 0.01044846, "balance_loss_clip": 1.03779221, "balance_loss_mlp": 1.02849054, "epoch": 0.6395719353093249, "flos": 18368847210240.0, "grad_norm": 2.5006074644363516, "language_loss": 0.73791599, "learning_rate": 1.2147072199968627e-06, "loss": 0.75924742, "num_input_tokens_seen": 114492310, "step": 5319, "time_per_iteration": 2.659050226211548 }, { "auxiliary_loss_clip": 0.01121386, "auxiliary_loss_mlp": 0.01040735, "balance_loss_clip": 1.04316163, "balance_loss_mlp": 1.02458239, "epoch": 0.6396921781999639, "flos": 17566315591680.0, "grad_norm": 4.21013056279685, "language_loss": 0.71725655, "learning_rate": 1.2139908675190454e-06, "loss": 0.73887777, "num_input_tokens_seen": 114511520, "step": 5320, "time_per_iteration": 2.5830509662628174 }, { "auxiliary_loss_clip": 0.01070267, "auxiliary_loss_mlp": 0.0103779, "balance_loss_clip": 1.03799748, "balance_loss_mlp": 1.02040982, "epoch": 0.639812421090603, "flos": 21251252972160.0, "grad_norm": 2.580110399238187, "language_loss": 0.74984658, "learning_rate": 1.2132746342781883e-06, "loss": 0.77092719, "num_input_tokens_seen": 114532680, "step": 5321, "time_per_iteration": 2.761549472808838 }, { "auxiliary_loss_clip": 0.01139582, "auxiliary_loss_mlp": 0.0103744, "balance_loss_clip": 1.04590726, "balance_loss_mlp": 1.02120376, "epoch": 0.6399326639812422, "flos": 11180967684480.0, "grad_norm": 2.681652782538981, "language_loss": 0.79608387, "learning_rate": 1.2125585203829442e-06, "loss": 0.81785405, "num_input_tokens_seen": 114548320, "step": 5322, "time_per_iteration": 2.5441970825195312 }, { "auxiliary_loss_clip": 0.01091538, "auxiliary_loss_mlp": 0.01040173, "balance_loss_clip": 1.04120684, "balance_loss_mlp": 1.02335274, "epoch": 0.6400529068718812, "flos": 23911048195200.0, "grad_norm": 1.904667785432371, "language_loss": 0.73806965, "learning_rate": 1.211842525941946e-06, "loss": 0.75938672, "num_input_tokens_seen": 114568115, "step": 5323, "time_per_iteration": 2.695082426071167 }, { "auxiliary_loss_clip": 0.01085933, "auxiliary_loss_mlp": 0.01036056, "balance_loss_clip": 1.04069257, "balance_loss_mlp": 1.0203023, "epoch": 0.6401731497625203, "flos": 44018724890880.0, "grad_norm": 2.7703955852763955, "language_loss": 0.7894066, "learning_rate": 1.2111266510638105e-06, "loss": 0.81062651, "num_input_tokens_seen": 114591040, "step": 5324, "time_per_iteration": 2.8762664794921875 }, { "auxiliary_loss_clip": 0.01068982, "auxiliary_loss_mlp": 0.01042568, "balance_loss_clip": 1.03714645, "balance_loss_mlp": 1.02605152, "epoch": 0.6402933926531594, "flos": 20662209838080.0, "grad_norm": 1.6299191756830027, "language_loss": 0.80001265, "learning_rate": 1.2104108958571346e-06, "loss": 0.82112813, "num_input_tokens_seen": 114609310, "step": 5325, "time_per_iteration": 2.739295721054077 }, { "auxiliary_loss_clip": 0.01121804, "auxiliary_loss_mlp": 0.01043394, "balance_loss_clip": 1.04323721, "balance_loss_mlp": 1.02737272, "epoch": 0.6404136355437985, "flos": 24863327614080.0, "grad_norm": 1.5151848183230479, "language_loss": 0.75624764, "learning_rate": 1.2096952604304975e-06, "loss": 0.77789962, "num_input_tokens_seen": 114629740, "step": 5326, "time_per_iteration": 2.626830577850342 }, { "auxiliary_loss_clip": 0.01126946, "auxiliary_loss_mlp": 0.01038777, "balance_loss_clip": 1.04186308, "balance_loss_mlp": 1.02193236, "epoch": 0.6405338784344375, "flos": 40479548901120.0, "grad_norm": 2.1822331232880416, "language_loss": 0.70437062, "learning_rate": 1.2089797448924616e-06, "loss": 0.72602785, "num_input_tokens_seen": 114653615, "step": 5327, "time_per_iteration": 2.7405242919921875 }, { "auxiliary_loss_clip": 0.01090453, "auxiliary_loss_mlp": 0.01044729, "balance_loss_clip": 1.03721929, "balance_loss_mlp": 1.02777791, "epoch": 0.6406541213250767, "flos": 20886041439360.0, "grad_norm": 2.317655692465472, "language_loss": 0.66098773, "learning_rate": 1.2082643493515692e-06, "loss": 0.68233961, "num_input_tokens_seen": 114671935, "step": 5328, "time_per_iteration": 2.7085142135620117 }, { "auxiliary_loss_clip": 0.0112292, "auxiliary_loss_mlp": 0.01037851, "balance_loss_clip": 1.04275203, "balance_loss_mlp": 1.0217694, "epoch": 0.6407743642157158, "flos": 23295970679040.0, "grad_norm": 1.9883548512490363, "language_loss": 0.81954116, "learning_rate": 1.207549073916346e-06, "loss": 0.84114885, "num_input_tokens_seen": 114692870, "step": 5329, "time_per_iteration": 2.61434006690979 }, { "auxiliary_loss_clip": 0.01108426, "auxiliary_loss_mlp": 0.01043831, "balance_loss_clip": 1.04357624, "balance_loss_mlp": 1.02684355, "epoch": 0.6408946071063548, "flos": 15012636122880.0, "grad_norm": 2.5485913319462834, "language_loss": 0.77808404, "learning_rate": 1.2068339186952976e-06, "loss": 0.79960662, "num_input_tokens_seen": 114710410, "step": 5330, "time_per_iteration": 2.627321481704712 }, { "auxiliary_loss_clip": 0.01128351, "auxiliary_loss_mlp": 0.01039871, "balance_loss_clip": 1.04576731, "balance_loss_mlp": 1.02468395, "epoch": 0.6410148499969939, "flos": 22528595496960.0, "grad_norm": 2.3213944943056313, "language_loss": 0.73094714, "learning_rate": 1.2061188837969136e-06, "loss": 0.75262934, "num_input_tokens_seen": 114730020, "step": 5331, "time_per_iteration": 2.598111152648926 }, { "auxiliary_loss_clip": 0.01096642, "auxiliary_loss_mlp": 0.01042272, "balance_loss_clip": 1.03756714, "balance_loss_mlp": 1.02430725, "epoch": 0.641135092887633, "flos": 12422004537600.0, "grad_norm": 2.7370071269082956, "language_loss": 0.84142548, "learning_rate": 1.2054039693296631e-06, "loss": 0.86281466, "num_input_tokens_seen": 114748015, "step": 5332, "time_per_iteration": 2.6577930450439453 }, { "auxiliary_loss_clip": 0.01093249, "auxiliary_loss_mlp": 0.01040583, "balance_loss_clip": 1.03942513, "balance_loss_mlp": 1.02552056, "epoch": 0.6412553357782721, "flos": 22127329687680.0, "grad_norm": 2.851600720640303, "language_loss": 0.8145988, "learning_rate": 1.2046891754019992e-06, "loss": 0.83593714, "num_input_tokens_seen": 114768625, "step": 5333, "time_per_iteration": 2.68825101852417 }, { "auxiliary_loss_clip": 0.01125058, "auxiliary_loss_mlp": 0.01036435, "balance_loss_clip": 1.04423666, "balance_loss_mlp": 1.01944757, "epoch": 0.6413755786689112, "flos": 15888605097600.0, "grad_norm": 2.1931831922968636, "language_loss": 0.82669687, "learning_rate": 1.2039745021223548e-06, "loss": 0.84831184, "num_input_tokens_seen": 114786045, "step": 5334, "time_per_iteration": 2.5300557613372803 }, { "auxiliary_loss_clip": 0.00998775, "auxiliary_loss_mlp": 0.01000005, "balance_loss_clip": 1.01168597, "balance_loss_mlp": 0.99859792, "epoch": 0.6414958215595503, "flos": 68039159955840.0, "grad_norm": 0.7946778388273692, "language_loss": 0.5703938, "learning_rate": 1.2032599495991456e-06, "loss": 0.59038162, "num_input_tokens_seen": 114850785, "step": 5335, "time_per_iteration": 3.347249746322632 }, { "auxiliary_loss_clip": 0.0112892, "auxiliary_loss_mlp": 0.01038856, "balance_loss_clip": 1.04576111, "balance_loss_mlp": 1.02266765, "epoch": 0.6416160644501894, "flos": 44091300320640.0, "grad_norm": 2.085850486980474, "language_loss": 0.69484282, "learning_rate": 1.2025455179407685e-06, "loss": 0.71652055, "num_input_tokens_seen": 114871945, "step": 5336, "time_per_iteration": 3.7578563690185547 }, { "auxiliary_loss_clip": 0.01121567, "auxiliary_loss_mlp": 0.00773192, "balance_loss_clip": 1.04188275, "balance_loss_mlp": 1.00037909, "epoch": 0.6417363073408284, "flos": 20959837931520.0, "grad_norm": 2.5634560967233218, "language_loss": 0.73830163, "learning_rate": 1.2018312072556022e-06, "loss": 0.75724918, "num_input_tokens_seen": 114890445, "step": 5337, "time_per_iteration": 3.513378381729126 }, { "auxiliary_loss_clip": 0.01134568, "auxiliary_loss_mlp": 0.00773237, "balance_loss_clip": 1.04335332, "balance_loss_mlp": 1.00044799, "epoch": 0.6418565502314676, "flos": 22455122227200.0, "grad_norm": 1.8358852856514332, "language_loss": 0.74956691, "learning_rate": 1.2011170176520077e-06, "loss": 0.76864493, "num_input_tokens_seen": 114911360, "step": 5338, "time_per_iteration": 2.5855941772460938 }, { "auxiliary_loss_clip": 0.01056979, "auxiliary_loss_mlp": 0.01041252, "balance_loss_clip": 1.03512871, "balance_loss_mlp": 1.02503991, "epoch": 0.6419767931221066, "flos": 25045502417280.0, "grad_norm": 1.5476881469947323, "language_loss": 0.81310987, "learning_rate": 1.2004029492383256e-06, "loss": 0.83409226, "num_input_tokens_seen": 114932700, "step": 5339, "time_per_iteration": 2.769399404525757 }, { "auxiliary_loss_clip": 0.01122965, "auxiliary_loss_mlp": 0.01040755, "balance_loss_clip": 1.0432502, "balance_loss_mlp": 1.02488828, "epoch": 0.6420970360127457, "flos": 19463691709440.0, "grad_norm": 2.1404450286458254, "language_loss": 0.73436528, "learning_rate": 1.1996890021228814e-06, "loss": 0.75600255, "num_input_tokens_seen": 114949475, "step": 5340, "time_per_iteration": 2.5995004177093506 }, { "auxiliary_loss_clip": 0.0110966, "auxiliary_loss_mlp": 0.01044602, "balance_loss_clip": 1.04201186, "balance_loss_mlp": 1.02834189, "epoch": 0.6422172789033849, "flos": 40406147458560.0, "grad_norm": 1.4878020240585113, "language_loss": 0.69880974, "learning_rate": 1.1989751764139785e-06, "loss": 0.72035229, "num_input_tokens_seen": 114973125, "step": 5341, "time_per_iteration": 2.792059898376465 }, { "auxiliary_loss_clip": 0.01084998, "auxiliary_loss_mlp": 0.01042606, "balance_loss_clip": 1.03563643, "balance_loss_mlp": 1.02687001, "epoch": 0.6423375217940239, "flos": 27672870637440.0, "grad_norm": 1.782360151922935, "language_loss": 0.83061028, "learning_rate": 1.1982614722199044e-06, "loss": 0.85188633, "num_input_tokens_seen": 114994300, "step": 5342, "time_per_iteration": 3.6857988834381104 }, { "auxiliary_loss_clip": 0.01116103, "auxiliary_loss_mlp": 0.01036658, "balance_loss_clip": 1.0429275, "balance_loss_mlp": 1.0203979, "epoch": 0.642457764684663, "flos": 18369242259840.0, "grad_norm": 2.2550295661344335, "language_loss": 0.77792048, "learning_rate": 1.1975478896489276e-06, "loss": 0.79944807, "num_input_tokens_seen": 115012135, "step": 5343, "time_per_iteration": 2.6106491088867188 }, { "auxiliary_loss_clip": 0.01134283, "auxiliary_loss_mlp": 0.01038758, "balance_loss_clip": 1.04399574, "balance_loss_mlp": 1.0236783, "epoch": 0.6425780075753021, "flos": 19750509809280.0, "grad_norm": 1.9177139959160248, "language_loss": 0.76499611, "learning_rate": 1.1968344288092981e-06, "loss": 0.78672647, "num_input_tokens_seen": 115028715, "step": 5344, "time_per_iteration": 3.475175380706787 }, { "auxiliary_loss_clip": 0.0112318, "auxiliary_loss_mlp": 0.00772921, "balance_loss_clip": 1.0423243, "balance_loss_mlp": 1.00046265, "epoch": 0.6426982504659412, "flos": 20558536208640.0, "grad_norm": 1.719972072754676, "language_loss": 0.64735198, "learning_rate": 1.1961210898092468e-06, "loss": 0.66631299, "num_input_tokens_seen": 115047665, "step": 5345, "time_per_iteration": 2.6029927730560303 }, { "auxiliary_loss_clip": 0.01119675, "auxiliary_loss_mlp": 0.01039341, "balance_loss_clip": 1.04528213, "balance_loss_mlp": 1.02302086, "epoch": 0.6428184933565803, "flos": 17851984456320.0, "grad_norm": 6.467521545942766, "language_loss": 0.79109108, "learning_rate": 1.1954078727569874e-06, "loss": 0.81268126, "num_input_tokens_seen": 115064965, "step": 5346, "time_per_iteration": 2.6074934005737305 }, { "auxiliary_loss_clip": 0.01106764, "auxiliary_loss_mlp": 0.00771951, "balance_loss_clip": 1.0417552, "balance_loss_mlp": 1.00035477, "epoch": 0.6429387362472194, "flos": 22456953820800.0, "grad_norm": 1.6581520072462916, "language_loss": 0.77807558, "learning_rate": 1.1946947777607141e-06, "loss": 0.79686272, "num_input_tokens_seen": 115086100, "step": 5347, "time_per_iteration": 2.6819279193878174 }, { "auxiliary_loss_clip": 0.01079996, "auxiliary_loss_mlp": 0.01047399, "balance_loss_clip": 1.03790879, "balance_loss_mlp": 1.0296129, "epoch": 0.6430589791378585, "flos": 24752579005440.0, "grad_norm": 2.1317659781608818, "language_loss": 0.79937959, "learning_rate": 1.1939818049286024e-06, "loss": 0.82065356, "num_input_tokens_seen": 115104260, "step": 5348, "time_per_iteration": 2.69980525970459 }, { "auxiliary_loss_clip": 0.01063481, "auxiliary_loss_mlp": 0.01044535, "balance_loss_clip": 1.03530467, "balance_loss_mlp": 1.02745175, "epoch": 0.6431792220284975, "flos": 24901249397760.0, "grad_norm": 1.878878977160307, "language_loss": 0.75749266, "learning_rate": 1.1932689543688101e-06, "loss": 0.7785728, "num_input_tokens_seen": 115125365, "step": 5349, "time_per_iteration": 2.762972354888916 }, { "auxiliary_loss_clip": 0.01112418, "auxiliary_loss_mlp": 0.01043665, "balance_loss_clip": 1.04303861, "balance_loss_mlp": 1.02701104, "epoch": 0.6432994649191367, "flos": 21032305620480.0, "grad_norm": 2.2960191120964306, "language_loss": 0.72202539, "learning_rate": 1.1925562261894756e-06, "loss": 0.74358618, "num_input_tokens_seen": 115144445, "step": 5350, "time_per_iteration": 2.626952648162842 }, { "auxiliary_loss_clip": 0.01108839, "auxiliary_loss_mlp": 0.01032647, "balance_loss_clip": 1.04126406, "balance_loss_mlp": 1.01639938, "epoch": 0.6434197078097758, "flos": 30884433655680.0, "grad_norm": 1.8791162098021466, "language_loss": 0.77309871, "learning_rate": 1.1918436204987207e-06, "loss": 0.79451358, "num_input_tokens_seen": 115166305, "step": 5351, "time_per_iteration": 2.712430238723755 }, { "auxiliary_loss_clip": 0.01118885, "auxiliary_loss_mlp": 0.01037813, "balance_loss_clip": 1.04446483, "balance_loss_mlp": 1.02224422, "epoch": 0.6435399507004148, "flos": 15012492468480.0, "grad_norm": 2.698618659092634, "language_loss": 0.81254703, "learning_rate": 1.191131137404645e-06, "loss": 0.83411407, "num_input_tokens_seen": 115183045, "step": 5352, "time_per_iteration": 2.5842156410217285 }, { "auxiliary_loss_clip": 0.01089109, "auxiliary_loss_mlp": 0.01044402, "balance_loss_clip": 1.03813243, "balance_loss_mlp": 1.02641296, "epoch": 0.643660193591054, "flos": 19901981462400.0, "grad_norm": 2.10031264544824, "language_loss": 0.77660477, "learning_rate": 1.190418777015333e-06, "loss": 0.79793984, "num_input_tokens_seen": 115201955, "step": 5353, "time_per_iteration": 2.6402409076690674 }, { "auxiliary_loss_clip": 0.01108747, "auxiliary_loss_mlp": 0.01037245, "balance_loss_clip": 1.04025865, "balance_loss_mlp": 1.02187872, "epoch": 0.643780436481693, "flos": 24133622820480.0, "grad_norm": 1.589166107115373, "language_loss": 0.73666155, "learning_rate": 1.1897065394388487e-06, "loss": 0.75812149, "num_input_tokens_seen": 115222395, "step": 5354, "time_per_iteration": 2.670128107070923 }, { "auxiliary_loss_clip": 0.01110366, "auxiliary_loss_mlp": 0.01044608, "balance_loss_clip": 1.04423535, "balance_loss_mlp": 1.02919459, "epoch": 0.6439006793723321, "flos": 23148808657920.0, "grad_norm": 1.499934656619541, "language_loss": 0.76390278, "learning_rate": 1.1889944247832385e-06, "loss": 0.78545254, "num_input_tokens_seen": 115242635, "step": 5355, "time_per_iteration": 2.6242241859436035 }, { "auxiliary_loss_clip": 0.01125402, "auxiliary_loss_mlp": 0.01040534, "balance_loss_clip": 1.0426327, "balance_loss_mlp": 1.0238328, "epoch": 0.6440209222629713, "flos": 23617909301760.0, "grad_norm": 1.7718127550150828, "language_loss": 0.71022493, "learning_rate": 1.1882824331565283e-06, "loss": 0.73188424, "num_input_tokens_seen": 115262095, "step": 5356, "time_per_iteration": 2.653225898742676 }, { "auxiliary_loss_clip": 0.01090649, "auxiliary_loss_mlp": 0.01048149, "balance_loss_clip": 1.03704834, "balance_loss_mlp": 1.03032756, "epoch": 0.6441411651536103, "flos": 16544872535040.0, "grad_norm": 2.2797056341589466, "language_loss": 0.89169395, "learning_rate": 1.1875705646667287e-06, "loss": 0.913082, "num_input_tokens_seen": 115279985, "step": 5357, "time_per_iteration": 2.6331818103790283 }, { "auxiliary_loss_clip": 0.01118982, "auxiliary_loss_mlp": 0.0103579, "balance_loss_clip": 1.03932619, "balance_loss_mlp": 1.01938677, "epoch": 0.6442614080442494, "flos": 25410965345280.0, "grad_norm": 1.9511961690096291, "language_loss": 0.75393391, "learning_rate": 1.1868588194218282e-06, "loss": 0.77548158, "num_input_tokens_seen": 115300365, "step": 5358, "time_per_iteration": 2.6288340091705322 }, { "auxiliary_loss_clip": 0.01116503, "auxiliary_loss_mlp": 0.0103874, "balance_loss_clip": 1.04217792, "balance_loss_mlp": 1.02287304, "epoch": 0.6443816509348885, "flos": 28294017552000.0, "grad_norm": 1.5512988498651612, "language_loss": 0.73693448, "learning_rate": 1.1861471975297979e-06, "loss": 0.75848687, "num_input_tokens_seen": 115322060, "step": 5359, "time_per_iteration": 2.70743727684021 }, { "auxiliary_loss_clip": 0.01098111, "auxiliary_loss_mlp": 0.01043317, "balance_loss_clip": 1.04124427, "balance_loss_mlp": 1.02635336, "epoch": 0.6445018938255276, "flos": 36690075964800.0, "grad_norm": 1.6515470010461706, "language_loss": 0.70711863, "learning_rate": 1.185435699098591e-06, "loss": 0.72853291, "num_input_tokens_seen": 115348255, "step": 5360, "time_per_iteration": 2.809323787689209 }, { "auxiliary_loss_clip": 0.01115648, "auxiliary_loss_mlp": 0.01040809, "balance_loss_clip": 1.04197407, "balance_loss_mlp": 1.02445304, "epoch": 0.6446221367161666, "flos": 14501411804160.0, "grad_norm": 2.7944775946015903, "language_loss": 0.78550309, "learning_rate": 1.1847243242361403e-06, "loss": 0.80706763, "num_input_tokens_seen": 115366845, "step": 5361, "time_per_iteration": 3.551939010620117 }, { "auxiliary_loss_clip": 0.01110556, "auxiliary_loss_mlp": 0.01037918, "balance_loss_clip": 1.04208326, "balance_loss_mlp": 1.02133644, "epoch": 0.6447423796068057, "flos": 24609367480320.0, "grad_norm": 1.6592081205214348, "language_loss": 0.78021592, "learning_rate": 1.1840130730503624e-06, "loss": 0.80170065, "num_input_tokens_seen": 115388125, "step": 5362, "time_per_iteration": 2.6469907760620117 }, { "auxiliary_loss_clip": 0.01136164, "auxiliary_loss_mlp": 0.01038417, "balance_loss_clip": 1.04378498, "balance_loss_mlp": 1.02356362, "epoch": 0.6448626224974449, "flos": 25047298097280.0, "grad_norm": 2.3061972836711138, "language_loss": 0.74858844, "learning_rate": 1.1833019456491518e-06, "loss": 0.7703343, "num_input_tokens_seen": 115409655, "step": 5363, "time_per_iteration": 3.613572597503662 }, { "auxiliary_loss_clip": 0.01124472, "auxiliary_loss_mlp": 0.01043179, "balance_loss_clip": 1.04316354, "balance_loss_mlp": 1.02654886, "epoch": 0.6449828653880839, "flos": 22530355263360.0, "grad_norm": 1.9829280573940047, "language_loss": 0.78797781, "learning_rate": 1.1825909421403871e-06, "loss": 0.80965436, "num_input_tokens_seen": 115428750, "step": 5364, "time_per_iteration": 2.634073257446289 }, { "auxiliary_loss_clip": 0.01122895, "auxiliary_loss_mlp": 0.01039813, "balance_loss_clip": 1.04167902, "balance_loss_mlp": 1.0246197, "epoch": 0.645103108278723, "flos": 25695736369920.0, "grad_norm": 2.0336145266356653, "language_loss": 0.76604575, "learning_rate": 1.181880062631926e-06, "loss": 0.78767288, "num_input_tokens_seen": 115448085, "step": 5365, "time_per_iteration": 2.6454672813415527 }, { "auxiliary_loss_clip": 0.01106774, "auxiliary_loss_mlp": 0.0104569, "balance_loss_clip": 1.04276466, "balance_loss_mlp": 1.02718866, "epoch": 0.6452233511693621, "flos": 27450331925760.0, "grad_norm": 2.2016800295386747, "language_loss": 0.8504771, "learning_rate": 1.1811693072316093e-06, "loss": 0.87200171, "num_input_tokens_seen": 115465765, "step": 5366, "time_per_iteration": 2.629753351211548 }, { "auxiliary_loss_clip": 0.01134584, "auxiliary_loss_mlp": 0.00772372, "balance_loss_clip": 1.04213655, "balance_loss_mlp": 1.00031137, "epoch": 0.6453435940600012, "flos": 19208618254080.0, "grad_norm": 2.4196751148631472, "language_loss": 0.84011745, "learning_rate": 1.1804586760472574e-06, "loss": 0.85918701, "num_input_tokens_seen": 115482230, "step": 5367, "time_per_iteration": 2.574526309967041 }, { "auxiliary_loss_clip": 0.01098007, "auxiliary_loss_mlp": 0.0103781, "balance_loss_clip": 1.04131031, "balance_loss_mlp": 1.02213442, "epoch": 0.6454638369506402, "flos": 25737680476800.0, "grad_norm": 4.198275583408652, "language_loss": 0.79735613, "learning_rate": 1.1797481691866736e-06, "loss": 0.81871426, "num_input_tokens_seen": 115499455, "step": 5368, "time_per_iteration": 2.666964054107666 }, { "auxiliary_loss_clip": 0.01098995, "auxiliary_loss_mlp": 0.0104763, "balance_loss_clip": 1.0409584, "balance_loss_mlp": 1.0318588, "epoch": 0.6455840798412794, "flos": 20989176364800.0, "grad_norm": 6.8662572800897435, "language_loss": 0.83060336, "learning_rate": 1.1790377867576393e-06, "loss": 0.85206968, "num_input_tokens_seen": 115517205, "step": 5369, "time_per_iteration": 3.6052112579345703 }, { "auxiliary_loss_clip": 0.01113785, "auxiliary_loss_mlp": 0.01042343, "balance_loss_clip": 1.04266453, "balance_loss_mlp": 1.02709615, "epoch": 0.6457043227319185, "flos": 26067556005120.0, "grad_norm": 1.768998293814218, "language_loss": 0.76371288, "learning_rate": 1.1783275288679203e-06, "loss": 0.78527415, "num_input_tokens_seen": 115534370, "step": 5370, "time_per_iteration": 3.6380934715270996 }, { "auxiliary_loss_clip": 0.01035256, "auxiliary_loss_mlp": 0.01002467, "balance_loss_clip": 1.01403475, "balance_loss_mlp": 1.00079811, "epoch": 0.6458245656225575, "flos": 60370831088640.0, "grad_norm": 0.8458784403570808, "language_loss": 0.57133639, "learning_rate": 1.177617395625262e-06, "loss": 0.59171355, "num_input_tokens_seen": 115592345, "step": 5371, "time_per_iteration": 3.1464319229125977 }, { "auxiliary_loss_clip": 0.01124355, "auxiliary_loss_mlp": 0.01039788, "balance_loss_clip": 1.04191875, "balance_loss_mlp": 1.02463698, "epoch": 0.6459448085131967, "flos": 23076771932160.0, "grad_norm": 9.736581765480244, "language_loss": 0.75426662, "learning_rate": 1.1769073871373908e-06, "loss": 0.77590811, "num_input_tokens_seen": 115612550, "step": 5372, "time_per_iteration": 2.5909342765808105 }, { "auxiliary_loss_clip": 0.01096516, "auxiliary_loss_mlp": 0.01042505, "balance_loss_clip": 1.03937268, "balance_loss_mlp": 1.02450442, "epoch": 0.6460650514038357, "flos": 22598190097920.0, "grad_norm": 1.859703030355024, "language_loss": 0.8400557, "learning_rate": 1.176197503512015e-06, "loss": 0.8614459, "num_input_tokens_seen": 115632265, "step": 5373, "time_per_iteration": 2.697343111038208 }, { "auxiliary_loss_clip": 0.01105619, "auxiliary_loss_mlp": 0.01052778, "balance_loss_clip": 1.04068291, "balance_loss_mlp": 1.03736472, "epoch": 0.6461852942944748, "flos": 20266726118400.0, "grad_norm": 2.087471831888825, "language_loss": 0.82953924, "learning_rate": 1.1754877448568223e-06, "loss": 0.85112321, "num_input_tokens_seen": 115651720, "step": 5374, "time_per_iteration": 2.624030590057373 }, { "auxiliary_loss_clip": 0.0111225, "auxiliary_loss_mlp": 0.01038435, "balance_loss_clip": 1.04107833, "balance_loss_mlp": 1.02211559, "epoch": 0.646305537185114, "flos": 23367109564800.0, "grad_norm": 2.6107635165222134, "language_loss": 0.90277505, "learning_rate": 1.1747781112794837e-06, "loss": 0.9242819, "num_input_tokens_seen": 115668215, "step": 5375, "time_per_iteration": 2.6169493198394775 }, { "auxiliary_loss_clip": 0.01096202, "auxiliary_loss_mlp": 0.01037745, "balance_loss_clip": 1.0421381, "balance_loss_mlp": 1.0215925, "epoch": 0.646425780075753, "flos": 24277480790400.0, "grad_norm": 1.6191447839762896, "language_loss": 0.83180094, "learning_rate": 1.1740686028876487e-06, "loss": 0.85314035, "num_input_tokens_seen": 115687080, "step": 5376, "time_per_iteration": 2.686391830444336 }, { "auxiliary_loss_clip": 0.01121633, "auxiliary_loss_mlp": 0.01039022, "balance_loss_clip": 1.0431689, "balance_loss_mlp": 1.0235846, "epoch": 0.6465460229663921, "flos": 20813968800000.0, "grad_norm": 2.636524366214108, "language_loss": 0.74806845, "learning_rate": 1.1733592197889507e-06, "loss": 0.76967502, "num_input_tokens_seen": 115703990, "step": 5377, "time_per_iteration": 2.6014113426208496 }, { "auxiliary_loss_clip": 0.01115147, "auxiliary_loss_mlp": 0.01032211, "balance_loss_clip": 1.04175591, "balance_loss_mlp": 1.01720214, "epoch": 0.6466662658570312, "flos": 22853299466880.0, "grad_norm": 4.173714124635129, "language_loss": 0.72708923, "learning_rate": 1.1726499620910014e-06, "loss": 0.74856281, "num_input_tokens_seen": 115724270, "step": 5378, "time_per_iteration": 2.6246867179870605 }, { "auxiliary_loss_clip": 0.01121316, "auxiliary_loss_mlp": 0.01041789, "balance_loss_clip": 1.04247308, "balance_loss_mlp": 1.02520716, "epoch": 0.6467865087476703, "flos": 15304553953920.0, "grad_norm": 2.1351700602884702, "language_loss": 0.77901495, "learning_rate": 1.1719408299013955e-06, "loss": 0.80064607, "num_input_tokens_seen": 115742995, "step": 5379, "time_per_iteration": 2.5950562953948975 }, { "auxiliary_loss_clip": 0.01133635, "auxiliary_loss_mlp": 0.01039184, "balance_loss_clip": 1.04363251, "balance_loss_mlp": 1.02436578, "epoch": 0.6469067516383094, "flos": 19573650218880.0, "grad_norm": 3.210817806756732, "language_loss": 0.76158512, "learning_rate": 1.1712318233277067e-06, "loss": 0.78331327, "num_input_tokens_seen": 115762015, "step": 5380, "time_per_iteration": 2.5303688049316406 }, { "auxiliary_loss_clip": 0.0103303, "auxiliary_loss_mlp": 0.0100335, "balance_loss_clip": 1.01264405, "balance_loss_mlp": 1.00183606, "epoch": 0.6470269945289485, "flos": 65098002522240.0, "grad_norm": 0.7496111845504397, "language_loss": 0.57815361, "learning_rate": 1.1705229424774916e-06, "loss": 0.59851742, "num_input_tokens_seen": 115816285, "step": 5381, "time_per_iteration": 3.047729253768921 }, { "auxiliary_loss_clip": 0.01109851, "auxiliary_loss_mlp": 0.0103958, "balance_loss_clip": 1.04140067, "balance_loss_mlp": 1.02292621, "epoch": 0.6471472374195876, "flos": 30696943639680.0, "grad_norm": 1.6219333372090778, "language_loss": 0.64367175, "learning_rate": 1.1698141874582867e-06, "loss": 0.66516602, "num_input_tokens_seen": 115837330, "step": 5382, "time_per_iteration": 2.7112624645233154 }, { "auxiliary_loss_clip": 0.01135183, "auxiliary_loss_mlp": 0.01040039, "balance_loss_clip": 1.04515672, "balance_loss_mlp": 1.02451754, "epoch": 0.6472674803102266, "flos": 20521835487360.0, "grad_norm": 1.8941245507301685, "language_loss": 0.72553372, "learning_rate": 1.169105558377609e-06, "loss": 0.7472859, "num_input_tokens_seen": 115857420, "step": 5383, "time_per_iteration": 2.552666187286377 }, { "auxiliary_loss_clip": 0.01082912, "auxiliary_loss_mlp": 0.00772671, "balance_loss_clip": 1.04244053, "balance_loss_mlp": 1.00042212, "epoch": 0.6473877232008658, "flos": 24715447320960.0, "grad_norm": 1.6510161106780412, "language_loss": 0.78327852, "learning_rate": 1.1683970553429587e-06, "loss": 0.80183434, "num_input_tokens_seen": 115878875, "step": 5384, "time_per_iteration": 2.729166269302368 }, { "auxiliary_loss_clip": 0.01102386, "auxiliary_loss_mlp": 0.01042118, "balance_loss_clip": 1.04239082, "balance_loss_mlp": 1.0262754, "epoch": 0.6475079660915048, "flos": 15885552441600.0, "grad_norm": 3.613229275539006, "language_loss": 0.82158685, "learning_rate": 1.1676886784618128e-06, "loss": 0.84303182, "num_input_tokens_seen": 115895540, "step": 5385, "time_per_iteration": 2.638862371444702 }, { "auxiliary_loss_clip": 0.0112218, "auxiliary_loss_mlp": 0.01041618, "balance_loss_clip": 1.04283357, "balance_loss_mlp": 1.02525091, "epoch": 0.6476282089821439, "flos": 17381590922880.0, "grad_norm": 2.1464106097627567, "language_loss": 0.83858711, "learning_rate": 1.1669804278416332e-06, "loss": 0.86022508, "num_input_tokens_seen": 115910265, "step": 5386, "time_per_iteration": 2.5732686519622803 }, { "auxiliary_loss_clip": 0.01117713, "auxiliary_loss_mlp": 0.01031667, "balance_loss_clip": 1.04303837, "balance_loss_mlp": 1.01498985, "epoch": 0.6477484518727831, "flos": 20194078861440.0, "grad_norm": 1.8642737337893491, "language_loss": 0.71587741, "learning_rate": 1.1662723035898602e-06, "loss": 0.73737133, "num_input_tokens_seen": 115930025, "step": 5387, "time_per_iteration": 3.638000726699829 }, { "auxiliary_loss_clip": 0.01123791, "auxiliary_loss_mlp": 0.0103715, "balance_loss_clip": 1.04247868, "balance_loss_mlp": 1.01976883, "epoch": 0.6478686947634221, "flos": 25410426641280.0, "grad_norm": 1.773464608768767, "language_loss": 0.81962985, "learning_rate": 1.165564305813915e-06, "loss": 0.84123927, "num_input_tokens_seen": 115949025, "step": 5388, "time_per_iteration": 2.6382131576538086 }, { "auxiliary_loss_clip": 0.01117768, "auxiliary_loss_mlp": 0.01039182, "balance_loss_clip": 1.04225397, "balance_loss_mlp": 1.02429903, "epoch": 0.6479889376540612, "flos": 20083581648000.0, "grad_norm": 3.256251823793626, "language_loss": 0.811607, "learning_rate": 1.1648564346212019e-06, "loss": 0.83317655, "num_input_tokens_seen": 115968145, "step": 5389, "time_per_iteration": 3.588026762008667 }, { "auxiliary_loss_clip": 0.01116741, "auxiliary_loss_mlp": 0.0104005, "balance_loss_clip": 1.04268336, "balance_loss_mlp": 1.02433264, "epoch": 0.6481091805447003, "flos": 26758082039040.0, "grad_norm": 1.910812063510309, "language_loss": 0.76420832, "learning_rate": 1.164148690119104e-06, "loss": 0.7857762, "num_input_tokens_seen": 115989425, "step": 5390, "time_per_iteration": 2.648327112197876 }, { "auxiliary_loss_clip": 0.01135839, "auxiliary_loss_mlp": 0.0104418, "balance_loss_clip": 1.04636097, "balance_loss_mlp": 1.02879047, "epoch": 0.6482294234353394, "flos": 23952094462080.0, "grad_norm": 1.9161702242796295, "language_loss": 0.74036741, "learning_rate": 1.163441072414985e-06, "loss": 0.76216757, "num_input_tokens_seen": 116009630, "step": 5391, "time_per_iteration": 2.600365400314331 }, { "auxiliary_loss_clip": 0.01122328, "auxiliary_loss_mlp": 0.01032006, "balance_loss_clip": 1.04415512, "balance_loss_mlp": 1.01621079, "epoch": 0.6483496663259785, "flos": 26209833776640.0, "grad_norm": 2.5530265964512613, "language_loss": 0.70251071, "learning_rate": 1.16273358161619e-06, "loss": 0.72405404, "num_input_tokens_seen": 116029965, "step": 5392, "time_per_iteration": 2.637655019760132 }, { "auxiliary_loss_clip": 0.011176, "auxiliary_loss_mlp": 0.0103734, "balance_loss_clip": 1.04290128, "balance_loss_mlp": 1.02117538, "epoch": 0.6484699092166175, "flos": 20922239370240.0, "grad_norm": 2.0267793947178876, "language_loss": 0.83052921, "learning_rate": 1.1620262178300446e-06, "loss": 0.85207862, "num_input_tokens_seen": 116048580, "step": 5393, "time_per_iteration": 2.626312494277954 }, { "auxiliary_loss_clip": 0.01099441, "auxiliary_loss_mlp": 0.01033881, "balance_loss_clip": 1.03968668, "balance_loss_mlp": 1.01753736, "epoch": 0.6485901521072567, "flos": 33072865678080.0, "grad_norm": 1.7688996830357506, "language_loss": 0.75742114, "learning_rate": 1.1613189811638563e-06, "loss": 0.77875435, "num_input_tokens_seen": 116070305, "step": 5394, "time_per_iteration": 2.7237398624420166 }, { "auxiliary_loss_clip": 0.01126574, "auxiliary_loss_mlp": 0.01037824, "balance_loss_clip": 1.04578185, "balance_loss_mlp": 1.02192163, "epoch": 0.6487103949978957, "flos": 22274060745600.0, "grad_norm": 1.788682551990081, "language_loss": 0.78120762, "learning_rate": 1.1606118717249117e-06, "loss": 0.80285162, "num_input_tokens_seen": 116090405, "step": 5395, "time_per_iteration": 3.5249788761138916 }, { "auxiliary_loss_clip": 0.01137806, "auxiliary_loss_mlp": 0.01042533, "balance_loss_clip": 1.04348087, "balance_loss_mlp": 1.0256412, "epoch": 0.6488306378885348, "flos": 22930400010240.0, "grad_norm": 1.9524829348498782, "language_loss": 0.6826967, "learning_rate": 1.1599048896204787e-06, "loss": 0.70450008, "num_input_tokens_seen": 116110285, "step": 5396, "time_per_iteration": 3.5464751720428467 }, { "auxiliary_loss_clip": 0.01098501, "auxiliary_loss_mlp": 0.0104714, "balance_loss_clip": 1.04088402, "balance_loss_mlp": 1.03024793, "epoch": 0.648950880779174, "flos": 20376110010240.0, "grad_norm": 1.9789698340659596, "language_loss": 0.80863249, "learning_rate": 1.1591980349578061e-06, "loss": 0.83008897, "num_input_tokens_seen": 116128955, "step": 5397, "time_per_iteration": 2.6566178798675537 }, { "auxiliary_loss_clip": 0.01012286, "auxiliary_loss_mlp": 0.00999644, "balance_loss_clip": 1.01049519, "balance_loss_mlp": 0.99808228, "epoch": 0.649071123669813, "flos": 59930889310080.0, "grad_norm": 0.7322820214487153, "language_loss": 0.54294258, "learning_rate": 1.158491307844123e-06, "loss": 0.56306183, "num_input_tokens_seen": 116188875, "step": 5398, "time_per_iteration": 3.219891309738159 }, { "auxiliary_loss_clip": 0.01108122, "auxiliary_loss_mlp": 0.01038432, "balance_loss_clip": 1.04236126, "balance_loss_mlp": 1.02214766, "epoch": 0.6491913665604521, "flos": 20446566537600.0, "grad_norm": 1.777042544956419, "language_loss": 0.83985454, "learning_rate": 1.1577847083866387e-06, "loss": 0.86132008, "num_input_tokens_seen": 116207910, "step": 5399, "time_per_iteration": 2.633336305618286 }, { "auxiliary_loss_clip": 0.01101116, "auxiliary_loss_mlp": 0.01035714, "balance_loss_clip": 1.03976178, "balance_loss_mlp": 1.01854813, "epoch": 0.6493116094510912, "flos": 16946820702720.0, "grad_norm": 1.7951675727695657, "language_loss": 0.72511673, "learning_rate": 1.1570782366925453e-06, "loss": 0.74648505, "num_input_tokens_seen": 116226425, "step": 5400, "time_per_iteration": 2.627396583557129 }, { "auxiliary_loss_clip": 0.01112249, "auxiliary_loss_mlp": 0.01039901, "balance_loss_clip": 1.03860927, "balance_loss_mlp": 1.02352166, "epoch": 0.6494318523417303, "flos": 18802935072000.0, "grad_norm": 1.9279797715592373, "language_loss": 0.75859129, "learning_rate": 1.1563718928690132e-06, "loss": 0.78011274, "num_input_tokens_seen": 116243860, "step": 5401, "time_per_iteration": 2.6075692176818848 }, { "auxiliary_loss_clip": 0.01101548, "auxiliary_loss_mlp": 0.01048545, "balance_loss_clip": 1.04327726, "balance_loss_mlp": 1.03222561, "epoch": 0.6495520952323693, "flos": 18982847318400.0, "grad_norm": 2.471466256556771, "language_loss": 0.7135486, "learning_rate": 1.1556656770231942e-06, "loss": 0.73504955, "num_input_tokens_seen": 116260055, "step": 5402, "time_per_iteration": 2.652529001235962 }, { "auxiliary_loss_clip": 0.01121426, "auxiliary_loss_mlp": 0.01039766, "balance_loss_clip": 1.04187155, "balance_loss_mlp": 1.02387583, "epoch": 0.6496723381230085, "flos": 22745388032640.0, "grad_norm": 1.7013549341787084, "language_loss": 0.76075917, "learning_rate": 1.1549595892622207e-06, "loss": 0.78237116, "num_input_tokens_seen": 116278825, "step": 5403, "time_per_iteration": 2.5944297313690186 }, { "auxiliary_loss_clip": 0.00998706, "auxiliary_loss_mlp": 0.00998164, "balance_loss_clip": 1.01275539, "balance_loss_mlp": 0.99661392, "epoch": 0.6497925810136476, "flos": 62145283887360.0, "grad_norm": 0.8176663145836183, "language_loss": 0.58966219, "learning_rate": 1.1542536296932047e-06, "loss": 0.60963088, "num_input_tokens_seen": 116342360, "step": 5404, "time_per_iteration": 3.224644184112549 }, { "auxiliary_loss_clip": 0.01108805, "auxiliary_loss_mlp": 0.01045113, "balance_loss_clip": 1.04155731, "balance_loss_mlp": 1.02848339, "epoch": 0.6499128239042866, "flos": 20156731695360.0, "grad_norm": 1.7230921374776293, "language_loss": 0.69865078, "learning_rate": 1.1535477984232414e-06, "loss": 0.72018993, "num_input_tokens_seen": 116362235, "step": 5405, "time_per_iteration": 2.642996072769165 }, { "auxiliary_loss_clip": 0.01086522, "auxiliary_loss_mlp": 0.01038036, "balance_loss_clip": 1.03699541, "balance_loss_mlp": 1.02129865, "epoch": 0.6500330667949258, "flos": 24462420940800.0, "grad_norm": 1.9183958872843438, "language_loss": 0.77079105, "learning_rate": 1.152842095559404e-06, "loss": 0.79203665, "num_input_tokens_seen": 116382895, "step": 5406, "time_per_iteration": 2.727179765701294 }, { "auxiliary_loss_clip": 0.01116318, "auxiliary_loss_mlp": 0.01039525, "balance_loss_clip": 1.04364944, "balance_loss_mlp": 1.02306271, "epoch": 0.6501533096855648, "flos": 25477399549440.0, "grad_norm": 2.7669486018613996, "language_loss": 0.76859111, "learning_rate": 1.1521365212087474e-06, "loss": 0.79014957, "num_input_tokens_seen": 116402880, "step": 5407, "time_per_iteration": 2.6740777492523193 }, { "auxiliary_loss_clip": 0.01126802, "auxiliary_loss_mlp": 0.01038531, "balance_loss_clip": 1.0447855, "balance_loss_mlp": 1.02167439, "epoch": 0.6502735525762039, "flos": 44819245347840.0, "grad_norm": 4.530642141905626, "language_loss": 0.70807117, "learning_rate": 1.1514310754783062e-06, "loss": 0.72972453, "num_input_tokens_seen": 116425830, "step": 5408, "time_per_iteration": 2.8296401500701904 }, { "auxiliary_loss_clip": 0.01118189, "auxiliary_loss_mlp": 0.01046835, "balance_loss_clip": 1.04723048, "balance_loss_mlp": 1.02953815, "epoch": 0.6503937954668431, "flos": 28658546726400.0, "grad_norm": 2.0534239720446252, "language_loss": 0.73473138, "learning_rate": 1.1507257584750964e-06, "loss": 0.75638163, "num_input_tokens_seen": 116446010, "step": 5409, "time_per_iteration": 2.692647933959961 }, { "auxiliary_loss_clip": 0.01136456, "auxiliary_loss_mlp": 0.01047852, "balance_loss_clip": 1.04551661, "balance_loss_mlp": 1.03190756, "epoch": 0.6505140383574821, "flos": 20922562592640.0, "grad_norm": 1.868804588787123, "language_loss": 0.77329755, "learning_rate": 1.150020570306113e-06, "loss": 0.79514068, "num_input_tokens_seen": 116465150, "step": 5410, "time_per_iteration": 2.5567867755889893 }, { "auxiliary_loss_clip": 0.01107234, "auxiliary_loss_mlp": 0.01040712, "balance_loss_clip": 1.03883553, "balance_loss_mlp": 1.02320039, "epoch": 0.6506342812481212, "flos": 20595236929920.0, "grad_norm": 5.585824427428666, "language_loss": 0.74791479, "learning_rate": 1.1493155110783338e-06, "loss": 0.76939428, "num_input_tokens_seen": 116483675, "step": 5411, "time_per_iteration": 2.647003412246704 }, { "auxiliary_loss_clip": 0.01126491, "auxiliary_loss_mlp": 0.01036341, "balance_loss_clip": 1.04463983, "balance_loss_mlp": 1.02011716, "epoch": 0.6507545241387603, "flos": 30226478279040.0, "grad_norm": 3.0967921656131527, "language_loss": 0.70585644, "learning_rate": 1.1486105808987155e-06, "loss": 0.7274847, "num_input_tokens_seen": 116505165, "step": 5412, "time_per_iteration": 2.658139228820801 }, { "auxiliary_loss_clip": 0.0113278, "auxiliary_loss_mlp": 0.0104148, "balance_loss_clip": 1.04817343, "balance_loss_mlp": 1.02386689, "epoch": 0.6508747670293994, "flos": 17128241320320.0, "grad_norm": 1.9161702784764851, "language_loss": 0.81507683, "learning_rate": 1.1479057798741947e-06, "loss": 0.83681935, "num_input_tokens_seen": 116523220, "step": 5413, "time_per_iteration": 3.5779221057891846 }, { "auxiliary_loss_clip": 0.01019337, "auxiliary_loss_mlp": 0.01010543, "balance_loss_clip": 1.01381052, "balance_loss_mlp": 1.00904655, "epoch": 0.6509950099200384, "flos": 68559826573440.0, "grad_norm": 0.7851886655130874, "language_loss": 0.53262746, "learning_rate": 1.14720110811169e-06, "loss": 0.5529263, "num_input_tokens_seen": 116580450, "step": 5414, "time_per_iteration": 3.2485907077789307 }, { "auxiliary_loss_clip": 0.01128636, "auxiliary_loss_mlp": 0.01045131, "balance_loss_clip": 1.04504371, "balance_loss_mlp": 1.02838159, "epoch": 0.6511152528106776, "flos": 22347462188160.0, "grad_norm": 1.896136494288712, "language_loss": 0.76871932, "learning_rate": 1.146496565718098e-06, "loss": 0.79045701, "num_input_tokens_seen": 116601020, "step": 5415, "time_per_iteration": 3.5848546028137207 }, { "auxiliary_loss_clip": 0.01110713, "auxiliary_loss_mlp": 0.01039267, "balance_loss_clip": 1.04291618, "balance_loss_mlp": 1.02218437, "epoch": 0.6512354957013167, "flos": 20522158709760.0, "grad_norm": 2.047006835383282, "language_loss": 0.76021308, "learning_rate": 1.1457921528002996e-06, "loss": 0.78171289, "num_input_tokens_seen": 116619455, "step": 5416, "time_per_iteration": 2.674426555633545 }, { "auxiliary_loss_clip": 0.01139734, "auxiliary_loss_mlp": 0.00772673, "balance_loss_clip": 1.04768443, "balance_loss_mlp": 1.00042999, "epoch": 0.6513557385919557, "flos": 32337342881280.0, "grad_norm": 2.4619867751602587, "language_loss": 0.71965516, "learning_rate": 1.1450878694651522e-06, "loss": 0.73877919, "num_input_tokens_seen": 116640020, "step": 5417, "time_per_iteration": 2.6714682579040527 }, { "auxiliary_loss_clip": 0.01086334, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.03894711, "balance_loss_mlp": 1.01919222, "epoch": 0.6514759814825949, "flos": 12093206417280.0, "grad_norm": 2.487675758160298, "language_loss": 0.63540065, "learning_rate": 1.1443837158194954e-06, "loss": 0.65661621, "num_input_tokens_seen": 116655165, "step": 5418, "time_per_iteration": 2.709717273712158 }, { "auxiliary_loss_clip": 0.01096982, "auxiliary_loss_mlp": 0.01034843, "balance_loss_clip": 1.04358745, "balance_loss_mlp": 1.01833272, "epoch": 0.651596224373234, "flos": 22526907557760.0, "grad_norm": 1.636453187936156, "language_loss": 0.74665415, "learning_rate": 1.1436796919701484e-06, "loss": 0.76797241, "num_input_tokens_seen": 116673880, "step": 5419, "time_per_iteration": 2.6819028854370117 }, { "auxiliary_loss_clip": 0.0110829, "auxiliary_loss_mlp": 0.01034122, "balance_loss_clip": 1.0431993, "balance_loss_mlp": 1.01759911, "epoch": 0.651716467263873, "flos": 27818955250560.0, "grad_norm": 1.8681067056333456, "language_loss": 0.61680186, "learning_rate": 1.1429757980239115e-06, "loss": 0.63822603, "num_input_tokens_seen": 116694305, "step": 5420, "time_per_iteration": 3.728898525238037 }, { "auxiliary_loss_clip": 0.01143362, "auxiliary_loss_mlp": 0.01052796, "balance_loss_clip": 1.04619133, "balance_loss_mlp": 1.0340687, "epoch": 0.6518367101545122, "flos": 24316300414080.0, "grad_norm": 5.47910812320327, "language_loss": 0.81952107, "learning_rate": 1.1422720340875636e-06, "loss": 0.84148264, "num_input_tokens_seen": 116713055, "step": 5421, "time_per_iteration": 2.601536989212036 }, { "auxiliary_loss_clip": 0.01130513, "auxiliary_loss_mlp": 0.01039646, "balance_loss_clip": 1.04454398, "balance_loss_mlp": 1.02293253, "epoch": 0.6519569530451512, "flos": 20011939971840.0, "grad_norm": 4.102253556178004, "language_loss": 0.79568052, "learning_rate": 1.1415684002678671e-06, "loss": 0.8173821, "num_input_tokens_seen": 116731815, "step": 5422, "time_per_iteration": 3.5449130535125732 }, { "auxiliary_loss_clip": 0.01114932, "auxiliary_loss_mlp": 0.01044959, "balance_loss_clip": 1.04077101, "balance_loss_mlp": 1.0264225, "epoch": 0.6520771959357903, "flos": 21576064682880.0, "grad_norm": 5.090458199040746, "language_loss": 0.77422208, "learning_rate": 1.1408648966715617e-06, "loss": 0.79582095, "num_input_tokens_seen": 116749335, "step": 5423, "time_per_iteration": 2.6137566566467285 }, { "auxiliary_loss_clip": 0.01115436, "auxiliary_loss_mlp": 0.01041606, "balance_loss_clip": 1.04210901, "balance_loss_mlp": 1.02402282, "epoch": 0.6521974388264293, "flos": 22711021695360.0, "grad_norm": 2.1821803154793433, "language_loss": 0.72965968, "learning_rate": 1.1401615234053683e-06, "loss": 0.75123006, "num_input_tokens_seen": 116768155, "step": 5424, "time_per_iteration": 2.634207248687744 }, { "auxiliary_loss_clip": 0.01113842, "auxiliary_loss_mlp": 0.01043019, "balance_loss_clip": 1.04119301, "balance_loss_mlp": 1.02554286, "epoch": 0.6523176817170685, "flos": 23002939526400.0, "grad_norm": 2.1832649241488635, "language_loss": 0.75848019, "learning_rate": 1.1394582805759885e-06, "loss": 0.78004873, "num_input_tokens_seen": 116787435, "step": 5425, "time_per_iteration": 2.6601321697235107 }, { "auxiliary_loss_clip": 0.01128011, "auxiliary_loss_mlp": 0.01039754, "balance_loss_clip": 1.04539967, "balance_loss_mlp": 1.02341092, "epoch": 0.6524379246077076, "flos": 21688249835520.0, "grad_norm": 2.597135420139158, "language_loss": 0.75790524, "learning_rate": 1.1387551682901022e-06, "loss": 0.77958286, "num_input_tokens_seen": 116808040, "step": 5426, "time_per_iteration": 2.610429525375366 }, { "auxiliary_loss_clip": 0.01100206, "auxiliary_loss_mlp": 0.01041194, "balance_loss_clip": 1.04204714, "balance_loss_mlp": 1.02388525, "epoch": 0.6525581674983466, "flos": 19390936711680.0, "grad_norm": 2.6573776100062583, "language_loss": 0.70821702, "learning_rate": 1.138052186654373e-06, "loss": 0.72963095, "num_input_tokens_seen": 116825510, "step": 5427, "time_per_iteration": 2.692837953567505 }, { "auxiliary_loss_clip": 0.0111621, "auxiliary_loss_mlp": 0.01041609, "balance_loss_clip": 1.04397678, "balance_loss_mlp": 1.02342916, "epoch": 0.6526784103889858, "flos": 17165444832000.0, "grad_norm": 2.84315781181487, "language_loss": 0.88129807, "learning_rate": 1.1373493357754417e-06, "loss": 0.90287626, "num_input_tokens_seen": 116844415, "step": 5428, "time_per_iteration": 2.607825517654419 }, { "auxiliary_loss_clip": 0.01135369, "auxiliary_loss_mlp": 0.01039576, "balance_loss_clip": 1.0450114, "balance_loss_mlp": 1.02407885, "epoch": 0.6527986532796248, "flos": 18989168112000.0, "grad_norm": 1.8523576417385077, "language_loss": 0.77592874, "learning_rate": 1.1366466157599303e-06, "loss": 0.79767817, "num_input_tokens_seen": 116863690, "step": 5429, "time_per_iteration": 2.511910915374756 }, { "auxiliary_loss_clip": 0.01088138, "auxiliary_loss_mlp": 0.00774302, "balance_loss_clip": 1.03968036, "balance_loss_mlp": 1.00050712, "epoch": 0.6529188961702639, "flos": 14238581011200.0, "grad_norm": 2.0651814228232066, "language_loss": 0.76222306, "learning_rate": 1.1359440267144412e-06, "loss": 0.78084743, "num_input_tokens_seen": 116881145, "step": 5430, "time_per_iteration": 2.6547751426696777 }, { "auxiliary_loss_clip": 0.01126981, "auxiliary_loss_mlp": 0.01039013, "balance_loss_clip": 1.04524517, "balance_loss_mlp": 1.02270484, "epoch": 0.653039139060903, "flos": 36682929158400.0, "grad_norm": 3.8398702268481255, "language_loss": 0.74439639, "learning_rate": 1.1352415687455556e-06, "loss": 0.7660563, "num_input_tokens_seen": 116902405, "step": 5431, "time_per_iteration": 2.7391228675842285 }, { "auxiliary_loss_clip": 0.01128385, "auxiliary_loss_mlp": 0.01041411, "balance_loss_clip": 1.04633999, "balance_loss_mlp": 1.02323163, "epoch": 0.6531593819515421, "flos": 25376275785600.0, "grad_norm": 2.233315728202343, "language_loss": 0.63868445, "learning_rate": 1.1345392419598362e-06, "loss": 0.66038239, "num_input_tokens_seen": 116921285, "step": 5432, "time_per_iteration": 2.676996946334839 }, { "auxiliary_loss_clip": 0.01119033, "auxiliary_loss_mlp": 0.01047298, "balance_loss_clip": 1.0419426, "balance_loss_mlp": 1.02979827, "epoch": 0.6532796248421812, "flos": 21178533888000.0, "grad_norm": 1.7859633350441746, "language_loss": 0.72038573, "learning_rate": 1.1338370464638263e-06, "loss": 0.74204904, "num_input_tokens_seen": 116940685, "step": 5433, "time_per_iteration": 2.587662935256958 }, { "auxiliary_loss_clip": 0.01135966, "auxiliary_loss_mlp": 0.01038483, "balance_loss_clip": 1.04491496, "balance_loss_mlp": 1.02080441, "epoch": 0.6533998677328203, "flos": 17675950878720.0, "grad_norm": 2.619094453944798, "language_loss": 0.63927609, "learning_rate": 1.1331349823640474e-06, "loss": 0.66102058, "num_input_tokens_seen": 116958115, "step": 5434, "time_per_iteration": 2.5446460247039795 }, { "auxiliary_loss_clip": 0.01127583, "auxiliary_loss_mlp": 0.0077229, "balance_loss_clip": 1.04514933, "balance_loss_mlp": 1.00037813, "epoch": 0.6535201106234594, "flos": 28400384701440.0, "grad_norm": 2.9182968215509213, "language_loss": 0.77926862, "learning_rate": 1.132433049767003e-06, "loss": 0.79826736, "num_input_tokens_seen": 116976030, "step": 5435, "time_per_iteration": 2.675759792327881 }, { "auxiliary_loss_clip": 0.01110576, "auxiliary_loss_mlp": 0.01038639, "balance_loss_clip": 1.04338932, "balance_loss_mlp": 1.02286148, "epoch": 0.6536403535140984, "flos": 23586667447680.0, "grad_norm": 1.5546551512599094, "language_loss": 0.8117491, "learning_rate": 1.1317312487791748e-06, "loss": 0.83324122, "num_input_tokens_seen": 116997680, "step": 5436, "time_per_iteration": 2.6646568775177 }, { "auxiliary_loss_clip": 0.01119689, "auxiliary_loss_mlp": 0.01043627, "balance_loss_clip": 1.0419718, "balance_loss_mlp": 1.02634192, "epoch": 0.6537605964047376, "flos": 21579476474880.0, "grad_norm": 1.8374298797658457, "language_loss": 0.7264986, "learning_rate": 1.1310295795070253e-06, "loss": 0.74813175, "num_input_tokens_seen": 117017620, "step": 5437, "time_per_iteration": 2.5876517295837402 }, { "auxiliary_loss_clip": 0.01087593, "auxiliary_loss_mlp": 0.01039643, "balance_loss_clip": 1.03998828, "balance_loss_mlp": 1.0233587, "epoch": 0.6538808392953767, "flos": 26833997433600.0, "grad_norm": 1.814662841294308, "language_loss": 0.8112883, "learning_rate": 1.1303280420569982e-06, "loss": 0.8325606, "num_input_tokens_seen": 117039505, "step": 5438, "time_per_iteration": 2.746575117111206 }, { "auxiliary_loss_clip": 0.01121586, "auxiliary_loss_mlp": 0.01046333, "balance_loss_clip": 1.04395628, "balance_loss_mlp": 1.029966, "epoch": 0.6540010821860157, "flos": 30738241301760.0, "grad_norm": 1.7877608223620176, "language_loss": 0.77435434, "learning_rate": 1.1296266365355158e-06, "loss": 0.7960335, "num_input_tokens_seen": 117062890, "step": 5439, "time_per_iteration": 3.633251905441284 }, { "auxiliary_loss_clip": 0.01103818, "auxiliary_loss_mlp": 0.0103975, "balance_loss_clip": 1.0413723, "balance_loss_mlp": 1.02321553, "epoch": 0.6541213250766549, "flos": 26907147480960.0, "grad_norm": 3.3494580382154147, "language_loss": 0.73686039, "learning_rate": 1.1289253630489806e-06, "loss": 0.75829607, "num_input_tokens_seen": 117083940, "step": 5440, "time_per_iteration": 3.652631998062134 }, { "auxiliary_loss_clip": 0.01131349, "auxiliary_loss_mlp": 0.01049356, "balance_loss_clip": 1.04335523, "balance_loss_mlp": 1.03121185, "epoch": 0.6542415679672939, "flos": 19172384409600.0, "grad_norm": 2.9577635938100664, "language_loss": 0.72167349, "learning_rate": 1.1282242217037753e-06, "loss": 0.74348056, "num_input_tokens_seen": 117101440, "step": 5441, "time_per_iteration": 2.5655765533447266 }, { "auxiliary_loss_clip": 0.01083508, "auxiliary_loss_mlp": 0.01045188, "balance_loss_clip": 1.03699017, "balance_loss_mlp": 1.02510166, "epoch": 0.654361810857933, "flos": 48173517100800.0, "grad_norm": 2.287381816348268, "language_loss": 0.61503243, "learning_rate": 1.127523212606262e-06, "loss": 0.6363194, "num_input_tokens_seen": 117124265, "step": 5442, "time_per_iteration": 2.94014310836792 }, { "auxiliary_loss_clip": 0.01122409, "auxiliary_loss_mlp": 0.01034824, "balance_loss_clip": 1.04328942, "balance_loss_mlp": 1.0200541, "epoch": 0.6544820537485722, "flos": 26943165843840.0, "grad_norm": 1.5550741426594956, "language_loss": 0.73047817, "learning_rate": 1.1268223358627835e-06, "loss": 0.75205046, "num_input_tokens_seen": 117146755, "step": 5443, "time_per_iteration": 2.6699533462524414 }, { "auxiliary_loss_clip": 0.01138138, "auxiliary_loss_mlp": 0.01042291, "balance_loss_clip": 1.0456028, "balance_loss_mlp": 1.02613807, "epoch": 0.6546022966392112, "flos": 20886328748160.0, "grad_norm": 2.0023387902916383, "language_loss": 0.72011244, "learning_rate": 1.126121591579663e-06, "loss": 0.74191678, "num_input_tokens_seen": 117165960, "step": 5444, "time_per_iteration": 2.57580828666687 }, { "auxiliary_loss_clip": 0.01119613, "auxiliary_loss_mlp": 0.01041501, "balance_loss_clip": 1.04326737, "balance_loss_mlp": 1.02564621, "epoch": 0.6547225395298503, "flos": 24936693143040.0, "grad_norm": 1.5884632274925836, "language_loss": 0.68827641, "learning_rate": 1.1254209798632018e-06, "loss": 0.70988756, "num_input_tokens_seen": 117186980, "step": 5445, "time_per_iteration": 2.613081932067871 }, { "auxiliary_loss_clip": 0.01063351, "auxiliary_loss_mlp": 0.01037808, "balance_loss_clip": 1.03652191, "balance_loss_mlp": 1.02145219, "epoch": 0.6548427824204894, "flos": 22565942663040.0, "grad_norm": 1.6467867009647925, "language_loss": 0.8471545, "learning_rate": 1.124720500819683e-06, "loss": 0.86816609, "num_input_tokens_seen": 117205135, "step": 5446, "time_per_iteration": 3.7564496994018555 }, { "auxiliary_loss_clip": 0.01139504, "auxiliary_loss_mlp": 0.01038634, "balance_loss_clip": 1.04715061, "balance_loss_mlp": 1.02211213, "epoch": 0.6549630253111285, "flos": 18442500048000.0, "grad_norm": 2.374725429734513, "language_loss": 0.82624549, "learning_rate": 1.1240201545553682e-06, "loss": 0.84802687, "num_input_tokens_seen": 117222935, "step": 5447, "time_per_iteration": 3.4731860160827637 }, { "auxiliary_loss_clip": 0.01101955, "auxiliary_loss_mlp": 0.01044696, "balance_loss_clip": 1.04201591, "balance_loss_mlp": 1.02743447, "epoch": 0.6550832682017675, "flos": 25187313312000.0, "grad_norm": 2.4947306506899496, "language_loss": 0.73340702, "learning_rate": 1.1233199411764987e-06, "loss": 0.75487351, "num_input_tokens_seen": 117242370, "step": 5448, "time_per_iteration": 2.7250559329986572 }, { "auxiliary_loss_clip": 0.01088528, "auxiliary_loss_mlp": 0.01044417, "balance_loss_clip": 1.03809118, "balance_loss_mlp": 1.02634525, "epoch": 0.6552035110924067, "flos": 22748153379840.0, "grad_norm": 2.5009794793865074, "language_loss": 0.69143128, "learning_rate": 1.1226198607892978e-06, "loss": 0.71276075, "num_input_tokens_seen": 117262930, "step": 5449, "time_per_iteration": 2.6807281970977783 }, { "auxiliary_loss_clip": 0.01086534, "auxiliary_loss_mlp": 0.01033216, "balance_loss_clip": 1.03936183, "balance_loss_mlp": 1.01729512, "epoch": 0.6553237539830458, "flos": 21799178012160.0, "grad_norm": 1.750266470244621, "language_loss": 0.79931062, "learning_rate": 1.1219199134999664e-06, "loss": 0.82050812, "num_input_tokens_seen": 117281430, "step": 5450, "time_per_iteration": 2.7096099853515625 }, { "auxiliary_loss_clip": 0.01112092, "auxiliary_loss_mlp": 0.01044781, "balance_loss_clip": 1.04216313, "balance_loss_mlp": 1.02556419, "epoch": 0.6554439968736848, "flos": 20887226588160.0, "grad_norm": 1.9188751010131604, "language_loss": 0.78699178, "learning_rate": 1.1212200994146863e-06, "loss": 0.80856049, "num_input_tokens_seen": 117299185, "step": 5451, "time_per_iteration": 2.6225526332855225 }, { "auxiliary_loss_clip": 0.01095971, "auxiliary_loss_mlp": 0.01043325, "balance_loss_clip": 1.0384624, "balance_loss_mlp": 1.0266, "epoch": 0.655564239764324, "flos": 16139045698560.0, "grad_norm": 1.860888565397169, "language_loss": 0.75724542, "learning_rate": 1.120520418639618e-06, "loss": 0.77863842, "num_input_tokens_seen": 117317720, "step": 5452, "time_per_iteration": 2.6676676273345947 }, { "auxiliary_loss_clip": 0.01127144, "auxiliary_loss_mlp": 0.01047368, "balance_loss_clip": 1.04503727, "balance_loss_mlp": 1.0318234, "epoch": 0.655684482654963, "flos": 29570354496000.0, "grad_norm": 1.7918778036658232, "language_loss": 0.83412826, "learning_rate": 1.119820871280903e-06, "loss": 0.85587341, "num_input_tokens_seen": 117338795, "step": 5453, "time_per_iteration": 2.684390068054199 }, { "auxiliary_loss_clip": 0.01121066, "auxiliary_loss_mlp": 0.01041248, "balance_loss_clip": 1.041996, "balance_loss_mlp": 1.02566683, "epoch": 0.6558047255456021, "flos": 29789409588480.0, "grad_norm": 1.9616459931164796, "language_loss": 0.73379034, "learning_rate": 1.1191214574446614e-06, "loss": 0.75541347, "num_input_tokens_seen": 117359040, "step": 5454, "time_per_iteration": 2.6817469596862793 }, { "auxiliary_loss_clip": 0.01107399, "auxiliary_loss_mlp": 0.01039477, "balance_loss_clip": 1.0403564, "balance_loss_mlp": 1.02196491, "epoch": 0.6559249684362413, "flos": 29059166090880.0, "grad_norm": 1.55789397031536, "language_loss": 0.79821801, "learning_rate": 1.118422177236995e-06, "loss": 0.81968677, "num_input_tokens_seen": 117380865, "step": 5455, "time_per_iteration": 2.825064182281494 }, { "auxiliary_loss_clip": 0.01114268, "auxiliary_loss_mlp": 0.01043835, "balance_loss_clip": 1.04232967, "balance_loss_mlp": 1.0263226, "epoch": 0.6560452113268803, "flos": 20225464369920.0, "grad_norm": 2.170687447361138, "language_loss": 0.85879362, "learning_rate": 1.1177230307639835e-06, "loss": 0.88037467, "num_input_tokens_seen": 117398405, "step": 5456, "time_per_iteration": 2.6373698711395264 }, { "auxiliary_loss_clip": 0.01097052, "auxiliary_loss_mlp": 0.0104337, "balance_loss_clip": 1.04027557, "balance_loss_mlp": 1.02634668, "epoch": 0.6561654542175194, "flos": 25045538330880.0, "grad_norm": 1.668664704998868, "language_loss": 0.78641933, "learning_rate": 1.1170240181316865e-06, "loss": 0.80782354, "num_input_tokens_seen": 117419850, "step": 5457, "time_per_iteration": 2.697270393371582 }, { "auxiliary_loss_clip": 0.01095315, "auxiliary_loss_mlp": 0.0104844, "balance_loss_clip": 1.03801668, "balance_loss_mlp": 1.030797, "epoch": 0.6562856971081584, "flos": 22856711258880.0, "grad_norm": 2.0693377598162614, "language_loss": 0.79185891, "learning_rate": 1.1163251394461442e-06, "loss": 0.81329644, "num_input_tokens_seen": 117438330, "step": 5458, "time_per_iteration": 2.6339313983917236 }, { "auxiliary_loss_clip": 0.01121705, "auxiliary_loss_mlp": 0.01046422, "balance_loss_clip": 1.04390264, "balance_loss_mlp": 1.02899361, "epoch": 0.6564059399987976, "flos": 18872565586560.0, "grad_norm": 2.0915402259115172, "language_loss": 0.82122779, "learning_rate": 1.1156263948133746e-06, "loss": 0.84290904, "num_input_tokens_seen": 117454985, "step": 5459, "time_per_iteration": 2.6036903858184814 }, { "auxiliary_loss_clip": 0.01082749, "auxiliary_loss_mlp": 0.00773631, "balance_loss_clip": 1.03935218, "balance_loss_mlp": 1.00037146, "epoch": 0.6565261828894366, "flos": 25484187219840.0, "grad_norm": 1.8863968908614774, "language_loss": 0.77480489, "learning_rate": 1.1149277843393787e-06, "loss": 0.7933687, "num_input_tokens_seen": 117476145, "step": 5460, "time_per_iteration": 2.757786989212036 }, { "auxiliary_loss_clip": 0.01070753, "auxiliary_loss_mlp": 0.00774006, "balance_loss_clip": 1.0362196, "balance_loss_mlp": 1.00040507, "epoch": 0.6566464257800757, "flos": 19683500987520.0, "grad_norm": 2.71869946508877, "language_loss": 0.63269401, "learning_rate": 1.1142293081301342e-06, "loss": 0.65114158, "num_input_tokens_seen": 117494025, "step": 5461, "time_per_iteration": 2.715439796447754 }, { "auxiliary_loss_clip": 0.01106332, "auxiliary_loss_mlp": 0.01040347, "balance_loss_clip": 1.04122949, "balance_loss_mlp": 1.02524328, "epoch": 0.6567666686707149, "flos": 23514127931520.0, "grad_norm": 1.6376233625905874, "language_loss": 0.68050981, "learning_rate": 1.1135309662915995e-06, "loss": 0.7019766, "num_input_tokens_seen": 117514190, "step": 5462, "time_per_iteration": 2.6634888648986816 }, { "auxiliary_loss_clip": 0.01092886, "auxiliary_loss_mlp": 0.01037746, "balance_loss_clip": 1.04106462, "balance_loss_mlp": 1.02092564, "epoch": 0.6568869115613539, "flos": 32781342896640.0, "grad_norm": 2.163129819130376, "language_loss": 0.60327291, "learning_rate": 1.112832758929712e-06, "loss": 0.62457919, "num_input_tokens_seen": 117536800, "step": 5463, "time_per_iteration": 2.8260881900787354 }, { "auxiliary_loss_clip": 0.01119713, "auxiliary_loss_mlp": 0.01036006, "balance_loss_clip": 1.04230821, "balance_loss_mlp": 1.01986504, "epoch": 0.657007154451993, "flos": 18442428220800.0, "grad_norm": 3.43427328413281, "language_loss": 0.74659228, "learning_rate": 1.11213468615039e-06, "loss": 0.7681495, "num_input_tokens_seen": 117556230, "step": 5464, "time_per_iteration": 2.573951244354248 }, { "auxiliary_loss_clip": 0.01069454, "auxiliary_loss_mlp": 0.01050353, "balance_loss_clip": 1.03659964, "balance_loss_mlp": 1.03323448, "epoch": 0.6571273973426321, "flos": 25156717902720.0, "grad_norm": 1.599305882566697, "language_loss": 0.75236785, "learning_rate": 1.1114367480595292e-06, "loss": 0.77356595, "num_input_tokens_seen": 117577310, "step": 5465, "time_per_iteration": 3.787233352661133 }, { "auxiliary_loss_clip": 0.01074205, "auxiliary_loss_mlp": 0.01046008, "balance_loss_clip": 1.03932667, "balance_loss_mlp": 1.02805495, "epoch": 0.6572476402332712, "flos": 17529830352000.0, "grad_norm": 2.1159852266308596, "language_loss": 0.81305516, "learning_rate": 1.1107389447630086e-06, "loss": 0.83425725, "num_input_tokens_seen": 117596010, "step": 5466, "time_per_iteration": 3.663193941116333 }, { "auxiliary_loss_clip": 0.01107026, "auxiliary_loss_mlp": 0.0077491, "balance_loss_clip": 1.03955126, "balance_loss_mlp": 1.00029874, "epoch": 0.6573678831239103, "flos": 17014260487680.0, "grad_norm": 2.201325128311343, "language_loss": 0.78404784, "learning_rate": 1.1100412763666818e-06, "loss": 0.80286717, "num_input_tokens_seen": 117611270, "step": 5467, "time_per_iteration": 2.6574294567108154 }, { "auxiliary_loss_clip": 0.01113367, "auxiliary_loss_mlp": 0.01042586, "balance_loss_clip": 1.04320347, "balance_loss_mlp": 1.02648103, "epoch": 0.6574881260145494, "flos": 23910078528000.0, "grad_norm": 1.5411127107920837, "language_loss": 0.79930592, "learning_rate": 1.1093437429763865e-06, "loss": 0.82086545, "num_input_tokens_seen": 117631535, "step": 5468, "time_per_iteration": 2.643112897872925 }, { "auxiliary_loss_clip": 0.01124037, "auxiliary_loss_mlp": 0.01042102, "balance_loss_clip": 1.04473782, "balance_loss_mlp": 1.02462578, "epoch": 0.6576083689051885, "flos": 11218458504960.0, "grad_norm": 2.001006677400307, "language_loss": 0.73828197, "learning_rate": 1.1086463446979361e-06, "loss": 0.75994343, "num_input_tokens_seen": 117649885, "step": 5469, "time_per_iteration": 2.6047186851501465 }, { "auxiliary_loss_clip": 0.01127731, "auxiliary_loss_mlp": 0.01042769, "balance_loss_clip": 1.0462935, "balance_loss_mlp": 1.02627087, "epoch": 0.6577286117958275, "flos": 22455553190400.0, "grad_norm": 1.8670369650283332, "language_loss": 0.77196443, "learning_rate": 1.1079490816371277e-06, "loss": 0.79366946, "num_input_tokens_seen": 117669650, "step": 5470, "time_per_iteration": 2.627612352371216 }, { "auxiliary_loss_clip": 0.01126231, "auxiliary_loss_mlp": 0.00772673, "balance_loss_clip": 1.04268861, "balance_loss_mlp": 1.00037003, "epoch": 0.6578488546864667, "flos": 21872184405120.0, "grad_norm": 2.4801776180795816, "language_loss": 0.74639958, "learning_rate": 1.1072519538997352e-06, "loss": 0.76538861, "num_input_tokens_seen": 117688790, "step": 5471, "time_per_iteration": 2.625826120376587 }, { "auxiliary_loss_clip": 0.01114084, "auxiliary_loss_mlp": 0.01041687, "balance_loss_clip": 1.04091454, "balance_loss_mlp": 1.02514029, "epoch": 0.6579690975771058, "flos": 23543753673600.0, "grad_norm": 2.469788794675979, "language_loss": 0.82579923, "learning_rate": 1.1065549615915095e-06, "loss": 0.84735698, "num_input_tokens_seen": 117708620, "step": 5472, "time_per_iteration": 3.637310743331909 }, { "auxiliary_loss_clip": 0.01125223, "auxiliary_loss_mlp": 0.01043446, "balance_loss_clip": 1.04743457, "balance_loss_mlp": 1.02704906, "epoch": 0.6580893404677448, "flos": 32743995730560.0, "grad_norm": 2.3023707430171774, "language_loss": 0.78584248, "learning_rate": 1.105858104818187e-06, "loss": 0.80752921, "num_input_tokens_seen": 117729775, "step": 5473, "time_per_iteration": 2.7177248001098633 }, { "auxiliary_loss_clip": 0.0112961, "auxiliary_loss_mlp": 0.01038723, "balance_loss_clip": 1.04577231, "balance_loss_mlp": 1.02144945, "epoch": 0.658209583358384, "flos": 15888138220800.0, "grad_norm": 14.31849989669908, "language_loss": 0.74867594, "learning_rate": 1.105161383685478e-06, "loss": 0.77035922, "num_input_tokens_seen": 117746160, "step": 5474, "time_per_iteration": 3.517369031906128 }, { "auxiliary_loss_clip": 0.01010375, "auxiliary_loss_mlp": 0.0100034, "balance_loss_clip": 1.01052165, "balance_loss_mlp": 0.99895096, "epoch": 0.658329826249023, "flos": 62695902447360.0, "grad_norm": 0.745393810030867, "language_loss": 0.56362802, "learning_rate": 1.1044647982990771e-06, "loss": 0.58373517, "num_input_tokens_seen": 117808045, "step": 5475, "time_per_iteration": 3.20784592628479 }, { "auxiliary_loss_clip": 0.011155, "auxiliary_loss_mlp": 0.01047469, "balance_loss_clip": 1.04280484, "balance_loss_mlp": 1.03057718, "epoch": 0.6584500691396621, "flos": 31722624501120.0, "grad_norm": 2.5577953276096594, "language_loss": 0.6478008, "learning_rate": 1.1037683487646536e-06, "loss": 0.66943049, "num_input_tokens_seen": 117828330, "step": 5476, "time_per_iteration": 2.7312920093536377 }, { "auxiliary_loss_clip": 0.01109912, "auxiliary_loss_mlp": 0.0077255, "balance_loss_clip": 1.04309976, "balance_loss_mlp": 1.0004034, "epoch": 0.6585703120303013, "flos": 18406086635520.0, "grad_norm": 4.146996575790423, "language_loss": 0.76921177, "learning_rate": 1.1030720351878583e-06, "loss": 0.78803635, "num_input_tokens_seen": 117846450, "step": 5477, "time_per_iteration": 2.642855644226074 }, { "auxiliary_loss_clip": 0.01024693, "auxiliary_loss_mlp": 0.01006439, "balance_loss_clip": 1.01424098, "balance_loss_mlp": 1.00490117, "epoch": 0.6586905549209403, "flos": 58309880434560.0, "grad_norm": 0.8117993043727196, "language_loss": 0.57633436, "learning_rate": 1.102375857674323e-06, "loss": 0.59664571, "num_input_tokens_seen": 117908365, "step": 5478, "time_per_iteration": 3.205202102661133 }, { "auxiliary_loss_clip": 0.01109624, "auxiliary_loss_mlp": 0.0103547, "balance_loss_clip": 1.04034626, "balance_loss_mlp": 1.0199846, "epoch": 0.6588107978115794, "flos": 22782627457920.0, "grad_norm": 1.7365583279187018, "language_loss": 0.90794289, "learning_rate": 1.1016798163296561e-06, "loss": 0.92939383, "num_input_tokens_seen": 117927565, "step": 5479, "time_per_iteration": 2.663067102432251 }, { "auxiliary_loss_clip": 0.011284, "auxiliary_loss_mlp": 0.01036508, "balance_loss_clip": 1.04691339, "balance_loss_mlp": 1.02006912, "epoch": 0.6589310407022185, "flos": 20667525050880.0, "grad_norm": 2.00427447915834, "language_loss": 0.66343486, "learning_rate": 1.1009839112594471e-06, "loss": 0.68508393, "num_input_tokens_seen": 117945590, "step": 5480, "time_per_iteration": 2.624814033508301 }, { "auxiliary_loss_clip": 0.0112548, "auxiliary_loss_mlp": 0.01039513, "balance_loss_clip": 1.04464221, "balance_loss_mlp": 1.02257323, "epoch": 0.6590512835928576, "flos": 25630595055360.0, "grad_norm": 2.4108935912817824, "language_loss": 0.71939611, "learning_rate": 1.1002881425692638e-06, "loss": 0.74104607, "num_input_tokens_seen": 117966020, "step": 5481, "time_per_iteration": 2.6655309200286865 }, { "auxiliary_loss_clip": 0.01119191, "auxiliary_loss_mlp": 0.01045195, "balance_loss_clip": 1.04133868, "balance_loss_mlp": 1.02732563, "epoch": 0.6591715264834966, "flos": 23726108044800.0, "grad_norm": 2.03287538380034, "language_loss": 0.75564981, "learning_rate": 1.0995925103646532e-06, "loss": 0.77729368, "num_input_tokens_seen": 117984620, "step": 5482, "time_per_iteration": 2.61171555519104 }, { "auxiliary_loss_clip": 0.0109806, "auxiliary_loss_mlp": 0.01042755, "balance_loss_clip": 1.04374456, "balance_loss_mlp": 1.02573824, "epoch": 0.6592917693741358, "flos": 35773850822400.0, "grad_norm": 1.8362060144400012, "language_loss": 0.6672585, "learning_rate": 1.0988970147511437e-06, "loss": 0.68866664, "num_input_tokens_seen": 118006500, "step": 5483, "time_per_iteration": 2.834758996963501 }, { "auxiliary_loss_clip": 0.01114483, "auxiliary_loss_mlp": 0.01040282, "balance_loss_clip": 1.04434526, "balance_loss_mlp": 1.02397394, "epoch": 0.6594120122647749, "flos": 21396834794880.0, "grad_norm": 2.449945834605205, "language_loss": 0.80318475, "learning_rate": 1.0982016558342405e-06, "loss": 0.82473242, "num_input_tokens_seen": 118025470, "step": 5484, "time_per_iteration": 2.6500747203826904 }, { "auxiliary_loss_clip": 0.01138173, "auxiliary_loss_mlp": 0.01046378, "balance_loss_clip": 1.04504681, "balance_loss_mlp": 1.03007603, "epoch": 0.6595322551554139, "flos": 19351829779200.0, "grad_norm": 1.9710119314534693, "language_loss": 0.71137249, "learning_rate": 1.0975064337194291e-06, "loss": 0.73321795, "num_input_tokens_seen": 118043515, "step": 5485, "time_per_iteration": 2.594644069671631 }, { "auxiliary_loss_clip": 0.01093596, "auxiliary_loss_mlp": 0.01049717, "balance_loss_clip": 1.04144764, "balance_loss_mlp": 1.03282523, "epoch": 0.6596524980460531, "flos": 16837113588480.0, "grad_norm": 1.566857013401716, "language_loss": 0.70398813, "learning_rate": 1.0968113485121743e-06, "loss": 0.72542131, "num_input_tokens_seen": 118063105, "step": 5486, "time_per_iteration": 2.686063766479492 }, { "auxiliary_loss_clip": 0.01124995, "auxiliary_loss_mlp": 0.00773978, "balance_loss_clip": 1.04259443, "balance_loss_mlp": 1.00035691, "epoch": 0.6597727409366921, "flos": 21798567480960.0, "grad_norm": 2.077617244696172, "language_loss": 0.80251718, "learning_rate": 1.0961164003179185e-06, "loss": 0.82150698, "num_input_tokens_seen": 118081615, "step": 5487, "time_per_iteration": 2.633327007293701 }, { "auxiliary_loss_clip": 0.01098878, "auxiliary_loss_mlp": 0.01041939, "balance_loss_clip": 1.04093003, "balance_loss_mlp": 1.02411699, "epoch": 0.6598929838273312, "flos": 23730704985600.0, "grad_norm": 2.035707046037248, "language_loss": 0.84174341, "learning_rate": 1.0954215892420884e-06, "loss": 0.86315155, "num_input_tokens_seen": 118102315, "step": 5488, "time_per_iteration": 2.6885061264038086 }, { "auxiliary_loss_clip": 0.01104749, "auxiliary_loss_mlp": 0.01041047, "balance_loss_clip": 1.04230928, "balance_loss_mlp": 1.02455986, "epoch": 0.6600132267179702, "flos": 19974520978560.0, "grad_norm": 1.76310870283351, "language_loss": 0.70604074, "learning_rate": 1.094726915390082e-06, "loss": 0.72749871, "num_input_tokens_seen": 118120650, "step": 5489, "time_per_iteration": 2.6650917530059814 }, { "auxiliary_loss_clip": 0.01124619, "auxiliary_loss_mlp": 0.01036603, "balance_loss_clip": 1.04363847, "balance_loss_mlp": 1.02027082, "epoch": 0.6601334696086094, "flos": 22342649765760.0, "grad_norm": 1.896184600784266, "language_loss": 0.69783473, "learning_rate": 1.0940323788672836e-06, "loss": 0.71944696, "num_input_tokens_seen": 118139825, "step": 5490, "time_per_iteration": 2.6106643676757812 }, { "auxiliary_loss_clip": 0.01120391, "auxiliary_loss_mlp": 0.01030274, "balance_loss_clip": 1.0443809, "balance_loss_mlp": 1.01551628, "epoch": 0.6602537124992485, "flos": 25703098657920.0, "grad_norm": 1.8131070046670512, "language_loss": 0.73722488, "learning_rate": 1.0933379797790522e-06, "loss": 0.75873148, "num_input_tokens_seen": 118159240, "step": 5491, "time_per_iteration": 3.6107547283172607 }, { "auxiliary_loss_clip": 0.01134112, "auxiliary_loss_mlp": 0.01044425, "balance_loss_clip": 1.04380059, "balance_loss_mlp": 1.02766418, "epoch": 0.6603739553898875, "flos": 25848572739840.0, "grad_norm": 2.4773203826453956, "language_loss": 0.71354878, "learning_rate": 1.0926437182307293e-06, "loss": 0.73533416, "num_input_tokens_seen": 118178050, "step": 5492, "time_per_iteration": 3.5810904502868652 }, { "auxiliary_loss_clip": 0.01117262, "auxiliary_loss_mlp": 0.01045659, "balance_loss_clip": 1.04414713, "balance_loss_mlp": 1.02850461, "epoch": 0.6604941982805267, "flos": 24570296461440.0, "grad_norm": 1.68996918245706, "language_loss": 0.78101057, "learning_rate": 1.0919495943276338e-06, "loss": 0.80263978, "num_input_tokens_seen": 118199070, "step": 5493, "time_per_iteration": 2.6758134365081787 }, { "auxiliary_loss_clip": 0.01103639, "auxiliary_loss_mlp": 0.0104807, "balance_loss_clip": 1.04015577, "balance_loss_mlp": 1.0294975, "epoch": 0.6606144411711657, "flos": 13261775581440.0, "grad_norm": 2.214529398382872, "language_loss": 0.76180822, "learning_rate": 1.0912556081750611e-06, "loss": 0.78332531, "num_input_tokens_seen": 118217000, "step": 5494, "time_per_iteration": 2.654254198074341 }, { "auxiliary_loss_clip": 0.01109821, "auxiliary_loss_mlp": 0.01038093, "balance_loss_clip": 1.04336286, "balance_loss_mlp": 1.02176154, "epoch": 0.6607346840618048, "flos": 25155281358720.0, "grad_norm": 3.2615304229405644, "language_loss": 0.76423764, "learning_rate": 1.0905617598782909e-06, "loss": 0.78571677, "num_input_tokens_seen": 118237205, "step": 5495, "time_per_iteration": 2.698232412338257 }, { "auxiliary_loss_clip": 0.01081286, "auxiliary_loss_mlp": 0.01045985, "balance_loss_clip": 1.0389297, "balance_loss_mlp": 1.02875936, "epoch": 0.660854926952444, "flos": 17638029095040.0, "grad_norm": 1.8741832381654797, "language_loss": 0.81543314, "learning_rate": 1.0898680495425775e-06, "loss": 0.83670592, "num_input_tokens_seen": 118255495, "step": 5496, "time_per_iteration": 2.6590449810028076 }, { "auxiliary_loss_clip": 0.01114819, "auxiliary_loss_mlp": 0.01039179, "balance_loss_clip": 1.04258764, "balance_loss_mlp": 1.02215624, "epoch": 0.660975169843083, "flos": 16836000266880.0, "grad_norm": 2.085261742299592, "language_loss": 0.80529886, "learning_rate": 1.0891744772731594e-06, "loss": 0.82683885, "num_input_tokens_seen": 118273310, "step": 5497, "time_per_iteration": 2.657165765762329 }, { "auxiliary_loss_clip": 0.0112351, "auxiliary_loss_mlp": 0.01043476, "balance_loss_clip": 1.04320669, "balance_loss_mlp": 1.02820563, "epoch": 0.6610954127337221, "flos": 26870410846080.0, "grad_norm": 1.9517381001347587, "language_loss": 0.65881169, "learning_rate": 1.088481043175248e-06, "loss": 0.68048149, "num_input_tokens_seen": 118293880, "step": 5498, "time_per_iteration": 3.545194387435913 }, { "auxiliary_loss_clip": 0.01103889, "auxiliary_loss_mlp": 0.01047466, "balance_loss_clip": 1.04118824, "balance_loss_mlp": 1.03132534, "epoch": 0.6612156556243612, "flos": 26465697331200.0, "grad_norm": 1.7345617957741732, "language_loss": 0.75892675, "learning_rate": 1.0877877473540368e-06, "loss": 0.78044033, "num_input_tokens_seen": 118314465, "step": 5499, "time_per_iteration": 2.6751601696014404 }, { "auxiliary_loss_clip": 0.0113816, "auxiliary_loss_mlp": 0.01040698, "balance_loss_clip": 1.04506195, "balance_loss_mlp": 1.02510548, "epoch": 0.6613358985150003, "flos": 19791915212160.0, "grad_norm": 2.3489885209201065, "language_loss": 0.72598982, "learning_rate": 1.0870945899147002e-06, "loss": 0.74777842, "num_input_tokens_seen": 118331110, "step": 5500, "time_per_iteration": 3.4064860343933105 }, { "auxiliary_loss_clip": 0.01121443, "auxiliary_loss_mlp": 0.01039408, "balance_loss_clip": 1.04345751, "balance_loss_mlp": 1.02419078, "epoch": 0.6614561414056394, "flos": 26831627136000.0, "grad_norm": 2.2025487282965233, "language_loss": 0.76368707, "learning_rate": 1.0864015709623879e-06, "loss": 0.78529561, "num_input_tokens_seen": 118351980, "step": 5501, "time_per_iteration": 2.684343099594116 }, { "auxiliary_loss_clip": 0.01128246, "auxiliary_loss_mlp": 0.01042407, "balance_loss_clip": 1.04378378, "balance_loss_mlp": 1.02543116, "epoch": 0.6615763842962785, "flos": 22894597128960.0, "grad_norm": 2.1828616417294615, "language_loss": 0.80509329, "learning_rate": 1.0857086906022313e-06, "loss": 0.82679981, "num_input_tokens_seen": 118370315, "step": 5502, "time_per_iteration": 2.6283140182495117 }, { "auxiliary_loss_clip": 0.01061591, "auxiliary_loss_mlp": 0.01042153, "balance_loss_clip": 1.03707385, "balance_loss_mlp": 1.02435493, "epoch": 0.6616966271869176, "flos": 24790321221120.0, "grad_norm": 2.170878576443668, "language_loss": 0.73386514, "learning_rate": 1.0850159489393388e-06, "loss": 0.75490254, "num_input_tokens_seen": 118389575, "step": 5503, "time_per_iteration": 2.7476329803466797 }, { "auxiliary_loss_clip": 0.01092664, "auxiliary_loss_mlp": 0.01042031, "balance_loss_clip": 1.03889525, "balance_loss_mlp": 1.02543688, "epoch": 0.6618168700775566, "flos": 17202109639680.0, "grad_norm": 2.2006708499218233, "language_loss": 0.82089293, "learning_rate": 1.0843233460787992e-06, "loss": 0.84223992, "num_input_tokens_seen": 118406790, "step": 5504, "time_per_iteration": 2.658709764480591 }, { "auxiliary_loss_clip": 0.01083457, "auxiliary_loss_mlp": 0.01052213, "balance_loss_clip": 1.03990936, "balance_loss_mlp": 1.03470135, "epoch": 0.6619371129681958, "flos": 25447091448960.0, "grad_norm": 2.0542830204915834, "language_loss": 0.77905208, "learning_rate": 1.0836308821256805e-06, "loss": 0.80040872, "num_input_tokens_seen": 118427590, "step": 5505, "time_per_iteration": 2.7283341884613037 }, { "auxiliary_loss_clip": 0.01122207, "auxiliary_loss_mlp": 0.01046175, "balance_loss_clip": 1.04441857, "balance_loss_mlp": 1.03065383, "epoch": 0.6620573558588349, "flos": 18040444139520.0, "grad_norm": 1.9638984085790967, "language_loss": 0.7798779, "learning_rate": 1.0829385571850282e-06, "loss": 0.80156171, "num_input_tokens_seen": 118444570, "step": 5506, "time_per_iteration": 2.5612499713897705 }, { "auxiliary_loss_clip": 0.01142266, "auxiliary_loss_mlp": 0.01044915, "balance_loss_clip": 1.04539073, "balance_loss_mlp": 1.0271889, "epoch": 0.6621775987494739, "flos": 17785586165760.0, "grad_norm": 3.3546206411757353, "language_loss": 0.83432376, "learning_rate": 1.0822463713618679e-06, "loss": 0.85619557, "num_input_tokens_seen": 118461425, "step": 5507, "time_per_iteration": 2.571223497390747 }, { "auxiliary_loss_clip": 0.01101553, "auxiliary_loss_mlp": 0.01048443, "balance_loss_clip": 1.04131591, "balance_loss_mlp": 1.03197992, "epoch": 0.6622978416401131, "flos": 17492590926720.0, "grad_norm": 1.9936113584312647, "language_loss": 0.84842038, "learning_rate": 1.0815543247612034e-06, "loss": 0.86992025, "num_input_tokens_seen": 118478495, "step": 5508, "time_per_iteration": 2.6498758792877197 }, { "auxiliary_loss_clip": 0.0110904, "auxiliary_loss_mlp": 0.01042297, "balance_loss_clip": 1.03870201, "balance_loss_mlp": 1.02535748, "epoch": 0.6624180845307521, "flos": 21648352803840.0, "grad_norm": 1.5574129291255085, "language_loss": 0.82991421, "learning_rate": 1.0808624174880168e-06, "loss": 0.85142756, "num_input_tokens_seen": 118499145, "step": 5509, "time_per_iteration": 2.6362295150756836 }, { "auxiliary_loss_clip": 0.01133095, "auxiliary_loss_mlp": 0.01038444, "balance_loss_clip": 1.04336858, "balance_loss_mlp": 1.02267301, "epoch": 0.6625383274213912, "flos": 23805902108160.0, "grad_norm": 1.7653625483963806, "language_loss": 0.80119359, "learning_rate": 1.080170649647272e-06, "loss": 0.822909, "num_input_tokens_seen": 118518950, "step": 5510, "time_per_iteration": 2.593729019165039 }, { "auxiliary_loss_clip": 0.01134453, "auxiliary_loss_mlp": 0.01039672, "balance_loss_clip": 1.04301429, "balance_loss_mlp": 1.0235194, "epoch": 0.6626585703120303, "flos": 33262941473280.0, "grad_norm": 1.7664567722133102, "language_loss": 0.67198145, "learning_rate": 1.0794790213439068e-06, "loss": 0.69372272, "num_input_tokens_seen": 118545850, "step": 5511, "time_per_iteration": 2.7416155338287354 }, { "auxiliary_loss_clip": 0.01087673, "auxiliary_loss_mlp": 0.01045925, "balance_loss_clip": 1.03905678, "balance_loss_mlp": 1.02928388, "epoch": 0.6627788132026694, "flos": 22085780630400.0, "grad_norm": 2.4760667471535456, "language_loss": 0.78157264, "learning_rate": 1.078787532682843e-06, "loss": 0.80290866, "num_input_tokens_seen": 118563325, "step": 5512, "time_per_iteration": 2.674877643585205 }, { "auxiliary_loss_clip": 0.01124911, "auxiliary_loss_mlp": 0.01040283, "balance_loss_clip": 1.04409313, "balance_loss_mlp": 1.02401066, "epoch": 0.6628990560933085, "flos": 36173608260480.0, "grad_norm": 2.784216737148304, "language_loss": 0.75806093, "learning_rate": 1.0780961837689773e-06, "loss": 0.77971292, "num_input_tokens_seen": 118582835, "step": 5513, "time_per_iteration": 2.738872766494751 }, { "auxiliary_loss_clip": 0.01105126, "auxiliary_loss_mlp": 0.01045327, "balance_loss_clip": 1.04154277, "balance_loss_mlp": 1.02684903, "epoch": 0.6630192989839476, "flos": 18513567106560.0, "grad_norm": 1.6376736270536143, "language_loss": 0.70138317, "learning_rate": 1.0774049747071883e-06, "loss": 0.72288775, "num_input_tokens_seen": 118600715, "step": 5514, "time_per_iteration": 2.6032521724700928 }, { "auxiliary_loss_clip": 0.01084191, "auxiliary_loss_mlp": 0.01044779, "balance_loss_clip": 1.04054582, "balance_loss_mlp": 1.02801836, "epoch": 0.6631395418745867, "flos": 35809510049280.0, "grad_norm": 1.5590724909037255, "language_loss": 0.680565, "learning_rate": 1.076713905602332e-06, "loss": 0.70185471, "num_input_tokens_seen": 118621290, "step": 5515, "time_per_iteration": 2.84476375579834 }, { "auxiliary_loss_clip": 0.01128865, "auxiliary_loss_mlp": 0.01041358, "balance_loss_clip": 1.04705524, "balance_loss_mlp": 1.02551508, "epoch": 0.6632597847652257, "flos": 20047742853120.0, "grad_norm": 2.0337378592119406, "language_loss": 0.81246138, "learning_rate": 1.07602297655924e-06, "loss": 0.83416361, "num_input_tokens_seen": 118639610, "step": 5516, "time_per_iteration": 2.604372262954712 }, { "auxiliary_loss_clip": 0.01135621, "auxiliary_loss_mlp": 0.01040209, "balance_loss_clip": 1.04521084, "balance_loss_mlp": 1.02548635, "epoch": 0.6633800276558649, "flos": 21214480423680.0, "grad_norm": 1.8352771904225198, "language_loss": 0.81171536, "learning_rate": 1.0753321876827292e-06, "loss": 0.83347368, "num_input_tokens_seen": 118658895, "step": 5517, "time_per_iteration": 3.505687713623047 }, { "auxiliary_loss_clip": 0.01135959, "auxiliary_loss_mlp": 0.01036281, "balance_loss_clip": 1.0435406, "balance_loss_mlp": 1.01986623, "epoch": 0.663500270546504, "flos": 23987753688960.0, "grad_norm": 2.08206653978515, "language_loss": 0.74349785, "learning_rate": 1.0746415390775893e-06, "loss": 0.76522028, "num_input_tokens_seen": 118677025, "step": 5518, "time_per_iteration": 2.5921919345855713 }, { "auxiliary_loss_clip": 0.01135129, "auxiliary_loss_mlp": 0.0103758, "balance_loss_clip": 1.04624319, "balance_loss_mlp": 1.02227402, "epoch": 0.663620513437143, "flos": 17932389050880.0, "grad_norm": 1.9502366638400954, "language_loss": 0.76448041, "learning_rate": 1.0739510308485939e-06, "loss": 0.78620756, "num_input_tokens_seen": 118694240, "step": 5519, "time_per_iteration": 3.5172524452209473 }, { "auxiliary_loss_clip": 0.01018459, "auxiliary_loss_mlp": 0.01001977, "balance_loss_clip": 1.01592422, "balance_loss_mlp": 1.00046921, "epoch": 0.6637407563277821, "flos": 57840241086720.0, "grad_norm": 0.8015610222499401, "language_loss": 0.62483287, "learning_rate": 1.07326066310049e-06, "loss": 0.64503723, "num_input_tokens_seen": 118758365, "step": 5520, "time_per_iteration": 3.256962299346924 }, { "auxiliary_loss_clip": 0.01098581, "auxiliary_loss_mlp": 0.01048827, "balance_loss_clip": 1.04203486, "balance_loss_mlp": 1.03106439, "epoch": 0.6638609992184212, "flos": 27306007079040.0, "grad_norm": 1.8478072989873566, "language_loss": 0.79134429, "learning_rate": 1.0725704359380059e-06, "loss": 0.81281841, "num_input_tokens_seen": 118778220, "step": 5521, "time_per_iteration": 2.7254629135131836 }, { "auxiliary_loss_clip": 0.01135268, "auxiliary_loss_mlp": 0.01036993, "balance_loss_clip": 1.04355669, "balance_loss_mlp": 1.02145994, "epoch": 0.6639812421090603, "flos": 18624854419200.0, "grad_norm": 6.391510347877492, "language_loss": 0.72064352, "learning_rate": 1.0718803494658497e-06, "loss": 0.74236614, "num_input_tokens_seen": 118797110, "step": 5522, "time_per_iteration": 2.550982713699341 }, { "auxiliary_loss_clip": 0.01037569, "auxiliary_loss_mlp": 0.01048569, "balance_loss_clip": 1.03311944, "balance_loss_mlp": 1.03134346, "epoch": 0.6641014849996993, "flos": 15924479806080.0, "grad_norm": 2.1671876884673487, "language_loss": 0.83935249, "learning_rate": 1.071190403788707e-06, "loss": 0.86021388, "num_input_tokens_seen": 118812415, "step": 5523, "time_per_iteration": 3.0131852626800537 }, { "auxiliary_loss_clip": 0.01105448, "auxiliary_loss_mlp": 0.01037602, "balance_loss_clip": 1.04287076, "balance_loss_mlp": 1.0195415, "epoch": 0.6642217278903385, "flos": 26505486622080.0, "grad_norm": 1.9086665135536363, "language_loss": 0.75936061, "learning_rate": 1.0705005990112415e-06, "loss": 0.78079104, "num_input_tokens_seen": 118832195, "step": 5524, "time_per_iteration": 4.192440986633301 }, { "auxiliary_loss_clip": 0.01073274, "auxiliary_loss_mlp": 0.01043313, "balance_loss_clip": 1.03688765, "balance_loss_mlp": 1.02696919, "epoch": 0.6643419707809776, "flos": 15377308951680.0, "grad_norm": 2.6711154074545274, "language_loss": 0.74339092, "learning_rate": 1.0698109352380957e-06, "loss": 0.76455683, "num_input_tokens_seen": 118849795, "step": 5525, "time_per_iteration": 2.747338056564331 }, { "auxiliary_loss_clip": 0.01137546, "auxiliary_loss_mlp": 0.01036268, "balance_loss_clip": 1.04482651, "balance_loss_mlp": 1.01984096, "epoch": 0.6644622136716166, "flos": 25117610970240.0, "grad_norm": 1.8477225670733353, "language_loss": 0.77850717, "learning_rate": 1.0691214125738909e-06, "loss": 0.80024529, "num_input_tokens_seen": 118870000, "step": 5526, "time_per_iteration": 3.498685121536255 }, { "auxiliary_loss_clip": 0.01040546, "auxiliary_loss_mlp": 0.01000752, "balance_loss_clip": 1.01108646, "balance_loss_mlp": 0.99933356, "epoch": 0.6645824565622558, "flos": 66201717680640.0, "grad_norm": 0.8010304351963686, "language_loss": 0.57520747, "learning_rate": 1.0684320311232287e-06, "loss": 0.59562045, "num_input_tokens_seen": 118932905, "step": 5527, "time_per_iteration": 3.2569386959075928 }, { "auxiliary_loss_clip": 0.01108163, "auxiliary_loss_mlp": 0.0103894, "balance_loss_clip": 1.03996956, "balance_loss_mlp": 1.02138615, "epoch": 0.6647026994528948, "flos": 25082131311360.0, "grad_norm": 2.0749648006500623, "language_loss": 0.81201613, "learning_rate": 1.0677427909906865e-06, "loss": 0.83348715, "num_input_tokens_seen": 118953355, "step": 5528, "time_per_iteration": 2.678597927093506 }, { "auxiliary_loss_clip": 0.01139906, "auxiliary_loss_mlp": 0.01040539, "balance_loss_clip": 1.04572535, "balance_loss_mlp": 1.02445745, "epoch": 0.6648229423435339, "flos": 18222187979520.0, "grad_norm": 1.7359830614460525, "language_loss": 0.72190642, "learning_rate": 1.0670536922808216e-06, "loss": 0.74371088, "num_input_tokens_seen": 118973480, "step": 5529, "time_per_iteration": 2.6327717304229736 }, { "auxiliary_loss_clip": 0.01109125, "auxiliary_loss_mlp": 0.01041013, "balance_loss_clip": 1.04110575, "balance_loss_mlp": 1.02576637, "epoch": 0.6649431852341731, "flos": 18296882311680.0, "grad_norm": 2.4889646202257, "language_loss": 0.72068876, "learning_rate": 1.06636473509817e-06, "loss": 0.74219012, "num_input_tokens_seen": 118989860, "step": 5530, "time_per_iteration": 2.6130027770996094 }, { "auxiliary_loss_clip": 0.01106985, "auxiliary_loss_mlp": 0.00772275, "balance_loss_clip": 1.03919411, "balance_loss_mlp": 1.0004425, "epoch": 0.6650634281248121, "flos": 17019575700480.0, "grad_norm": 2.096399948278433, "language_loss": 0.80705023, "learning_rate": 1.0656759195472447e-06, "loss": 0.82584292, "num_input_tokens_seen": 119007150, "step": 5531, "time_per_iteration": 2.6471829414367676 }, { "auxiliary_loss_clip": 0.01023663, "auxiliary_loss_mlp": 0.00999554, "balance_loss_clip": 1.01340568, "balance_loss_mlp": 0.99806404, "epoch": 0.6651836710154512, "flos": 69294810666240.0, "grad_norm": 1.096555899688308, "language_loss": 0.59704173, "learning_rate": 1.0649872457325414e-06, "loss": 0.61727393, "num_input_tokens_seen": 119068435, "step": 5532, "time_per_iteration": 3.1696810722351074 }, { "auxiliary_loss_clip": 0.01031916, "auxiliary_loss_mlp": 0.01003016, "balance_loss_clip": 1.01154685, "balance_loss_mlp": 1.00153732, "epoch": 0.6653039139060903, "flos": 66883444882560.0, "grad_norm": 0.8493781020508724, "language_loss": 0.55064082, "learning_rate": 1.0642987137585278e-06, "loss": 0.57099015, "num_input_tokens_seen": 119127960, "step": 5533, "time_per_iteration": 3.1529831886291504 }, { "auxiliary_loss_clip": 0.01108237, "auxiliary_loss_mlp": 0.01036784, "balance_loss_clip": 1.04123032, "balance_loss_mlp": 1.01980817, "epoch": 0.6654241567967294, "flos": 21470056669440.0, "grad_norm": 7.225949826772992, "language_loss": 0.82690966, "learning_rate": 1.0636103237296561e-06, "loss": 0.84835994, "num_input_tokens_seen": 119146885, "step": 5534, "time_per_iteration": 2.658949613571167 }, { "auxiliary_loss_clip": 0.01119349, "auxiliary_loss_mlp": 0.01032601, "balance_loss_clip": 1.04380226, "balance_loss_mlp": 1.01882637, "epoch": 0.6655443996873684, "flos": 25119514391040.0, "grad_norm": 1.8042112380008224, "language_loss": 0.84860909, "learning_rate": 1.062922075750353e-06, "loss": 0.87012863, "num_input_tokens_seen": 119166900, "step": 5535, "time_per_iteration": 2.6551496982574463 }, { "auxiliary_loss_clip": 0.01101603, "auxiliary_loss_mlp": 0.01041336, "balance_loss_clip": 1.04378247, "balance_loss_mlp": 1.02726293, "epoch": 0.6656646425780076, "flos": 17457326749440.0, "grad_norm": 2.234681144495227, "language_loss": 0.7220996, "learning_rate": 1.0622339699250267e-06, "loss": 0.74352896, "num_input_tokens_seen": 119184820, "step": 5536, "time_per_iteration": 2.6595423221588135 }, { "auxiliary_loss_clip": 0.01095233, "auxiliary_loss_mlp": 0.01039503, "balance_loss_clip": 1.03977454, "balance_loss_mlp": 1.0238266, "epoch": 0.6657848854686467, "flos": 23434190213760.0, "grad_norm": 1.8221690306526237, "language_loss": 0.7943725, "learning_rate": 1.0615460063580624e-06, "loss": 0.81571984, "num_input_tokens_seen": 119203295, "step": 5537, "time_per_iteration": 2.726422071456909 }, { "auxiliary_loss_clip": 0.0111216, "auxiliary_loss_mlp": 0.01033684, "balance_loss_clip": 1.04190421, "balance_loss_mlp": 1.01862836, "epoch": 0.6659051283592857, "flos": 11509909459200.0, "grad_norm": 2.9631890786789836, "language_loss": 0.72548056, "learning_rate": 1.060858185153821e-06, "loss": 0.74693906, "num_input_tokens_seen": 119221395, "step": 5538, "time_per_iteration": 2.6492578983306885 }, { "auxiliary_loss_clip": 0.0111651, "auxiliary_loss_mlp": 0.01040113, "balance_loss_clip": 1.04246795, "balance_loss_mlp": 1.02276802, "epoch": 0.6660253712499249, "flos": 20594554571520.0, "grad_norm": 2.208859657688973, "language_loss": 0.76122284, "learning_rate": 1.0601705064166474e-06, "loss": 0.78278911, "num_input_tokens_seen": 119239790, "step": 5539, "time_per_iteration": 2.644993543624878 }, { "auxiliary_loss_clip": 0.01104059, "auxiliary_loss_mlp": 0.0104005, "balance_loss_clip": 1.04178262, "balance_loss_mlp": 1.02381325, "epoch": 0.666145614140564, "flos": 21251504367360.0, "grad_norm": 2.6738631806791595, "language_loss": 0.7390787, "learning_rate": 1.0594829702508596e-06, "loss": 0.7605198, "num_input_tokens_seen": 119257505, "step": 5540, "time_per_iteration": 2.6367604732513428 }, { "auxiliary_loss_clip": 0.0110017, "auxiliary_loss_mlp": 0.01037936, "balance_loss_clip": 1.04106283, "balance_loss_mlp": 1.02142596, "epoch": 0.666265857031203, "flos": 33726188200320.0, "grad_norm": 1.6990428851838602, "language_loss": 0.54983783, "learning_rate": 1.0587955767607592e-06, "loss": 0.57121885, "num_input_tokens_seen": 119279365, "step": 5541, "time_per_iteration": 2.8119373321533203 }, { "auxiliary_loss_clip": 0.01137007, "auxiliary_loss_mlp": 0.01033771, "balance_loss_clip": 1.04481602, "balance_loss_mlp": 1.01757622, "epoch": 0.6663860999218422, "flos": 17456644391040.0, "grad_norm": 5.128266083340959, "language_loss": 0.77268839, "learning_rate": 1.0581083260506206e-06, "loss": 0.79439616, "num_input_tokens_seen": 119296150, "step": 5542, "time_per_iteration": 2.5552608966827393 }, { "auxiliary_loss_clip": 0.0110858, "auxiliary_loss_mlp": 0.01038043, "balance_loss_clip": 1.0414362, "balance_loss_mlp": 1.02225947, "epoch": 0.6665063428124812, "flos": 17676740977920.0, "grad_norm": 3.002753974740566, "language_loss": 0.76574969, "learning_rate": 1.0574212182246993e-06, "loss": 0.78721583, "num_input_tokens_seen": 119314845, "step": 5543, "time_per_iteration": 3.6009366512298584 }, { "auxiliary_loss_clip": 0.01118929, "auxiliary_loss_mlp": 0.01040021, "balance_loss_clip": 1.04196095, "balance_loss_mlp": 1.02213931, "epoch": 0.6666265857031203, "flos": 27673265687040.0, "grad_norm": 3.7785485964969374, "language_loss": 0.76098418, "learning_rate": 1.0567342533872303e-06, "loss": 0.78257364, "num_input_tokens_seen": 119334875, "step": 5544, "time_per_iteration": 2.678748369216919 }, { "auxiliary_loss_clip": 0.01113269, "auxiliary_loss_mlp": 0.01044632, "balance_loss_clip": 1.04429114, "balance_loss_mlp": 1.02943325, "epoch": 0.6667468285937594, "flos": 25046831220480.0, "grad_norm": 1.6675475867391267, "language_loss": 0.81161535, "learning_rate": 1.0560474316424255e-06, "loss": 0.83319438, "num_input_tokens_seen": 119354635, "step": 5545, "time_per_iteration": 3.6561598777770996 }, { "auxiliary_loss_clip": 0.01113268, "auxiliary_loss_mlp": 0.01050604, "balance_loss_clip": 1.04162526, "balance_loss_mlp": 1.03219819, "epoch": 0.6668670714843985, "flos": 22780472641920.0, "grad_norm": 2.9724740464134443, "language_loss": 0.73852003, "learning_rate": 1.0553607530944746e-06, "loss": 0.76015878, "num_input_tokens_seen": 119372690, "step": 5546, "time_per_iteration": 2.635549306869507 }, { "auxiliary_loss_clip": 0.01101654, "auxiliary_loss_mlp": 0.01043873, "balance_loss_clip": 1.04115915, "balance_loss_mlp": 1.02742171, "epoch": 0.6669873143750376, "flos": 22163886754560.0, "grad_norm": 3.744219620808661, "language_loss": 0.89473909, "learning_rate": 1.0546742178475463e-06, "loss": 0.91619432, "num_input_tokens_seen": 119391685, "step": 5547, "time_per_iteration": 2.740778923034668 }, { "auxiliary_loss_clip": 0.01088153, "auxiliary_loss_mlp": 0.01034986, "balance_loss_clip": 1.04112017, "balance_loss_mlp": 1.02174783, "epoch": 0.6671075572656767, "flos": 20514832335360.0, "grad_norm": 2.282385888671897, "language_loss": 0.86741674, "learning_rate": 1.0539878260057868e-06, "loss": 0.88864815, "num_input_tokens_seen": 119410725, "step": 5548, "time_per_iteration": 2.7115225791931152 }, { "auxiliary_loss_clip": 0.01126548, "auxiliary_loss_mlp": 0.0104565, "balance_loss_clip": 1.04534471, "balance_loss_mlp": 1.02867508, "epoch": 0.6672278001563158, "flos": 17931203902080.0, "grad_norm": 9.437044894666457, "language_loss": 0.68790996, "learning_rate": 1.0533015776733226e-06, "loss": 0.70963198, "num_input_tokens_seen": 119426875, "step": 5549, "time_per_iteration": 2.59208083152771 }, { "auxiliary_loss_clip": 0.01109477, "auxiliary_loss_mlp": 0.01049443, "balance_loss_clip": 1.04367304, "balance_loss_mlp": 1.03082263, "epoch": 0.6673480430469548, "flos": 22342146975360.0, "grad_norm": 2.117799843000859, "language_loss": 0.78503084, "learning_rate": 1.0526154729542566e-06, "loss": 0.80662, "num_input_tokens_seen": 119446935, "step": 5550, "time_per_iteration": 3.58418607711792 }, { "auxiliary_loss_clip": 0.01102662, "auxiliary_loss_mlp": 0.01044476, "balance_loss_clip": 1.04484177, "balance_loss_mlp": 1.02750015, "epoch": 0.6674682859375939, "flos": 20703830722560.0, "grad_norm": 3.212640062479525, "language_loss": 0.79963732, "learning_rate": 1.0519295119526699e-06, "loss": 0.8211087, "num_input_tokens_seen": 119463240, "step": 5551, "time_per_iteration": 2.698218584060669 }, { "auxiliary_loss_clip": 0.01115209, "auxiliary_loss_mlp": 0.01048338, "balance_loss_clip": 1.04321241, "balance_loss_mlp": 1.03285241, "epoch": 0.667588528828233, "flos": 26206673379840.0, "grad_norm": 1.6575693030938983, "language_loss": 0.82952511, "learning_rate": 1.0512436947726227e-06, "loss": 0.85116059, "num_input_tokens_seen": 119484655, "step": 5552, "time_per_iteration": 3.607590436935425 }, { "auxiliary_loss_clip": 0.01101628, "auxiliary_loss_mlp": 0.01042, "balance_loss_clip": 1.03914309, "balance_loss_mlp": 1.02522779, "epoch": 0.6677087717188721, "flos": 23071025756160.0, "grad_norm": 4.566294486496321, "language_loss": 0.65176296, "learning_rate": 1.0505580215181517e-06, "loss": 0.67319924, "num_input_tokens_seen": 119502895, "step": 5553, "time_per_iteration": 2.7184526920318604 }, { "auxiliary_loss_clip": 0.0101058, "auxiliary_loss_mlp": 0.01009697, "balance_loss_clip": 1.01564765, "balance_loss_mlp": 1.00838017, "epoch": 0.6678290146095112, "flos": 70941315219840.0, "grad_norm": 0.7736279455724187, "language_loss": 0.56580943, "learning_rate": 1.0498724922932753e-06, "loss": 0.58601218, "num_input_tokens_seen": 119561010, "step": 5554, "time_per_iteration": 3.1822617053985596 }, { "auxiliary_loss_clip": 0.01141153, "auxiliary_loss_mlp": 0.01041798, "balance_loss_clip": 1.04666328, "balance_loss_mlp": 1.02324891, "epoch": 0.6679492575001503, "flos": 18661088263680.0, "grad_norm": 2.198931380365775, "language_loss": 0.86371279, "learning_rate": 1.0491871072019851e-06, "loss": 0.88554227, "num_input_tokens_seen": 119578900, "step": 5555, "time_per_iteration": 2.618372678756714 }, { "auxiliary_loss_clip": 0.0109957, "auxiliary_loss_mlp": 0.01033501, "balance_loss_clip": 1.03886545, "balance_loss_mlp": 1.01752722, "epoch": 0.6680695003907894, "flos": 29711985822720.0, "grad_norm": 1.8253753602979153, "language_loss": 0.64045608, "learning_rate": 1.0485018663482555e-06, "loss": 0.66178679, "num_input_tokens_seen": 119598920, "step": 5556, "time_per_iteration": 2.7387614250183105 }, { "auxiliary_loss_clip": 0.01119075, "auxiliary_loss_mlp": 0.01046036, "balance_loss_clip": 1.04199219, "balance_loss_mlp": 1.02760649, "epoch": 0.6681897432814284, "flos": 28218964083840.0, "grad_norm": 2.494455253567072, "language_loss": 0.70716393, "learning_rate": 1.0478167698360354e-06, "loss": 0.72881502, "num_input_tokens_seen": 119618220, "step": 5557, "time_per_iteration": 2.663811445236206 }, { "auxiliary_loss_clip": 0.01118279, "auxiliary_loss_mlp": 0.01053455, "balance_loss_clip": 1.0423125, "balance_loss_mlp": 1.03653967, "epoch": 0.6683099861720676, "flos": 25046543911680.0, "grad_norm": 2.664967125840541, "language_loss": 0.70410383, "learning_rate": 1.0471318177692556e-06, "loss": 0.72582114, "num_input_tokens_seen": 119638520, "step": 5558, "time_per_iteration": 2.6499178409576416 }, { "auxiliary_loss_clip": 0.01091641, "auxiliary_loss_mlp": 0.01044334, "balance_loss_clip": 1.03960681, "balance_loss_mlp": 1.02784193, "epoch": 0.6684302290627067, "flos": 22996977868800.0, "grad_norm": 4.059958041259378, "language_loss": 0.75914943, "learning_rate": 1.046447010251821e-06, "loss": 0.78050911, "num_input_tokens_seen": 119655850, "step": 5559, "time_per_iteration": 2.7278072834014893 }, { "auxiliary_loss_clip": 0.01109081, "auxiliary_loss_mlp": 0.01035632, "balance_loss_clip": 1.04183233, "balance_loss_mlp": 1.01987267, "epoch": 0.6685504719533457, "flos": 26573824247040.0, "grad_norm": 1.779703636247496, "language_loss": 0.75791645, "learning_rate": 1.0457623473876157e-06, "loss": 0.77936357, "num_input_tokens_seen": 119675355, "step": 5560, "time_per_iteration": 2.6983935832977295 }, { "auxiliary_loss_clip": 0.01132276, "auxiliary_loss_mlp": 0.01039793, "balance_loss_clip": 1.04312611, "balance_loss_mlp": 1.0242126, "epoch": 0.6686707148439849, "flos": 28986087870720.0, "grad_norm": 2.968482550271003, "language_loss": 0.70872873, "learning_rate": 1.0450778292805046e-06, "loss": 0.73044944, "num_input_tokens_seen": 119695340, "step": 5561, "time_per_iteration": 2.6462550163269043 }, { "auxiliary_loss_clip": 0.01128056, "auxiliary_loss_mlp": 0.01041611, "balance_loss_clip": 1.0460242, "balance_loss_mlp": 1.02563739, "epoch": 0.6687909577346239, "flos": 23623152687360.0, "grad_norm": 1.6898966514135498, "language_loss": 0.78671157, "learning_rate": 1.0443934560343267e-06, "loss": 0.80840826, "num_input_tokens_seen": 119716750, "step": 5562, "time_per_iteration": 2.610581874847412 }, { "auxiliary_loss_clip": 0.01087899, "auxiliary_loss_mlp": 0.01039574, "balance_loss_clip": 1.03911138, "balance_loss_mlp": 1.02321839, "epoch": 0.668911200625263, "flos": 23148593176320.0, "grad_norm": 2.0986238250253977, "language_loss": 0.77746809, "learning_rate": 1.0437092277529034e-06, "loss": 0.79874277, "num_input_tokens_seen": 119736005, "step": 5563, "time_per_iteration": 2.7103869915008545 }, { "auxiliary_loss_clip": 0.01105824, "auxiliary_loss_mlp": 0.01039554, "balance_loss_clip": 1.03908885, "balance_loss_mlp": 1.02341342, "epoch": 0.6690314435159022, "flos": 18551919853440.0, "grad_norm": 2.490374750375549, "language_loss": 0.73593092, "learning_rate": 1.0430251445400292e-06, "loss": 0.75738466, "num_input_tokens_seen": 119754050, "step": 5564, "time_per_iteration": 2.633131742477417 }, { "auxiliary_loss_clip": 0.01049784, "auxiliary_loss_mlp": 0.01039077, "balance_loss_clip": 1.03682959, "balance_loss_mlp": 1.02264977, "epoch": 0.6691516864065412, "flos": 31759540704000.0, "grad_norm": 2.0901282379320634, "language_loss": 0.6212343, "learning_rate": 1.0423412064994787e-06, "loss": 0.64212286, "num_input_tokens_seen": 119774820, "step": 5565, "time_per_iteration": 3.0295844078063965 }, { "auxiliary_loss_clip": 0.01099967, "auxiliary_loss_mlp": 0.01042121, "balance_loss_clip": 1.04008126, "balance_loss_mlp": 1.0267309, "epoch": 0.6692719292971803, "flos": 34933864296960.0, "grad_norm": 1.9918327656760548, "language_loss": 0.73886949, "learning_rate": 1.0416574137350064e-06, "loss": 0.76029038, "num_input_tokens_seen": 119795525, "step": 5566, "time_per_iteration": 3.0035738945007324 }, { "auxiliary_loss_clip": 0.01119615, "auxiliary_loss_mlp": 0.01037555, "balance_loss_clip": 1.04431665, "balance_loss_mlp": 1.02261782, "epoch": 0.6693921721878194, "flos": 20449188230400.0, "grad_norm": 2.522613105414403, "language_loss": 0.81030238, "learning_rate": 1.0409737663503428e-06, "loss": 0.83187413, "num_input_tokens_seen": 119813905, "step": 5567, "time_per_iteration": 2.624130964279175 }, { "auxiliary_loss_clip": 0.01121437, "auxiliary_loss_mlp": 0.01038031, "balance_loss_clip": 1.04098439, "balance_loss_mlp": 1.02044713, "epoch": 0.6695124150784585, "flos": 16614538963200.0, "grad_norm": 1.8986055499579357, "language_loss": 0.82940578, "learning_rate": 1.040290264449196e-06, "loss": 0.85100049, "num_input_tokens_seen": 119832010, "step": 5568, "time_per_iteration": 2.5938572883605957 }, { "auxiliary_loss_clip": 0.01118684, "auxiliary_loss_mlp": 0.01032099, "balance_loss_clip": 1.0439117, "balance_loss_mlp": 1.01618409, "epoch": 0.6696326579690975, "flos": 26652145852800.0, "grad_norm": 2.1245239076619415, "language_loss": 0.64214343, "learning_rate": 1.0396069081352532e-06, "loss": 0.66365123, "num_input_tokens_seen": 119851165, "step": 5569, "time_per_iteration": 4.019429922103882 }, { "auxiliary_loss_clip": 0.01041211, "auxiliary_loss_mlp": 0.01001855, "balance_loss_clip": 1.01193547, "balance_loss_mlp": 1.00046599, "epoch": 0.6697529008597367, "flos": 66964603662720.0, "grad_norm": 0.7821929381916526, "language_loss": 0.56052637, "learning_rate": 1.0389236975121782e-06, "loss": 0.580957, "num_input_tokens_seen": 119906015, "step": 5570, "time_per_iteration": 3.3882102966308594 }, { "auxiliary_loss_clip": 0.01139436, "auxiliary_loss_mlp": 0.01042935, "balance_loss_clip": 1.0458715, "balance_loss_mlp": 1.02590001, "epoch": 0.6698731437503758, "flos": 20886939279360.0, "grad_norm": 2.594210887146642, "language_loss": 0.71451986, "learning_rate": 1.0382406326836147e-06, "loss": 0.73634362, "num_input_tokens_seen": 119925160, "step": 5571, "time_per_iteration": 3.5406265258789062 }, { "auxiliary_loss_clip": 0.01129036, "auxiliary_loss_mlp": 0.01049055, "balance_loss_clip": 1.0448972, "balance_loss_mlp": 1.03169799, "epoch": 0.6699933866410148, "flos": 20409470766720.0, "grad_norm": 2.01514653539824, "language_loss": 0.76001471, "learning_rate": 1.0375577137531828e-06, "loss": 0.78179562, "num_input_tokens_seen": 119943720, "step": 5572, "time_per_iteration": 2.6062135696411133 }, { "auxiliary_loss_clip": 0.01116666, "auxiliary_loss_mlp": 0.01039896, "balance_loss_clip": 1.04416084, "balance_loss_mlp": 1.02365959, "epoch": 0.670113629531654, "flos": 29023075900800.0, "grad_norm": 1.5896728360625725, "language_loss": 0.72098076, "learning_rate": 1.0368749408244802e-06, "loss": 0.74254632, "num_input_tokens_seen": 119966640, "step": 5573, "time_per_iteration": 2.729793071746826 }, { "auxiliary_loss_clip": 0.0111867, "auxiliary_loss_mlp": 0.01045995, "balance_loss_clip": 1.04349375, "balance_loss_mlp": 1.02977097, "epoch": 0.670233872422293, "flos": 19791699730560.0, "grad_norm": 1.9764289522295888, "language_loss": 0.78942561, "learning_rate": 1.0361923140010836e-06, "loss": 0.81107223, "num_input_tokens_seen": 119985125, "step": 5574, "time_per_iteration": 2.6000077724456787 }, { "auxiliary_loss_clip": 0.01129417, "auxiliary_loss_mlp": 0.01039811, "balance_loss_clip": 1.04479361, "balance_loss_mlp": 1.02353907, "epoch": 0.6703541153129321, "flos": 24243689070720.0, "grad_norm": 2.2633056700703733, "language_loss": 0.64005613, "learning_rate": 1.0355098333865455e-06, "loss": 0.66174841, "num_input_tokens_seen": 120004355, "step": 5575, "time_per_iteration": 3.536442279815674 }, { "auxiliary_loss_clip": 0.01117084, "auxiliary_loss_mlp": 0.01036193, "balance_loss_clip": 1.0448314, "balance_loss_mlp": 1.02025509, "epoch": 0.6704743582035713, "flos": 26688523351680.0, "grad_norm": 2.9565038327498883, "language_loss": 0.69333637, "learning_rate": 1.0348274990844006e-06, "loss": 0.71486902, "num_input_tokens_seen": 120027115, "step": 5576, "time_per_iteration": 2.6228268146514893 }, { "auxiliary_loss_clip": 0.01124183, "auxiliary_loss_mlp": 0.01038239, "balance_loss_clip": 1.04353869, "balance_loss_mlp": 1.02299809, "epoch": 0.6705946010942103, "flos": 23514379326720.0, "grad_norm": 3.923420179201398, "language_loss": 0.72516268, "learning_rate": 1.034145311198155e-06, "loss": 0.74678695, "num_input_tokens_seen": 120047130, "step": 5577, "time_per_iteration": 2.650341272354126 }, { "auxiliary_loss_clip": 0.01132125, "auxiliary_loss_mlp": 0.01041764, "balance_loss_clip": 1.04342866, "balance_loss_mlp": 1.02646947, "epoch": 0.6707148439848494, "flos": 24061011477120.0, "grad_norm": 1.814791798607894, "language_loss": 0.64320385, "learning_rate": 1.0334632698312989e-06, "loss": 0.66494274, "num_input_tokens_seen": 120067925, "step": 5578, "time_per_iteration": 3.527050256729126 }, { "auxiliary_loss_clip": 0.01106113, "auxiliary_loss_mlp": 0.01047875, "balance_loss_clip": 1.04031014, "balance_loss_mlp": 1.02956486, "epoch": 0.6708350868754885, "flos": 22528667324160.0, "grad_norm": 1.9004691999632668, "language_loss": 0.75096875, "learning_rate": 1.032781375087295e-06, "loss": 0.77250862, "num_input_tokens_seen": 120087825, "step": 5579, "time_per_iteration": 2.6169180870056152 }, { "auxiliary_loss_clip": 0.01114987, "auxiliary_loss_mlp": 0.010424, "balance_loss_clip": 1.04636621, "balance_loss_mlp": 1.02594864, "epoch": 0.6709553297661276, "flos": 25227749047680.0, "grad_norm": 1.5378736776408664, "language_loss": 0.67447555, "learning_rate": 1.0320996270695891e-06, "loss": 0.69604945, "num_input_tokens_seen": 120108895, "step": 5580, "time_per_iteration": 2.671847105026245 }, { "auxiliary_loss_clip": 0.01096297, "auxiliary_loss_mlp": 0.01044846, "balance_loss_clip": 1.03871751, "balance_loss_mlp": 1.02882457, "epoch": 0.6710755726567667, "flos": 20448757267200.0, "grad_norm": 2.8414311571502235, "language_loss": 0.7320978, "learning_rate": 1.0314180258815998e-06, "loss": 0.75350922, "num_input_tokens_seen": 120127535, "step": 5581, "time_per_iteration": 2.681675910949707 }, { "auxiliary_loss_clip": 0.01090372, "auxiliary_loss_mlp": 0.01038746, "balance_loss_clip": 1.04013395, "balance_loss_mlp": 1.02352297, "epoch": 0.6711958155474057, "flos": 25995411538560.0, "grad_norm": 1.640519036658757, "language_loss": 0.74224854, "learning_rate": 1.0307365716267247e-06, "loss": 0.76353967, "num_input_tokens_seen": 120147980, "step": 5582, "time_per_iteration": 2.7116143703460693 }, { "auxiliary_loss_clip": 0.01123661, "auxiliary_loss_mlp": 0.01035524, "balance_loss_clip": 1.04250467, "balance_loss_mlp": 1.0201695, "epoch": 0.6713160584380449, "flos": 19937712516480.0, "grad_norm": 5.383489223142013, "language_loss": 0.784639, "learning_rate": 1.0300552644083423e-06, "loss": 0.80623084, "num_input_tokens_seen": 120166905, "step": 5583, "time_per_iteration": 2.6266565322875977 }, { "auxiliary_loss_clip": 0.0110283, "auxiliary_loss_mlp": 0.01047094, "balance_loss_clip": 1.04320002, "balance_loss_mlp": 1.02908134, "epoch": 0.6714363013286839, "flos": 18223373128320.0, "grad_norm": 2.697603849597727, "language_loss": 0.72968733, "learning_rate": 1.0293741043298036e-06, "loss": 0.75118661, "num_input_tokens_seen": 120185255, "step": 5584, "time_per_iteration": 2.664994955062866 }, { "auxiliary_loss_clip": 0.01101834, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.04454494, "balance_loss_mlp": 1.02664232, "epoch": 0.671556544219323, "flos": 25812374808960.0, "grad_norm": 2.35388257336481, "language_loss": 0.71686673, "learning_rate": 1.0286930914944436e-06, "loss": 0.738307, "num_input_tokens_seen": 120205070, "step": 5585, "time_per_iteration": 2.6915225982666016 }, { "auxiliary_loss_clip": 0.01136428, "auxiliary_loss_mlp": 0.01033881, "balance_loss_clip": 1.04265618, "balance_loss_mlp": 1.01852679, "epoch": 0.6716767871099621, "flos": 15850431918720.0, "grad_norm": 2.607127571323022, "language_loss": 0.7710861, "learning_rate": 1.0280122260055684e-06, "loss": 0.79278916, "num_input_tokens_seen": 120220780, "step": 5586, "time_per_iteration": 2.5679385662078857 }, { "auxiliary_loss_clip": 0.01135962, "auxiliary_loss_mlp": 0.01036976, "balance_loss_clip": 1.04481304, "balance_loss_mlp": 1.02059639, "epoch": 0.6717970300006012, "flos": 19756112330880.0, "grad_norm": 1.9446743937545405, "language_loss": 0.82507038, "learning_rate": 1.0273315079664652e-06, "loss": 0.84679973, "num_input_tokens_seen": 120238735, "step": 5587, "time_per_iteration": 2.5959713459014893 }, { "auxiliary_loss_clip": 0.01127962, "auxiliary_loss_mlp": 0.0103794, "balance_loss_clip": 1.04618692, "balance_loss_mlp": 1.02274132, "epoch": 0.6719172728912403, "flos": 25485049146240.0, "grad_norm": 1.9334069447448494, "language_loss": 0.74256468, "learning_rate": 1.0266509374803992e-06, "loss": 0.76422369, "num_input_tokens_seen": 120259895, "step": 5588, "time_per_iteration": 2.662107467651367 }, { "auxiliary_loss_clip": 0.01138805, "auxiliary_loss_mlp": 0.00772869, "balance_loss_clip": 1.04546368, "balance_loss_mlp": 1.00043869, "epoch": 0.6720375157818794, "flos": 15880344969600.0, "grad_norm": 3.5424605950176042, "language_loss": 0.83784699, "learning_rate": 1.0259705146506123e-06, "loss": 0.85696375, "num_input_tokens_seen": 120274790, "step": 5589, "time_per_iteration": 2.5798821449279785 }, { "auxiliary_loss_clip": 0.01126349, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.0452044, "balance_loss_mlp": 1.02257371, "epoch": 0.6721577586725185, "flos": 32010843231360.0, "grad_norm": 2.0472887944855613, "language_loss": 0.77536893, "learning_rate": 1.025290239580324e-06, "loss": 0.79702532, "num_input_tokens_seen": 120295460, "step": 5590, "time_per_iteration": 2.69268536567688 }, { "auxiliary_loss_clip": 0.01087415, "auxiliary_loss_mlp": 0.01046036, "balance_loss_clip": 1.03949428, "balance_loss_mlp": 1.02777314, "epoch": 0.6722780015631575, "flos": 20737873837440.0, "grad_norm": 1.603750358267078, "language_loss": 0.75545251, "learning_rate": 1.0246101123727313e-06, "loss": 0.77678698, "num_input_tokens_seen": 120314440, "step": 5591, "time_per_iteration": 2.7008917331695557 }, { "auxiliary_loss_clip": 0.01122256, "auxiliary_loss_mlp": 0.01036591, "balance_loss_clip": 1.04275358, "balance_loss_mlp": 1.02129602, "epoch": 0.6723982444537967, "flos": 16909617191040.0, "grad_norm": 8.362340732729184, "language_loss": 0.7841059, "learning_rate": 1.0239301331310085e-06, "loss": 0.80569434, "num_input_tokens_seen": 120332060, "step": 5592, "time_per_iteration": 2.622086524963379 }, { "auxiliary_loss_clip": 0.01119542, "auxiliary_loss_mlp": 0.01034734, "balance_loss_clip": 1.04230797, "balance_loss_mlp": 1.01960647, "epoch": 0.6725184873444358, "flos": 20667812359680.0, "grad_norm": 1.9140695222774615, "language_loss": 0.88463038, "learning_rate": 1.0232503019583088e-06, "loss": 0.90617311, "num_input_tokens_seen": 120351670, "step": 5593, "time_per_iteration": 2.638430595397949 }, { "auxiliary_loss_clip": 0.01117024, "auxiliary_loss_mlp": 0.01049518, "balance_loss_clip": 1.04068351, "balance_loss_mlp": 1.03089213, "epoch": 0.6726387302350748, "flos": 23727616416000.0, "grad_norm": 1.9940645535578232, "language_loss": 0.69625109, "learning_rate": 1.0225706189577619e-06, "loss": 0.71791655, "num_input_tokens_seen": 120370195, "step": 5594, "time_per_iteration": 2.6388635635375977 }, { "auxiliary_loss_clip": 0.01125502, "auxiliary_loss_mlp": 0.01041919, "balance_loss_clip": 1.04467177, "balance_loss_mlp": 1.02573037, "epoch": 0.672758973125714, "flos": 15188274650880.0, "grad_norm": 2.256460739275324, "language_loss": 0.74743348, "learning_rate": 1.021891084232475e-06, "loss": 0.7691077, "num_input_tokens_seen": 120388130, "step": 5595, "time_per_iteration": 3.5213141441345215 }, { "auxiliary_loss_clip": 0.01123906, "auxiliary_loss_mlp": 0.01045839, "balance_loss_clip": 1.04244435, "balance_loss_mlp": 1.02904224, "epoch": 0.672879216016353, "flos": 18077252601600.0, "grad_norm": 2.4519697172798742, "language_loss": 0.80096722, "learning_rate": 1.0212116978855325e-06, "loss": 0.82266462, "num_input_tokens_seen": 120406145, "step": 5596, "time_per_iteration": 2.5952627658843994 }, { "auxiliary_loss_clip": 0.01097484, "auxiliary_loss_mlp": 0.01046646, "balance_loss_clip": 1.04066324, "balance_loss_mlp": 1.03049362, "epoch": 0.6729994589069921, "flos": 23476349802240.0, "grad_norm": 1.8589318383162383, "language_loss": 0.78848749, "learning_rate": 1.020532460019997e-06, "loss": 0.80992877, "num_input_tokens_seen": 120425395, "step": 5597, "time_per_iteration": 3.612900733947754 }, { "auxiliary_loss_clip": 0.01066595, "auxiliary_loss_mlp": 0.01042782, "balance_loss_clip": 1.03778315, "balance_loss_mlp": 1.02701116, "epoch": 0.6731197017976313, "flos": 26322018929280.0, "grad_norm": 1.7684391446197851, "language_loss": 0.71022081, "learning_rate": 1.0198533707389096e-06, "loss": 0.73131454, "num_input_tokens_seen": 120446270, "step": 5598, "time_per_iteration": 2.8555781841278076 }, { "auxiliary_loss_clip": 0.01119613, "auxiliary_loss_mlp": 0.00772543, "balance_loss_clip": 1.04363263, "balance_loss_mlp": 1.00041842, "epoch": 0.6732399446882703, "flos": 21616428591360.0, "grad_norm": 2.9027231544490992, "language_loss": 0.73342049, "learning_rate": 1.0191744301452853e-06, "loss": 0.75234199, "num_input_tokens_seen": 120465570, "step": 5599, "time_per_iteration": 2.8113088607788086 }, { "auxiliary_loss_clip": 0.0113476, "auxiliary_loss_mlp": 0.01037702, "balance_loss_clip": 1.04298162, "balance_loss_mlp": 1.02145362, "epoch": 0.6733601875789094, "flos": 25880173729920.0, "grad_norm": 3.1783390602530144, "language_loss": 0.70268321, "learning_rate": 1.0184956383421208e-06, "loss": 0.72440785, "num_input_tokens_seen": 120484220, "step": 5600, "time_per_iteration": 2.6192591190338135 }, { "auxiliary_loss_clip": 0.0112606, "auxiliary_loss_mlp": 0.01047808, "balance_loss_clip": 1.04574001, "balance_loss_mlp": 1.03073716, "epoch": 0.6734804304695485, "flos": 22929573997440.0, "grad_norm": 5.120679786942627, "language_loss": 0.66073376, "learning_rate": 1.017816995432387e-06, "loss": 0.68247247, "num_input_tokens_seen": 120503320, "step": 5601, "time_per_iteration": 3.6247377395629883 }, { "auxiliary_loss_clip": 0.01108365, "auxiliary_loss_mlp": 0.01049066, "balance_loss_clip": 1.04177237, "balance_loss_mlp": 1.0328598, "epoch": 0.6736006733601876, "flos": 18697968552960.0, "grad_norm": 1.9598316488109984, "language_loss": 0.74552238, "learning_rate": 1.0171385015190353e-06, "loss": 0.76709676, "num_input_tokens_seen": 120523180, "step": 5602, "time_per_iteration": 2.6226389408111572 }, { "auxiliary_loss_clip": 0.01104096, "auxiliary_loss_mlp": 0.00771604, "balance_loss_clip": 1.04234469, "balance_loss_mlp": 1.00035989, "epoch": 0.6737209162508266, "flos": 19427745173760.0, "grad_norm": 2.06769702803552, "language_loss": 0.73488092, "learning_rate": 1.0164601567049908e-06, "loss": 0.75363791, "num_input_tokens_seen": 120541710, "step": 5603, "time_per_iteration": 2.6215367317199707 }, { "auxiliary_loss_clip": 0.01111517, "auxiliary_loss_mlp": 0.01054475, "balance_loss_clip": 1.04359126, "balance_loss_mlp": 1.03633165, "epoch": 0.6738411591414658, "flos": 20158060498560.0, "grad_norm": 1.811586661296515, "language_loss": 0.80354685, "learning_rate": 1.015781961093158e-06, "loss": 0.82520676, "num_input_tokens_seen": 120561030, "step": 5604, "time_per_iteration": 3.5711793899536133 }, { "auxiliary_loss_clip": 0.01110304, "auxiliary_loss_mlp": 0.01043966, "balance_loss_clip": 1.03854561, "balance_loss_mlp": 1.02856469, "epoch": 0.6739614020321049, "flos": 21653847584640.0, "grad_norm": 1.6199065436769424, "language_loss": 0.77224869, "learning_rate": 1.0151039147864197e-06, "loss": 0.79379135, "num_input_tokens_seen": 120581005, "step": 5605, "time_per_iteration": 2.6115262508392334 }, { "auxiliary_loss_clip": 0.01054495, "auxiliary_loss_mlp": 0.0104211, "balance_loss_clip": 1.03885627, "balance_loss_mlp": 1.02520573, "epoch": 0.6740816449227439, "flos": 19171702051200.0, "grad_norm": 5.105506973127694, "language_loss": 0.66035795, "learning_rate": 1.0144260178876336e-06, "loss": 0.68132395, "num_input_tokens_seen": 120600350, "step": 5606, "time_per_iteration": 2.9303863048553467 }, { "auxiliary_loss_clip": 0.01115953, "auxiliary_loss_mlp": 0.0103649, "balance_loss_clip": 1.04275584, "balance_loss_mlp": 1.02058792, "epoch": 0.6742018878133831, "flos": 21097015971840.0, "grad_norm": 2.2961334007246643, "language_loss": 0.67657161, "learning_rate": 1.0137482704996388e-06, "loss": 0.69809604, "num_input_tokens_seen": 120614700, "step": 5607, "time_per_iteration": 2.9130945205688477 }, { "auxiliary_loss_clip": 0.01102309, "auxiliary_loss_mlp": 0.01040416, "balance_loss_clip": 1.04308248, "balance_loss_mlp": 1.02416825, "epoch": 0.6743221307040221, "flos": 23549966726400.0, "grad_norm": 1.8365539681830645, "language_loss": 0.78942323, "learning_rate": 1.0130706727252461e-06, "loss": 0.8108505, "num_input_tokens_seen": 120631755, "step": 5608, "time_per_iteration": 2.6618449687957764 }, { "auxiliary_loss_clip": 0.01103124, "auxiliary_loss_mlp": 0.01042189, "balance_loss_clip": 1.04206741, "balance_loss_mlp": 1.02625728, "epoch": 0.6744423735946612, "flos": 16249542912000.0, "grad_norm": 2.389781184741381, "language_loss": 0.68409497, "learning_rate": 1.0123932246672468e-06, "loss": 0.70554817, "num_input_tokens_seen": 120645900, "step": 5609, "time_per_iteration": 2.6367955207824707 }, { "auxiliary_loss_clip": 0.01004857, "auxiliary_loss_mlp": 0.00755924, "balance_loss_clip": 1.01340449, "balance_loss_mlp": 1.00024354, "epoch": 0.6745626164853004, "flos": 57843257829120.0, "grad_norm": 0.7470058884697123, "language_loss": 0.55828655, "learning_rate": 1.0117159264284114e-06, "loss": 0.57589436, "num_input_tokens_seen": 120709070, "step": 5610, "time_per_iteration": 3.2834715843200684 }, { "auxiliary_loss_clip": 0.01112128, "auxiliary_loss_mlp": 0.01042363, "balance_loss_clip": 1.04304671, "balance_loss_mlp": 1.02519727, "epoch": 0.6746828593759394, "flos": 20485027025280.0, "grad_norm": 1.6828636276936788, "language_loss": 0.77179193, "learning_rate": 1.0110387781114837e-06, "loss": 0.79333687, "num_input_tokens_seen": 120727685, "step": 5611, "time_per_iteration": 2.6535043716430664 }, { "auxiliary_loss_clip": 0.01137072, "auxiliary_loss_mlp": 0.01036814, "balance_loss_clip": 1.04573989, "balance_loss_mlp": 1.0211854, "epoch": 0.6748031022665785, "flos": 19208223204480.0, "grad_norm": 1.9965015400444364, "language_loss": 0.77398002, "learning_rate": 1.0103617798191872e-06, "loss": 0.79571885, "num_input_tokens_seen": 120747160, "step": 5612, "time_per_iteration": 2.5623693466186523 }, { "auxiliary_loss_clip": 0.0110979, "auxiliary_loss_mlp": 0.01036255, "balance_loss_clip": 1.04475045, "balance_loss_mlp": 1.01974487, "epoch": 0.6749233451572175, "flos": 15195026407680.0, "grad_norm": 3.0394500630431347, "language_loss": 0.8251428, "learning_rate": 1.0096849316542217e-06, "loss": 0.84660321, "num_input_tokens_seen": 120763710, "step": 5613, "time_per_iteration": 2.6439595222473145 }, { "auxiliary_loss_clip": 0.01050384, "auxiliary_loss_mlp": 0.01047105, "balance_loss_clip": 1.03456497, "balance_loss_mlp": 1.02937806, "epoch": 0.6750435880478567, "flos": 26499489050880.0, "grad_norm": 2.2602708536819436, "language_loss": 0.74706984, "learning_rate": 1.0090082337192643e-06, "loss": 0.76804471, "num_input_tokens_seen": 120783355, "step": 5614, "time_per_iteration": 2.782957077026367 }, { "auxiliary_loss_clip": 0.01069406, "auxiliary_loss_mlp": 0.01035592, "balance_loss_clip": 1.03485072, "balance_loss_mlp": 1.01932001, "epoch": 0.6751638309384957, "flos": 23404313076480.0, "grad_norm": 2.149534759747858, "language_loss": 0.78372598, "learning_rate": 1.0083316861169705e-06, "loss": 0.80477595, "num_input_tokens_seen": 120802090, "step": 5615, "time_per_iteration": 2.743474245071411 }, { "auxiliary_loss_clip": 0.01106022, "auxiliary_loss_mlp": 0.01037868, "balance_loss_clip": 1.0401988, "balance_loss_mlp": 1.02054644, "epoch": 0.6752840738291348, "flos": 23441408847360.0, "grad_norm": 4.670375410333605, "language_loss": 0.7231518, "learning_rate": 1.0076552889499713e-06, "loss": 0.74459064, "num_input_tokens_seen": 120822855, "step": 5616, "time_per_iteration": 2.740901231765747 }, { "auxiliary_loss_clip": 0.01123578, "auxiliary_loss_mlp": 0.01040588, "balance_loss_clip": 1.04473019, "balance_loss_mlp": 1.02515078, "epoch": 0.675404316719774, "flos": 30335826257280.0, "grad_norm": 2.0520174906632365, "language_loss": 0.73630619, "learning_rate": 1.006979042320876e-06, "loss": 0.75794792, "num_input_tokens_seen": 120843070, "step": 5617, "time_per_iteration": 2.6611437797546387 }, { "auxiliary_loss_clip": 0.01107747, "auxiliary_loss_mlp": 0.01042217, "balance_loss_clip": 1.04144168, "balance_loss_mlp": 1.02427554, "epoch": 0.675524559610413, "flos": 23622613983360.0, "grad_norm": 2.2019674306378016, "language_loss": 0.63125509, "learning_rate": 1.0063029463322702e-06, "loss": 0.65275472, "num_input_tokens_seen": 120863345, "step": 5618, "time_per_iteration": 2.6897659301757812 }, { "auxiliary_loss_clip": 0.01076444, "auxiliary_loss_mlp": 0.0077259, "balance_loss_clip": 1.03490698, "balance_loss_mlp": 1.00031614, "epoch": 0.6756448025010521, "flos": 21248631279360.0, "grad_norm": 2.8595726115051274, "language_loss": 0.75895607, "learning_rate": 1.0056270010867164e-06, "loss": 0.77744639, "num_input_tokens_seen": 120880915, "step": 5619, "time_per_iteration": 2.797515630722046 }, { "auxiliary_loss_clip": 0.01114554, "auxiliary_loss_mlp": 0.01043415, "balance_loss_clip": 1.04072177, "balance_loss_mlp": 1.02660632, "epoch": 0.6757650453916912, "flos": 21646521210240.0, "grad_norm": 2.797503274943936, "language_loss": 0.7846216, "learning_rate": 1.004951206686758e-06, "loss": 0.80620122, "num_input_tokens_seen": 120899190, "step": 5620, "time_per_iteration": 2.623298406600952 }, { "auxiliary_loss_clip": 0.01118301, "auxiliary_loss_mlp": 0.01042175, "balance_loss_clip": 1.04204535, "balance_loss_mlp": 1.02378058, "epoch": 0.6758852882823303, "flos": 21795658479360.0, "grad_norm": 2.565870514330946, "language_loss": 0.71897757, "learning_rate": 1.0042755632349087e-06, "loss": 0.74058229, "num_input_tokens_seen": 120916080, "step": 5621, "time_per_iteration": 3.47005558013916 }, { "auxiliary_loss_clip": 0.01097398, "auxiliary_loss_mlp": 0.01036902, "balance_loss_clip": 1.04080737, "balance_loss_mlp": 1.01977134, "epoch": 0.6760055311729694, "flos": 27088783580160.0, "grad_norm": 3.40832622529602, "language_loss": 0.62797308, "learning_rate": 1.0036000708336653e-06, "loss": 0.64931607, "num_input_tokens_seen": 120935210, "step": 5622, "time_per_iteration": 3.6708180904388428 }, { "auxiliary_loss_clip": 0.01116387, "auxiliary_loss_mlp": 0.01034184, "balance_loss_clip": 1.0441308, "balance_loss_mlp": 1.01885319, "epoch": 0.6761257740636085, "flos": 17999792922240.0, "grad_norm": 2.4382554326758217, "language_loss": 0.79616666, "learning_rate": 1.0029247295854984e-06, "loss": 0.81767237, "num_input_tokens_seen": 120951830, "step": 5623, "time_per_iteration": 2.617955446243286 }, { "auxiliary_loss_clip": 0.01103266, "auxiliary_loss_mlp": 0.01040769, "balance_loss_clip": 1.04503226, "balance_loss_mlp": 1.02574301, "epoch": 0.6762460169542476, "flos": 15121912273920.0, "grad_norm": 1.7958544717791232, "language_loss": 0.71835923, "learning_rate": 1.0022495395928588e-06, "loss": 0.73979956, "num_input_tokens_seen": 120970310, "step": 5624, "time_per_iteration": 2.6692302227020264 }, { "auxiliary_loss_clip": 0.01042458, "auxiliary_loss_mlp": 0.01001903, "balance_loss_clip": 1.01332414, "balance_loss_mlp": 1.00056219, "epoch": 0.6763662598448866, "flos": 67886970030720.0, "grad_norm": 0.7920107499313551, "language_loss": 0.62308425, "learning_rate": 1.0015745009581697e-06, "loss": 0.64352787, "num_input_tokens_seen": 121031915, "step": 5625, "time_per_iteration": 3.172126531600952 }, { "auxiliary_loss_clip": 0.0112145, "auxiliary_loss_mlp": 0.01042533, "balance_loss_clip": 1.04348302, "balance_loss_mlp": 1.0268451, "epoch": 0.6764865027355258, "flos": 20631829910400.0, "grad_norm": 2.180429554647178, "language_loss": 0.66805863, "learning_rate": 1.0008996137838343e-06, "loss": 0.68969846, "num_input_tokens_seen": 121050890, "step": 5626, "time_per_iteration": 2.5813701152801514 }, { "auxiliary_loss_clip": 0.0114146, "auxiliary_loss_mlp": 0.01040949, "balance_loss_clip": 1.04488111, "balance_loss_mlp": 1.02346146, "epoch": 0.6766067456261649, "flos": 21215809226880.0, "grad_norm": 2.0842904545510867, "language_loss": 0.80277318, "learning_rate": 1.000224878172234e-06, "loss": 0.82459724, "num_input_tokens_seen": 121070015, "step": 5627, "time_per_iteration": 2.56622314453125 }, { "auxiliary_loss_clip": 0.01124036, "auxiliary_loss_mlp": 0.01035006, "balance_loss_clip": 1.04294407, "balance_loss_mlp": 1.01947308, "epoch": 0.6767269885168039, "flos": 19938251220480.0, "grad_norm": 2.2696444387108827, "language_loss": 0.73088932, "learning_rate": 9.99550294225724e-07, "loss": 0.75247979, "num_input_tokens_seen": 121089170, "step": 5628, "time_per_iteration": 3.4394991397857666 }, { "auxiliary_loss_clip": 0.01087127, "auxiliary_loss_mlp": 0.01038796, "balance_loss_clip": 1.03895879, "balance_loss_mlp": 1.02241659, "epoch": 0.6768472314074431, "flos": 20814076540800.0, "grad_norm": 2.033124922893747, "language_loss": 0.72816962, "learning_rate": 9.988758620466402e-07, "loss": 0.74942887, "num_input_tokens_seen": 121108040, "step": 5629, "time_per_iteration": 2.694573163986206 }, { "auxiliary_loss_clip": 0.01081147, "auxiliary_loss_mlp": 0.01039577, "balance_loss_clip": 1.04082394, "balance_loss_mlp": 1.02481902, "epoch": 0.6769674742980821, "flos": 23186012169600.0, "grad_norm": 1.8457618067177977, "language_loss": 0.76411593, "learning_rate": 9.982015817372917e-07, "loss": 0.7853232, "num_input_tokens_seen": 121128480, "step": 5630, "time_per_iteration": 4.246249198913574 }, { "auxiliary_loss_clip": 0.01084848, "auxiliary_loss_mlp": 0.01038934, "balance_loss_clip": 1.03845, "balance_loss_mlp": 1.02098131, "epoch": 0.6770877171887212, "flos": 24242934885120.0, "grad_norm": 2.164997220909147, "language_loss": 0.82092071, "learning_rate": 9.975274533999657e-07, "loss": 0.84215856, "num_input_tokens_seen": 121148010, "step": 5631, "time_per_iteration": 2.7417540550231934 }, { "auxiliary_loss_clip": 0.0113953, "auxiliary_loss_mlp": 0.01039634, "balance_loss_clip": 1.04526949, "balance_loss_mlp": 1.02329063, "epoch": 0.6772079600793603, "flos": 18141567903360.0, "grad_norm": 2.8234950128717737, "language_loss": 0.83352244, "learning_rate": 9.96853477136929e-07, "loss": 0.85531408, "num_input_tokens_seen": 121162755, "step": 5632, "time_per_iteration": 2.551154136657715 }, { "auxiliary_loss_clip": 0.01093563, "auxiliary_loss_mlp": 0.01041236, "balance_loss_clip": 1.04016423, "balance_loss_mlp": 1.02464151, "epoch": 0.6773282029699994, "flos": 22452069571200.0, "grad_norm": 2.0103551081256428, "language_loss": 0.75120962, "learning_rate": 9.96179653050422e-07, "loss": 0.77255762, "num_input_tokens_seen": 121182915, "step": 5633, "time_per_iteration": 2.6410281658172607 }, { "auxiliary_loss_clip": 0.01091456, "auxiliary_loss_mlp": 0.01041266, "balance_loss_clip": 1.03826725, "balance_loss_mlp": 1.02350366, "epoch": 0.6774484458606385, "flos": 18693730748160.0, "grad_norm": 3.063358290739641, "language_loss": 0.73953998, "learning_rate": 9.955059812426635e-07, "loss": 0.76086718, "num_input_tokens_seen": 121200445, "step": 5634, "time_per_iteration": 2.616084575653076 }, { "auxiliary_loss_clip": 0.01137141, "auxiliary_loss_mlp": 0.01042344, "balance_loss_clip": 1.04654932, "balance_loss_mlp": 1.02668607, "epoch": 0.6775686887512776, "flos": 25994046821760.0, "grad_norm": 2.047597383323994, "language_loss": 0.82890451, "learning_rate": 9.948324618158493e-07, "loss": 0.85069937, "num_input_tokens_seen": 121220785, "step": 5635, "time_per_iteration": 2.6335952281951904 }, { "auxiliary_loss_clip": 0.01126159, "auxiliary_loss_mlp": 0.01040016, "balance_loss_clip": 1.04215276, "balance_loss_mlp": 1.02253962, "epoch": 0.6776889316419167, "flos": 13587987922560.0, "grad_norm": 2.652693131587329, "language_loss": 0.77628165, "learning_rate": 9.941590948721502e-07, "loss": 0.79794335, "num_input_tokens_seen": 121237985, "step": 5636, "time_per_iteration": 2.573044538497925 }, { "auxiliary_loss_clip": 0.01107619, "auxiliary_loss_mlp": 0.01032895, "balance_loss_clip": 1.04330993, "balance_loss_mlp": 1.01738012, "epoch": 0.6778091745325557, "flos": 27601121220480.0, "grad_norm": 2.2056943598624827, "language_loss": 0.76685834, "learning_rate": 9.934858805137188e-07, "loss": 0.78826344, "num_input_tokens_seen": 121258635, "step": 5637, "time_per_iteration": 2.6948299407958984 }, { "auxiliary_loss_clip": 0.01118193, "auxiliary_loss_mlp": 0.01038405, "balance_loss_clip": 1.04311013, "balance_loss_mlp": 1.02363467, "epoch": 0.6779294174231949, "flos": 18734058743040.0, "grad_norm": 1.6529344962107566, "language_loss": 0.80821908, "learning_rate": 9.92812818842677e-07, "loss": 0.82978499, "num_input_tokens_seen": 121277810, "step": 5638, "time_per_iteration": 2.5655674934387207 }, { "auxiliary_loss_clip": 0.01117097, "auxiliary_loss_mlp": 0.01040211, "balance_loss_clip": 1.04234529, "balance_loss_mlp": 1.02391469, "epoch": 0.678049660313834, "flos": 45873797765760.0, "grad_norm": 1.898186243422223, "language_loss": 0.64124846, "learning_rate": 9.921399099611306e-07, "loss": 0.66282153, "num_input_tokens_seen": 121298975, "step": 5639, "time_per_iteration": 2.910878896713257 }, { "auxiliary_loss_clip": 0.01110608, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.04182482, "balance_loss_mlp": 1.02176976, "epoch": 0.678169903204473, "flos": 19974556892160.0, "grad_norm": 1.7510305399492752, "language_loss": 0.69142729, "learning_rate": 9.914671539711588e-07, "loss": 0.71290123, "num_input_tokens_seen": 121318495, "step": 5640, "time_per_iteration": 2.6691653728485107 }, { "auxiliary_loss_clip": 0.01050031, "auxiliary_loss_mlp": 0.00772895, "balance_loss_clip": 1.03549731, "balance_loss_mlp": 1.00039494, "epoch": 0.6782901460951122, "flos": 21395613732480.0, "grad_norm": 2.309593079395345, "language_loss": 0.78540534, "learning_rate": 9.90794550974817e-07, "loss": 0.80363458, "num_input_tokens_seen": 121338890, "step": 5641, "time_per_iteration": 3.212324857711792 }, { "auxiliary_loss_clip": 0.01095847, "auxiliary_loss_mlp": 0.01039728, "balance_loss_clip": 1.04102325, "balance_loss_mlp": 1.02314568, "epoch": 0.6784103889857512, "flos": 21434002392960.0, "grad_norm": 2.536710880531313, "language_loss": 0.81672537, "learning_rate": 9.901221010741407e-07, "loss": 0.83808112, "num_input_tokens_seen": 121358210, "step": 5642, "time_per_iteration": 3.3442325592041016 }, { "auxiliary_loss_clip": 0.01125543, "auxiliary_loss_mlp": 0.0103933, "balance_loss_clip": 1.04410815, "balance_loss_mlp": 1.02416039, "epoch": 0.6785306318763903, "flos": 32671923091200.0, "grad_norm": 2.08547575140747, "language_loss": 0.75051814, "learning_rate": 9.894498043711375e-07, "loss": 0.77216685, "num_input_tokens_seen": 121379955, "step": 5643, "time_per_iteration": 2.7268805503845215 }, { "auxiliary_loss_clip": 0.01106403, "auxiliary_loss_mlp": 0.01038211, "balance_loss_clip": 1.03901219, "balance_loss_mlp": 1.02183127, "epoch": 0.6786508747670293, "flos": 25632139340160.0, "grad_norm": 2.9604735259241832, "language_loss": 0.69430035, "learning_rate": 9.887776609677962e-07, "loss": 0.71574652, "num_input_tokens_seen": 121401325, "step": 5644, "time_per_iteration": 2.696131706237793 }, { "auxiliary_loss_clip": 0.01087162, "auxiliary_loss_mlp": 0.01044315, "balance_loss_clip": 1.03659439, "balance_loss_mlp": 1.02761436, "epoch": 0.6787711176576685, "flos": 19171881619200.0, "grad_norm": 1.6664248863062654, "language_loss": 0.72613567, "learning_rate": 9.88105670966079e-07, "loss": 0.74745047, "num_input_tokens_seen": 121419785, "step": 5645, "time_per_iteration": 2.6303305625915527 }, { "auxiliary_loss_clip": 0.01073766, "auxiliary_loss_mlp": 0.01037556, "balance_loss_clip": 1.03844309, "balance_loss_mlp": 1.02149808, "epoch": 0.6788913605483076, "flos": 13985159581440.0, "grad_norm": 2.13370459964907, "language_loss": 0.79135382, "learning_rate": 9.874338344679283e-07, "loss": 0.81246698, "num_input_tokens_seen": 121435630, "step": 5646, "time_per_iteration": 2.703404664993286 }, { "auxiliary_loss_clip": 0.01133777, "auxiliary_loss_mlp": 0.01038483, "balance_loss_clip": 1.04481149, "balance_loss_mlp": 1.02376664, "epoch": 0.6790116034389466, "flos": 22017586659840.0, "grad_norm": 1.7894245432911264, "language_loss": 0.73969674, "learning_rate": 9.86762151575259e-07, "loss": 0.7614193, "num_input_tokens_seen": 121455625, "step": 5647, "time_per_iteration": 4.427554130554199 }, { "auxiliary_loss_clip": 0.0108194, "auxiliary_loss_mlp": 0.00770703, "balance_loss_clip": 1.03925228, "balance_loss_mlp": 1.00038564, "epoch": 0.6791318463295858, "flos": 20922454851840.0, "grad_norm": 1.451650651322282, "language_loss": 0.80386138, "learning_rate": 9.860906223899651e-07, "loss": 0.82238781, "num_input_tokens_seen": 121475020, "step": 5648, "time_per_iteration": 3.7761285305023193 }, { "auxiliary_loss_clip": 0.01111799, "auxiliary_loss_mlp": 0.01041808, "balance_loss_clip": 1.0401535, "balance_loss_mlp": 1.0260129, "epoch": 0.6792520892202248, "flos": 28512749422080.0, "grad_norm": 1.6452822039810022, "language_loss": 0.760975, "learning_rate": 9.854192470139184e-07, "loss": 0.78251112, "num_input_tokens_seen": 121496500, "step": 5649, "time_per_iteration": 2.7005908489227295 }, { "auxiliary_loss_clip": 0.01109701, "auxiliary_loss_mlp": 0.01037341, "balance_loss_clip": 1.0423429, "balance_loss_mlp": 1.02114022, "epoch": 0.6793723321108639, "flos": 20011904058240.0, "grad_norm": 2.849560495044611, "language_loss": 0.71840376, "learning_rate": 9.847480255489645e-07, "loss": 0.73987412, "num_input_tokens_seen": 121515525, "step": 5650, "time_per_iteration": 2.659198522567749 }, { "auxiliary_loss_clip": 0.01114218, "auxiliary_loss_mlp": 0.01044357, "balance_loss_clip": 1.04183078, "balance_loss_mlp": 1.02747655, "epoch": 0.6794925750015031, "flos": 26649488246400.0, "grad_norm": 1.887139070736837, "language_loss": 0.69277298, "learning_rate": 9.840769580969295e-07, "loss": 0.71435881, "num_input_tokens_seen": 121535965, "step": 5651, "time_per_iteration": 2.6589419841766357 }, { "auxiliary_loss_clip": 0.01116352, "auxiliary_loss_mlp": 0.01040057, "balance_loss_clip": 1.04186916, "balance_loss_mlp": 1.02246165, "epoch": 0.6796128178921421, "flos": 21580374314880.0, "grad_norm": 1.9223298027779248, "language_loss": 0.79666114, "learning_rate": 9.834060447596114e-07, "loss": 0.81822526, "num_input_tokens_seen": 121555235, "step": 5652, "time_per_iteration": 2.604254722595215 }, { "auxiliary_loss_clip": 0.0112536, "auxiliary_loss_mlp": 0.01040395, "balance_loss_clip": 1.04292917, "balance_loss_mlp": 1.02401602, "epoch": 0.6797330607827812, "flos": 22492002516480.0, "grad_norm": 1.8627609123561744, "language_loss": 0.78380764, "learning_rate": 9.827352856387868e-07, "loss": 0.80546522, "num_input_tokens_seen": 121574945, "step": 5653, "time_per_iteration": 3.5469284057617188 }, { "auxiliary_loss_clip": 0.00999407, "auxiliary_loss_mlp": 0.01020322, "balance_loss_clip": 1.01150441, "balance_loss_mlp": 1.01816428, "epoch": 0.6798533036734203, "flos": 66306648286080.0, "grad_norm": 0.7811062581055784, "language_loss": 0.64247859, "learning_rate": 9.820646808362118e-07, "loss": 0.66267598, "num_input_tokens_seen": 121641200, "step": 5654, "time_per_iteration": 3.3725969791412354 }, { "auxiliary_loss_clip": 0.01109383, "auxiliary_loss_mlp": 0.01040235, "balance_loss_clip": 1.04399574, "balance_loss_mlp": 1.02355766, "epoch": 0.6799735465640594, "flos": 16180163792640.0, "grad_norm": 2.2941237271753354, "language_loss": 0.73488206, "learning_rate": 9.813942304536154e-07, "loss": 0.75637823, "num_input_tokens_seen": 121659170, "step": 5655, "time_per_iteration": 3.502713680267334 }, { "auxiliary_loss_clip": 0.01111807, "auxiliary_loss_mlp": 0.01036823, "balance_loss_clip": 1.04179811, "balance_loss_mlp": 1.02201104, "epoch": 0.6800937894546984, "flos": 22125749489280.0, "grad_norm": 2.0891005302978183, "language_loss": 0.63795239, "learning_rate": 9.807239345927043e-07, "loss": 0.65943873, "num_input_tokens_seen": 121679180, "step": 5656, "time_per_iteration": 2.630645513534546 }, { "auxiliary_loss_clip": 0.01108662, "auxiliary_loss_mlp": 0.0103562, "balance_loss_clip": 1.03844762, "balance_loss_mlp": 1.02038467, "epoch": 0.6802140323453376, "flos": 31612953300480.0, "grad_norm": 2.120617773259194, "language_loss": 0.72217524, "learning_rate": 9.80053793355162e-07, "loss": 0.74361801, "num_input_tokens_seen": 121697875, "step": 5657, "time_per_iteration": 2.7423248291015625 }, { "auxiliary_loss_clip": 0.01077138, "auxiliary_loss_mlp": 0.0103921, "balance_loss_clip": 1.03793859, "balance_loss_mlp": 1.02303362, "epoch": 0.6803342752359767, "flos": 17712938908800.0, "grad_norm": 2.0976463455365684, "language_loss": 0.75130403, "learning_rate": 9.793838068426472e-07, "loss": 0.77246749, "num_input_tokens_seen": 121715570, "step": 5658, "time_per_iteration": 2.6631503105163574 }, { "auxiliary_loss_clip": 0.01134644, "auxiliary_loss_mlp": 0.0103622, "balance_loss_clip": 1.04397488, "balance_loss_mlp": 1.01981699, "epoch": 0.6804545181266157, "flos": 11326800902400.0, "grad_norm": 2.1091370822265407, "language_loss": 0.609366, "learning_rate": 9.78713975156799e-07, "loss": 0.63107461, "num_input_tokens_seen": 121731435, "step": 5659, "time_per_iteration": 2.562781810760498 }, { "auxiliary_loss_clip": 0.0109705, "auxiliary_loss_mlp": 0.01041003, "balance_loss_clip": 1.04145384, "balance_loss_mlp": 1.02467167, "epoch": 0.6805747610172549, "flos": 29350976181120.0, "grad_norm": 1.7320838275083337, "language_loss": 0.71740627, "learning_rate": 9.780442983992273e-07, "loss": 0.73878676, "num_input_tokens_seen": 121749950, "step": 5660, "time_per_iteration": 2.7418372631073 }, { "auxiliary_loss_clip": 0.01104683, "auxiliary_loss_mlp": 0.01040427, "balance_loss_clip": 1.04087591, "balance_loss_mlp": 1.02372575, "epoch": 0.680695003907894, "flos": 37631868612480.0, "grad_norm": 1.6863389174883632, "language_loss": 0.71977317, "learning_rate": 9.773747766715238e-07, "loss": 0.74122423, "num_input_tokens_seen": 121770770, "step": 5661, "time_per_iteration": 2.7965147495269775 }, { "auxiliary_loss_clip": 0.01113282, "auxiliary_loss_mlp": 0.01034731, "balance_loss_clip": 1.04308629, "balance_loss_mlp": 1.01788688, "epoch": 0.680815246798533, "flos": 22127365601280.0, "grad_norm": 1.8720928507666723, "language_loss": 0.80116612, "learning_rate": 9.767054100752536e-07, "loss": 0.82264626, "num_input_tokens_seen": 121790720, "step": 5662, "time_per_iteration": 2.6418399810791016 }, { "auxiliary_loss_clip": 0.01096144, "auxiliary_loss_mlp": 0.01039265, "balance_loss_clip": 1.04050541, "balance_loss_mlp": 1.02237296, "epoch": 0.6809354896891722, "flos": 17201822330880.0, "grad_norm": 4.236664560156077, "language_loss": 0.81409764, "learning_rate": 9.760361987119584e-07, "loss": 0.83545172, "num_input_tokens_seen": 121808455, "step": 5663, "time_per_iteration": 2.6919403076171875 }, { "auxiliary_loss_clip": 0.01109629, "auxiliary_loss_mlp": 0.01044251, "balance_loss_clip": 1.03984916, "balance_loss_mlp": 1.02657223, "epoch": 0.6810557325798112, "flos": 12458166554880.0, "grad_norm": 2.876474566698573, "language_loss": 0.67825007, "learning_rate": 9.753671426831592e-07, "loss": 0.69978887, "num_input_tokens_seen": 121824470, "step": 5664, "time_per_iteration": 2.6198911666870117 }, { "auxiliary_loss_clip": 0.01113466, "auxiliary_loss_mlp": 0.01043002, "balance_loss_clip": 1.03902221, "balance_loss_mlp": 1.02656329, "epoch": 0.6811759754704503, "flos": 22156165330560.0, "grad_norm": 1.822624665169865, "language_loss": 0.79823941, "learning_rate": 9.746982420903483e-07, "loss": 0.81980407, "num_input_tokens_seen": 121842665, "step": 5665, "time_per_iteration": 2.6798272132873535 }, { "auxiliary_loss_clip": 0.01116375, "auxiliary_loss_mlp": 0.01037015, "balance_loss_clip": 1.04214406, "balance_loss_mlp": 1.02226913, "epoch": 0.6812962183610894, "flos": 17525377065600.0, "grad_norm": 1.5355574508337941, "language_loss": 0.74917299, "learning_rate": 9.740294970349993e-07, "loss": 0.77070689, "num_input_tokens_seen": 121859080, "step": 5666, "time_per_iteration": 2.57482647895813 }, { "auxiliary_loss_clip": 0.01023544, "auxiliary_loss_mlp": 0.01002589, "balance_loss_clip": 1.01263213, "balance_loss_mlp": 1.00128937, "epoch": 0.6814164612517285, "flos": 60274480855680.0, "grad_norm": 0.8810228519315081, "language_loss": 0.60878885, "learning_rate": 9.733609076185594e-07, "loss": 0.62905014, "num_input_tokens_seen": 121915485, "step": 5667, "time_per_iteration": 3.085646390914917 }, { "auxiliary_loss_clip": 0.01126138, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.04542828, "balance_loss_mlp": 1.0247314, "epoch": 0.6815367041423676, "flos": 19317750750720.0, "grad_norm": 2.1058330202161435, "language_loss": 0.84114724, "learning_rate": 9.72692473942455e-07, "loss": 0.86280972, "num_input_tokens_seen": 121932710, "step": 5668, "time_per_iteration": 2.608222484588623 }, { "auxiliary_loss_clip": 0.01088388, "auxiliary_loss_mlp": 0.01035335, "balance_loss_clip": 1.04261303, "balance_loss_mlp": 1.01919413, "epoch": 0.6816569470330067, "flos": 22161696024960.0, "grad_norm": 1.6497319432405797, "language_loss": 0.77302158, "learning_rate": 9.720241961080849e-07, "loss": 0.79425877, "num_input_tokens_seen": 121952025, "step": 5669, "time_per_iteration": 2.720275640487671 }, { "auxiliary_loss_clip": 0.01132997, "auxiliary_loss_mlp": 0.01035011, "balance_loss_clip": 1.04228723, "balance_loss_mlp": 1.01970434, "epoch": 0.6817771899236458, "flos": 41463501137280.0, "grad_norm": 1.9734470183916155, "language_loss": 0.73011112, "learning_rate": 9.713560742168259e-07, "loss": 0.75179124, "num_input_tokens_seen": 121974650, "step": 5670, "time_per_iteration": 2.7829813957214355 }, { "auxiliary_loss_clip": 0.01097135, "auxiliary_loss_mlp": 0.01040508, "balance_loss_clip": 1.0396564, "balance_loss_mlp": 1.02470064, "epoch": 0.6818974328142848, "flos": 21106138026240.0, "grad_norm": 3.012792084911751, "language_loss": 0.71243829, "learning_rate": 9.706881083700333e-07, "loss": 0.73381472, "num_input_tokens_seen": 121994335, "step": 5671, "time_per_iteration": 2.672760248184204 }, { "auxiliary_loss_clip": 0.01070719, "auxiliary_loss_mlp": 0.01040051, "balance_loss_clip": 1.04221189, "balance_loss_mlp": 1.02424335, "epoch": 0.682017675704924, "flos": 20441897769600.0, "grad_norm": 2.2879736034460394, "language_loss": 0.8218044, "learning_rate": 9.700202986690357e-07, "loss": 0.84291214, "num_input_tokens_seen": 122012635, "step": 5672, "time_per_iteration": 2.9234910011291504 }, { "auxiliary_loss_clip": 0.01121154, "auxiliary_loss_mlp": 0.00773019, "balance_loss_clip": 1.04114199, "balance_loss_mlp": 1.00040388, "epoch": 0.682137918595563, "flos": 20044438801920.0, "grad_norm": 3.3869583198502036, "language_loss": 0.66564858, "learning_rate": 9.693526452151413e-07, "loss": 0.68459034, "num_input_tokens_seen": 122031685, "step": 5673, "time_per_iteration": 3.598767042160034 }, { "auxiliary_loss_clip": 0.01103864, "auxiliary_loss_mlp": 0.01043768, "balance_loss_clip": 1.03984809, "balance_loss_mlp": 1.02672112, "epoch": 0.6822581614862021, "flos": 31684559063040.0, "grad_norm": 2.115796239625859, "language_loss": 0.75258619, "learning_rate": 9.686851481096305e-07, "loss": 0.77406251, "num_input_tokens_seen": 122052995, "step": 5674, "time_per_iteration": 2.7479326725006104 }, { "auxiliary_loss_clip": 0.01066621, "auxiliary_loss_mlp": 0.01036738, "balance_loss_clip": 1.03726172, "balance_loss_mlp": 1.02050149, "epoch": 0.6823784043768413, "flos": 23477570864640.0, "grad_norm": 5.292954203418371, "language_loss": 0.72119558, "learning_rate": 9.68017807453762e-07, "loss": 0.7422291, "num_input_tokens_seen": 122071740, "step": 5675, "time_per_iteration": 3.719707489013672 }, { "auxiliary_loss_clip": 0.01109328, "auxiliary_loss_mlp": 0.00772052, "balance_loss_clip": 1.04100728, "balance_loss_mlp": 1.0005163, "epoch": 0.6824986472674803, "flos": 14137134024960.0, "grad_norm": 2.6690210439495545, "language_loss": 0.73501301, "learning_rate": 9.673506233487721e-07, "loss": 0.7538268, "num_input_tokens_seen": 122089705, "step": 5676, "time_per_iteration": 2.6183953285217285 }, { "auxiliary_loss_clip": 0.01109193, "auxiliary_loss_mlp": 0.0077145, "balance_loss_clip": 1.04256988, "balance_loss_mlp": 1.00041556, "epoch": 0.6826188901581194, "flos": 21504997624320.0, "grad_norm": 1.7130076933216911, "language_loss": 0.85902101, "learning_rate": 9.666835958958717e-07, "loss": 0.87782747, "num_input_tokens_seen": 122109025, "step": 5677, "time_per_iteration": 2.638400077819824 }, { "auxiliary_loss_clip": 0.01134646, "auxiliary_loss_mlp": 0.01042368, "balance_loss_clip": 1.04562736, "balance_loss_mlp": 1.02682304, "epoch": 0.6827391330487584, "flos": 20810126044800.0, "grad_norm": 3.006183006569046, "language_loss": 0.80801433, "learning_rate": 9.660167251962484e-07, "loss": 0.82978445, "num_input_tokens_seen": 122127385, "step": 5678, "time_per_iteration": 2.571798801422119 }, { "auxiliary_loss_clip": 0.01100454, "auxiliary_loss_mlp": 0.01040185, "balance_loss_clip": 1.04066169, "balance_loss_mlp": 1.02526593, "epoch": 0.6828593759393976, "flos": 21688788539520.0, "grad_norm": 1.5216395744357762, "language_loss": 0.77965629, "learning_rate": 9.653500113510654e-07, "loss": 0.8010627, "num_input_tokens_seen": 122146500, "step": 5679, "time_per_iteration": 2.6786632537841797 }, { "auxiliary_loss_clip": 0.01105666, "auxiliary_loss_mlp": 0.0105781, "balance_loss_clip": 1.04009056, "balance_loss_mlp": 1.04002404, "epoch": 0.6829796188300367, "flos": 25337707557120.0, "grad_norm": 4.9742907969513785, "language_loss": 0.6736623, "learning_rate": 9.646834544614627e-07, "loss": 0.69529712, "num_input_tokens_seen": 122167000, "step": 5680, "time_per_iteration": 3.8750452995300293 }, { "auxiliary_loss_clip": 0.01099473, "auxiliary_loss_mlp": 0.01048664, "balance_loss_clip": 1.03949368, "balance_loss_mlp": 1.0312115, "epoch": 0.6830998617206757, "flos": 20704800389760.0, "grad_norm": 1.8909850455691037, "language_loss": 0.76098639, "learning_rate": 9.64017054628558e-07, "loss": 0.78246778, "num_input_tokens_seen": 122185825, "step": 5681, "time_per_iteration": 3.539670944213867 }, { "auxiliary_loss_clip": 0.01086595, "auxiliary_loss_mlp": 0.01041591, "balance_loss_clip": 1.03662288, "balance_loss_mlp": 1.02641582, "epoch": 0.6832201046113149, "flos": 21726638496000.0, "grad_norm": 1.8124679845610425, "language_loss": 0.79067415, "learning_rate": 9.63350811953441e-07, "loss": 0.81195599, "num_input_tokens_seen": 122206200, "step": 5682, "time_per_iteration": 2.7258899211883545 }, { "auxiliary_loss_clip": 0.01101099, "auxiliary_loss_mlp": 0.01033101, "balance_loss_clip": 1.04044545, "balance_loss_mlp": 1.01703143, "epoch": 0.6833403475019539, "flos": 19536554448000.0, "grad_norm": 2.3477846188900413, "language_loss": 0.70372224, "learning_rate": 9.626847265371826e-07, "loss": 0.72506416, "num_input_tokens_seen": 122225520, "step": 5683, "time_per_iteration": 2.729900598526001 }, { "auxiliary_loss_clip": 0.01101077, "auxiliary_loss_mlp": 0.01041164, "balance_loss_clip": 1.03760195, "balance_loss_mlp": 1.0252142, "epoch": 0.683460590392593, "flos": 19352153001600.0, "grad_norm": 3.2861949572738736, "language_loss": 0.7865935, "learning_rate": 9.620187984808262e-07, "loss": 0.80801594, "num_input_tokens_seen": 122244320, "step": 5684, "time_per_iteration": 2.670084238052368 }, { "auxiliary_loss_clip": 0.01108762, "auxiliary_loss_mlp": 0.00771788, "balance_loss_clip": 1.04265618, "balance_loss_mlp": 1.00040996, "epoch": 0.6835808332832322, "flos": 23288500650240.0, "grad_norm": 1.8345337309131233, "language_loss": 0.86170876, "learning_rate": 9.613530278853919e-07, "loss": 0.88051426, "num_input_tokens_seen": 122264295, "step": 5685, "time_per_iteration": 2.745650053024292 }, { "auxiliary_loss_clip": 0.01117749, "auxiliary_loss_mlp": 0.01040199, "balance_loss_clip": 1.04250693, "balance_loss_mlp": 1.02455878, "epoch": 0.6837010761738712, "flos": 21653416621440.0, "grad_norm": 2.0550855505780175, "language_loss": 0.7426151, "learning_rate": 9.60687414851879e-07, "loss": 0.76419461, "num_input_tokens_seen": 122285300, "step": 5686, "time_per_iteration": 2.6474218368530273 }, { "auxiliary_loss_clip": 0.01108835, "auxiliary_loss_mlp": 0.01040262, "balance_loss_clip": 1.03994358, "balance_loss_mlp": 1.0248307, "epoch": 0.6838213190645103, "flos": 17566387418880.0, "grad_norm": 2.272464389070063, "language_loss": 0.77832431, "learning_rate": 9.600219594812575e-07, "loss": 0.7998153, "num_input_tokens_seen": 122303240, "step": 5687, "time_per_iteration": 2.6457037925720215 }, { "auxiliary_loss_clip": 0.01129535, "auxiliary_loss_mlp": 0.01034867, "balance_loss_clip": 1.0418222, "balance_loss_mlp": 1.02068734, "epoch": 0.6839415619551494, "flos": 23112538899840.0, "grad_norm": 1.7010408415009224, "language_loss": 0.72901678, "learning_rate": 9.593566618744786e-07, "loss": 0.75066084, "num_input_tokens_seen": 122323390, "step": 5688, "time_per_iteration": 2.58829665184021 }, { "auxiliary_loss_clip": 0.01135185, "auxiliary_loss_mlp": 0.0104125, "balance_loss_clip": 1.04500508, "balance_loss_mlp": 1.02537096, "epoch": 0.6840618048457885, "flos": 22127868391680.0, "grad_norm": 1.764307239286021, "language_loss": 0.74015743, "learning_rate": 9.58691522132466e-07, "loss": 0.76192176, "num_input_tokens_seen": 122342200, "step": 5689, "time_per_iteration": 2.589169979095459 }, { "auxiliary_loss_clip": 0.01115242, "auxiliary_loss_mlp": 0.01037271, "balance_loss_clip": 1.0435425, "balance_loss_mlp": 1.02179766, "epoch": 0.6841820477364275, "flos": 22015898720640.0, "grad_norm": 2.3602520745535545, "language_loss": 0.84708107, "learning_rate": 9.58026540356123e-07, "loss": 0.86860621, "num_input_tokens_seen": 122360465, "step": 5690, "time_per_iteration": 2.643360137939453 }, { "auxiliary_loss_clip": 0.01125847, "auxiliary_loss_mlp": 0.01042212, "balance_loss_clip": 1.04359508, "balance_loss_mlp": 1.02566552, "epoch": 0.6843022906270667, "flos": 24900531125760.0, "grad_norm": 1.5992564234293176, "language_loss": 0.86631685, "learning_rate": 9.573617166463246e-07, "loss": 0.88799745, "num_input_tokens_seen": 122381680, "step": 5691, "time_per_iteration": 2.6516222953796387 }, { "auxiliary_loss_clip": 0.01112607, "auxiliary_loss_mlp": 0.01039342, "balance_loss_clip": 1.04238057, "balance_loss_mlp": 1.02342749, "epoch": 0.6844225335177058, "flos": 19969924037760.0, "grad_norm": 2.490299975457216, "language_loss": 0.60170352, "learning_rate": 9.56697051103924e-07, "loss": 0.62322307, "num_input_tokens_seen": 122399120, "step": 5692, "time_per_iteration": 2.655696392059326 }, { "auxiliary_loss_clip": 0.01102304, "auxiliary_loss_mlp": 0.01038386, "balance_loss_clip": 1.03909814, "balance_loss_mlp": 1.02316272, "epoch": 0.6845427764083448, "flos": 25883334126720.0, "grad_norm": 2.32548368302063, "language_loss": 0.81449455, "learning_rate": 9.560325438297522e-07, "loss": 0.8359015, "num_input_tokens_seen": 122417430, "step": 5693, "time_per_iteration": 2.7094929218292236 }, { "auxiliary_loss_clip": 0.01110249, "auxiliary_loss_mlp": 0.01043757, "balance_loss_clip": 1.04508078, "balance_loss_mlp": 1.0280453, "epoch": 0.684663019298984, "flos": 18880143356160.0, "grad_norm": 1.9876377138404535, "language_loss": 0.86699629, "learning_rate": 9.553681949246127e-07, "loss": 0.88853633, "num_input_tokens_seen": 122435055, "step": 5694, "time_per_iteration": 2.639697551727295 }, { "auxiliary_loss_clip": 0.01102467, "auxiliary_loss_mlp": 0.01039737, "balance_loss_clip": 1.04195726, "balance_loss_mlp": 1.02344084, "epoch": 0.684783262189623, "flos": 54193725302400.0, "grad_norm": 2.1434106743790453, "language_loss": 0.75600994, "learning_rate": 9.547040044892886e-07, "loss": 0.77743196, "num_input_tokens_seen": 122462570, "step": 5695, "time_per_iteration": 3.1118369102478027 }, { "auxiliary_loss_clip": 0.01031397, "auxiliary_loss_mlp": 0.00999851, "balance_loss_clip": 1.0104754, "balance_loss_mlp": 0.9984743, "epoch": 0.6849035050802621, "flos": 63970264143360.0, "grad_norm": 0.8572806097522686, "language_loss": 0.60060453, "learning_rate": 9.540399726245354e-07, "loss": 0.62091708, "num_input_tokens_seen": 122519275, "step": 5696, "time_per_iteration": 3.0840137004852295 }, { "auxiliary_loss_clip": 0.01107107, "auxiliary_loss_mlp": 0.01037529, "balance_loss_clip": 1.04065216, "balance_loss_mlp": 1.02054155, "epoch": 0.6850237479709013, "flos": 25224121774080.0, "grad_norm": 3.802665062304672, "language_loss": 0.68970305, "learning_rate": 9.533760994310859e-07, "loss": 0.71114945, "num_input_tokens_seen": 122539675, "step": 5697, "time_per_iteration": 2.6793596744537354 }, { "auxiliary_loss_clip": 0.01135783, "auxiliary_loss_mlp": 0.01037804, "balance_loss_clip": 1.04461741, "balance_loss_mlp": 1.02172303, "epoch": 0.6851439908615403, "flos": 19354128249600.0, "grad_norm": 2.0412563864842506, "language_loss": 0.7486639, "learning_rate": 9.527123850096508e-07, "loss": 0.77039981, "num_input_tokens_seen": 122558035, "step": 5698, "time_per_iteration": 2.6348769664764404 }, { "auxiliary_loss_clip": 0.01124663, "auxiliary_loss_mlp": 0.01036493, "balance_loss_clip": 1.0425818, "balance_loss_mlp": 1.02168775, "epoch": 0.6852642337521794, "flos": 23182133500800.0, "grad_norm": 3.1866673064856177, "language_loss": 0.71791846, "learning_rate": 9.520488294609142e-07, "loss": 0.73952997, "num_input_tokens_seen": 122576815, "step": 5699, "time_per_iteration": 3.5229408740997314 }, { "auxiliary_loss_clip": 0.01003566, "auxiliary_loss_mlp": 0.01002547, "balance_loss_clip": 1.0126853, "balance_loss_mlp": 1.00083053, "epoch": 0.6853844766428185, "flos": 62647206583680.0, "grad_norm": 0.7354940839812093, "language_loss": 0.53789151, "learning_rate": 9.513854328855368e-07, "loss": 0.55795264, "num_input_tokens_seen": 122634690, "step": 5700, "time_per_iteration": 4.257664203643799 }, { "auxiliary_loss_clip": 0.01129597, "auxiliary_loss_mlp": 0.01041858, "balance_loss_clip": 1.04215264, "balance_loss_mlp": 1.02597916, "epoch": 0.6855047195334576, "flos": 23437242869760.0, "grad_norm": 2.103474081961939, "language_loss": 0.81582481, "learning_rate": 9.507221953841558e-07, "loss": 0.83753937, "num_input_tokens_seen": 122652320, "step": 5701, "time_per_iteration": 2.6260318756103516 }, { "auxiliary_loss_clip": 0.01126195, "auxiliary_loss_mlp": 0.01042212, "balance_loss_clip": 1.04428673, "balance_loss_mlp": 1.02621388, "epoch": 0.6856249624240967, "flos": 20664831530880.0, "grad_norm": 5.374106703892345, "language_loss": 0.77723074, "learning_rate": 9.500591170573824e-07, "loss": 0.79891485, "num_input_tokens_seen": 122672340, "step": 5702, "time_per_iteration": 2.666498899459839 }, { "auxiliary_loss_clip": 0.01084518, "auxiliary_loss_mlp": 0.01037067, "balance_loss_clip": 1.03964782, "balance_loss_mlp": 1.02073514, "epoch": 0.6857452053147358, "flos": 17087302794240.0, "grad_norm": 2.3472530001043768, "language_loss": 0.74171317, "learning_rate": 9.493961980058078e-07, "loss": 0.76292908, "num_input_tokens_seen": 122689935, "step": 5703, "time_per_iteration": 2.7059919834136963 }, { "auxiliary_loss_clip": 0.0105742, "auxiliary_loss_mlp": 0.01045126, "balance_loss_clip": 1.03562582, "balance_loss_mlp": 1.02769804, "epoch": 0.6858654482053749, "flos": 30847266057600.0, "grad_norm": 1.8295133675385633, "language_loss": 0.67822969, "learning_rate": 9.48733438329993e-07, "loss": 0.69925511, "num_input_tokens_seen": 122710200, "step": 5704, "time_per_iteration": 2.839841842651367 }, { "auxiliary_loss_clip": 0.01131038, "auxiliary_loss_mlp": 0.00771738, "balance_loss_clip": 1.04444528, "balance_loss_mlp": 1.00043035, "epoch": 0.6859856910960139, "flos": 28877314510080.0, "grad_norm": 1.724964617584637, "language_loss": 0.74333608, "learning_rate": 9.480708381304807e-07, "loss": 0.76236391, "num_input_tokens_seen": 122731495, "step": 5705, "time_per_iteration": 2.6678340435028076 }, { "auxiliary_loss_clip": 0.01082235, "auxiliary_loss_mlp": 0.01036577, "balance_loss_clip": 1.04143119, "balance_loss_mlp": 1.02026963, "epoch": 0.6861059339866531, "flos": 19354523299200.0, "grad_norm": 2.031897823991528, "language_loss": 0.83472502, "learning_rate": 9.474083975077858e-07, "loss": 0.85591316, "num_input_tokens_seen": 122748620, "step": 5706, "time_per_iteration": 3.680568218231201 }, { "auxiliary_loss_clip": 0.01116929, "auxiliary_loss_mlp": 0.01041656, "balance_loss_clip": 1.04318428, "balance_loss_mlp": 1.0252651, "epoch": 0.6862261768772921, "flos": 22199976944640.0, "grad_norm": 2.6364342868232167, "language_loss": 0.79956198, "learning_rate": 9.467461165623994e-07, "loss": 0.8211478, "num_input_tokens_seen": 122767670, "step": 5707, "time_per_iteration": 3.5584800243377686 }, { "auxiliary_loss_clip": 0.01124388, "auxiliary_loss_mlp": 0.01038311, "balance_loss_clip": 1.04323566, "balance_loss_mlp": 1.02330899, "epoch": 0.6863464197679312, "flos": 26285677344000.0, "grad_norm": 3.529699373154508, "language_loss": 0.79471374, "learning_rate": 9.46083995394791e-07, "loss": 0.81634074, "num_input_tokens_seen": 122785480, "step": 5708, "time_per_iteration": 2.6856679916381836 }, { "auxiliary_loss_clip": 0.01121472, "auxiliary_loss_mlp": 0.00771631, "balance_loss_clip": 1.04326105, "balance_loss_mlp": 1.00050926, "epoch": 0.6864666626585703, "flos": 37815228564480.0, "grad_norm": 2.7117097279212308, "language_loss": 0.63545448, "learning_rate": 9.454220341054012e-07, "loss": 0.65438551, "num_input_tokens_seen": 122810265, "step": 5709, "time_per_iteration": 2.7612485885620117 }, { "auxiliary_loss_clip": 0.01096222, "auxiliary_loss_mlp": 0.01038148, "balance_loss_clip": 1.03920519, "balance_loss_mlp": 1.02209008, "epoch": 0.6865869055492094, "flos": 19391152193280.0, "grad_norm": 2.3435874063233992, "language_loss": 0.80574346, "learning_rate": 9.447602327946512e-07, "loss": 0.82708704, "num_input_tokens_seen": 122828905, "step": 5710, "time_per_iteration": 2.7169597148895264 }, { "auxiliary_loss_clip": 0.01109231, "auxiliary_loss_mlp": 0.01037607, "balance_loss_clip": 1.04132175, "balance_loss_mlp": 1.02090621, "epoch": 0.6867071484398485, "flos": 20375966355840.0, "grad_norm": 2.0690537958699156, "language_loss": 0.76824701, "learning_rate": 9.440985915629338e-07, "loss": 0.78971535, "num_input_tokens_seen": 122846235, "step": 5711, "time_per_iteration": 2.678595542907715 }, { "auxiliary_loss_clip": 0.01131761, "auxiliary_loss_mlp": 0.01036439, "balance_loss_clip": 1.04415083, "balance_loss_mlp": 1.02225959, "epoch": 0.6868273913304875, "flos": 15889143801600.0, "grad_norm": 2.205067341853911, "language_loss": 0.72823048, "learning_rate": 9.434371105106223e-07, "loss": 0.74991256, "num_input_tokens_seen": 122863835, "step": 5712, "time_per_iteration": 2.5296292304992676 }, { "auxiliary_loss_clip": 0.0109408, "auxiliary_loss_mlp": 0.01040583, "balance_loss_clip": 1.03926528, "balance_loss_mlp": 1.02359605, "epoch": 0.6869476342211267, "flos": 24462492768000.0, "grad_norm": 1.7081549875773026, "language_loss": 0.70591772, "learning_rate": 9.427757897380602e-07, "loss": 0.7272644, "num_input_tokens_seen": 122883235, "step": 5713, "time_per_iteration": 2.786869525909424 }, { "auxiliary_loss_clip": 0.01095671, "auxiliary_loss_mlp": 0.01037057, "balance_loss_clip": 1.04130697, "balance_loss_mlp": 1.02126169, "epoch": 0.6870678771117658, "flos": 18442571875200.0, "grad_norm": 2.220859682039579, "language_loss": 0.85448158, "learning_rate": 9.421146293455695e-07, "loss": 0.87580884, "num_input_tokens_seen": 122898975, "step": 5714, "time_per_iteration": 2.6679952144622803 }, { "auxiliary_loss_clip": 0.0110448, "auxiliary_loss_mlp": 0.01043274, "balance_loss_clip": 1.03953171, "balance_loss_mlp": 1.02738333, "epoch": 0.6871881200024048, "flos": 22200371994240.0, "grad_norm": 1.9181725266443108, "language_loss": 0.68277907, "learning_rate": 9.414536294334489e-07, "loss": 0.70425665, "num_input_tokens_seen": 122918995, "step": 5715, "time_per_iteration": 2.728663921356201 }, { "auxiliary_loss_clip": 0.01111115, "auxiliary_loss_mlp": 0.01038027, "balance_loss_clip": 1.03963637, "balance_loss_mlp": 1.02189755, "epoch": 0.687308362893044, "flos": 22127724737280.0, "grad_norm": 2.1185487711111017, "language_loss": 0.69485879, "learning_rate": 9.407927901019708e-07, "loss": 0.7163502, "num_input_tokens_seen": 122938125, "step": 5716, "time_per_iteration": 2.6770761013031006 }, { "auxiliary_loss_clip": 0.01122424, "auxiliary_loss_mlp": 0.01043761, "balance_loss_clip": 1.04349244, "balance_loss_mlp": 1.02771544, "epoch": 0.687428605783683, "flos": 25040546340480.0, "grad_norm": 2.126461116826187, "language_loss": 0.76855755, "learning_rate": 9.401321114513854e-07, "loss": 0.79021943, "num_input_tokens_seen": 122957020, "step": 5717, "time_per_iteration": 2.6944222450256348 }, { "auxiliary_loss_clip": 0.01136508, "auxiliary_loss_mlp": 0.01040258, "balance_loss_clip": 1.04480469, "balance_loss_mlp": 1.02316296, "epoch": 0.6875488486743221, "flos": 23770063313280.0, "grad_norm": 1.9732206729997743, "language_loss": 0.7524367, "learning_rate": 9.394715935819155e-07, "loss": 0.77420431, "num_input_tokens_seen": 122977410, "step": 5718, "time_per_iteration": 2.620208501815796 }, { "auxiliary_loss_clip": 0.01124471, "auxiliary_loss_mlp": 0.01042547, "balance_loss_clip": 1.04234529, "balance_loss_mlp": 1.02562547, "epoch": 0.6876690915649613, "flos": 25516937445120.0, "grad_norm": 3.271038007689352, "language_loss": 0.62648112, "learning_rate": 9.388112365937608e-07, "loss": 0.64815128, "num_input_tokens_seen": 122996875, "step": 5719, "time_per_iteration": 2.679077386856079 }, { "auxiliary_loss_clip": 0.01096594, "auxiliary_loss_mlp": 0.01046312, "balance_loss_clip": 1.04042768, "balance_loss_mlp": 1.02881193, "epoch": 0.6877893344556003, "flos": 19427996568960.0, "grad_norm": 3.2857700924809587, "language_loss": 0.82698441, "learning_rate": 9.381510405870985e-07, "loss": 0.84841347, "num_input_tokens_seen": 123015890, "step": 5720, "time_per_iteration": 2.667285919189453 }, { "auxiliary_loss_clip": 0.0112412, "auxiliary_loss_mlp": 0.01055956, "balance_loss_clip": 1.04540622, "balance_loss_mlp": 1.03743088, "epoch": 0.6879095773462394, "flos": 18661303745280.0, "grad_norm": 4.854186300361074, "language_loss": 0.77269912, "learning_rate": 9.374910056620791e-07, "loss": 0.79449987, "num_input_tokens_seen": 123034955, "step": 5721, "time_per_iteration": 2.656949520111084 }, { "auxiliary_loss_clip": 0.01127379, "auxiliary_loss_mlp": 0.01040296, "balance_loss_clip": 1.04491568, "balance_loss_mlp": 1.02403617, "epoch": 0.6880298202368785, "flos": 20883132437760.0, "grad_norm": 1.9847564791975807, "language_loss": 0.80958551, "learning_rate": 9.368311319188293e-07, "loss": 0.83126223, "num_input_tokens_seen": 123052770, "step": 5722, "time_per_iteration": 2.6348612308502197 }, { "auxiliary_loss_clip": 0.01097502, "auxiliary_loss_mlp": 0.01039079, "balance_loss_clip": 1.03971851, "balance_loss_mlp": 1.02389216, "epoch": 0.6881500631275176, "flos": 30153292318080.0, "grad_norm": 1.8353118153319308, "language_loss": 0.79422569, "learning_rate": 9.361714194574515e-07, "loss": 0.81559145, "num_input_tokens_seen": 123075105, "step": 5723, "time_per_iteration": 2.8203506469726562 }, { "auxiliary_loss_clip": 0.01039306, "auxiliary_loss_mlp": 0.01003715, "balance_loss_clip": 1.01006413, "balance_loss_mlp": 1.00217772, "epoch": 0.6882703060181566, "flos": 66181537215360.0, "grad_norm": 0.8968598159799116, "language_loss": 0.58263397, "learning_rate": 9.355118683780228e-07, "loss": 0.60306418, "num_input_tokens_seen": 123145175, "step": 5724, "time_per_iteration": 3.2684683799743652 }, { "auxiliary_loss_clip": 0.01135032, "auxiliary_loss_mlp": 0.01032569, "balance_loss_clip": 1.043787, "balance_loss_mlp": 1.01696491, "epoch": 0.6883905489087958, "flos": 18214646123520.0, "grad_norm": 3.094734150715932, "language_loss": 0.79441148, "learning_rate": 9.348524787805987e-07, "loss": 0.81608748, "num_input_tokens_seen": 123160365, "step": 5725, "time_per_iteration": 3.5297083854675293 }, { "auxiliary_loss_clip": 0.01097587, "auxiliary_loss_mlp": 0.01039949, "balance_loss_clip": 1.03607845, "balance_loss_mlp": 1.02325976, "epoch": 0.6885107917994349, "flos": 14056262553600.0, "grad_norm": 3.6211585206379953, "language_loss": 0.8511132, "learning_rate": 9.341932507652053e-07, "loss": 0.87248856, "num_input_tokens_seen": 123174855, "step": 5726, "time_per_iteration": 2.694460391998291 }, { "auxiliary_loss_clip": 0.01109711, "auxiliary_loss_mlp": 0.01046604, "balance_loss_clip": 1.03982723, "balance_loss_mlp": 1.02861524, "epoch": 0.6886310346900739, "flos": 28690722334080.0, "grad_norm": 1.9588362529034287, "language_loss": 0.79070938, "learning_rate": 9.335341844318489e-07, "loss": 0.81227255, "num_input_tokens_seen": 123194995, "step": 5727, "time_per_iteration": 3.710038900375366 }, { "auxiliary_loss_clip": 0.01109528, "auxiliary_loss_mlp": 0.0104197, "balance_loss_clip": 1.0411346, "balance_loss_mlp": 1.02550721, "epoch": 0.6887512775807131, "flos": 24535319592960.0, "grad_norm": 2.0695796747527497, "language_loss": 0.72906446, "learning_rate": 9.328752798805091e-07, "loss": 0.75057948, "num_input_tokens_seen": 123213465, "step": 5728, "time_per_iteration": 2.6763765811920166 }, { "auxiliary_loss_clip": 0.01126866, "auxiliary_loss_mlp": 0.0104438, "balance_loss_clip": 1.04390967, "balance_loss_mlp": 1.02820277, "epoch": 0.6888715204713521, "flos": 22414363269120.0, "grad_norm": 3.0291220200367457, "language_loss": 0.76367503, "learning_rate": 9.322165372111399e-07, "loss": 0.78538752, "num_input_tokens_seen": 123231610, "step": 5729, "time_per_iteration": 2.708177089691162 }, { "auxiliary_loss_clip": 0.01096151, "auxiliary_loss_mlp": 0.01043447, "balance_loss_clip": 1.04107714, "balance_loss_mlp": 1.02783108, "epoch": 0.6889917633619912, "flos": 22054323294720.0, "grad_norm": 7.046671161447685, "language_loss": 0.75616825, "learning_rate": 9.315579565236747e-07, "loss": 0.77756417, "num_input_tokens_seen": 123250715, "step": 5730, "time_per_iteration": 2.7057855129241943 }, { "auxiliary_loss_clip": 0.01103615, "auxiliary_loss_mlp": 0.01045967, "balance_loss_clip": 1.041399, "balance_loss_mlp": 1.02938449, "epoch": 0.6891120062526304, "flos": 23949724164480.0, "grad_norm": 1.907349388160701, "language_loss": 0.74294877, "learning_rate": 9.308995379180162e-07, "loss": 0.76444465, "num_input_tokens_seen": 123270270, "step": 5731, "time_per_iteration": 2.7462403774261475 }, { "auxiliary_loss_clip": 0.01030236, "auxiliary_loss_mlp": 0.01001202, "balance_loss_clip": 1.00988078, "balance_loss_mlp": 0.99962884, "epoch": 0.6892322491432694, "flos": 64117354337280.0, "grad_norm": 0.9079488170627814, "language_loss": 0.59513396, "learning_rate": 9.302412814940488e-07, "loss": 0.61544836, "num_input_tokens_seen": 123333045, "step": 5732, "time_per_iteration": 4.112678527832031 }, { "auxiliary_loss_clip": 0.01111601, "auxiliary_loss_mlp": 0.01040966, "balance_loss_clip": 1.04257905, "balance_loss_mlp": 1.0243547, "epoch": 0.6893524920339085, "flos": 23002436736000.0, "grad_norm": 3.270599899212196, "language_loss": 0.71092194, "learning_rate": 9.295831873516276e-07, "loss": 0.73244768, "num_input_tokens_seen": 123352320, "step": 5733, "time_per_iteration": 3.626084327697754 }, { "auxiliary_loss_clip": 0.0113394, "auxiliary_loss_mlp": 0.01034681, "balance_loss_clip": 1.04483986, "balance_loss_mlp": 1.02019715, "epoch": 0.6894727349245476, "flos": 21396260177280.0, "grad_norm": 14.292728754152048, "language_loss": 0.76019281, "learning_rate": 9.289252555905873e-07, "loss": 0.78187907, "num_input_tokens_seen": 123372400, "step": 5734, "time_per_iteration": 2.6170108318328857 }, { "auxiliary_loss_clip": 0.01125199, "auxiliary_loss_mlp": 0.0104602, "balance_loss_clip": 1.04445028, "balance_loss_mlp": 1.02869892, "epoch": 0.6895929778151867, "flos": 19865316654720.0, "grad_norm": 2.463220614752352, "language_loss": 0.75645727, "learning_rate": 9.282674863107334e-07, "loss": 0.77816945, "num_input_tokens_seen": 123390215, "step": 5735, "time_per_iteration": 2.648270606994629 }, { "auxiliary_loss_clip": 0.01121721, "auxiliary_loss_mlp": 0.01034983, "balance_loss_clip": 1.04508638, "balance_loss_mlp": 1.01941419, "epoch": 0.6897132207058257, "flos": 18179166464640.0, "grad_norm": 4.248036698316263, "language_loss": 0.76072788, "learning_rate": 9.276098796118488e-07, "loss": 0.78229493, "num_input_tokens_seen": 123406700, "step": 5736, "time_per_iteration": 2.793452024459839 }, { "auxiliary_loss_clip": 0.01110678, "auxiliary_loss_mlp": 0.01037014, "balance_loss_clip": 1.04134488, "balance_loss_mlp": 1.02129006, "epoch": 0.6898334635964649, "flos": 32561641359360.0, "grad_norm": 2.913400102281545, "language_loss": 0.65965903, "learning_rate": 9.269524355936938e-07, "loss": 0.68113589, "num_input_tokens_seen": 123429880, "step": 5737, "time_per_iteration": 2.778993844985962 }, { "auxiliary_loss_clip": 0.01104733, "auxiliary_loss_mlp": 0.01048914, "balance_loss_clip": 1.04182887, "balance_loss_mlp": 1.0325346, "epoch": 0.689953706487104, "flos": 22819004956800.0, "grad_norm": 1.7841355680305881, "language_loss": 0.84598696, "learning_rate": 9.262951543560002e-07, "loss": 0.86752343, "num_input_tokens_seen": 123449105, "step": 5738, "time_per_iteration": 2.6934401988983154 }, { "auxiliary_loss_clip": 0.01110479, "auxiliary_loss_mlp": 0.01038995, "balance_loss_clip": 1.04418671, "balance_loss_mlp": 1.02279437, "epoch": 0.690073949377743, "flos": 18515362786560.0, "grad_norm": 2.7275503087250788, "language_loss": 0.86369681, "learning_rate": 9.256380359984795e-07, "loss": 0.88519156, "num_input_tokens_seen": 123466215, "step": 5739, "time_per_iteration": 2.6149098873138428 }, { "auxiliary_loss_clip": 0.01090184, "auxiliary_loss_mlp": 0.01035038, "balance_loss_clip": 1.0372088, "balance_loss_mlp": 1.01884937, "epoch": 0.6901941922683821, "flos": 34857194716800.0, "grad_norm": 2.104664731768311, "language_loss": 0.74961615, "learning_rate": 9.249810806208139e-07, "loss": 0.77086842, "num_input_tokens_seen": 123485480, "step": 5740, "time_per_iteration": 2.8128225803375244 }, { "auxiliary_loss_clip": 0.01084944, "auxiliary_loss_mlp": 0.00772855, "balance_loss_clip": 1.03750753, "balance_loss_mlp": 1.00058103, "epoch": 0.6903144351590212, "flos": 16253672976000.0, "grad_norm": 3.4239240146596877, "language_loss": 0.80268723, "learning_rate": 9.243242883226627e-07, "loss": 0.82126522, "num_input_tokens_seen": 123504575, "step": 5741, "time_per_iteration": 2.7423858642578125 }, { "auxiliary_loss_clip": 0.01127896, "auxiliary_loss_mlp": 0.01038716, "balance_loss_clip": 1.04232621, "balance_loss_mlp": 1.02178836, "epoch": 0.6904346780496603, "flos": 28035137255040.0, "grad_norm": 2.4096658243007765, "language_loss": 0.69646883, "learning_rate": 9.236676592036628e-07, "loss": 0.71813494, "num_input_tokens_seen": 123524250, "step": 5742, "time_per_iteration": 2.6700642108917236 }, { "auxiliary_loss_clip": 0.0110416, "auxiliary_loss_mlp": 0.01036091, "balance_loss_clip": 1.04107356, "balance_loss_mlp": 1.02005768, "epoch": 0.6905549209402994, "flos": 23624266008960.0, "grad_norm": 1.8745510761822115, "language_loss": 0.73795915, "learning_rate": 9.230111933634228e-07, "loss": 0.75936162, "num_input_tokens_seen": 123545845, "step": 5743, "time_per_iteration": 2.7357537746429443 }, { "auxiliary_loss_clip": 0.01127349, "auxiliary_loss_mlp": 0.01047698, "balance_loss_clip": 1.04710531, "balance_loss_mlp": 1.03131843, "epoch": 0.6906751638309385, "flos": 23114945111040.0, "grad_norm": 1.4724891503723312, "language_loss": 0.80793583, "learning_rate": 9.223548909015288e-07, "loss": 0.82968634, "num_input_tokens_seen": 123567535, "step": 5744, "time_per_iteration": 2.677320957183838 }, { "auxiliary_loss_clip": 0.01078795, "auxiliary_loss_mlp": 0.01036411, "balance_loss_clip": 1.03823328, "balance_loss_mlp": 1.02124763, "epoch": 0.6907954067215776, "flos": 27305468375040.0, "grad_norm": 1.8976510670773505, "language_loss": 0.72201514, "learning_rate": 9.216987519175407e-07, "loss": 0.74316722, "num_input_tokens_seen": 123587710, "step": 5745, "time_per_iteration": 2.7942731380462646 }, { "auxiliary_loss_clip": 0.01115776, "auxiliary_loss_mlp": 0.01042502, "balance_loss_clip": 1.04134607, "balance_loss_mlp": 1.02663541, "epoch": 0.6909156496122166, "flos": 21689399070720.0, "grad_norm": 3.80991405531026, "language_loss": 0.68551564, "learning_rate": 9.210427765109942e-07, "loss": 0.70709842, "num_input_tokens_seen": 123607385, "step": 5746, "time_per_iteration": 2.6196749210357666 }, { "auxiliary_loss_clip": 0.01111807, "auxiliary_loss_mlp": 0.01042182, "balance_loss_clip": 1.04058504, "balance_loss_mlp": 1.02428889, "epoch": 0.6910358925028558, "flos": 22561453463040.0, "grad_norm": 1.963708681579799, "language_loss": 0.81548697, "learning_rate": 9.20386964781402e-07, "loss": 0.83702683, "num_input_tokens_seen": 123625405, "step": 5747, "time_per_iteration": 2.6954345703125 }, { "auxiliary_loss_clip": 0.01107053, "auxiliary_loss_mlp": 0.01039033, "balance_loss_clip": 1.04033267, "balance_loss_mlp": 1.02291584, "epoch": 0.6911561353934949, "flos": 22054107813120.0, "grad_norm": 2.0484705647759958, "language_loss": 0.84484625, "learning_rate": 9.197313168282472e-07, "loss": 0.86630708, "num_input_tokens_seen": 123642850, "step": 5748, "time_per_iteration": 2.6686205863952637 }, { "auxiliary_loss_clip": 0.01118949, "auxiliary_loss_mlp": 0.01042092, "balance_loss_clip": 1.04057813, "balance_loss_mlp": 1.025069, "epoch": 0.6912763782841339, "flos": 24206557386240.0, "grad_norm": 2.22684679616697, "language_loss": 0.71964866, "learning_rate": 9.190758327509935e-07, "loss": 0.7412591, "num_input_tokens_seen": 123661595, "step": 5749, "time_per_iteration": 2.720062017440796 }, { "auxiliary_loss_clip": 0.01008368, "auxiliary_loss_mlp": 0.00757022, "balance_loss_clip": 1.01559067, "balance_loss_mlp": 1.00030649, "epoch": 0.6913966211747731, "flos": 52329641091840.0, "grad_norm": 0.9497639796835163, "language_loss": 0.64459038, "learning_rate": 9.184205126490767e-07, "loss": 0.66224426, "num_input_tokens_seen": 123710490, "step": 5750, "time_per_iteration": 3.1019413471221924 }, { "auxiliary_loss_clip": 0.0101346, "auxiliary_loss_mlp": 0.00755767, "balance_loss_clip": 1.01273608, "balance_loss_mlp": 1.00027514, "epoch": 0.6915168640654121, "flos": 66741274851840.0, "grad_norm": 1.0969898563679672, "language_loss": 0.59640157, "learning_rate": 9.177653566219075e-07, "loss": 0.61409384, "num_input_tokens_seen": 123765215, "step": 5751, "time_per_iteration": 4.040383338928223 }, { "auxiliary_loss_clip": 0.01103938, "auxiliary_loss_mlp": 0.01039604, "balance_loss_clip": 1.0425601, "balance_loss_mlp": 1.02345145, "epoch": 0.6916371069560512, "flos": 18296523175680.0, "grad_norm": 2.1933866820208068, "language_loss": 0.75971681, "learning_rate": 9.171103647688744e-07, "loss": 0.78115225, "num_input_tokens_seen": 123783955, "step": 5752, "time_per_iteration": 2.723637342453003 }, { "auxiliary_loss_clip": 0.01053636, "auxiliary_loss_mlp": 0.01052626, "balance_loss_clip": 1.0351603, "balance_loss_mlp": 1.03610361, "epoch": 0.6917573498466904, "flos": 19645794685440.0, "grad_norm": 1.9837406888953626, "language_loss": 0.68998313, "learning_rate": 9.164555371893367e-07, "loss": 0.71104574, "num_input_tokens_seen": 123803885, "step": 5753, "time_per_iteration": 3.882885694503784 }, { "auxiliary_loss_clip": 0.01122394, "auxiliary_loss_mlp": 0.00772221, "balance_loss_clip": 1.04365826, "balance_loss_mlp": 1.0005374, "epoch": 0.6918775927373294, "flos": 14210319985920.0, "grad_norm": 1.848951179618031, "language_loss": 0.75317979, "learning_rate": 9.158008739826333e-07, "loss": 0.77212596, "num_input_tokens_seen": 123821485, "step": 5754, "time_per_iteration": 2.8627703189849854 }, { "auxiliary_loss_clip": 0.0110879, "auxiliary_loss_mlp": 0.01040467, "balance_loss_clip": 1.04206276, "balance_loss_mlp": 1.02437329, "epoch": 0.6919978356279685, "flos": 23985455218560.0, "grad_norm": 2.07415604427377, "language_loss": 0.86495835, "learning_rate": 9.151463752480744e-07, "loss": 0.88645089, "num_input_tokens_seen": 123840215, "step": 5755, "time_per_iteration": 2.7193262577056885 }, { "auxiliary_loss_clip": 0.01085963, "auxiliary_loss_mlp": 0.0104061, "balance_loss_clip": 1.03742218, "balance_loss_mlp": 1.0244453, "epoch": 0.6921180785186076, "flos": 23622937205760.0, "grad_norm": 1.5092874251977575, "language_loss": 0.8029542, "learning_rate": 9.144920410849493e-07, "loss": 0.82421988, "num_input_tokens_seen": 123861450, "step": 5756, "time_per_iteration": 2.750107765197754 }, { "auxiliary_loss_clip": 0.01113833, "auxiliary_loss_mlp": 0.01039097, "balance_loss_clip": 1.04336488, "balance_loss_mlp": 1.02413595, "epoch": 0.6922383214092467, "flos": 21142623265920.0, "grad_norm": 1.747553173088818, "language_loss": 0.80228901, "learning_rate": 9.138378715925176e-07, "loss": 0.82381827, "num_input_tokens_seen": 123880545, "step": 5757, "time_per_iteration": 2.72554087638855 }, { "auxiliary_loss_clip": 0.01104972, "auxiliary_loss_mlp": 0.0104007, "balance_loss_clip": 1.04082465, "balance_loss_mlp": 1.02448905, "epoch": 0.6923585642998857, "flos": 21470667200640.0, "grad_norm": 2.7092590842088047, "language_loss": 0.80837989, "learning_rate": 9.131838668700167e-07, "loss": 0.82983029, "num_input_tokens_seen": 123900615, "step": 5758, "time_per_iteration": 3.660447359085083 }, { "auxiliary_loss_clip": 0.01095837, "auxiliary_loss_mlp": 0.01037136, "balance_loss_clip": 1.03923631, "balance_loss_mlp": 1.02082825, "epoch": 0.6924788071905249, "flos": 21105204272640.0, "grad_norm": 2.1013727492001757, "language_loss": 0.86704111, "learning_rate": 9.125300270166598e-07, "loss": 0.88837081, "num_input_tokens_seen": 123921220, "step": 5759, "time_per_iteration": 3.661284923553467 }, { "auxiliary_loss_clip": 0.01100247, "auxiliary_loss_mlp": 0.0103933, "balance_loss_clip": 1.03858292, "balance_loss_mlp": 1.02420259, "epoch": 0.692599050081164, "flos": 26250018117120.0, "grad_norm": 1.9723179841258933, "language_loss": 0.85690856, "learning_rate": 9.118763521316324e-07, "loss": 0.87830436, "num_input_tokens_seen": 123941795, "step": 5760, "time_per_iteration": 2.728118896484375 }, { "auxiliary_loss_clip": 0.01135491, "auxiliary_loss_mlp": 0.00772905, "balance_loss_clip": 1.04411602, "balance_loss_mlp": 1.00046349, "epoch": 0.692719292971803, "flos": 20885215426560.0, "grad_norm": 4.769482792335293, "language_loss": 0.75886571, "learning_rate": 9.112228423140987e-07, "loss": 0.77794969, "num_input_tokens_seen": 123960715, "step": 5761, "time_per_iteration": 2.6627919673919678 }, { "auxiliary_loss_clip": 0.0111501, "auxiliary_loss_mlp": 0.01039902, "balance_loss_clip": 1.04117489, "balance_loss_mlp": 1.02366543, "epoch": 0.6928395358624422, "flos": 25921938268800.0, "grad_norm": 2.281559492974795, "language_loss": 0.86259621, "learning_rate": 9.105694976631932e-07, "loss": 0.88414538, "num_input_tokens_seen": 123978625, "step": 5762, "time_per_iteration": 2.737664222717285 }, { "auxiliary_loss_clip": 0.01124951, "auxiliary_loss_mlp": 0.01047746, "balance_loss_clip": 1.04437852, "balance_loss_mlp": 1.03092527, "epoch": 0.6929597787530812, "flos": 23586559706880.0, "grad_norm": 6.417924723453625, "language_loss": 0.72506684, "learning_rate": 9.099163182780283e-07, "loss": 0.74679381, "num_input_tokens_seen": 123996780, "step": 5763, "time_per_iteration": 2.6583774089813232 }, { "auxiliary_loss_clip": 0.01106459, "auxiliary_loss_mlp": 0.01035412, "balance_loss_clip": 1.04050875, "balance_loss_mlp": 1.01873481, "epoch": 0.6930800216437203, "flos": 18255656476800.0, "grad_norm": 2.8929295198084297, "language_loss": 0.49305341, "learning_rate": 9.092633042576916e-07, "loss": 0.51447213, "num_input_tokens_seen": 124014045, "step": 5764, "time_per_iteration": 2.624868869781494 }, { "auxiliary_loss_clip": 0.01107675, "auxiliary_loss_mlp": 0.01040269, "balance_loss_clip": 1.04186058, "balance_loss_mlp": 1.02427125, "epoch": 0.6932002645343595, "flos": 29168621809920.0, "grad_norm": 1.8907242692228967, "language_loss": 0.56301987, "learning_rate": 9.086104557012446e-07, "loss": 0.58449936, "num_input_tokens_seen": 124034615, "step": 5765, "time_per_iteration": 2.760993003845215 }, { "auxiliary_loss_clip": 0.01115902, "auxiliary_loss_mlp": 0.01041687, "balance_loss_clip": 1.04180574, "balance_loss_mlp": 1.02658319, "epoch": 0.6933205074249985, "flos": 23842746483840.0, "grad_norm": 2.540092300023835, "language_loss": 0.65558326, "learning_rate": 9.079577727077239e-07, "loss": 0.67715913, "num_input_tokens_seen": 124053445, "step": 5766, "time_per_iteration": 2.6380786895751953 }, { "auxiliary_loss_clip": 0.0112241, "auxiliary_loss_mlp": 0.01039215, "balance_loss_clip": 1.0436492, "balance_loss_mlp": 1.02334869, "epoch": 0.6934407503156376, "flos": 24166696268160.0, "grad_norm": 2.242109625936376, "language_loss": 0.72021627, "learning_rate": 9.073052553761404e-07, "loss": 0.74183249, "num_input_tokens_seen": 124072810, "step": 5767, "time_per_iteration": 2.7028934955596924 }, { "auxiliary_loss_clip": 0.01088685, "auxiliary_loss_mlp": 0.01043071, "balance_loss_clip": 1.03935158, "balance_loss_mlp": 1.02533317, "epoch": 0.6935609932062767, "flos": 20631327120000.0, "grad_norm": 1.7565637344371938, "language_loss": 0.77953601, "learning_rate": 9.066529038054805e-07, "loss": 0.80085361, "num_input_tokens_seen": 124092875, "step": 5768, "time_per_iteration": 2.7070701122283936 }, { "auxiliary_loss_clip": 0.01108965, "auxiliary_loss_mlp": 0.01035453, "balance_loss_clip": 1.04169226, "balance_loss_mlp": 1.01949072, "epoch": 0.6936812360969158, "flos": 18254184019200.0, "grad_norm": 1.7776343017361116, "language_loss": 0.74166834, "learning_rate": 9.060007180947071e-07, "loss": 0.76311255, "num_input_tokens_seen": 124110930, "step": 5769, "time_per_iteration": 2.655869245529175 }, { "auxiliary_loss_clip": 0.01087467, "auxiliary_loss_mlp": 0.01047117, "balance_loss_clip": 1.03626013, "balance_loss_mlp": 1.02941453, "epoch": 0.6938014789875548, "flos": 31317336368640.0, "grad_norm": 1.869568820392557, "language_loss": 0.7296595, "learning_rate": 9.053486983427534e-07, "loss": 0.75100529, "num_input_tokens_seen": 124132180, "step": 5770, "time_per_iteration": 2.787320852279663 }, { "auxiliary_loss_clip": 0.01112579, "auxiliary_loss_mlp": 0.0104393, "balance_loss_clip": 1.04101229, "balance_loss_mlp": 1.02697814, "epoch": 0.6939217218781939, "flos": 17528429721600.0, "grad_norm": 2.208241253964661, "language_loss": 0.70280755, "learning_rate": 9.046968446485326e-07, "loss": 0.72437263, "num_input_tokens_seen": 124150585, "step": 5771, "time_per_iteration": 2.679398536682129 }, { "auxiliary_loss_clip": 0.01125148, "auxiliary_loss_mlp": 0.0103572, "balance_loss_clip": 1.04274726, "balance_loss_mlp": 1.01814842, "epoch": 0.6940419647688331, "flos": 18551776199040.0, "grad_norm": 3.1621370735335192, "language_loss": 0.7090643, "learning_rate": 9.040451571109295e-07, "loss": 0.73067296, "num_input_tokens_seen": 124166205, "step": 5772, "time_per_iteration": 2.579664468765259 }, { "auxiliary_loss_clip": 0.01005142, "auxiliary_loss_mlp": 0.01001662, "balance_loss_clip": 1.00908315, "balance_loss_mlp": 1.0002141, "epoch": 0.6941622076594721, "flos": 66926286829440.0, "grad_norm": 0.8326909568589616, "language_loss": 0.60377049, "learning_rate": 9.033936358288042e-07, "loss": 0.62383854, "num_input_tokens_seen": 124219940, "step": 5773, "time_per_iteration": 3.1763222217559814 }, { "auxiliary_loss_clip": 0.0113753, "auxiliary_loss_mlp": 0.01039191, "balance_loss_clip": 1.04454505, "balance_loss_mlp": 1.02337217, "epoch": 0.6942824505501112, "flos": 26578062051840.0, "grad_norm": 1.836637478451399, "language_loss": 0.82498729, "learning_rate": 9.027422809009937e-07, "loss": 0.84675455, "num_input_tokens_seen": 124239885, "step": 5774, "time_per_iteration": 2.671374559402466 }, { "auxiliary_loss_clip": 0.01124657, "auxiliary_loss_mlp": 0.01038587, "balance_loss_clip": 1.04239154, "balance_loss_mlp": 1.02199364, "epoch": 0.6944026934407503, "flos": 21248308056960.0, "grad_norm": 1.624438859831428, "language_loss": 0.83194721, "learning_rate": 9.020910924263054e-07, "loss": 0.85357964, "num_input_tokens_seen": 124258410, "step": 5775, "time_per_iteration": 2.642444610595703 }, { "auxiliary_loss_clip": 0.01004891, "auxiliary_loss_mlp": 0.01003965, "balance_loss_clip": 1.00970721, "balance_loss_mlp": 1.00236785, "epoch": 0.6945229363313894, "flos": 70677191537280.0, "grad_norm": 0.8083507258499111, "language_loss": 0.58114386, "learning_rate": 9.014400705035261e-07, "loss": 0.60123241, "num_input_tokens_seen": 124315315, "step": 5776, "time_per_iteration": 3.326165199279785 }, { "auxiliary_loss_clip": 0.01133661, "auxiliary_loss_mlp": 0.01035389, "balance_loss_clip": 1.04496193, "balance_loss_mlp": 1.01971352, "epoch": 0.6946431792220285, "flos": 18952934267520.0, "grad_norm": 2.8228675976015905, "language_loss": 0.76842284, "learning_rate": 9.00789215231414e-07, "loss": 0.79011333, "num_input_tokens_seen": 124333710, "step": 5777, "time_per_iteration": 3.5864970684051514 }, { "auxiliary_loss_clip": 0.01096795, "auxiliary_loss_mlp": 0.00775461, "balance_loss_clip": 1.03784871, "balance_loss_mlp": 1.00050545, "epoch": 0.6947634221126676, "flos": 20338834671360.0, "grad_norm": 1.9392952280213756, "language_loss": 0.82177097, "learning_rate": 9.001385267087056e-07, "loss": 0.84049344, "num_input_tokens_seen": 124352855, "step": 5778, "time_per_iteration": 2.73290753364563 }, { "auxiliary_loss_clip": 0.01123051, "auxiliary_loss_mlp": 0.01036721, "balance_loss_clip": 1.04189932, "balance_loss_mlp": 1.020962, "epoch": 0.6948836650033067, "flos": 21833723917440.0, "grad_norm": 1.497191566086051, "language_loss": 0.70639265, "learning_rate": 8.994880050341072e-07, "loss": 0.72799039, "num_input_tokens_seen": 124372960, "step": 5779, "time_per_iteration": 3.6246447563171387 }, { "auxiliary_loss_clip": 0.01105664, "auxiliary_loss_mlp": 0.01040709, "balance_loss_clip": 1.04241502, "balance_loss_mlp": 1.02360272, "epoch": 0.6950039078939457, "flos": 23657519024640.0, "grad_norm": 1.6859480140217902, "language_loss": 0.77329075, "learning_rate": 8.988376503063026e-07, "loss": 0.79475451, "num_input_tokens_seen": 124394220, "step": 5780, "time_per_iteration": 2.6812851428985596 }, { "auxiliary_loss_clip": 0.01096041, "auxiliary_loss_mlp": 0.01038638, "balance_loss_clip": 1.04214275, "balance_loss_mlp": 1.02191281, "epoch": 0.6951241507845849, "flos": 21792462168960.0, "grad_norm": 2.156261237938696, "language_loss": 0.81289059, "learning_rate": 8.981874626239521e-07, "loss": 0.83423734, "num_input_tokens_seen": 124412795, "step": 5781, "time_per_iteration": 2.7590508460998535 }, { "auxiliary_loss_clip": 0.01124176, "auxiliary_loss_mlp": 0.01037098, "balance_loss_clip": 1.04446697, "balance_loss_mlp": 1.02142835, "epoch": 0.695244393675224, "flos": 14647568244480.0, "grad_norm": 2.2028868784004256, "language_loss": 0.88390297, "learning_rate": 8.975374420856872e-07, "loss": 0.90551573, "num_input_tokens_seen": 124429690, "step": 5782, "time_per_iteration": 2.6084001064300537 }, { "auxiliary_loss_clip": 0.01089182, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.03722048, "balance_loss_mlp": 1.02030861, "epoch": 0.695364636565863, "flos": 16873203778560.0, "grad_norm": 3.2602629948268023, "language_loss": 0.72979379, "learning_rate": 8.968875887901157e-07, "loss": 0.7510469, "num_input_tokens_seen": 124447070, "step": 5783, "time_per_iteration": 3.736067295074463 }, { "auxiliary_loss_clip": 0.0111015, "auxiliary_loss_mlp": 0.01044461, "balance_loss_clip": 1.03959441, "balance_loss_mlp": 1.02812934, "epoch": 0.6954848794565022, "flos": 19354523299200.0, "grad_norm": 2.266603071240432, "language_loss": 0.62706137, "learning_rate": 8.9623790283582e-07, "loss": 0.64860749, "num_input_tokens_seen": 124464950, "step": 5784, "time_per_iteration": 2.67891526222229 }, { "auxiliary_loss_clip": 0.01099721, "auxiliary_loss_mlp": 0.01044754, "balance_loss_clip": 1.03965521, "balance_loss_mlp": 1.02702749, "epoch": 0.6956051223471412, "flos": 18990209606400.0, "grad_norm": 2.705303658718535, "language_loss": 0.7636345, "learning_rate": 8.955883843213561e-07, "loss": 0.78507924, "num_input_tokens_seen": 124483965, "step": 5785, "time_per_iteration": 3.616100311279297 }, { "auxiliary_loss_clip": 0.0113354, "auxiliary_loss_mlp": 0.0104448, "balance_loss_clip": 1.04654729, "balance_loss_mlp": 1.02606261, "epoch": 0.6957253652377803, "flos": 16107229226880.0, "grad_norm": 1.8894062243854306, "language_loss": 0.86914164, "learning_rate": 8.949390333452569e-07, "loss": 0.89092183, "num_input_tokens_seen": 124501910, "step": 5786, "time_per_iteration": 2.7007577419281006 }, { "auxiliary_loss_clip": 0.01132914, "auxiliary_loss_mlp": 0.01041399, "balance_loss_clip": 1.04288101, "balance_loss_mlp": 1.02523422, "epoch": 0.6958456081284194, "flos": 29388646569600.0, "grad_norm": 1.8010550439752007, "language_loss": 0.67527634, "learning_rate": 8.942898500060279e-07, "loss": 0.69701946, "num_input_tokens_seen": 124521625, "step": 5787, "time_per_iteration": 2.6126339435577393 }, { "auxiliary_loss_clip": 0.01094228, "auxiliary_loss_mlp": 0.01047123, "balance_loss_clip": 1.04027498, "balance_loss_mlp": 1.03031492, "epoch": 0.6959658510190585, "flos": 25154850395520.0, "grad_norm": 8.216133410060873, "language_loss": 0.71698767, "learning_rate": 8.936408344021493e-07, "loss": 0.73840117, "num_input_tokens_seen": 124538540, "step": 5788, "time_per_iteration": 2.7667460441589355 }, { "auxiliary_loss_clip": 0.01119384, "auxiliary_loss_mlp": 0.01046196, "balance_loss_clip": 1.04431534, "balance_loss_mlp": 1.02855301, "epoch": 0.6960860939096976, "flos": 42814388759040.0, "grad_norm": 2.1887594820846332, "language_loss": 0.71200061, "learning_rate": 8.929919866320765e-07, "loss": 0.73365641, "num_input_tokens_seen": 124559355, "step": 5789, "time_per_iteration": 2.871194362640381 }, { "auxiliary_loss_clip": 0.01104047, "auxiliary_loss_mlp": 0.00773325, "balance_loss_clip": 1.04172826, "balance_loss_mlp": 1.000489, "epoch": 0.6962063368003367, "flos": 17566566986880.0, "grad_norm": 2.070322678725045, "language_loss": 0.81476593, "learning_rate": 8.923433067942385e-07, "loss": 0.83353961, "num_input_tokens_seen": 124577920, "step": 5790, "time_per_iteration": 2.700721025466919 }, { "auxiliary_loss_clip": 0.01103063, "auxiliary_loss_mlp": 0.01038054, "balance_loss_clip": 1.04275596, "balance_loss_mlp": 1.02240801, "epoch": 0.6963265796909758, "flos": 21251648021760.0, "grad_norm": 1.9662578437492655, "language_loss": 0.68528092, "learning_rate": 8.916947949870417e-07, "loss": 0.70669204, "num_input_tokens_seen": 124597585, "step": 5791, "time_per_iteration": 2.7312002182006836 }, { "auxiliary_loss_clip": 0.01030108, "auxiliary_loss_mlp": 0.01003184, "balance_loss_clip": 1.00974441, "balance_loss_mlp": 1.00183678, "epoch": 0.6964468225816148, "flos": 68828295801600.0, "grad_norm": 0.7451413414470373, "language_loss": 0.58157557, "learning_rate": 8.910464513088615e-07, "loss": 0.60190845, "num_input_tokens_seen": 124661625, "step": 5792, "time_per_iteration": 3.2894067764282227 }, { "auxiliary_loss_clip": 0.01104823, "auxiliary_loss_mlp": 0.01051709, "balance_loss_clip": 1.04003668, "balance_loss_mlp": 1.03348184, "epoch": 0.696567065472254, "flos": 18950887192320.0, "grad_norm": 2.612269667249838, "language_loss": 0.78560483, "learning_rate": 8.903982758580542e-07, "loss": 0.80717015, "num_input_tokens_seen": 124680565, "step": 5793, "time_per_iteration": 2.673088550567627 }, { "auxiliary_loss_clip": 0.01107031, "auxiliary_loss_mlp": 0.01039106, "balance_loss_clip": 1.04197502, "balance_loss_mlp": 1.02279866, "epoch": 0.696687308362893, "flos": 22856675345280.0, "grad_norm": 1.8825716428572432, "language_loss": 0.8023693, "learning_rate": 8.897502687329457e-07, "loss": 0.8238306, "num_input_tokens_seen": 124700365, "step": 5794, "time_per_iteration": 2.6693432331085205 }, { "auxiliary_loss_clip": 0.01095954, "auxiliary_loss_mlp": 0.01040941, "balance_loss_clip": 1.042485, "balance_loss_mlp": 1.0252645, "epoch": 0.6968075512535321, "flos": 24972926987520.0, "grad_norm": 2.959734731641946, "language_loss": 0.80057001, "learning_rate": 8.891024300318382e-07, "loss": 0.82193899, "num_input_tokens_seen": 124718935, "step": 5795, "time_per_iteration": 2.7735977172851562 }, { "auxiliary_loss_clip": 0.01090324, "auxiliary_loss_mlp": 0.01050398, "balance_loss_clip": 1.03755379, "balance_loss_mlp": 1.03310108, "epoch": 0.6969277941441713, "flos": 21030438113280.0, "grad_norm": 1.452620690371238, "language_loss": 0.75630796, "learning_rate": 8.884547598530103e-07, "loss": 0.77771521, "num_input_tokens_seen": 124739505, "step": 5796, "time_per_iteration": 2.7427916526794434 }, { "auxiliary_loss_clip": 0.01050364, "auxiliary_loss_mlp": 0.01046196, "balance_loss_clip": 1.03464258, "balance_loss_mlp": 1.02810001, "epoch": 0.6970480370348103, "flos": 21579404647680.0, "grad_norm": 2.2009914510948914, "language_loss": 0.75462073, "learning_rate": 8.8780725829471e-07, "loss": 0.77558631, "num_input_tokens_seen": 124757410, "step": 5797, "time_per_iteration": 2.9157233238220215 }, { "auxiliary_loss_clip": 0.01137951, "auxiliary_loss_mlp": 0.01047308, "balance_loss_clip": 1.04525781, "balance_loss_mlp": 1.02995753, "epoch": 0.6971682799254494, "flos": 22419175691520.0, "grad_norm": 4.888369582989255, "language_loss": 0.78150105, "learning_rate": 8.87159925455165e-07, "loss": 0.80335361, "num_input_tokens_seen": 124777240, "step": 5798, "time_per_iteration": 2.891414165496826 }, { "auxiliary_loss_clip": 0.0109183, "auxiliary_loss_mlp": 0.01052055, "balance_loss_clip": 1.03773046, "balance_loss_mlp": 1.03295767, "epoch": 0.6972885228160886, "flos": 20005834659840.0, "grad_norm": 1.8553759694537209, "language_loss": 0.73049366, "learning_rate": 8.865127614325738e-07, "loss": 0.75193256, "num_input_tokens_seen": 124795670, "step": 5799, "time_per_iteration": 2.676424741744995 }, { "auxiliary_loss_clip": 0.01105567, "auxiliary_loss_mlp": 0.01043078, "balance_loss_clip": 1.0409205, "balance_loss_mlp": 1.02460039, "epoch": 0.6974087657067276, "flos": 37853437656960.0, "grad_norm": 2.554357885723532, "language_loss": 0.66405785, "learning_rate": 8.85865766325113e-07, "loss": 0.68554431, "num_input_tokens_seen": 124819600, "step": 5800, "time_per_iteration": 2.841144561767578 }, { "auxiliary_loss_clip": 0.01108272, "auxiliary_loss_mlp": 0.01040217, "balance_loss_clip": 1.04242921, "balance_loss_mlp": 1.02327752, "epoch": 0.6975290085973667, "flos": 29489267543040.0, "grad_norm": 2.9351033830095328, "language_loss": 0.72070014, "learning_rate": 8.852189402309287e-07, "loss": 0.742185, "num_input_tokens_seen": 124838785, "step": 5801, "time_per_iteration": 2.7177326679229736 }, { "auxiliary_loss_clip": 0.01123396, "auxiliary_loss_mlp": 0.01045238, "balance_loss_clip": 1.04350531, "balance_loss_mlp": 1.02763128, "epoch": 0.6976492514880057, "flos": 12895630295040.0, "grad_norm": 2.6996385306093447, "language_loss": 0.74427223, "learning_rate": 8.845722832481441e-07, "loss": 0.76595855, "num_input_tokens_seen": 124854215, "step": 5802, "time_per_iteration": 2.662447214126587 }, { "auxiliary_loss_clip": 0.01121981, "auxiliary_loss_mlp": 0.01041219, "balance_loss_clip": 1.04211962, "balance_loss_mlp": 1.02436304, "epoch": 0.6977694943786449, "flos": 24352929308160.0, "grad_norm": 2.2484125542303883, "language_loss": 0.77598095, "learning_rate": 8.83925795474858e-07, "loss": 0.79761302, "num_input_tokens_seen": 124874340, "step": 5803, "time_per_iteration": 3.6008918285369873 }, { "auxiliary_loss_clip": 0.01100281, "auxiliary_loss_mlp": 0.01042799, "balance_loss_clip": 1.04363489, "balance_loss_mlp": 1.02578747, "epoch": 0.6978897372692839, "flos": 29898470257920.0, "grad_norm": 5.176857882573802, "language_loss": 0.59336948, "learning_rate": 8.832794770091414e-07, "loss": 0.61480027, "num_input_tokens_seen": 124895175, "step": 5804, "time_per_iteration": 2.7270655632019043 }, { "auxiliary_loss_clip": 0.01114347, "auxiliary_loss_mlp": 0.01045965, "balance_loss_clip": 1.04102612, "balance_loss_mlp": 1.0293591, "epoch": 0.698009980159923, "flos": 21761579450880.0, "grad_norm": 2.2045683119984396, "language_loss": 0.82400763, "learning_rate": 8.826333279490401e-07, "loss": 0.8456108, "num_input_tokens_seen": 124915810, "step": 5805, "time_per_iteration": 3.6884849071502686 }, { "auxiliary_loss_clip": 0.01114462, "auxiliary_loss_mlp": 0.01034796, "balance_loss_clip": 1.04444528, "balance_loss_mlp": 1.01907253, "epoch": 0.6981302230505622, "flos": 19857164267520.0, "grad_norm": 2.2643230703627824, "language_loss": 0.67947495, "learning_rate": 8.819873483925748e-07, "loss": 0.70096749, "num_input_tokens_seen": 124932930, "step": 5806, "time_per_iteration": 2.618969678878784 }, { "auxiliary_loss_clip": 0.01104628, "auxiliary_loss_mlp": 0.00772523, "balance_loss_clip": 1.04352093, "balance_loss_mlp": 1.00045729, "epoch": 0.6982504659412012, "flos": 22198648141440.0, "grad_norm": 2.197224436088265, "language_loss": 0.74899691, "learning_rate": 8.81341538437739e-07, "loss": 0.76776844, "num_input_tokens_seen": 124951220, "step": 5807, "time_per_iteration": 2.740981101989746 }, { "auxiliary_loss_clip": 0.01111405, "auxiliary_loss_mlp": 0.0103844, "balance_loss_clip": 1.03988457, "balance_loss_mlp": 1.02212071, "epoch": 0.6983707088318403, "flos": 35588479708800.0, "grad_norm": 1.8384819299950836, "language_loss": 0.68348873, "learning_rate": 8.80695898182503e-07, "loss": 0.70498717, "num_input_tokens_seen": 124972200, "step": 5808, "time_per_iteration": 2.7779335975646973 }, { "auxiliary_loss_clip": 0.01026104, "auxiliary_loss_mlp": 0.00999673, "balance_loss_clip": 1.01131606, "balance_loss_mlp": 0.99811155, "epoch": 0.6984909517224794, "flos": 65440052760960.0, "grad_norm": 0.8452315190713128, "language_loss": 0.65047598, "learning_rate": 8.800504277248093e-07, "loss": 0.67073381, "num_input_tokens_seen": 125036950, "step": 5809, "time_per_iteration": 3.2364795207977295 }, { "auxiliary_loss_clip": 0.0109112, "auxiliary_loss_mlp": 0.0077282, "balance_loss_clip": 1.04068208, "balance_loss_mlp": 1.00053263, "epoch": 0.6986111946131185, "flos": 18546927863040.0, "grad_norm": 2.1635461646964447, "language_loss": 0.75331807, "learning_rate": 8.794051271625753e-07, "loss": 0.77195746, "num_input_tokens_seen": 125054585, "step": 5810, "time_per_iteration": 3.5708649158477783 }, { "auxiliary_loss_clip": 0.01110818, "auxiliary_loss_mlp": 0.01033818, "balance_loss_clip": 1.04268575, "balance_loss_mlp": 1.0191195, "epoch": 0.6987314375037575, "flos": 23039173370880.0, "grad_norm": 4.170377532085837, "language_loss": 0.83448887, "learning_rate": 8.787599965936925e-07, "loss": 0.85593522, "num_input_tokens_seen": 125075515, "step": 5811, "time_per_iteration": 3.616858959197998 }, { "auxiliary_loss_clip": 0.01091256, "auxiliary_loss_mlp": 0.01043058, "balance_loss_clip": 1.04177248, "balance_loss_mlp": 1.02737045, "epoch": 0.6988516803943967, "flos": 38400393029760.0, "grad_norm": 1.7740169948459317, "language_loss": 0.72012663, "learning_rate": 8.781150361160261e-07, "loss": 0.74146974, "num_input_tokens_seen": 125097425, "step": 5812, "time_per_iteration": 2.8083159923553467 }, { "auxiliary_loss_clip": 0.0109994, "auxiliary_loss_mlp": 0.01040384, "balance_loss_clip": 1.04238129, "balance_loss_mlp": 1.02337325, "epoch": 0.6989719232850358, "flos": 24096993926400.0, "grad_norm": 2.1955284796856858, "language_loss": 0.7356106, "learning_rate": 8.774702458274181e-07, "loss": 0.75701386, "num_input_tokens_seen": 125117830, "step": 5813, "time_per_iteration": 2.7239456176757812 }, { "auxiliary_loss_clip": 0.01119039, "auxiliary_loss_mlp": 0.01038518, "balance_loss_clip": 1.04063368, "balance_loss_mlp": 1.02167356, "epoch": 0.6990921661756748, "flos": 14866838818560.0, "grad_norm": 2.152719887221249, "language_loss": 0.70514357, "learning_rate": 8.768256258256799e-07, "loss": 0.72671914, "num_input_tokens_seen": 125134455, "step": 5814, "time_per_iteration": 2.6188106536865234 }, { "auxiliary_loss_clip": 0.01123798, "auxiliary_loss_mlp": 0.01038349, "balance_loss_clip": 1.04205441, "balance_loss_mlp": 1.02286375, "epoch": 0.699212409066314, "flos": 20193719725440.0, "grad_norm": 1.8784632720487906, "language_loss": 0.7374723, "learning_rate": 8.76181176208602e-07, "loss": 0.75909376, "num_input_tokens_seen": 125152555, "step": 5815, "time_per_iteration": 2.6092333793640137 }, { "auxiliary_loss_clip": 0.01071612, "auxiliary_loss_mlp": 0.01048362, "balance_loss_clip": 1.03481221, "balance_loss_mlp": 1.02946699, "epoch": 0.699332651956953, "flos": 19427888828160.0, "grad_norm": 2.0623885632280556, "language_loss": 0.7384817, "learning_rate": 8.755368970739461e-07, "loss": 0.75968146, "num_input_tokens_seen": 125171915, "step": 5816, "time_per_iteration": 2.67622971534729 }, { "auxiliary_loss_clip": 0.01103093, "auxiliary_loss_mlp": 0.01044024, "balance_loss_clip": 1.03913283, "balance_loss_mlp": 1.02654779, "epoch": 0.6994528948475921, "flos": 16143714466560.0, "grad_norm": 2.5888776642527223, "language_loss": 0.61576837, "learning_rate": 8.748927885194479e-07, "loss": 0.63723958, "num_input_tokens_seen": 125190220, "step": 5817, "time_per_iteration": 2.699587345123291 }, { "auxiliary_loss_clip": 0.01003416, "auxiliary_loss_mlp": 0.01001401, "balance_loss_clip": 1.00970149, "balance_loss_mlp": 0.99968415, "epoch": 0.6995731377382313, "flos": 64952420699520.0, "grad_norm": 0.7954269750225036, "language_loss": 0.57388151, "learning_rate": 8.742488506428209e-07, "loss": 0.59392965, "num_input_tokens_seen": 125249310, "step": 5818, "time_per_iteration": 3.2331936359405518 }, { "auxiliary_loss_clip": 0.01109968, "auxiliary_loss_mlp": 0.00772754, "balance_loss_clip": 1.04004335, "balance_loss_mlp": 1.00044358, "epoch": 0.6996933806288703, "flos": 24900136076160.0, "grad_norm": 1.880667944381032, "language_loss": 0.7846173, "learning_rate": 8.736050835417466e-07, "loss": 0.8034445, "num_input_tokens_seen": 125269350, "step": 5819, "time_per_iteration": 2.881047248840332 }, { "auxiliary_loss_clip": 0.0112686, "auxiliary_loss_mlp": 0.01038636, "balance_loss_clip": 1.04327178, "balance_loss_mlp": 1.0218035, "epoch": 0.6998136235195094, "flos": 20777806782720.0, "grad_norm": 1.8202282891023482, "language_loss": 0.6110754, "learning_rate": 8.729614873138862e-07, "loss": 0.63273036, "num_input_tokens_seen": 125286985, "step": 5820, "time_per_iteration": 2.7756869792938232 }, { "auxiliary_loss_clip": 0.01098268, "auxiliary_loss_mlp": 0.01053561, "balance_loss_clip": 1.04317319, "balance_loss_mlp": 1.03576303, "epoch": 0.6999338664101485, "flos": 23733470332800.0, "grad_norm": 1.9347449612181944, "language_loss": 0.77409673, "learning_rate": 8.723180620568716e-07, "loss": 0.79561496, "num_input_tokens_seen": 125306240, "step": 5821, "time_per_iteration": 2.888699531555176 }, { "auxiliary_loss_clip": 0.01110355, "auxiliary_loss_mlp": 0.01033637, "balance_loss_clip": 1.04142928, "balance_loss_mlp": 1.01877141, "epoch": 0.7000541093007876, "flos": 19864598382720.0, "grad_norm": 7.416408398584821, "language_loss": 0.85224462, "learning_rate": 8.716748078683116e-07, "loss": 0.87368453, "num_input_tokens_seen": 125323015, "step": 5822, "time_per_iteration": 2.78692626953125 }, { "auxiliary_loss_clip": 0.01048562, "auxiliary_loss_mlp": 0.01055482, "balance_loss_clip": 1.03361166, "balance_loss_mlp": 1.0360148, "epoch": 0.7001743521914267, "flos": 29679056029440.0, "grad_norm": 8.305715768074162, "language_loss": 0.68645048, "learning_rate": 8.710317248457855e-07, "loss": 0.70749092, "num_input_tokens_seen": 125342630, "step": 5823, "time_per_iteration": 2.9337987899780273 }, { "auxiliary_loss_clip": 0.01105983, "auxiliary_loss_mlp": 0.01044637, "balance_loss_clip": 1.04194689, "balance_loss_mlp": 1.02854419, "epoch": 0.7002945950820658, "flos": 27489762080640.0, "grad_norm": 1.9219995348041348, "language_loss": 0.72323006, "learning_rate": 8.703888130868482e-07, "loss": 0.74473625, "num_input_tokens_seen": 125364480, "step": 5824, "time_per_iteration": 2.7958104610443115 }, { "auxiliary_loss_clip": 0.01099465, "auxiliary_loss_mlp": 0.01041412, "balance_loss_clip": 1.03978169, "balance_loss_mlp": 1.02516985, "epoch": 0.7004148379727049, "flos": 22158463800960.0, "grad_norm": 2.55689778291698, "language_loss": 0.8221662, "learning_rate": 8.697460726890307e-07, "loss": 0.843575, "num_input_tokens_seen": 125381625, "step": 5825, "time_per_iteration": 2.7549221515655518 }, { "auxiliary_loss_clip": 0.01098106, "auxiliary_loss_mlp": 0.00773163, "balance_loss_clip": 1.03989637, "balance_loss_mlp": 1.00051141, "epoch": 0.7005350808633439, "flos": 19423758764160.0, "grad_norm": 2.018062543988608, "language_loss": 0.90529692, "learning_rate": 8.691035037498354e-07, "loss": 0.92400968, "num_input_tokens_seen": 125397615, "step": 5826, "time_per_iteration": 2.7226321697235107 }, { "auxiliary_loss_clip": 0.01105644, "auxiliary_loss_mlp": 0.01049531, "balance_loss_clip": 1.0385437, "balance_loss_mlp": 1.03213823, "epoch": 0.7006553237539831, "flos": 23476708938240.0, "grad_norm": 1.9924365571209146, "language_loss": 0.72466493, "learning_rate": 8.684611063667391e-07, "loss": 0.74621665, "num_input_tokens_seen": 125418080, "step": 5827, "time_per_iteration": 2.7977099418640137 }, { "auxiliary_loss_clip": 0.0111814, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.03925061, "balance_loss_mlp": 1.02305603, "epoch": 0.7007755666446221, "flos": 31212872640000.0, "grad_norm": 2.0440585518306964, "language_loss": 0.76994944, "learning_rate": 8.678188806371935e-07, "loss": 0.79151839, "num_input_tokens_seen": 125440115, "step": 5828, "time_per_iteration": 2.7723746299743652 }, { "auxiliary_loss_clip": 0.01119853, "auxiliary_loss_mlp": 0.01033904, "balance_loss_clip": 1.04233813, "balance_loss_mlp": 1.01953936, "epoch": 0.7008958095352612, "flos": 18149899858560.0, "grad_norm": 1.7196122126175548, "language_loss": 0.85220587, "learning_rate": 8.671768266586228e-07, "loss": 0.87374336, "num_input_tokens_seen": 125458240, "step": 5829, "time_per_iteration": 3.5193827152252197 }, { "auxiliary_loss_clip": 0.01098291, "auxiliary_loss_mlp": 0.01038841, "balance_loss_clip": 1.04226017, "balance_loss_mlp": 1.02182996, "epoch": 0.7010160524259004, "flos": 27452307173760.0, "grad_norm": 4.458582654181657, "language_loss": 0.78078508, "learning_rate": 8.665349445284275e-07, "loss": 0.80215645, "num_input_tokens_seen": 125477980, "step": 5830, "time_per_iteration": 3.7449138164520264 }, { "auxiliary_loss_clip": 0.0109634, "auxiliary_loss_mlp": 0.01041891, "balance_loss_clip": 1.04179716, "balance_loss_mlp": 1.02588153, "epoch": 0.7011362953165394, "flos": 23842064125440.0, "grad_norm": 1.595513056836478, "language_loss": 0.81002474, "learning_rate": 8.658932343439799e-07, "loss": 0.83140707, "num_input_tokens_seen": 125497765, "step": 5831, "time_per_iteration": 2.715411424636841 }, { "auxiliary_loss_clip": 0.01137926, "auxiliary_loss_mlp": 0.01045271, "balance_loss_clip": 1.04531789, "balance_loss_mlp": 1.02703214, "epoch": 0.7012565382071785, "flos": 24823430582400.0, "grad_norm": 2.3205338495837915, "language_loss": 0.77899086, "learning_rate": 8.65251696202627e-07, "loss": 0.80082285, "num_input_tokens_seen": 125514145, "step": 5832, "time_per_iteration": 2.676391363143921 }, { "auxiliary_loss_clip": 0.01097422, "auxiliary_loss_mlp": 0.01044264, "balance_loss_clip": 1.04001868, "balance_loss_mlp": 1.0281589, "epoch": 0.7013767810978175, "flos": 21397445326080.0, "grad_norm": 2.179875489353207, "language_loss": 0.87343395, "learning_rate": 8.646103302016896e-07, "loss": 0.89485085, "num_input_tokens_seen": 125533115, "step": 5833, "time_per_iteration": 2.6773815155029297 }, { "auxiliary_loss_clip": 0.01094258, "auxiliary_loss_mlp": 0.01036539, "balance_loss_clip": 1.04039454, "balance_loss_mlp": 1.01944458, "epoch": 0.7014970239884567, "flos": 16687150306560.0, "grad_norm": 1.854594390065909, "language_loss": 0.88665509, "learning_rate": 8.639691364384614e-07, "loss": 0.90796304, "num_input_tokens_seen": 125550740, "step": 5834, "time_per_iteration": 2.755998373031616 }, { "auxiliary_loss_clip": 0.01112671, "auxiliary_loss_mlp": 0.01052738, "balance_loss_clip": 1.04161596, "balance_loss_mlp": 1.03638208, "epoch": 0.7016172668790958, "flos": 12568268718720.0, "grad_norm": 4.301600901927964, "language_loss": 0.73191351, "learning_rate": 8.633281150102136e-07, "loss": 0.75356758, "num_input_tokens_seen": 125567590, "step": 5835, "time_per_iteration": 2.6397969722747803 }, { "auxiliary_loss_clip": 0.01111819, "auxiliary_loss_mlp": 0.01042267, "balance_loss_clip": 1.04302263, "balance_loss_mlp": 1.02749753, "epoch": 0.7017375097697348, "flos": 17452729808640.0, "grad_norm": 2.0953931135736723, "language_loss": 0.67895854, "learning_rate": 8.626872660141855e-07, "loss": 0.70049942, "num_input_tokens_seen": 125585500, "step": 5836, "time_per_iteration": 3.675924062728882 }, { "auxiliary_loss_clip": 0.01089869, "auxiliary_loss_mlp": 0.01035099, "balance_loss_clip": 1.04304349, "balance_loss_mlp": 1.02066243, "epoch": 0.701857752660374, "flos": 18513028402560.0, "grad_norm": 1.9375851020617159, "language_loss": 0.74718332, "learning_rate": 8.620465895475957e-07, "loss": 0.76843297, "num_input_tokens_seen": 125603720, "step": 5837, "time_per_iteration": 3.610276460647583 }, { "auxiliary_loss_clip": 0.01082242, "auxiliary_loss_mlp": 0.01041362, "balance_loss_clip": 1.03933775, "balance_loss_mlp": 1.02470851, "epoch": 0.701977995551013, "flos": 24425971614720.0, "grad_norm": 1.4587087790337292, "language_loss": 0.75015295, "learning_rate": 8.614060857076333e-07, "loss": 0.77138901, "num_input_tokens_seen": 125624390, "step": 5838, "time_per_iteration": 2.747976064682007 }, { "auxiliary_loss_clip": 0.01105842, "auxiliary_loss_mlp": 0.01042627, "balance_loss_clip": 1.03963017, "balance_loss_mlp": 1.02668858, "epoch": 0.7020982384416521, "flos": 23002759958400.0, "grad_norm": 1.8839719544104725, "language_loss": 0.74957412, "learning_rate": 8.60765754591462e-07, "loss": 0.77105886, "num_input_tokens_seen": 125644085, "step": 5839, "time_per_iteration": 2.7409558296203613 }, { "auxiliary_loss_clip": 0.01133797, "auxiliary_loss_mlp": 0.01043825, "balance_loss_clip": 1.04274964, "balance_loss_mlp": 1.02802956, "epoch": 0.7022184813322913, "flos": 20449080489600.0, "grad_norm": 1.8977710589391903, "language_loss": 0.73024756, "learning_rate": 8.601255962962211e-07, "loss": 0.75202376, "num_input_tokens_seen": 125663095, "step": 5840, "time_per_iteration": 2.596827745437622 }, { "auxiliary_loss_clip": 0.01129348, "auxiliary_loss_mlp": 0.01045042, "balance_loss_clip": 1.04386437, "balance_loss_mlp": 1.02747071, "epoch": 0.7023387242229303, "flos": 19790514581760.0, "grad_norm": 2.2701835805403685, "language_loss": 0.72127229, "learning_rate": 8.594856109190194e-07, "loss": 0.74301618, "num_input_tokens_seen": 125680125, "step": 5841, "time_per_iteration": 2.692166328430176 }, { "auxiliary_loss_clip": 0.01138037, "auxiliary_loss_mlp": 0.01041658, "balance_loss_clip": 1.0458746, "balance_loss_mlp": 1.02499294, "epoch": 0.7024589671135694, "flos": 33259278286080.0, "grad_norm": 1.5566638947293174, "language_loss": 0.69390285, "learning_rate": 8.588457985569446e-07, "loss": 0.71569979, "num_input_tokens_seen": 125703035, "step": 5842, "time_per_iteration": 2.725905656814575 }, { "auxiliary_loss_clip": 0.01136508, "auxiliary_loss_mlp": 0.01049098, "balance_loss_clip": 1.0433538, "balance_loss_mlp": 1.03116846, "epoch": 0.7025792100042085, "flos": 19098982967040.0, "grad_norm": 2.1676819040023068, "language_loss": 0.71335816, "learning_rate": 8.582061593070542e-07, "loss": 0.73521423, "num_input_tokens_seen": 125723765, "step": 5843, "time_per_iteration": 2.6430225372314453 }, { "auxiliary_loss_clip": 0.01137043, "auxiliary_loss_mlp": 0.00772838, "balance_loss_clip": 1.04420412, "balance_loss_mlp": 1.00052571, "epoch": 0.7026994528948476, "flos": 18952611045120.0, "grad_norm": 2.7725816363171085, "language_loss": 0.77119863, "learning_rate": 8.57566693266383e-07, "loss": 0.79029739, "num_input_tokens_seen": 125741455, "step": 5844, "time_per_iteration": 2.6082229614257812 }, { "auxiliary_loss_clip": 0.01116119, "auxiliary_loss_mlp": 0.00772698, "balance_loss_clip": 1.0423311, "balance_loss_mlp": 1.00050855, "epoch": 0.7028196957854866, "flos": 19536662188800.0, "grad_norm": 2.133667375032107, "language_loss": 0.69327033, "learning_rate": 8.569274005319354e-07, "loss": 0.7121585, "num_input_tokens_seen": 125759855, "step": 5845, "time_per_iteration": 2.8065640926361084 }, { "auxiliary_loss_clip": 0.01117302, "auxiliary_loss_mlp": 0.01044227, "balance_loss_clip": 1.04031456, "balance_loss_mlp": 1.02886057, "epoch": 0.7029399386761258, "flos": 20845318394880.0, "grad_norm": 1.760691004024928, "language_loss": 0.8006919, "learning_rate": 8.562882812006913e-07, "loss": 0.82230717, "num_input_tokens_seen": 125777345, "step": 5846, "time_per_iteration": 2.6797256469726562 }, { "auxiliary_loss_clip": 0.01133531, "auxiliary_loss_mlp": 0.01035223, "balance_loss_clip": 1.04234648, "balance_loss_mlp": 1.0184741, "epoch": 0.7030601815667649, "flos": 22055005653120.0, "grad_norm": 2.15902091503284, "language_loss": 0.77365178, "learning_rate": 8.556493353696066e-07, "loss": 0.79533935, "num_input_tokens_seen": 125796345, "step": 5847, "time_per_iteration": 2.6425342559814453 }, { "auxiliary_loss_clip": 0.01126493, "auxiliary_loss_mlp": 0.00773169, "balance_loss_clip": 1.04567599, "balance_loss_mlp": 1.00049353, "epoch": 0.7031804244574039, "flos": 27198742089600.0, "grad_norm": 20.567952191669505, "language_loss": 0.67969167, "learning_rate": 8.550105631356077e-07, "loss": 0.69868827, "num_input_tokens_seen": 125816070, "step": 5848, "time_per_iteration": 2.7248027324676514 }, { "auxiliary_loss_clip": 0.01091284, "auxiliary_loss_mlp": 0.01042589, "balance_loss_clip": 1.03708005, "balance_loss_mlp": 1.02610266, "epoch": 0.7033006673480431, "flos": 22379853277440.0, "grad_norm": 1.940520425817261, "language_loss": 0.77268243, "learning_rate": 8.543719645955961e-07, "loss": 0.79402113, "num_input_tokens_seen": 125834400, "step": 5849, "time_per_iteration": 2.7309396266937256 }, { "auxiliary_loss_clip": 0.01110101, "auxiliary_loss_mlp": 0.0104382, "balance_loss_clip": 1.03925776, "balance_loss_mlp": 1.02778649, "epoch": 0.7034209102386821, "flos": 24715986024960.0, "grad_norm": 1.7905685505522009, "language_loss": 0.74306798, "learning_rate": 8.537335398464467e-07, "loss": 0.76460719, "num_input_tokens_seen": 125854720, "step": 5850, "time_per_iteration": 2.788043737411499 }, { "auxiliary_loss_clip": 0.01111497, "auxiliary_loss_mlp": 0.01048881, "balance_loss_clip": 1.04021847, "balance_loss_mlp": 1.03011799, "epoch": 0.7035411531293212, "flos": 22556174163840.0, "grad_norm": 2.772943647373398, "language_loss": 0.84855831, "learning_rate": 8.53095288985007e-07, "loss": 0.87016201, "num_input_tokens_seen": 125868455, "step": 5851, "time_per_iteration": 2.781043767929077 }, { "auxiliary_loss_clip": 0.01130132, "auxiliary_loss_mlp": 0.01038958, "balance_loss_clip": 1.04262233, "balance_loss_mlp": 1.02372265, "epoch": 0.7036613960199604, "flos": 22674967418880.0, "grad_norm": 1.5884074566650461, "language_loss": 0.82534242, "learning_rate": 8.524572121081009e-07, "loss": 0.84703332, "num_input_tokens_seen": 125888555, "step": 5852, "time_per_iteration": 2.643815755844116 }, { "auxiliary_loss_clip": 0.01130876, "auxiliary_loss_mlp": 0.01041371, "balance_loss_clip": 1.04410744, "balance_loss_mlp": 1.02475286, "epoch": 0.7037816389105994, "flos": 22492146170880.0, "grad_norm": 2.107422132835635, "language_loss": 0.62212831, "learning_rate": 8.518193093125232e-07, "loss": 0.6438508, "num_input_tokens_seen": 125907610, "step": 5853, "time_per_iteration": 2.681899309158325 }, { "auxiliary_loss_clip": 0.01114294, "auxiliary_loss_mlp": 0.0104058, "balance_loss_clip": 1.04318976, "balance_loss_mlp": 1.02493942, "epoch": 0.7039018818012385, "flos": 27087490690560.0, "grad_norm": 1.685665770403327, "language_loss": 0.81151831, "learning_rate": 8.511815806950436e-07, "loss": 0.833067, "num_input_tokens_seen": 125928640, "step": 5854, "time_per_iteration": 2.6956074237823486 }, { "auxiliary_loss_clip": 0.01118799, "auxiliary_loss_mlp": 0.01037732, "balance_loss_clip": 1.04010868, "balance_loss_mlp": 1.02262807, "epoch": 0.7040221246918776, "flos": 17749819198080.0, "grad_norm": 1.6474905068047596, "language_loss": 0.77976668, "learning_rate": 8.505440263524044e-07, "loss": 0.801332, "num_input_tokens_seen": 125947485, "step": 5855, "time_per_iteration": 3.6465678215026855 }, { "auxiliary_loss_clip": 0.01126201, "auxiliary_loss_mlp": 0.0103869, "balance_loss_clip": 1.04269767, "balance_loss_mlp": 1.02042675, "epoch": 0.7041423675825167, "flos": 16279851012480.0, "grad_norm": 4.737566891235351, "language_loss": 0.88036942, "learning_rate": 8.49906646381322e-07, "loss": 0.90201837, "num_input_tokens_seen": 125960320, "step": 5856, "time_per_iteration": 3.509486198425293 }, { "auxiliary_loss_clip": 0.01097195, "auxiliary_loss_mlp": 0.0103827, "balance_loss_clip": 1.04154348, "balance_loss_mlp": 1.02331471, "epoch": 0.7042626104731557, "flos": 25483181639040.0, "grad_norm": 1.7934357472106244, "language_loss": 0.72064614, "learning_rate": 8.492694408784884e-07, "loss": 0.74200082, "num_input_tokens_seen": 125980575, "step": 5857, "time_per_iteration": 2.776691198348999 }, { "auxiliary_loss_clip": 0.01125733, "auxiliary_loss_mlp": 0.01051473, "balance_loss_clip": 1.04243708, "balance_loss_mlp": 1.03490281, "epoch": 0.7043828533637949, "flos": 17857622891520.0, "grad_norm": 3.1273255940652693, "language_loss": 0.62143695, "learning_rate": 8.486324099405642e-07, "loss": 0.64320898, "num_input_tokens_seen": 125997420, "step": 5858, "time_per_iteration": 2.649056911468506 }, { "auxiliary_loss_clip": 0.01120196, "auxiliary_loss_mlp": 0.01040113, "balance_loss_clip": 1.0405432, "balance_loss_mlp": 1.02343559, "epoch": 0.704503096254434, "flos": 29494259533440.0, "grad_norm": 2.3130052872054434, "language_loss": 0.74785435, "learning_rate": 8.479955536641887e-07, "loss": 0.76945746, "num_input_tokens_seen": 126018915, "step": 5859, "time_per_iteration": 2.735527992248535 }, { "auxiliary_loss_clip": 0.0110054, "auxiliary_loss_mlp": 0.01043686, "balance_loss_clip": 1.03655505, "balance_loss_mlp": 1.0263406, "epoch": 0.704623339145073, "flos": 30920739327360.0, "grad_norm": 1.778220321620968, "language_loss": 0.66014254, "learning_rate": 8.473588721459716e-07, "loss": 0.68158472, "num_input_tokens_seen": 126038825, "step": 5860, "time_per_iteration": 2.753103256225586 }, { "auxiliary_loss_clip": 0.01124152, "auxiliary_loss_mlp": 0.01043805, "balance_loss_clip": 1.04202557, "balance_loss_mlp": 1.02544653, "epoch": 0.7047435820357122, "flos": 23914747296000.0, "grad_norm": 1.948701684623128, "language_loss": 0.70571685, "learning_rate": 8.467223654824967e-07, "loss": 0.72739643, "num_input_tokens_seen": 126058280, "step": 5861, "time_per_iteration": 2.646392345428467 }, { "auxiliary_loss_clip": 0.01118237, "auxiliary_loss_mlp": 0.0103133, "balance_loss_clip": 1.04271793, "balance_loss_mlp": 1.01572561, "epoch": 0.7048638249263512, "flos": 46494010926720.0, "grad_norm": 2.8773185740916905, "language_loss": 0.62669218, "learning_rate": 8.460860337703233e-07, "loss": 0.64818788, "num_input_tokens_seen": 126078885, "step": 5862, "time_per_iteration": 3.8271045684814453 }, { "auxiliary_loss_clip": 0.01084359, "auxiliary_loss_mlp": 0.01043263, "balance_loss_clip": 1.03776908, "balance_loss_mlp": 1.02390325, "epoch": 0.7049840678169903, "flos": 21689219502720.0, "grad_norm": 1.8953655443007402, "language_loss": 0.70444679, "learning_rate": 8.454498771059797e-07, "loss": 0.72572297, "num_input_tokens_seen": 126098260, "step": 5863, "time_per_iteration": 3.6534180641174316 }, { "auxiliary_loss_clip": 0.01077582, "auxiliary_loss_mlp": 0.01039203, "balance_loss_clip": 1.03710389, "balance_loss_mlp": 1.02260959, "epoch": 0.7051043107076294, "flos": 18405081054720.0, "grad_norm": 2.4406289572812363, "language_loss": 0.83128417, "learning_rate": 8.448138955859725e-07, "loss": 0.85245204, "num_input_tokens_seen": 126114845, "step": 5864, "time_per_iteration": 2.6863222122192383 }, { "auxiliary_loss_clip": 0.01110464, "auxiliary_loss_mlp": 0.0104323, "balance_loss_clip": 1.04055798, "balance_loss_mlp": 1.02634954, "epoch": 0.7052245535982685, "flos": 19319043640320.0, "grad_norm": 2.580014072176894, "language_loss": 0.90264487, "learning_rate": 8.44178089306778e-07, "loss": 0.92418182, "num_input_tokens_seen": 126132780, "step": 5865, "time_per_iteration": 2.70704984664917 }, { "auxiliary_loss_clip": 0.01134689, "auxiliary_loss_mlp": 0.01042029, "balance_loss_clip": 1.04451275, "balance_loss_mlp": 1.02587605, "epoch": 0.7053447964889076, "flos": 19062138591360.0, "grad_norm": 1.9492604790264978, "language_loss": 0.77030075, "learning_rate": 8.4354245836485e-07, "loss": 0.792068, "num_input_tokens_seen": 126151225, "step": 5866, "time_per_iteration": 2.5786855220794678 }, { "auxiliary_loss_clip": 0.01099784, "auxiliary_loss_mlp": 0.0104194, "balance_loss_clip": 1.04021204, "balance_loss_mlp": 1.02458358, "epoch": 0.7054650393795466, "flos": 27379228953600.0, "grad_norm": 2.5355587231917003, "language_loss": 0.732503, "learning_rate": 8.429070028566108e-07, "loss": 0.75392026, "num_input_tokens_seen": 126172535, "step": 5867, "time_per_iteration": 2.769693613052368 }, { "auxiliary_loss_clip": 0.01122355, "auxiliary_loss_mlp": 0.01045585, "balance_loss_clip": 1.04319787, "balance_loss_mlp": 1.02978957, "epoch": 0.7055852822701858, "flos": 16102201322880.0, "grad_norm": 1.824666774124458, "language_loss": 0.7487849, "learning_rate": 8.422717228784586e-07, "loss": 0.7704643, "num_input_tokens_seen": 126189410, "step": 5868, "time_per_iteration": 2.6239736080169678 }, { "auxiliary_loss_clip": 0.01089076, "auxiliary_loss_mlp": 0.01041009, "balance_loss_clip": 1.04296505, "balance_loss_mlp": 1.02453446, "epoch": 0.7057055251608249, "flos": 11692299744000.0, "grad_norm": 1.9927712060757106, "language_loss": 0.69407755, "learning_rate": 8.416366185267663e-07, "loss": 0.7153784, "num_input_tokens_seen": 126206910, "step": 5869, "time_per_iteration": 2.728316307067871 }, { "auxiliary_loss_clip": 0.01121683, "auxiliary_loss_mlp": 0.01038932, "balance_loss_clip": 1.04200089, "balance_loss_mlp": 1.02379274, "epoch": 0.7058257680514639, "flos": 22711560399360.0, "grad_norm": 1.8260684732953714, "language_loss": 0.77844596, "learning_rate": 8.410016898978778e-07, "loss": 0.80005211, "num_input_tokens_seen": 126224385, "step": 5870, "time_per_iteration": 2.650859832763672 }, { "auxiliary_loss_clip": 0.01084259, "auxiliary_loss_mlp": 0.01035099, "balance_loss_clip": 1.04105854, "balance_loss_mlp": 1.01925063, "epoch": 0.7059460109421031, "flos": 17529543043200.0, "grad_norm": 2.261946184707155, "language_loss": 0.79038876, "learning_rate": 8.403669370881115e-07, "loss": 0.81158233, "num_input_tokens_seen": 126243120, "step": 5871, "time_per_iteration": 2.794983148574829 }, { "auxiliary_loss_clip": 0.01134579, "auxiliary_loss_mlp": 0.01036125, "balance_loss_clip": 1.04521918, "balance_loss_mlp": 1.02026987, "epoch": 0.7060662538327421, "flos": 23544687427200.0, "grad_norm": 1.937398907041289, "language_loss": 0.78629345, "learning_rate": 8.397323601937587e-07, "loss": 0.80800045, "num_input_tokens_seen": 126263020, "step": 5872, "time_per_iteration": 2.596508502960205 }, { "auxiliary_loss_clip": 0.01095805, "auxiliary_loss_mlp": 0.01041679, "balance_loss_clip": 1.040025, "balance_loss_mlp": 1.02466762, "epoch": 0.7061864967233812, "flos": 30260736875520.0, "grad_norm": 2.0319621509784866, "language_loss": 0.77454972, "learning_rate": 8.390979593110838e-07, "loss": 0.7959246, "num_input_tokens_seen": 126285150, "step": 5873, "time_per_iteration": 2.8243298530578613 }, { "auxiliary_loss_clip": 0.01118645, "auxiliary_loss_mlp": 0.01040889, "balance_loss_clip": 1.04384947, "balance_loss_mlp": 1.02389026, "epoch": 0.7063067396140204, "flos": 20701460424960.0, "grad_norm": 6.679078065171553, "language_loss": 0.81861341, "learning_rate": 8.384637345363262e-07, "loss": 0.84020877, "num_input_tokens_seen": 126304340, "step": 5874, "time_per_iteration": 2.6995701789855957 }, { "auxiliary_loss_clip": 0.01103692, "auxiliary_loss_mlp": 0.01035718, "balance_loss_clip": 1.0389719, "balance_loss_mlp": 1.01846862, "epoch": 0.7064269825046594, "flos": 32266168081920.0, "grad_norm": 2.9432489456782984, "language_loss": 0.76593137, "learning_rate": 8.378296859656964e-07, "loss": 0.7873255, "num_input_tokens_seen": 126325495, "step": 5875, "time_per_iteration": 2.7760372161865234 }, { "auxiliary_loss_clip": 0.01112208, "auxiliary_loss_mlp": 0.01038545, "balance_loss_clip": 1.04185009, "balance_loss_mlp": 1.02270246, "epoch": 0.7065472253952985, "flos": 30227124723840.0, "grad_norm": 3.003472290571046, "language_loss": 0.6858207, "learning_rate": 8.371958136953792e-07, "loss": 0.7073282, "num_input_tokens_seen": 126345525, "step": 5876, "time_per_iteration": 2.752203941345215 }, { "auxiliary_loss_clip": 0.01102727, "auxiliary_loss_mlp": 0.01040758, "balance_loss_clip": 1.04023445, "balance_loss_mlp": 1.02423525, "epoch": 0.7066674682859376, "flos": 16216720859520.0, "grad_norm": 3.0028793577159347, "language_loss": 0.66413903, "learning_rate": 8.365621178215326e-07, "loss": 0.68557382, "num_input_tokens_seen": 126361995, "step": 5877, "time_per_iteration": 2.661548137664795 }, { "auxiliary_loss_clip": 0.01116366, "auxiliary_loss_mlp": 0.01032704, "balance_loss_clip": 1.04113936, "balance_loss_mlp": 1.01702809, "epoch": 0.7067877111765767, "flos": 14830461319680.0, "grad_norm": 2.1585328751905073, "language_loss": 0.75155699, "learning_rate": 8.359285984402871e-07, "loss": 0.77304769, "num_input_tokens_seen": 126379260, "step": 5878, "time_per_iteration": 2.6559112071990967 }, { "auxiliary_loss_clip": 0.01104174, "auxiliary_loss_mlp": 0.0103791, "balance_loss_clip": 1.03911495, "balance_loss_mlp": 1.02209711, "epoch": 0.7069079540672157, "flos": 25440196037760.0, "grad_norm": 2.0997539220118546, "language_loss": 0.73979849, "learning_rate": 8.352952556477489e-07, "loss": 0.76121938, "num_input_tokens_seen": 126397170, "step": 5879, "time_per_iteration": 2.6589457988739014 }, { "auxiliary_loss_clip": 0.01125122, "auxiliary_loss_mlp": 0.01037247, "balance_loss_clip": 1.04431057, "balance_loss_mlp": 1.0216192, "epoch": 0.7070281969578549, "flos": 24607751368320.0, "grad_norm": 1.877228461304237, "language_loss": 0.76816308, "learning_rate": 8.34662089539993e-07, "loss": 0.78978676, "num_input_tokens_seen": 126416680, "step": 5880, "time_per_iteration": 2.6786959171295166 }, { "auxiliary_loss_clip": 0.011318, "auxiliary_loss_mlp": 0.01044307, "balance_loss_clip": 1.04363441, "balance_loss_mlp": 1.02842796, "epoch": 0.707148439848494, "flos": 26724469887360.0, "grad_norm": 2.1379987636272633, "language_loss": 0.79258263, "learning_rate": 8.340291002130722e-07, "loss": 0.81434369, "num_input_tokens_seen": 126435870, "step": 5881, "time_per_iteration": 3.6266775131225586 }, { "auxiliary_loss_clip": 0.01142582, "auxiliary_loss_mlp": 0.01042202, "balance_loss_clip": 1.04770374, "balance_loss_mlp": 1.02552497, "epoch": 0.707268682739133, "flos": 15085750256640.0, "grad_norm": 2.7065102587358973, "language_loss": 0.7985903, "learning_rate": 8.3339628776301e-07, "loss": 0.82043815, "num_input_tokens_seen": 126454010, "step": 5882, "time_per_iteration": 3.5403730869293213 }, { "auxiliary_loss_clip": 0.01130036, "auxiliary_loss_mlp": 0.01040836, "balance_loss_clip": 1.04111052, "balance_loss_mlp": 1.02278805, "epoch": 0.7073889256297722, "flos": 34313148345600.0, "grad_norm": 2.5210554819894018, "language_loss": 0.57329035, "learning_rate": 8.327636522858033e-07, "loss": 0.59499907, "num_input_tokens_seen": 126473615, "step": 5883, "time_per_iteration": 2.713517665863037 }, { "auxiliary_loss_clip": 0.01088166, "auxiliary_loss_mlp": 0.01035788, "balance_loss_clip": 1.04177761, "balance_loss_mlp": 1.01838398, "epoch": 0.7075091685204112, "flos": 20083940784000.0, "grad_norm": 2.0453718678913266, "language_loss": 0.76774073, "learning_rate": 8.321311938774225e-07, "loss": 0.78898036, "num_input_tokens_seen": 126492705, "step": 5884, "time_per_iteration": 2.695175886154175 }, { "auxiliary_loss_clip": 0.0113868, "auxiliary_loss_mlp": 0.01041754, "balance_loss_clip": 1.04546332, "balance_loss_mlp": 1.02543449, "epoch": 0.7076294114110503, "flos": 20777124424320.0, "grad_norm": 1.9714126282607622, "language_loss": 0.79276538, "learning_rate": 8.314989126338104e-07, "loss": 0.81456977, "num_input_tokens_seen": 126512715, "step": 5885, "time_per_iteration": 2.6334872245788574 }, { "auxiliary_loss_clip": 0.01127221, "auxiliary_loss_mlp": 0.01044198, "balance_loss_clip": 1.04449391, "balance_loss_mlp": 1.02692413, "epoch": 0.7077496543016895, "flos": 17967689141760.0, "grad_norm": 1.7272787463231274, "language_loss": 0.84475589, "learning_rate": 8.308668086508847e-07, "loss": 0.86647004, "num_input_tokens_seen": 126530795, "step": 5886, "time_per_iteration": 2.6254818439483643 }, { "auxiliary_loss_clip": 0.01101208, "auxiliary_loss_mlp": 0.01040849, "balance_loss_clip": 1.03895855, "balance_loss_mlp": 1.02390909, "epoch": 0.7078698971923285, "flos": 45478098564480.0, "grad_norm": 2.1591564217310264, "language_loss": 0.740574, "learning_rate": 8.302348820245342e-07, "loss": 0.7619946, "num_input_tokens_seen": 126553360, "step": 5887, "time_per_iteration": 3.044734477996826 }, { "auxiliary_loss_clip": 0.01101783, "auxiliary_loss_mlp": 0.01046782, "balance_loss_clip": 1.04035211, "balance_loss_mlp": 1.02630222, "epoch": 0.7079901400829676, "flos": 26943704547840.0, "grad_norm": 2.66493091179031, "language_loss": 0.69831467, "learning_rate": 8.296031328506232e-07, "loss": 0.71980029, "num_input_tokens_seen": 126573110, "step": 5888, "time_per_iteration": 3.7583470344543457 }, { "auxiliary_loss_clip": 0.01113135, "auxiliary_loss_mlp": 0.01037754, "balance_loss_clip": 1.0421195, "balance_loss_mlp": 1.02070749, "epoch": 0.7081103829736067, "flos": 24423206267520.0, "grad_norm": 1.9958734754332106, "language_loss": 0.75570285, "learning_rate": 8.289715612249857e-07, "loss": 0.77721179, "num_input_tokens_seen": 126593725, "step": 5889, "time_per_iteration": 3.625345468521118 }, { "auxiliary_loss_clip": 0.01109339, "auxiliary_loss_mlp": 0.01042944, "balance_loss_clip": 1.04132926, "balance_loss_mlp": 1.02599227, "epoch": 0.7082306258642458, "flos": 18543300589440.0, "grad_norm": 2.7813279669951765, "language_loss": 0.77335334, "learning_rate": 8.283401672434305e-07, "loss": 0.79487622, "num_input_tokens_seen": 126608950, "step": 5890, "time_per_iteration": 2.6310274600982666 }, { "auxiliary_loss_clip": 0.01110708, "auxiliary_loss_mlp": 0.01040098, "balance_loss_clip": 1.04231715, "balance_loss_mlp": 1.02404094, "epoch": 0.7083508687548848, "flos": 23477534951040.0, "grad_norm": 1.9775027987158467, "language_loss": 0.70406532, "learning_rate": 8.277089510017412e-07, "loss": 0.72557336, "num_input_tokens_seen": 126629755, "step": 5891, "time_per_iteration": 2.6774964332580566 }, { "auxiliary_loss_clip": 0.01109399, "auxiliary_loss_mlp": 0.01042274, "balance_loss_clip": 1.04179573, "balance_loss_mlp": 1.02655005, "epoch": 0.708471111645524, "flos": 22419463000320.0, "grad_norm": 1.8538087483078836, "language_loss": 0.82044399, "learning_rate": 8.270779125956719e-07, "loss": 0.84196073, "num_input_tokens_seen": 126650135, "step": 5892, "time_per_iteration": 2.6788761615753174 }, { "auxiliary_loss_clip": 0.01082813, "auxiliary_loss_mlp": 0.0104948, "balance_loss_clip": 1.03910375, "balance_loss_mlp": 1.03161097, "epoch": 0.7085913545361631, "flos": 20922885815040.0, "grad_norm": 2.0429346780326982, "language_loss": 0.79952878, "learning_rate": 8.264470521209505e-07, "loss": 0.82085174, "num_input_tokens_seen": 126668500, "step": 5893, "time_per_iteration": 2.7298340797424316 }, { "auxiliary_loss_clip": 0.01115459, "auxiliary_loss_mlp": 0.01043322, "balance_loss_clip": 1.03974056, "balance_loss_mlp": 1.02660894, "epoch": 0.7087115974268021, "flos": 15012384727680.0, "grad_norm": 3.011406667772246, "language_loss": 0.76419908, "learning_rate": 8.258163696732785e-07, "loss": 0.78578687, "num_input_tokens_seen": 126686090, "step": 5894, "time_per_iteration": 2.6147427558898926 }, { "auxiliary_loss_clip": 0.01119257, "auxiliary_loss_mlp": 0.01045295, "balance_loss_clip": 1.04272485, "balance_loss_mlp": 1.02811682, "epoch": 0.7088318403174413, "flos": 21539040739200.0, "grad_norm": 1.8308388568062226, "language_loss": 0.77201152, "learning_rate": 8.251858653483288e-07, "loss": 0.79365706, "num_input_tokens_seen": 126704255, "step": 5895, "time_per_iteration": 2.668180465698242 }, { "auxiliary_loss_clip": 0.01127242, "auxiliary_loss_mlp": 0.01048086, "balance_loss_clip": 1.04493642, "balance_loss_mlp": 1.03187394, "epoch": 0.7089520832080803, "flos": 15516785462400.0, "grad_norm": 2.423421983469364, "language_loss": 0.8580966, "learning_rate": 8.245555392417501e-07, "loss": 0.87984985, "num_input_tokens_seen": 126718910, "step": 5896, "time_per_iteration": 2.5580270290374756 }, { "auxiliary_loss_clip": 0.01073595, "auxiliary_loss_mlp": 0.01039548, "balance_loss_clip": 1.0362463, "balance_loss_mlp": 1.02372885, "epoch": 0.7090723260987194, "flos": 20412667077120.0, "grad_norm": 1.888364957825875, "language_loss": 0.78909743, "learning_rate": 8.239253914491613e-07, "loss": 0.81022888, "num_input_tokens_seen": 126737235, "step": 5897, "time_per_iteration": 2.7887091636657715 }, { "auxiliary_loss_clip": 0.01095469, "auxiliary_loss_mlp": 0.01040189, "balance_loss_clip": 1.04002428, "balance_loss_mlp": 1.02266526, "epoch": 0.7091925689893585, "flos": 25668337271040.0, "grad_norm": 2.055626927613484, "language_loss": 0.75326383, "learning_rate": 8.232954220661556e-07, "loss": 0.77462041, "num_input_tokens_seen": 126759970, "step": 5898, "time_per_iteration": 2.73640513420105 }, { "auxiliary_loss_clip": 0.01135479, "auxiliary_loss_mlp": 0.01043502, "balance_loss_clip": 1.04615736, "balance_loss_mlp": 1.02812386, "epoch": 0.7093128118799976, "flos": 24206629213440.0, "grad_norm": 2.5812042489486986, "language_loss": 0.70537984, "learning_rate": 8.226656311882989e-07, "loss": 0.72716963, "num_input_tokens_seen": 126779280, "step": 5899, "time_per_iteration": 2.6569290161132812 }, { "auxiliary_loss_clip": 0.01120075, "auxiliary_loss_mlp": 0.01040507, "balance_loss_clip": 1.04352307, "balance_loss_mlp": 1.02573645, "epoch": 0.7094330547706367, "flos": 16646786398080.0, "grad_norm": 2.2954490960427356, "language_loss": 0.77034193, "learning_rate": 8.22036018911129e-07, "loss": 0.79194772, "num_input_tokens_seen": 126797310, "step": 5900, "time_per_iteration": 2.609361410140991 }, { "auxiliary_loss_clip": 0.01140979, "auxiliary_loss_mlp": 0.01049201, "balance_loss_clip": 1.04482448, "balance_loss_mlp": 1.03129554, "epoch": 0.7095532976612757, "flos": 16283370545280.0, "grad_norm": 2.453605390953832, "language_loss": 0.80419862, "learning_rate": 8.214065853301599e-07, "loss": 0.82610041, "num_input_tokens_seen": 126812840, "step": 5901, "time_per_iteration": 2.610865831375122 }, { "auxiliary_loss_clip": 0.0102886, "auxiliary_loss_mlp": 0.01002395, "balance_loss_clip": 1.00868714, "balance_loss_mlp": 1.00089884, "epoch": 0.7096735405519149, "flos": 70722080559360.0, "grad_norm": 0.801987472805036, "language_loss": 0.58241594, "learning_rate": 8.207773305408734e-07, "loss": 0.60272849, "num_input_tokens_seen": 126880060, "step": 5902, "time_per_iteration": 3.3070406913757324 }, { "auxiliary_loss_clip": 0.01090544, "auxiliary_loss_mlp": 0.01038457, "balance_loss_clip": 1.03862345, "balance_loss_mlp": 1.02083766, "epoch": 0.709793783442554, "flos": 23621500661760.0, "grad_norm": 2.025989808866006, "language_loss": 0.79935473, "learning_rate": 8.201482546387288e-07, "loss": 0.82064474, "num_input_tokens_seen": 126899535, "step": 5903, "time_per_iteration": 2.79052734375 }, { "auxiliary_loss_clip": 0.0112749, "auxiliary_loss_mlp": 0.01038249, "balance_loss_clip": 1.04738414, "balance_loss_mlp": 1.0222156, "epoch": 0.709914026333193, "flos": 25993472204160.0, "grad_norm": 1.5952283680633947, "language_loss": 0.91653597, "learning_rate": 8.195193577191553e-07, "loss": 0.93819332, "num_input_tokens_seen": 126921365, "step": 5904, "time_per_iteration": 2.701211452484131 }, { "auxiliary_loss_clip": 0.0112033, "auxiliary_loss_mlp": 0.00772231, "balance_loss_clip": 1.04292357, "balance_loss_mlp": 1.00048518, "epoch": 0.7100342692238322, "flos": 24861531934080.0, "grad_norm": 1.7089106672176277, "language_loss": 0.84592646, "learning_rate": 8.188906398775579e-07, "loss": 0.86485207, "num_input_tokens_seen": 126941910, "step": 5905, "time_per_iteration": 2.644549608230591 }, { "auxiliary_loss_clip": 0.01136733, "auxiliary_loss_mlp": 0.0077267, "balance_loss_clip": 1.04381275, "balance_loss_mlp": 1.00057757, "epoch": 0.7101545121144712, "flos": 24932203943040.0, "grad_norm": 2.0172634462684105, "language_loss": 0.68498874, "learning_rate": 8.18262101209311e-07, "loss": 0.70408273, "num_input_tokens_seen": 126961120, "step": 5906, "time_per_iteration": 2.651214122772217 }, { "auxiliary_loss_clip": 0.01124098, "auxiliary_loss_mlp": 0.01043682, "balance_loss_clip": 1.04286337, "balance_loss_mlp": 1.02744579, "epoch": 0.7102747550051103, "flos": 23768842250880.0, "grad_norm": 2.328185369546585, "language_loss": 0.70082641, "learning_rate": 8.176337418097626e-07, "loss": 0.72250426, "num_input_tokens_seen": 126981590, "step": 5907, "time_per_iteration": 2.6555542945861816 }, { "auxiliary_loss_clip": 0.01124326, "auxiliary_loss_mlp": 0.00772615, "balance_loss_clip": 1.04496861, "balance_loss_mlp": 1.00053883, "epoch": 0.7103949978957494, "flos": 15303907509120.0, "grad_norm": 2.38502905138214, "language_loss": 0.79833472, "learning_rate": 8.170055617742364e-07, "loss": 0.81730413, "num_input_tokens_seen": 126998870, "step": 5908, "time_per_iteration": 3.613452196121216 }, { "auxiliary_loss_clip": 0.01106159, "auxiliary_loss_mlp": 0.01047799, "balance_loss_clip": 1.04056561, "balance_loss_mlp": 1.0302273, "epoch": 0.7105152407863885, "flos": 22638805401600.0, "grad_norm": 3.15383925873535, "language_loss": 0.70624924, "learning_rate": 8.163775611980252e-07, "loss": 0.72778881, "num_input_tokens_seen": 127017980, "step": 5909, "time_per_iteration": 3.6630053520202637 }, { "auxiliary_loss_clip": 0.0110954, "auxiliary_loss_mlp": 0.01039503, "balance_loss_clip": 1.04203629, "balance_loss_mlp": 1.02333808, "epoch": 0.7106354836770276, "flos": 17238594879360.0, "grad_norm": 2.1047232352675938, "language_loss": 0.78829968, "learning_rate": 8.157497401763982e-07, "loss": 0.80979013, "num_input_tokens_seen": 127035645, "step": 5910, "time_per_iteration": 2.6195027828216553 }, { "auxiliary_loss_clip": 0.011214, "auxiliary_loss_mlp": 0.01036263, "balance_loss_clip": 1.04127526, "balance_loss_mlp": 1.0197649, "epoch": 0.7107557265676667, "flos": 20193647898240.0, "grad_norm": 2.1054906656580616, "language_loss": 0.77876067, "learning_rate": 8.151220988045935e-07, "loss": 0.80033731, "num_input_tokens_seen": 127054900, "step": 5911, "time_per_iteration": 2.6448023319244385 }, { "auxiliary_loss_clip": 0.01121556, "auxiliary_loss_mlp": 0.01039601, "balance_loss_clip": 1.04177082, "balance_loss_mlp": 1.02435422, "epoch": 0.7108759694583058, "flos": 21507080613120.0, "grad_norm": 1.7284580808052776, "language_loss": 0.82511795, "learning_rate": 8.144946371778234e-07, "loss": 0.84672946, "num_input_tokens_seen": 127075010, "step": 5912, "time_per_iteration": 2.584855794906616 }, { "auxiliary_loss_clip": 0.01110491, "auxiliary_loss_mlp": 0.00772821, "balance_loss_clip": 1.04214668, "balance_loss_mlp": 1.00054741, "epoch": 0.7109962123489448, "flos": 24061909317120.0, "grad_norm": 2.145698498743194, "language_loss": 0.77969652, "learning_rate": 8.138673553912751e-07, "loss": 0.79852962, "num_input_tokens_seen": 127095570, "step": 5913, "time_per_iteration": 2.7286956310272217 }, { "auxiliary_loss_clip": 0.01082636, "auxiliary_loss_mlp": 0.01043894, "balance_loss_clip": 1.03806341, "balance_loss_mlp": 1.02755058, "epoch": 0.711116455239584, "flos": 30480474326400.0, "grad_norm": 2.6230954733426133, "language_loss": 0.57140708, "learning_rate": 8.132402535401059e-07, "loss": 0.59267235, "num_input_tokens_seen": 127116825, "step": 5914, "time_per_iteration": 3.7559142112731934 }, { "auxiliary_loss_clip": 0.01117119, "auxiliary_loss_mlp": 0.01035133, "balance_loss_clip": 1.04327679, "balance_loss_mlp": 1.01948667, "epoch": 0.711236698130223, "flos": 25045610158080.0, "grad_norm": 1.900578827409968, "language_loss": 0.74687988, "learning_rate": 8.126133317194465e-07, "loss": 0.7684024, "num_input_tokens_seen": 127137015, "step": 5915, "time_per_iteration": 3.6881728172302246 }, { "auxiliary_loss_clip": 0.01078368, "auxiliary_loss_mlp": 0.01045902, "balance_loss_clip": 1.0375371, "balance_loss_mlp": 1.02793682, "epoch": 0.7113569410208621, "flos": 24206701040640.0, "grad_norm": 2.396227936412575, "language_loss": 0.7426039, "learning_rate": 8.11986590024401e-07, "loss": 0.76384658, "num_input_tokens_seen": 127156755, "step": 5916, "time_per_iteration": 2.765146017074585 }, { "auxiliary_loss_clip": 0.0111248, "auxiliary_loss_mlp": 0.01041029, "balance_loss_clip": 1.04367924, "balance_loss_mlp": 1.02431643, "epoch": 0.7114771839115013, "flos": 35439306526080.0, "grad_norm": 1.6162238897543064, "language_loss": 0.69002795, "learning_rate": 8.113600285500442e-07, "loss": 0.71156305, "num_input_tokens_seen": 127176965, "step": 5917, "time_per_iteration": 2.767550468444824 }, { "auxiliary_loss_clip": 0.01135219, "auxiliary_loss_mlp": 0.01041354, "balance_loss_clip": 1.04382563, "balance_loss_mlp": 1.02623844, "epoch": 0.7115974268021403, "flos": 21099458096640.0, "grad_norm": 2.049271130921067, "language_loss": 0.74544346, "learning_rate": 8.107336473914268e-07, "loss": 0.76720917, "num_input_tokens_seen": 127195595, "step": 5918, "time_per_iteration": 2.619696617126465 }, { "auxiliary_loss_clip": 0.01020109, "auxiliary_loss_mlp": 0.01014524, "balance_loss_clip": 1.00997031, "balance_loss_mlp": 1.01230717, "epoch": 0.7117176696927794, "flos": 56752866616320.0, "grad_norm": 0.7734382587155059, "language_loss": 0.55758727, "learning_rate": 8.101074466435694e-07, "loss": 0.57793361, "num_input_tokens_seen": 127255070, "step": 5919, "time_per_iteration": 3.1748206615448 }, { "auxiliary_loss_clip": 0.01117706, "auxiliary_loss_mlp": 0.01041765, "balance_loss_clip": 1.04191065, "balance_loss_mlp": 1.02474165, "epoch": 0.7118379125834186, "flos": 15925269905280.0, "grad_norm": 1.812833950303948, "language_loss": 0.68374687, "learning_rate": 8.094814264014662e-07, "loss": 0.70534158, "num_input_tokens_seen": 127273825, "step": 5920, "time_per_iteration": 2.602436065673828 }, { "auxiliary_loss_clip": 0.01140348, "auxiliary_loss_mlp": 0.01047422, "balance_loss_clip": 1.04435945, "balance_loss_mlp": 1.02950442, "epoch": 0.7119581554740576, "flos": 20193360589440.0, "grad_norm": 2.2593780031515815, "language_loss": 0.81354344, "learning_rate": 8.088555867600844e-07, "loss": 0.83542114, "num_input_tokens_seen": 127289990, "step": 5921, "time_per_iteration": 2.6102776527404785 }, { "auxiliary_loss_clip": 0.01094381, "auxiliary_loss_mlp": 0.01042541, "balance_loss_clip": 1.03972101, "balance_loss_mlp": 1.02744913, "epoch": 0.7120783983646967, "flos": 34715383822080.0, "grad_norm": 2.306152519924396, "language_loss": 0.60162276, "learning_rate": 8.08229927814362e-07, "loss": 0.62299192, "num_input_tokens_seen": 127312880, "step": 5922, "time_per_iteration": 2.802896738052368 }, { "auxiliary_loss_clip": 0.01096096, "auxiliary_loss_mlp": 0.01037232, "balance_loss_clip": 1.04135203, "balance_loss_mlp": 1.02205682, "epoch": 0.7121986412553358, "flos": 26359114700160.0, "grad_norm": 1.8435086370791745, "language_loss": 0.6505006, "learning_rate": 8.076044496592134e-07, "loss": 0.67183387, "num_input_tokens_seen": 127334730, "step": 5923, "time_per_iteration": 2.728388547897339 }, { "auxiliary_loss_clip": 0.0111178, "auxiliary_loss_mlp": 0.01040531, "balance_loss_clip": 1.04289556, "balance_loss_mlp": 1.02446139, "epoch": 0.7123188841459749, "flos": 11145344371200.0, "grad_norm": 4.173995630247619, "language_loss": 0.77831995, "learning_rate": 8.069791523895204e-07, "loss": 0.79984301, "num_input_tokens_seen": 127351180, "step": 5924, "time_per_iteration": 2.616739511489868 }, { "auxiliary_loss_clip": 0.01087078, "auxiliary_loss_mlp": 0.01039066, "balance_loss_clip": 1.03744817, "balance_loss_mlp": 1.02279425, "epoch": 0.7124391270366139, "flos": 20811670329600.0, "grad_norm": 1.9010305984284637, "language_loss": 0.7723099, "learning_rate": 8.063540361001422e-07, "loss": 0.79357135, "num_input_tokens_seen": 127369750, "step": 5925, "time_per_iteration": 2.7084991931915283 }, { "auxiliary_loss_clip": 0.0109312, "auxiliary_loss_mlp": 0.0105443, "balance_loss_clip": 1.03893209, "balance_loss_mlp": 1.03625035, "epoch": 0.7125593699272531, "flos": 17603734584960.0, "grad_norm": 2.212900634252197, "language_loss": 0.79391426, "learning_rate": 8.057291008859069e-07, "loss": 0.81538975, "num_input_tokens_seen": 127387910, "step": 5926, "time_per_iteration": 2.651643753051758 }, { "auxiliary_loss_clip": 0.01118731, "auxiliary_loss_mlp": 0.01034832, "balance_loss_clip": 1.04096007, "balance_loss_mlp": 1.01956165, "epoch": 0.7126796128178922, "flos": 28654057526400.0, "grad_norm": 2.2861221870842536, "language_loss": 0.68359959, "learning_rate": 8.051043468416187e-07, "loss": 0.70513523, "num_input_tokens_seen": 127409160, "step": 5927, "time_per_iteration": 2.705127000808716 }, { "auxiliary_loss_clip": 0.01130275, "auxiliary_loss_mlp": 0.01041284, "balance_loss_clip": 1.04293966, "balance_loss_mlp": 1.02649665, "epoch": 0.7127998557085312, "flos": 16034438315520.0, "grad_norm": 1.9091184909587957, "language_loss": 0.82004976, "learning_rate": 8.044797740620506e-07, "loss": 0.84176528, "num_input_tokens_seen": 127427765, "step": 5928, "time_per_iteration": 2.567279815673828 }, { "auxiliary_loss_clip": 0.01080507, "auxiliary_loss_mlp": 0.01042078, "balance_loss_clip": 1.0402, "balance_loss_mlp": 1.02544808, "epoch": 0.7129200985991703, "flos": 23403271582080.0, "grad_norm": 1.9729310621614837, "language_loss": 0.78887308, "learning_rate": 8.038553826419494e-07, "loss": 0.81009901, "num_input_tokens_seen": 127446475, "step": 5929, "time_per_iteration": 2.775620698928833 }, { "auxiliary_loss_clip": 0.01134622, "auxiliary_loss_mlp": 0.0103793, "balance_loss_clip": 1.043239, "balance_loss_mlp": 1.02236748, "epoch": 0.7130403414898094, "flos": 21397445326080.0, "grad_norm": 1.613389537065344, "language_loss": 0.81104517, "learning_rate": 8.032311726760364e-07, "loss": 0.83277071, "num_input_tokens_seen": 127467695, "step": 5930, "time_per_iteration": 2.5931410789489746 }, { "auxiliary_loss_clip": 0.01092051, "auxiliary_loss_mlp": 0.01047479, "balance_loss_clip": 1.03913307, "balance_loss_mlp": 1.02921593, "epoch": 0.7131605843804485, "flos": 74739045306240.0, "grad_norm": 2.1697089503257203, "language_loss": 0.68917036, "learning_rate": 8.026071442590022e-07, "loss": 0.71056563, "num_input_tokens_seen": 127494590, "step": 5931, "time_per_iteration": 3.0522990226745605 }, { "auxiliary_loss_clip": 0.01121657, "auxiliary_loss_mlp": 0.01037101, "balance_loss_clip": 1.043612, "balance_loss_mlp": 1.02194953, "epoch": 0.7132808272710875, "flos": 18368739469440.0, "grad_norm": 1.9865525992430282, "language_loss": 0.80561459, "learning_rate": 8.019832974855134e-07, "loss": 0.8272022, "num_input_tokens_seen": 127512550, "step": 5932, "time_per_iteration": 2.640590190887451 }, { "auxiliary_loss_clip": 0.01101067, "auxiliary_loss_mlp": 0.01044447, "balance_loss_clip": 1.04189038, "balance_loss_mlp": 1.02805603, "epoch": 0.7134010701617267, "flos": 23253380127360.0, "grad_norm": 2.933837213350749, "language_loss": 0.82326639, "learning_rate": 8.013596324502052e-07, "loss": 0.8447215, "num_input_tokens_seen": 127531015, "step": 5933, "time_per_iteration": 2.8267204761505127 }, { "auxiliary_loss_clip": 0.01114541, "auxiliary_loss_mlp": 0.01036437, "balance_loss_clip": 1.04172444, "balance_loss_mlp": 1.0210228, "epoch": 0.7135213130523658, "flos": 23653137565440.0, "grad_norm": 2.003878775772564, "language_loss": 0.79065263, "learning_rate": 8.007361492476872e-07, "loss": 0.8121624, "num_input_tokens_seen": 127550340, "step": 5934, "time_per_iteration": 3.590761661529541 }, { "auxiliary_loss_clip": 0.01104017, "auxiliary_loss_mlp": 0.0104518, "balance_loss_clip": 1.0410701, "balance_loss_mlp": 1.02918255, "epoch": 0.7136415559430048, "flos": 24790644443520.0, "grad_norm": 1.8797514114249427, "language_loss": 0.79079545, "learning_rate": 8.001128479725426e-07, "loss": 0.81228745, "num_input_tokens_seen": 127572245, "step": 5935, "time_per_iteration": 3.654237985610962 }, { "auxiliary_loss_clip": 0.01079062, "auxiliary_loss_mlp": 0.01042047, "balance_loss_clip": 1.03827047, "balance_loss_mlp": 1.0263474, "epoch": 0.713761798833644, "flos": 18296954138880.0, "grad_norm": 1.6675125906861317, "language_loss": 0.80935347, "learning_rate": 7.994897287193248e-07, "loss": 0.8305645, "num_input_tokens_seen": 127591625, "step": 5936, "time_per_iteration": 2.7111546993255615 }, { "auxiliary_loss_clip": 0.01129737, "auxiliary_loss_mlp": 0.01042097, "balance_loss_clip": 1.04632998, "balance_loss_mlp": 1.02584839, "epoch": 0.713882041724283, "flos": 15558262692480.0, "grad_norm": 2.826732582316014, "language_loss": 0.83416826, "learning_rate": 7.988667915825605e-07, "loss": 0.85588658, "num_input_tokens_seen": 127608690, "step": 5937, "time_per_iteration": 2.6126813888549805 }, { "auxiliary_loss_clip": 0.01111285, "auxiliary_loss_mlp": 0.01046435, "balance_loss_clip": 1.04089022, "balance_loss_mlp": 1.03034186, "epoch": 0.7140022846149221, "flos": 24061011477120.0, "grad_norm": 2.3250340147760937, "language_loss": 0.7568422, "learning_rate": 7.982440366567491e-07, "loss": 0.77841938, "num_input_tokens_seen": 127627180, "step": 5938, "time_per_iteration": 2.6295175552368164 }, { "auxiliary_loss_clip": 0.01114391, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.04005766, "balance_loss_mlp": 1.01932371, "epoch": 0.7141225275055613, "flos": 27891710248320.0, "grad_norm": 1.7162321186081333, "language_loss": 0.75208104, "learning_rate": 7.97621464036361e-07, "loss": 0.77356577, "num_input_tokens_seen": 127648940, "step": 5939, "time_per_iteration": 2.732874870300293 }, { "auxiliary_loss_clip": 0.01122206, "auxiliary_loss_mlp": 0.01040676, "balance_loss_clip": 1.04175901, "balance_loss_mlp": 1.02513099, "epoch": 0.7142427703962003, "flos": 19682603147520.0, "grad_norm": 1.6651458050555712, "language_loss": 0.67544878, "learning_rate": 7.969990738158417e-07, "loss": 0.69707757, "num_input_tokens_seen": 127667350, "step": 5940, "time_per_iteration": 3.5847272872924805 }, { "auxiliary_loss_clip": 0.01130158, "auxiliary_loss_mlp": 0.01046473, "balance_loss_clip": 1.048208, "balance_loss_mlp": 1.0302608, "epoch": 0.7143630132868394, "flos": 21032377447680.0, "grad_norm": 2.028669538211811, "language_loss": 0.85121334, "learning_rate": 7.963768660896062e-07, "loss": 0.87297964, "num_input_tokens_seen": 127685760, "step": 5941, "time_per_iteration": 3.5565524101257324 }, { "auxiliary_loss_clip": 0.01123576, "auxiliary_loss_mlp": 0.01039317, "balance_loss_clip": 1.04296589, "balance_loss_mlp": 1.02323604, "epoch": 0.7144832561774785, "flos": 24129923719680.0, "grad_norm": 4.209928365910366, "language_loss": 0.82749116, "learning_rate": 7.957548409520432e-07, "loss": 0.84912014, "num_input_tokens_seen": 127704985, "step": 5942, "time_per_iteration": 2.7142422199249268 }, { "auxiliary_loss_clip": 0.01097366, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.03951275, "balance_loss_mlp": 1.02516603, "epoch": 0.7146034990681176, "flos": 16325817442560.0, "grad_norm": 2.074266860763255, "language_loss": 0.84392005, "learning_rate": 7.951329984975135e-07, "loss": 0.86529565, "num_input_tokens_seen": 127721925, "step": 5943, "time_per_iteration": 2.72699236869812 }, { "auxiliary_loss_clip": 0.01011087, "auxiliary_loss_mlp": 0.01007041, "balance_loss_clip": 1.00886798, "balance_loss_mlp": 1.00537193, "epoch": 0.7147237419587567, "flos": 69627164232960.0, "grad_norm": 0.7112349539812484, "language_loss": 0.54286104, "learning_rate": 7.94511338820349e-07, "loss": 0.56304234, "num_input_tokens_seen": 127784230, "step": 5944, "time_per_iteration": 3.3454232215881348 }, { "auxiliary_loss_clip": 0.01116216, "auxiliary_loss_mlp": 0.00773347, "balance_loss_clip": 1.04214096, "balance_loss_mlp": 1.0005126, "epoch": 0.7148439848493958, "flos": 22266806198400.0, "grad_norm": 2.5487725231320058, "language_loss": 0.78611475, "learning_rate": 7.938898620148575e-07, "loss": 0.80501038, "num_input_tokens_seen": 127801990, "step": 5945, "time_per_iteration": 2.7286272048950195 }, { "auxiliary_loss_clip": 0.01112866, "auxiliary_loss_mlp": 0.01045579, "balance_loss_clip": 1.04416561, "balance_loss_mlp": 1.02997422, "epoch": 0.7149642277400349, "flos": 17931383470080.0, "grad_norm": 2.302911603930367, "language_loss": 0.71220553, "learning_rate": 7.932685681753135e-07, "loss": 0.73378998, "num_input_tokens_seen": 127819270, "step": 5946, "time_per_iteration": 2.837275743484497 }, { "auxiliary_loss_clip": 0.0113292, "auxiliary_loss_mlp": 0.0103973, "balance_loss_clip": 1.04467571, "balance_loss_mlp": 1.02477479, "epoch": 0.7150844706306739, "flos": 31681937370240.0, "grad_norm": 2.041269793626441, "language_loss": 0.62538123, "learning_rate": 7.92647457395969e-07, "loss": 0.64710772, "num_input_tokens_seen": 127841095, "step": 5947, "time_per_iteration": 2.7479517459869385 }, { "auxiliary_loss_clip": 0.01079819, "auxiliary_loss_mlp": 0.01046956, "balance_loss_clip": 1.03720856, "balance_loss_mlp": 1.027668, "epoch": 0.7152047135213131, "flos": 10926217451520.0, "grad_norm": 2.534089641780932, "language_loss": 0.74356008, "learning_rate": 7.920265297710444e-07, "loss": 0.76482785, "num_input_tokens_seen": 127858485, "step": 5948, "time_per_iteration": 2.895183801651001 }, { "auxiliary_loss_clip": 0.01120725, "auxiliary_loss_mlp": 0.01038931, "balance_loss_clip": 1.04212093, "balance_loss_mlp": 1.02309966, "epoch": 0.7153249564119522, "flos": 20995640812800.0, "grad_norm": 1.9220006128374356, "language_loss": 0.73625624, "learning_rate": 7.914057853947363e-07, "loss": 0.75785279, "num_input_tokens_seen": 127877665, "step": 5949, "time_per_iteration": 2.7482759952545166 }, { "auxiliary_loss_clip": 0.01092399, "auxiliary_loss_mlp": 0.01044647, "balance_loss_clip": 1.0393796, "balance_loss_mlp": 1.02829194, "epoch": 0.7154451993025912, "flos": 24243114453120.0, "grad_norm": 2.225135189853235, "language_loss": 0.62665296, "learning_rate": 7.907852243612089e-07, "loss": 0.64802343, "num_input_tokens_seen": 127898070, "step": 5950, "time_per_iteration": 2.7555155754089355 }, { "auxiliary_loss_clip": 0.01103406, "auxiliary_loss_mlp": 0.01043743, "balance_loss_clip": 1.03810573, "balance_loss_mlp": 1.02841878, "epoch": 0.7155654421932304, "flos": 23330947547520.0, "grad_norm": 1.9894204911559594, "language_loss": 0.72099268, "learning_rate": 7.901648467646009e-07, "loss": 0.74246418, "num_input_tokens_seen": 127917010, "step": 5951, "time_per_iteration": 2.6985130310058594 }, { "auxiliary_loss_clip": 0.01137172, "auxiliary_loss_mlp": 0.01036282, "balance_loss_clip": 1.04441476, "balance_loss_mlp": 1.01960468, "epoch": 0.7156856850838694, "flos": 22711883621760.0, "grad_norm": 1.8836989024543982, "language_loss": 0.72587919, "learning_rate": 7.895446526990244e-07, "loss": 0.74761373, "num_input_tokens_seen": 127937025, "step": 5952, "time_per_iteration": 2.6192498207092285 }, { "auxiliary_loss_clip": 0.01096383, "auxiliary_loss_mlp": 0.0104365, "balance_loss_clip": 1.04047263, "balance_loss_mlp": 1.02674603, "epoch": 0.7158059279745085, "flos": 19865424395520.0, "grad_norm": 1.5468410460182693, "language_loss": 0.75856304, "learning_rate": 7.889246422585609e-07, "loss": 0.77996331, "num_input_tokens_seen": 127956410, "step": 5953, "time_per_iteration": 2.764127492904663 }, { "auxiliary_loss_clip": 0.01134136, "auxiliary_loss_mlp": 0.0104239, "balance_loss_clip": 1.04304385, "balance_loss_mlp": 1.02728593, "epoch": 0.7159261708651476, "flos": 24134772055680.0, "grad_norm": 2.532157781059355, "language_loss": 0.73586565, "learning_rate": 7.883048155372675e-07, "loss": 0.75763094, "num_input_tokens_seen": 127974925, "step": 5954, "time_per_iteration": 2.6424572467803955 }, { "auxiliary_loss_clip": 0.01115757, "auxiliary_loss_mlp": 0.01040428, "balance_loss_clip": 1.04317975, "balance_loss_mlp": 1.02363157, "epoch": 0.7160464137557867, "flos": 16983198201600.0, "grad_norm": 2.8638584472381288, "language_loss": 0.71165097, "learning_rate": 7.876851726291698e-07, "loss": 0.73321283, "num_input_tokens_seen": 127993225, "step": 5955, "time_per_iteration": 2.7045578956604004 }, { "auxiliary_loss_clip": 0.01101365, "auxiliary_loss_mlp": 0.0104375, "balance_loss_clip": 1.04196978, "balance_loss_mlp": 1.02870607, "epoch": 0.7161666566464258, "flos": 25228251838080.0, "grad_norm": 1.9705526556567123, "language_loss": 0.78395683, "learning_rate": 7.870657136282666e-07, "loss": 0.80540794, "num_input_tokens_seen": 128012085, "step": 5956, "time_per_iteration": 2.7581517696380615 }, { "auxiliary_loss_clip": 0.01115448, "auxiliary_loss_mlp": 0.01043388, "balance_loss_clip": 1.04052043, "balance_loss_mlp": 1.02661526, "epoch": 0.7162868995370649, "flos": 26468390851200.0, "grad_norm": 1.6010445515164453, "language_loss": 0.81832647, "learning_rate": 7.86446438628531e-07, "loss": 0.83991486, "num_input_tokens_seen": 128033155, "step": 5957, "time_per_iteration": 2.8139164447784424 }, { "auxiliary_loss_clip": 0.01037025, "auxiliary_loss_mlp": 0.01001098, "balance_loss_clip": 1.00805163, "balance_loss_mlp": 0.99976856, "epoch": 0.716407142427704, "flos": 69998912040960.0, "grad_norm": 0.7602362014510239, "language_loss": 0.56856692, "learning_rate": 7.858273477239059e-07, "loss": 0.58894813, "num_input_tokens_seen": 128101575, "step": 5958, "time_per_iteration": 3.2282755374908447 }, { "auxiliary_loss_clip": 0.01072818, "auxiliary_loss_mlp": 0.01041437, "balance_loss_clip": 1.03648484, "balance_loss_mlp": 1.02533174, "epoch": 0.716527385318343, "flos": 20740459616640.0, "grad_norm": 2.2990671768684026, "language_loss": 0.7131294, "learning_rate": 7.852084410083067e-07, "loss": 0.734272, "num_input_tokens_seen": 128120395, "step": 5959, "time_per_iteration": 3.7126922607421875 }, { "auxiliary_loss_clip": 0.01105101, "auxiliary_loss_mlp": 0.01038198, "balance_loss_clip": 1.04154456, "balance_loss_mlp": 1.02328467, "epoch": 0.7166476282089821, "flos": 25371966153600.0, "grad_norm": 1.7634375207560835, "language_loss": 0.63759142, "learning_rate": 7.84589718575621e-07, "loss": 0.65902442, "num_input_tokens_seen": 128140840, "step": 5960, "time_per_iteration": 3.762613534927368 }, { "auxiliary_loss_clip": 0.01109799, "auxiliary_loss_mlp": 0.01038647, "balance_loss_clip": 1.03620219, "balance_loss_mlp": 1.02232695, "epoch": 0.7167678710996213, "flos": 24133730561280.0, "grad_norm": 3.622615087847022, "language_loss": 0.69057542, "learning_rate": 7.83971180519708e-07, "loss": 0.71205992, "num_input_tokens_seen": 128159695, "step": 5961, "time_per_iteration": 2.6863906383514404 }, { "auxiliary_loss_clip": 0.01136273, "auxiliary_loss_mlp": 0.01035584, "balance_loss_clip": 1.04477894, "balance_loss_mlp": 1.01814401, "epoch": 0.7168881139902603, "flos": 30226586019840.0, "grad_norm": 3.2984813357908473, "language_loss": 0.75504398, "learning_rate": 7.833528269344008e-07, "loss": 0.7767626, "num_input_tokens_seen": 128179600, "step": 5962, "time_per_iteration": 2.6577608585357666 }, { "auxiliary_loss_clip": 0.01096214, "auxiliary_loss_mlp": 0.01043839, "balance_loss_clip": 1.04207098, "balance_loss_mlp": 1.02643418, "epoch": 0.7170083568808994, "flos": 14606414236800.0, "grad_norm": 4.03556364602265, "language_loss": 0.77315545, "learning_rate": 7.827346579135023e-07, "loss": 0.79455602, "num_input_tokens_seen": 128196940, "step": 5963, "time_per_iteration": 2.7054319381713867 }, { "auxiliary_loss_clip": 0.0110674, "auxiliary_loss_mlp": 0.01038891, "balance_loss_clip": 1.03980637, "balance_loss_mlp": 1.02232051, "epoch": 0.7171285997715385, "flos": 23331091201920.0, "grad_norm": 1.9602859744431198, "language_loss": 0.83475101, "learning_rate": 7.821166735507885e-07, "loss": 0.85620731, "num_input_tokens_seen": 128215970, "step": 5964, "time_per_iteration": 2.6551616191864014 }, { "auxiliary_loss_clip": 0.01131129, "auxiliary_loss_mlp": 0.01043529, "balance_loss_clip": 1.04198396, "balance_loss_mlp": 1.0279367, "epoch": 0.7172488426621776, "flos": 16543543731840.0, "grad_norm": 1.7064805442650284, "language_loss": 0.68810773, "learning_rate": 7.81498873940007e-07, "loss": 0.7098543, "num_input_tokens_seen": 128233185, "step": 5965, "time_per_iteration": 2.6314961910247803 }, { "auxiliary_loss_clip": 0.0112855, "auxiliary_loss_mlp": 0.01038124, "balance_loss_clip": 1.04181838, "balance_loss_mlp": 1.02177405, "epoch": 0.7173690855528166, "flos": 26541612725760.0, "grad_norm": 2.5056130129566796, "language_loss": 0.7774874, "learning_rate": 7.808812591748768e-07, "loss": 0.79915416, "num_input_tokens_seen": 128253565, "step": 5966, "time_per_iteration": 3.7220752239227295 }, { "auxiliary_loss_clip": 0.01093826, "auxiliary_loss_mlp": 0.01041257, "balance_loss_clip": 1.03778791, "balance_loss_mlp": 1.02343547, "epoch": 0.7174893284434558, "flos": 22784099915520.0, "grad_norm": 2.345314718988221, "language_loss": 0.65116215, "learning_rate": 7.802638293490915e-07, "loss": 0.67251301, "num_input_tokens_seen": 128273210, "step": 5967, "time_per_iteration": 3.6495139598846436 }, { "auxiliary_loss_clip": 0.01114867, "auxiliary_loss_mlp": 0.0105079, "balance_loss_clip": 1.0431596, "balance_loss_mlp": 1.03433895, "epoch": 0.7176095713340949, "flos": 23293564467840.0, "grad_norm": 2.4174382719677903, "language_loss": 0.76946354, "learning_rate": 7.796465845563123e-07, "loss": 0.79112005, "num_input_tokens_seen": 128292085, "step": 5968, "time_per_iteration": 2.673386812210083 }, { "auxiliary_loss_clip": 0.0110481, "auxiliary_loss_mlp": 0.00773676, "balance_loss_clip": 1.04145837, "balance_loss_mlp": 1.0005753, "epoch": 0.7177298142247339, "flos": 25591631777280.0, "grad_norm": 1.7747864792542614, "language_loss": 0.7963289, "learning_rate": 7.790295248901766e-07, "loss": 0.8151139, "num_input_tokens_seen": 128313215, "step": 5969, "time_per_iteration": 2.695063352584839 }, { "auxiliary_loss_clip": 0.01121994, "auxiliary_loss_mlp": 0.01039956, "balance_loss_clip": 1.04304445, "balance_loss_mlp": 1.02412522, "epoch": 0.7178500571153731, "flos": 31652778504960.0, "grad_norm": 1.906221380414213, "language_loss": 0.62305731, "learning_rate": 7.784126504442902e-07, "loss": 0.6446768, "num_input_tokens_seen": 128336445, "step": 5970, "time_per_iteration": 2.74259090423584 }, { "auxiliary_loss_clip": 0.01089627, "auxiliary_loss_mlp": 0.01045122, "balance_loss_clip": 1.03746545, "balance_loss_mlp": 1.02827764, "epoch": 0.7179703000060121, "flos": 19427242383360.0, "grad_norm": 1.4292609247217423, "language_loss": 0.6770882, "learning_rate": 7.777959613122351e-07, "loss": 0.69843566, "num_input_tokens_seen": 128356270, "step": 5971, "time_per_iteration": 2.711111307144165 }, { "auxiliary_loss_clip": 0.01101879, "auxiliary_loss_mlp": 0.01036939, "balance_loss_clip": 1.04066992, "balance_loss_mlp": 1.02263355, "epoch": 0.7180905428966512, "flos": 28839249072000.0, "grad_norm": 1.7044249773128621, "language_loss": 0.77869499, "learning_rate": 7.771794575875604e-07, "loss": 0.8000831, "num_input_tokens_seen": 128378140, "step": 5972, "time_per_iteration": 2.724778413772583 }, { "auxiliary_loss_clip": 0.01121597, "auxiliary_loss_mlp": 0.01044038, "balance_loss_clip": 1.04517674, "balance_loss_mlp": 1.02603745, "epoch": 0.7182107857872904, "flos": 20047563285120.0, "grad_norm": 9.703650299174855, "language_loss": 0.77796435, "learning_rate": 7.765631393637888e-07, "loss": 0.79962075, "num_input_tokens_seen": 128396335, "step": 5973, "time_per_iteration": 2.650470018386841 }, { "auxiliary_loss_clip": 0.01121285, "auxiliary_loss_mlp": 0.01049541, "balance_loss_clip": 1.04244733, "balance_loss_mlp": 1.03102815, "epoch": 0.7183310286779294, "flos": 22747686503040.0, "grad_norm": 2.992885705345385, "language_loss": 0.48759738, "learning_rate": 7.75947006734417e-07, "loss": 0.5093056, "num_input_tokens_seen": 128414115, "step": 5974, "time_per_iteration": 2.6668384075164795 }, { "auxiliary_loss_clip": 0.01135399, "auxiliary_loss_mlp": 0.01040972, "balance_loss_clip": 1.04393721, "balance_loss_mlp": 1.02403188, "epoch": 0.7184512715685685, "flos": 17158262112000.0, "grad_norm": 8.107633360346265, "language_loss": 0.8285991, "learning_rate": 7.753310597929101e-07, "loss": 0.85036284, "num_input_tokens_seen": 128430755, "step": 5975, "time_per_iteration": 2.5741729736328125 }, { "auxiliary_loss_clip": 0.01037074, "auxiliary_loss_mlp": 0.01001238, "balance_loss_clip": 1.00809956, "balance_loss_mlp": 0.99990922, "epoch": 0.7185715144592076, "flos": 65509611448320.0, "grad_norm": 0.7575034631054359, "language_loss": 0.55121815, "learning_rate": 7.747152986327095e-07, "loss": 0.57160127, "num_input_tokens_seen": 128491300, "step": 5976, "time_per_iteration": 3.100156307220459 }, { "auxiliary_loss_clip": 0.01087514, "auxiliary_loss_mlp": 0.01041644, "balance_loss_clip": 1.04029965, "balance_loss_mlp": 1.02425122, "epoch": 0.7186917573498467, "flos": 16180522928640.0, "grad_norm": 2.148742189817916, "language_loss": 0.68245786, "learning_rate": 7.740997233472228e-07, "loss": 0.70374942, "num_input_tokens_seen": 128508920, "step": 5977, "time_per_iteration": 2.6813673973083496 }, { "auxiliary_loss_clip": 0.01107768, "auxiliary_loss_mlp": 0.01038042, "balance_loss_clip": 1.03991222, "balance_loss_mlp": 1.02312922, "epoch": 0.7188120002404857, "flos": 29242274647680.0, "grad_norm": 2.0237668184018545, "language_loss": 0.70340633, "learning_rate": 7.734843340298329e-07, "loss": 0.72486436, "num_input_tokens_seen": 128528745, "step": 5978, "time_per_iteration": 2.717686414718628 }, { "auxiliary_loss_clip": 0.01117128, "auxiliary_loss_mlp": 0.0103764, "balance_loss_clip": 1.04141283, "balance_loss_mlp": 1.02142727, "epoch": 0.7189322431311249, "flos": 33401161008000.0, "grad_norm": 2.583326226410811, "language_loss": 0.75184071, "learning_rate": 7.72869130773895e-07, "loss": 0.77338833, "num_input_tokens_seen": 128549345, "step": 5979, "time_per_iteration": 2.74357533454895 }, { "auxiliary_loss_clip": 0.01027898, "auxiliary_loss_mlp": 0.01000611, "balance_loss_clip": 1.00803947, "balance_loss_mlp": 0.99927634, "epoch": 0.719052486021764, "flos": 61351263792000.0, "grad_norm": 0.7869861069821863, "language_loss": 0.59396428, "learning_rate": 7.722541136727343e-07, "loss": 0.61424941, "num_input_tokens_seen": 128605360, "step": 5980, "time_per_iteration": 3.0682358741760254 }, { "auxiliary_loss_clip": 0.01122557, "auxiliary_loss_mlp": 0.01042531, "balance_loss_clip": 1.0425626, "balance_loss_mlp": 1.02690268, "epoch": 0.719172728912403, "flos": 15596795007360.0, "grad_norm": 1.8199378850922587, "language_loss": 0.80636883, "learning_rate": 7.716392828196483e-07, "loss": 0.82801974, "num_input_tokens_seen": 128623160, "step": 5981, "time_per_iteration": 2.6223912239074707 }, { "auxiliary_loss_clip": 0.01124051, "auxiliary_loss_mlp": 0.01035265, "balance_loss_clip": 1.04341745, "balance_loss_mlp": 1.01853967, "epoch": 0.7192929718030422, "flos": 15553162961280.0, "grad_norm": 2.5885761090152877, "language_loss": 0.77177155, "learning_rate": 7.710246383079064e-07, "loss": 0.79336464, "num_input_tokens_seen": 128638545, "step": 5982, "time_per_iteration": 2.582204580307007 }, { "auxiliary_loss_clip": 0.01105578, "auxiliary_loss_mlp": 0.0103668, "balance_loss_clip": 1.03700686, "balance_loss_mlp": 1.02094436, "epoch": 0.7194132146936812, "flos": 21862487733120.0, "grad_norm": 3.1364103747951626, "language_loss": 0.92289758, "learning_rate": 7.704101802307492e-07, "loss": 0.94432008, "num_input_tokens_seen": 128650845, "step": 5983, "time_per_iteration": 2.613072633743286 }, { "auxiliary_loss_clip": 0.0108946, "auxiliary_loss_mlp": 0.01036705, "balance_loss_clip": 1.03790057, "balance_loss_mlp": 1.01999223, "epoch": 0.7195334575843203, "flos": 27338900958720.0, "grad_norm": 2.47580092991951, "language_loss": 0.86805999, "learning_rate": 7.697959086813912e-07, "loss": 0.88932168, "num_input_tokens_seen": 128667010, "step": 5984, "time_per_iteration": 2.701942205429077 }, { "auxiliary_loss_clip": 0.01089201, "auxiliary_loss_mlp": 0.01039898, "balance_loss_clip": 1.03732979, "balance_loss_mlp": 1.02368522, "epoch": 0.7196537004749595, "flos": 18770615809920.0, "grad_norm": 1.6586242173415326, "language_loss": 0.80094004, "learning_rate": 7.691818237530145e-07, "loss": 0.82223105, "num_input_tokens_seen": 128685870, "step": 5985, "time_per_iteration": 3.6709301471710205 }, { "auxiliary_loss_clip": 0.01097753, "auxiliary_loss_mlp": 0.01039649, "balance_loss_clip": 1.04112959, "balance_loss_mlp": 1.02397263, "epoch": 0.7197739433655985, "flos": 24531009960960.0, "grad_norm": 1.9327646992133598, "language_loss": 0.77616811, "learning_rate": 7.685679255387774e-07, "loss": 0.7975421, "num_input_tokens_seen": 128704185, "step": 5986, "time_per_iteration": 2.728578805923462 }, { "auxiliary_loss_clip": 0.01110064, "auxiliary_loss_mlp": 0.01035836, "balance_loss_clip": 1.04239321, "balance_loss_mlp": 1.01999354, "epoch": 0.7198941862562376, "flos": 18040587793920.0, "grad_norm": 2.116629633936169, "language_loss": 0.76893049, "learning_rate": 7.679542141318065e-07, "loss": 0.79038954, "num_input_tokens_seen": 128721290, "step": 5987, "time_per_iteration": 3.631544589996338 }, { "auxiliary_loss_clip": 0.01096704, "auxiliary_loss_mlp": 0.01036907, "balance_loss_clip": 1.03835797, "balance_loss_mlp": 1.01976442, "epoch": 0.7200144291468767, "flos": 29022393542400.0, "grad_norm": 1.9229494920042138, "language_loss": 0.75799429, "learning_rate": 7.673406896252013e-07, "loss": 0.77933037, "num_input_tokens_seen": 128742665, "step": 5988, "time_per_iteration": 2.6999599933624268 }, { "auxiliary_loss_clip": 0.01096516, "auxiliary_loss_mlp": 0.01046214, "balance_loss_clip": 1.03918409, "balance_loss_mlp": 1.02827322, "epoch": 0.7201346720375158, "flos": 25374264624000.0, "grad_norm": 1.6048460740517516, "language_loss": 0.78241789, "learning_rate": 7.667273521120347e-07, "loss": 0.80384517, "num_input_tokens_seen": 128762225, "step": 5989, "time_per_iteration": 2.743166923522949 }, { "auxiliary_loss_clip": 0.01101973, "auxiliary_loss_mlp": 0.01041076, "balance_loss_clip": 1.04104555, "balance_loss_mlp": 1.02476788, "epoch": 0.7202549149281549, "flos": 14355614499840.0, "grad_norm": 2.202513184072349, "language_loss": 0.79858053, "learning_rate": 7.661142016853468e-07, "loss": 0.82001102, "num_input_tokens_seen": 128779585, "step": 5990, "time_per_iteration": 2.6784465312957764 }, { "auxiliary_loss_clip": 0.01080727, "auxiliary_loss_mlp": 0.01041119, "balance_loss_clip": 1.03757465, "balance_loss_mlp": 1.02396512, "epoch": 0.7203751578187939, "flos": 23001682550400.0, "grad_norm": 1.7727367366961375, "language_loss": 0.74808919, "learning_rate": 7.655012384381543e-07, "loss": 0.76930761, "num_input_tokens_seen": 128799070, "step": 5991, "time_per_iteration": 2.7029566764831543 }, { "auxiliary_loss_clip": 0.01104518, "auxiliary_loss_mlp": 0.01044197, "balance_loss_clip": 1.04062986, "balance_loss_mlp": 1.02822268, "epoch": 0.7204954007094331, "flos": 23692424065920.0, "grad_norm": 2.3157117468127892, "language_loss": 0.81693459, "learning_rate": 7.648884624634415e-07, "loss": 0.8384217, "num_input_tokens_seen": 128817620, "step": 5992, "time_per_iteration": 3.6585121154785156 }, { "auxiliary_loss_clip": 0.01123663, "auxiliary_loss_mlp": 0.01040587, "balance_loss_clip": 1.04484653, "balance_loss_mlp": 1.02517295, "epoch": 0.7206156436000721, "flos": 16253026531200.0, "grad_norm": 1.8428718069894658, "language_loss": 0.88848221, "learning_rate": 7.642758738541683e-07, "loss": 0.91012478, "num_input_tokens_seen": 128834200, "step": 5993, "time_per_iteration": 3.5194242000579834 }, { "auxiliary_loss_clip": 0.01027525, "auxiliary_loss_mlp": 0.01005427, "balance_loss_clip": 1.00833058, "balance_loss_mlp": 1.00398409, "epoch": 0.7207358864907112, "flos": 54377806504320.0, "grad_norm": 0.7655557103754222, "language_loss": 0.60761368, "learning_rate": 7.636634727032621e-07, "loss": 0.62794316, "num_input_tokens_seen": 128891305, "step": 5994, "time_per_iteration": 3.0561630725860596 }, { "auxiliary_loss_clip": 0.0109977, "auxiliary_loss_mlp": 0.01056968, "balance_loss_clip": 1.03861213, "balance_loss_mlp": 1.03893137, "epoch": 0.7208561293813504, "flos": 19135540033920.0, "grad_norm": 1.881038088523908, "language_loss": 0.78946406, "learning_rate": 7.630512591036231e-07, "loss": 0.81103134, "num_input_tokens_seen": 128910615, "step": 5995, "time_per_iteration": 2.6418557167053223 }, { "auxiliary_loss_clip": 0.01123133, "auxiliary_loss_mlp": 0.01048697, "balance_loss_clip": 1.04268861, "balance_loss_mlp": 1.03328347, "epoch": 0.7209763722719894, "flos": 17748526308480.0, "grad_norm": 2.619690548414498, "language_loss": 0.65306497, "learning_rate": 7.624392331481255e-07, "loss": 0.67478329, "num_input_tokens_seen": 128928270, "step": 5996, "time_per_iteration": 2.5843276977539062 }, { "auxiliary_loss_clip": 0.01027815, "auxiliary_loss_mlp": 0.01005412, "balance_loss_clip": 1.00861371, "balance_loss_mlp": 1.00386262, "epoch": 0.7210966151626285, "flos": 66819488716800.0, "grad_norm": 0.7504505383396788, "language_loss": 0.51760417, "learning_rate": 7.618273949296115e-07, "loss": 0.53793645, "num_input_tokens_seen": 128987780, "step": 5997, "time_per_iteration": 3.0993099212646484 }, { "auxiliary_loss_clip": 0.01103915, "auxiliary_loss_mlp": 0.010371, "balance_loss_clip": 1.03813422, "balance_loss_mlp": 1.02120948, "epoch": 0.7212168580532676, "flos": 21141869080320.0, "grad_norm": 2.332734752315369, "language_loss": 0.68883902, "learning_rate": 7.612157445408987e-07, "loss": 0.71024919, "num_input_tokens_seen": 129005590, "step": 5998, "time_per_iteration": 2.648135185241699 }, { "auxiliary_loss_clip": 0.01113245, "auxiliary_loss_mlp": 0.01035905, "balance_loss_clip": 1.04543185, "balance_loss_mlp": 1.01946557, "epoch": 0.7213371009439067, "flos": 22345738335360.0, "grad_norm": 2.52953907800288, "language_loss": 0.74270326, "learning_rate": 7.606042820747716e-07, "loss": 0.76419473, "num_input_tokens_seen": 129021995, "step": 5999, "time_per_iteration": 2.674375057220459 }, { "auxiliary_loss_clip": 0.011177, "auxiliary_loss_mlp": 0.01039111, "balance_loss_clip": 1.0463388, "balance_loss_mlp": 1.02369726, "epoch": 0.7214573438345457, "flos": 18515901490560.0, "grad_norm": 1.8163515788919848, "language_loss": 0.85514152, "learning_rate": 7.599930076239889e-07, "loss": 0.87670958, "num_input_tokens_seen": 129039280, "step": 6000, "time_per_iteration": 2.650216579437256 }, { "auxiliary_loss_clip": 0.0108977, "auxiliary_loss_mlp": 0.00772923, "balance_loss_clip": 1.04074621, "balance_loss_mlp": 1.0006249, "epoch": 0.7215775867251849, "flos": 35736108606720.0, "grad_norm": 2.1091653786014324, "language_loss": 0.70806503, "learning_rate": 7.593819212812818e-07, "loss": 0.72669196, "num_input_tokens_seen": 129060860, "step": 6001, "time_per_iteration": 2.8286325931549072 }, { "auxiliary_loss_clip": 0.01119228, "auxiliary_loss_mlp": 0.01035905, "balance_loss_clip": 1.04127765, "balance_loss_mlp": 1.02068186, "epoch": 0.721697829615824, "flos": 20372410909440.0, "grad_norm": 1.81817332284148, "language_loss": 0.72041094, "learning_rate": 7.587710231393508e-07, "loss": 0.74196231, "num_input_tokens_seen": 129079215, "step": 6002, "time_per_iteration": 2.575695276260376 }, { "auxiliary_loss_clip": 0.01051914, "auxiliary_loss_mlp": 0.01037292, "balance_loss_clip": 1.034235, "balance_loss_mlp": 1.0201025, "epoch": 0.721818072506463, "flos": 20229809915520.0, "grad_norm": 1.9512553137822926, "language_loss": 0.83525068, "learning_rate": 7.581603132908685e-07, "loss": 0.85614276, "num_input_tokens_seen": 129097185, "step": 6003, "time_per_iteration": 2.832146167755127 }, { "auxiliary_loss_clip": 0.01092558, "auxiliary_loss_mlp": 0.01039966, "balance_loss_clip": 1.03899455, "balance_loss_mlp": 1.02275264, "epoch": 0.7219383153971022, "flos": 18186887888640.0, "grad_norm": 2.6318712663579706, "language_loss": 0.78777355, "learning_rate": 7.575497918284795e-07, "loss": 0.80909884, "num_input_tokens_seen": 129114730, "step": 6004, "time_per_iteration": 2.7611773014068604 }, { "auxiliary_loss_clip": 0.01136567, "auxiliary_loss_mlp": 0.0104048, "balance_loss_clip": 1.04176998, "balance_loss_mlp": 1.02348673, "epoch": 0.7220585582877412, "flos": 17342124854400.0, "grad_norm": 2.2405793362735977, "language_loss": 0.74728751, "learning_rate": 7.569394588447984e-07, "loss": 0.76905799, "num_input_tokens_seen": 129131745, "step": 6005, "time_per_iteration": 2.6278462409973145 }, { "auxiliary_loss_clip": 0.01114877, "auxiliary_loss_mlp": 0.01042151, "balance_loss_clip": 1.04157543, "balance_loss_mlp": 1.02535427, "epoch": 0.7221788011783803, "flos": 16976338704000.0, "grad_norm": 4.838231291564188, "language_loss": 0.78131306, "learning_rate": 7.563293144324146e-07, "loss": 0.80288327, "num_input_tokens_seen": 129147295, "step": 6006, "time_per_iteration": 2.725735664367676 }, { "auxiliary_loss_clip": 0.01132923, "auxiliary_loss_mlp": 0.01037359, "balance_loss_clip": 1.04512978, "balance_loss_mlp": 1.0226965, "epoch": 0.7222990440690195, "flos": 26286359702400.0, "grad_norm": 1.8730938133204251, "language_loss": 0.80235887, "learning_rate": 7.557193586838834e-07, "loss": 0.82406175, "num_input_tokens_seen": 129162660, "step": 6007, "time_per_iteration": 2.788640022277832 }, { "auxiliary_loss_clip": 0.01113306, "auxiliary_loss_mlp": 0.0103953, "balance_loss_clip": 1.04160404, "balance_loss_mlp": 1.02456951, "epoch": 0.7224192869596585, "flos": 17601687509760.0, "grad_norm": 2.0812084082163236, "language_loss": 0.70326352, "learning_rate": 7.551095916917371e-07, "loss": 0.72479188, "num_input_tokens_seen": 129179990, "step": 6008, "time_per_iteration": 2.817657232284546 }, { "auxiliary_loss_clip": 0.01106433, "auxiliary_loss_mlp": 0.01039564, "balance_loss_clip": 1.03989029, "balance_loss_mlp": 1.02218318, "epoch": 0.7225395298502976, "flos": 12932331016320.0, "grad_norm": 2.9699161803585725, "language_loss": 0.66502577, "learning_rate": 7.545000135484758e-07, "loss": 0.68648571, "num_input_tokens_seen": 129197425, "step": 6009, "time_per_iteration": 2.7851719856262207 }, { "auxiliary_loss_clip": 0.01136868, "auxiliary_loss_mlp": 0.0077294, "balance_loss_clip": 1.04455519, "balance_loss_mlp": 1.00063777, "epoch": 0.7226597727409367, "flos": 29643899592960.0, "grad_norm": 2.1914118853346634, "language_loss": 0.62739491, "learning_rate": 7.538906243465714e-07, "loss": 0.64649296, "num_input_tokens_seen": 129217560, "step": 6010, "time_per_iteration": 2.706043004989624 }, { "auxiliary_loss_clip": 0.01136793, "auxiliary_loss_mlp": 0.01041695, "balance_loss_clip": 1.04489565, "balance_loss_mlp": 1.0255779, "epoch": 0.7227800156315758, "flos": 13771635183360.0, "grad_norm": 2.264973673913859, "language_loss": 0.78854364, "learning_rate": 7.5328142417847e-07, "loss": 0.81032854, "num_input_tokens_seen": 129234325, "step": 6011, "time_per_iteration": 3.611806869506836 }, { "auxiliary_loss_clip": 0.01120517, "auxiliary_loss_mlp": 0.01034099, "balance_loss_clip": 1.04142523, "balance_loss_mlp": 1.01833999, "epoch": 0.7229002585222148, "flos": 20301882554880.0, "grad_norm": 3.422053648652037, "language_loss": 0.69315159, "learning_rate": 7.526724131365838e-07, "loss": 0.71469772, "num_input_tokens_seen": 129255280, "step": 6012, "time_per_iteration": 2.6930272579193115 }, { "auxiliary_loss_clip": 0.01109184, "auxiliary_loss_mlp": 0.01043407, "balance_loss_clip": 1.04277396, "balance_loss_mlp": 1.02653909, "epoch": 0.723020501412854, "flos": 16581250033920.0, "grad_norm": 1.8333847822017086, "language_loss": 0.70517749, "learning_rate": 7.520635913133017e-07, "loss": 0.72670341, "num_input_tokens_seen": 129273910, "step": 6013, "time_per_iteration": 3.659914970397949 }, { "auxiliary_loss_clip": 0.01126543, "auxiliary_loss_mlp": 0.01037452, "balance_loss_clip": 1.04199719, "balance_loss_mlp": 1.0201906, "epoch": 0.7231407443034931, "flos": 28548300908160.0, "grad_norm": 2.286052095486364, "language_loss": 0.81955755, "learning_rate": 7.514549588009798e-07, "loss": 0.84119749, "num_input_tokens_seen": 129294785, "step": 6014, "time_per_iteration": 2.7678568363189697 }, { "auxiliary_loss_clip": 0.0111812, "auxiliary_loss_mlp": 0.01049914, "balance_loss_clip": 1.04555464, "balance_loss_mlp": 1.03433335, "epoch": 0.7232609871941321, "flos": 30008536508160.0, "grad_norm": 2.5041865968012553, "language_loss": 0.70752668, "learning_rate": 7.508465156919492e-07, "loss": 0.72920704, "num_input_tokens_seen": 129318295, "step": 6015, "time_per_iteration": 2.8025808334350586 }, { "auxiliary_loss_clip": 0.01110648, "auxiliary_loss_mlp": 0.01036112, "balance_loss_clip": 1.04135704, "balance_loss_mlp": 1.0203526, "epoch": 0.7233812300847713, "flos": 16654005031680.0, "grad_norm": 2.474312356680138, "language_loss": 0.61484802, "learning_rate": 7.502382620785083e-07, "loss": 0.6363157, "num_input_tokens_seen": 129334845, "step": 6016, "time_per_iteration": 2.701833963394165 }, { "auxiliary_loss_clip": 0.01007143, "auxiliary_loss_mlp": 0.01003336, "balance_loss_clip": 1.01200449, "balance_loss_mlp": 1.0018754, "epoch": 0.7235014729754103, "flos": 67258784050560.0, "grad_norm": 0.8057361946797749, "language_loss": 0.62551272, "learning_rate": 7.496301980529289e-07, "loss": 0.64561749, "num_input_tokens_seen": 129398055, "step": 6017, "time_per_iteration": 3.307558059692383 }, { "auxiliary_loss_clip": 0.01135822, "auxiliary_loss_mlp": 0.01039252, "balance_loss_clip": 1.04400659, "balance_loss_mlp": 1.02322388, "epoch": 0.7236217158660494, "flos": 26943237671040.0, "grad_norm": 2.223628649423301, "language_loss": 0.74747741, "learning_rate": 7.490223237074547e-07, "loss": 0.76922816, "num_input_tokens_seen": 129417765, "step": 6018, "time_per_iteration": 4.573638677597046 }, { "auxiliary_loss_clip": 0.01096256, "auxiliary_loss_mlp": 0.01044249, "balance_loss_clip": 1.03898931, "balance_loss_mlp": 1.02696347, "epoch": 0.7237419587566886, "flos": 29423372042880.0, "grad_norm": 2.2227815677071177, "language_loss": 0.66003329, "learning_rate": 7.484146391342989e-07, "loss": 0.68143833, "num_input_tokens_seen": 129437560, "step": 6019, "time_per_iteration": 3.741132974624634 }, { "auxiliary_loss_clip": 0.01104466, "auxiliary_loss_mlp": 0.01034523, "balance_loss_clip": 1.03913617, "balance_loss_mlp": 1.01832283, "epoch": 0.7238622016473276, "flos": 17821496787840.0, "grad_norm": 2.4939409845963034, "language_loss": 0.56570995, "learning_rate": 7.478071444256484e-07, "loss": 0.58709985, "num_input_tokens_seen": 129455320, "step": 6020, "time_per_iteration": 2.705193519592285 }, { "auxiliary_loss_clip": 0.01102653, "auxiliary_loss_mlp": 0.0103752, "balance_loss_clip": 1.03997731, "balance_loss_mlp": 1.02127194, "epoch": 0.7239824445379667, "flos": 25739117020800.0, "grad_norm": 2.351812903625409, "language_loss": 0.79293442, "learning_rate": 7.471998396736579e-07, "loss": 0.81433618, "num_input_tokens_seen": 129475700, "step": 6021, "time_per_iteration": 2.749225378036499 }, { "auxiliary_loss_clip": 0.01099898, "auxiliary_loss_mlp": 0.01034693, "balance_loss_clip": 1.04102182, "balance_loss_mlp": 1.01907086, "epoch": 0.7241026874286057, "flos": 23148916398720.0, "grad_norm": 1.7714140272670573, "language_loss": 0.76144242, "learning_rate": 7.465927249704549e-07, "loss": 0.7827884, "num_input_tokens_seen": 129493585, "step": 6022, "time_per_iteration": 2.7256460189819336 }, { "auxiliary_loss_clip": 0.01120719, "auxiliary_loss_mlp": 0.01037789, "balance_loss_clip": 1.04122913, "balance_loss_mlp": 1.02284038, "epoch": 0.7242229303192449, "flos": 20266905686400.0, "grad_norm": 2.1975785590327277, "language_loss": 0.77620912, "learning_rate": 7.459858004081398e-07, "loss": 0.79779422, "num_input_tokens_seen": 129511555, "step": 6023, "time_per_iteration": 2.678616762161255 }, { "auxiliary_loss_clip": 0.0100624, "auxiliary_loss_mlp": 0.01004779, "balance_loss_clip": 1.00949299, "balance_loss_mlp": 1.00331867, "epoch": 0.724343173209884, "flos": 62311659684480.0, "grad_norm": 1.0062104090165358, "language_loss": 0.57957596, "learning_rate": 7.453790660787815e-07, "loss": 0.59968615, "num_input_tokens_seen": 129579650, "step": 6024, "time_per_iteration": 3.3632349967956543 }, { "auxiliary_loss_clip": 0.01115396, "auxiliary_loss_mlp": 0.0104114, "balance_loss_clip": 1.04322529, "balance_loss_mlp": 1.02418804, "epoch": 0.724463416100523, "flos": 35006403813120.0, "grad_norm": 2.663588580205395, "language_loss": 0.63602501, "learning_rate": 7.447725220744214e-07, "loss": 0.65759039, "num_input_tokens_seen": 129601895, "step": 6025, "time_per_iteration": 2.8338048458099365 }, { "auxiliary_loss_clip": 0.01136265, "auxiliary_loss_mlp": 0.01034691, "balance_loss_clip": 1.04412246, "balance_loss_mlp": 1.01950347, "epoch": 0.7245836589911622, "flos": 21871968923520.0, "grad_norm": 2.008478409984869, "language_loss": 0.77198923, "learning_rate": 7.441661684870717e-07, "loss": 0.79369879, "num_input_tokens_seen": 129622150, "step": 6026, "time_per_iteration": 2.65754771232605 }, { "auxiliary_loss_clip": 0.01132747, "auxiliary_loss_mlp": 0.01038202, "balance_loss_clip": 1.0426538, "balance_loss_mlp": 1.02274001, "epoch": 0.7247039018818012, "flos": 23006494972800.0, "grad_norm": 3.1624751843957153, "language_loss": 0.81600243, "learning_rate": 7.435600054087152e-07, "loss": 0.83771193, "num_input_tokens_seen": 129644315, "step": 6027, "time_per_iteration": 2.6887710094451904 }, { "auxiliary_loss_clip": 0.0113713, "auxiliary_loss_mlp": 0.01044786, "balance_loss_clip": 1.04672706, "balance_loss_mlp": 1.02728605, "epoch": 0.7248241447724403, "flos": 31722588587520.0, "grad_norm": 2.1532898075585942, "language_loss": 0.74380791, "learning_rate": 7.42954032931308e-07, "loss": 0.76562715, "num_input_tokens_seen": 129665355, "step": 6028, "time_per_iteration": 2.705791473388672 }, { "auxiliary_loss_clip": 0.01110456, "auxiliary_loss_mlp": 0.01039587, "balance_loss_clip": 1.04132009, "balance_loss_mlp": 1.02357721, "epoch": 0.7249443876630794, "flos": 34896984007680.0, "grad_norm": 1.804818903288261, "language_loss": 0.7486307, "learning_rate": 7.423482511467733e-07, "loss": 0.77013111, "num_input_tokens_seen": 129686125, "step": 6029, "time_per_iteration": 2.83333158493042 }, { "auxiliary_loss_clip": 0.01063188, "auxiliary_loss_mlp": 0.01034348, "balance_loss_clip": 1.03560305, "balance_loss_mlp": 1.01776624, "epoch": 0.7250646305537185, "flos": 26359294268160.0, "grad_norm": 2.964874124262651, "language_loss": 0.65209186, "learning_rate": 7.417426601470099e-07, "loss": 0.67306727, "num_input_tokens_seen": 129706485, "step": 6030, "time_per_iteration": 2.8628711700439453 }, { "auxiliary_loss_clip": 0.01126497, "auxiliary_loss_mlp": 0.01035712, "balance_loss_clip": 1.04319739, "balance_loss_mlp": 1.01858211, "epoch": 0.7251848734443576, "flos": 30081614728320.0, "grad_norm": 2.4510281374991396, "language_loss": 0.78843433, "learning_rate": 7.411372600238841e-07, "loss": 0.81005645, "num_input_tokens_seen": 129727100, "step": 6031, "time_per_iteration": 2.6980230808258057 }, { "auxiliary_loss_clip": 0.01135104, "auxiliary_loss_mlp": 0.01041673, "balance_loss_clip": 1.04523158, "balance_loss_mlp": 1.02548409, "epoch": 0.7253051163349967, "flos": 17785262943360.0, "grad_norm": 2.722727284444822, "language_loss": 0.74070024, "learning_rate": 7.405320508692346e-07, "loss": 0.76246804, "num_input_tokens_seen": 129745840, "step": 6032, "time_per_iteration": 2.619697332382202 }, { "auxiliary_loss_clip": 0.0113106, "auxiliary_loss_mlp": 0.01041842, "balance_loss_clip": 1.04321682, "balance_loss_mlp": 1.02678585, "epoch": 0.7254253592256358, "flos": 12641346938880.0, "grad_norm": 2.0062373462639416, "language_loss": 0.75485152, "learning_rate": 7.399270327748727e-07, "loss": 0.77658057, "num_input_tokens_seen": 129763500, "step": 6033, "time_per_iteration": 2.609067440032959 }, { "auxiliary_loss_clip": 0.01095427, "auxiliary_loss_mlp": 0.00771944, "balance_loss_clip": 1.03980803, "balance_loss_mlp": 1.00052011, "epoch": 0.7255456021162748, "flos": 27199208966400.0, "grad_norm": 1.99755671922009, "language_loss": 0.74259317, "learning_rate": 7.39322205832577e-07, "loss": 0.76126695, "num_input_tokens_seen": 129784390, "step": 6034, "time_per_iteration": 2.7811408042907715 }, { "auxiliary_loss_clip": 0.01104012, "auxiliary_loss_mlp": 0.01040943, "balance_loss_clip": 1.03901255, "balance_loss_mlp": 1.02485001, "epoch": 0.725665845006914, "flos": 21288205088640.0, "grad_norm": 2.2305731708350653, "language_loss": 0.81006086, "learning_rate": 7.387175701341009e-07, "loss": 0.83151042, "num_input_tokens_seen": 129803060, "step": 6035, "time_per_iteration": 2.636594533920288 }, { "auxiliary_loss_clip": 0.01124764, "auxiliary_loss_mlp": 0.01041228, "balance_loss_clip": 1.04215217, "balance_loss_mlp": 1.02374005, "epoch": 0.7257860878975531, "flos": 16033684129920.0, "grad_norm": 2.3005693686092314, "language_loss": 0.72129095, "learning_rate": 7.381131257711659e-07, "loss": 0.74295086, "num_input_tokens_seen": 129820165, "step": 6036, "time_per_iteration": 2.6304705142974854 }, { "auxiliary_loss_clip": 0.01104872, "auxiliary_loss_mlp": 0.01043834, "balance_loss_clip": 1.04468751, "balance_loss_mlp": 1.02848601, "epoch": 0.7259063307881921, "flos": 12129943052160.0, "grad_norm": 1.871775136784693, "language_loss": 0.83458626, "learning_rate": 7.375088728354677e-07, "loss": 0.85607338, "num_input_tokens_seen": 129835195, "step": 6037, "time_per_iteration": 3.54591965675354 }, { "auxiliary_loss_clip": 0.01099048, "auxiliary_loss_mlp": 0.01037219, "balance_loss_clip": 1.04044843, "balance_loss_mlp": 1.02190042, "epoch": 0.7260265736788313, "flos": 30443845432320.0, "grad_norm": 1.5888769893794839, "language_loss": 0.67481661, "learning_rate": 7.369048114186691e-07, "loss": 0.69617927, "num_input_tokens_seen": 129856240, "step": 6038, "time_per_iteration": 2.7813994884490967 }, { "auxiliary_loss_clip": 0.01101412, "auxiliary_loss_mlp": 0.00772116, "balance_loss_clip": 1.04307365, "balance_loss_mlp": 1.00058115, "epoch": 0.7261468165694703, "flos": 21142264129920.0, "grad_norm": 1.926276266787391, "language_loss": 0.82958895, "learning_rate": 7.363009416124055e-07, "loss": 0.84832424, "num_input_tokens_seen": 129875565, "step": 6039, "time_per_iteration": 3.6573538780212402 }, { "auxiliary_loss_clip": 0.01098512, "auxiliary_loss_mlp": 0.01037094, "balance_loss_clip": 1.03844547, "balance_loss_mlp": 1.02067852, "epoch": 0.7262670594601094, "flos": 22306308180480.0, "grad_norm": 3.2811993873180985, "language_loss": 0.62854725, "learning_rate": 7.356972635082852e-07, "loss": 0.6499033, "num_input_tokens_seen": 129894420, "step": 6040, "time_per_iteration": 2.682737350463867 }, { "auxiliary_loss_clip": 0.01085059, "auxiliary_loss_mlp": 0.01042054, "balance_loss_clip": 1.04158998, "balance_loss_mlp": 1.02467275, "epoch": 0.7263873023507486, "flos": 25335049950720.0, "grad_norm": 2.1427993461597885, "language_loss": 0.75685793, "learning_rate": 7.35093777197884e-07, "loss": 0.77812898, "num_input_tokens_seen": 129914490, "step": 6041, "time_per_iteration": 2.783578634262085 }, { "auxiliary_loss_clip": 0.01105574, "auxiliary_loss_mlp": 0.01042824, "balance_loss_clip": 1.04064214, "balance_loss_mlp": 1.02797019, "epoch": 0.7265075452413876, "flos": 23878621192320.0, "grad_norm": 2.5007900533993888, "language_loss": 0.85451096, "learning_rate": 7.344904827727525e-07, "loss": 0.87599492, "num_input_tokens_seen": 129931670, "step": 6042, "time_per_iteration": 2.716386079788208 }, { "auxiliary_loss_clip": 0.01094238, "auxiliary_loss_mlp": 0.01039924, "balance_loss_clip": 1.03792155, "balance_loss_mlp": 1.02466536, "epoch": 0.7266277881320267, "flos": 28724549967360.0, "grad_norm": 2.2619426670158918, "language_loss": 0.73877442, "learning_rate": 7.338873803244076e-07, "loss": 0.76011604, "num_input_tokens_seen": 129946905, "step": 6043, "time_per_iteration": 3.8084511756896973 }, { "auxiliary_loss_clip": 0.01106987, "auxiliary_loss_mlp": 0.0103851, "balance_loss_clip": 1.04101729, "balance_loss_mlp": 1.02328706, "epoch": 0.7267480310226658, "flos": 24863507182080.0, "grad_norm": 1.928308762524935, "language_loss": 0.80613095, "learning_rate": 7.332844699443401e-07, "loss": 0.82758594, "num_input_tokens_seen": 129965505, "step": 6044, "time_per_iteration": 3.5664191246032715 }, { "auxiliary_loss_clip": 0.01075571, "auxiliary_loss_mlp": 0.01044991, "balance_loss_clip": 1.03556502, "balance_loss_mlp": 1.02993512, "epoch": 0.7268682739133049, "flos": 27198490694400.0, "grad_norm": 1.8116350436644784, "language_loss": 0.75575173, "learning_rate": 7.326817517240121e-07, "loss": 0.77695733, "num_input_tokens_seen": 129987210, "step": 6045, "time_per_iteration": 2.762195587158203 }, { "auxiliary_loss_clip": 0.01123404, "auxiliary_loss_mlp": 0.00771188, "balance_loss_clip": 1.04385829, "balance_loss_mlp": 1.00051594, "epoch": 0.7269885168039439, "flos": 33508138688640.0, "grad_norm": 1.835145703816179, "language_loss": 0.83737051, "learning_rate": 7.320792257548545e-07, "loss": 0.85631645, "num_input_tokens_seen": 130008385, "step": 6046, "time_per_iteration": 2.7145016193389893 }, { "auxiliary_loss_clip": 0.01115274, "auxiliary_loss_mlp": 0.01046374, "balance_loss_clip": 1.04296708, "balance_loss_mlp": 1.02988696, "epoch": 0.7271087596945831, "flos": 24313750548480.0, "grad_norm": 4.793642735936728, "language_loss": 0.76255268, "learning_rate": 7.314768921282704e-07, "loss": 0.7841692, "num_input_tokens_seen": 130029040, "step": 6047, "time_per_iteration": 2.70682430267334 }, { "auxiliary_loss_clip": 0.01123445, "auxiliary_loss_mlp": 0.01039475, "balance_loss_clip": 1.04365766, "balance_loss_mlp": 1.02487159, "epoch": 0.7272290025852222, "flos": 23805147922560.0, "grad_norm": 4.263479148377751, "language_loss": 0.72133839, "learning_rate": 7.30874750935633e-07, "loss": 0.74296761, "num_input_tokens_seen": 130048725, "step": 6048, "time_per_iteration": 2.640605926513672 }, { "auxiliary_loss_clip": 0.01093429, "auxiliary_loss_mlp": 0.01039721, "balance_loss_clip": 1.03891373, "balance_loss_mlp": 1.02461696, "epoch": 0.7273492454758612, "flos": 16720367408640.0, "grad_norm": 1.9819736876373408, "language_loss": 0.79308558, "learning_rate": 7.30272802268286e-07, "loss": 0.814417, "num_input_tokens_seen": 130065720, "step": 6049, "time_per_iteration": 2.6779677867889404 }, { "auxiliary_loss_clip": 0.01045444, "auxiliary_loss_mlp": 0.01033629, "balance_loss_clip": 1.03279018, "balance_loss_mlp": 1.01921654, "epoch": 0.7274694883665004, "flos": 28031330413440.0, "grad_norm": 1.7154432613425308, "language_loss": 0.76450467, "learning_rate": 7.29671046217547e-07, "loss": 0.78529543, "num_input_tokens_seen": 130084830, "step": 6050, "time_per_iteration": 2.7948927879333496 }, { "auxiliary_loss_clip": 0.0109733, "auxiliary_loss_mlp": 0.01040389, "balance_loss_clip": 1.04082227, "balance_loss_mlp": 1.02466536, "epoch": 0.7275897312571394, "flos": 30372706546560.0, "grad_norm": 1.8388247750941171, "language_loss": 0.81782866, "learning_rate": 7.290694828746988e-07, "loss": 0.83920586, "num_input_tokens_seen": 130104495, "step": 6051, "time_per_iteration": 2.7534148693084717 }, { "auxiliary_loss_clip": 0.0110028, "auxiliary_loss_mlp": 0.01036588, "balance_loss_clip": 1.04133606, "balance_loss_mlp": 1.02126384, "epoch": 0.7277099741477785, "flos": 19204775498880.0, "grad_norm": 1.9359452694494632, "language_loss": 0.8601414, "learning_rate": 7.284681123310004e-07, "loss": 0.88151008, "num_input_tokens_seen": 130123210, "step": 6052, "time_per_iteration": 2.7025787830352783 }, { "auxiliary_loss_clip": 0.01120809, "auxiliary_loss_mlp": 0.0103602, "balance_loss_clip": 1.04125297, "balance_loss_mlp": 1.02104712, "epoch": 0.7278302170384175, "flos": 20667884186880.0, "grad_norm": 2.077886014717967, "language_loss": 0.79817224, "learning_rate": 7.27866934677678e-07, "loss": 0.81974053, "num_input_tokens_seen": 130142880, "step": 6053, "time_per_iteration": 2.6259021759033203 }, { "auxiliary_loss_clip": 0.01077814, "auxiliary_loss_mlp": 0.01041684, "balance_loss_clip": 1.03688347, "balance_loss_mlp": 1.02572215, "epoch": 0.7279504599290567, "flos": 19093200877440.0, "grad_norm": 1.6513188987863021, "language_loss": 0.78358412, "learning_rate": 7.272659500059297e-07, "loss": 0.80477917, "num_input_tokens_seen": 130160220, "step": 6054, "time_per_iteration": 2.736637592315674 }, { "auxiliary_loss_clip": 0.01115154, "auxiliary_loss_mlp": 0.0105168, "balance_loss_clip": 1.0400691, "balance_loss_mlp": 1.03359592, "epoch": 0.7280707028196958, "flos": 19062174504960.0, "grad_norm": 3.1161203357572145, "language_loss": 0.80339336, "learning_rate": 7.266651584069264e-07, "loss": 0.82506168, "num_input_tokens_seen": 130177885, "step": 6055, "time_per_iteration": 2.604382038116455 }, { "auxiliary_loss_clip": 0.01125334, "auxiliary_loss_mlp": 0.01045021, "balance_loss_clip": 1.04500008, "balance_loss_mlp": 1.02883863, "epoch": 0.7281909457103348, "flos": 37196308293120.0, "grad_norm": 2.0178814786319887, "language_loss": 0.57529122, "learning_rate": 7.260645599718045e-07, "loss": 0.59699482, "num_input_tokens_seen": 130204240, "step": 6056, "time_per_iteration": 2.741326332092285 }, { "auxiliary_loss_clip": 0.01110421, "auxiliary_loss_mlp": 0.01042806, "balance_loss_clip": 1.04086256, "balance_loss_mlp": 1.02577102, "epoch": 0.728311188600974, "flos": 20667094087680.0, "grad_norm": 2.339063641032498, "language_loss": 0.66907084, "learning_rate": 7.254641547916767e-07, "loss": 0.69060314, "num_input_tokens_seen": 130221735, "step": 6057, "time_per_iteration": 2.6616930961608887 }, { "auxiliary_loss_clip": 0.0113558, "auxiliary_loss_mlp": 0.01036247, "balance_loss_clip": 1.04449701, "balance_loss_mlp": 1.0193249, "epoch": 0.728431431491613, "flos": 28840685616000.0, "grad_norm": 2.105187987248484, "language_loss": 0.69107562, "learning_rate": 7.248639429576226e-07, "loss": 0.71279389, "num_input_tokens_seen": 130241190, "step": 6058, "time_per_iteration": 2.6330034732818604 }, { "auxiliary_loss_clip": 0.01122725, "auxiliary_loss_mlp": 0.01033893, "balance_loss_clip": 1.04233563, "balance_loss_mlp": 1.01925373, "epoch": 0.7285516743822521, "flos": 25991856092160.0, "grad_norm": 1.9209980493161405, "language_loss": 0.72028494, "learning_rate": 7.242639245606959e-07, "loss": 0.74185109, "num_input_tokens_seen": 130260980, "step": 6059, "time_per_iteration": 2.6673800945281982 }, { "auxiliary_loss_clip": 0.01114843, "auxiliary_loss_mlp": 0.01049896, "balance_loss_clip": 1.04176331, "balance_loss_mlp": 1.03424335, "epoch": 0.7286719172728913, "flos": 16399721675520.0, "grad_norm": 1.659391464708524, "language_loss": 0.82365239, "learning_rate": 7.236640996919168e-07, "loss": 0.84529978, "num_input_tokens_seen": 130280025, "step": 6060, "time_per_iteration": 2.619199752807617 }, { "auxiliary_loss_clip": 0.01124626, "auxiliary_loss_mlp": 0.01036151, "balance_loss_clip": 1.04462457, "balance_loss_mlp": 1.0203197, "epoch": 0.7287921601635303, "flos": 22018161277440.0, "grad_norm": 1.54796912697955, "language_loss": 0.70257592, "learning_rate": 7.230644684422782e-07, "loss": 0.72418368, "num_input_tokens_seen": 130300255, "step": 6061, "time_per_iteration": 2.6290149688720703 }, { "auxiliary_loss_clip": 0.01097324, "auxiliary_loss_mlp": 0.01039003, "balance_loss_clip": 1.03899002, "balance_loss_mlp": 1.02302933, "epoch": 0.7289124030541694, "flos": 24600927784320.0, "grad_norm": 2.1181679532724593, "language_loss": 0.81380248, "learning_rate": 7.224650309027451e-07, "loss": 0.8351658, "num_input_tokens_seen": 130320005, "step": 6062, "time_per_iteration": 2.7193734645843506 }, { "auxiliary_loss_clip": 0.01123976, "auxiliary_loss_mlp": 0.010402, "balance_loss_clip": 1.04389954, "balance_loss_mlp": 1.02517939, "epoch": 0.7290326459448085, "flos": 21393638484480.0, "grad_norm": 1.6440818833802722, "language_loss": 0.68556118, "learning_rate": 7.218657871642506e-07, "loss": 0.70720291, "num_input_tokens_seen": 130338810, "step": 6063, "time_per_iteration": 3.530860662460327 }, { "auxiliary_loss_clip": 0.01136479, "auxiliary_loss_mlp": 0.01041157, "balance_loss_clip": 1.04417253, "balance_loss_mlp": 1.02477741, "epoch": 0.7291528888354476, "flos": 18587686821120.0, "grad_norm": 2.446625535577481, "language_loss": 0.62175852, "learning_rate": 7.212667373177012e-07, "loss": 0.6435349, "num_input_tokens_seen": 130353805, "step": 6064, "time_per_iteration": 2.5622124671936035 }, { "auxiliary_loss_clip": 0.01096274, "auxiliary_loss_mlp": 0.0104105, "balance_loss_clip": 1.04098797, "balance_loss_mlp": 1.0246706, "epoch": 0.7292731317260867, "flos": 18951066760320.0, "grad_norm": 6.075951352957759, "language_loss": 0.75317168, "learning_rate": 7.206678814539704e-07, "loss": 0.77454495, "num_input_tokens_seen": 130372105, "step": 6065, "time_per_iteration": 3.5845820903778076 }, { "auxiliary_loss_clip": 0.01094608, "auxiliary_loss_mlp": 0.01035214, "balance_loss_clip": 1.04176354, "balance_loss_mlp": 1.0207479, "epoch": 0.7293933746167258, "flos": 21067569797760.0, "grad_norm": 1.6477961384170983, "language_loss": 0.72857356, "learning_rate": 7.20069219663904e-07, "loss": 0.74987173, "num_input_tokens_seen": 130391990, "step": 6066, "time_per_iteration": 2.6875240802764893 }, { "auxiliary_loss_clip": 0.01123787, "auxiliary_loss_mlp": 0.01040805, "balance_loss_clip": 1.04125798, "balance_loss_mlp": 1.02496195, "epoch": 0.7295136175073649, "flos": 22453326547200.0, "grad_norm": 1.8640658804763108, "language_loss": 0.79621887, "learning_rate": 7.1947075203832e-07, "loss": 0.81786478, "num_input_tokens_seen": 130411970, "step": 6067, "time_per_iteration": 2.6082587242126465 }, { "auxiliary_loss_clip": 0.01036714, "auxiliary_loss_mlp": 0.01000739, "balance_loss_clip": 1.00780082, "balance_loss_mlp": 0.99938601, "epoch": 0.7296338603980039, "flos": 56125506648960.0, "grad_norm": 0.8582632244572522, "language_loss": 0.60179818, "learning_rate": 7.188724786680049e-07, "loss": 0.62217271, "num_input_tokens_seen": 130472440, "step": 6068, "time_per_iteration": 3.1599347591400146 }, { "auxiliary_loss_clip": 0.01109545, "auxiliary_loss_mlp": 0.01040767, "balance_loss_clip": 1.04130769, "balance_loss_mlp": 1.02505505, "epoch": 0.7297541032886431, "flos": 25228287751680.0, "grad_norm": 1.673841577927526, "language_loss": 0.758178, "learning_rate": 7.182743996437162e-07, "loss": 0.77968109, "num_input_tokens_seen": 130491975, "step": 6069, "time_per_iteration": 3.6469428539276123 }, { "auxiliary_loss_clip": 0.01105461, "auxiliary_loss_mlp": 0.01040218, "balance_loss_clip": 1.04018402, "balance_loss_mlp": 1.02387428, "epoch": 0.7298743461792822, "flos": 26467600752000.0, "grad_norm": 1.8807004372632374, "language_loss": 0.6877414, "learning_rate": 7.176765150561819e-07, "loss": 0.70919818, "num_input_tokens_seen": 130510580, "step": 6070, "time_per_iteration": 3.5569818019866943 }, { "auxiliary_loss_clip": 0.01134416, "auxiliary_loss_mlp": 0.01037835, "balance_loss_clip": 1.04143298, "balance_loss_mlp": 1.02159834, "epoch": 0.7299945890699212, "flos": 19569053278080.0, "grad_norm": 1.9628179147433655, "language_loss": 0.80094993, "learning_rate": 7.170788249961002e-07, "loss": 0.82267249, "num_input_tokens_seen": 130529090, "step": 6071, "time_per_iteration": 2.569624185562134 }, { "auxiliary_loss_clip": 0.01131663, "auxiliary_loss_mlp": 0.01039349, "balance_loss_clip": 1.04212832, "balance_loss_mlp": 1.02358329, "epoch": 0.7301148319605604, "flos": 22928963466240.0, "grad_norm": 1.8000119750087176, "language_loss": 0.8807835, "learning_rate": 7.164813295541418e-07, "loss": 0.9024936, "num_input_tokens_seen": 130548655, "step": 6072, "time_per_iteration": 2.5857880115509033 }, { "auxiliary_loss_clip": 0.01112781, "auxiliary_loss_mlp": 0.0103966, "balance_loss_clip": 1.04111946, "balance_loss_mlp": 1.02450895, "epoch": 0.7302350748511994, "flos": 25369703596800.0, "grad_norm": 1.7924679504754166, "language_loss": 0.70124626, "learning_rate": 7.15884028820944e-07, "loss": 0.72277069, "num_input_tokens_seen": 130567710, "step": 6073, "time_per_iteration": 2.6759634017944336 }, { "auxiliary_loss_clip": 0.01092163, "auxiliary_loss_mlp": 0.01042594, "balance_loss_clip": 1.03694761, "balance_loss_mlp": 1.02613091, "epoch": 0.7303553177418385, "flos": 27819170732160.0, "grad_norm": 3.917241175288006, "language_loss": 0.60163635, "learning_rate": 7.152869228871185e-07, "loss": 0.62298393, "num_input_tokens_seen": 130590195, "step": 6074, "time_per_iteration": 2.7092020511627197 }, { "auxiliary_loss_clip": 0.01104441, "auxiliary_loss_mlp": 0.0104796, "balance_loss_clip": 1.04076838, "balance_loss_mlp": 1.03080595, "epoch": 0.7304755606324776, "flos": 24426510318720.0, "grad_norm": 3.3125824045996617, "language_loss": 0.72393465, "learning_rate": 7.146900118432457e-07, "loss": 0.74545866, "num_input_tokens_seen": 130609940, "step": 6075, "time_per_iteration": 2.6394269466400146 }, { "auxiliary_loss_clip": 0.01056179, "auxiliary_loss_mlp": 0.01040201, "balance_loss_clip": 1.03236568, "balance_loss_mlp": 1.02364302, "epoch": 0.7305958035231167, "flos": 23840483927040.0, "grad_norm": 2.110580456743968, "language_loss": 0.85921055, "learning_rate": 7.140932957798753e-07, "loss": 0.88017434, "num_input_tokens_seen": 130628380, "step": 6076, "time_per_iteration": 2.856867551803589 }, { "auxiliary_loss_clip": 0.01112377, "auxiliary_loss_mlp": 0.01042788, "balance_loss_clip": 1.04055691, "balance_loss_mlp": 1.02638483, "epoch": 0.7307160464137558, "flos": 16726939597440.0, "grad_norm": 19.37565673942194, "language_loss": 0.71503627, "learning_rate": 7.134967747875309e-07, "loss": 0.73658794, "num_input_tokens_seen": 130646590, "step": 6077, "time_per_iteration": 2.864867925643921 }, { "auxiliary_loss_clip": 0.01118159, "auxiliary_loss_mlp": 0.01042641, "balance_loss_clip": 1.04182243, "balance_loss_mlp": 1.02657151, "epoch": 0.7308362893043949, "flos": 21798280172160.0, "grad_norm": 1.9180323605517997, "language_loss": 0.82031107, "learning_rate": 7.129004489567014e-07, "loss": 0.84191906, "num_input_tokens_seen": 130664070, "step": 6078, "time_per_iteration": 2.610475540161133 }, { "auxiliary_loss_clip": 0.01101964, "auxiliary_loss_mlp": 0.01039838, "balance_loss_clip": 1.04072857, "balance_loss_mlp": 1.02472234, "epoch": 0.730956532195034, "flos": 10707377840640.0, "grad_norm": 2.5596467450478735, "language_loss": 0.77851391, "learning_rate": 7.123043183778512e-07, "loss": 0.79993188, "num_input_tokens_seen": 130681400, "step": 6079, "time_per_iteration": 2.6656124591827393 }, { "auxiliary_loss_clip": 0.01103778, "auxiliary_loss_mlp": 0.01038045, "balance_loss_clip": 1.04166222, "balance_loss_mlp": 1.02242804, "epoch": 0.731076775085673, "flos": 19791987039360.0, "grad_norm": 1.6489691519695218, "language_loss": 0.64984083, "learning_rate": 7.117083831414114e-07, "loss": 0.67125911, "num_input_tokens_seen": 130700675, "step": 6080, "time_per_iteration": 2.6453664302825928 }, { "auxiliary_loss_clip": 0.01132158, "auxiliary_loss_mlp": 0.01038684, "balance_loss_clip": 1.04384971, "balance_loss_mlp": 1.02331805, "epoch": 0.7311970179763122, "flos": 20447033414400.0, "grad_norm": 2.795958152949459, "language_loss": 0.69652402, "learning_rate": 7.11112643337787e-07, "loss": 0.71823251, "num_input_tokens_seen": 130719720, "step": 6081, "time_per_iteration": 2.5590436458587646 }, { "auxiliary_loss_clip": 0.01110506, "auxiliary_loss_mlp": 0.01041666, "balance_loss_clip": 1.04393744, "balance_loss_mlp": 1.0255487, "epoch": 0.7313172608669513, "flos": 18513818501760.0, "grad_norm": 4.939169304542218, "language_loss": 0.7674945, "learning_rate": 7.10517099057349e-07, "loss": 0.78901625, "num_input_tokens_seen": 130736670, "step": 6082, "time_per_iteration": 2.589386224746704 }, { "auxiliary_loss_clip": 0.01110751, "auxiliary_loss_mlp": 0.0104288, "balance_loss_clip": 1.0423162, "balance_loss_mlp": 1.02502263, "epoch": 0.7314375037575903, "flos": 16180738410240.0, "grad_norm": 2.163624772739609, "language_loss": 0.61323535, "learning_rate": 7.099217503904411e-07, "loss": 0.6347717, "num_input_tokens_seen": 130754525, "step": 6083, "time_per_iteration": 2.5994741916656494 }, { "auxiliary_loss_clip": 0.01112141, "auxiliary_loss_mlp": 0.01040449, "balance_loss_clip": 1.04199231, "balance_loss_mlp": 1.02553618, "epoch": 0.7315577466482295, "flos": 17967940536960.0, "grad_norm": 2.268269306942429, "language_loss": 0.89985454, "learning_rate": 7.093265974273788e-07, "loss": 0.9213804, "num_input_tokens_seen": 130772420, "step": 6084, "time_per_iteration": 2.6163339614868164 }, { "auxiliary_loss_clip": 0.0112061, "auxiliary_loss_mlp": 0.0103612, "balance_loss_clip": 1.04024458, "balance_loss_mlp": 1.02258396, "epoch": 0.7316779895388685, "flos": 18405440190720.0, "grad_norm": 2.455126421019777, "language_loss": 0.71728158, "learning_rate": 7.087316402584447e-07, "loss": 0.73884892, "num_input_tokens_seen": 130791245, "step": 6085, "time_per_iteration": 2.6130318641662598 }, { "auxiliary_loss_clip": 0.01133388, "auxiliary_loss_mlp": 0.01043926, "balance_loss_clip": 1.04287553, "balance_loss_mlp": 1.02808309, "epoch": 0.7317982324295076, "flos": 17928294900480.0, "grad_norm": 1.9965037520304607, "language_loss": 0.86307907, "learning_rate": 7.081368789738953e-07, "loss": 0.88485223, "num_input_tokens_seen": 130808445, "step": 6086, "time_per_iteration": 2.5208868980407715 }, { "auxiliary_loss_clip": 0.01107091, "auxiliary_loss_mlp": 0.01038785, "balance_loss_clip": 1.04059708, "balance_loss_mlp": 1.02254844, "epoch": 0.7319184753201466, "flos": 27229840289280.0, "grad_norm": 1.8478440411100023, "language_loss": 0.77641439, "learning_rate": 7.075423136639537e-07, "loss": 0.79787314, "num_input_tokens_seen": 130827700, "step": 6087, "time_per_iteration": 2.708004951477051 }, { "auxiliary_loss_clip": 0.01094082, "auxiliary_loss_mlp": 0.01046514, "balance_loss_clip": 1.04096723, "balance_loss_mlp": 1.03026581, "epoch": 0.7320387182107858, "flos": 37448544574080.0, "grad_norm": 6.627613047645023, "language_loss": 0.74588442, "learning_rate": 7.069479444188149e-07, "loss": 0.76729035, "num_input_tokens_seen": 130848290, "step": 6088, "time_per_iteration": 2.7785439491271973 }, { "auxiliary_loss_clip": 0.01103897, "auxiliary_loss_mlp": 0.01039251, "balance_loss_clip": 1.04126036, "balance_loss_mlp": 1.02339649, "epoch": 0.7321589611014249, "flos": 17859023521920.0, "grad_norm": 1.7978956924714304, "language_loss": 0.82047164, "learning_rate": 7.063537713286453e-07, "loss": 0.84190315, "num_input_tokens_seen": 130865970, "step": 6089, "time_per_iteration": 3.5198636054992676 }, { "auxiliary_loss_clip": 0.01113451, "auxiliary_loss_mlp": 0.0104049, "balance_loss_clip": 1.04077244, "balance_loss_mlp": 1.02431369, "epoch": 0.7322792039920639, "flos": 26100593539200.0, "grad_norm": 2.426724364148851, "language_loss": 0.80925125, "learning_rate": 7.057597944835803e-07, "loss": 0.8307907, "num_input_tokens_seen": 130885245, "step": 6090, "time_per_iteration": 2.6506855487823486 }, { "auxiliary_loss_clip": 0.01098487, "auxiliary_loss_mlp": 0.01036691, "balance_loss_clip": 1.03846002, "balance_loss_mlp": 1.0221653, "epoch": 0.7323994468827031, "flos": 25369093065600.0, "grad_norm": 2.7063583314081447, "language_loss": 0.74531609, "learning_rate": 7.051660139737253e-07, "loss": 0.76666796, "num_input_tokens_seen": 130903465, "step": 6091, "time_per_iteration": 2.701866865158081 }, { "auxiliary_loss_clip": 0.01118819, "auxiliary_loss_mlp": 0.00772677, "balance_loss_clip": 1.04172719, "balance_loss_mlp": 1.00051856, "epoch": 0.7325196897733421, "flos": 26907075653760.0, "grad_norm": 3.0623323718288495, "language_loss": 0.76859987, "learning_rate": 7.045724298891565e-07, "loss": 0.78751493, "num_input_tokens_seen": 130922935, "step": 6092, "time_per_iteration": 3.61104679107666 }, { "auxiliary_loss_clip": 0.01120332, "auxiliary_loss_mlp": 0.01039808, "balance_loss_clip": 1.04188538, "balance_loss_mlp": 1.022928, "epoch": 0.7326399326639812, "flos": 25775781828480.0, "grad_norm": 2.3347784544270884, "language_loss": 0.69336551, "learning_rate": 7.039790423199192e-07, "loss": 0.71496689, "num_input_tokens_seen": 130942575, "step": 6093, "time_per_iteration": 2.6071202754974365 }, { "auxiliary_loss_clip": 0.01111501, "auxiliary_loss_mlp": 0.01040535, "balance_loss_clip": 1.04038954, "balance_loss_mlp": 1.02460861, "epoch": 0.7327601755546204, "flos": 21032269706880.0, "grad_norm": 3.0167006921079174, "language_loss": 0.78418982, "learning_rate": 7.033858513560322e-07, "loss": 0.8057102, "num_input_tokens_seen": 130958870, "step": 6094, "time_per_iteration": 2.6733672618865967 }, { "auxiliary_loss_clip": 0.01119329, "auxiliary_loss_mlp": 0.01037533, "balance_loss_clip": 1.04194427, "balance_loss_mlp": 1.02249503, "epoch": 0.7328804184452594, "flos": 16289224462080.0, "grad_norm": 2.8920607335916895, "language_loss": 0.76350999, "learning_rate": 7.027928570874794e-07, "loss": 0.78507864, "num_input_tokens_seen": 130977060, "step": 6095, "time_per_iteration": 3.6015126705169678 }, { "auxiliary_loss_clip": 0.01134514, "auxiliary_loss_mlp": 0.01042469, "balance_loss_clip": 1.04440045, "balance_loss_mlp": 1.02713275, "epoch": 0.7330006613358985, "flos": 17858233422720.0, "grad_norm": 1.9855905134016973, "language_loss": 0.85636616, "learning_rate": 7.022000596042194e-07, "loss": 0.87813598, "num_input_tokens_seen": 130994160, "step": 6096, "time_per_iteration": 3.5605063438415527 }, { "auxiliary_loss_clip": 0.01097334, "auxiliary_loss_mlp": 0.01037877, "balance_loss_clip": 1.03692222, "balance_loss_mlp": 1.02286839, "epoch": 0.7331209042265376, "flos": 22492074343680.0, "grad_norm": 2.1964546684935984, "language_loss": 0.81715667, "learning_rate": 7.016074589961784e-07, "loss": 0.83850878, "num_input_tokens_seen": 131012725, "step": 6097, "time_per_iteration": 2.6841623783111572 }, { "auxiliary_loss_clip": 0.01104466, "auxiliary_loss_mlp": 0.01040646, "balance_loss_clip": 1.04222012, "balance_loss_mlp": 1.02527988, "epoch": 0.7332411471171767, "flos": 33072757937280.0, "grad_norm": 1.963094080216469, "language_loss": 0.67222434, "learning_rate": 7.01015055353253e-07, "loss": 0.69367546, "num_input_tokens_seen": 131035150, "step": 6098, "time_per_iteration": 2.7172117233276367 }, { "auxiliary_loss_clip": 0.01076431, "auxiliary_loss_mlp": 0.01043042, "balance_loss_clip": 1.03867531, "balance_loss_mlp": 1.02690101, "epoch": 0.7333613900078157, "flos": 22743017735040.0, "grad_norm": 1.931223073542241, "language_loss": 0.7830838, "learning_rate": 7.004228487653123e-07, "loss": 0.80427855, "num_input_tokens_seen": 131055955, "step": 6099, "time_per_iteration": 2.757689952850342 }, { "auxiliary_loss_clip": 0.0109358, "auxiliary_loss_mlp": 0.01040119, "balance_loss_clip": 1.03806686, "balance_loss_mlp": 1.0238353, "epoch": 0.7334816328984549, "flos": 22346133384960.0, "grad_norm": 1.9200474565830767, "language_loss": 0.78355843, "learning_rate": 6.998308393221906e-07, "loss": 0.8048954, "num_input_tokens_seen": 131074360, "step": 6100, "time_per_iteration": 2.6617183685302734 }, { "auxiliary_loss_clip": 0.0110054, "auxiliary_loss_mlp": 0.01037631, "balance_loss_clip": 1.04110968, "balance_loss_mlp": 1.02212811, "epoch": 0.733601875789094, "flos": 20736149984640.0, "grad_norm": 2.161505654834355, "language_loss": 0.71053129, "learning_rate": 6.992390271136977e-07, "loss": 0.73191303, "num_input_tokens_seen": 131090070, "step": 6101, "time_per_iteration": 2.654439926147461 }, { "auxiliary_loss_clip": 0.01113583, "auxiliary_loss_mlp": 0.01044654, "balance_loss_clip": 1.0395205, "balance_loss_mlp": 1.02594995, "epoch": 0.733722118679733, "flos": 22564362464640.0, "grad_norm": 3.6152031745951305, "language_loss": 0.85682827, "learning_rate": 6.986474122296094e-07, "loss": 0.87841064, "num_input_tokens_seen": 131109185, "step": 6102, "time_per_iteration": 2.64123797416687 }, { "auxiliary_loss_clip": 0.01141222, "auxiliary_loss_mlp": 0.0104455, "balance_loss_clip": 1.04739761, "balance_loss_mlp": 1.02778959, "epoch": 0.7338423615703722, "flos": 20084192179200.0, "grad_norm": 2.1783812599865806, "language_loss": 0.72291261, "learning_rate": 6.980559947596751e-07, "loss": 0.74477035, "num_input_tokens_seen": 131127725, "step": 6103, "time_per_iteration": 2.614234685897827 }, { "auxiliary_loss_clip": 0.01084346, "auxiliary_loss_mlp": 0.01035246, "balance_loss_clip": 1.0381763, "balance_loss_mlp": 1.0194751, "epoch": 0.7339626044610112, "flos": 21687675217920.0, "grad_norm": 3.7505874231945966, "language_loss": 0.76184654, "learning_rate": 6.974647747936109e-07, "loss": 0.78304243, "num_input_tokens_seen": 131146110, "step": 6104, "time_per_iteration": 2.810796022415161 }, { "auxiliary_loss_clip": 0.01134767, "auxiliary_loss_mlp": 0.00772618, "balance_loss_clip": 1.04482651, "balance_loss_mlp": 1.00059199, "epoch": 0.7340828473516503, "flos": 15268248282240.0, "grad_norm": 2.7071793366307055, "language_loss": 0.8229515, "learning_rate": 6.968737524211039e-07, "loss": 0.8420254, "num_input_tokens_seen": 131162920, "step": 6105, "time_per_iteration": 2.649848222732544 }, { "auxiliary_loss_clip": 0.01122359, "auxiliary_loss_mlp": 0.0103637, "balance_loss_clip": 1.04516804, "balance_loss_mlp": 1.02056313, "epoch": 0.7342030902422895, "flos": 22930112701440.0, "grad_norm": 2.4611895422907484, "language_loss": 0.80045193, "learning_rate": 6.962829277318132e-07, "loss": 0.82203919, "num_input_tokens_seen": 131182515, "step": 6106, "time_per_iteration": 2.724874258041382 }, { "auxiliary_loss_clip": 0.01124976, "auxiliary_loss_mlp": 0.01038348, "balance_loss_clip": 1.04548049, "balance_loss_mlp": 1.0231787, "epoch": 0.7343233331329285, "flos": 25847890381440.0, "grad_norm": 1.721253325594428, "language_loss": 0.83668804, "learning_rate": 6.956923008153652e-07, "loss": 0.85832131, "num_input_tokens_seen": 131202280, "step": 6107, "time_per_iteration": 2.7207727432250977 }, { "auxiliary_loss_clip": 0.01122688, "auxiliary_loss_mlp": 0.01036822, "balance_loss_clip": 1.04252183, "balance_loss_mlp": 1.0216589, "epoch": 0.7344435760235676, "flos": 18478985287680.0, "grad_norm": 2.4064272154903525, "language_loss": 0.84176457, "learning_rate": 6.951018717613593e-07, "loss": 0.86335969, "num_input_tokens_seen": 131221295, "step": 6108, "time_per_iteration": 2.686964511871338 }, { "auxiliary_loss_clip": 0.01122127, "auxiliary_loss_mlp": 0.01029669, "balance_loss_clip": 1.0434792, "balance_loss_mlp": 1.01530433, "epoch": 0.7345638189142067, "flos": 17640040256640.0, "grad_norm": 1.9272979984705043, "language_loss": 0.7796638, "learning_rate": 6.945116406593614e-07, "loss": 0.80118179, "num_input_tokens_seen": 131240150, "step": 6109, "time_per_iteration": 2.662053108215332 }, { "auxiliary_loss_clip": 0.01086541, "auxiliary_loss_mlp": 0.01043527, "balance_loss_clip": 1.0400629, "balance_loss_mlp": 1.02756476, "epoch": 0.7346840618048458, "flos": 20260225756800.0, "grad_norm": 3.229806675707589, "language_loss": 0.74090135, "learning_rate": 6.939216075989089e-07, "loss": 0.76220202, "num_input_tokens_seen": 131258080, "step": 6110, "time_per_iteration": 2.7762115001678467 }, { "auxiliary_loss_clip": 0.01108726, "auxiliary_loss_mlp": 0.01039929, "balance_loss_clip": 1.04072928, "balance_loss_mlp": 1.02393067, "epoch": 0.7348043046954849, "flos": 29023183641600.0, "grad_norm": 2.219109901266279, "language_loss": 0.65877086, "learning_rate": 6.933317726695109e-07, "loss": 0.68025744, "num_input_tokens_seen": 131279310, "step": 6111, "time_per_iteration": 2.776557445526123 }, { "auxiliary_loss_clip": 0.0109429, "auxiliary_loss_mlp": 0.01032441, "balance_loss_clip": 1.04283452, "balance_loss_mlp": 1.01850581, "epoch": 0.734924547586124, "flos": 17931203902080.0, "grad_norm": 2.7103396225538323, "language_loss": 0.79882419, "learning_rate": 6.92742135960644e-07, "loss": 0.82009149, "num_input_tokens_seen": 131297010, "step": 6112, "time_per_iteration": 2.7754924297332764 }, { "auxiliary_loss_clip": 0.01030116, "auxiliary_loss_mlp": 0.01007899, "balance_loss_clip": 1.01089084, "balance_loss_mlp": 1.00651586, "epoch": 0.7350447904767631, "flos": 63588319850880.0, "grad_norm": 4.224833834934503, "language_loss": 0.55628371, "learning_rate": 6.921526975617556e-07, "loss": 0.57666385, "num_input_tokens_seen": 131356470, "step": 6113, "time_per_iteration": 3.2859504222869873 }, { "auxiliary_loss_clip": 0.01110569, "auxiliary_loss_mlp": 0.01049467, "balance_loss_clip": 1.04055572, "balance_loss_mlp": 1.03246784, "epoch": 0.7351650333674021, "flos": 21580015178880.0, "grad_norm": 2.0548342637588255, "language_loss": 0.75501096, "learning_rate": 6.915634575622631e-07, "loss": 0.77661133, "num_input_tokens_seen": 131374985, "step": 6114, "time_per_iteration": 2.7447330951690674 }, { "auxiliary_loss_clip": 0.01134809, "auxiliary_loss_mlp": 0.01039992, "balance_loss_clip": 1.04460597, "balance_loss_mlp": 1.02405334, "epoch": 0.7352852762580413, "flos": 18186349184640.0, "grad_norm": 1.7889860508269924, "language_loss": 0.707614, "learning_rate": 6.909744160515532e-07, "loss": 0.72936201, "num_input_tokens_seen": 131393125, "step": 6115, "time_per_iteration": 3.5426323413848877 }, { "auxiliary_loss_clip": 0.01109331, "auxiliary_loss_mlp": 0.01034102, "balance_loss_clip": 1.04175425, "balance_loss_mlp": 1.0173049, "epoch": 0.7354055191486804, "flos": 38910073063680.0, "grad_norm": 1.9632019406291208, "language_loss": 0.69388843, "learning_rate": 6.903855731189849e-07, "loss": 0.71532273, "num_input_tokens_seen": 131415760, "step": 6116, "time_per_iteration": 2.890864849090576 }, { "auxiliary_loss_clip": 0.01116314, "auxiliary_loss_mlp": 0.01040091, "balance_loss_clip": 1.04172778, "balance_loss_mlp": 1.02369976, "epoch": 0.7355257620393194, "flos": 16289978647680.0, "grad_norm": 2.3521748237855213, "language_loss": 0.8185063, "learning_rate": 6.897969288538825e-07, "loss": 0.84007037, "num_input_tokens_seen": 131433705, "step": 6117, "time_per_iteration": 2.7031726837158203 }, { "auxiliary_loss_clip": 0.01109112, "auxiliary_loss_mlp": 0.01031726, "balance_loss_clip": 1.04171205, "balance_loss_mlp": 1.01575208, "epoch": 0.7356460049299585, "flos": 18114240631680.0, "grad_norm": 1.7178455857053514, "language_loss": 0.81219417, "learning_rate": 6.892084833455452e-07, "loss": 0.83360255, "num_input_tokens_seen": 131453275, "step": 6118, "time_per_iteration": 3.688915491104126 }, { "auxiliary_loss_clip": 0.0111909, "auxiliary_loss_mlp": 0.01033592, "balance_loss_clip": 1.04203749, "balance_loss_mlp": 1.01786804, "epoch": 0.7357662478205976, "flos": 21325193118720.0, "grad_norm": 1.3966893879338733, "language_loss": 0.84035742, "learning_rate": 6.886202366832384e-07, "loss": 0.8618843, "num_input_tokens_seen": 131474960, "step": 6119, "time_per_iteration": 2.796962022781372 }, { "auxiliary_loss_clip": 0.01083995, "auxiliary_loss_mlp": 0.01042995, "balance_loss_clip": 1.04041386, "balance_loss_mlp": 1.02704477, "epoch": 0.7358864907112367, "flos": 14246841139200.0, "grad_norm": 1.880781394511329, "language_loss": 0.73618472, "learning_rate": 6.880321889561987e-07, "loss": 0.75745463, "num_input_tokens_seen": 131492935, "step": 6120, "time_per_iteration": 2.7007052898406982 }, { "auxiliary_loss_clip": 0.01097121, "auxiliary_loss_mlp": 0.01044397, "balance_loss_clip": 1.03964448, "balance_loss_mlp": 1.0269928, "epoch": 0.7360067336018757, "flos": 22309684058880.0, "grad_norm": 2.014661730799073, "language_loss": 0.65613973, "learning_rate": 6.874443402536338e-07, "loss": 0.67755497, "num_input_tokens_seen": 131512025, "step": 6121, "time_per_iteration": 3.6623013019561768 }, { "auxiliary_loss_clip": 0.01119108, "auxiliary_loss_mlp": 0.01043151, "balance_loss_clip": 1.04744267, "balance_loss_mlp": 1.02629495, "epoch": 0.7361269764925149, "flos": 25554607833600.0, "grad_norm": 1.6739764076188126, "language_loss": 0.80506814, "learning_rate": 6.868566906647177e-07, "loss": 0.82669073, "num_input_tokens_seen": 131532975, "step": 6122, "time_per_iteration": 3.5517561435699463 }, { "auxiliary_loss_clip": 0.01123403, "auxiliary_loss_mlp": 0.01043603, "balance_loss_clip": 1.04287505, "balance_loss_mlp": 1.02743864, "epoch": 0.736247219383154, "flos": 20376505059840.0, "grad_norm": 3.8518722937190804, "language_loss": 0.83791471, "learning_rate": 6.862692402785984e-07, "loss": 0.85958475, "num_input_tokens_seen": 131553225, "step": 6123, "time_per_iteration": 2.6508290767669678 }, { "auxiliary_loss_clip": 0.01009235, "auxiliary_loss_mlp": 0.01004027, "balance_loss_clip": 1.01373124, "balance_loss_mlp": 1.00226307, "epoch": 0.736367462273793, "flos": 70339525735680.0, "grad_norm": 0.6841358876543684, "language_loss": 0.49574944, "learning_rate": 6.856819891843899e-07, "loss": 0.51588202, "num_input_tokens_seen": 131617930, "step": 6124, "time_per_iteration": 3.302529811859131 }, { "auxiliary_loss_clip": 0.01072698, "auxiliary_loss_mlp": 0.01043414, "balance_loss_clip": 1.03869891, "balance_loss_mlp": 1.0269872, "epoch": 0.7364877051644322, "flos": 22412711243520.0, "grad_norm": 2.5193440919253187, "language_loss": 0.7235409, "learning_rate": 6.8509493747118e-07, "loss": 0.74470204, "num_input_tokens_seen": 131636740, "step": 6125, "time_per_iteration": 2.804107427597046 }, { "auxiliary_loss_clip": 0.01139303, "auxiliary_loss_mlp": 0.01041872, "balance_loss_clip": 1.04725385, "balance_loss_mlp": 1.02551079, "epoch": 0.7366079480550712, "flos": 12130266274560.0, "grad_norm": 2.286221616173595, "language_loss": 0.88324809, "learning_rate": 6.845080852280221e-07, "loss": 0.90505987, "num_input_tokens_seen": 131653810, "step": 6126, "time_per_iteration": 2.53279972076416 }, { "auxiliary_loss_clip": 0.01097817, "auxiliary_loss_mlp": 0.01037929, "balance_loss_clip": 1.04021657, "balance_loss_mlp": 1.02371383, "epoch": 0.7367281909457103, "flos": 15049336844160.0, "grad_norm": 1.6946500856518807, "language_loss": 0.74786514, "learning_rate": 6.839214325439409e-07, "loss": 0.76922262, "num_input_tokens_seen": 131671505, "step": 6127, "time_per_iteration": 2.649834156036377 }, { "auxiliary_loss_clip": 0.01103125, "auxiliary_loss_mlp": 0.01038531, "balance_loss_clip": 1.04141712, "balance_loss_mlp": 1.02284265, "epoch": 0.7368484338363495, "flos": 23510752053120.0, "grad_norm": 1.7803361462498684, "language_loss": 0.71709406, "learning_rate": 6.833349795079327e-07, "loss": 0.73851061, "num_input_tokens_seen": 131690615, "step": 6128, "time_per_iteration": 2.6790597438812256 }, { "auxiliary_loss_clip": 0.01098179, "auxiliary_loss_mlp": 0.01040663, "balance_loss_clip": 1.04057467, "balance_loss_mlp": 1.02539217, "epoch": 0.7369686767269885, "flos": 27417833095680.0, "grad_norm": 2.7644431752248595, "language_loss": 0.68576479, "learning_rate": 6.827487262089613e-07, "loss": 0.7071532, "num_input_tokens_seen": 131711120, "step": 6129, "time_per_iteration": 2.7496707439422607 }, { "auxiliary_loss_clip": 0.01015356, "auxiliary_loss_mlp": 0.01004683, "balance_loss_clip": 1.00800848, "balance_loss_mlp": 1.00312173, "epoch": 0.7370889196176276, "flos": 70293343824000.0, "grad_norm": 0.9543323958613401, "language_loss": 0.56787294, "learning_rate": 6.821626727359606e-07, "loss": 0.58807331, "num_input_tokens_seen": 131776680, "step": 6130, "time_per_iteration": 3.2766993045806885 }, { "auxiliary_loss_clip": 0.01110026, "auxiliary_loss_mlp": 0.01043191, "balance_loss_clip": 1.04388881, "balance_loss_mlp": 1.02641797, "epoch": 0.7372091625082667, "flos": 18040839189120.0, "grad_norm": 2.117964894442575, "language_loss": 0.77282888, "learning_rate": 6.815768191778348e-07, "loss": 0.79436105, "num_input_tokens_seen": 131794760, "step": 6131, "time_per_iteration": 2.6123430728912354 }, { "auxiliary_loss_clip": 0.01119941, "auxiliary_loss_mlp": 0.01043843, "balance_loss_clip": 1.0427289, "balance_loss_mlp": 1.02639055, "epoch": 0.7373294053989058, "flos": 33726331854720.0, "grad_norm": 1.932545050460012, "language_loss": 0.73087627, "learning_rate": 6.809911656234569e-07, "loss": 0.75251412, "num_input_tokens_seen": 131816735, "step": 6132, "time_per_iteration": 2.744443655014038 }, { "auxiliary_loss_clip": 0.0109777, "auxiliary_loss_mlp": 0.01035473, "balance_loss_clip": 1.03954697, "balance_loss_mlp": 1.02020264, "epoch": 0.7374496482895448, "flos": 21506326427520.0, "grad_norm": 2.572068193119204, "language_loss": 0.78088254, "learning_rate": 6.804057121616707e-07, "loss": 0.80221498, "num_input_tokens_seen": 131834940, "step": 6133, "time_per_iteration": 2.677879810333252 }, { "auxiliary_loss_clip": 0.01120622, "auxiliary_loss_mlp": 0.01039565, "balance_loss_clip": 1.04161811, "balance_loss_mlp": 1.02421069, "epoch": 0.737569891180184, "flos": 24936908624640.0, "grad_norm": 1.9441274448508752, "language_loss": 0.71868688, "learning_rate": 6.798204588812888e-07, "loss": 0.74028867, "num_input_tokens_seen": 131854355, "step": 6134, "time_per_iteration": 2.6604273319244385 }, { "auxiliary_loss_clip": 0.01062666, "auxiliary_loss_mlp": 0.00772265, "balance_loss_clip": 1.03495026, "balance_loss_mlp": 1.00058568, "epoch": 0.7376901340708231, "flos": 20664544222080.0, "grad_norm": 1.7820084179355589, "language_loss": 0.75486737, "learning_rate": 6.792354058710937e-07, "loss": 0.77321672, "num_input_tokens_seen": 131871825, "step": 6135, "time_per_iteration": 2.7314531803131104 }, { "auxiliary_loss_clip": 0.01127564, "auxiliary_loss_mlp": 0.01031681, "balance_loss_clip": 1.04348004, "balance_loss_mlp": 1.01710165, "epoch": 0.7378103769614621, "flos": 23805794367360.0, "grad_norm": 2.3625010114555782, "language_loss": 0.65048599, "learning_rate": 6.786505532198374e-07, "loss": 0.67207837, "num_input_tokens_seen": 131890770, "step": 6136, "time_per_iteration": 2.615368604660034 }, { "auxiliary_loss_clip": 0.01136409, "auxiliary_loss_mlp": 0.01043359, "balance_loss_clip": 1.04623449, "balance_loss_mlp": 1.02774239, "epoch": 0.7379306198521013, "flos": 22237216369920.0, "grad_norm": 2.8407815437773465, "language_loss": 0.85479712, "learning_rate": 6.780659010162411e-07, "loss": 0.87659484, "num_input_tokens_seen": 131909720, "step": 6137, "time_per_iteration": 2.5771708488464355 }, { "auxiliary_loss_clip": 0.01097347, "auxiliary_loss_mlp": 0.01038324, "balance_loss_clip": 1.04046166, "balance_loss_mlp": 1.02464437, "epoch": 0.7380508627427403, "flos": 14903108576640.0, "grad_norm": 1.748278497119275, "language_loss": 0.83473396, "learning_rate": 6.774814493489975e-07, "loss": 0.85609066, "num_input_tokens_seen": 131927395, "step": 6138, "time_per_iteration": 2.649397850036621 }, { "auxiliary_loss_clip": 0.0111872, "auxiliary_loss_mlp": 0.01038278, "balance_loss_clip": 1.04285407, "balance_loss_mlp": 1.02337706, "epoch": 0.7381711056333794, "flos": 21685843624320.0, "grad_norm": 2.0408011021286288, "language_loss": 0.6618861, "learning_rate": 6.768971983067655e-07, "loss": 0.68345606, "num_input_tokens_seen": 131947725, "step": 6139, "time_per_iteration": 2.6241495609283447 }, { "auxiliary_loss_clip": 0.01038205, "auxiliary_loss_mlp": 0.01004916, "balance_loss_clip": 1.00989521, "balance_loss_mlp": 1.00354517, "epoch": 0.7382913485240186, "flos": 52404263596800.0, "grad_norm": 1.0168133828590376, "language_loss": 0.67664468, "learning_rate": 6.763131479781772e-07, "loss": 0.6970759, "num_input_tokens_seen": 131997485, "step": 6140, "time_per_iteration": 2.9663968086242676 }, { "auxiliary_loss_clip": 0.01098514, "auxiliary_loss_mlp": 0.01038466, "balance_loss_clip": 1.03975153, "balance_loss_mlp": 1.0236721, "epoch": 0.7384115914146576, "flos": 21798818876160.0, "grad_norm": 2.00626462470882, "language_loss": 0.76328367, "learning_rate": 6.757292984518316e-07, "loss": 0.78465348, "num_input_tokens_seen": 132016885, "step": 6141, "time_per_iteration": 3.5975334644317627 }, { "auxiliary_loss_clip": 0.01028272, "auxiliary_loss_mlp": 0.01002996, "balance_loss_clip": 1.00941098, "balance_loss_mlp": 1.0016551, "epoch": 0.7385318343052967, "flos": 61494331662720.0, "grad_norm": 0.7353288990178999, "language_loss": 0.56416833, "learning_rate": 6.751456498162981e-07, "loss": 0.584481, "num_input_tokens_seen": 132075920, "step": 6142, "time_per_iteration": 3.0993595123291016 }, { "auxiliary_loss_clip": 0.01120926, "auxiliary_loss_mlp": 0.01042252, "balance_loss_clip": 1.04091001, "balance_loss_mlp": 1.02724361, "epoch": 0.7386520771959358, "flos": 17013757697280.0, "grad_norm": 3.319170860480493, "language_loss": 0.85442865, "learning_rate": 6.745622021601174e-07, "loss": 0.87606043, "num_input_tokens_seen": 132092945, "step": 6143, "time_per_iteration": 2.6074278354644775 }, { "auxiliary_loss_clip": 0.01104015, "auxiliary_loss_mlp": 0.01044865, "balance_loss_clip": 1.04220295, "balance_loss_mlp": 1.02902198, "epoch": 0.7387723200865749, "flos": 18770759464320.0, "grad_norm": 1.8327904066406375, "language_loss": 0.69784868, "learning_rate": 6.739789555717954e-07, "loss": 0.71933746, "num_input_tokens_seen": 132109920, "step": 6144, "time_per_iteration": 3.622180700302124 }, { "auxiliary_loss_clip": 0.01131913, "auxiliary_loss_mlp": 0.01044355, "balance_loss_clip": 1.0426929, "balance_loss_mlp": 1.02902508, "epoch": 0.738892562977214, "flos": 22525542840960.0, "grad_norm": 2.66750879336201, "language_loss": 0.77153981, "learning_rate": 6.733959101398124e-07, "loss": 0.79330254, "num_input_tokens_seen": 132128050, "step": 6145, "time_per_iteration": 2.5888736248016357 }, { "auxiliary_loss_clip": 0.01107566, "auxiliary_loss_mlp": 0.01037229, "balance_loss_clip": 1.04084134, "balance_loss_mlp": 1.02105224, "epoch": 0.7390128058678531, "flos": 21501478091520.0, "grad_norm": 1.8759484972634777, "language_loss": 0.81666017, "learning_rate": 6.728130659526143e-07, "loss": 0.83810818, "num_input_tokens_seen": 132145860, "step": 6146, "time_per_iteration": 3.6494808197021484 }, { "auxiliary_loss_clip": 0.01111775, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.04277194, "balance_loss_mlp": 1.01983404, "epoch": 0.7391330487584922, "flos": 25776176878080.0, "grad_norm": 2.655077781149537, "language_loss": 0.71329683, "learning_rate": 6.7223042309862e-07, "loss": 0.73476744, "num_input_tokens_seen": 132166060, "step": 6147, "time_per_iteration": 2.679716110229492 }, { "auxiliary_loss_clip": 0.01117169, "auxiliary_loss_mlp": 0.01038492, "balance_loss_clip": 1.04052913, "balance_loss_mlp": 1.02269721, "epoch": 0.7392532916491312, "flos": 28366736636160.0, "grad_norm": 1.8076412571268479, "language_loss": 0.73337984, "learning_rate": 6.716479816662144e-07, "loss": 0.75493646, "num_input_tokens_seen": 132187790, "step": 6148, "time_per_iteration": 3.8343799114227295 }, { "auxiliary_loss_clip": 0.01111005, "auxiliary_loss_mlp": 0.01044638, "balance_loss_clip": 1.04104948, "balance_loss_mlp": 1.02915311, "epoch": 0.7393735345397703, "flos": 23585877348480.0, "grad_norm": 1.8677550064016455, "language_loss": 0.72953963, "learning_rate": 6.710657417437531e-07, "loss": 0.75109601, "num_input_tokens_seen": 132207495, "step": 6149, "time_per_iteration": 2.688175916671753 }, { "auxiliary_loss_clip": 0.01105712, "auxiliary_loss_mlp": 0.01035562, "balance_loss_clip": 1.04076505, "balance_loss_mlp": 1.02037454, "epoch": 0.7394937774304094, "flos": 19974772373760.0, "grad_norm": 2.1547049924845636, "language_loss": 0.79748905, "learning_rate": 6.704837034195628e-07, "loss": 0.81890184, "num_input_tokens_seen": 132225960, "step": 6150, "time_per_iteration": 2.6336886882781982 }, { "auxiliary_loss_clip": 0.01114162, "auxiliary_loss_mlp": 0.01039257, "balance_loss_clip": 1.03956532, "balance_loss_mlp": 1.02333045, "epoch": 0.7396140203210485, "flos": 23478037741440.0, "grad_norm": 1.745901952407481, "language_loss": 0.84992933, "learning_rate": 6.699018667819376e-07, "loss": 0.87146348, "num_input_tokens_seen": 132245360, "step": 6151, "time_per_iteration": 2.64426326751709 }, { "auxiliary_loss_clip": 0.01119386, "auxiliary_loss_mlp": 0.01041999, "balance_loss_clip": 1.0423733, "balance_loss_mlp": 1.02525043, "epoch": 0.7397342632116876, "flos": 25555433846400.0, "grad_norm": 1.7462869846087494, "language_loss": 0.7264753, "learning_rate": 6.693202319191415e-07, "loss": 0.74808919, "num_input_tokens_seen": 132267095, "step": 6152, "time_per_iteration": 2.642585515975952 }, { "auxiliary_loss_clip": 0.01135912, "auxiliary_loss_mlp": 0.01046578, "balance_loss_clip": 1.04863369, "balance_loss_mlp": 1.03164709, "epoch": 0.7398545061023267, "flos": 24755021130240.0, "grad_norm": 1.8126721667243872, "language_loss": 0.74682724, "learning_rate": 6.687387989194084e-07, "loss": 0.76865214, "num_input_tokens_seen": 132286610, "step": 6153, "time_per_iteration": 2.6387598514556885 }, { "auxiliary_loss_clip": 0.01101551, "auxiliary_loss_mlp": 0.01038757, "balance_loss_clip": 1.04129386, "balance_loss_mlp": 1.02367711, "epoch": 0.7399747489929658, "flos": 16508602776960.0, "grad_norm": 1.8993367337574008, "language_loss": 0.7899937, "learning_rate": 6.681575678709404e-07, "loss": 0.81139684, "num_input_tokens_seen": 132305300, "step": 6154, "time_per_iteration": 2.6122584342956543 }, { "auxiliary_loss_clip": 0.01120269, "auxiliary_loss_mlp": 0.01042973, "balance_loss_clip": 1.04141164, "balance_loss_mlp": 1.02640319, "epoch": 0.7400949918836048, "flos": 24097065753600.0, "grad_norm": 2.2330146285313206, "language_loss": 0.71047562, "learning_rate": 6.67576538861911e-07, "loss": 0.73210806, "num_input_tokens_seen": 132323875, "step": 6155, "time_per_iteration": 2.6369407176971436 }, { "auxiliary_loss_clip": 0.01105722, "auxiliary_loss_mlp": 0.01036053, "balance_loss_clip": 1.04218614, "balance_loss_mlp": 1.02144361, "epoch": 0.740215234774244, "flos": 21802517976960.0, "grad_norm": 1.4663377344422701, "language_loss": 0.82290131, "learning_rate": 6.669957119804612e-07, "loss": 0.84431899, "num_input_tokens_seen": 132345510, "step": 6156, "time_per_iteration": 2.652409315109253 }, { "auxiliary_loss_clip": 0.01115501, "auxiliary_loss_mlp": 0.01047074, "balance_loss_clip": 1.0414629, "balance_loss_mlp": 1.03076637, "epoch": 0.7403354776648831, "flos": 18733196816640.0, "grad_norm": 2.7566589444376346, "language_loss": 0.72452164, "learning_rate": 6.66415087314702e-07, "loss": 0.74614739, "num_input_tokens_seen": 132360465, "step": 6157, "time_per_iteration": 2.611588954925537 }, { "auxiliary_loss_clip": 0.01109926, "auxiliary_loss_mlp": 0.01039464, "balance_loss_clip": 1.04191995, "balance_loss_mlp": 1.02302527, "epoch": 0.7404557205555221, "flos": 16909581277440.0, "grad_norm": 2.2006904885526803, "language_loss": 0.7309109, "learning_rate": 6.65834664952714e-07, "loss": 0.75240481, "num_input_tokens_seen": 132377915, "step": 6158, "time_per_iteration": 2.6547932624816895 }, { "auxiliary_loss_clip": 0.01099664, "auxiliary_loss_mlp": 0.01042618, "balance_loss_clip": 1.04239917, "balance_loss_mlp": 1.02722824, "epoch": 0.7405759634461613, "flos": 21214408596480.0, "grad_norm": 1.784185095662934, "language_loss": 0.76027948, "learning_rate": 6.652544449825457e-07, "loss": 0.78170228, "num_input_tokens_seen": 132398170, "step": 6159, "time_per_iteration": 2.6866681575775146 }, { "auxiliary_loss_clip": 0.01116251, "auxiliary_loss_mlp": 0.01039047, "balance_loss_clip": 1.04156923, "balance_loss_mlp": 1.0225842, "epoch": 0.7406962063368003, "flos": 20480106862080.0, "grad_norm": 1.6572410812748086, "language_loss": 0.76653767, "learning_rate": 6.646744274922182e-07, "loss": 0.78809065, "num_input_tokens_seen": 132416615, "step": 6160, "time_per_iteration": 2.6619315147399902 }, { "auxiliary_loss_clip": 0.01108364, "auxiliary_loss_mlp": 0.01033907, "balance_loss_clip": 1.04081225, "balance_loss_mlp": 1.01855302, "epoch": 0.7408164492274394, "flos": 19791915212160.0, "grad_norm": 3.3377373169864883, "language_loss": 0.75204945, "learning_rate": 6.640946125697171e-07, "loss": 0.77347219, "num_input_tokens_seen": 132434145, "step": 6161, "time_per_iteration": 2.619774580001831 }, { "auxiliary_loss_clip": 0.01125309, "auxiliary_loss_mlp": 0.01039584, "balance_loss_clip": 1.0428977, "balance_loss_mlp": 1.02276325, "epoch": 0.7409366921180786, "flos": 29204855654400.0, "grad_norm": 3.1335000792272565, "language_loss": 0.75974983, "learning_rate": 6.635150003030017e-07, "loss": 0.78139871, "num_input_tokens_seen": 132452670, "step": 6162, "time_per_iteration": 2.64218807220459 }, { "auxiliary_loss_clip": 0.01084408, "auxiliary_loss_mlp": 0.01039382, "balance_loss_clip": 1.03789949, "balance_loss_mlp": 1.02399242, "epoch": 0.7410569350087176, "flos": 22930004960640.0, "grad_norm": 2.4386026466330257, "language_loss": 0.8591243, "learning_rate": 6.629355907799981e-07, "loss": 0.88036227, "num_input_tokens_seen": 132472475, "step": 6163, "time_per_iteration": 2.665532350540161 }, { "auxiliary_loss_clip": 0.01125585, "auxiliary_loss_mlp": 0.01039251, "balance_loss_clip": 1.04342544, "balance_loss_mlp": 1.02419496, "epoch": 0.7411771778993567, "flos": 30440397726720.0, "grad_norm": 1.8046107202800277, "language_loss": 0.69200802, "learning_rate": 6.623563840886015e-07, "loss": 0.71365643, "num_input_tokens_seen": 132493400, "step": 6164, "time_per_iteration": 2.684743642807007 }, { "auxiliary_loss_clip": 0.01116266, "auxiliary_loss_mlp": 0.01037086, "balance_loss_clip": 1.04078031, "balance_loss_mlp": 1.02230406, "epoch": 0.7412974207899958, "flos": 20522050968960.0, "grad_norm": 3.303458521331651, "language_loss": 0.69497228, "learning_rate": 6.617773803166795e-07, "loss": 0.71650577, "num_input_tokens_seen": 132511725, "step": 6165, "time_per_iteration": 2.6060876846313477 }, { "auxiliary_loss_clip": 0.01114566, "auxiliary_loss_mlp": 0.00772989, "balance_loss_clip": 1.043154, "balance_loss_mlp": 1.00045455, "epoch": 0.7414176636806349, "flos": 22090700793600.0, "grad_norm": 2.276805538221645, "language_loss": 0.82199985, "learning_rate": 6.611985795520634e-07, "loss": 0.84087539, "num_input_tokens_seen": 132530270, "step": 6166, "time_per_iteration": 2.6813461780548096 }, { "auxiliary_loss_clip": 0.01108335, "auxiliary_loss_mlp": 0.01037646, "balance_loss_clip": 1.04457903, "balance_loss_mlp": 1.0220412, "epoch": 0.7415379065712739, "flos": 25155245445120.0, "grad_norm": 2.018715697632432, "language_loss": 0.77802277, "learning_rate": 6.606199818825588e-07, "loss": 0.79948258, "num_input_tokens_seen": 132550725, "step": 6167, "time_per_iteration": 2.708645820617676 }, { "auxiliary_loss_clip": 0.01111091, "auxiliary_loss_mlp": 0.01037646, "balance_loss_clip": 1.03941667, "balance_loss_mlp": 1.02111208, "epoch": 0.7416581494619131, "flos": 16871731320960.0, "grad_norm": 2.060078227066485, "language_loss": 0.8192426, "learning_rate": 6.600415873959377e-07, "loss": 0.84073007, "num_input_tokens_seen": 132568600, "step": 6168, "time_per_iteration": 3.5030248165130615 }, { "auxiliary_loss_clip": 0.01064403, "auxiliary_loss_mlp": 0.00771589, "balance_loss_clip": 1.03500462, "balance_loss_mlp": 1.00049376, "epoch": 0.7417783923525522, "flos": 28438881102720.0, "grad_norm": 3.4263885920804706, "language_loss": 0.65012032, "learning_rate": 6.594633961799437e-07, "loss": 0.66848028, "num_input_tokens_seen": 132587640, "step": 6169, "time_per_iteration": 2.7827844619750977 }, { "auxiliary_loss_clip": 0.01101742, "auxiliary_loss_mlp": 0.01040936, "balance_loss_clip": 1.04041433, "balance_loss_mlp": 1.02523613, "epoch": 0.7418986352431912, "flos": 20084299920000.0, "grad_norm": 1.8524948432049773, "language_loss": 0.81868446, "learning_rate": 6.588854083222857e-07, "loss": 0.84011126, "num_input_tokens_seen": 132607075, "step": 6170, "time_per_iteration": 3.6454415321350098 }, { "auxiliary_loss_clip": 0.01113589, "auxiliary_loss_mlp": 0.01045392, "balance_loss_clip": 1.04399085, "balance_loss_mlp": 1.02894115, "epoch": 0.7420188781338304, "flos": 18259571059200.0, "grad_norm": 1.9192483230841324, "language_loss": 0.80331111, "learning_rate": 6.583076239106444e-07, "loss": 0.82490093, "num_input_tokens_seen": 132625580, "step": 6171, "time_per_iteration": 2.6267337799072266 }, { "auxiliary_loss_clip": 0.01113742, "auxiliary_loss_mlp": 0.01046051, "balance_loss_clip": 1.04276299, "balance_loss_mlp": 1.0303514, "epoch": 0.7421391210244694, "flos": 13771994319360.0, "grad_norm": 4.328978251615106, "language_loss": 0.75429654, "learning_rate": 6.577300430326707e-07, "loss": 0.77589446, "num_input_tokens_seen": 132640525, "step": 6172, "time_per_iteration": 3.617811918258667 }, { "auxiliary_loss_clip": 0.01093422, "auxiliary_loss_mlp": 0.0104003, "balance_loss_clip": 1.03988779, "balance_loss_mlp": 1.02398443, "epoch": 0.7422593639151085, "flos": 15961683317760.0, "grad_norm": 2.5967443713383926, "language_loss": 0.718732, "learning_rate": 6.571526657759821e-07, "loss": 0.74006647, "num_input_tokens_seen": 132656265, "step": 6173, "time_per_iteration": 2.6251678466796875 }, { "auxiliary_loss_clip": 0.01116551, "auxiliary_loss_mlp": 0.01033693, "balance_loss_clip": 1.0412302, "balance_loss_mlp": 1.0180763, "epoch": 0.7423796068057477, "flos": 30114400867200.0, "grad_norm": 1.967324448809838, "language_loss": 0.70560336, "learning_rate": 6.565754922281663e-07, "loss": 0.7271058, "num_input_tokens_seen": 132678510, "step": 6174, "time_per_iteration": 3.569084644317627 }, { "auxiliary_loss_clip": 0.01108659, "auxiliary_loss_mlp": 0.01047296, "balance_loss_clip": 1.04110003, "balance_loss_mlp": 1.02976048, "epoch": 0.7424998496963867, "flos": 20521907314560.0, "grad_norm": 1.9646006438126313, "language_loss": 0.7831468, "learning_rate": 6.559985224767801e-07, "loss": 0.80470634, "num_input_tokens_seen": 132696385, "step": 6175, "time_per_iteration": 2.6339211463928223 }, { "auxiliary_loss_clip": 0.01101486, "auxiliary_loss_mlp": 0.010401, "balance_loss_clip": 1.04172492, "balance_loss_mlp": 1.02459049, "epoch": 0.7426200925870258, "flos": 21871573873920.0, "grad_norm": 2.764042059749453, "language_loss": 0.7602936, "learning_rate": 6.55421756609349e-07, "loss": 0.78170943, "num_input_tokens_seen": 132714640, "step": 6176, "time_per_iteration": 2.681274652481079 }, { "auxiliary_loss_clip": 0.01116994, "auxiliary_loss_mlp": 0.01047151, "balance_loss_clip": 1.04249036, "balance_loss_mlp": 1.03201163, "epoch": 0.7427403354776649, "flos": 26432049265920.0, "grad_norm": 1.858548060049441, "language_loss": 0.7887696, "learning_rate": 6.54845194713369e-07, "loss": 0.81041104, "num_input_tokens_seen": 132735590, "step": 6177, "time_per_iteration": 2.62127423286438 }, { "auxiliary_loss_clip": 0.01121274, "auxiliary_loss_mlp": 0.01040187, "balance_loss_clip": 1.04535079, "balance_loss_mlp": 1.02480292, "epoch": 0.742860578368304, "flos": 19898390102400.0, "grad_norm": 2.1654736946855238, "language_loss": 0.79879367, "learning_rate": 6.542688368763034e-07, "loss": 0.82040823, "num_input_tokens_seen": 132753995, "step": 6178, "time_per_iteration": 2.610456705093384 }, { "auxiliary_loss_clip": 0.01119433, "auxiliary_loss_mlp": 0.01037892, "balance_loss_clip": 1.04142952, "balance_loss_mlp": 1.02289522, "epoch": 0.742980821258943, "flos": 24827201510400.0, "grad_norm": 1.8118873592202602, "language_loss": 0.76963806, "learning_rate": 6.536926831855854e-07, "loss": 0.79121125, "num_input_tokens_seen": 132773160, "step": 6179, "time_per_iteration": 2.6207919120788574 }, { "auxiliary_loss_clip": 0.01108047, "auxiliary_loss_mlp": 0.01032151, "balance_loss_clip": 1.04165578, "balance_loss_mlp": 1.0172677, "epoch": 0.7431010641495821, "flos": 25228646887680.0, "grad_norm": 2.8561626690164474, "language_loss": 0.73379534, "learning_rate": 6.531167337286165e-07, "loss": 0.75519735, "num_input_tokens_seen": 132793180, "step": 6180, "time_per_iteration": 2.635728597640991 }, { "auxiliary_loss_clip": 0.01108139, "auxiliary_loss_mlp": 0.01042931, "balance_loss_clip": 1.042274, "balance_loss_mlp": 1.02663505, "epoch": 0.7432213070402213, "flos": 21762369550080.0, "grad_norm": 1.505003446963786, "language_loss": 0.79645848, "learning_rate": 6.52540988592768e-07, "loss": 0.81796914, "num_input_tokens_seen": 132814200, "step": 6181, "time_per_iteration": 2.7040228843688965 }, { "auxiliary_loss_clip": 0.01109631, "auxiliary_loss_mlp": 0.01039449, "balance_loss_clip": 1.04055583, "balance_loss_mlp": 1.02457154, "epoch": 0.7433415499308603, "flos": 14793832425600.0, "grad_norm": 2.2677728267347788, "language_loss": 0.83739078, "learning_rate": 6.519654478653814e-07, "loss": 0.85888159, "num_input_tokens_seen": 132832565, "step": 6182, "time_per_iteration": 2.6278486251831055 }, { "auxiliary_loss_clip": 0.01023392, "auxiliary_loss_mlp": 0.01002138, "balance_loss_clip": 1.01254153, "balance_loss_mlp": 1.00068939, "epoch": 0.7434617928214994, "flos": 67155577297920.0, "grad_norm": 0.7478267783589769, "language_loss": 0.56077021, "learning_rate": 6.51390111633763e-07, "loss": 0.58102554, "num_input_tokens_seen": 132897840, "step": 6183, "time_per_iteration": 3.2605538368225098 }, { "auxiliary_loss_clip": 0.01067001, "auxiliary_loss_mlp": 0.01046392, "balance_loss_clip": 1.03577328, "balance_loss_mlp": 1.02862966, "epoch": 0.7435820357121385, "flos": 27377576928000.0, "grad_norm": 1.7930683691725195, "language_loss": 0.76329309, "learning_rate": 6.508149799851932e-07, "loss": 0.78442705, "num_input_tokens_seen": 132919505, "step": 6184, "time_per_iteration": 2.7939343452453613 }, { "auxiliary_loss_clip": 0.01100912, "auxiliary_loss_mlp": 0.01035845, "balance_loss_clip": 1.03781211, "balance_loss_mlp": 1.021492, "epoch": 0.7437022786027776, "flos": 23987645948160.0, "grad_norm": 3.033774377469854, "language_loss": 0.61201024, "learning_rate": 6.502400530069183e-07, "loss": 0.63337785, "num_input_tokens_seen": 132939390, "step": 6185, "time_per_iteration": 2.6482014656066895 }, { "auxiliary_loss_clip": 0.01102439, "auxiliary_loss_mlp": 0.01042537, "balance_loss_clip": 1.04193771, "balance_loss_mlp": 1.02597904, "epoch": 0.7438225214934167, "flos": 21866761451520.0, "grad_norm": 1.7185782954480304, "language_loss": 0.6852119, "learning_rate": 6.496653307861535e-07, "loss": 0.7066617, "num_input_tokens_seen": 132960060, "step": 6186, "time_per_iteration": 2.7798004150390625 }, { "auxiliary_loss_clip": 0.01123435, "auxiliary_loss_mlp": 0.0104277, "balance_loss_clip": 1.04324174, "balance_loss_mlp": 1.02597356, "epoch": 0.7439427643840558, "flos": 20230097224320.0, "grad_norm": 1.6550981569000816, "language_loss": 0.65589452, "learning_rate": 6.490908134100857e-07, "loss": 0.67755663, "num_input_tokens_seen": 132978525, "step": 6187, "time_per_iteration": 2.635986804962158 }, { "auxiliary_loss_clip": 0.01125277, "auxiliary_loss_mlp": 0.0103686, "balance_loss_clip": 1.04479158, "balance_loss_mlp": 1.02228081, "epoch": 0.7440630072746949, "flos": 20849915335680.0, "grad_norm": 2.3756047173300923, "language_loss": 0.69194508, "learning_rate": 6.48516500965866e-07, "loss": 0.71356642, "num_input_tokens_seen": 132998460, "step": 6188, "time_per_iteration": 2.6637816429138184 }, { "auxiliary_loss_clip": 0.01125196, "auxiliary_loss_mlp": 0.01032864, "balance_loss_clip": 1.04128516, "balance_loss_mlp": 1.01674747, "epoch": 0.7441832501653339, "flos": 26503762769280.0, "grad_norm": 1.8509417409266873, "language_loss": 0.81820381, "learning_rate": 6.479423935406192e-07, "loss": 0.8397845, "num_input_tokens_seen": 133018445, "step": 6189, "time_per_iteration": 2.6564528942108154 }, { "auxiliary_loss_clip": 0.01015007, "auxiliary_loss_mlp": 0.01007256, "balance_loss_clip": 1.00957787, "balance_loss_mlp": 1.00569475, "epoch": 0.7443034930559731, "flos": 68602848088320.0, "grad_norm": 0.7999147566813679, "language_loss": 0.62045032, "learning_rate": 6.473684912214357e-07, "loss": 0.64067298, "num_input_tokens_seen": 133082005, "step": 6190, "time_per_iteration": 3.3611690998077393 }, { "auxiliary_loss_clip": 0.01126625, "auxiliary_loss_mlp": 0.01040981, "balance_loss_clip": 1.04671943, "balance_loss_mlp": 1.02565074, "epoch": 0.7444237359466122, "flos": 18654982951680.0, "grad_norm": 1.9131153231177402, "language_loss": 0.69584495, "learning_rate": 6.467947940953778e-07, "loss": 0.71752107, "num_input_tokens_seen": 133100530, "step": 6191, "time_per_iteration": 2.6263368129730225 }, { "auxiliary_loss_clip": 0.01106167, "auxiliary_loss_mlp": 0.0103579, "balance_loss_clip": 1.03966498, "balance_loss_mlp": 1.0208174, "epoch": 0.7445439788372512, "flos": 22817604326400.0, "grad_norm": 2.0475298679873224, "language_loss": 0.72671819, "learning_rate": 6.462213022494732e-07, "loss": 0.74813777, "num_input_tokens_seen": 133119775, "step": 6192, "time_per_iteration": 2.69624400138855 }, { "auxiliary_loss_clip": 0.01028792, "auxiliary_loss_mlp": 0.01000658, "balance_loss_clip": 1.0091089, "balance_loss_mlp": 0.99928111, "epoch": 0.7446642217278904, "flos": 67045690615680.0, "grad_norm": 0.7704237312933293, "language_loss": 0.61022234, "learning_rate": 6.456480157707201e-07, "loss": 0.63051689, "num_input_tokens_seen": 133184550, "step": 6193, "time_per_iteration": 3.130242347717285 }, { "auxiliary_loss_clip": 0.01090744, "auxiliary_loss_mlp": 0.01033712, "balance_loss_clip": 1.03947592, "balance_loss_mlp": 1.01797628, "epoch": 0.7447844646185294, "flos": 17417465631360.0, "grad_norm": 2.4059199417241444, "language_loss": 0.85022837, "learning_rate": 6.450749347460866e-07, "loss": 0.8714729, "num_input_tokens_seen": 133201525, "step": 6194, "time_per_iteration": 3.8906424045562744 }, { "auxiliary_loss_clip": 0.01137151, "auxiliary_loss_mlp": 0.0104045, "balance_loss_clip": 1.04547787, "balance_loss_mlp": 1.02367735, "epoch": 0.7449047075091685, "flos": 26615876094720.0, "grad_norm": 2.278942058992147, "language_loss": 0.78846669, "learning_rate": 6.445020592625083e-07, "loss": 0.81024265, "num_input_tokens_seen": 133222175, "step": 6195, "time_per_iteration": 2.6517934799194336 }, { "auxiliary_loss_clip": 0.01133809, "auxiliary_loss_mlp": 0.01040825, "balance_loss_clip": 1.0440594, "balance_loss_mlp": 1.02462459, "epoch": 0.7450249503998077, "flos": 14170458867840.0, "grad_norm": 2.085052344069992, "language_loss": 0.80585736, "learning_rate": 6.4392938940689e-07, "loss": 0.8276037, "num_input_tokens_seen": 133237590, "step": 6196, "time_per_iteration": 3.5135457515716553 }, { "auxiliary_loss_clip": 0.01081144, "auxiliary_loss_mlp": 0.00773943, "balance_loss_clip": 1.03873527, "balance_loss_mlp": 1.00052023, "epoch": 0.7451451932904467, "flos": 19606687752960.0, "grad_norm": 2.6327256602977567, "language_loss": 0.71484339, "learning_rate": 6.433569252661049e-07, "loss": 0.73339427, "num_input_tokens_seen": 133255590, "step": 6197, "time_per_iteration": 2.7255027294158936 }, { "auxiliary_loss_clip": 0.01088031, "auxiliary_loss_mlp": 0.01038255, "balance_loss_clip": 1.03812504, "balance_loss_mlp": 1.02442098, "epoch": 0.7452654361810858, "flos": 12495405980160.0, "grad_norm": 1.8240063292624846, "language_loss": 0.71462989, "learning_rate": 6.427846669269952e-07, "loss": 0.73589271, "num_input_tokens_seen": 133273210, "step": 6198, "time_per_iteration": 3.652162551879883 }, { "auxiliary_loss_clip": 0.01134404, "auxiliary_loss_mlp": 0.01038012, "balance_loss_clip": 1.04626548, "balance_loss_mlp": 1.02294385, "epoch": 0.7453856790717249, "flos": 22127329687680.0, "grad_norm": 2.0260021902023215, "language_loss": 0.82512808, "learning_rate": 6.422126144763729e-07, "loss": 0.8468523, "num_input_tokens_seen": 133292600, "step": 6199, "time_per_iteration": 2.5881361961364746 }, { "auxiliary_loss_clip": 0.01097438, "auxiliary_loss_mlp": 0.00772731, "balance_loss_clip": 1.03876877, "balance_loss_mlp": 1.00040865, "epoch": 0.745505921962364, "flos": 20010682995840.0, "grad_norm": 3.8491738007346505, "language_loss": 0.76652259, "learning_rate": 6.416407680010174e-07, "loss": 0.78522426, "num_input_tokens_seen": 133306960, "step": 6200, "time_per_iteration": 2.696105718612671 }, { "auxiliary_loss_clip": 0.01095176, "auxiliary_loss_mlp": 0.0103909, "balance_loss_clip": 1.04242623, "balance_loss_mlp": 1.02372384, "epoch": 0.745626164853003, "flos": 24677884673280.0, "grad_norm": 2.049251486108895, "language_loss": 0.80953264, "learning_rate": 6.410691275876774e-07, "loss": 0.83087534, "num_input_tokens_seen": 133326380, "step": 6201, "time_per_iteration": 3.6047556400299072 }, { "auxiliary_loss_clip": 0.01114217, "auxiliary_loss_mlp": 0.01040805, "balance_loss_clip": 1.04337382, "balance_loss_mlp": 1.02506888, "epoch": 0.7457464077436422, "flos": 14538830797440.0, "grad_norm": 2.7425764735004954, "language_loss": 0.76803452, "learning_rate": 6.404976933230704e-07, "loss": 0.7895847, "num_input_tokens_seen": 133342900, "step": 6202, "time_per_iteration": 2.611299514770508 }, { "auxiliary_loss_clip": 0.01115375, "auxiliary_loss_mlp": 0.01038385, "balance_loss_clip": 1.04341698, "balance_loss_mlp": 1.02124286, "epoch": 0.7458666506342813, "flos": 34021194600960.0, "grad_norm": 1.8279305492653652, "language_loss": 0.72918004, "learning_rate": 6.399264652938813e-07, "loss": 0.75071764, "num_input_tokens_seen": 133363805, "step": 6203, "time_per_iteration": 2.7686681747436523 }, { "auxiliary_loss_clip": 0.01104801, "auxiliary_loss_mlp": 0.01032934, "balance_loss_clip": 1.03921235, "balance_loss_mlp": 1.01755607, "epoch": 0.7459868935249203, "flos": 24279025075200.0, "grad_norm": 1.9956126501609088, "language_loss": 0.7444573, "learning_rate": 6.393554435867679e-07, "loss": 0.76583469, "num_input_tokens_seen": 133384655, "step": 6204, "time_per_iteration": 2.688997983932495 }, { "auxiliary_loss_clip": 0.01090907, "auxiliary_loss_mlp": 0.01047227, "balance_loss_clip": 1.03723729, "balance_loss_mlp": 1.03009653, "epoch": 0.7461071364155595, "flos": 21908777385600.0, "grad_norm": 1.9091782299964468, "language_loss": 0.83604002, "learning_rate": 6.387846282883502e-07, "loss": 0.8574214, "num_input_tokens_seen": 133401185, "step": 6205, "time_per_iteration": 2.6451241970062256 }, { "auxiliary_loss_clip": 0.01132698, "auxiliary_loss_mlp": 0.0103643, "balance_loss_clip": 1.04269075, "balance_loss_mlp": 1.0212965, "epoch": 0.7462273793061985, "flos": 22889712879360.0, "grad_norm": 4.015157973301342, "language_loss": 0.76819444, "learning_rate": 6.38214019485223e-07, "loss": 0.7898857, "num_input_tokens_seen": 133420010, "step": 6206, "time_per_iteration": 2.615291118621826 }, { "auxiliary_loss_clip": 0.01067361, "auxiliary_loss_mlp": 0.01036801, "balance_loss_clip": 1.03530264, "balance_loss_mlp": 1.02094674, "epoch": 0.7463476221968376, "flos": 19968451580160.0, "grad_norm": 2.3204678385871476, "language_loss": 0.71617442, "learning_rate": 6.376436172639461e-07, "loss": 0.73721606, "num_input_tokens_seen": 133437855, "step": 6207, "time_per_iteration": 2.7618296146392822 }, { "auxiliary_loss_clip": 0.01058157, "auxiliary_loss_mlp": 0.0105299, "balance_loss_clip": 1.03604448, "balance_loss_mlp": 1.03466797, "epoch": 0.7464678650874768, "flos": 16836610798080.0, "grad_norm": 2.8650776974823837, "language_loss": 0.64499229, "learning_rate": 6.370734217110487e-07, "loss": 0.66610372, "num_input_tokens_seen": 133456600, "step": 6208, "time_per_iteration": 2.8009514808654785 }, { "auxiliary_loss_clip": 0.01113675, "auxiliary_loss_mlp": 0.01042341, "balance_loss_clip": 1.04271531, "balance_loss_mlp": 1.02501976, "epoch": 0.7465881079781158, "flos": 48100869843840.0, "grad_norm": 1.425561155922422, "language_loss": 0.64431816, "learning_rate": 6.36503432913031e-07, "loss": 0.6658783, "num_input_tokens_seen": 133479745, "step": 6209, "time_per_iteration": 3.1448867321014404 }, { "auxiliary_loss_clip": 0.01120927, "auxiliary_loss_mlp": 0.01037775, "balance_loss_clip": 1.04274583, "balance_loss_mlp": 1.02127624, "epoch": 0.7467083508687549, "flos": 19677359761920.0, "grad_norm": 2.322293879083247, "language_loss": 0.68497509, "learning_rate": 6.359336509563569e-07, "loss": 0.7065621, "num_input_tokens_seen": 133495765, "step": 6210, "time_per_iteration": 2.5987493991851807 }, { "auxiliary_loss_clip": 0.01083691, "auxiliary_loss_mlp": 0.01034912, "balance_loss_clip": 1.03737235, "balance_loss_mlp": 1.01823449, "epoch": 0.7468285937593939, "flos": 17895436934400.0, "grad_norm": 1.9306478794880306, "language_loss": 0.80976892, "learning_rate": 6.353640759274641e-07, "loss": 0.83095491, "num_input_tokens_seen": 133514655, "step": 6211, "time_per_iteration": 2.659921646118164 }, { "auxiliary_loss_clip": 0.01118118, "auxiliary_loss_mlp": 0.01040285, "balance_loss_clip": 1.03927326, "balance_loss_mlp": 1.02452517, "epoch": 0.7469488366500331, "flos": 23141446369920.0, "grad_norm": 2.6541484743275876, "language_loss": 0.75063026, "learning_rate": 6.347947079127556e-07, "loss": 0.77221429, "num_input_tokens_seen": 133532555, "step": 6212, "time_per_iteration": 2.5841851234436035 }, { "auxiliary_loss_clip": 0.01104427, "auxiliary_loss_mlp": 0.01044931, "balance_loss_clip": 1.04056394, "balance_loss_mlp": 1.02840853, "epoch": 0.7470690795406721, "flos": 16690849407360.0, "grad_norm": 7.033985634078085, "language_loss": 0.76828074, "learning_rate": 6.342255469986053e-07, "loss": 0.7897743, "num_input_tokens_seen": 133551300, "step": 6213, "time_per_iteration": 2.65498685836792 }, { "auxiliary_loss_clip": 0.01132573, "auxiliary_loss_mlp": 0.01034785, "balance_loss_clip": 1.04352391, "balance_loss_mlp": 1.0189538, "epoch": 0.7471893224313112, "flos": 25192700352000.0, "grad_norm": 1.8996720410083612, "language_loss": 0.76528448, "learning_rate": 6.336565932713533e-07, "loss": 0.7869581, "num_input_tokens_seen": 133570725, "step": 6214, "time_per_iteration": 2.6197264194488525 }, { "auxiliary_loss_clip": 0.01105176, "auxiliary_loss_mlp": 0.01041348, "balance_loss_clip": 1.04127884, "balance_loss_mlp": 1.02588701, "epoch": 0.7473095653219504, "flos": 22526225199360.0, "grad_norm": 2.934895474484184, "language_loss": 0.77938437, "learning_rate": 6.330878468173088e-07, "loss": 0.80084962, "num_input_tokens_seen": 133590790, "step": 6215, "time_per_iteration": 2.634871482849121 }, { "auxiliary_loss_clip": 0.01115269, "auxiliary_loss_mlp": 0.01040562, "balance_loss_clip": 1.04243052, "balance_loss_mlp": 1.02455223, "epoch": 0.7474298082125894, "flos": 18113989236480.0, "grad_norm": 2.419745723350803, "language_loss": 0.72628289, "learning_rate": 6.32519307722752e-07, "loss": 0.74784124, "num_input_tokens_seen": 133608685, "step": 6216, "time_per_iteration": 2.612574338912964 }, { "auxiliary_loss_clip": 0.01007914, "auxiliary_loss_mlp": 0.01004917, "balance_loss_clip": 1.01261508, "balance_loss_mlp": 1.00334346, "epoch": 0.7475500511032285, "flos": 62086535193600.0, "grad_norm": 0.8202249456952245, "language_loss": 0.54953843, "learning_rate": 6.31950976073929e-07, "loss": 0.56966674, "num_input_tokens_seen": 133662775, "step": 6217, "time_per_iteration": 3.2027649879455566 }, { "auxiliary_loss_clip": 0.01073766, "auxiliary_loss_mlp": 0.01035538, "balance_loss_clip": 1.03639746, "balance_loss_mlp": 1.01953995, "epoch": 0.7476702939938676, "flos": 17785586165760.0, "grad_norm": 2.0781413465321394, "language_loss": 0.81228787, "learning_rate": 6.31382851957055e-07, "loss": 0.83338094, "num_input_tokens_seen": 133679595, "step": 6218, "time_per_iteration": 2.6972198486328125 }, { "auxiliary_loss_clip": 0.0108988, "auxiliary_loss_mlp": 0.00771497, "balance_loss_clip": 1.03908324, "balance_loss_mlp": 1.00050199, "epoch": 0.7477905368845067, "flos": 27927944092800.0, "grad_norm": 2.1219528659217404, "language_loss": 0.71604568, "learning_rate": 6.308149354583143e-07, "loss": 0.73465943, "num_input_tokens_seen": 133699000, "step": 6219, "time_per_iteration": 2.7258424758911133 }, { "auxiliary_loss_clip": 0.01126422, "auxiliary_loss_mlp": 0.01041158, "balance_loss_clip": 1.04515576, "balance_loss_mlp": 1.02415919, "epoch": 0.7479107797751458, "flos": 26870374932480.0, "grad_norm": 2.1390282976485024, "language_loss": 0.81610334, "learning_rate": 6.302472266638586e-07, "loss": 0.83777916, "num_input_tokens_seen": 133719540, "step": 6220, "time_per_iteration": 3.5774261951446533 }, { "auxiliary_loss_clip": 0.01141114, "auxiliary_loss_mlp": 0.0104149, "balance_loss_clip": 1.04570317, "balance_loss_mlp": 1.02363217, "epoch": 0.7480310226657849, "flos": 33943375785600.0, "grad_norm": 2.2226010804544143, "language_loss": 0.70148408, "learning_rate": 6.296797256598101e-07, "loss": 0.72331011, "num_input_tokens_seen": 133741020, "step": 6221, "time_per_iteration": 2.641066074371338 }, { "auxiliary_loss_clip": 0.01087046, "auxiliary_loss_mlp": 0.01035873, "balance_loss_clip": 1.03845811, "balance_loss_mlp": 1.02088881, "epoch": 0.748151265556424, "flos": 24826555065600.0, "grad_norm": 2.150336436616979, "language_loss": 0.81154317, "learning_rate": 6.291124325322576e-07, "loss": 0.83277237, "num_input_tokens_seen": 133761145, "step": 6222, "time_per_iteration": 3.678253173828125 }, { "auxiliary_loss_clip": 0.01112332, "auxiliary_loss_mlp": 0.01036079, "balance_loss_clip": 1.04264033, "balance_loss_mlp": 1.02071261, "epoch": 0.748271508447063, "flos": 38399351535360.0, "grad_norm": 1.683265197573917, "language_loss": 0.62624717, "learning_rate": 6.285453473672595e-07, "loss": 0.64773118, "num_input_tokens_seen": 133783715, "step": 6223, "time_per_iteration": 2.7852604389190674 }, { "auxiliary_loss_clip": 0.0113191, "auxiliary_loss_mlp": 0.01037737, "balance_loss_clip": 1.04341483, "balance_loss_mlp": 1.02292478, "epoch": 0.7483917513377022, "flos": 21541842000000.0, "grad_norm": 2.2686355652711345, "language_loss": 0.7545318, "learning_rate": 6.279784702508415e-07, "loss": 0.77622825, "num_input_tokens_seen": 133804465, "step": 6224, "time_per_iteration": 3.5978472232818604 }, { "auxiliary_loss_clip": 0.01012662, "auxiliary_loss_mlp": 0.0100172, "balance_loss_clip": 1.01080036, "balance_loss_mlp": 1.00015855, "epoch": 0.7485119942283412, "flos": 62314532772480.0, "grad_norm": 0.7848266377681833, "language_loss": 0.58572936, "learning_rate": 6.274118012689979e-07, "loss": 0.60587323, "num_input_tokens_seen": 133866365, "step": 6225, "time_per_iteration": 3.3704004287719727 }, { "auxiliary_loss_clip": 0.01096941, "auxiliary_loss_mlp": 0.01043076, "balance_loss_clip": 1.03688622, "balance_loss_mlp": 1.02627933, "epoch": 0.7486322371189803, "flos": 29937613104000.0, "grad_norm": 1.4996131920533096, "language_loss": 0.68192673, "learning_rate": 6.268453405076943e-07, "loss": 0.70332688, "num_input_tokens_seen": 133888760, "step": 6226, "time_per_iteration": 3.5461878776550293 }, { "auxiliary_loss_clip": 0.01106938, "auxiliary_loss_mlp": 0.01041572, "balance_loss_clip": 1.03956747, "balance_loss_mlp": 1.02640843, "epoch": 0.7487524800096195, "flos": 18949414734720.0, "grad_norm": 2.977059539743509, "language_loss": 0.82274646, "learning_rate": 6.262790880528592e-07, "loss": 0.84423161, "num_input_tokens_seen": 133906380, "step": 6227, "time_per_iteration": 2.536071538925171 }, { "auxiliary_loss_clip": 0.01105035, "auxiliary_loss_mlp": 0.01044203, "balance_loss_clip": 1.03874862, "balance_loss_mlp": 1.02704859, "epoch": 0.7488727229002585, "flos": 18697393935360.0, "grad_norm": 2.483873095580833, "language_loss": 0.79400659, "learning_rate": 6.257130439903951e-07, "loss": 0.81549907, "num_input_tokens_seen": 133922875, "step": 6228, "time_per_iteration": 2.5663933753967285 }, { "auxiliary_loss_clip": 0.0113479, "auxiliary_loss_mlp": 0.01041597, "balance_loss_clip": 1.04542863, "balance_loss_mlp": 1.02503848, "epoch": 0.7489929657908976, "flos": 23623368168960.0, "grad_norm": 1.8136053672646961, "language_loss": 0.81207371, "learning_rate": 6.251472084061695e-07, "loss": 0.83383763, "num_input_tokens_seen": 133941795, "step": 6229, "time_per_iteration": 2.530320644378662 }, { "auxiliary_loss_clip": 0.01120232, "auxiliary_loss_mlp": 0.01039058, "balance_loss_clip": 1.04243231, "balance_loss_mlp": 1.02387047, "epoch": 0.7491132086815367, "flos": 20551533056640.0, "grad_norm": 2.4685690263642535, "language_loss": 0.89265668, "learning_rate": 6.245815813860191e-07, "loss": 0.9142496, "num_input_tokens_seen": 133957305, "step": 6230, "time_per_iteration": 2.589022159576416 }, { "auxiliary_loss_clip": 0.01138396, "auxiliary_loss_mlp": 0.01039888, "balance_loss_clip": 1.04370141, "balance_loss_mlp": 1.02365148, "epoch": 0.7492334515721758, "flos": 23003011353600.0, "grad_norm": 2.3619628086687587, "language_loss": 0.70114553, "learning_rate": 6.240161630157495e-07, "loss": 0.72292835, "num_input_tokens_seen": 133976660, "step": 6231, "time_per_iteration": 2.5793395042419434 }, { "auxiliary_loss_clip": 0.01135133, "auxiliary_loss_mlp": 0.01037868, "balance_loss_clip": 1.04271483, "balance_loss_mlp": 1.02104759, "epoch": 0.7493536944628149, "flos": 16398823835520.0, "grad_norm": 3.825327650013664, "language_loss": 0.70050275, "learning_rate": 6.23450953381133e-07, "loss": 0.72223282, "num_input_tokens_seen": 133994750, "step": 6232, "time_per_iteration": 2.56052565574646 }, { "auxiliary_loss_clip": 0.01100269, "auxiliary_loss_mlp": 0.01040174, "balance_loss_clip": 1.03929687, "balance_loss_mlp": 1.02480841, "epoch": 0.749473937353454, "flos": 15338561155200.0, "grad_norm": 1.8304271702665622, "language_loss": 0.67889637, "learning_rate": 6.228859525679131e-07, "loss": 0.70030081, "num_input_tokens_seen": 134009165, "step": 6233, "time_per_iteration": 2.6154215335845947 }, { "auxiliary_loss_clip": 0.01119802, "auxiliary_loss_mlp": 0.01037849, "balance_loss_clip": 1.04144371, "balance_loss_mlp": 1.02391934, "epoch": 0.7495941802440931, "flos": 18951138587520.0, "grad_norm": 4.3505542535671795, "language_loss": 0.79981333, "learning_rate": 6.223211606617986e-07, "loss": 0.82138985, "num_input_tokens_seen": 134027585, "step": 6234, "time_per_iteration": 2.5706429481506348 }, { "auxiliary_loss_clip": 0.01118107, "auxiliary_loss_mlp": 0.0103179, "balance_loss_clip": 1.04652739, "balance_loss_mlp": 1.01762211, "epoch": 0.7497144231347321, "flos": 22492469393280.0, "grad_norm": 1.9860334455870656, "language_loss": 0.84113312, "learning_rate": 6.217565777484701e-07, "loss": 0.86263204, "num_input_tokens_seen": 134046680, "step": 6235, "time_per_iteration": 2.581563949584961 }, { "auxiliary_loss_clip": 0.01105228, "auxiliary_loss_mlp": 0.00771917, "balance_loss_clip": 1.03953505, "balance_loss_mlp": 1.00051665, "epoch": 0.7498346660253713, "flos": 24243509502720.0, "grad_norm": 1.9369762527264562, "language_loss": 0.80435538, "learning_rate": 6.211922039135722e-07, "loss": 0.82312691, "num_input_tokens_seen": 134066825, "step": 6236, "time_per_iteration": 2.6938469409942627 }, { "auxiliary_loss_clip": 0.0113503, "auxiliary_loss_mlp": 0.01037327, "balance_loss_clip": 1.04482484, "balance_loss_mlp": 1.02224088, "epoch": 0.7499549089160104, "flos": 24387080163840.0, "grad_norm": 2.0262879568707852, "language_loss": 0.81023663, "learning_rate": 6.206280392427201e-07, "loss": 0.8319602, "num_input_tokens_seen": 134086410, "step": 6237, "time_per_iteration": 2.5757205486297607 }, { "auxiliary_loss_clip": 0.01112568, "auxiliary_loss_mlp": 0.01036197, "balance_loss_clip": 1.04032469, "balance_loss_mlp": 1.02126026, "epoch": 0.7500751518066494, "flos": 34057320704640.0, "grad_norm": 1.703828565879046, "language_loss": 0.73682523, "learning_rate": 6.200640838214983e-07, "loss": 0.75831288, "num_input_tokens_seen": 134109185, "step": 6238, "time_per_iteration": 2.7158255577087402 }, { "auxiliary_loss_clip": 0.01131004, "auxiliary_loss_mlp": 0.01043154, "balance_loss_clip": 1.04279232, "balance_loss_mlp": 1.02825856, "epoch": 0.7501953946972886, "flos": 18843586289280.0, "grad_norm": 2.35813942287924, "language_loss": 0.67121279, "learning_rate": 6.195003377354578e-07, "loss": 0.6929543, "num_input_tokens_seen": 134128455, "step": 6239, "time_per_iteration": 2.567168712615967 }, { "auxiliary_loss_clip": 0.01117968, "auxiliary_loss_mlp": 0.01043777, "balance_loss_clip": 1.04273844, "balance_loss_mlp": 1.02661109, "epoch": 0.7503156375879276, "flos": 20257675891200.0, "grad_norm": 2.382276093211071, "language_loss": 0.73339868, "learning_rate": 6.189368010701183e-07, "loss": 0.75501609, "num_input_tokens_seen": 134145515, "step": 6240, "time_per_iteration": 2.55727219581604 }, { "auxiliary_loss_clip": 0.01126416, "auxiliary_loss_mlp": 0.01037707, "balance_loss_clip": 1.04172492, "balance_loss_mlp": 1.0219121, "epoch": 0.7504358804785667, "flos": 13480040574720.0, "grad_norm": 2.05021322401866, "language_loss": 0.76382524, "learning_rate": 6.183734739109683e-07, "loss": 0.78546649, "num_input_tokens_seen": 134163335, "step": 6241, "time_per_iteration": 2.598867893218994 }, { "auxiliary_loss_clip": 0.01133662, "auxiliary_loss_mlp": 0.01043565, "balance_loss_clip": 1.04700112, "balance_loss_mlp": 1.02650595, "epoch": 0.7505561233692057, "flos": 29461042431360.0, "grad_norm": 3.4002740235633473, "language_loss": 0.68667877, "learning_rate": 6.178103563434629e-07, "loss": 0.70845103, "num_input_tokens_seen": 134182335, "step": 6242, "time_per_iteration": 2.6788761615753174 }, { "auxiliary_loss_clip": 0.0113417, "auxiliary_loss_mlp": 0.01040378, "balance_loss_clip": 1.0444541, "balance_loss_mlp": 1.02465391, "epoch": 0.7506763662598449, "flos": 20302457172480.0, "grad_norm": 1.5969422099764017, "language_loss": 0.83635688, "learning_rate": 6.172474484530283e-07, "loss": 0.85810238, "num_input_tokens_seen": 134201070, "step": 6243, "time_per_iteration": 2.583439588546753 }, { "auxiliary_loss_clip": 0.01098503, "auxiliary_loss_mlp": 0.01048734, "balance_loss_clip": 1.03655028, "balance_loss_mlp": 1.03081667, "epoch": 0.750796609150484, "flos": 37230961939200.0, "grad_norm": 2.370026326102004, "language_loss": 0.75886285, "learning_rate": 6.166847503250563e-07, "loss": 0.78033531, "num_input_tokens_seen": 134223310, "step": 6244, "time_per_iteration": 2.7233967781066895 }, { "auxiliary_loss_clip": 0.01107142, "auxiliary_loss_mlp": 0.01034899, "balance_loss_clip": 1.03893638, "balance_loss_mlp": 1.01931882, "epoch": 0.750916852041123, "flos": 19609417186560.0, "grad_norm": 2.643372077386158, "language_loss": 0.79068589, "learning_rate": 6.161222620449078e-07, "loss": 0.81210631, "num_input_tokens_seen": 134242085, "step": 6245, "time_per_iteration": 2.622847080230713 }, { "auxiliary_loss_clip": 0.01103033, "auxiliary_loss_mlp": 0.01034722, "balance_loss_clip": 1.04316688, "balance_loss_mlp": 1.01928401, "epoch": 0.7510370949317622, "flos": 25112690807040.0, "grad_norm": 2.3492543466269518, "language_loss": 0.80102038, "learning_rate": 6.155599836979117e-07, "loss": 0.82239795, "num_input_tokens_seen": 134260770, "step": 6246, "time_per_iteration": 3.6794774532318115 }, { "auxiliary_loss_clip": 0.01085817, "auxiliary_loss_mlp": 0.01042959, "balance_loss_clip": 1.03776097, "balance_loss_mlp": 1.02536964, "epoch": 0.7511573378224012, "flos": 19062282245760.0, "grad_norm": 2.4196463387096427, "language_loss": 0.81844544, "learning_rate": 6.149979153693649e-07, "loss": 0.83973324, "num_input_tokens_seen": 134278025, "step": 6247, "time_per_iteration": 3.6603798866271973 }, { "auxiliary_loss_clip": 0.01119385, "auxiliary_loss_mlp": 0.01048722, "balance_loss_clip": 1.04182386, "balance_loss_mlp": 1.03068578, "epoch": 0.7512775807130403, "flos": 19937676602880.0, "grad_norm": 2.0849620904037596, "language_loss": 0.76857889, "learning_rate": 6.144360571445343e-07, "loss": 0.79026008, "num_input_tokens_seen": 134297170, "step": 6248, "time_per_iteration": 2.5920064449310303 }, { "auxiliary_loss_clip": 0.01115876, "auxiliary_loss_mlp": 0.01039184, "balance_loss_clip": 1.04109478, "balance_loss_mlp": 1.02425909, "epoch": 0.7513978236036795, "flos": 20739920912640.0, "grad_norm": 1.8703219529149406, "language_loss": 0.800699, "learning_rate": 6.138744091086509e-07, "loss": 0.82224965, "num_input_tokens_seen": 134316755, "step": 6249, "time_per_iteration": 2.6045544147491455 }, { "auxiliary_loss_clip": 0.01106292, "auxiliary_loss_mlp": 0.0103979, "balance_loss_clip": 1.04566121, "balance_loss_mlp": 1.02426863, "epoch": 0.7515180664943185, "flos": 27563163523200.0, "grad_norm": 2.2623947744947355, "language_loss": 0.72803909, "learning_rate": 6.133129713469183e-07, "loss": 0.74949992, "num_input_tokens_seen": 134335960, "step": 6250, "time_per_iteration": 2.716479539871216 }, { "auxiliary_loss_clip": 0.01103994, "auxiliary_loss_mlp": 0.01036181, "balance_loss_clip": 1.04084229, "balance_loss_mlp": 1.0203141, "epoch": 0.7516383093849576, "flos": 33803181002880.0, "grad_norm": 2.2771913232691756, "language_loss": 0.64193189, "learning_rate": 6.127517439445053e-07, "loss": 0.66333354, "num_input_tokens_seen": 134356805, "step": 6251, "time_per_iteration": 3.7308003902435303 }, { "auxiliary_loss_clip": 0.01076808, "auxiliary_loss_mlp": 0.01037854, "balance_loss_clip": 1.03920603, "balance_loss_mlp": 1.02313709, "epoch": 0.7517585522755967, "flos": 29746172592000.0, "grad_norm": 2.386900861337832, "language_loss": 0.8177253, "learning_rate": 6.121907269865498e-07, "loss": 0.83887196, "num_input_tokens_seen": 134376295, "step": 6252, "time_per_iteration": 3.5912160873413086 }, { "auxiliary_loss_clip": 0.01009999, "auxiliary_loss_mlp": 0.01004879, "balance_loss_clip": 1.010252, "balance_loss_mlp": 1.00344896, "epoch": 0.7518787951662358, "flos": 69807974319360.0, "grad_norm": 0.9277545959281761, "language_loss": 0.67283481, "learning_rate": 6.116299205581577e-07, "loss": 0.69298363, "num_input_tokens_seen": 134431125, "step": 6253, "time_per_iteration": 3.2102463245391846 }, { "auxiliary_loss_clip": 0.01142113, "auxiliary_loss_mlp": 0.01039875, "balance_loss_clip": 1.04669356, "balance_loss_mlp": 1.0222199, "epoch": 0.7519990380568748, "flos": 34203225749760.0, "grad_norm": 1.9072127717478777, "language_loss": 0.68571246, "learning_rate": 6.110693247444018e-07, "loss": 0.70753229, "num_input_tokens_seen": 134452960, "step": 6254, "time_per_iteration": 2.6449403762817383 }, { "auxiliary_loss_clip": 0.01085074, "auxiliary_loss_mlp": 0.01037747, "balance_loss_clip": 1.03840137, "balance_loss_mlp": 1.02300656, "epoch": 0.752119280947514, "flos": 21725704742400.0, "grad_norm": 2.4086354147893303, "language_loss": 0.82471299, "learning_rate": 6.105089396303258e-07, "loss": 0.84594119, "num_input_tokens_seen": 134471350, "step": 6255, "time_per_iteration": 2.612287759780884 }, { "auxiliary_loss_clip": 0.01109462, "auxiliary_loss_mlp": 0.01043712, "balance_loss_clip": 1.04026842, "balance_loss_mlp": 1.02666521, "epoch": 0.7522395238381531, "flos": 32742774668160.0, "grad_norm": 1.7536620690779803, "language_loss": 0.7577399, "learning_rate": 6.099487653009383e-07, "loss": 0.77927166, "num_input_tokens_seen": 134490695, "step": 6256, "time_per_iteration": 2.6930601596832275 }, { "auxiliary_loss_clip": 0.01118162, "auxiliary_loss_mlp": 0.01036623, "balance_loss_clip": 1.04163694, "balance_loss_mlp": 1.02314043, "epoch": 0.7523597667287921, "flos": 23476026579840.0, "grad_norm": 8.936797551328144, "language_loss": 0.82959008, "learning_rate": 6.093888018412192e-07, "loss": 0.85113794, "num_input_tokens_seen": 134506885, "step": 6257, "time_per_iteration": 2.6091649532318115 }, { "auxiliary_loss_clip": 0.01028503, "auxiliary_loss_mlp": 0.01001322, "balance_loss_clip": 1.00921011, "balance_loss_mlp": 0.99985617, "epoch": 0.7524800096194313, "flos": 67346730501120.0, "grad_norm": 0.7089805004400221, "language_loss": 0.54613012, "learning_rate": 6.088290493361125e-07, "loss": 0.56642842, "num_input_tokens_seen": 134571770, "step": 6258, "time_per_iteration": 3.298956871032715 }, { "auxiliary_loss_clip": 0.01074682, "auxiliary_loss_mlp": 0.01034649, "balance_loss_clip": 1.03660667, "balance_loss_mlp": 1.01879406, "epoch": 0.7526002525100703, "flos": 13006055681280.0, "grad_norm": 2.391461786938548, "language_loss": 0.71654731, "learning_rate": 6.082695078705322e-07, "loss": 0.73764062, "num_input_tokens_seen": 134589250, "step": 6259, "time_per_iteration": 2.646752119064331 }, { "auxiliary_loss_clip": 0.01115656, "auxiliary_loss_mlp": 0.0104778, "balance_loss_clip": 1.04110444, "balance_loss_mlp": 1.03143609, "epoch": 0.7527204954007094, "flos": 21397229844480.0, "grad_norm": 2.514428587957104, "language_loss": 0.68988872, "learning_rate": 6.077101775293618e-07, "loss": 0.71152306, "num_input_tokens_seen": 134608075, "step": 6260, "time_per_iteration": 2.5713276863098145 }, { "auxiliary_loss_clip": 0.01124325, "auxiliary_loss_mlp": 0.01036569, "balance_loss_clip": 1.04273772, "balance_loss_mlp": 1.0190568, "epoch": 0.7528407382913486, "flos": 18947188091520.0, "grad_norm": 3.300329591977715, "language_loss": 0.82772535, "learning_rate": 6.071510583974504e-07, "loss": 0.84933424, "num_input_tokens_seen": 134623260, "step": 6261, "time_per_iteration": 2.5526909828186035 }, { "auxiliary_loss_clip": 0.01134945, "auxiliary_loss_mlp": 0.01045047, "balance_loss_clip": 1.04329228, "balance_loss_mlp": 1.02945435, "epoch": 0.7529609811819876, "flos": 15231798956160.0, "grad_norm": 2.2703978133946654, "language_loss": 0.72048652, "learning_rate": 6.065921505596161e-07, "loss": 0.74228644, "num_input_tokens_seen": 134641540, "step": 6262, "time_per_iteration": 2.547712802886963 }, { "auxiliary_loss_clip": 0.01099013, "auxiliary_loss_mlp": 0.01035946, "balance_loss_clip": 1.04270422, "balance_loss_mlp": 1.02080679, "epoch": 0.7530812240726267, "flos": 19354487385600.0, "grad_norm": 1.7164497373873886, "language_loss": 0.77096313, "learning_rate": 6.060334541006445e-07, "loss": 0.79231274, "num_input_tokens_seen": 134660035, "step": 6263, "time_per_iteration": 2.589419364929199 }, { "auxiliary_loss_clip": 0.01099286, "auxiliary_loss_mlp": 0.01038962, "balance_loss_clip": 1.03876519, "balance_loss_mlp": 1.02388167, "epoch": 0.7532014669632658, "flos": 27748247328000.0, "grad_norm": 2.135769972356215, "language_loss": 0.69106966, "learning_rate": 6.05474969105289e-07, "loss": 0.71245211, "num_input_tokens_seen": 134683025, "step": 6264, "time_per_iteration": 2.7414824962615967 }, { "auxiliary_loss_clip": 0.01125341, "auxiliary_loss_mlp": 0.01038289, "balance_loss_clip": 1.04344237, "balance_loss_mlp": 1.0214088, "epoch": 0.7533217098539049, "flos": 14137421333760.0, "grad_norm": 2.553908864693579, "language_loss": 0.73650503, "learning_rate": 6.049166956582725e-07, "loss": 0.75814134, "num_input_tokens_seen": 134701290, "step": 6265, "time_per_iteration": 2.5798962116241455 }, { "auxiliary_loss_clip": 0.01116241, "auxiliary_loss_mlp": 0.01039062, "balance_loss_clip": 1.04019713, "balance_loss_mlp": 1.02356482, "epoch": 0.753441952744544, "flos": 26429068437120.0, "grad_norm": 2.113790088603166, "language_loss": 0.876297, "learning_rate": 6.043586338442841e-07, "loss": 0.89784992, "num_input_tokens_seen": 134720345, "step": 6266, "time_per_iteration": 2.5971739292144775 }, { "auxiliary_loss_clip": 0.01129853, "auxiliary_loss_mlp": 0.01032766, "balance_loss_clip": 1.04513526, "balance_loss_mlp": 1.01959932, "epoch": 0.7535621956351831, "flos": 23878621192320.0, "grad_norm": 1.5463243260433315, "language_loss": 0.72908711, "learning_rate": 6.038007837479815e-07, "loss": 0.75071323, "num_input_tokens_seen": 134741450, "step": 6267, "time_per_iteration": 2.58616042137146 }, { "auxiliary_loss_clip": 0.01120854, "auxiliary_loss_mlp": 0.0103503, "balance_loss_clip": 1.0435226, "balance_loss_mlp": 1.02061772, "epoch": 0.7536824385258222, "flos": 21795873960960.0, "grad_norm": 1.9034273497571186, "language_loss": 0.64221299, "learning_rate": 6.032431454539897e-07, "loss": 0.66377181, "num_input_tokens_seen": 134760295, "step": 6268, "time_per_iteration": 2.597501039505005 }, { "auxiliary_loss_clip": 0.01097584, "auxiliary_loss_mlp": 0.01041836, "balance_loss_clip": 1.03935599, "balance_loss_mlp": 1.02610028, "epoch": 0.7538026814164612, "flos": 28911644933760.0, "grad_norm": 1.892745626883597, "language_loss": 0.81475139, "learning_rate": 6.026857190469014e-07, "loss": 0.83614558, "num_input_tokens_seen": 134782050, "step": 6269, "time_per_iteration": 2.7073230743408203 }, { "auxiliary_loss_clip": 0.01111827, "auxiliary_loss_mlp": 0.01043385, "balance_loss_clip": 1.04190373, "balance_loss_mlp": 1.02648091, "epoch": 0.7539229243071004, "flos": 21104701482240.0, "grad_norm": 1.9172979604935068, "language_loss": 0.74105775, "learning_rate": 6.0212850461128e-07, "loss": 0.76260984, "num_input_tokens_seen": 134801170, "step": 6270, "time_per_iteration": 2.629873514175415 }, { "auxiliary_loss_clip": 0.0111028, "auxiliary_loss_mlp": 0.01036341, "balance_loss_clip": 1.04025865, "balance_loss_mlp": 1.01955605, "epoch": 0.7540431671977395, "flos": 15158469340800.0, "grad_norm": 6.613559142410065, "language_loss": 0.74708521, "learning_rate": 6.015715022316516e-07, "loss": 0.76855141, "num_input_tokens_seen": 134819150, "step": 6271, "time_per_iteration": 2.613410472869873 }, { "auxiliary_loss_clip": 0.01083909, "auxiliary_loss_mlp": 0.01044706, "balance_loss_clip": 1.03685355, "balance_loss_mlp": 1.02622902, "epoch": 0.7541634100883785, "flos": 18770579896320.0, "grad_norm": 2.4249904815978973, "language_loss": 0.77881336, "learning_rate": 6.010147119925154e-07, "loss": 0.80009949, "num_input_tokens_seen": 134836905, "step": 6272, "time_per_iteration": 3.617612361907959 }, { "auxiliary_loss_clip": 0.01090241, "auxiliary_loss_mlp": 0.01041084, "balance_loss_clip": 1.03911126, "balance_loss_mlp": 1.0247879, "epoch": 0.7542836529790176, "flos": 20594770053120.0, "grad_norm": 1.9203087704012565, "language_loss": 0.66137856, "learning_rate": 6.004581339783348e-07, "loss": 0.68269181, "num_input_tokens_seen": 134855225, "step": 6273, "time_per_iteration": 3.580029010772705 }, { "auxiliary_loss_clip": 0.01125966, "auxiliary_loss_mlp": 0.01043498, "balance_loss_clip": 1.04227853, "balance_loss_mlp": 1.02694023, "epoch": 0.7544038958696567, "flos": 19095104298240.0, "grad_norm": 2.628724032363869, "language_loss": 0.68647975, "learning_rate": 5.999017682735425e-07, "loss": 0.70817435, "num_input_tokens_seen": 134871615, "step": 6274, "time_per_iteration": 2.580979347229004 }, { "auxiliary_loss_clip": 0.01080076, "auxiliary_loss_mlp": 0.01042659, "balance_loss_clip": 1.03869426, "balance_loss_mlp": 1.02655375, "epoch": 0.7545241387602958, "flos": 31723306859520.0, "grad_norm": 2.050511559674961, "language_loss": 0.66307294, "learning_rate": 5.993456149625387e-07, "loss": 0.6843003, "num_input_tokens_seen": 134892765, "step": 6275, "time_per_iteration": 2.813471794128418 }, { "auxiliary_loss_clip": 0.01089343, "auxiliary_loss_mlp": 0.01038417, "balance_loss_clip": 1.03996038, "balance_loss_mlp": 1.02427292, "epoch": 0.7546443816509348, "flos": 20296495514880.0, "grad_norm": 1.9212602956826283, "language_loss": 0.82515895, "learning_rate": 5.987896741296909e-07, "loss": 0.84643656, "num_input_tokens_seen": 134910505, "step": 6276, "time_per_iteration": 3.5805883407592773 }, { "auxiliary_loss_clip": 0.01102522, "auxiliary_loss_mlp": 0.01044633, "balance_loss_clip": 1.03939188, "balance_loss_mlp": 1.02955317, "epoch": 0.754764624541574, "flos": 23696159080320.0, "grad_norm": 2.4296863547910417, "language_loss": 0.78814477, "learning_rate": 5.982339458593361e-07, "loss": 0.80961633, "num_input_tokens_seen": 134930445, "step": 6277, "time_per_iteration": 2.6528260707855225 }, { "auxiliary_loss_clip": 0.01116246, "auxiliary_loss_mlp": 0.00771894, "balance_loss_clip": 1.04151464, "balance_loss_mlp": 1.00047719, "epoch": 0.7548848674322131, "flos": 25337204766720.0, "grad_norm": 2.3449582513138276, "language_loss": 0.84066701, "learning_rate": 5.976784302357767e-07, "loss": 0.85954845, "num_input_tokens_seen": 134951010, "step": 6278, "time_per_iteration": 3.528228521347046 }, { "auxiliary_loss_clip": 0.01123143, "auxiliary_loss_mlp": 0.01042815, "balance_loss_clip": 1.04325366, "balance_loss_mlp": 1.02673388, "epoch": 0.7550051103228521, "flos": 19573147428480.0, "grad_norm": 2.2149519822311, "language_loss": 0.73517269, "learning_rate": 5.971231273432855e-07, "loss": 0.7568323, "num_input_tokens_seen": 134970495, "step": 6279, "time_per_iteration": 2.5947184562683105 }, { "auxiliary_loss_clip": 0.01034624, "auxiliary_loss_mlp": 0.01001987, "balance_loss_clip": 1.01600313, "balance_loss_mlp": 1.00058007, "epoch": 0.7551253532134913, "flos": 64150068648960.0, "grad_norm": 0.8105842501754792, "language_loss": 0.54552782, "learning_rate": 5.965680372661e-07, "loss": 0.56589389, "num_input_tokens_seen": 135028060, "step": 6280, "time_per_iteration": 3.109396457672119 }, { "auxiliary_loss_clip": 0.0110654, "auxiliary_loss_mlp": 0.01034958, "balance_loss_clip": 1.04170191, "balance_loss_mlp": 1.02165473, "epoch": 0.7552455961041303, "flos": 26067986968320.0, "grad_norm": 1.8486211600121956, "language_loss": 0.56281847, "learning_rate": 5.960131600884266e-07, "loss": 0.5842334, "num_input_tokens_seen": 135047330, "step": 6281, "time_per_iteration": 2.688394784927368 }, { "auxiliary_loss_clip": 0.01099521, "auxiliary_loss_mlp": 0.01036711, "balance_loss_clip": 1.04303408, "balance_loss_mlp": 1.02194047, "epoch": 0.7553658389947694, "flos": 24498223822080.0, "grad_norm": 3.0295847132434255, "language_loss": 0.76337951, "learning_rate": 5.954584958944413e-07, "loss": 0.78474188, "num_input_tokens_seen": 135065995, "step": 6282, "time_per_iteration": 2.672548294067383 }, { "auxiliary_loss_clip": 0.01097469, "auxiliary_loss_mlp": 0.00771983, "balance_loss_clip": 1.03801119, "balance_loss_mlp": 1.00048304, "epoch": 0.7554860818854086, "flos": 21799465320960.0, "grad_norm": 2.0869374132322904, "language_loss": 0.81795692, "learning_rate": 5.949040447682854e-07, "loss": 0.83665144, "num_input_tokens_seen": 135085820, "step": 6283, "time_per_iteration": 2.6845169067382812 }, { "auxiliary_loss_clip": 0.01118305, "auxiliary_loss_mlp": 0.01038345, "balance_loss_clip": 1.04430139, "balance_loss_mlp": 1.02129841, "epoch": 0.7556063247760476, "flos": 16362123114240.0, "grad_norm": 2.135096966639279, "language_loss": 0.685633, "learning_rate": 5.943498067940686e-07, "loss": 0.70719951, "num_input_tokens_seen": 135102845, "step": 6284, "time_per_iteration": 2.5899770259857178 }, { "auxiliary_loss_clip": 0.01096895, "auxiliary_loss_mlp": 0.01041188, "balance_loss_clip": 1.04189849, "balance_loss_mlp": 1.02551174, "epoch": 0.7557265676666867, "flos": 27235155502080.0, "grad_norm": 2.039239163622916, "language_loss": 0.81328595, "learning_rate": 5.937957820558686e-07, "loss": 0.83466673, "num_input_tokens_seen": 135122190, "step": 6285, "time_per_iteration": 2.6596906185150146 }, { "auxiliary_loss_clip": 0.01018764, "auxiliary_loss_mlp": 0.01004619, "balance_loss_clip": 1.00843918, "balance_loss_mlp": 1.00314128, "epoch": 0.7558468105573258, "flos": 62189131415040.0, "grad_norm": 0.8542075137609418, "language_loss": 0.65334505, "learning_rate": 5.932419706377296e-07, "loss": 0.67357886, "num_input_tokens_seen": 135180495, "step": 6286, "time_per_iteration": 3.1726396083831787 }, { "auxiliary_loss_clip": 0.01093196, "auxiliary_loss_mlp": 0.01038211, "balance_loss_clip": 1.04073858, "balance_loss_mlp": 1.02351296, "epoch": 0.7559670534479649, "flos": 33249078823680.0, "grad_norm": 2.5385729480768027, "language_loss": 0.74509788, "learning_rate": 5.92688372623666e-07, "loss": 0.7664119, "num_input_tokens_seen": 135199200, "step": 6287, "time_per_iteration": 2.755532741546631 }, { "auxiliary_loss_clip": 0.01123199, "auxiliary_loss_mlp": 0.01039875, "balance_loss_clip": 1.04209566, "balance_loss_mlp": 1.02334118, "epoch": 0.7560872963386039, "flos": 14064379027200.0, "grad_norm": 8.08377997271476, "language_loss": 0.74474061, "learning_rate": 5.921349880976574e-07, "loss": 0.76637137, "num_input_tokens_seen": 135217035, "step": 6288, "time_per_iteration": 2.5772902965545654 }, { "auxiliary_loss_clip": 0.01115392, "auxiliary_loss_mlp": 0.00772697, "balance_loss_clip": 1.04433918, "balance_loss_mlp": 1.00046158, "epoch": 0.7562075392292431, "flos": 20412307941120.0, "grad_norm": 2.044427983256787, "language_loss": 0.81629956, "learning_rate": 5.915818171436515e-07, "loss": 0.83518046, "num_input_tokens_seen": 135236370, "step": 6289, "time_per_iteration": 2.619547128677368 }, { "auxiliary_loss_clip": 0.01106654, "auxiliary_loss_mlp": 0.01038662, "balance_loss_clip": 1.0373193, "balance_loss_mlp": 1.0214963, "epoch": 0.7563277821198822, "flos": 20376792368640.0, "grad_norm": 1.7731477722269728, "language_loss": 0.74750185, "learning_rate": 5.910288598455642e-07, "loss": 0.76895499, "num_input_tokens_seen": 135255720, "step": 6290, "time_per_iteration": 2.6190996170043945 }, { "auxiliary_loss_clip": 0.01128876, "auxiliary_loss_mlp": 0.01049711, "balance_loss_clip": 1.04314113, "balance_loss_mlp": 1.03304601, "epoch": 0.7564480250105212, "flos": 18588261438720.0, "grad_norm": 2.974692729454014, "language_loss": 0.7455374, "learning_rate": 5.90476116287278e-07, "loss": 0.76732326, "num_input_tokens_seen": 135273320, "step": 6291, "time_per_iteration": 2.5883045196533203 }, { "auxiliary_loss_clip": 0.01108179, "auxiliary_loss_mlp": 0.0103724, "balance_loss_clip": 1.04109144, "balance_loss_mlp": 1.02176702, "epoch": 0.7565682679011604, "flos": 21215521918080.0, "grad_norm": 1.8107429161907327, "language_loss": 0.68283677, "learning_rate": 5.899235865526456e-07, "loss": 0.70429099, "num_input_tokens_seen": 135292615, "step": 6292, "time_per_iteration": 2.604342460632324 }, { "auxiliary_loss_clip": 0.01087959, "auxiliary_loss_mlp": 0.01043405, "balance_loss_clip": 1.03835249, "balance_loss_mlp": 1.02792001, "epoch": 0.7566885107917994, "flos": 20449008662400.0, "grad_norm": 1.735082895938489, "language_loss": 0.82452303, "learning_rate": 5.893712707254825e-07, "loss": 0.84583664, "num_input_tokens_seen": 135310075, "step": 6293, "time_per_iteration": 2.6924006938934326 }, { "auxiliary_loss_clip": 0.01077217, "auxiliary_loss_mlp": 0.01039231, "balance_loss_clip": 1.03417015, "balance_loss_mlp": 1.02275658, "epoch": 0.7568087536824385, "flos": 19025832919680.0, "grad_norm": 2.393285130793231, "language_loss": 0.65826946, "learning_rate": 5.888191688895769e-07, "loss": 0.67943394, "num_input_tokens_seen": 135327335, "step": 6294, "time_per_iteration": 2.7059102058410645 }, { "auxiliary_loss_clip": 0.01135235, "auxiliary_loss_mlp": 0.0103398, "balance_loss_clip": 1.04318142, "balance_loss_mlp": 1.01773167, "epoch": 0.7569289965730777, "flos": 15225442248960.0, "grad_norm": 3.75412747233672, "language_loss": 0.62032598, "learning_rate": 5.882672811286813e-07, "loss": 0.64201814, "num_input_tokens_seen": 135343615, "step": 6295, "time_per_iteration": 2.540867567062378 }, { "auxiliary_loss_clip": 0.01136957, "auxiliary_loss_mlp": 0.01038008, "balance_loss_clip": 1.04425669, "balance_loss_mlp": 1.02183747, "epoch": 0.7570492394637167, "flos": 20769367086720.0, "grad_norm": 2.0832519322704575, "language_loss": 0.69487125, "learning_rate": 5.877156075265166e-07, "loss": 0.71662086, "num_input_tokens_seen": 135359880, "step": 6296, "time_per_iteration": 2.541879653930664 }, { "auxiliary_loss_clip": 0.01108478, "auxiliary_loss_mlp": 0.01045724, "balance_loss_clip": 1.04062307, "balance_loss_mlp": 1.02679372, "epoch": 0.7571694823543558, "flos": 15664091137920.0, "grad_norm": 2.986673550573476, "language_loss": 0.69695818, "learning_rate": 5.871641481667715e-07, "loss": 0.7185002, "num_input_tokens_seen": 135374325, "step": 6297, "time_per_iteration": 2.556114673614502 }, { "auxiliary_loss_clip": 0.0108921, "auxiliary_loss_mlp": 0.01036839, "balance_loss_clip": 1.0399754, "balance_loss_mlp": 1.02048922, "epoch": 0.7572897252449949, "flos": 25409241492480.0, "grad_norm": 2.0225825125291794, "language_loss": 0.84467906, "learning_rate": 5.866129031331011e-07, "loss": 0.86593956, "num_input_tokens_seen": 135393980, "step": 6298, "time_per_iteration": 3.6227269172668457 }, { "auxiliary_loss_clip": 0.01112531, "auxiliary_loss_mlp": 0.01032608, "balance_loss_clip": 1.04205108, "balance_loss_mlp": 1.01723051, "epoch": 0.757409968135634, "flos": 24279348297600.0, "grad_norm": 3.769650536441624, "language_loss": 0.83436525, "learning_rate": 5.8606187250913e-07, "loss": 0.85581666, "num_input_tokens_seen": 135412030, "step": 6299, "time_per_iteration": 3.8440020084381104 }, { "auxiliary_loss_clip": 0.01120254, "auxiliary_loss_mlp": 0.00772592, "balance_loss_clip": 1.04351246, "balance_loss_mlp": 1.00051737, "epoch": 0.757530211026273, "flos": 24133766474880.0, "grad_norm": 2.7680859283887815, "language_loss": 0.84031677, "learning_rate": 5.855110563784482e-07, "loss": 0.85924524, "num_input_tokens_seen": 135430565, "step": 6300, "time_per_iteration": 2.622767448425293 }, { "auxiliary_loss_clip": 0.0111297, "auxiliary_loss_mlp": 0.00773519, "balance_loss_clip": 1.03923941, "balance_loss_mlp": 1.00051045, "epoch": 0.7576504539169122, "flos": 23951807153280.0, "grad_norm": 1.8138508641307922, "language_loss": 0.64442694, "learning_rate": 5.849604548246156e-07, "loss": 0.66329187, "num_input_tokens_seen": 135451675, "step": 6301, "time_per_iteration": 2.662144899368286 }, { "auxiliary_loss_clip": 0.0111521, "auxiliary_loss_mlp": 0.00772318, "balance_loss_clip": 1.043257, "balance_loss_mlp": 1.00045061, "epoch": 0.7577706968075513, "flos": 21251360712960.0, "grad_norm": 2.2489556909037147, "language_loss": 0.8017751, "learning_rate": 5.844100679311565e-07, "loss": 0.8206504, "num_input_tokens_seen": 135470635, "step": 6302, "time_per_iteration": 3.5873541831970215 }, { "auxiliary_loss_clip": 0.01108619, "auxiliary_loss_mlp": 0.01037358, "balance_loss_clip": 1.04030502, "balance_loss_mlp": 1.02110946, "epoch": 0.7578909396981903, "flos": 18296595002880.0, "grad_norm": 3.601042069335817, "language_loss": 0.7670356, "learning_rate": 5.838598957815637e-07, "loss": 0.78849536, "num_input_tokens_seen": 135487865, "step": 6303, "time_per_iteration": 2.670741558074951 }, { "auxiliary_loss_clip": 0.01102495, "auxiliary_loss_mlp": 0.01034846, "balance_loss_clip": 1.04078245, "balance_loss_mlp": 1.01930094, "epoch": 0.7580111825888295, "flos": 25373869574400.0, "grad_norm": 1.5919772279077131, "language_loss": 0.8541286, "learning_rate": 5.833099384592996e-07, "loss": 0.87550199, "num_input_tokens_seen": 135508440, "step": 6304, "time_per_iteration": 2.735771417617798 }, { "auxiliary_loss_clip": 0.01105266, "auxiliary_loss_mlp": 0.01036222, "balance_loss_clip": 1.03992534, "balance_loss_mlp": 1.02031898, "epoch": 0.7581314254794685, "flos": 23768662682880.0, "grad_norm": 2.063428040016779, "language_loss": 0.71793681, "learning_rate": 5.827601960477913e-07, "loss": 0.73935169, "num_input_tokens_seen": 135526365, "step": 6305, "time_per_iteration": 3.5330166816711426 }, { "auxiliary_loss_clip": 0.01116156, "auxiliary_loss_mlp": 0.01038769, "balance_loss_clip": 1.03949392, "balance_loss_mlp": 1.0235455, "epoch": 0.7582516683701076, "flos": 22054610603520.0, "grad_norm": 1.9451999683616146, "language_loss": 0.70445466, "learning_rate": 5.822106686304344e-07, "loss": 0.72600389, "num_input_tokens_seen": 135545655, "step": 6306, "time_per_iteration": 2.6155920028686523 }, { "auxiliary_loss_clip": 0.01102317, "auxiliary_loss_mlp": 0.01044298, "balance_loss_clip": 1.04058635, "balance_loss_mlp": 1.02826405, "epoch": 0.7583719112607467, "flos": 31649725848960.0, "grad_norm": 2.191899886717796, "language_loss": 0.57719946, "learning_rate": 5.816613562905919e-07, "loss": 0.59866565, "num_input_tokens_seen": 135566840, "step": 6307, "time_per_iteration": 2.703843832015991 }, { "auxiliary_loss_clip": 0.01093206, "auxiliary_loss_mlp": 0.01039665, "balance_loss_clip": 1.04413831, "balance_loss_mlp": 1.02432835, "epoch": 0.7584921541513858, "flos": 33068376478080.0, "grad_norm": 1.8794599314099991, "language_loss": 0.70085204, "learning_rate": 5.811122591115933e-07, "loss": 0.72218072, "num_input_tokens_seen": 135587825, "step": 6308, "time_per_iteration": 2.7576465606689453 }, { "auxiliary_loss_clip": 0.01096352, "auxiliary_loss_mlp": 0.01042403, "balance_loss_clip": 1.04246223, "balance_loss_mlp": 1.02569008, "epoch": 0.7586123970420249, "flos": 23326350606720.0, "grad_norm": 2.216504431150652, "language_loss": 0.71532202, "learning_rate": 5.805633771767376e-07, "loss": 0.73670959, "num_input_tokens_seen": 135605220, "step": 6309, "time_per_iteration": 2.6802120208740234 }, { "auxiliary_loss_clip": 0.01105679, "auxiliary_loss_mlp": 0.01049171, "balance_loss_clip": 1.04301751, "balance_loss_mlp": 1.03387022, "epoch": 0.7587326399326639, "flos": 18334229477760.0, "grad_norm": 1.968058254418002, "language_loss": 0.77665102, "learning_rate": 5.800147105692888e-07, "loss": 0.79819953, "num_input_tokens_seen": 135624795, "step": 6310, "time_per_iteration": 2.65944766998291 }, { "auxiliary_loss_clip": 0.01124395, "auxiliary_loss_mlp": 0.01037087, "balance_loss_clip": 1.04207706, "balance_loss_mlp": 1.02148247, "epoch": 0.7588528828233031, "flos": 17275080119040.0, "grad_norm": 1.6821231134284267, "language_loss": 0.78965002, "learning_rate": 5.794662593724795e-07, "loss": 0.81126487, "num_input_tokens_seen": 135643800, "step": 6311, "time_per_iteration": 2.612330198287964 }, { "auxiliary_loss_clip": 0.01136392, "auxiliary_loss_mlp": 0.01049087, "balance_loss_clip": 1.04552412, "balance_loss_mlp": 1.03300571, "epoch": 0.7589731257139422, "flos": 17713621267200.0, "grad_norm": 1.8592062989001625, "language_loss": 0.75510687, "learning_rate": 5.789180236695091e-07, "loss": 0.77696168, "num_input_tokens_seen": 135660655, "step": 6312, "time_per_iteration": 2.5207979679107666 }, { "auxiliary_loss_clip": 0.01117211, "auxiliary_loss_mlp": 0.01033711, "balance_loss_clip": 1.04317176, "balance_loss_mlp": 1.01830888, "epoch": 0.7590933686045812, "flos": 15961072786560.0, "grad_norm": 4.548789863020566, "language_loss": 0.84902072, "learning_rate": 5.78370003543544e-07, "loss": 0.87052989, "num_input_tokens_seen": 135679410, "step": 6313, "time_per_iteration": 2.6219441890716553 }, { "auxiliary_loss_clip": 0.01124091, "auxiliary_loss_mlp": 0.00773809, "balance_loss_clip": 1.04232144, "balance_loss_mlp": 1.00045514, "epoch": 0.7592136114952204, "flos": 21068072588160.0, "grad_norm": 2.1589263538869803, "language_loss": 0.8384999, "learning_rate": 5.778221990777203e-07, "loss": 0.85747886, "num_input_tokens_seen": 135697150, "step": 6314, "time_per_iteration": 2.6105849742889404 }, { "auxiliary_loss_clip": 0.01112075, "auxiliary_loss_mlp": 0.01048372, "balance_loss_clip": 1.04255247, "balance_loss_mlp": 1.03113484, "epoch": 0.7593338543858594, "flos": 25297666871040.0, "grad_norm": 2.1897017927605047, "language_loss": 0.82781374, "learning_rate": 5.772746103551372e-07, "loss": 0.84941816, "num_input_tokens_seen": 135712545, "step": 6315, "time_per_iteration": 2.6454639434814453 }, { "auxiliary_loss_clip": 0.0110605, "auxiliary_loss_mlp": 0.01035064, "balance_loss_clip": 1.04257846, "balance_loss_mlp": 1.01950097, "epoch": 0.7594540972764985, "flos": 31832367528960.0, "grad_norm": 1.8367169572536746, "language_loss": 0.71885949, "learning_rate": 5.767272374588648e-07, "loss": 0.74027061, "num_input_tokens_seen": 135733950, "step": 6316, "time_per_iteration": 2.735868215560913 }, { "auxiliary_loss_clip": 0.0111975, "auxiliary_loss_mlp": 0.01039692, "balance_loss_clip": 1.04222369, "balance_loss_mlp": 1.02468371, "epoch": 0.7595743401671377, "flos": 37597250880000.0, "grad_norm": 3.1317858330219295, "language_loss": 0.78139669, "learning_rate": 5.76180080471939e-07, "loss": 0.80299115, "num_input_tokens_seen": 135757120, "step": 6317, "time_per_iteration": 2.7213971614837646 }, { "auxiliary_loss_clip": 0.01140742, "auxiliary_loss_mlp": 0.01044054, "balance_loss_clip": 1.04517722, "balance_loss_mlp": 1.02679229, "epoch": 0.7596945830577767, "flos": 18287724343680.0, "grad_norm": 2.145948652081959, "language_loss": 0.72112691, "learning_rate": 5.756331394773631e-07, "loss": 0.74297488, "num_input_tokens_seen": 135773335, "step": 6318, "time_per_iteration": 2.5278539657592773 }, { "auxiliary_loss_clip": 0.01072781, "auxiliary_loss_mlp": 0.00773283, "balance_loss_clip": 1.03825831, "balance_loss_mlp": 1.00043571, "epoch": 0.7598148259484158, "flos": 22233122219520.0, "grad_norm": 1.7526359740918553, "language_loss": 0.75922024, "learning_rate": 5.750864145581071e-07, "loss": 0.77768087, "num_input_tokens_seen": 135792555, "step": 6319, "time_per_iteration": 2.7625279426574707 }, { "auxiliary_loss_clip": 0.01132191, "auxiliary_loss_mlp": 0.01037678, "balance_loss_clip": 1.04437852, "balance_loss_mlp": 1.0225389, "epoch": 0.7599350688390549, "flos": 27161718145920.0, "grad_norm": 2.305986908657163, "language_loss": 0.86263931, "learning_rate": 5.745399057971085e-07, "loss": 0.88433802, "num_input_tokens_seen": 135813690, "step": 6320, "time_per_iteration": 2.5781710147857666 }, { "auxiliary_loss_clip": 0.01125711, "auxiliary_loss_mlp": 0.01036596, "balance_loss_clip": 1.0428654, "balance_loss_mlp": 1.02021694, "epoch": 0.760055311729694, "flos": 15560704817280.0, "grad_norm": 2.0151067753462155, "language_loss": 0.756823, "learning_rate": 5.739936132772738e-07, "loss": 0.77844608, "num_input_tokens_seen": 135832255, "step": 6321, "time_per_iteration": 2.5629193782806396 }, { "auxiliary_loss_clip": 0.01132619, "auxiliary_loss_mlp": 0.01038645, "balance_loss_clip": 1.04265535, "balance_loss_mlp": 1.02246785, "epoch": 0.760175554620333, "flos": 25155496840320.0, "grad_norm": 3.0348034092120155, "language_loss": 0.7421841, "learning_rate": 5.734475370814733e-07, "loss": 0.76389676, "num_input_tokens_seen": 135851935, "step": 6322, "time_per_iteration": 2.5934126377105713 }, { "auxiliary_loss_clip": 0.01124221, "auxiliary_loss_mlp": 0.01037924, "balance_loss_clip": 1.04201412, "balance_loss_mlp": 1.02080536, "epoch": 0.7602957975109722, "flos": 24353791234560.0, "grad_norm": 1.7075028893707256, "language_loss": 0.78744388, "learning_rate": 5.729016772925483e-07, "loss": 0.80906534, "num_input_tokens_seen": 135873510, "step": 6323, "time_per_iteration": 2.6142454147338867 }, { "auxiliary_loss_clip": 0.01075494, "auxiliary_loss_mlp": 0.01038993, "balance_loss_clip": 1.03655577, "balance_loss_mlp": 1.0228163, "epoch": 0.7604160404016113, "flos": 25192664438400.0, "grad_norm": 1.726876151488831, "language_loss": 0.70652127, "learning_rate": 5.723560339933038e-07, "loss": 0.72766614, "num_input_tokens_seen": 135893845, "step": 6324, "time_per_iteration": 4.610695123672485 }, { "auxiliary_loss_clip": 0.01119439, "auxiliary_loss_mlp": 0.00772863, "balance_loss_clip": 1.03993273, "balance_loss_mlp": 1.00051236, "epoch": 0.7605362832922503, "flos": 29861841363840.0, "grad_norm": 2.092021100942418, "language_loss": 0.65439802, "learning_rate": 5.71810607266513e-07, "loss": 0.67332101, "num_input_tokens_seen": 135912430, "step": 6325, "time_per_iteration": 2.657719850540161 }, { "auxiliary_loss_clip": 0.01121945, "auxiliary_loss_mlp": 0.01034591, "balance_loss_clip": 1.04255748, "balance_loss_mlp": 1.01854587, "epoch": 0.7606565261828895, "flos": 13917935278080.0, "grad_norm": 1.8565506736168036, "language_loss": 0.60309899, "learning_rate": 5.712653971949184e-07, "loss": 0.62466443, "num_input_tokens_seen": 135930550, "step": 6326, "time_per_iteration": 2.6052215099334717 }, { "auxiliary_loss_clip": 0.01117607, "auxiliary_loss_mlp": 0.01045946, "balance_loss_clip": 1.04213047, "balance_loss_mlp": 1.02801728, "epoch": 0.7607767690735285, "flos": 18551273408640.0, "grad_norm": 3.4943485433743877, "language_loss": 0.75878727, "learning_rate": 5.707204038612268e-07, "loss": 0.78042281, "num_input_tokens_seen": 135947980, "step": 6327, "time_per_iteration": 2.5610649585723877 }, { "auxiliary_loss_clip": 0.01118272, "auxiliary_loss_mlp": 0.01048346, "balance_loss_clip": 1.04606056, "balance_loss_mlp": 1.03054798, "epoch": 0.7608970119641676, "flos": 20922993555840.0, "grad_norm": 2.1424895466696006, "language_loss": 0.73959363, "learning_rate": 5.701756273481138e-07, "loss": 0.76125979, "num_input_tokens_seen": 135965400, "step": 6328, "time_per_iteration": 3.63885235786438 }, { "auxiliary_loss_clip": 0.01106754, "auxiliary_loss_mlp": 0.01035261, "balance_loss_clip": 1.0390861, "balance_loss_mlp": 1.02001452, "epoch": 0.7610172548548068, "flos": 23807302738560.0, "grad_norm": 1.782690975148632, "language_loss": 0.73739791, "learning_rate": 5.696310677382212e-07, "loss": 0.75881809, "num_input_tokens_seen": 135986795, "step": 6329, "time_per_iteration": 2.6635754108428955 }, { "auxiliary_loss_clip": 0.01007845, "auxiliary_loss_mlp": 0.01003573, "balance_loss_clip": 1.01119351, "balance_loss_mlp": 1.00203514, "epoch": 0.7611374977454458, "flos": 66496580426880.0, "grad_norm": 0.8746389206636098, "language_loss": 0.61662811, "learning_rate": 5.690867251141576e-07, "loss": 0.63674229, "num_input_tokens_seen": 136053450, "step": 6330, "time_per_iteration": 3.3869223594665527 }, { "auxiliary_loss_clip": 0.01129165, "auxiliary_loss_mlp": 0.01042429, "balance_loss_clip": 1.04446244, "balance_loss_mlp": 1.02643096, "epoch": 0.7612577406360849, "flos": 15633136592640.0, "grad_norm": 4.425013710728723, "language_loss": 0.91820371, "learning_rate": 5.685425995585013e-07, "loss": 0.93991959, "num_input_tokens_seen": 136071375, "step": 6331, "time_per_iteration": 3.550347089767456 }, { "auxiliary_loss_clip": 0.01019664, "auxiliary_loss_mlp": 0.00998223, "balance_loss_clip": 1.01000953, "balance_loss_mlp": 0.99666101, "epoch": 0.761377983526724, "flos": 60526253237760.0, "grad_norm": 0.756320557004928, "language_loss": 0.59040177, "learning_rate": 5.679986911537935e-07, "loss": 0.61058056, "num_input_tokens_seen": 136138905, "step": 6332, "time_per_iteration": 3.3431241512298584 }, { "auxiliary_loss_clip": 0.01069976, "auxiliary_loss_mlp": 0.01049888, "balance_loss_clip": 1.03684807, "balance_loss_mlp": 1.03238773, "epoch": 0.7614982264173631, "flos": 35772522019200.0, "grad_norm": 2.2492502027948893, "language_loss": 0.67414862, "learning_rate": 5.674549999825462e-07, "loss": 0.69534719, "num_input_tokens_seen": 136161720, "step": 6333, "time_per_iteration": 2.8477373123168945 }, { "auxiliary_loss_clip": 0.01029195, "auxiliary_loss_mlp": 0.01002293, "balance_loss_clip": 1.01007295, "balance_loss_mlp": 1.00061178, "epoch": 0.7616184693080021, "flos": 67925502345600.0, "grad_norm": 0.9112013761266405, "language_loss": 0.71370733, "learning_rate": 5.669115261272363e-07, "loss": 0.7340222, "num_input_tokens_seen": 136222040, "step": 6334, "time_per_iteration": 3.1797120571136475 }, { "auxiliary_loss_clip": 0.01125065, "auxiliary_loss_mlp": 0.01042748, "balance_loss_clip": 1.04364109, "balance_loss_mlp": 1.0260464, "epoch": 0.7617387121986413, "flos": 20521979141760.0, "grad_norm": 3.828628449678054, "language_loss": 0.72790122, "learning_rate": 5.663682696703081e-07, "loss": 0.74957937, "num_input_tokens_seen": 136240305, "step": 6335, "time_per_iteration": 2.6123125553131104 }, { "auxiliary_loss_clip": 0.01132665, "auxiliary_loss_mlp": 0.01034713, "balance_loss_clip": 1.04444551, "balance_loss_mlp": 1.01977623, "epoch": 0.7618589550892804, "flos": 18624495283200.0, "grad_norm": 2.0884892494669356, "language_loss": 0.82352793, "learning_rate": 5.658252306941746e-07, "loss": 0.84520167, "num_input_tokens_seen": 136259625, "step": 6336, "time_per_iteration": 2.637430429458618 }, { "auxiliary_loss_clip": 0.01086605, "auxiliary_loss_mlp": 0.01040736, "balance_loss_clip": 1.03962445, "balance_loss_mlp": 1.02473783, "epoch": 0.7619791979799194, "flos": 17453735389440.0, "grad_norm": 18.9163155343469, "language_loss": 0.75843865, "learning_rate": 5.65282409281212e-07, "loss": 0.77971202, "num_input_tokens_seen": 136277090, "step": 6337, "time_per_iteration": 2.665912628173828 }, { "auxiliary_loss_clip": 0.01107511, "auxiliary_loss_mlp": 0.01046152, "balance_loss_clip": 1.0415647, "balance_loss_mlp": 1.02879488, "epoch": 0.7620994408705585, "flos": 14137421333760.0, "grad_norm": 3.653365938616207, "language_loss": 0.70390904, "learning_rate": 5.64739805513768e-07, "loss": 0.72544569, "num_input_tokens_seen": 136294635, "step": 6338, "time_per_iteration": 2.643383741378784 }, { "auxiliary_loss_clip": 0.01028349, "auxiliary_loss_mlp": 0.0075558, "balance_loss_clip": 1.01009202, "balance_loss_mlp": 1.00017977, "epoch": 0.7622196837611976, "flos": 70708792527360.0, "grad_norm": 0.7877032660681033, "language_loss": 0.55664301, "learning_rate": 5.641974194741541e-07, "loss": 0.57448226, "num_input_tokens_seen": 136350320, "step": 6339, "time_per_iteration": 3.098651170730591 }, { "auxiliary_loss_clip": 0.01011872, "auxiliary_loss_mlp": 0.01001688, "balance_loss_clip": 1.01324892, "balance_loss_mlp": 0.99990034, "epoch": 0.7623399266518367, "flos": 60684150447360.0, "grad_norm": 0.7812916466069949, "language_loss": 0.63696277, "learning_rate": 5.636552512446502e-07, "loss": 0.65709829, "num_input_tokens_seen": 136411375, "step": 6340, "time_per_iteration": 3.1658129692077637 }, { "auxiliary_loss_clip": 0.01119572, "auxiliary_loss_mlp": 0.01041774, "balance_loss_clip": 1.04330933, "balance_loss_mlp": 1.02542996, "epoch": 0.7624601695424758, "flos": 26468893641600.0, "grad_norm": 3.7956305264214327, "language_loss": 0.78057355, "learning_rate": 5.631133009075027e-07, "loss": 0.80218703, "num_input_tokens_seen": 136430560, "step": 6341, "time_per_iteration": 2.6635477542877197 }, { "auxiliary_loss_clip": 0.01121971, "auxiliary_loss_mlp": 0.00772357, "balance_loss_clip": 1.04312134, "balance_loss_mlp": 1.00047839, "epoch": 0.7625804124331149, "flos": 19135755515520.0, "grad_norm": 1.8650304572817136, "language_loss": 0.68888605, "learning_rate": 5.625715685449242e-07, "loss": 0.70782936, "num_input_tokens_seen": 136448665, "step": 6342, "time_per_iteration": 2.6100831031799316 }, { "auxiliary_loss_clip": 0.01089693, "auxiliary_loss_mlp": 0.01043331, "balance_loss_clip": 1.0420115, "balance_loss_mlp": 1.02770281, "epoch": 0.762700655323754, "flos": 26213101914240.0, "grad_norm": 1.6216722144586624, "language_loss": 0.71606654, "learning_rate": 5.620300542390966e-07, "loss": 0.73739678, "num_input_tokens_seen": 136469710, "step": 6343, "time_per_iteration": 2.753871440887451 }, { "auxiliary_loss_clip": 0.01102634, "auxiliary_loss_mlp": 0.0103672, "balance_loss_clip": 1.03964949, "balance_loss_mlp": 1.02274823, "epoch": 0.762820898214393, "flos": 22382582711040.0, "grad_norm": 1.7631309659991048, "language_loss": 0.84742987, "learning_rate": 5.614887580721659e-07, "loss": 0.86882347, "num_input_tokens_seen": 136489855, "step": 6344, "time_per_iteration": 2.6278529167175293 }, { "auxiliary_loss_clip": 0.01090562, "auxiliary_loss_mlp": 0.01041585, "balance_loss_clip": 1.04095221, "balance_loss_mlp": 1.0250386, "epoch": 0.7629411411050322, "flos": 15700504550400.0, "grad_norm": 2.6490224655524246, "language_loss": 0.73879755, "learning_rate": 5.609476801262481e-07, "loss": 0.76011908, "num_input_tokens_seen": 136504715, "step": 6345, "time_per_iteration": 2.625793695449829 }, { "auxiliary_loss_clip": 0.01091753, "auxiliary_loss_mlp": 0.01039157, "balance_loss_clip": 1.0395987, "balance_loss_mlp": 1.02358794, "epoch": 0.7630613839956712, "flos": 13770342293760.0, "grad_norm": 2.4358343759436205, "language_loss": 0.63986129, "learning_rate": 5.604068204834223e-07, "loss": 0.66117036, "num_input_tokens_seen": 136521610, "step": 6346, "time_per_iteration": 2.647960901260376 }, { "auxiliary_loss_clip": 0.0108006, "auxiliary_loss_mlp": 0.0077324, "balance_loss_clip": 1.03764236, "balance_loss_mlp": 1.00043583, "epoch": 0.7631816268863103, "flos": 14569569861120.0, "grad_norm": 3.05163538885385, "language_loss": 0.76073462, "learning_rate": 5.598661792257367e-07, "loss": 0.77926767, "num_input_tokens_seen": 136538655, "step": 6347, "time_per_iteration": 2.663184881210327 }, { "auxiliary_loss_clip": 0.01115859, "auxiliary_loss_mlp": 0.01047711, "balance_loss_clip": 1.03867221, "balance_loss_mlp": 1.03131914, "epoch": 0.7633018697769495, "flos": 19062210418560.0, "grad_norm": 1.8975101724280459, "language_loss": 0.75847781, "learning_rate": 5.593257564352071e-07, "loss": 0.78011346, "num_input_tokens_seen": 136557095, "step": 6348, "time_per_iteration": 2.5954368114471436 }, { "auxiliary_loss_clip": 0.01118658, "auxiliary_loss_mlp": 0.01038278, "balance_loss_clip": 1.04253352, "balance_loss_mlp": 1.02275658, "epoch": 0.7634221126675885, "flos": 22052958577920.0, "grad_norm": 1.712186215969896, "language_loss": 0.75834852, "learning_rate": 5.58785552193815e-07, "loss": 0.7799179, "num_input_tokens_seen": 136577340, "step": 6349, "time_per_iteration": 2.6370818614959717 }, { "auxiliary_loss_clip": 0.01132976, "auxiliary_loss_mlp": 0.01040396, "balance_loss_clip": 1.04299748, "balance_loss_mlp": 1.02548862, "epoch": 0.7635423555582276, "flos": 29382720825600.0, "grad_norm": 2.2661808906913725, "language_loss": 0.75901198, "learning_rate": 5.582455665835086e-07, "loss": 0.78074574, "num_input_tokens_seen": 136597635, "step": 6350, "time_per_iteration": 4.413436412811279 }, { "auxiliary_loss_clip": 0.01119235, "auxiliary_loss_mlp": 0.01043428, "balance_loss_clip": 1.0421741, "balance_loss_mlp": 1.02554679, "epoch": 0.7636625984488667, "flos": 17784903807360.0, "grad_norm": 3.031657257304154, "language_loss": 0.72761995, "learning_rate": 5.577057996862036e-07, "loss": 0.7492466, "num_input_tokens_seen": 136615260, "step": 6351, "time_per_iteration": 2.636155128479004 }, { "auxiliary_loss_clip": 0.01131226, "auxiliary_loss_mlp": 0.01028478, "balance_loss_clip": 1.045048, "balance_loss_mlp": 1.01357722, "epoch": 0.7637828413395058, "flos": 23734583654400.0, "grad_norm": 1.5569704949597172, "language_loss": 0.76202798, "learning_rate": 5.571662515837814e-07, "loss": 0.78362501, "num_input_tokens_seen": 136637220, "step": 6352, "time_per_iteration": 2.627244710922241 }, { "auxiliary_loss_clip": 0.01111165, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.04381716, "balance_loss_mlp": 1.02493358, "epoch": 0.7639030842301449, "flos": 36283279461120.0, "grad_norm": 2.0534807237504897, "language_loss": 0.83518696, "learning_rate": 5.566269223580926e-07, "loss": 0.85669971, "num_input_tokens_seen": 136658930, "step": 6353, "time_per_iteration": 2.7663891315460205 }, { "auxiliary_loss_clip": 0.01122313, "auxiliary_loss_mlp": 0.0104024, "balance_loss_clip": 1.0424943, "balance_loss_mlp": 1.02561939, "epoch": 0.764023327120784, "flos": 28878104609280.0, "grad_norm": 1.6676535039904157, "language_loss": 0.75244176, "learning_rate": 5.560878120909511e-07, "loss": 0.77406728, "num_input_tokens_seen": 136681530, "step": 6354, "time_per_iteration": 3.628499984741211 }, { "auxiliary_loss_clip": 0.01028245, "auxiliary_loss_mlp": 0.01000939, "balance_loss_clip": 1.00900781, "balance_loss_mlp": 0.99941939, "epoch": 0.7641435700114231, "flos": 64789711067520.0, "grad_norm": 1.0819691639841438, "language_loss": 0.58561027, "learning_rate": 5.55548920864141e-07, "loss": 0.60590214, "num_input_tokens_seen": 136742185, "step": 6355, "time_per_iteration": 3.2335546016693115 }, { "auxiliary_loss_clip": 0.01122176, "auxiliary_loss_mlp": 0.01038357, "balance_loss_clip": 1.04492283, "balance_loss_mlp": 1.02423048, "epoch": 0.7642638129020621, "flos": 16835784785280.0, "grad_norm": 1.69914214912481, "language_loss": 0.78281057, "learning_rate": 5.550102487594113e-07, "loss": 0.80441594, "num_input_tokens_seen": 136760855, "step": 6356, "time_per_iteration": 2.6185989379882812 }, { "auxiliary_loss_clip": 0.0108555, "auxiliary_loss_mlp": 0.00772617, "balance_loss_clip": 1.03717566, "balance_loss_mlp": 1.00055432, "epoch": 0.7643840557927013, "flos": 30408940391040.0, "grad_norm": 2.0981981480886427, "language_loss": 0.71843451, "learning_rate": 5.54471795858477e-07, "loss": 0.7370162, "num_input_tokens_seen": 136780925, "step": 6357, "time_per_iteration": 3.6859657764434814 }, { "auxiliary_loss_clip": 0.01094586, "auxiliary_loss_mlp": 0.01034204, "balance_loss_clip": 1.03766942, "balance_loss_mlp": 1.01856351, "epoch": 0.7645042986833404, "flos": 16983234115200.0, "grad_norm": 2.0349836253929143, "language_loss": 0.82921827, "learning_rate": 5.539335622430235e-07, "loss": 0.85050619, "num_input_tokens_seen": 136799545, "step": 6358, "time_per_iteration": 2.6484928131103516 }, { "auxiliary_loss_clip": 0.01114379, "auxiliary_loss_mlp": 0.01042556, "balance_loss_clip": 1.03971016, "balance_loss_mlp": 1.0260458, "epoch": 0.7646245415739794, "flos": 17311493531520.0, "grad_norm": 1.957725589215392, "language_loss": 0.7460891, "learning_rate": 5.533955479946975e-07, "loss": 0.76765847, "num_input_tokens_seen": 136818325, "step": 6359, "time_per_iteration": 2.57116436958313 }, { "auxiliary_loss_clip": 0.01001381, "auxiliary_loss_mlp": 0.00755544, "balance_loss_clip": 1.01176095, "balance_loss_mlp": 1.0001421, "epoch": 0.7647447844646186, "flos": 70402332666240.0, "grad_norm": 0.8584918450456895, "language_loss": 0.65696776, "learning_rate": 5.528577531951173e-07, "loss": 0.67453706, "num_input_tokens_seen": 136878730, "step": 6360, "time_per_iteration": 3.224768877029419 }, { "auxiliary_loss_clip": 0.01108657, "auxiliary_loss_mlp": 0.01031208, "balance_loss_clip": 1.0413239, "balance_loss_mlp": 1.01622319, "epoch": 0.7648650273552576, "flos": 17675914965120.0, "grad_norm": 2.437665680925245, "language_loss": 0.74172324, "learning_rate": 5.523201779258653e-07, "loss": 0.76312184, "num_input_tokens_seen": 136897705, "step": 6361, "time_per_iteration": 2.6459901332855225 }, { "auxiliary_loss_clip": 0.01132235, "auxiliary_loss_mlp": 0.01036101, "balance_loss_clip": 1.04172254, "balance_loss_mlp": 1.01968551, "epoch": 0.7649852702458967, "flos": 22162019247360.0, "grad_norm": 1.841921707940435, "language_loss": 0.84054464, "learning_rate": 5.517828222684912e-07, "loss": 0.86222804, "num_input_tokens_seen": 136918360, "step": 6362, "time_per_iteration": 2.610628604888916 }, { "auxiliary_loss_clip": 0.010239, "auxiliary_loss_mlp": 0.01002668, "balance_loss_clip": 1.01497936, "balance_loss_mlp": 1.00101054, "epoch": 0.7651055131365359, "flos": 69848338227840.0, "grad_norm": 0.7731420384242447, "language_loss": 0.58992577, "learning_rate": 5.512456863045117e-07, "loss": 0.61019146, "num_input_tokens_seen": 136979050, "step": 6363, "time_per_iteration": 3.2138118743896484 }, { "auxiliary_loss_clip": 0.01134103, "auxiliary_loss_mlp": 0.01038873, "balance_loss_clip": 1.04367864, "balance_loss_mlp": 1.02295816, "epoch": 0.7652257560271749, "flos": 19464014931840.0, "grad_norm": 2.0702268639823025, "language_loss": 0.74086523, "learning_rate": 5.507087701154089e-07, "loss": 0.76259506, "num_input_tokens_seen": 136998970, "step": 6364, "time_per_iteration": 2.5852274894714355 }, { "auxiliary_loss_clip": 0.01088009, "auxiliary_loss_mlp": 0.01041996, "balance_loss_clip": 1.03894067, "balance_loss_mlp": 1.02603376, "epoch": 0.765345998917814, "flos": 15961108700160.0, "grad_norm": 1.8053062275465586, "language_loss": 0.75487226, "learning_rate": 5.50172073782634e-07, "loss": 0.77617234, "num_input_tokens_seen": 137016950, "step": 6365, "time_per_iteration": 2.709440231323242 }, { "auxiliary_loss_clip": 0.01092822, "auxiliary_loss_mlp": 0.01035407, "balance_loss_clip": 1.04097879, "balance_loss_mlp": 1.0207026, "epoch": 0.7654662418084531, "flos": 23659853408640.0, "grad_norm": 2.6143393032823727, "language_loss": 0.88092017, "learning_rate": 5.496355973876023e-07, "loss": 0.90220249, "num_input_tokens_seen": 137036205, "step": 6366, "time_per_iteration": 2.6949455738067627 }, { "auxiliary_loss_clip": 0.01096739, "auxiliary_loss_mlp": 0.00773375, "balance_loss_clip": 1.04028976, "balance_loss_mlp": 1.00037706, "epoch": 0.7655864846990922, "flos": 41463608878080.0, "grad_norm": 1.623318092724343, "language_loss": 0.71233761, "learning_rate": 5.490993410116984e-07, "loss": 0.73103869, "num_input_tokens_seen": 137059195, "step": 6367, "time_per_iteration": 2.873232364654541 }, { "auxiliary_loss_clip": 0.01086762, "auxiliary_loss_mlp": 0.01032292, "balance_loss_clip": 1.03962672, "balance_loss_mlp": 1.01637769, "epoch": 0.7657067275897312, "flos": 43142684088960.0, "grad_norm": 1.993395046006774, "language_loss": 0.6953429, "learning_rate": 5.485633047362704e-07, "loss": 0.71653348, "num_input_tokens_seen": 137081200, "step": 6368, "time_per_iteration": 2.8566555976867676 }, { "auxiliary_loss_clip": 0.01141584, "auxiliary_loss_mlp": 0.01038792, "balance_loss_clip": 1.04873586, "balance_loss_mlp": 1.02297282, "epoch": 0.7658269704803703, "flos": 17311780840320.0, "grad_norm": 1.8538813359090998, "language_loss": 0.79026425, "learning_rate": 5.480274886426341e-07, "loss": 0.81206805, "num_input_tokens_seen": 137097840, "step": 6369, "time_per_iteration": 2.585731267929077 }, { "auxiliary_loss_clip": 0.01113517, "auxiliary_loss_mlp": 0.01037395, "balance_loss_clip": 1.04192245, "balance_loss_mlp": 1.02282739, "epoch": 0.7659472133710095, "flos": 12568160977920.0, "grad_norm": 3.757948146938102, "language_loss": 0.78034329, "learning_rate": 5.474918928120744e-07, "loss": 0.80185246, "num_input_tokens_seen": 137114335, "step": 6370, "time_per_iteration": 2.59809947013855 }, { "auxiliary_loss_clip": 0.01119662, "auxiliary_loss_mlp": 0.01033958, "balance_loss_clip": 1.04281771, "balance_loss_mlp": 1.01934326, "epoch": 0.7660674562616485, "flos": 22707430335360.0, "grad_norm": 2.2691233352749456, "language_loss": 0.87314105, "learning_rate": 5.469565173258392e-07, "loss": 0.89467722, "num_input_tokens_seen": 137132850, "step": 6371, "time_per_iteration": 2.6834676265716553 }, { "auxiliary_loss_clip": 0.01137109, "auxiliary_loss_mlp": 0.01041243, "balance_loss_clip": 1.04313123, "balance_loss_mlp": 1.02411222, "epoch": 0.7661876991522876, "flos": 17056455989760.0, "grad_norm": 1.7671588922045687, "language_loss": 0.63718623, "learning_rate": 5.464213622651454e-07, "loss": 0.65896976, "num_input_tokens_seen": 137150665, "step": 6372, "time_per_iteration": 2.5752382278442383 }, { "auxiliary_loss_clip": 0.01097995, "auxiliary_loss_mlp": 0.0103969, "balance_loss_clip": 1.03990006, "balance_loss_mlp": 1.02412117, "epoch": 0.7663079420429267, "flos": 20084228092800.0, "grad_norm": 3.9066832434335446, "language_loss": 0.842435, "learning_rate": 5.458864277111753e-07, "loss": 0.86381185, "num_input_tokens_seen": 137168500, "step": 6373, "time_per_iteration": 2.6765120029449463 }, { "auxiliary_loss_clip": 0.011018, "auxiliary_loss_mlp": 0.00772155, "balance_loss_clip": 1.03894353, "balance_loss_mlp": 1.00038278, "epoch": 0.7664281849335658, "flos": 12677473042560.0, "grad_norm": 2.7428849528559036, "language_loss": 0.6968556, "learning_rate": 5.453517137450769e-07, "loss": 0.71559519, "num_input_tokens_seen": 137185075, "step": 6374, "time_per_iteration": 2.627077579498291 }, { "auxiliary_loss_clip": 0.01121944, "auxiliary_loss_mlp": 0.01036021, "balance_loss_clip": 1.04263961, "balance_loss_mlp": 1.02090549, "epoch": 0.7665484278242048, "flos": 22345271458560.0, "grad_norm": 1.8619709913567348, "language_loss": 0.76031613, "learning_rate": 5.448172204479684e-07, "loss": 0.78189576, "num_input_tokens_seen": 137204355, "step": 6375, "time_per_iteration": 2.623060941696167 }, { "auxiliary_loss_clip": 0.01131297, "auxiliary_loss_mlp": 0.01044592, "balance_loss_clip": 1.0422647, "balance_loss_mlp": 1.02896929, "epoch": 0.766668670714844, "flos": 23617909301760.0, "grad_norm": 1.7450891255505576, "language_loss": 0.7462709, "learning_rate": 5.442829479009294e-07, "loss": 0.76802981, "num_input_tokens_seen": 137223135, "step": 6376, "time_per_iteration": 4.569346189498901 }, { "auxiliary_loss_clip": 0.01126772, "auxiliary_loss_mlp": 0.01043982, "balance_loss_clip": 1.04256773, "balance_loss_mlp": 1.02660108, "epoch": 0.7667889136054831, "flos": 19427134642560.0, "grad_norm": 2.0033719414197617, "language_loss": 0.71201611, "learning_rate": 5.437488961850103e-07, "loss": 0.73372364, "num_input_tokens_seen": 137242935, "step": 6377, "time_per_iteration": 2.623079538345337 }, { "auxiliary_loss_clip": 0.01084647, "auxiliary_loss_mlp": 0.01029676, "balance_loss_clip": 1.04071546, "balance_loss_mlp": 1.01504922, "epoch": 0.7669091564961221, "flos": 26866352609280.0, "grad_norm": 1.8602724876111163, "language_loss": 0.75481343, "learning_rate": 5.432150653812258e-07, "loss": 0.77595669, "num_input_tokens_seen": 137262970, "step": 6378, "time_per_iteration": 2.7495923042297363 }, { "auxiliary_loss_clip": 0.01114942, "auxiliary_loss_mlp": 0.01041809, "balance_loss_clip": 1.04002857, "balance_loss_mlp": 1.02486944, "epoch": 0.7670293993867613, "flos": 12385303816320.0, "grad_norm": 4.240035918307656, "language_loss": 0.82416916, "learning_rate": 5.42681455570557e-07, "loss": 0.84573662, "num_input_tokens_seen": 137279500, "step": 6379, "time_per_iteration": 2.593017339706421 }, { "auxiliary_loss_clip": 0.01127567, "auxiliary_loss_mlp": 0.01034332, "balance_loss_clip": 1.03930879, "balance_loss_mlp": 1.01881123, "epoch": 0.7671496422774003, "flos": 21762944167680.0, "grad_norm": 1.9498524672405835, "language_loss": 0.64894313, "learning_rate": 5.42148066833954e-07, "loss": 0.67056215, "num_input_tokens_seen": 137298745, "step": 6380, "time_per_iteration": 3.571592092514038 }, { "auxiliary_loss_clip": 0.01130301, "auxiliary_loss_mlp": 0.01036, "balance_loss_clip": 1.04326463, "balance_loss_mlp": 1.02056837, "epoch": 0.7672698851680394, "flos": 21069221823360.0, "grad_norm": 2.3984982620433715, "language_loss": 0.75369531, "learning_rate": 5.416148992523289e-07, "loss": 0.77535832, "num_input_tokens_seen": 137317320, "step": 6381, "time_per_iteration": 2.6908106803894043 }, { "auxiliary_loss_clip": 0.01062898, "auxiliary_loss_mlp": 0.0104571, "balance_loss_clip": 1.03950644, "balance_loss_mlp": 1.02809143, "epoch": 0.7673901280586786, "flos": 16976697840000.0, "grad_norm": 2.3109574571117557, "language_loss": 0.7855286, "learning_rate": 5.410819529065644e-07, "loss": 0.80661476, "num_input_tokens_seen": 137335275, "step": 6382, "time_per_iteration": 2.875903367996216 }, { "auxiliary_loss_clip": 0.0108115, "auxiliary_loss_mlp": 0.01035743, "balance_loss_clip": 1.0367471, "balance_loss_mlp": 1.02037656, "epoch": 0.7675103709493176, "flos": 29242669697280.0, "grad_norm": 2.3001872620257915, "language_loss": 0.65744859, "learning_rate": 5.405492278775079e-07, "loss": 0.67861748, "num_input_tokens_seen": 137355055, "step": 6383, "time_per_iteration": 3.8389763832092285 }, { "auxiliary_loss_clip": 0.01109432, "auxiliary_loss_mlp": 0.01041111, "balance_loss_clip": 1.04021716, "balance_loss_mlp": 1.0231576, "epoch": 0.7676306138399567, "flos": 29023004073600.0, "grad_norm": 2.4731660757414837, "language_loss": 0.79637063, "learning_rate": 5.400167242459732e-07, "loss": 0.81787604, "num_input_tokens_seen": 137374015, "step": 6384, "time_per_iteration": 2.80031156539917 }, { "auxiliary_loss_clip": 0.01120921, "auxiliary_loss_mlp": 0.01039883, "balance_loss_clip": 1.04426932, "balance_loss_mlp": 1.02377808, "epoch": 0.7677508567305958, "flos": 22565116650240.0, "grad_norm": 2.1477202337861327, "language_loss": 0.8049165, "learning_rate": 5.394844420927405e-07, "loss": 0.82652456, "num_input_tokens_seen": 137393625, "step": 6385, "time_per_iteration": 2.7836179733276367 }, { "auxiliary_loss_clip": 0.01130889, "auxiliary_loss_mlp": 0.0103926, "balance_loss_clip": 1.04322338, "balance_loss_mlp": 1.0229516, "epoch": 0.7678710996212349, "flos": 25411432222080.0, "grad_norm": 3.7428639728712536, "language_loss": 0.73522472, "learning_rate": 5.389523814985562e-07, "loss": 0.75692624, "num_input_tokens_seen": 137413045, "step": 6386, "time_per_iteration": 2.7029030323028564 }, { "auxiliary_loss_clip": 0.01084618, "auxiliary_loss_mlp": 0.01044901, "balance_loss_clip": 1.04132807, "balance_loss_mlp": 1.02701998, "epoch": 0.767991342511874, "flos": 26756825063040.0, "grad_norm": 2.1714566287541084, "language_loss": 0.76236635, "learning_rate": 5.384205425441344e-07, "loss": 0.78366154, "num_input_tokens_seen": 137433955, "step": 6387, "time_per_iteration": 2.799368381500244 }, { "auxiliary_loss_clip": 0.01108167, "auxiliary_loss_mlp": 0.01039161, "balance_loss_clip": 1.04028034, "balance_loss_mlp": 1.02358079, "epoch": 0.7681115854025131, "flos": 26359509749760.0, "grad_norm": 2.1181756476209292, "language_loss": 0.84158576, "learning_rate": 5.378889253101537e-07, "loss": 0.86305898, "num_input_tokens_seen": 137454510, "step": 6388, "time_per_iteration": 2.8148210048675537 }, { "auxiliary_loss_clip": 0.01117893, "auxiliary_loss_mlp": 0.01039441, "balance_loss_clip": 1.04070711, "balance_loss_mlp": 1.02530336, "epoch": 0.7682318282931522, "flos": 23257043314560.0, "grad_norm": 1.726808974551087, "language_loss": 0.80716312, "learning_rate": 5.373575298772617e-07, "loss": 0.82873642, "num_input_tokens_seen": 137473630, "step": 6389, "time_per_iteration": 2.680661201477051 }, { "auxiliary_loss_clip": 0.01027485, "auxiliary_loss_mlp": 0.01015589, "balance_loss_clip": 1.00804543, "balance_loss_mlp": 1.01351476, "epoch": 0.7683520711837912, "flos": 70072457137920.0, "grad_norm": 0.7640114023641983, "language_loss": 0.61316198, "learning_rate": 5.368263563260689e-07, "loss": 0.63359272, "num_input_tokens_seen": 137538765, "step": 6390, "time_per_iteration": 3.376352548599243 }, { "auxiliary_loss_clip": 0.01121569, "auxiliary_loss_mlp": 0.01037943, "balance_loss_clip": 1.04222512, "balance_loss_mlp": 1.02190948, "epoch": 0.7684723140744304, "flos": 18624890332800.0, "grad_norm": 1.7603521977005971, "language_loss": 0.64257371, "learning_rate": 5.362954047371537e-07, "loss": 0.66416889, "num_input_tokens_seen": 137557875, "step": 6391, "time_per_iteration": 2.6678109169006348 }, { "auxiliary_loss_clip": 0.01100972, "auxiliary_loss_mlp": 0.0104847, "balance_loss_clip": 1.04399359, "balance_loss_mlp": 1.03060067, "epoch": 0.7685925569650695, "flos": 27452989532160.0, "grad_norm": 2.413895546551143, "language_loss": 0.72184122, "learning_rate": 5.357646751910627e-07, "loss": 0.74333566, "num_input_tokens_seen": 137579055, "step": 6392, "time_per_iteration": 2.8528363704681396 }, { "auxiliary_loss_clip": 0.01109355, "auxiliary_loss_mlp": 0.01044547, "balance_loss_clip": 1.04087174, "balance_loss_mlp": 1.02655816, "epoch": 0.7687127998557085, "flos": 24535714642560.0, "grad_norm": 3.2109920288327363, "language_loss": 0.79872698, "learning_rate": 5.352341677683061e-07, "loss": 0.82026601, "num_input_tokens_seen": 137600355, "step": 6393, "time_per_iteration": 2.7243549823760986 }, { "auxiliary_loss_clip": 0.01100102, "auxiliary_loss_mlp": 0.01033828, "balance_loss_clip": 1.04044724, "balance_loss_mlp": 1.01912963, "epoch": 0.7688330427463477, "flos": 25155963717120.0, "grad_norm": 2.2448728186250313, "language_loss": 0.79033482, "learning_rate": 5.347038825493617e-07, "loss": 0.81167412, "num_input_tokens_seen": 137621885, "step": 6394, "time_per_iteration": 2.7931768894195557 }, { "auxiliary_loss_clip": 0.01104215, "auxiliary_loss_mlp": 0.01038375, "balance_loss_clip": 1.04072177, "balance_loss_mlp": 1.02421284, "epoch": 0.7689532856369867, "flos": 21211284113280.0, "grad_norm": 2.574402935964901, "language_loss": 0.68788075, "learning_rate": 5.341738196146732e-07, "loss": 0.70930666, "num_input_tokens_seen": 137640230, "step": 6395, "time_per_iteration": 2.945009708404541 }, { "auxiliary_loss_clip": 0.01116467, "auxiliary_loss_mlp": 0.01038592, "balance_loss_clip": 1.03938794, "balance_loss_mlp": 1.02170026, "epoch": 0.7690735285276258, "flos": 25119083427840.0, "grad_norm": 2.576516381213198, "language_loss": 0.73681396, "learning_rate": 5.336439790446503e-07, "loss": 0.75836456, "num_input_tokens_seen": 137659330, "step": 6396, "time_per_iteration": 2.7707130908966064 }, { "auxiliary_loss_clip": 0.01094211, "auxiliary_loss_mlp": 0.01049804, "balance_loss_clip": 1.03953397, "balance_loss_mlp": 1.03208947, "epoch": 0.769193771418265, "flos": 54744020640000.0, "grad_norm": 1.6909142748695425, "language_loss": 0.62504256, "learning_rate": 5.331143609196711e-07, "loss": 0.64648271, "num_input_tokens_seen": 137683145, "step": 6397, "time_per_iteration": 3.043736457824707 }, { "auxiliary_loss_clip": 0.01117504, "auxiliary_loss_mlp": 0.01042783, "balance_loss_clip": 1.04175222, "balance_loss_mlp": 1.02715421, "epoch": 0.769314014308904, "flos": 37341890115840.0, "grad_norm": 2.0726872714662092, "language_loss": 0.77218062, "learning_rate": 5.325849653200758e-07, "loss": 0.79378343, "num_input_tokens_seen": 137707095, "step": 6398, "time_per_iteration": 2.743741750717163 }, { "auxiliary_loss_clip": 0.01132121, "auxiliary_loss_mlp": 0.0103583, "balance_loss_clip": 1.04326963, "balance_loss_mlp": 1.01967692, "epoch": 0.7694342571995431, "flos": 20631686256000.0, "grad_norm": 1.9547178047632658, "language_loss": 0.76472163, "learning_rate": 5.32055792326175e-07, "loss": 0.78640109, "num_input_tokens_seen": 137725520, "step": 6399, "time_per_iteration": 2.601325511932373 }, { "auxiliary_loss_clip": 0.01112417, "auxiliary_loss_mlp": 0.01041198, "balance_loss_clip": 1.04256558, "balance_loss_mlp": 1.02523613, "epoch": 0.7695545000901821, "flos": 24207706621440.0, "grad_norm": 1.9092033976694198, "language_loss": 0.73095393, "learning_rate": 5.315268420182437e-07, "loss": 0.75249004, "num_input_tokens_seen": 137744195, "step": 6400, "time_per_iteration": 2.6413626670837402 }, { "auxiliary_loss_clip": 0.01101322, "auxiliary_loss_mlp": 0.00773009, "balance_loss_clip": 1.03991902, "balance_loss_mlp": 1.00049722, "epoch": 0.7696747429808213, "flos": 28001273708160.0, "grad_norm": 2.242078833850585, "language_loss": 0.7627883, "learning_rate": 5.309981144765221e-07, "loss": 0.78153157, "num_input_tokens_seen": 137764340, "step": 6401, "time_per_iteration": 2.772043228149414 }, { "auxiliary_loss_clip": 0.01087812, "auxiliary_loss_mlp": 0.01031749, "balance_loss_clip": 1.03896677, "balance_loss_mlp": 1.0167402, "epoch": 0.7697949858714603, "flos": 11509550323200.0, "grad_norm": 5.930322441556973, "language_loss": 0.75712669, "learning_rate": 5.304696097812196e-07, "loss": 0.77832228, "num_input_tokens_seen": 137780940, "step": 6402, "time_per_iteration": 4.4932849407196045 }, { "auxiliary_loss_clip": 0.0110759, "auxiliary_loss_mlp": 0.01054887, "balance_loss_clip": 1.03974545, "balance_loss_mlp": 1.03637421, "epoch": 0.7699152287620994, "flos": 26688271956480.0, "grad_norm": 8.003437975930675, "language_loss": 0.60274744, "learning_rate": 5.299413280125078e-07, "loss": 0.62437224, "num_input_tokens_seen": 137799250, "step": 6403, "time_per_iteration": 2.910003423690796 }, { "auxiliary_loss_clip": 0.01108635, "auxiliary_loss_mlp": 0.0103995, "balance_loss_clip": 1.04114771, "balance_loss_mlp": 1.02470279, "epoch": 0.7700354716527386, "flos": 16544944362240.0, "grad_norm": 1.876545002341438, "language_loss": 0.72648025, "learning_rate": 5.294132692505284e-07, "loss": 0.74796617, "num_input_tokens_seen": 137817660, "step": 6404, "time_per_iteration": 2.711747646331787 }, { "auxiliary_loss_clip": 0.01072436, "auxiliary_loss_mlp": 0.01035977, "balance_loss_clip": 1.03524733, "balance_loss_mlp": 1.02124274, "epoch": 0.7701557145433776, "flos": 19242733196160.0, "grad_norm": 2.208892884982166, "language_loss": 0.7906822, "learning_rate": 5.288854335753861e-07, "loss": 0.81176639, "num_input_tokens_seen": 137835920, "step": 6405, "time_per_iteration": 2.7410194873809814 }, { "auxiliary_loss_clip": 0.01124194, "auxiliary_loss_mlp": 0.01039993, "balance_loss_clip": 1.04367232, "balance_loss_mlp": 1.02401876, "epoch": 0.7702759574340167, "flos": 31685744211840.0, "grad_norm": 2.0874899531532667, "language_loss": 0.75612235, "learning_rate": 5.283578210671551e-07, "loss": 0.7777642, "num_input_tokens_seen": 137858160, "step": 6406, "time_per_iteration": 3.7217822074890137 }, { "auxiliary_loss_clip": 0.01110692, "auxiliary_loss_mlp": 0.01040346, "balance_loss_clip": 1.04203653, "balance_loss_mlp": 1.02530205, "epoch": 0.7703962003246558, "flos": 16800089644800.0, "grad_norm": 2.535606996933676, "language_loss": 0.76930106, "learning_rate": 5.278304318058719e-07, "loss": 0.79081142, "num_input_tokens_seen": 137876015, "step": 6407, "time_per_iteration": 2.6555140018463135 }, { "auxiliary_loss_clip": 0.01067809, "auxiliary_loss_mlp": 0.01033311, "balance_loss_clip": 1.03597987, "balance_loss_mlp": 1.01758718, "epoch": 0.7705164432152949, "flos": 35736072693120.0, "grad_norm": 2.960888516050766, "language_loss": 0.79363561, "learning_rate": 5.273032658715411e-07, "loss": 0.81464684, "num_input_tokens_seen": 137898825, "step": 6408, "time_per_iteration": 2.898090362548828 }, { "auxiliary_loss_clip": 0.01077438, "auxiliary_loss_mlp": 0.01043985, "balance_loss_clip": 1.03637588, "balance_loss_mlp": 1.02649713, "epoch": 0.7706366861059339, "flos": 23365960329600.0, "grad_norm": 1.7406736013712953, "language_loss": 0.76379037, "learning_rate": 5.267763233441347e-07, "loss": 0.78500462, "num_input_tokens_seen": 137919455, "step": 6409, "time_per_iteration": 2.7430453300476074 }, { "auxiliary_loss_clip": 0.01127791, "auxiliary_loss_mlp": 0.01044168, "balance_loss_clip": 1.04372621, "balance_loss_mlp": 1.02590477, "epoch": 0.7707569289965731, "flos": 22929897219840.0, "grad_norm": 2.191148402549936, "language_loss": 0.69579637, "learning_rate": 5.26249604303588e-07, "loss": 0.71751595, "num_input_tokens_seen": 137937960, "step": 6410, "time_per_iteration": 3.593057155609131 }, { "auxiliary_loss_clip": 0.01131607, "auxiliary_loss_mlp": 0.01042177, "balance_loss_clip": 1.0434047, "balance_loss_mlp": 1.0264535, "epoch": 0.7708771718872122, "flos": 17420661941760.0, "grad_norm": 1.9994138913210011, "language_loss": 0.7830075, "learning_rate": 5.257231088298057e-07, "loss": 0.80474538, "num_input_tokens_seen": 137956370, "step": 6411, "time_per_iteration": 2.579652786254883 }, { "auxiliary_loss_clip": 0.01009192, "auxiliary_loss_mlp": 0.010021, "balance_loss_clip": 1.00981498, "balance_loss_mlp": 1.00060368, "epoch": 0.7709974147778512, "flos": 72241316248320.0, "grad_norm": 0.7933965544203841, "language_loss": 0.53959036, "learning_rate": 5.25196837002655e-07, "loss": 0.55970323, "num_input_tokens_seen": 138016080, "step": 6412, "time_per_iteration": 3.2873387336730957 }, { "auxiliary_loss_clip": 0.01108491, "auxiliary_loss_mlp": 0.01038596, "balance_loss_clip": 1.04213142, "balance_loss_mlp": 1.02268159, "epoch": 0.7711176576684904, "flos": 39859694876160.0, "grad_norm": 2.1118367000101763, "language_loss": 0.68371797, "learning_rate": 5.24670788901971e-07, "loss": 0.70518875, "num_input_tokens_seen": 138039170, "step": 6413, "time_per_iteration": 2.8244619369506836 }, { "auxiliary_loss_clip": 0.01111995, "auxiliary_loss_mlp": 0.01048658, "balance_loss_clip": 1.04492521, "balance_loss_mlp": 1.02932203, "epoch": 0.7712379005591294, "flos": 36976391274240.0, "grad_norm": 2.304519645427143, "language_loss": 0.68968421, "learning_rate": 5.241449646075557e-07, "loss": 0.71129072, "num_input_tokens_seen": 138062395, "step": 6414, "time_per_iteration": 2.7670514583587646 }, { "auxiliary_loss_clip": 0.01131374, "auxiliary_loss_mlp": 0.01042418, "balance_loss_clip": 1.04617608, "balance_loss_mlp": 1.02683187, "epoch": 0.7713581434497685, "flos": 22776773541120.0, "grad_norm": 2.0689517195847786, "language_loss": 0.72486556, "learning_rate": 5.236193641991762e-07, "loss": 0.74660349, "num_input_tokens_seen": 138080325, "step": 6415, "time_per_iteration": 2.6413650512695312 }, { "auxiliary_loss_clip": 0.01107366, "auxiliary_loss_mlp": 0.01038868, "balance_loss_clip": 1.04189467, "balance_loss_mlp": 1.02401412, "epoch": 0.7714783863404077, "flos": 24097460803200.0, "grad_norm": 1.9702505714121603, "language_loss": 0.69706297, "learning_rate": 5.23093987756565e-07, "loss": 0.71852529, "num_input_tokens_seen": 138099020, "step": 6416, "time_per_iteration": 2.6913199424743652 }, { "auxiliary_loss_clip": 0.01103607, "auxiliary_loss_mlp": 0.01040736, "balance_loss_clip": 1.03996587, "balance_loss_mlp": 1.02147162, "epoch": 0.7715986292310467, "flos": 21063655215360.0, "grad_norm": 2.188410879491269, "language_loss": 0.75508213, "learning_rate": 5.225688353594217e-07, "loss": 0.77652556, "num_input_tokens_seen": 138118650, "step": 6417, "time_per_iteration": 2.7141969203948975 }, { "auxiliary_loss_clip": 0.01110996, "auxiliary_loss_mlp": 0.00772214, "balance_loss_clip": 1.0417521, "balance_loss_mlp": 1.00045443, "epoch": 0.7717188721216858, "flos": 20594877793920.0, "grad_norm": 2.2142582543885263, "language_loss": 0.77410853, "learning_rate": 5.220439070874108e-07, "loss": 0.79294068, "num_input_tokens_seen": 138137890, "step": 6418, "time_per_iteration": 2.6519076824188232 }, { "auxiliary_loss_clip": 0.01122317, "auxiliary_loss_mlp": 0.01043298, "balance_loss_clip": 1.04485202, "balance_loss_mlp": 1.0283972, "epoch": 0.7718391150123249, "flos": 26250951870720.0, "grad_norm": 1.8998843307716549, "language_loss": 0.71035528, "learning_rate": 5.215192030201652e-07, "loss": 0.73201144, "num_input_tokens_seen": 138158880, "step": 6419, "time_per_iteration": 2.6573524475097656 }, { "auxiliary_loss_clip": 0.01083683, "auxiliary_loss_mlp": 0.01038975, "balance_loss_clip": 1.03503203, "balance_loss_mlp": 1.02219033, "epoch": 0.771959357902964, "flos": 22049762267520.0, "grad_norm": 1.8209782693605951, "language_loss": 0.86544442, "learning_rate": 5.209947232372798e-07, "loss": 0.88667101, "num_input_tokens_seen": 138176370, "step": 6420, "time_per_iteration": 2.701881170272827 }, { "auxiliary_loss_clip": 0.01123186, "auxiliary_loss_mlp": 0.00773127, "balance_loss_clip": 1.04238403, "balance_loss_mlp": 1.00038958, "epoch": 0.772079600793603, "flos": 30446000248320.0, "grad_norm": 2.086030692228477, "language_loss": 0.81288111, "learning_rate": 5.204704678183196e-07, "loss": 0.83184421, "num_input_tokens_seen": 138195105, "step": 6421, "time_per_iteration": 2.6901466846466064 }, { "auxiliary_loss_clip": 0.01135727, "auxiliary_loss_mlp": 0.01034951, "balance_loss_clip": 1.04539204, "balance_loss_mlp": 1.0184052, "epoch": 0.7721998436842422, "flos": 12969857750400.0, "grad_norm": 2.1041622014539523, "language_loss": 0.85581088, "learning_rate": 5.19946436842813e-07, "loss": 0.8775177, "num_input_tokens_seen": 138212235, "step": 6422, "time_per_iteration": 2.599355459213257 }, { "auxiliary_loss_clip": 0.01093083, "auxiliary_loss_mlp": 0.01035321, "balance_loss_clip": 1.04016256, "balance_loss_mlp": 1.02001429, "epoch": 0.7723200865748813, "flos": 32635509678720.0, "grad_norm": 1.591108979457449, "language_loss": 0.68515253, "learning_rate": 5.194226303902546e-07, "loss": 0.70643657, "num_input_tokens_seen": 138231970, "step": 6423, "time_per_iteration": 2.8151071071624756 }, { "auxiliary_loss_clip": 0.01105669, "auxiliary_loss_mlp": 0.01040152, "balance_loss_clip": 1.03862453, "balance_loss_mlp": 1.02451229, "epoch": 0.7724403294655203, "flos": 21105707063040.0, "grad_norm": 1.8472632803457407, "language_loss": 0.70779324, "learning_rate": 5.188990485401072e-07, "loss": 0.7292515, "num_input_tokens_seen": 138251175, "step": 6424, "time_per_iteration": 2.67677640914917 }, { "auxiliary_loss_clip": 0.01123259, "auxiliary_loss_mlp": 0.01040737, "balance_loss_clip": 1.04270077, "balance_loss_mlp": 1.02481019, "epoch": 0.7725605723561595, "flos": 22090736707200.0, "grad_norm": 1.7568478676931112, "language_loss": 0.86388469, "learning_rate": 5.183756913717954e-07, "loss": 0.88552463, "num_input_tokens_seen": 138270950, "step": 6425, "time_per_iteration": 2.6693241596221924 }, { "auxiliary_loss_clip": 0.01104646, "auxiliary_loss_mlp": 0.01043256, "balance_loss_clip": 1.0411824, "balance_loss_mlp": 1.02585125, "epoch": 0.7726808152467985, "flos": 34495610457600.0, "grad_norm": 3.2127147135315335, "language_loss": 0.73116648, "learning_rate": 5.178525589647136e-07, "loss": 0.75264549, "num_input_tokens_seen": 138292590, "step": 6426, "time_per_iteration": 2.746852397918701 }, { "auxiliary_loss_clip": 0.01112848, "auxiliary_loss_mlp": 0.01040279, "balance_loss_clip": 1.04246473, "balance_loss_mlp": 1.02444792, "epoch": 0.7728010581374376, "flos": 22306344094080.0, "grad_norm": 1.940855123902262, "language_loss": 0.79178965, "learning_rate": 5.173296513982197e-07, "loss": 0.81332088, "num_input_tokens_seen": 138311115, "step": 6427, "time_per_iteration": 3.611027240753174 }, { "auxiliary_loss_clip": 0.01108332, "auxiliary_loss_mlp": 0.01043841, "balance_loss_clip": 1.04242992, "balance_loss_mlp": 1.02572095, "epoch": 0.7729213010280768, "flos": 27126453968640.0, "grad_norm": 2.1099591115575924, "language_loss": 0.65126359, "learning_rate": 5.168069687516398e-07, "loss": 0.67278528, "num_input_tokens_seen": 138330885, "step": 6428, "time_per_iteration": 3.686170816421509 }, { "auxiliary_loss_clip": 0.01112254, "auxiliary_loss_mlp": 0.0104291, "balance_loss_clip": 1.04310966, "balance_loss_mlp": 1.02666819, "epoch": 0.7730415439187158, "flos": 18150223080960.0, "grad_norm": 2.5367515675819132, "language_loss": 0.71867871, "learning_rate": 5.16284511104263e-07, "loss": 0.74023038, "num_input_tokens_seen": 138350020, "step": 6429, "time_per_iteration": 2.6393091678619385 }, { "auxiliary_loss_clip": 0.0110817, "auxiliary_loss_mlp": 0.01042569, "balance_loss_clip": 1.04109621, "balance_loss_mlp": 1.02616596, "epoch": 0.7731617868093549, "flos": 11947480940160.0, "grad_norm": 3.5762472206900644, "language_loss": 0.8081404, "learning_rate": 5.157622785353457e-07, "loss": 0.82964778, "num_input_tokens_seen": 138368135, "step": 6430, "time_per_iteration": 2.6416897773742676 }, { "auxiliary_loss_clip": 0.01027287, "auxiliary_loss_mlp": 0.010019, "balance_loss_clip": 1.00810194, "balance_loss_mlp": 1.00033879, "epoch": 0.7732820296999939, "flos": 64201027069440.0, "grad_norm": 0.6597225061818865, "language_loss": 0.60310447, "learning_rate": 5.152402711241113e-07, "loss": 0.62339628, "num_input_tokens_seen": 138436040, "step": 6431, "time_per_iteration": 3.2777249813079834 }, { "auxiliary_loss_clip": 0.01091117, "auxiliary_loss_mlp": 0.01034599, "balance_loss_clip": 1.03652263, "balance_loss_mlp": 1.0199604, "epoch": 0.7734022725906331, "flos": 25302191984640.0, "grad_norm": 2.9331136671687923, "language_loss": 0.82875943, "learning_rate": 5.147184889497465e-07, "loss": 0.85001659, "num_input_tokens_seen": 138455510, "step": 6432, "time_per_iteration": 3.7119944095611572 }, { "auxiliary_loss_clip": 0.010896, "auxiliary_loss_mlp": 0.01040711, "balance_loss_clip": 1.03939879, "balance_loss_mlp": 1.0242784, "epoch": 0.7735225154812722, "flos": 17347440067200.0, "grad_norm": 2.227494943351517, "language_loss": 0.80015326, "learning_rate": 5.141969320914072e-07, "loss": 0.82145637, "num_input_tokens_seen": 138473015, "step": 6433, "time_per_iteration": 2.6978440284729004 }, { "auxiliary_loss_clip": 0.01139409, "auxiliary_loss_mlp": 0.01046311, "balance_loss_clip": 1.04467738, "balance_loss_mlp": 1.02821529, "epoch": 0.7736427583719112, "flos": 32630086725120.0, "grad_norm": 3.194662954785777, "language_loss": 0.62447321, "learning_rate": 5.136756006282113e-07, "loss": 0.64633036, "num_input_tokens_seen": 138491680, "step": 6434, "time_per_iteration": 2.6689906120300293 }, { "auxiliary_loss_clip": 0.01136156, "auxiliary_loss_mlp": 0.01042678, "balance_loss_clip": 1.04382849, "balance_loss_mlp": 1.02797949, "epoch": 0.7737630012625504, "flos": 19860073269120.0, "grad_norm": 3.8423233121491855, "language_loss": 0.84688592, "learning_rate": 5.131544946392446e-07, "loss": 0.86867428, "num_input_tokens_seen": 138506960, "step": 6435, "time_per_iteration": 3.5320630073547363 }, { "auxiliary_loss_clip": 0.01107895, "auxiliary_loss_mlp": 0.01043275, "balance_loss_clip": 1.04334271, "balance_loss_mlp": 1.02545309, "epoch": 0.7738832441531894, "flos": 36022639397760.0, "grad_norm": 2.2583867381007448, "language_loss": 0.64391106, "learning_rate": 5.126336142035592e-07, "loss": 0.66542274, "num_input_tokens_seen": 138526995, "step": 6436, "time_per_iteration": 2.770228147506714 }, { "auxiliary_loss_clip": 0.01107901, "auxiliary_loss_mlp": 0.0103783, "balance_loss_clip": 1.03970289, "balance_loss_mlp": 1.02143836, "epoch": 0.7740034870438285, "flos": 13405274415360.0, "grad_norm": 3.998868109482806, "language_loss": 0.72259706, "learning_rate": 5.121129594001721e-07, "loss": 0.74405438, "num_input_tokens_seen": 138541260, "step": 6437, "time_per_iteration": 2.606205940246582 }, { "auxiliary_loss_clip": 0.01122687, "auxiliary_loss_mlp": 0.01037651, "balance_loss_clip": 1.04407144, "balance_loss_mlp": 1.02118802, "epoch": 0.7741237299344677, "flos": 22086714384000.0, "grad_norm": 1.587705180635988, "language_loss": 0.81116986, "learning_rate": 5.115925303080661e-07, "loss": 0.83277327, "num_input_tokens_seen": 138560970, "step": 6438, "time_per_iteration": 2.6981077194213867 }, { "auxiliary_loss_clip": 0.0110474, "auxiliary_loss_mlp": 0.01037289, "balance_loss_clip": 1.03994226, "balance_loss_mlp": 1.02142227, "epoch": 0.7742439728251067, "flos": 19864777950720.0, "grad_norm": 2.0077519821655967, "language_loss": 0.79192698, "learning_rate": 5.110723270061899e-07, "loss": 0.81334728, "num_input_tokens_seen": 138577460, "step": 6439, "time_per_iteration": 2.633798599243164 }, { "auxiliary_loss_clip": 0.01133962, "auxiliary_loss_mlp": 0.01036004, "balance_loss_clip": 1.04416895, "balance_loss_mlp": 1.0191896, "epoch": 0.7743642157157458, "flos": 16690167048960.0, "grad_norm": 1.7417195337312583, "language_loss": 0.79424149, "learning_rate": 5.105523495734572e-07, "loss": 0.8159411, "num_input_tokens_seen": 138594860, "step": 6440, "time_per_iteration": 2.5626018047332764 }, { "auxiliary_loss_clip": 0.0113218, "auxiliary_loss_mlp": 0.01037557, "balance_loss_clip": 1.04223502, "balance_loss_mlp": 1.02268016, "epoch": 0.7744844586063849, "flos": 20304360593280.0, "grad_norm": 1.6980730388554939, "language_loss": 0.75161666, "learning_rate": 5.100325980887499e-07, "loss": 0.77331406, "num_input_tokens_seen": 138614785, "step": 6441, "time_per_iteration": 2.6028261184692383 }, { "auxiliary_loss_clip": 0.01113239, "auxiliary_loss_mlp": 0.01044719, "balance_loss_clip": 1.04183888, "balance_loss_mlp": 1.02942491, "epoch": 0.774604701497024, "flos": 22966705681920.0, "grad_norm": 1.7670489179278046, "language_loss": 0.83531415, "learning_rate": 5.095130726309116e-07, "loss": 0.85689366, "num_input_tokens_seen": 138634960, "step": 6442, "time_per_iteration": 2.6769959926605225 }, { "auxiliary_loss_clip": 0.01037137, "auxiliary_loss_mlp": 0.01001009, "balance_loss_clip": 1.00910723, "balance_loss_mlp": 0.99944735, "epoch": 0.774724944387663, "flos": 60288523073280.0, "grad_norm": 0.7869056274650292, "language_loss": 0.58982939, "learning_rate": 5.089937732787559e-07, "loss": 0.61021084, "num_input_tokens_seen": 138699520, "step": 6443, "time_per_iteration": 3.222858190536499 }, { "auxiliary_loss_clip": 0.01095218, "auxiliary_loss_mlp": 0.01040739, "balance_loss_clip": 1.03882301, "balance_loss_mlp": 1.02369261, "epoch": 0.7748451872783022, "flos": 26761026954240.0, "grad_norm": 3.409350029607448, "language_loss": 0.66741532, "learning_rate": 5.084747001110592e-07, "loss": 0.68877488, "num_input_tokens_seen": 138719145, "step": 6444, "time_per_iteration": 2.6952121257781982 }, { "auxiliary_loss_clip": 0.01113492, "auxiliary_loss_mlp": 0.00772107, "balance_loss_clip": 1.04312348, "balance_loss_mlp": 1.00044179, "epoch": 0.7749654301689413, "flos": 30338627518080.0, "grad_norm": 1.6828379110380116, "language_loss": 0.70141834, "learning_rate": 5.07955853206564e-07, "loss": 0.72027433, "num_input_tokens_seen": 138743850, "step": 6445, "time_per_iteration": 2.712681770324707 }, { "auxiliary_loss_clip": 0.0112622, "auxiliary_loss_mlp": 0.01046199, "balance_loss_clip": 1.04485047, "balance_loss_mlp": 1.03041553, "epoch": 0.7750856730595803, "flos": 43179851687040.0, "grad_norm": 1.7539224869646055, "language_loss": 0.70995408, "learning_rate": 5.074372326439807e-07, "loss": 0.73167825, "num_input_tokens_seen": 138766860, "step": 6446, "time_per_iteration": 2.8745787143707275 }, { "auxiliary_loss_clip": 0.01094646, "auxiliary_loss_mlp": 0.01038083, "balance_loss_clip": 1.03737879, "balance_loss_mlp": 1.02117908, "epoch": 0.7752059159502195, "flos": 17640040256640.0, "grad_norm": 2.1097215263039857, "language_loss": 0.7358526, "learning_rate": 5.069188385019814e-07, "loss": 0.75717986, "num_input_tokens_seen": 138784560, "step": 6447, "time_per_iteration": 2.6746084690093994 }, { "auxiliary_loss_clip": 0.01088268, "auxiliary_loss_mlp": 0.01040653, "balance_loss_clip": 1.03706372, "balance_loss_mlp": 1.02385616, "epoch": 0.7753261588408585, "flos": 12677688524160.0, "grad_norm": 2.4067973428865224, "language_loss": 0.61161721, "learning_rate": 5.064006708592077e-07, "loss": 0.63290638, "num_input_tokens_seen": 138800805, "step": 6448, "time_per_iteration": 2.6954498291015625 }, { "auxiliary_loss_clip": 0.01101515, "auxiliary_loss_mlp": 0.01037429, "balance_loss_clip": 1.04106045, "balance_loss_mlp": 1.02312422, "epoch": 0.7754464017314976, "flos": 16690741666560.0, "grad_norm": 2.688250305763619, "language_loss": 0.75775719, "learning_rate": 5.058827297942641e-07, "loss": 0.77914667, "num_input_tokens_seen": 138815910, "step": 6449, "time_per_iteration": 2.584848642349243 }, { "auxiliary_loss_clip": 0.011113, "auxiliary_loss_mlp": 0.01042034, "balance_loss_clip": 1.04098368, "balance_loss_mlp": 1.02641737, "epoch": 0.7755666446221368, "flos": 19718944732800.0, "grad_norm": 1.8935321183336642, "language_loss": 0.75338542, "learning_rate": 5.053650153857237e-07, "loss": 0.77491879, "num_input_tokens_seen": 138834920, "step": 6450, "time_per_iteration": 2.6591403484344482 }, { "auxiliary_loss_clip": 0.01121078, "auxiliary_loss_mlp": 0.01045059, "balance_loss_clip": 1.04335797, "balance_loss_mlp": 1.02944267, "epoch": 0.7756868875127758, "flos": 18693623007360.0, "grad_norm": 1.7531915013504122, "language_loss": 0.6976065, "learning_rate": 5.048475277121214e-07, "loss": 0.71926785, "num_input_tokens_seen": 138852135, "step": 6451, "time_per_iteration": 2.607888698577881 }, { "auxiliary_loss_clip": 0.01123169, "auxiliary_loss_mlp": 0.01035826, "balance_loss_clip": 1.04294813, "balance_loss_mlp": 1.01943481, "epoch": 0.7758071304034149, "flos": 28404191543040.0, "grad_norm": 1.6441259212945158, "language_loss": 0.76940811, "learning_rate": 5.043302668519598e-07, "loss": 0.7909981, "num_input_tokens_seen": 138871470, "step": 6452, "time_per_iteration": 2.7113170623779297 }, { "auxiliary_loss_clip": 0.01120584, "auxiliary_loss_mlp": 0.01029219, "balance_loss_clip": 1.03980374, "balance_loss_mlp": 1.01468134, "epoch": 0.775927373294054, "flos": 20595344670720.0, "grad_norm": 1.9137403519261647, "language_loss": 0.72111166, "learning_rate": 5.038132328837079e-07, "loss": 0.74260962, "num_input_tokens_seen": 138889860, "step": 6453, "time_per_iteration": 2.646491050720215 }, { "auxiliary_loss_clip": 0.01121398, "auxiliary_loss_mlp": 0.01035584, "balance_loss_clip": 1.04279411, "balance_loss_mlp": 1.01987815, "epoch": 0.7760476161846931, "flos": 22526368853760.0, "grad_norm": 2.336341575355788, "language_loss": 0.73901564, "learning_rate": 5.032964258857993e-07, "loss": 0.76058543, "num_input_tokens_seen": 138909955, "step": 6454, "time_per_iteration": 3.555508852005005 }, { "auxiliary_loss_clip": 0.0111819, "auxiliary_loss_mlp": 0.01036447, "balance_loss_clip": 1.03887188, "balance_loss_mlp": 1.01960278, "epoch": 0.7761678590753321, "flos": 48651488403840.0, "grad_norm": 1.824049844181137, "language_loss": 0.68403232, "learning_rate": 5.027798459366329e-07, "loss": 0.70557874, "num_input_tokens_seen": 138935320, "step": 6455, "time_per_iteration": 3.7083380222320557 }, { "auxiliary_loss_clip": 0.0112171, "auxiliary_loss_mlp": 0.01036115, "balance_loss_clip": 1.0413475, "balance_loss_mlp": 1.01993799, "epoch": 0.7762881019659713, "flos": 26177047637760.0, "grad_norm": 2.0411372613625285, "language_loss": 0.63539624, "learning_rate": 5.02263493114573e-07, "loss": 0.65697443, "num_input_tokens_seen": 138957115, "step": 6456, "time_per_iteration": 2.6366238594055176 }, { "auxiliary_loss_clip": 0.01129879, "auxiliary_loss_mlp": 0.01041999, "balance_loss_clip": 1.04176927, "balance_loss_mlp": 1.02572107, "epoch": 0.7764083448566104, "flos": 20588341518720.0, "grad_norm": 2.423488523914399, "language_loss": 0.76548809, "learning_rate": 5.017473674979502e-07, "loss": 0.78720689, "num_input_tokens_seen": 138973140, "step": 6457, "time_per_iteration": 2.562072277069092 }, { "auxiliary_loss_clip": 0.01001576, "auxiliary_loss_mlp": 0.01001989, "balance_loss_clip": 1.00999129, "balance_loss_mlp": 1.00014162, "epoch": 0.7765285877472494, "flos": 67293078560640.0, "grad_norm": 0.7437583203048204, "language_loss": 0.58308536, "learning_rate": 5.01231469165061e-07, "loss": 0.60312104, "num_input_tokens_seen": 139028965, "step": 6458, "time_per_iteration": 4.076924562454224 }, { "auxiliary_loss_clip": 0.01026994, "auxiliary_loss_mlp": 0.01000978, "balance_loss_clip": 1.00840831, "balance_loss_mlp": 0.99953538, "epoch": 0.7766488306378886, "flos": 61344476121600.0, "grad_norm": 0.8248023206769692, "language_loss": 0.56770873, "learning_rate": 5.007157981941663e-07, "loss": 0.58798844, "num_input_tokens_seen": 139094325, "step": 6459, "time_per_iteration": 3.300264358520508 }, { "auxiliary_loss_clip": 0.01020082, "auxiliary_loss_mlp": 0.01000965, "balance_loss_clip": 1.01006162, "balance_loss_mlp": 0.99945718, "epoch": 0.7767690735285276, "flos": 62946199393920.0, "grad_norm": 0.8886933041680827, "language_loss": 0.67380446, "learning_rate": 5.002003546634928e-07, "loss": 0.69401491, "num_input_tokens_seen": 139150425, "step": 6460, "time_per_iteration": 3.142526149749756 }, { "auxiliary_loss_clip": 0.01080738, "auxiliary_loss_mlp": 0.01036666, "balance_loss_clip": 1.04086256, "balance_loss_mlp": 1.02147317, "epoch": 0.7768893164191667, "flos": 20886400575360.0, "grad_norm": 1.9889913107490287, "language_loss": 0.7634002, "learning_rate": 4.996851386512331e-07, "loss": 0.78457427, "num_input_tokens_seen": 139169130, "step": 6461, "time_per_iteration": 2.7158315181732178 }, { "auxiliary_loss_clip": 0.01110982, "auxiliary_loss_mlp": 0.01041252, "balance_loss_clip": 1.04244184, "balance_loss_mlp": 1.02393138, "epoch": 0.7770095593098058, "flos": 20704584908160.0, "grad_norm": 1.6239812775799691, "language_loss": 0.82869649, "learning_rate": 4.991701502355444e-07, "loss": 0.85021877, "num_input_tokens_seen": 139189595, "step": 6462, "time_per_iteration": 3.952042818069458 }, { "auxiliary_loss_clip": 0.01123054, "auxiliary_loss_mlp": 0.01037648, "balance_loss_clip": 1.04300082, "balance_loss_mlp": 1.02235341, "epoch": 0.7771298022004449, "flos": 24717709877760.0, "grad_norm": 1.8034561775771005, "language_loss": 0.75908029, "learning_rate": 4.986553894945518e-07, "loss": 0.78068733, "num_input_tokens_seen": 139210805, "step": 6463, "time_per_iteration": 2.6614255905151367 }, { "auxiliary_loss_clip": 0.01083354, "auxiliary_loss_mlp": 0.01038386, "balance_loss_clip": 1.03723764, "balance_loss_mlp": 1.02315116, "epoch": 0.777250045091084, "flos": 25009232659200.0, "grad_norm": 2.1050424005932546, "language_loss": 0.86201608, "learning_rate": 4.981408565063416e-07, "loss": 0.88323343, "num_input_tokens_seen": 139230750, "step": 6464, "time_per_iteration": 2.774383544921875 }, { "auxiliary_loss_clip": 0.01135535, "auxiliary_loss_mlp": 0.01042776, "balance_loss_clip": 1.04445064, "balance_loss_mlp": 1.02669513, "epoch": 0.777370287981723, "flos": 20119887319680.0, "grad_norm": 1.889075899184502, "language_loss": 0.76196325, "learning_rate": 4.976265513489701e-07, "loss": 0.78374636, "num_input_tokens_seen": 139250720, "step": 6465, "time_per_iteration": 2.5831005573272705 }, { "auxiliary_loss_clip": 0.01118785, "auxiliary_loss_mlp": 0.010389, "balance_loss_clip": 1.03991413, "balance_loss_mlp": 1.02287793, "epoch": 0.7774905308723622, "flos": 21718809331200.0, "grad_norm": 4.731089234738868, "language_loss": 0.80328476, "learning_rate": 4.971124741004562e-07, "loss": 0.82486159, "num_input_tokens_seen": 139269720, "step": 6466, "time_per_iteration": 2.6552863121032715 }, { "auxiliary_loss_clip": 0.01119493, "auxiliary_loss_mlp": 0.01036822, "balance_loss_clip": 1.04115617, "balance_loss_mlp": 1.02066958, "epoch": 0.7776107737630013, "flos": 16034115093120.0, "grad_norm": 1.717510025131339, "language_loss": 0.76573181, "learning_rate": 4.965986248387846e-07, "loss": 0.78729498, "num_input_tokens_seen": 139288035, "step": 6467, "time_per_iteration": 2.5789451599121094 }, { "auxiliary_loss_clip": 0.01111671, "auxiliary_loss_mlp": 0.01039094, "balance_loss_clip": 1.04117203, "balance_loss_mlp": 1.02412176, "epoch": 0.7777310166536403, "flos": 24790895838720.0, "grad_norm": 1.5906406352416644, "language_loss": 0.77195221, "learning_rate": 4.960850036419073e-07, "loss": 0.79345989, "num_input_tokens_seen": 139307135, "step": 6468, "time_per_iteration": 2.6823606491088867 }, { "auxiliary_loss_clip": 0.01106079, "auxiliary_loss_mlp": 0.01036229, "balance_loss_clip": 1.04025173, "balance_loss_mlp": 1.02074385, "epoch": 0.7778512595442795, "flos": 17272530253440.0, "grad_norm": 2.3652476901186237, "language_loss": 0.78441995, "learning_rate": 4.955716105877378e-07, "loss": 0.80584306, "num_input_tokens_seen": 139325905, "step": 6469, "time_per_iteration": 2.631263256072998 }, { "auxiliary_loss_clip": 0.01125566, "auxiliary_loss_mlp": 0.00772733, "balance_loss_clip": 1.04359627, "balance_loss_mlp": 1.00050128, "epoch": 0.7779715024349185, "flos": 17748418567680.0, "grad_norm": 1.7344480383357381, "language_loss": 0.83137941, "learning_rate": 4.950584457541598e-07, "loss": 0.85036242, "num_input_tokens_seen": 139344370, "step": 6470, "time_per_iteration": 2.5960566997528076 }, { "auxiliary_loss_clip": 0.01122595, "auxiliary_loss_mlp": 0.01046441, "balance_loss_clip": 1.04138756, "balance_loss_mlp": 1.0308013, "epoch": 0.7780917453255576, "flos": 24316875031680.0, "grad_norm": 1.46734436389102, "language_loss": 0.81765932, "learning_rate": 4.945455092190183e-07, "loss": 0.83934975, "num_input_tokens_seen": 139365625, "step": 6471, "time_per_iteration": 2.663356304168701 }, { "auxiliary_loss_clip": 0.01036728, "auxiliary_loss_mlp": 0.01005527, "balance_loss_clip": 1.00867271, "balance_loss_mlp": 1.0040009, "epoch": 0.7782119882161967, "flos": 56364601530240.0, "grad_norm": 0.6905709685724021, "language_loss": 0.559654, "learning_rate": 4.940328010601271e-07, "loss": 0.58007652, "num_input_tokens_seen": 139430540, "step": 6472, "time_per_iteration": 3.194182872772217 }, { "auxiliary_loss_clip": 0.0111467, "auxiliary_loss_mlp": 0.01046575, "balance_loss_clip": 1.04278111, "balance_loss_mlp": 1.02931309, "epoch": 0.7783322311068358, "flos": 46789986994560.0, "grad_norm": 1.9087504109489088, "language_loss": 0.77018487, "learning_rate": 4.935203213552621e-07, "loss": 0.79179728, "num_input_tokens_seen": 139454280, "step": 6473, "time_per_iteration": 2.836885929107666 }, { "auxiliary_loss_clip": 0.01107688, "auxiliary_loss_mlp": 0.01036049, "balance_loss_clip": 1.04164219, "balance_loss_mlp": 1.0198369, "epoch": 0.7784524739974749, "flos": 19057864872960.0, "grad_norm": 1.9791299074002129, "language_loss": 0.6702472, "learning_rate": 4.930080701821662e-07, "loss": 0.6916846, "num_input_tokens_seen": 139471745, "step": 6474, "time_per_iteration": 2.621753454208374 }, { "auxiliary_loss_clip": 0.01107782, "auxiliary_loss_mlp": 0.01040404, "balance_loss_clip": 1.04078066, "balance_loss_mlp": 1.02502656, "epoch": 0.778572716888114, "flos": 24791111320320.0, "grad_norm": 1.8517615583667053, "language_loss": 0.77116716, "learning_rate": 4.92496047618548e-07, "loss": 0.79264909, "num_input_tokens_seen": 139491505, "step": 6475, "time_per_iteration": 2.6855251789093018 }, { "auxiliary_loss_clip": 0.01125075, "auxiliary_loss_mlp": 0.01047065, "balance_loss_clip": 1.04533219, "balance_loss_mlp": 1.03018451, "epoch": 0.7786929597787531, "flos": 20078086867200.0, "grad_norm": 2.0699641405128477, "language_loss": 0.77763313, "learning_rate": 4.919842537420811e-07, "loss": 0.79935449, "num_input_tokens_seen": 139508620, "step": 6476, "time_per_iteration": 2.5925512313842773 }, { "auxiliary_loss_clip": 0.01107337, "auxiliary_loss_mlp": 0.0103412, "balance_loss_clip": 1.04115617, "balance_loss_mlp": 1.01868892, "epoch": 0.7788132026693921, "flos": 21872220318720.0, "grad_norm": 1.5999032927349757, "language_loss": 0.7938832, "learning_rate": 4.91472688630404e-07, "loss": 0.81529772, "num_input_tokens_seen": 139529360, "step": 6477, "time_per_iteration": 2.657766819000244 }, { "auxiliary_loss_clip": 0.01127445, "auxiliary_loss_mlp": 0.01034338, "balance_loss_clip": 1.04026222, "balance_loss_mlp": 1.01903188, "epoch": 0.7789334455600313, "flos": 11181937351680.0, "grad_norm": 1.7596235372081976, "language_loss": 0.73996723, "learning_rate": 4.909613523611202e-07, "loss": 0.76158506, "num_input_tokens_seen": 139546240, "step": 6478, "time_per_iteration": 2.5750818252563477 }, { "auxiliary_loss_clip": 0.0107872, "auxiliary_loss_mlp": 0.00773472, "balance_loss_clip": 1.03617537, "balance_loss_mlp": 1.0004468, "epoch": 0.7790536884506704, "flos": 28695427015680.0, "grad_norm": 2.0440408058161044, "language_loss": 0.74539769, "learning_rate": 4.904502450117991e-07, "loss": 0.76391965, "num_input_tokens_seen": 139567200, "step": 6479, "time_per_iteration": 2.7751057147979736 }, { "auxiliary_loss_clip": 0.0110522, "auxiliary_loss_mlp": 0.01044353, "balance_loss_clip": 1.04156375, "balance_loss_mlp": 1.02884436, "epoch": 0.7791739313413094, "flos": 11072302064640.0, "grad_norm": 2.2980140914498355, "language_loss": 0.72346187, "learning_rate": 4.899393666599762e-07, "loss": 0.74495757, "num_input_tokens_seen": 139583775, "step": 6480, "time_per_iteration": 3.5696709156036377 }, { "auxiliary_loss_clip": 0.01129444, "auxiliary_loss_mlp": 0.01037826, "balance_loss_clip": 1.04013801, "balance_loss_mlp": 1.02294898, "epoch": 0.7792941742319486, "flos": 14679276975360.0, "grad_norm": 2.4341915375097556, "language_loss": 0.72685921, "learning_rate": 4.894287173831506e-07, "loss": 0.74853194, "num_input_tokens_seen": 139599735, "step": 6481, "time_per_iteration": 2.556184768676758 }, { "auxiliary_loss_clip": 0.01109126, "auxiliary_loss_mlp": 0.01033556, "balance_loss_clip": 1.03823996, "balance_loss_mlp": 1.01728427, "epoch": 0.7794144171225876, "flos": 23258874908160.0, "grad_norm": 2.1032070121387547, "language_loss": 0.84547234, "learning_rate": 4.889182972587877e-07, "loss": 0.86689913, "num_input_tokens_seen": 139619030, "step": 6482, "time_per_iteration": 2.637718677520752 }, { "auxiliary_loss_clip": 0.0110602, "auxiliary_loss_mlp": 0.01048618, "balance_loss_clip": 1.04305005, "balance_loss_mlp": 1.03198791, "epoch": 0.7795346600132267, "flos": 21507080613120.0, "grad_norm": 1.8247845500788904, "language_loss": 0.65885842, "learning_rate": 4.884081063643177e-07, "loss": 0.68040478, "num_input_tokens_seen": 139637690, "step": 6483, "time_per_iteration": 2.635289430618286 }, { "auxiliary_loss_clip": 0.01014703, "auxiliary_loss_mlp": 0.01001073, "balance_loss_clip": 1.0096972, "balance_loss_mlp": 0.9994753, "epoch": 0.7796549029038659, "flos": 70052273694720.0, "grad_norm": 0.8594687419509685, "language_loss": 0.52442014, "learning_rate": 4.878981447771353e-07, "loss": 0.54457784, "num_input_tokens_seen": 139692070, "step": 6484, "time_per_iteration": 4.227233409881592 }, { "auxiliary_loss_clip": 0.0109403, "auxiliary_loss_mlp": 0.01045063, "balance_loss_clip": 1.04118323, "balance_loss_mlp": 1.02919614, "epoch": 0.7797751457945049, "flos": 23989405714560.0, "grad_norm": 1.4883212351205275, "language_loss": 0.73088741, "learning_rate": 4.873884125746035e-07, "loss": 0.75227839, "num_input_tokens_seen": 139713745, "step": 6485, "time_per_iteration": 2.6807711124420166 }, { "auxiliary_loss_clip": 0.01103572, "auxiliary_loss_mlp": 0.01038175, "balance_loss_clip": 1.04011381, "balance_loss_mlp": 1.02115202, "epoch": 0.779895388685144, "flos": 22674751937280.0, "grad_norm": 2.3399317993633666, "language_loss": 0.72206867, "learning_rate": 4.868789098340456e-07, "loss": 0.74348617, "num_input_tokens_seen": 139731650, "step": 6486, "time_per_iteration": 2.658111095428467 }, { "auxiliary_loss_clip": 0.01097609, "auxiliary_loss_mlp": 0.01038767, "balance_loss_clip": 1.03976631, "balance_loss_mlp": 1.02291203, "epoch": 0.7800156315757831, "flos": 23768698596480.0, "grad_norm": 3.921152125948475, "language_loss": 0.728387, "learning_rate": 4.863696366327543e-07, "loss": 0.74975073, "num_input_tokens_seen": 139750820, "step": 6487, "time_per_iteration": 2.702888011932373 }, { "auxiliary_loss_clip": 0.01119619, "auxiliary_loss_mlp": 0.01036551, "balance_loss_clip": 1.03984094, "balance_loss_mlp": 1.01988578, "epoch": 0.7801358744664222, "flos": 26429714881920.0, "grad_norm": 2.859471009731517, "language_loss": 0.7799778, "learning_rate": 4.85860593047986e-07, "loss": 0.80153948, "num_input_tokens_seen": 139770885, "step": 6488, "time_per_iteration": 3.530031442642212 }, { "auxiliary_loss_clip": 0.01089175, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.03654981, "balance_loss_mlp": 1.0232538, "epoch": 0.7802561173570612, "flos": 26322162583680.0, "grad_norm": 2.010531910998268, "language_loss": 0.74515718, "learning_rate": 4.853517791569613e-07, "loss": 0.76644111, "num_input_tokens_seen": 139793065, "step": 6489, "time_per_iteration": 2.7343623638153076 }, { "auxiliary_loss_clip": 0.01115853, "auxiliary_loss_mlp": 0.00773176, "balance_loss_clip": 1.04200077, "balance_loss_mlp": 1.00049448, "epoch": 0.7803763602477004, "flos": 40333751596800.0, "grad_norm": 1.7128658136530164, "language_loss": 0.66010058, "learning_rate": 4.848431950368684e-07, "loss": 0.6789909, "num_input_tokens_seen": 139815625, "step": 6490, "time_per_iteration": 2.7816126346588135 }, { "auxiliary_loss_clip": 0.0103676, "auxiliary_loss_mlp": 0.00755548, "balance_loss_clip": 1.0087285, "balance_loss_mlp": 1.00016928, "epoch": 0.7804966031383395, "flos": 67001448038400.0, "grad_norm": 0.7101822329599973, "language_loss": 0.55725503, "learning_rate": 4.843348407648569e-07, "loss": 0.57517815, "num_input_tokens_seen": 139876905, "step": 6491, "time_per_iteration": 3.138395309448242 }, { "auxiliary_loss_clip": 0.0111953, "auxiliary_loss_mlp": 0.01037723, "balance_loss_clip": 1.03828955, "balance_loss_mlp": 1.02141488, "epoch": 0.7806168460289785, "flos": 17740733057280.0, "grad_norm": 2.2862736814088844, "language_loss": 0.82865483, "learning_rate": 4.838267164180457e-07, "loss": 0.85022736, "num_input_tokens_seen": 139892575, "step": 6492, "time_per_iteration": 2.5674540996551514 }, { "auxiliary_loss_clip": 0.01136164, "auxiliary_loss_mlp": 0.01037206, "balance_loss_clip": 1.04301572, "balance_loss_mlp": 1.0208149, "epoch": 0.7807370889196176, "flos": 23946240545280.0, "grad_norm": 3.2271431469165957, "language_loss": 0.83834445, "learning_rate": 4.833188220735156e-07, "loss": 0.86007822, "num_input_tokens_seen": 139912245, "step": 6493, "time_per_iteration": 2.61625599861145 }, { "auxiliary_loss_clip": 0.01116765, "auxiliary_loss_mlp": 0.0103158, "balance_loss_clip": 1.04137135, "balance_loss_mlp": 1.01697648, "epoch": 0.7808573318102567, "flos": 18989024457600.0, "grad_norm": 3.056054064605557, "language_loss": 0.74905145, "learning_rate": 4.828111578083152e-07, "loss": 0.77053487, "num_input_tokens_seen": 139929150, "step": 6494, "time_per_iteration": 2.601327896118164 }, { "auxiliary_loss_clip": 0.01107973, "auxiliary_loss_mlp": 0.01037507, "balance_loss_clip": 1.04228544, "balance_loss_mlp": 1.02254033, "epoch": 0.7809775747008958, "flos": 23980750536960.0, "grad_norm": 2.567859843007373, "language_loss": 0.81265724, "learning_rate": 4.823037236994556e-07, "loss": 0.83411205, "num_input_tokens_seen": 139947315, "step": 6495, "time_per_iteration": 2.6928112506866455 }, { "auxiliary_loss_clip": 0.01029247, "auxiliary_loss_mlp": 0.01000607, "balance_loss_clip": 1.01004314, "balance_loss_mlp": 0.99896222, "epoch": 0.7810978175915348, "flos": 68535875180160.0, "grad_norm": 0.7174601669376647, "language_loss": 0.56323647, "learning_rate": 4.817965198239136e-07, "loss": 0.58353496, "num_input_tokens_seen": 140013775, "step": 6496, "time_per_iteration": 3.168968439102173 }, { "auxiliary_loss_clip": 0.01094754, "auxiliary_loss_mlp": 0.01046021, "balance_loss_clip": 1.03792787, "balance_loss_mlp": 1.02875924, "epoch": 0.781218060482174, "flos": 19642131498240.0, "grad_norm": 2.4323933214099984, "language_loss": 0.74311268, "learning_rate": 4.812895462586331e-07, "loss": 0.76452041, "num_input_tokens_seen": 140031600, "step": 6497, "time_per_iteration": 2.664963483810425 }, { "auxiliary_loss_clip": 0.01095956, "auxiliary_loss_mlp": 0.01036763, "balance_loss_clip": 1.04037547, "balance_loss_mlp": 1.0224402, "epoch": 0.7813383033728131, "flos": 25627865621760.0, "grad_norm": 1.7407510193564508, "language_loss": 0.81701571, "learning_rate": 4.807828030805207e-07, "loss": 0.83834291, "num_input_tokens_seen": 140050590, "step": 6498, "time_per_iteration": 2.705953598022461 }, { "auxiliary_loss_clip": 0.01118474, "auxiliary_loss_mlp": 0.01037895, "balance_loss_clip": 1.04315853, "balance_loss_mlp": 1.02260017, "epoch": 0.7814585462634521, "flos": 20485924865280.0, "grad_norm": 2.5583412086187725, "language_loss": 0.67909622, "learning_rate": 4.802762903664495e-07, "loss": 0.70065987, "num_input_tokens_seen": 140069770, "step": 6499, "time_per_iteration": 2.639960527420044 }, { "auxiliary_loss_clip": 0.01112987, "auxiliary_loss_mlp": 0.01041645, "balance_loss_clip": 1.04142332, "balance_loss_mlp": 1.02489662, "epoch": 0.7815787891540913, "flos": 22304297018880.0, "grad_norm": 2.3153253749647447, "language_loss": 0.73736906, "learning_rate": 4.797700081932565e-07, "loss": 0.75891531, "num_input_tokens_seen": 140087635, "step": 6500, "time_per_iteration": 2.6303999423980713 }, { "auxiliary_loss_clip": 0.01068677, "auxiliary_loss_mlp": 0.01041543, "balance_loss_clip": 1.03565192, "balance_loss_mlp": 1.02503228, "epoch": 0.7816990320447303, "flos": 22600668136320.0, "grad_norm": 2.2104432204867823, "language_loss": 0.8198998, "learning_rate": 4.792639566377442e-07, "loss": 0.84100205, "num_input_tokens_seen": 140105045, "step": 6501, "time_per_iteration": 2.7695062160491943 }, { "auxiliary_loss_clip": 0.01112469, "auxiliary_loss_mlp": 0.01047362, "balance_loss_clip": 1.03815174, "balance_loss_mlp": 1.03092277, "epoch": 0.7818192749353694, "flos": 24935974871040.0, "grad_norm": 1.888721767086285, "language_loss": 0.77543348, "learning_rate": 4.78758135776681e-07, "loss": 0.79703182, "num_input_tokens_seen": 140124900, "step": 6502, "time_per_iteration": 2.63047456741333 }, { "auxiliary_loss_clip": 0.01111916, "auxiliary_loss_mlp": 0.01043676, "balance_loss_clip": 1.04211843, "balance_loss_mlp": 1.02693892, "epoch": 0.7819395178260086, "flos": 23733039369600.0, "grad_norm": 5.58278289418435, "language_loss": 0.78713924, "learning_rate": 4.782525456867989e-07, "loss": 0.80869514, "num_input_tokens_seen": 140143755, "step": 6503, "time_per_iteration": 2.6929473876953125 }, { "auxiliary_loss_clip": 0.01100439, "auxiliary_loss_mlp": 0.0104251, "balance_loss_clip": 1.03986192, "balance_loss_mlp": 1.02549887, "epoch": 0.7820597607166476, "flos": 23221671396480.0, "grad_norm": 1.7763852248319334, "language_loss": 0.8328011, "learning_rate": 4.777471864447959e-07, "loss": 0.85423058, "num_input_tokens_seen": 140164495, "step": 6504, "time_per_iteration": 2.696544885635376 }, { "auxiliary_loss_clip": 0.01106678, "auxiliary_loss_mlp": 0.0103391, "balance_loss_clip": 1.037462, "balance_loss_mlp": 1.01824594, "epoch": 0.7821800036072867, "flos": 22309540404480.0, "grad_norm": 2.14371080921954, "language_loss": 0.80799687, "learning_rate": 4.772420581273344e-07, "loss": 0.8294028, "num_input_tokens_seen": 140181980, "step": 6505, "time_per_iteration": 2.6295700073242188 }, { "auxiliary_loss_clip": 0.01115679, "auxiliary_loss_mlp": 0.01041865, "balance_loss_clip": 1.0404017, "balance_loss_mlp": 1.02585554, "epoch": 0.7823002464979258, "flos": 21544176384000.0, "grad_norm": 90.39490501196853, "language_loss": 0.76324195, "learning_rate": 4.7673716081104134e-07, "loss": 0.78481734, "num_input_tokens_seen": 140202155, "step": 6506, "time_per_iteration": 3.5406055450439453 }, { "auxiliary_loss_clip": 0.01118836, "auxiliary_loss_mlp": 0.01041159, "balance_loss_clip": 1.04273105, "balance_loss_mlp": 1.02607894, "epoch": 0.7824204893885649, "flos": 24535642815360.0, "grad_norm": 1.677368851828284, "language_loss": 0.84339964, "learning_rate": 4.762324945725109e-07, "loss": 0.86499953, "num_input_tokens_seen": 140221600, "step": 6507, "time_per_iteration": 2.656594753265381 }, { "auxiliary_loss_clip": 0.01104987, "auxiliary_loss_mlp": 0.01042325, "balance_loss_clip": 1.04277408, "balance_loss_mlp": 1.0268991, "epoch": 0.782540732279204, "flos": 27415211402880.0, "grad_norm": 1.9298060092898772, "language_loss": 0.75557178, "learning_rate": 4.7572805948829844e-07, "loss": 0.77704489, "num_input_tokens_seen": 140241860, "step": 6508, "time_per_iteration": 2.682560443878174 }, { "auxiliary_loss_clip": 0.0108736, "auxiliary_loss_mlp": 0.01038676, "balance_loss_clip": 1.03794789, "balance_loss_mlp": 1.02271378, "epoch": 0.7826609751698431, "flos": 24353216616960.0, "grad_norm": 1.7105671064943866, "language_loss": 0.70992136, "learning_rate": 4.7522385563492795e-07, "loss": 0.73118174, "num_input_tokens_seen": 140262160, "step": 6509, "time_per_iteration": 2.7441518306732178 }, { "auxiliary_loss_clip": 0.01097459, "auxiliary_loss_mlp": 0.0104076, "balance_loss_clip": 1.04057288, "balance_loss_mlp": 1.02504849, "epoch": 0.7827812180604822, "flos": 23988543788160.0, "grad_norm": 2.094157870201537, "language_loss": 0.70396918, "learning_rate": 4.747198830888863e-07, "loss": 0.72535139, "num_input_tokens_seen": 140282030, "step": 6510, "time_per_iteration": 3.6448893547058105 }, { "auxiliary_loss_clip": 0.01103892, "auxiliary_loss_mlp": 0.0103261, "balance_loss_clip": 1.03985226, "balance_loss_mlp": 1.01768517, "epoch": 0.7829014609511212, "flos": 27454318335360.0, "grad_norm": 10.114007311547565, "language_loss": 0.68596375, "learning_rate": 4.742161419266251e-07, "loss": 0.70732874, "num_input_tokens_seen": 140301190, "step": 6511, "time_per_iteration": 2.728287935256958 }, { "auxiliary_loss_clip": 0.01129595, "auxiliary_loss_mlp": 0.01039525, "balance_loss_clip": 1.0470084, "balance_loss_mlp": 1.02185798, "epoch": 0.7830217038417604, "flos": 29204532432000.0, "grad_norm": 2.9889807767903016, "language_loss": 0.64993125, "learning_rate": 4.7371263222456304e-07, "loss": 0.67162246, "num_input_tokens_seen": 140318510, "step": 6512, "time_per_iteration": 2.6593432426452637 }, { "auxiliary_loss_clip": 0.0102839, "auxiliary_loss_mlp": 0.01000601, "balance_loss_clip": 1.01019311, "balance_loss_mlp": 0.99911672, "epoch": 0.7831419467323995, "flos": 60950895822720.0, "grad_norm": 0.8005743105913518, "language_loss": 0.61318737, "learning_rate": 4.7320935405908004e-07, "loss": 0.63347727, "num_input_tokens_seen": 140379380, "step": 6513, "time_per_iteration": 3.1685304641723633 }, { "auxiliary_loss_clip": 0.01137456, "auxiliary_loss_mlp": 0.01037028, "balance_loss_clip": 1.04298806, "balance_loss_mlp": 1.02066088, "epoch": 0.7832621896230385, "flos": 19682531320320.0, "grad_norm": 2.8146467434754285, "language_loss": 0.84283543, "learning_rate": 4.7270630750652475e-07, "loss": 0.86458021, "num_input_tokens_seen": 140395335, "step": 6514, "time_per_iteration": 3.4714267253875732 }, { "auxiliary_loss_clip": 0.01118083, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.04081917, "balance_loss_mlp": 1.02075732, "epoch": 0.7833824325136777, "flos": 25009232659200.0, "grad_norm": 1.7995623405411356, "language_loss": 0.80446082, "learning_rate": 4.7220349264320746e-07, "loss": 0.82599914, "num_input_tokens_seen": 140414420, "step": 6515, "time_per_iteration": 2.630929470062256 }, { "auxiliary_loss_clip": 0.01028073, "auxiliary_loss_mlp": 0.01003316, "balance_loss_clip": 1.00950503, "balance_loss_mlp": 1.00183773, "epoch": 0.7835026754043167, "flos": 68800142517120.0, "grad_norm": 0.7353966330951878, "language_loss": 0.54866976, "learning_rate": 4.71700909545407e-07, "loss": 0.56898367, "num_input_tokens_seen": 140477365, "step": 6516, "time_per_iteration": 3.1927058696746826 }, { "auxiliary_loss_clip": 0.01123111, "auxiliary_loss_mlp": 0.01037713, "balance_loss_clip": 1.0439285, "balance_loss_mlp": 1.02157164, "epoch": 0.7836229182949558, "flos": 19864598382720.0, "grad_norm": 2.7015417993891404, "language_loss": 0.77007806, "learning_rate": 4.711985582893627e-07, "loss": 0.7916863, "num_input_tokens_seen": 140495885, "step": 6517, "time_per_iteration": 2.603255033493042 }, { "auxiliary_loss_clip": 0.01085208, "auxiliary_loss_mlp": 0.01039896, "balance_loss_clip": 1.03591824, "balance_loss_mlp": 1.02145445, "epoch": 0.783743161185595, "flos": 22965843755520.0, "grad_norm": 1.8145055505010539, "language_loss": 0.71704197, "learning_rate": 4.706964389512811e-07, "loss": 0.73829305, "num_input_tokens_seen": 140515920, "step": 6518, "time_per_iteration": 2.71937894821167 }, { "auxiliary_loss_clip": 0.01134061, "auxiliary_loss_mlp": 0.01041521, "balance_loss_clip": 1.04611897, "balance_loss_mlp": 1.02579701, "epoch": 0.783863404076234, "flos": 12458489777280.0, "grad_norm": 4.125413670417761, "language_loss": 0.87568259, "learning_rate": 4.701945516073345e-07, "loss": 0.89743835, "num_input_tokens_seen": 140533395, "step": 6519, "time_per_iteration": 2.5426108837127686 }, { "auxiliary_loss_clip": 0.01092687, "auxiliary_loss_mlp": 0.01033438, "balance_loss_clip": 1.03872037, "balance_loss_mlp": 1.01826251, "epoch": 0.7839836469668731, "flos": 24243940465920.0, "grad_norm": 1.888466531530309, "language_loss": 0.75291997, "learning_rate": 4.696928963336577e-07, "loss": 0.77418125, "num_input_tokens_seen": 140552825, "step": 6520, "time_per_iteration": 2.6973397731781006 }, { "auxiliary_loss_clip": 0.01027691, "auxiliary_loss_mlp": 0.01004835, "balance_loss_clip": 1.00962067, "balance_loss_mlp": 1.00331521, "epoch": 0.7841038898575122, "flos": 62121978938880.0, "grad_norm": 0.8500605075760415, "language_loss": 0.61015737, "learning_rate": 4.6919147320635224e-07, "loss": 0.63048267, "num_input_tokens_seen": 140615535, "step": 6521, "time_per_iteration": 3.1606225967407227 }, { "auxiliary_loss_clip": 0.0112154, "auxiliary_loss_mlp": 0.01040862, "balance_loss_clip": 1.04309225, "balance_loss_mlp": 1.02441072, "epoch": 0.7842241327481513, "flos": 20193899293440.0, "grad_norm": 2.488671075973803, "language_loss": 0.73314005, "learning_rate": 4.6869028230148286e-07, "loss": 0.75476408, "num_input_tokens_seen": 140633330, "step": 6522, "time_per_iteration": 2.6716933250427246 }, { "auxiliary_loss_clip": 0.01093253, "auxiliary_loss_mlp": 0.01048736, "balance_loss_clip": 1.03872538, "balance_loss_mlp": 1.03183222, "epoch": 0.7843443756387903, "flos": 28074531496320.0, "grad_norm": 3.382046998399303, "language_loss": 0.60426474, "learning_rate": 4.6818932369507957e-07, "loss": 0.62568462, "num_input_tokens_seen": 140652830, "step": 6523, "time_per_iteration": 2.731539249420166 }, { "auxiliary_loss_clip": 0.01122194, "auxiliary_loss_mlp": 0.01043031, "balance_loss_clip": 1.043221, "balance_loss_mlp": 1.02643692, "epoch": 0.7844646185294295, "flos": 21323397438720.0, "grad_norm": 2.2346195825818977, "language_loss": 0.89338928, "learning_rate": 4.676885974631386e-07, "loss": 0.91504151, "num_input_tokens_seen": 140671190, "step": 6524, "time_per_iteration": 2.6053123474121094 }, { "auxiliary_loss_clip": 0.01121967, "auxiliary_loss_mlp": 0.01037866, "balance_loss_clip": 1.04339445, "balance_loss_mlp": 1.02213657, "epoch": 0.7845848614200686, "flos": 23656585271040.0, "grad_norm": 3.8862664608362265, "language_loss": 0.81227905, "learning_rate": 4.67188103681619e-07, "loss": 0.83387738, "num_input_tokens_seen": 140690975, "step": 6525, "time_per_iteration": 2.7001724243164062 }, { "auxiliary_loss_clip": 0.01112741, "auxiliary_loss_mlp": 0.0077207, "balance_loss_clip": 1.04236913, "balance_loss_mlp": 1.00039458, "epoch": 0.7847051043107076, "flos": 23402194174080.0, "grad_norm": 2.1828096136523407, "language_loss": 0.69402957, "learning_rate": 4.666878424264453e-07, "loss": 0.71287763, "num_input_tokens_seen": 140710930, "step": 6526, "time_per_iteration": 2.6035315990448 }, { "auxiliary_loss_clip": 0.01100003, "auxiliary_loss_mlp": 0.01029842, "balance_loss_clip": 1.041628, "balance_loss_mlp": 1.01585317, "epoch": 0.7848253472013467, "flos": 19022277473280.0, "grad_norm": 1.8502716967134951, "language_loss": 0.73846251, "learning_rate": 4.661878137735069e-07, "loss": 0.75976092, "num_input_tokens_seen": 140729120, "step": 6527, "time_per_iteration": 2.637688636779785 }, { "auxiliary_loss_clip": 0.01109888, "auxiliary_loss_mlp": 0.01036095, "balance_loss_clip": 1.04136157, "balance_loss_mlp": 1.02082443, "epoch": 0.7849455900919858, "flos": 21179180332800.0, "grad_norm": 2.20645305705558, "language_loss": 0.74813199, "learning_rate": 4.656880177986571e-07, "loss": 0.76959181, "num_input_tokens_seen": 140747665, "step": 6528, "time_per_iteration": 2.633451223373413 }, { "auxiliary_loss_clip": 0.01110991, "auxiliary_loss_mlp": 0.01036022, "balance_loss_clip": 1.03981209, "balance_loss_mlp": 1.01973248, "epoch": 0.7850658329826249, "flos": 19536482620800.0, "grad_norm": 2.302180103148286, "language_loss": 0.81762683, "learning_rate": 4.6518845457771607e-07, "loss": 0.8390969, "num_input_tokens_seen": 140766525, "step": 6529, "time_per_iteration": 2.61690616607666 }, { "auxiliary_loss_clip": 0.01114914, "auxiliary_loss_mlp": 0.00772916, "balance_loss_clip": 1.04113555, "balance_loss_mlp": 1.00041676, "epoch": 0.7851860758732639, "flos": 12495334152960.0, "grad_norm": 1.8820111905468302, "language_loss": 0.79068434, "learning_rate": 4.646891241864652e-07, "loss": 0.80956268, "num_input_tokens_seen": 140785090, "step": 6530, "time_per_iteration": 2.6090803146362305 }, { "auxiliary_loss_clip": 0.01120869, "auxiliary_loss_mlp": 0.01060993, "balance_loss_clip": 1.04266953, "balance_loss_mlp": 1.04293323, "epoch": 0.7853063187639031, "flos": 22960959505920.0, "grad_norm": 7.473506064493927, "language_loss": 0.73214334, "learning_rate": 4.6419002670065397e-07, "loss": 0.75396198, "num_input_tokens_seen": 140804670, "step": 6531, "time_per_iteration": 2.5965797901153564 }, { "auxiliary_loss_clip": 0.01102409, "auxiliary_loss_mlp": 0.01041145, "balance_loss_clip": 1.0438025, "balance_loss_mlp": 1.02482533, "epoch": 0.7854265616545422, "flos": 17347260499200.0, "grad_norm": 2.1247985109038003, "language_loss": 0.86576837, "learning_rate": 4.6369116219599445e-07, "loss": 0.88720393, "num_input_tokens_seen": 140820655, "step": 6532, "time_per_iteration": 4.414429187774658 }, { "auxiliary_loss_clip": 0.01094734, "auxiliary_loss_mlp": 0.01041485, "balance_loss_clip": 1.03886449, "balance_loss_mlp": 1.02580333, "epoch": 0.7855468045451812, "flos": 23838293197440.0, "grad_norm": 1.9847809722544674, "language_loss": 0.79241359, "learning_rate": 4.631925307481637e-07, "loss": 0.81377578, "num_input_tokens_seen": 140840470, "step": 6533, "time_per_iteration": 2.6848092079162598 }, { "auxiliary_loss_clip": 0.01108984, "auxiliary_loss_mlp": 0.01035098, "balance_loss_clip": 1.04344678, "balance_loss_mlp": 1.01984477, "epoch": 0.7856670474358204, "flos": 25666792986240.0, "grad_norm": 1.9741187165360095, "language_loss": 0.75679934, "learning_rate": 4.6269413243280533e-07, "loss": 0.77824008, "num_input_tokens_seen": 140859890, "step": 6534, "time_per_iteration": 2.6827049255371094 }, { "auxiliary_loss_clip": 0.01113664, "auxiliary_loss_mlp": 0.01045013, "balance_loss_clip": 1.04324698, "balance_loss_mlp": 1.02822852, "epoch": 0.7857872903264594, "flos": 18144656472960.0, "grad_norm": 2.4082966655237206, "language_loss": 0.7483685, "learning_rate": 4.621959673255236e-07, "loss": 0.76995534, "num_input_tokens_seen": 140876190, "step": 6535, "time_per_iteration": 2.6180667877197266 }, { "auxiliary_loss_clip": 0.01080539, "auxiliary_loss_mlp": 0.01043917, "balance_loss_clip": 1.03871274, "balance_loss_mlp": 1.02803886, "epoch": 0.7859075332170985, "flos": 14386138081920.0, "grad_norm": 2.2572651075086427, "language_loss": 0.9073019, "learning_rate": 4.6169803550189135e-07, "loss": 0.92854643, "num_input_tokens_seen": 140891885, "step": 6536, "time_per_iteration": 3.658766269683838 }, { "auxiliary_loss_clip": 0.01074618, "auxiliary_loss_mlp": 0.01039446, "balance_loss_clip": 1.03580785, "balance_loss_mlp": 1.02171946, "epoch": 0.7860277761077377, "flos": 19864059678720.0, "grad_norm": 1.9542900906342855, "language_loss": 0.77871281, "learning_rate": 4.6120033703744355e-07, "loss": 0.79985344, "num_input_tokens_seen": 140910780, "step": 6537, "time_per_iteration": 2.7742581367492676 }, { "auxiliary_loss_clip": 0.0110071, "auxiliary_loss_mlp": 0.0103953, "balance_loss_clip": 1.04042792, "balance_loss_mlp": 1.02352035, "epoch": 0.7861480189983767, "flos": 26396174557440.0, "grad_norm": 1.8500959340708525, "language_loss": 0.78463203, "learning_rate": 4.607028720076822e-07, "loss": 0.80603445, "num_input_tokens_seen": 140927460, "step": 6538, "time_per_iteration": 2.679253578186035 }, { "auxiliary_loss_clip": 0.01121799, "auxiliary_loss_mlp": 0.01040605, "balance_loss_clip": 1.04367352, "balance_loss_mlp": 1.02559102, "epoch": 0.7862682618890158, "flos": 24236578177920.0, "grad_norm": 1.8998718208924603, "language_loss": 0.73767626, "learning_rate": 4.6020564048807074e-07, "loss": 0.75930035, "num_input_tokens_seen": 140945135, "step": 6539, "time_per_iteration": 3.5637388229370117 }, { "auxiliary_loss_clip": 0.01120211, "auxiliary_loss_mlp": 0.01033907, "balance_loss_clip": 1.04089665, "balance_loss_mlp": 1.0184691, "epoch": 0.7863885047796549, "flos": 47551508259840.0, "grad_norm": 4.620309406185398, "language_loss": 0.72022855, "learning_rate": 4.5970864255403883e-07, "loss": 0.74176967, "num_input_tokens_seen": 140966660, "step": 6540, "time_per_iteration": 2.841567039489746 }, { "auxiliary_loss_clip": 0.01112285, "auxiliary_loss_mlp": 0.01033665, "balance_loss_clip": 1.0420624, "balance_loss_mlp": 1.01839995, "epoch": 0.786508747670294, "flos": 24389234979840.0, "grad_norm": 1.8558073066057685, "language_loss": 0.82311922, "learning_rate": 4.59211878280982e-07, "loss": 0.84457868, "num_input_tokens_seen": 140986175, "step": 6541, "time_per_iteration": 2.619187116622925 }, { "auxiliary_loss_clip": 0.01108555, "auxiliary_loss_mlp": 0.01036572, "balance_loss_clip": 1.04073548, "balance_loss_mlp": 1.02105665, "epoch": 0.786628990560933, "flos": 18041234238720.0, "grad_norm": 2.423597253229396, "language_loss": 0.69963574, "learning_rate": 4.587153477442578e-07, "loss": 0.72108698, "num_input_tokens_seen": 141002490, "step": 6542, "time_per_iteration": 2.6245999336242676 }, { "auxiliary_loss_clip": 0.01136998, "auxiliary_loss_mlp": 0.01036663, "balance_loss_clip": 1.04447258, "balance_loss_mlp": 1.01921082, "epoch": 0.7867492334515722, "flos": 25848860048640.0, "grad_norm": 3.2853978353888484, "language_loss": 0.8148042, "learning_rate": 4.582190510191899e-07, "loss": 0.83654082, "num_input_tokens_seen": 141021150, "step": 6543, "time_per_iteration": 2.6195244789123535 }, { "auxiliary_loss_clip": 0.01091802, "auxiliary_loss_mlp": 0.01039337, "balance_loss_clip": 1.04048848, "balance_loss_mlp": 1.0237205, "epoch": 0.7868694763422113, "flos": 16580819070720.0, "grad_norm": 1.967717114835771, "language_loss": 0.87406445, "learning_rate": 4.5772298818106625e-07, "loss": 0.89537585, "num_input_tokens_seen": 141036940, "step": 6544, "time_per_iteration": 2.8061277866363525 }, { "auxiliary_loss_clip": 0.01101175, "auxiliary_loss_mlp": 0.01044492, "balance_loss_clip": 1.04233432, "balance_loss_mlp": 1.02870893, "epoch": 0.7869897192328503, "flos": 29386276272000.0, "grad_norm": 2.767449071073894, "language_loss": 0.71938312, "learning_rate": 4.572271593051384e-07, "loss": 0.74083978, "num_input_tokens_seen": 141054295, "step": 6545, "time_per_iteration": 2.7731833457946777 }, { "auxiliary_loss_clip": 0.01077364, "auxiliary_loss_mlp": 0.01051303, "balance_loss_clip": 1.03789878, "balance_loss_mlp": 1.03404105, "epoch": 0.7871099621234895, "flos": 17128923678720.0, "grad_norm": 1.7454029999804166, "language_loss": 0.78110814, "learning_rate": 4.567315644666245e-07, "loss": 0.80239475, "num_input_tokens_seen": 141073090, "step": 6546, "time_per_iteration": 2.6479616165161133 }, { "auxiliary_loss_clip": 0.01086865, "auxiliary_loss_mlp": 0.01050061, "balance_loss_clip": 1.0400548, "balance_loss_mlp": 1.03301406, "epoch": 0.7872302050141285, "flos": 23440187784960.0, "grad_norm": 2.0457961242414378, "language_loss": 0.85171974, "learning_rate": 4.5623620374070507e-07, "loss": 0.87308896, "num_input_tokens_seen": 141092405, "step": 6547, "time_per_iteration": 2.666855573654175 }, { "auxiliary_loss_clip": 0.01007268, "auxiliary_loss_mlp": 0.01001465, "balance_loss_clip": 1.00970483, "balance_loss_mlp": 0.99971247, "epoch": 0.7873504479047676, "flos": 65959752689280.0, "grad_norm": 0.7613136848168136, "language_loss": 0.58324623, "learning_rate": 4.557410772025263e-07, "loss": 0.60333359, "num_input_tokens_seen": 141154355, "step": 6548, "time_per_iteration": 3.3961682319641113 }, { "auxiliary_loss_clip": 0.01104299, "auxiliary_loss_mlp": 0.01036776, "balance_loss_clip": 1.03993702, "balance_loss_mlp": 1.02010489, "epoch": 0.7874706907954068, "flos": 23258336204160.0, "grad_norm": 2.56073157181358, "language_loss": 0.66334522, "learning_rate": 4.5524618492719803e-07, "loss": 0.68475604, "num_input_tokens_seen": 141173575, "step": 6549, "time_per_iteration": 2.6733245849609375 }, { "auxiliary_loss_clip": 0.01118724, "auxiliary_loss_mlp": 0.01031811, "balance_loss_clip": 1.04111779, "balance_loss_mlp": 1.01656413, "epoch": 0.7875909336860458, "flos": 28767786963840.0, "grad_norm": 1.966461632360591, "language_loss": 0.7903344, "learning_rate": 4.54751526989795e-07, "loss": 0.81183976, "num_input_tokens_seen": 141195415, "step": 6550, "time_per_iteration": 2.6745553016662598 }, { "auxiliary_loss_clip": 0.01124132, "auxiliary_loss_mlp": 0.01038332, "balance_loss_clip": 1.04184747, "balance_loss_mlp": 1.02275097, "epoch": 0.7877111765766849, "flos": 18697286194560.0, "grad_norm": 2.2030181294767806, "language_loss": 0.7925334, "learning_rate": 4.5425710346535775e-07, "loss": 0.81415808, "num_input_tokens_seen": 141213360, "step": 6551, "time_per_iteration": 2.6032896041870117 }, { "auxiliary_loss_clip": 0.01121703, "auxiliary_loss_mlp": 0.01033453, "balance_loss_clip": 1.04189157, "balance_loss_mlp": 1.01778328, "epoch": 0.787831419467324, "flos": 27592968833280.0, "grad_norm": 2.1967237744748105, "language_loss": 0.81860203, "learning_rate": 4.537629144288877e-07, "loss": 0.84015357, "num_input_tokens_seen": 141230815, "step": 6552, "time_per_iteration": 2.658684015274048 }, { "auxiliary_loss_clip": 0.0108861, "auxiliary_loss_mlp": 0.01041733, "balance_loss_clip": 1.03830194, "balance_loss_mlp": 1.026021, "epoch": 0.7879516623579631, "flos": 18150187167360.0, "grad_norm": 2.241781127329345, "language_loss": 0.75215632, "learning_rate": 4.5326895995535477e-07, "loss": 0.77345979, "num_input_tokens_seen": 141249715, "step": 6553, "time_per_iteration": 2.785437822341919 }, { "auxiliary_loss_clip": 0.01118103, "auxiliary_loss_mlp": 0.01041106, "balance_loss_clip": 1.04110169, "balance_loss_mlp": 1.02510846, "epoch": 0.7880719052486022, "flos": 20339193807360.0, "grad_norm": 2.288091617743246, "language_loss": 0.84230185, "learning_rate": 4.527752401196907e-07, "loss": 0.86389393, "num_input_tokens_seen": 141267730, "step": 6554, "time_per_iteration": 2.6694107055664062 }, { "auxiliary_loss_clip": 0.0110112, "auxiliary_loss_mlp": 0.01046112, "balance_loss_clip": 1.03925323, "balance_loss_mlp": 1.02882671, "epoch": 0.7881921481392413, "flos": 21653237053440.0, "grad_norm": 1.9589815223966032, "language_loss": 0.66975212, "learning_rate": 4.5228175499679254e-07, "loss": 0.69122446, "num_input_tokens_seen": 141287315, "step": 6555, "time_per_iteration": 2.6490375995635986 }, { "auxiliary_loss_clip": 0.01027665, "auxiliary_loss_mlp": 0.01003919, "balance_loss_clip": 1.0090661, "balance_loss_mlp": 1.00241697, "epoch": 0.7883123910298804, "flos": 68565860058240.0, "grad_norm": 0.8224598248455818, "language_loss": 0.54554933, "learning_rate": 4.5178850466152174e-07, "loss": 0.56586516, "num_input_tokens_seen": 141346145, "step": 6556, "time_per_iteration": 3.2532360553741455 }, { "auxiliary_loss_clip": 0.01104737, "auxiliary_loss_mlp": 0.01039602, "balance_loss_clip": 1.04106879, "balance_loss_mlp": 1.02378297, "epoch": 0.7884326339205194, "flos": 19318217627520.0, "grad_norm": 2.041997808423815, "language_loss": 0.81791222, "learning_rate": 4.512954891887031e-07, "loss": 0.83935559, "num_input_tokens_seen": 141364445, "step": 6557, "time_per_iteration": 3.6293137073516846 }, { "auxiliary_loss_clip": 0.011008, "auxiliary_loss_mlp": 0.01041502, "balance_loss_clip": 1.04033613, "balance_loss_mlp": 1.02343011, "epoch": 0.7885528768111585, "flos": 17784903807360.0, "grad_norm": 2.449644026477495, "language_loss": 0.83685625, "learning_rate": 4.5080270865312806e-07, "loss": 0.85827929, "num_input_tokens_seen": 141381640, "step": 6558, "time_per_iteration": 2.6222381591796875 }, { "auxiliary_loss_clip": 0.01122076, "auxiliary_loss_mlp": 0.01039372, "balance_loss_clip": 1.04236269, "balance_loss_mlp": 1.02327907, "epoch": 0.7886731197017977, "flos": 18807639753600.0, "grad_norm": 3.085335812275234, "language_loss": 0.71298957, "learning_rate": 4.5031016312954985e-07, "loss": 0.734604, "num_input_tokens_seen": 141399955, "step": 6559, "time_per_iteration": 3.5506486892700195 }, { "auxiliary_loss_clip": 0.01130695, "auxiliary_loss_mlp": 0.01042218, "balance_loss_clip": 1.0465616, "balance_loss_mlp": 1.02518296, "epoch": 0.7887933625924367, "flos": 33365358126720.0, "grad_norm": 1.8861268097605166, "language_loss": 0.74543512, "learning_rate": 4.498178526926886e-07, "loss": 0.76716423, "num_input_tokens_seen": 141420820, "step": 6560, "time_per_iteration": 2.722646474838257 }, { "auxiliary_loss_clip": 0.01133035, "auxiliary_loss_mlp": 0.01039986, "balance_loss_clip": 1.04354429, "balance_loss_mlp": 1.0249294, "epoch": 0.7889136054830758, "flos": 17019360218880.0, "grad_norm": 2.016212648961006, "language_loss": 0.72586405, "learning_rate": 4.4932577741722635e-07, "loss": 0.74759424, "num_input_tokens_seen": 141439350, "step": 6561, "time_per_iteration": 3.5398128032684326 }, { "auxiliary_loss_clip": 0.01105745, "auxiliary_loss_mlp": 0.01043483, "balance_loss_clip": 1.04042959, "balance_loss_mlp": 1.02616239, "epoch": 0.7890338483737149, "flos": 29424629018880.0, "grad_norm": 2.299986070642068, "language_loss": 0.74258852, "learning_rate": 4.4883393737780985e-07, "loss": 0.76408076, "num_input_tokens_seen": 141460300, "step": 6562, "time_per_iteration": 2.7383899688720703 }, { "auxiliary_loss_clip": 0.011147, "auxiliary_loss_mlp": 0.01044563, "balance_loss_clip": 1.04001987, "balance_loss_mlp": 1.02752852, "epoch": 0.789154091264354, "flos": 19971576063360.0, "grad_norm": 1.8658191269173499, "language_loss": 0.78499472, "learning_rate": 4.4834233264905254e-07, "loss": 0.80658746, "num_input_tokens_seen": 141477315, "step": 6563, "time_per_iteration": 2.583319664001465 }, { "auxiliary_loss_clip": 0.01089652, "auxiliary_loss_mlp": 0.01040354, "balance_loss_clip": 1.03835332, "balance_loss_mlp": 1.02376032, "epoch": 0.789274334154993, "flos": 14537825216640.0, "grad_norm": 3.7527919102916054, "language_loss": 0.71854371, "learning_rate": 4.478509633055294e-07, "loss": 0.73984373, "num_input_tokens_seen": 141495025, "step": 6564, "time_per_iteration": 2.6395487785339355 }, { "auxiliary_loss_clip": 0.01113394, "auxiliary_loss_mlp": 0.01046275, "balance_loss_clip": 1.0423131, "balance_loss_mlp": 1.02933502, "epoch": 0.7893945770456322, "flos": 21827403123840.0, "grad_norm": 4.177310507956882, "language_loss": 0.80167115, "learning_rate": 4.473598294217813e-07, "loss": 0.82326782, "num_input_tokens_seen": 141510450, "step": 6565, "time_per_iteration": 3.5535471439361572 }, { "auxiliary_loss_clip": 0.01117077, "auxiliary_loss_mlp": 0.01034354, "balance_loss_clip": 1.04062176, "balance_loss_mlp": 1.02016783, "epoch": 0.7895148199362713, "flos": 20740639184640.0, "grad_norm": 2.0094589765906576, "language_loss": 0.72020984, "learning_rate": 4.468689310723124e-07, "loss": 0.74172413, "num_input_tokens_seen": 141528265, "step": 6566, "time_per_iteration": 2.6893179416656494 }, { "auxiliary_loss_clip": 0.01096924, "auxiliary_loss_mlp": 0.01038447, "balance_loss_clip": 1.04048872, "balance_loss_mlp": 1.02309251, "epoch": 0.7896350628269103, "flos": 16690669839360.0, "grad_norm": 1.7522953461233668, "language_loss": 0.7870937, "learning_rate": 4.463782683315913e-07, "loss": 0.80844742, "num_input_tokens_seen": 141547270, "step": 6567, "time_per_iteration": 2.7064895629882812 }, { "auxiliary_loss_clip": 0.01129569, "auxiliary_loss_mlp": 0.01039195, "balance_loss_clip": 1.04211974, "balance_loss_mlp": 1.02438927, "epoch": 0.7897553057175495, "flos": 22638374438400.0, "grad_norm": 1.777952059850571, "language_loss": 0.73360121, "learning_rate": 4.458878412740523e-07, "loss": 0.75528884, "num_input_tokens_seen": 141566050, "step": 6568, "time_per_iteration": 2.56852650642395 }, { "auxiliary_loss_clip": 0.01116884, "auxiliary_loss_mlp": 0.01033253, "balance_loss_clip": 1.04176712, "balance_loss_mlp": 1.01726675, "epoch": 0.7898755486081885, "flos": 14537573821440.0, "grad_norm": 2.2682361274488194, "language_loss": 0.78296024, "learning_rate": 4.453976499740919e-07, "loss": 0.8044616, "num_input_tokens_seen": 141583695, "step": 6569, "time_per_iteration": 2.6069397926330566 }, { "auxiliary_loss_clip": 0.01116869, "auxiliary_loss_mlp": 0.01038532, "balance_loss_clip": 1.04260886, "balance_loss_mlp": 1.02324975, "epoch": 0.7899957914988276, "flos": 17238487138560.0, "grad_norm": 1.7597120568119275, "language_loss": 0.77742285, "learning_rate": 4.4490769450607215e-07, "loss": 0.7989769, "num_input_tokens_seen": 141601320, "step": 6570, "time_per_iteration": 2.5803167819976807 }, { "auxiliary_loss_clip": 0.01091556, "auxiliary_loss_mlp": 0.01047335, "balance_loss_clip": 1.03800488, "balance_loss_mlp": 1.03002548, "epoch": 0.7901160343894668, "flos": 41279351086080.0, "grad_norm": 2.0216392062217206, "language_loss": 0.72830027, "learning_rate": 4.4441797494431845e-07, "loss": 0.74968922, "num_input_tokens_seen": 141623125, "step": 6571, "time_per_iteration": 2.8349387645721436 }, { "auxiliary_loss_clip": 0.01117796, "auxiliary_loss_mlp": 0.01040731, "balance_loss_clip": 1.04157555, "balance_loss_mlp": 1.02504325, "epoch": 0.7902362772801058, "flos": 16837005847680.0, "grad_norm": 2.3631843345149464, "language_loss": 0.7764495, "learning_rate": 4.439284913631207e-07, "loss": 0.79803479, "num_input_tokens_seen": 141640335, "step": 6572, "time_per_iteration": 2.5869781970977783 }, { "auxiliary_loss_clip": 0.01097971, "auxiliary_loss_mlp": 0.01044224, "balance_loss_clip": 1.03978074, "balance_loss_mlp": 1.02625966, "epoch": 0.7903565201707449, "flos": 27125987091840.0, "grad_norm": 2.174459473110217, "language_loss": 0.83671403, "learning_rate": 4.434392438367347e-07, "loss": 0.858136, "num_input_tokens_seen": 141659760, "step": 6573, "time_per_iteration": 2.7005786895751953 }, { "auxiliary_loss_clip": 0.01125303, "auxiliary_loss_mlp": 0.01035561, "balance_loss_clip": 1.04404807, "balance_loss_mlp": 1.02029014, "epoch": 0.790476763061384, "flos": 31025167142400.0, "grad_norm": 2.225610299551088, "language_loss": 0.74131477, "learning_rate": 4.4295023243937677e-07, "loss": 0.76292342, "num_input_tokens_seen": 141679965, "step": 6574, "time_per_iteration": 2.639275074005127 }, { "auxiliary_loss_clip": 0.01123366, "auxiliary_loss_mlp": 0.01045879, "balance_loss_clip": 1.04443836, "balance_loss_mlp": 1.02822399, "epoch": 0.7905970059520231, "flos": 22089084681600.0, "grad_norm": 1.7028643502893317, "language_loss": 0.80352259, "learning_rate": 4.4246145724523123e-07, "loss": 0.8252151, "num_input_tokens_seen": 141697710, "step": 6575, "time_per_iteration": 2.6750714778900146 }, { "auxiliary_loss_clip": 0.01095693, "auxiliary_loss_mlp": 0.01041703, "balance_loss_clip": 1.04221928, "balance_loss_mlp": 1.02685511, "epoch": 0.7907172488426621, "flos": 20558141159040.0, "grad_norm": 2.9726562681854523, "language_loss": 0.77127212, "learning_rate": 4.41972918328444e-07, "loss": 0.79264605, "num_input_tokens_seen": 141715145, "step": 6576, "time_per_iteration": 2.6358189582824707 }, { "auxiliary_loss_clip": 0.01118373, "auxiliary_loss_mlp": 0.01041638, "balance_loss_clip": 1.04284358, "balance_loss_mlp": 1.02574694, "epoch": 0.7908374917333013, "flos": 30081542901120.0, "grad_norm": 2.0562391240545503, "language_loss": 0.77510941, "learning_rate": 4.4148461576312646e-07, "loss": 0.79670954, "num_input_tokens_seen": 141734810, "step": 6577, "time_per_iteration": 2.7421815395355225 }, { "auxiliary_loss_clip": 0.01117917, "auxiliary_loss_mlp": 0.01028763, "balance_loss_clip": 1.04438961, "balance_loss_mlp": 1.01495802, "epoch": 0.7909577346239404, "flos": 20996359084800.0, "grad_norm": 1.4899473054234453, "language_loss": 0.74719954, "learning_rate": 4.4099654962335343e-07, "loss": 0.76866627, "num_input_tokens_seen": 141755260, "step": 6578, "time_per_iteration": 2.6398627758026123 }, { "auxiliary_loss_clip": 0.01112554, "auxiliary_loss_mlp": 0.01035367, "balance_loss_clip": 1.04241085, "balance_loss_mlp": 1.01914227, "epoch": 0.7910779775145794, "flos": 26247935128320.0, "grad_norm": 1.8511913785297807, "language_loss": 0.75008142, "learning_rate": 4.405087199831636e-07, "loss": 0.77156067, "num_input_tokens_seen": 141775500, "step": 6579, "time_per_iteration": 2.711393356323242 }, { "auxiliary_loss_clip": 0.01110543, "auxiliary_loss_mlp": 0.00772126, "balance_loss_clip": 1.04182363, "balance_loss_mlp": 1.00045168, "epoch": 0.7911982204052186, "flos": 22564434291840.0, "grad_norm": 2.198944444689283, "language_loss": 0.67128944, "learning_rate": 4.400211269165619e-07, "loss": 0.69011617, "num_input_tokens_seen": 141791955, "step": 6580, "time_per_iteration": 2.673219919204712 }, { "auxiliary_loss_clip": 0.01134754, "auxiliary_loss_mlp": 0.01036845, "balance_loss_clip": 1.04763031, "balance_loss_mlp": 1.02275431, "epoch": 0.7913184632958576, "flos": 23112538899840.0, "grad_norm": 1.5068158635514783, "language_loss": 0.76938879, "learning_rate": 4.3953377049751416e-07, "loss": 0.79110473, "num_input_tokens_seen": 141812380, "step": 6581, "time_per_iteration": 2.6847686767578125 }, { "auxiliary_loss_clip": 0.01112247, "auxiliary_loss_mlp": 0.01038138, "balance_loss_clip": 1.04167128, "balance_loss_mlp": 1.02276015, "epoch": 0.7914387061864967, "flos": 12311758719360.0, "grad_norm": 2.594101048382273, "language_loss": 0.78133059, "learning_rate": 4.390466507999537e-07, "loss": 0.80283439, "num_input_tokens_seen": 141828130, "step": 6582, "time_per_iteration": 2.5835185050964355 }, { "auxiliary_loss_clip": 0.01094932, "auxiliary_loss_mlp": 0.01038673, "balance_loss_clip": 1.03909659, "balance_loss_mlp": 1.02224576, "epoch": 0.7915589490771359, "flos": 17603267708160.0, "grad_norm": 2.311566960356665, "language_loss": 0.75840414, "learning_rate": 4.385597678977748e-07, "loss": 0.77974021, "num_input_tokens_seen": 141846965, "step": 6583, "time_per_iteration": 2.6552348136901855 }, { "auxiliary_loss_clip": 0.01104016, "auxiliary_loss_mlp": 0.01039744, "balance_loss_clip": 1.03846717, "balance_loss_mlp": 1.02288818, "epoch": 0.7916791919677749, "flos": 25591272641280.0, "grad_norm": 1.699919561174977, "language_loss": 0.75751978, "learning_rate": 4.3807312186483726e-07, "loss": 0.77895737, "num_input_tokens_seen": 141867685, "step": 6584, "time_per_iteration": 4.561579704284668 }, { "auxiliary_loss_clip": 0.01120039, "auxiliary_loss_mlp": 0.0103595, "balance_loss_clip": 1.04502141, "balance_loss_mlp": 1.02114391, "epoch": 0.791799434858414, "flos": 18844340474880.0, "grad_norm": 1.8098108679272706, "language_loss": 0.78450894, "learning_rate": 4.375867127749655e-07, "loss": 0.80606884, "num_input_tokens_seen": 141885960, "step": 6585, "time_per_iteration": 2.5789148807525635 }, { "auxiliary_loss_clip": 0.01096277, "auxiliary_loss_mlp": 0.01034957, "balance_loss_clip": 1.04176617, "balance_loss_mlp": 1.0189352, "epoch": 0.7919196777490531, "flos": 25812015672960.0, "grad_norm": 2.337023972781532, "language_loss": 0.67029357, "learning_rate": 4.3710054070194744e-07, "loss": 0.69160581, "num_input_tokens_seen": 141905655, "step": 6586, "time_per_iteration": 2.7101950645446777 }, { "auxiliary_loss_clip": 0.01136238, "auxiliary_loss_mlp": 0.0077259, "balance_loss_clip": 1.04452443, "balance_loss_mlp": 1.00050259, "epoch": 0.7920399206396922, "flos": 11947624594560.0, "grad_norm": 3.074760938918584, "language_loss": 0.6682871, "learning_rate": 4.3661460571953455e-07, "loss": 0.68737537, "num_input_tokens_seen": 141922390, "step": 6587, "time_per_iteration": 3.507411003112793 }, { "auxiliary_loss_clip": 0.01119412, "auxiliary_loss_mlp": 0.01029483, "balance_loss_clip": 1.04027903, "balance_loss_mlp": 1.0140934, "epoch": 0.7921601635303313, "flos": 21579907438080.0, "grad_norm": 4.628565803733823, "language_loss": 0.68341094, "learning_rate": 4.36128907901443e-07, "loss": 0.70489979, "num_input_tokens_seen": 141941985, "step": 6588, "time_per_iteration": 2.616950273513794 }, { "auxiliary_loss_clip": 0.0109521, "auxiliary_loss_mlp": 0.01040646, "balance_loss_clip": 1.0388, "balance_loss_mlp": 1.02430224, "epoch": 0.7922804064209703, "flos": 18113989236480.0, "grad_norm": 2.1641310710147708, "language_loss": 0.72687745, "learning_rate": 4.356434473213519e-07, "loss": 0.74823606, "num_input_tokens_seen": 141959435, "step": 6589, "time_per_iteration": 2.6640491485595703 }, { "auxiliary_loss_clip": 0.01110808, "auxiliary_loss_mlp": 0.01047358, "balance_loss_clip": 1.04377294, "balance_loss_mlp": 1.03261805, "epoch": 0.7924006493116095, "flos": 21652806090240.0, "grad_norm": 1.8861542267342049, "language_loss": 0.79479432, "learning_rate": 4.351582240529068e-07, "loss": 0.81637597, "num_input_tokens_seen": 141980265, "step": 6590, "time_per_iteration": 2.620692014694214 }, { "auxiliary_loss_clip": 0.01018407, "auxiliary_loss_mlp": 0.01003197, "balance_loss_clip": 1.00796092, "balance_loss_mlp": 1.00152802, "epoch": 0.7925208922022485, "flos": 64242755694720.0, "grad_norm": 0.6753329795140539, "language_loss": 0.58171439, "learning_rate": 4.346732381697149e-07, "loss": 0.60193038, "num_input_tokens_seen": 142044395, "step": 6591, "time_per_iteration": 3.240415573120117 }, { "auxiliary_loss_clip": 0.01102605, "auxiliary_loss_mlp": 0.01036811, "balance_loss_clip": 1.040236, "balance_loss_mlp": 1.02143335, "epoch": 0.7926411350928876, "flos": 16941541403520.0, "grad_norm": 3.443704142478683, "language_loss": 0.81279534, "learning_rate": 4.3418848974534825e-07, "loss": 0.83418953, "num_input_tokens_seen": 142061335, "step": 6592, "time_per_iteration": 3.500608444213867 }, { "auxiliary_loss_clip": 0.01100445, "auxiliary_loss_mlp": 0.01040917, "balance_loss_clip": 1.04113793, "balance_loss_mlp": 1.02537227, "epoch": 0.7927613779835267, "flos": 34459987144320.0, "grad_norm": 1.644726892752252, "language_loss": 0.68549716, "learning_rate": 4.3370397885334276e-07, "loss": 0.70691085, "num_input_tokens_seen": 142081965, "step": 6593, "time_per_iteration": 2.777670383453369 }, { "auxiliary_loss_clip": 0.01114718, "auxiliary_loss_mlp": 0.01040534, "balance_loss_clip": 1.04261506, "balance_loss_mlp": 1.02408338, "epoch": 0.7928816208741658, "flos": 18951174501120.0, "grad_norm": 2.363189478722823, "language_loss": 0.75493371, "learning_rate": 4.3321970556719777e-07, "loss": 0.77648628, "num_input_tokens_seen": 142100260, "step": 6594, "time_per_iteration": 2.610997438430786 }, { "auxiliary_loss_clip": 0.01134764, "auxiliary_loss_mlp": 0.0103564, "balance_loss_clip": 1.04477036, "balance_loss_mlp": 1.01915312, "epoch": 0.7930018637648049, "flos": 18623022825600.0, "grad_norm": 2.8414262593662416, "language_loss": 0.72156948, "learning_rate": 4.3273566996037856e-07, "loss": 0.7432735, "num_input_tokens_seen": 142116955, "step": 6595, "time_per_iteration": 2.555631160736084 }, { "auxiliary_loss_clip": 0.01103934, "auxiliary_loss_mlp": 0.01034297, "balance_loss_clip": 1.03930604, "balance_loss_mlp": 1.01968825, "epoch": 0.793122106655444, "flos": 24530650824960.0, "grad_norm": 2.1816695019167396, "language_loss": 0.80057764, "learning_rate": 4.322518721063113e-07, "loss": 0.82195997, "num_input_tokens_seen": 142135505, "step": 6596, "time_per_iteration": 2.6835193634033203 }, { "auxiliary_loss_clip": 0.01121938, "auxiliary_loss_mlp": 0.01038736, "balance_loss_clip": 1.04308915, "balance_loss_mlp": 1.02310133, "epoch": 0.7932423495460831, "flos": 34421203434240.0, "grad_norm": 2.014093450305094, "language_loss": 0.70122504, "learning_rate": 4.3176831207838906e-07, "loss": 0.72283173, "num_input_tokens_seen": 142158915, "step": 6597, "time_per_iteration": 2.7101314067840576 }, { "auxiliary_loss_clip": 0.01119512, "auxiliary_loss_mlp": 0.01044748, "balance_loss_clip": 1.04423344, "balance_loss_mlp": 1.02973962, "epoch": 0.7933625924367221, "flos": 26980333441920.0, "grad_norm": 1.9076698504118594, "language_loss": 0.74743927, "learning_rate": 4.3128498994996685e-07, "loss": 0.76908183, "num_input_tokens_seen": 142178390, "step": 6598, "time_per_iteration": 2.6619887351989746 }, { "auxiliary_loss_clip": 0.01124313, "auxiliary_loss_mlp": 0.01034974, "balance_loss_clip": 1.0426805, "balance_loss_mlp": 1.01890421, "epoch": 0.7934828353273613, "flos": 29568630643200.0, "grad_norm": 2.6979262588486854, "language_loss": 0.71477902, "learning_rate": 4.308019057943646e-07, "loss": 0.73637187, "num_input_tokens_seen": 142200115, "step": 6599, "time_per_iteration": 2.798248767852783 }, { "auxiliary_loss_clip": 0.01088857, "auxiliary_loss_mlp": 0.01044218, "balance_loss_clip": 1.03988504, "balance_loss_mlp": 1.02864969, "epoch": 0.7936030782180004, "flos": 28615381557120.0, "grad_norm": 1.696589008358485, "language_loss": 0.74348396, "learning_rate": 4.3031905968486535e-07, "loss": 0.76481473, "num_input_tokens_seen": 142220945, "step": 6600, "time_per_iteration": 2.755979299545288 }, { "auxiliary_loss_clip": 0.01083219, "auxiliary_loss_mlp": 0.0103224, "balance_loss_clip": 1.04071689, "balance_loss_mlp": 1.0162065, "epoch": 0.7937233211086394, "flos": 16392574869120.0, "grad_norm": 2.29491670570033, "language_loss": 0.68874729, "learning_rate": 4.298364516947162e-07, "loss": 0.70990193, "num_input_tokens_seen": 142238175, "step": 6601, "time_per_iteration": 2.648345947265625 }, { "auxiliary_loss_clip": 0.01083985, "auxiliary_loss_mlp": 0.01041644, "balance_loss_clip": 1.0404439, "balance_loss_mlp": 1.0257175, "epoch": 0.7938435639992786, "flos": 22013420682240.0, "grad_norm": 1.8202111736557793, "language_loss": 0.65985894, "learning_rate": 4.293540818971295e-07, "loss": 0.68111527, "num_input_tokens_seen": 142255980, "step": 6602, "time_per_iteration": 2.7257440090179443 }, { "auxiliary_loss_clip": 0.01126825, "auxiliary_loss_mlp": 0.01036498, "balance_loss_clip": 1.04478168, "balance_loss_mlp": 1.02102494, "epoch": 0.7939638068899176, "flos": 22197032029440.0, "grad_norm": 3.4071288808389846, "language_loss": 0.76878142, "learning_rate": 4.2887195036527934e-07, "loss": 0.79041469, "num_input_tokens_seen": 142274785, "step": 6603, "time_per_iteration": 2.689751625061035 }, { "auxiliary_loss_clip": 0.0111494, "auxiliary_loss_mlp": 0.01035421, "balance_loss_clip": 1.04090619, "balance_loss_mlp": 1.01923275, "epoch": 0.7940840497805567, "flos": 17745186343680.0, "grad_norm": 2.761385996988084, "language_loss": 0.73336345, "learning_rate": 4.28390057172306e-07, "loss": 0.75486708, "num_input_tokens_seen": 142291290, "step": 6604, "time_per_iteration": 2.751742362976074 }, { "auxiliary_loss_clip": 0.01092719, "auxiliary_loss_mlp": 0.01049577, "balance_loss_clip": 1.03944933, "balance_loss_mlp": 1.03084874, "epoch": 0.7942042926711959, "flos": 23805435231360.0, "grad_norm": 2.1891909846304456, "language_loss": 0.71876055, "learning_rate": 4.279084023913111e-07, "loss": 0.74018347, "num_input_tokens_seen": 142309165, "step": 6605, "time_per_iteration": 2.7551286220550537 }, { "auxiliary_loss_clip": 0.0111876, "auxiliary_loss_mlp": 0.01039612, "balance_loss_clip": 1.04319668, "balance_loss_mlp": 1.02457952, "epoch": 0.7943245355618349, "flos": 19244959839360.0, "grad_norm": 1.79439168977253, "language_loss": 0.69176358, "learning_rate": 4.2742698609536096e-07, "loss": 0.71334732, "num_input_tokens_seen": 142327475, "step": 6606, "time_per_iteration": 2.6534836292266846 }, { "auxiliary_loss_clip": 0.01110273, "auxiliary_loss_mlp": 0.01040494, "balance_loss_clip": 1.04207969, "balance_loss_mlp": 1.02462697, "epoch": 0.794444778452474, "flos": 25007616547200.0, "grad_norm": 1.7448713415214023, "language_loss": 0.78728092, "learning_rate": 4.2694580835748706e-07, "loss": 0.80878854, "num_input_tokens_seen": 142347335, "step": 6607, "time_per_iteration": 2.839423656463623 }, { "auxiliary_loss_clip": 0.01105138, "auxiliary_loss_mlp": 0.01040941, "balance_loss_clip": 1.03929424, "balance_loss_mlp": 1.02561069, "epoch": 0.7945650213431131, "flos": 23221491828480.0, "grad_norm": 2.0029720016409, "language_loss": 0.74382377, "learning_rate": 4.264648692506836e-07, "loss": 0.76528454, "num_input_tokens_seen": 142366125, "step": 6608, "time_per_iteration": 2.701383590698242 }, { "auxiliary_loss_clip": 0.01107362, "auxiliary_loss_mlp": 0.01047199, "balance_loss_clip": 1.04121292, "balance_loss_mlp": 1.02829242, "epoch": 0.7946852642337522, "flos": 26062887237120.0, "grad_norm": 1.9707718921154398, "language_loss": 0.72013134, "learning_rate": 4.2598416884790824e-07, "loss": 0.74167693, "num_input_tokens_seen": 142385175, "step": 6609, "time_per_iteration": 2.7164077758789062 }, { "auxiliary_loss_clip": 0.01117804, "auxiliary_loss_mlp": 0.01041653, "balance_loss_clip": 1.0422771, "balance_loss_mlp": 1.02528596, "epoch": 0.7948055071243912, "flos": 23769704177280.0, "grad_norm": 2.3528452278031984, "language_loss": 0.80844963, "learning_rate": 4.255037072220828e-07, "loss": 0.83004415, "num_input_tokens_seen": 142406545, "step": 6610, "time_per_iteration": 3.6896798610687256 }, { "auxiliary_loss_clip": 0.01128117, "auxiliary_loss_mlp": 0.01036624, "balance_loss_clip": 1.04179192, "balance_loss_mlp": 1.02125788, "epoch": 0.7949257500150304, "flos": 21980814111360.0, "grad_norm": 1.7169283497213002, "language_loss": 0.71630085, "learning_rate": 4.2502348444609293e-07, "loss": 0.7379483, "num_input_tokens_seen": 142426165, "step": 6611, "time_per_iteration": 2.6132359504699707 }, { "auxiliary_loss_clip": 0.01079868, "auxiliary_loss_mlp": 0.01037322, "balance_loss_clip": 1.03711057, "balance_loss_mlp": 1.02231359, "epoch": 0.7950459929056695, "flos": 25774129802880.0, "grad_norm": 2.5445319114068394, "language_loss": 0.69310111, "learning_rate": 4.2454350059278844e-07, "loss": 0.71427298, "num_input_tokens_seen": 142447225, "step": 6612, "time_per_iteration": 2.7154195308685303 }, { "auxiliary_loss_clip": 0.0110142, "auxiliary_loss_mlp": 0.01038635, "balance_loss_clip": 1.0370996, "balance_loss_mlp": 1.02346551, "epoch": 0.7951662357963085, "flos": 22158068751360.0, "grad_norm": 2.155316311684294, "language_loss": 0.84462368, "learning_rate": 4.240637557349824e-07, "loss": 0.86602426, "num_input_tokens_seen": 142464440, "step": 6613, "time_per_iteration": 3.761056423187256 }, { "auxiliary_loss_clip": 0.01093079, "auxiliary_loss_mlp": 0.01039707, "balance_loss_clip": 1.03698194, "balance_loss_mlp": 1.02246976, "epoch": 0.7952864786869477, "flos": 24641938137600.0, "grad_norm": 2.368465671951684, "language_loss": 0.66734946, "learning_rate": 4.235842499454516e-07, "loss": 0.68867731, "num_input_tokens_seen": 142484355, "step": 6614, "time_per_iteration": 2.688269853591919 }, { "auxiliary_loss_clip": 0.01108636, "auxiliary_loss_mlp": 0.01045745, "balance_loss_clip": 1.04276431, "balance_loss_mlp": 1.03004527, "epoch": 0.7954067215775867, "flos": 21830922656640.0, "grad_norm": 1.7436588856755169, "language_loss": 0.82743061, "learning_rate": 4.2310498329693687e-07, "loss": 0.84897447, "num_input_tokens_seen": 142505255, "step": 6615, "time_per_iteration": 2.7008979320526123 }, { "auxiliary_loss_clip": 0.01123401, "auxiliary_loss_mlp": 0.01045567, "balance_loss_clip": 1.04226279, "balance_loss_mlp": 1.02829385, "epoch": 0.7955269644682258, "flos": 24060652341120.0, "grad_norm": 1.6384673605264615, "language_loss": 0.80863655, "learning_rate": 4.2262595586214164e-07, "loss": 0.83032626, "num_input_tokens_seen": 142526350, "step": 6616, "time_per_iteration": 2.637385845184326 }, { "auxiliary_loss_clip": 0.01120863, "auxiliary_loss_mlp": 0.01037719, "balance_loss_clip": 1.04208982, "balance_loss_mlp": 1.02199531, "epoch": 0.795647207358865, "flos": 25010741030400.0, "grad_norm": 2.0369645813164223, "language_loss": 0.76931965, "learning_rate": 4.221471677137358e-07, "loss": 0.79090548, "num_input_tokens_seen": 142547165, "step": 6617, "time_per_iteration": 3.4841115474700928 }, { "auxiliary_loss_clip": 0.01095746, "auxiliary_loss_mlp": 0.01039046, "balance_loss_clip": 1.03827381, "balance_loss_mlp": 1.0240376, "epoch": 0.795767450249504, "flos": 14648358343680.0, "grad_norm": 3.0168543209319614, "language_loss": 0.70121396, "learning_rate": 4.216686189243492e-07, "loss": 0.7225619, "num_input_tokens_seen": 142565955, "step": 6618, "time_per_iteration": 2.6424784660339355 }, { "auxiliary_loss_clip": 0.01093186, "auxiliary_loss_mlp": 0.01041766, "balance_loss_clip": 1.03911614, "balance_loss_mlp": 1.02639973, "epoch": 0.7958876931401431, "flos": 18547897530240.0, "grad_norm": 1.894073628160096, "language_loss": 0.72645789, "learning_rate": 4.211903095665785e-07, "loss": 0.74780744, "num_input_tokens_seen": 142585340, "step": 6619, "time_per_iteration": 2.667837381362915 }, { "auxiliary_loss_clip": 0.0111501, "auxiliary_loss_mlp": 0.01032546, "balance_loss_clip": 1.04033422, "balance_loss_mlp": 1.0171684, "epoch": 0.7960079360307821, "flos": 21543960902400.0, "grad_norm": 1.7907391412130478, "language_loss": 0.75255549, "learning_rate": 4.2071223971298277e-07, "loss": 0.77403104, "num_input_tokens_seen": 142602525, "step": 6620, "time_per_iteration": 2.6443753242492676 }, { "auxiliary_loss_clip": 0.01121721, "auxiliary_loss_mlp": 0.01038103, "balance_loss_clip": 1.04153371, "balance_loss_mlp": 1.02192664, "epoch": 0.7961281789214213, "flos": 25481745095040.0, "grad_norm": 2.532491971221289, "language_loss": 0.61382604, "learning_rate": 4.2023440943608433e-07, "loss": 0.63542426, "num_input_tokens_seen": 142622490, "step": 6621, "time_per_iteration": 2.6568870544433594 }, { "auxiliary_loss_clip": 0.01114957, "auxiliary_loss_mlp": 0.0103496, "balance_loss_clip": 1.03911257, "balance_loss_mlp": 1.02034461, "epoch": 0.7962484218120603, "flos": 21944436612480.0, "grad_norm": 1.6737680311182246, "language_loss": 0.77918214, "learning_rate": 4.1975681880837023e-07, "loss": 0.80068135, "num_input_tokens_seen": 142642495, "step": 6622, "time_per_iteration": 2.614086151123047 }, { "auxiliary_loss_clip": 0.01094778, "auxiliary_loss_mlp": 0.01044364, "balance_loss_clip": 1.0398078, "balance_loss_mlp": 1.02941513, "epoch": 0.7963686647026994, "flos": 18876264687360.0, "grad_norm": 1.9081662945845221, "language_loss": 0.82133806, "learning_rate": 4.192794679022895e-07, "loss": 0.84272945, "num_input_tokens_seen": 142660820, "step": 6623, "time_per_iteration": 2.663119077682495 }, { "auxiliary_loss_clip": 0.01120581, "auxiliary_loss_mlp": 0.01042217, "balance_loss_clip": 1.04182613, "balance_loss_mlp": 1.02511048, "epoch": 0.7964889075933386, "flos": 29716582763520.0, "grad_norm": 1.9865047752238456, "language_loss": 0.72222972, "learning_rate": 4.1880235679025743e-07, "loss": 0.74385768, "num_input_tokens_seen": 142680915, "step": 6624, "time_per_iteration": 2.6546082496643066 }, { "auxiliary_loss_clip": 0.01073422, "auxiliary_loss_mlp": 0.01040358, "balance_loss_clip": 1.03824425, "balance_loss_mlp": 1.02416945, "epoch": 0.7966091504839776, "flos": 29491458272640.0, "grad_norm": 2.796355518129072, "language_loss": 0.64077252, "learning_rate": 4.1832548554464986e-07, "loss": 0.6619103, "num_input_tokens_seen": 142699210, "step": 6625, "time_per_iteration": 2.7887542247772217 }, { "auxiliary_loss_clip": 0.01023881, "auxiliary_loss_mlp": 0.01002765, "balance_loss_clip": 1.00774109, "balance_loss_mlp": 1.00128102, "epoch": 0.7967293933746167, "flos": 67288697101440.0, "grad_norm": 0.7415845006831767, "language_loss": 0.58693147, "learning_rate": 4.178488542378098e-07, "loss": 0.60719794, "num_input_tokens_seen": 142756790, "step": 6626, "time_per_iteration": 3.123469114303589 }, { "auxiliary_loss_clip": 0.01133361, "auxiliary_loss_mlp": 0.01036125, "balance_loss_clip": 1.04260767, "balance_loss_mlp": 1.01960254, "epoch": 0.7968496362652558, "flos": 25554679660800.0, "grad_norm": 1.7731415113172446, "language_loss": 0.89341778, "learning_rate": 4.173724629420401e-07, "loss": 0.91511261, "num_input_tokens_seen": 142778150, "step": 6627, "time_per_iteration": 2.593959331512451 }, { "auxiliary_loss_clip": 0.01113348, "auxiliary_loss_mlp": 0.0103873, "balance_loss_clip": 1.04184711, "balance_loss_mlp": 1.02212405, "epoch": 0.7969698791558949, "flos": 14501088581760.0, "grad_norm": 2.574547841675543, "language_loss": 0.68531251, "learning_rate": 4.168963117296087e-07, "loss": 0.70683336, "num_input_tokens_seen": 142795485, "step": 6628, "time_per_iteration": 2.593062162399292 }, { "auxiliary_loss_clip": 0.01135563, "auxiliary_loss_mlp": 0.01039098, "balance_loss_clip": 1.04589736, "balance_loss_mlp": 1.02350581, "epoch": 0.797090122046534, "flos": 22127545169280.0, "grad_norm": 3.034213241347086, "language_loss": 0.75945103, "learning_rate": 4.1642040067274876e-07, "loss": 0.78119767, "num_input_tokens_seen": 142815155, "step": 6629, "time_per_iteration": 2.6022441387176514 }, { "auxiliary_loss_clip": 0.01113209, "auxiliary_loss_mlp": 0.01048243, "balance_loss_clip": 1.04273534, "balance_loss_mlp": 1.03262639, "epoch": 0.7972103649371731, "flos": 19897671830400.0, "grad_norm": 1.8021358784601351, "language_loss": 0.72611505, "learning_rate": 4.1594472984365493e-07, "loss": 0.7477296, "num_input_tokens_seen": 142833840, "step": 6630, "time_per_iteration": 2.7666003704071045 }, { "auxiliary_loss_clip": 0.01114697, "auxiliary_loss_mlp": 0.01034507, "balance_loss_clip": 1.04022098, "balance_loss_mlp": 1.01920688, "epoch": 0.7973306078278122, "flos": 36058621847040.0, "grad_norm": 1.6388923339745138, "language_loss": 0.77445829, "learning_rate": 4.154692993144862e-07, "loss": 0.79595041, "num_input_tokens_seen": 142853610, "step": 6631, "time_per_iteration": 2.736727714538574 }, { "auxiliary_loss_clip": 0.01133414, "auxiliary_loss_mlp": 0.00771323, "balance_loss_clip": 1.04265797, "balance_loss_mlp": 1.00053132, "epoch": 0.7974508507184512, "flos": 21360600950400.0, "grad_norm": 2.222557541029541, "language_loss": 0.71443045, "learning_rate": 4.1499410915736476e-07, "loss": 0.73347783, "num_input_tokens_seen": 142872540, "step": 6632, "time_per_iteration": 2.5743656158447266 }, { "auxiliary_loss_clip": 0.01031101, "auxiliary_loss_mlp": 0.01001792, "balance_loss_clip": 1.01243722, "balance_loss_mlp": 1.00012302, "epoch": 0.7975710936090904, "flos": 68253115317120.0, "grad_norm": 0.765923290445216, "language_loss": 0.64215153, "learning_rate": 4.145191594443762e-07, "loss": 0.66248047, "num_input_tokens_seen": 142936895, "step": 6633, "time_per_iteration": 3.3645403385162354 }, { "auxiliary_loss_clip": 0.01094698, "auxiliary_loss_mlp": 0.01042647, "balance_loss_clip": 1.04184997, "balance_loss_mlp": 1.02600491, "epoch": 0.7976913364997295, "flos": 22492433479680.0, "grad_norm": 1.639012026421625, "language_loss": 0.70386624, "learning_rate": 4.140444502475713e-07, "loss": 0.72523975, "num_input_tokens_seen": 142956445, "step": 6634, "time_per_iteration": 2.706449270248413 }, { "auxiliary_loss_clip": 0.01117394, "auxiliary_loss_mlp": 0.01046743, "balance_loss_clip": 1.04118299, "balance_loss_mlp": 1.03047132, "epoch": 0.7978115793903685, "flos": 15263220378240.0, "grad_norm": 2.223272225129854, "language_loss": 0.7018559, "learning_rate": 4.1356998163896216e-07, "loss": 0.72349727, "num_input_tokens_seen": 142973495, "step": 6635, "time_per_iteration": 2.5634605884552 }, { "auxiliary_loss_clip": 0.01103052, "auxiliary_loss_mlp": 0.01039434, "balance_loss_clip": 1.04361033, "balance_loss_mlp": 1.02398419, "epoch": 0.7979318222810077, "flos": 19719232041600.0, "grad_norm": 2.2424564273291594, "language_loss": 0.75072503, "learning_rate": 4.130957536905255e-07, "loss": 0.77214992, "num_input_tokens_seen": 142991510, "step": 6636, "time_per_iteration": 4.48188853263855 }, { "auxiliary_loss_clip": 0.01117077, "auxiliary_loss_mlp": 0.01042186, "balance_loss_clip": 1.04384112, "balance_loss_mlp": 1.02457929, "epoch": 0.7980520651716467, "flos": 15560273854080.0, "grad_norm": 3.030683733298789, "language_loss": 0.71647429, "learning_rate": 4.1262176647420134e-07, "loss": 0.73806691, "num_input_tokens_seen": 143009675, "step": 6637, "time_per_iteration": 2.6488842964172363 }, { "auxiliary_loss_clip": 0.01111252, "auxiliary_loss_mlp": 0.01044688, "balance_loss_clip": 1.03992128, "balance_loss_mlp": 1.02849913, "epoch": 0.7981723080622858, "flos": 22309432663680.0, "grad_norm": 1.6681878337471256, "language_loss": 0.80075002, "learning_rate": 4.121480200618923e-07, "loss": 0.82230937, "num_input_tokens_seen": 143029330, "step": 6638, "time_per_iteration": 2.671595811843872 }, { "auxiliary_loss_clip": 0.01100586, "auxiliary_loss_mlp": 0.01048566, "balance_loss_clip": 1.03879464, "balance_loss_mlp": 1.03110182, "epoch": 0.798292550952925, "flos": 22929573997440.0, "grad_norm": 1.950935640826011, "language_loss": 0.799945, "learning_rate": 4.116745145254674e-07, "loss": 0.82143652, "num_input_tokens_seen": 143048865, "step": 6639, "time_per_iteration": 3.6201069355010986 }, { "auxiliary_loss_clip": 0.01017208, "auxiliary_loss_mlp": 0.01001407, "balance_loss_clip": 1.00917101, "balance_loss_mlp": 0.99988747, "epoch": 0.798412793843564, "flos": 64497936890880.0, "grad_norm": 0.7627647308897954, "language_loss": 0.57953173, "learning_rate": 4.1120124993675476e-07, "loss": 0.59971786, "num_input_tokens_seen": 143113295, "step": 6640, "time_per_iteration": 3.2327451705932617 }, { "auxiliary_loss_clip": 0.01113303, "auxiliary_loss_mlp": 0.01037649, "balance_loss_clip": 1.04085636, "balance_loss_mlp": 1.0215559, "epoch": 0.7985330367342031, "flos": 13586910514560.0, "grad_norm": 7.9871342857454035, "language_loss": 0.62162566, "learning_rate": 4.107282263675498e-07, "loss": 0.64313519, "num_input_tokens_seen": 143130965, "step": 6641, "time_per_iteration": 2.612802743911743 }, { "auxiliary_loss_clip": 0.01013705, "auxiliary_loss_mlp": 0.00755735, "balance_loss_clip": 1.00898695, "balance_loss_mlp": 1.00020754, "epoch": 0.7986532796248422, "flos": 67698797656320.0, "grad_norm": 0.7653284847752843, "language_loss": 0.52520126, "learning_rate": 4.1025544388960907e-07, "loss": 0.54289562, "num_input_tokens_seen": 143192005, "step": 6642, "time_per_iteration": 3.2340805530548096 }, { "auxiliary_loss_clip": 0.01121819, "auxiliary_loss_mlp": 0.01044307, "balance_loss_clip": 1.04355478, "balance_loss_mlp": 1.02829111, "epoch": 0.7987735225154813, "flos": 22455373622400.0, "grad_norm": 2.1262322073597626, "language_loss": 0.71705598, "learning_rate": 4.097829025746538e-07, "loss": 0.73871726, "num_input_tokens_seen": 143213550, "step": 6643, "time_per_iteration": 2.6583187580108643 }, { "auxiliary_loss_clip": 0.01027147, "auxiliary_loss_mlp": 0.01001886, "balance_loss_clip": 1.00854552, "balance_loss_mlp": 1.00032413, "epoch": 0.7988937654061203, "flos": 68864098682880.0, "grad_norm": 0.6624373804137652, "language_loss": 0.61017179, "learning_rate": 4.0931060249436757e-07, "loss": 0.63046217, "num_input_tokens_seen": 143277390, "step": 6644, "time_per_iteration": 4.1353538036346436 }, { "auxiliary_loss_clip": 0.01121351, "auxiliary_loss_mlp": 0.01043356, "balance_loss_clip": 1.04327393, "balance_loss_mlp": 1.02829957, "epoch": 0.7990140082967595, "flos": 20806893820800.0, "grad_norm": 2.1176424204592488, "language_loss": 0.6978665, "learning_rate": 4.088385437203978e-07, "loss": 0.7195136, "num_input_tokens_seen": 143294400, "step": 6645, "time_per_iteration": 2.6171770095825195 }, { "auxiliary_loss_clip": 0.01133453, "auxiliary_loss_mlp": 0.01044489, "balance_loss_clip": 1.04189849, "balance_loss_mlp": 1.02886653, "epoch": 0.7991342511873986, "flos": 18985289443200.0, "grad_norm": 4.049891501337942, "language_loss": 0.77637339, "learning_rate": 4.083667263243564e-07, "loss": 0.7981528, "num_input_tokens_seen": 143312745, "step": 6646, "time_per_iteration": 2.5710928440093994 }, { "auxiliary_loss_clip": 0.01113515, "auxiliary_loss_mlp": 0.01035703, "balance_loss_clip": 1.04193592, "balance_loss_mlp": 1.02040863, "epoch": 0.7992544940780376, "flos": 20816805974400.0, "grad_norm": 1.708062764474827, "language_loss": 0.71711266, "learning_rate": 4.0789515037781653e-07, "loss": 0.73860478, "num_input_tokens_seen": 143333470, "step": 6647, "time_per_iteration": 2.6228013038635254 }, { "auxiliary_loss_clip": 0.01124064, "auxiliary_loss_mlp": 0.01043403, "balance_loss_clip": 1.04356253, "balance_loss_mlp": 1.02640343, "epoch": 0.7993747369686768, "flos": 12640772321280.0, "grad_norm": 2.8374130149834, "language_loss": 0.82214326, "learning_rate": 4.0742381595231755e-07, "loss": 0.84381801, "num_input_tokens_seen": 143350195, "step": 6648, "time_per_iteration": 2.5907742977142334 }, { "auxiliary_loss_clip": 0.01097065, "auxiliary_loss_mlp": 0.01035092, "balance_loss_clip": 1.04017889, "balance_loss_mlp": 1.02084637, "epoch": 0.7994949798593158, "flos": 20078769225600.0, "grad_norm": 2.264697416382895, "language_loss": 0.78447521, "learning_rate": 4.06952723119359e-07, "loss": 0.80579674, "num_input_tokens_seen": 143370070, "step": 6649, "time_per_iteration": 2.6771998405456543 }, { "auxiliary_loss_clip": 0.01100325, "auxiliary_loss_mlp": 0.01048168, "balance_loss_clip": 1.03870499, "balance_loss_mlp": 1.03084707, "epoch": 0.7996152227499549, "flos": 38654209509120.0, "grad_norm": 2.11877801789488, "language_loss": 0.67176557, "learning_rate": 4.0648187195040504e-07, "loss": 0.69325054, "num_input_tokens_seen": 143392275, "step": 6650, "time_per_iteration": 2.7290115356445312 }, { "auxiliary_loss_clip": 0.01023795, "auxiliary_loss_mlp": 0.01002352, "balance_loss_clip": 1.00730062, "balance_loss_mlp": 1.00079083, "epoch": 0.799735465640594, "flos": 70243821947520.0, "grad_norm": 0.8108916544077435, "language_loss": 0.67609316, "learning_rate": 4.060112625168848e-07, "loss": 0.69635463, "num_input_tokens_seen": 143457385, "step": 6651, "time_per_iteration": 3.2871344089508057 }, { "auxiliary_loss_clip": 0.01133084, "auxiliary_loss_mlp": 0.01042946, "balance_loss_clip": 1.04365921, "balance_loss_mlp": 1.02717447, "epoch": 0.7998557085312331, "flos": 24240995550720.0, "grad_norm": 2.1352204995402873, "language_loss": 0.74113405, "learning_rate": 4.055408948901886e-07, "loss": 0.76289433, "num_input_tokens_seen": 143478785, "step": 6652, "time_per_iteration": 2.606146812438965 }, { "auxiliary_loss_clip": 0.01125281, "auxiliary_loss_mlp": 0.0104566, "balance_loss_clip": 1.04412019, "balance_loss_mlp": 1.02914989, "epoch": 0.7999759514218722, "flos": 27564025449600.0, "grad_norm": 2.0243134767762134, "language_loss": 0.71608102, "learning_rate": 4.050707691416708e-07, "loss": 0.73779041, "num_input_tokens_seen": 143500095, "step": 6653, "time_per_iteration": 2.6402528285980225 }, { "auxiliary_loss_clip": 0.01023284, "auxiliary_loss_mlp": 0.01000991, "balance_loss_clip": 1.00688159, "balance_loss_mlp": 0.99953634, "epoch": 0.8000961943125112, "flos": 67337428878720.0, "grad_norm": 0.6809015442586691, "language_loss": 0.5971452, "learning_rate": 4.046008853426495e-07, "loss": 0.61738789, "num_input_tokens_seen": 143563410, "step": 6654, "time_per_iteration": 3.2351975440979004 }, { "auxiliary_loss_clip": 0.01094267, "auxiliary_loss_mlp": 0.01044339, "balance_loss_clip": 1.03874421, "balance_loss_mlp": 1.02582598, "epoch": 0.8002164372031504, "flos": 28733815676160.0, "grad_norm": 8.43543332354832, "language_loss": 0.62601179, "learning_rate": 4.0413124356440464e-07, "loss": 0.64739788, "num_input_tokens_seen": 143587455, "step": 6655, "time_per_iteration": 2.766418218612671 }, { "auxiliary_loss_clip": 0.01087161, "auxiliary_loss_mlp": 0.01041437, "balance_loss_clip": 1.03798008, "balance_loss_mlp": 1.02586806, "epoch": 0.8003366800937894, "flos": 17639429725440.0, "grad_norm": 1.9403393866669676, "language_loss": 0.82107216, "learning_rate": 4.0366184387818223e-07, "loss": 0.84235811, "num_input_tokens_seen": 143605915, "step": 6656, "time_per_iteration": 2.6580488681793213 }, { "auxiliary_loss_clip": 0.01140231, "auxiliary_loss_mlp": 0.01042343, "balance_loss_clip": 1.04537368, "balance_loss_mlp": 1.02616632, "epoch": 0.8004569229844285, "flos": 25995303797760.0, "grad_norm": 2.0560375137248554, "language_loss": 0.8510493, "learning_rate": 4.0319268635518797e-07, "loss": 0.87287503, "num_input_tokens_seen": 143626490, "step": 6657, "time_per_iteration": 2.6335318088531494 }, { "auxiliary_loss_clip": 0.01121237, "auxiliary_loss_mlp": 0.01035492, "balance_loss_clip": 1.04230869, "balance_loss_mlp": 1.02050722, "epoch": 0.8005771658750677, "flos": 20812352688000.0, "grad_norm": 1.6032993705277192, "language_loss": 0.75177562, "learning_rate": 4.027237710665943e-07, "loss": 0.77334297, "num_input_tokens_seen": 143644955, "step": 6658, "time_per_iteration": 2.7358341217041016 }, { "auxiliary_loss_clip": 0.01096768, "auxiliary_loss_mlp": 0.01036965, "balance_loss_clip": 1.03745425, "balance_loss_mlp": 1.02148581, "epoch": 0.8006974087657067, "flos": 25812626204160.0, "grad_norm": 1.8571504740113058, "language_loss": 0.69456005, "learning_rate": 4.022550980835344e-07, "loss": 0.71589744, "num_input_tokens_seen": 143667200, "step": 6659, "time_per_iteration": 2.733914852142334 }, { "auxiliary_loss_clip": 0.01098647, "auxiliary_loss_mlp": 0.01039164, "balance_loss_clip": 1.03846312, "balance_loss_mlp": 1.02373791, "epoch": 0.8008176516563458, "flos": 17164690646400.0, "grad_norm": 2.689861791724367, "language_loss": 0.79686993, "learning_rate": 4.017866674771051e-07, "loss": 0.81824809, "num_input_tokens_seen": 143684685, "step": 6660, "time_per_iteration": 2.6426374912261963 }, { "auxiliary_loss_clip": 0.01079075, "auxiliary_loss_mlp": 0.01040781, "balance_loss_clip": 1.03771937, "balance_loss_mlp": 1.02448559, "epoch": 0.8009378945469849, "flos": 24207311571840.0, "grad_norm": 1.6620002018803806, "language_loss": 0.74550354, "learning_rate": 4.013184793183688e-07, "loss": 0.76670212, "num_input_tokens_seen": 143706780, "step": 6661, "time_per_iteration": 3.699201822280884 }, { "auxiliary_loss_clip": 0.01119402, "auxiliary_loss_mlp": 0.01038874, "balance_loss_clip": 1.03994513, "balance_loss_mlp": 1.02437818, "epoch": 0.801058137437624, "flos": 19787318271360.0, "grad_norm": 2.3393948583682382, "language_loss": 0.73016739, "learning_rate": 4.008505336783472e-07, "loss": 0.75175005, "num_input_tokens_seen": 143724505, "step": 6662, "time_per_iteration": 3.5704805850982666 }, { "auxiliary_loss_clip": 0.0110992, "auxiliary_loss_mlp": 0.01034067, "balance_loss_clip": 1.04055047, "balance_loss_mlp": 1.01934457, "epoch": 0.801178380328263, "flos": 18659400324480.0, "grad_norm": 2.2006788681156015, "language_loss": 0.80597985, "learning_rate": 4.003828306280284e-07, "loss": 0.82741976, "num_input_tokens_seen": 143742180, "step": 6663, "time_per_iteration": 2.59051251411438 }, { "auxiliary_loss_clip": 0.01120993, "auxiliary_loss_mlp": 0.01033144, "balance_loss_clip": 1.04380536, "balance_loss_mlp": 1.01790333, "epoch": 0.8012986232189022, "flos": 15706573948800.0, "grad_norm": 1.76844593626082, "language_loss": 0.77956247, "learning_rate": 3.999153702383626e-07, "loss": 0.80110389, "num_input_tokens_seen": 143760070, "step": 6664, "time_per_iteration": 2.587750196456909 }, { "auxiliary_loss_clip": 0.01123297, "auxiliary_loss_mlp": 0.01038354, "balance_loss_clip": 1.04279995, "balance_loss_mlp": 1.02212954, "epoch": 0.8014188661095413, "flos": 28584139703040.0, "grad_norm": 9.298045695830192, "language_loss": 0.73814535, "learning_rate": 3.9944815258026263e-07, "loss": 0.75976193, "num_input_tokens_seen": 143781890, "step": 6665, "time_per_iteration": 3.6654648780822754 }, { "auxiliary_loss_clip": 0.01124863, "auxiliary_loss_mlp": 0.01040375, "balance_loss_clip": 1.04283297, "balance_loss_mlp": 1.02448487, "epoch": 0.8015391090001803, "flos": 29310360877440.0, "grad_norm": 2.177843868972199, "language_loss": 0.82919055, "learning_rate": 3.989811777246057e-07, "loss": 0.85084295, "num_input_tokens_seen": 143802060, "step": 6666, "time_per_iteration": 2.637723922729492 }, { "auxiliary_loss_clip": 0.01035715, "auxiliary_loss_mlp": 0.00999734, "balance_loss_clip": 1.00745702, "balance_loss_mlp": 0.99817258, "epoch": 0.8016593518908195, "flos": 70397340675840.0, "grad_norm": 0.8486955328691804, "language_loss": 0.66301131, "learning_rate": 3.985144457422305e-07, "loss": 0.68336582, "num_input_tokens_seen": 143856345, "step": 6667, "time_per_iteration": 3.1180872917175293 }, { "auxiliary_loss_clip": 0.01132905, "auxiliary_loss_mlp": 0.01045514, "balance_loss_clip": 1.0442363, "balance_loss_mlp": 1.02963507, "epoch": 0.8017795947814585, "flos": 26026114688640.0, "grad_norm": 2.0221384977370365, "language_loss": 0.76803273, "learning_rate": 3.9804795670394096e-07, "loss": 0.78981686, "num_input_tokens_seen": 143876470, "step": 6668, "time_per_iteration": 2.563927412033081 }, { "auxiliary_loss_clip": 0.01102829, "auxiliary_loss_mlp": 0.01039615, "balance_loss_clip": 1.04079795, "balance_loss_mlp": 1.02364111, "epoch": 0.8018998376720976, "flos": 22087181260800.0, "grad_norm": 1.6548274170351878, "language_loss": 0.70857704, "learning_rate": 3.975817106805022e-07, "loss": 0.73000157, "num_input_tokens_seen": 143895170, "step": 6669, "time_per_iteration": 2.631796360015869 }, { "auxiliary_loss_clip": 0.01097277, "auxiliary_loss_mlp": 0.01045583, "balance_loss_clip": 1.03996515, "balance_loss_mlp": 1.02848887, "epoch": 0.8020200805627368, "flos": 34568545023360.0, "grad_norm": 3.34346926190243, "language_loss": 0.64995539, "learning_rate": 3.97115707742645e-07, "loss": 0.67138392, "num_input_tokens_seen": 143915845, "step": 6670, "time_per_iteration": 3.6605372428894043 }, { "auxiliary_loss_clip": 0.01112932, "auxiliary_loss_mlp": 0.01041381, "balance_loss_clip": 1.04456782, "balance_loss_mlp": 1.02480507, "epoch": 0.8021403234533758, "flos": 20120354196480.0, "grad_norm": 2.082570213787691, "language_loss": 0.65085316, "learning_rate": 3.966499479610599e-07, "loss": 0.6723963, "num_input_tokens_seen": 143933940, "step": 6671, "time_per_iteration": 2.6199116706848145 }, { "auxiliary_loss_clip": 0.01094284, "auxiliary_loss_mlp": 0.01036959, "balance_loss_clip": 1.04189157, "balance_loss_mlp": 1.02218902, "epoch": 0.8022605663440149, "flos": 27746200252800.0, "grad_norm": 2.2381512600322035, "language_loss": 0.65046453, "learning_rate": 3.9618443140640225e-07, "loss": 0.67177695, "num_input_tokens_seen": 143952850, "step": 6672, "time_per_iteration": 2.737778902053833 }, { "auxiliary_loss_clip": 0.0099981, "auxiliary_loss_mlp": 0.01000935, "balance_loss_clip": 1.00939929, "balance_loss_mlp": 0.9994213, "epoch": 0.802380809234654, "flos": 60244998768000.0, "grad_norm": 4.713550523153521, "language_loss": 0.51257372, "learning_rate": 3.957191581492918e-07, "loss": 0.53258115, "num_input_tokens_seen": 144013610, "step": 6673, "time_per_iteration": 3.2745189666748047 }, { "auxiliary_loss_clip": 0.0110659, "auxiliary_loss_mlp": 0.01036949, "balance_loss_clip": 1.04163432, "balance_loss_mlp": 1.02018833, "epoch": 0.8025010521252931, "flos": 15080722352640.0, "grad_norm": 2.613705600192925, "language_loss": 0.71631473, "learning_rate": 3.952541282603097e-07, "loss": 0.73775011, "num_input_tokens_seen": 144028715, "step": 6674, "time_per_iteration": 2.59063982963562 }, { "auxiliary_loss_clip": 0.01117803, "auxiliary_loss_mlp": 0.01040598, "balance_loss_clip": 1.04202604, "balance_loss_mlp": 1.02371752, "epoch": 0.8026212950159322, "flos": 22163527618560.0, "grad_norm": 1.8922267378657, "language_loss": 0.8362146, "learning_rate": 3.9478934181000013e-07, "loss": 0.85779858, "num_input_tokens_seen": 144048740, "step": 6675, "time_per_iteration": 2.617689847946167 }, { "auxiliary_loss_clip": 0.01137648, "auxiliary_loss_mlp": 0.0103765, "balance_loss_clip": 1.0438205, "balance_loss_mlp": 1.02137816, "epoch": 0.8027415379065713, "flos": 17675986792320.0, "grad_norm": 2.71579352451475, "language_loss": 0.84875703, "learning_rate": 3.943247988688714e-07, "loss": 0.87050998, "num_input_tokens_seen": 144067435, "step": 6676, "time_per_iteration": 2.546760320663452 }, { "auxiliary_loss_clip": 0.01120043, "auxiliary_loss_mlp": 0.01034274, "balance_loss_clip": 1.04219079, "balance_loss_mlp": 1.01930702, "epoch": 0.8028617807972104, "flos": 21979593048960.0, "grad_norm": 1.8311570788336657, "language_loss": 0.72306645, "learning_rate": 3.938604995073933e-07, "loss": 0.74460971, "num_input_tokens_seen": 144085905, "step": 6677, "time_per_iteration": 2.6247644424438477 }, { "auxiliary_loss_clip": 0.01113099, "auxiliary_loss_mlp": 0.01043231, "balance_loss_clip": 1.04202032, "balance_loss_mlp": 1.02669632, "epoch": 0.8029820236878494, "flos": 26428457905920.0, "grad_norm": 1.722731244350269, "language_loss": 0.65331376, "learning_rate": 3.9339644379600157e-07, "loss": 0.67487705, "num_input_tokens_seen": 144105735, "step": 6678, "time_per_iteration": 2.6836097240448 }, { "auxiliary_loss_clip": 0.01122636, "auxiliary_loss_mlp": 0.010366, "balance_loss_clip": 1.04258871, "balance_loss_mlp": 1.02144825, "epoch": 0.8031022665784886, "flos": 17676489582720.0, "grad_norm": 1.8512361268224864, "language_loss": 0.71613479, "learning_rate": 3.929326318050907e-07, "loss": 0.73772711, "num_input_tokens_seen": 144123405, "step": 6679, "time_per_iteration": 2.5752112865448 }, { "auxiliary_loss_clip": 0.01126458, "auxiliary_loss_mlp": 0.01037167, "balance_loss_clip": 1.03928804, "balance_loss_mlp": 1.02203941, "epoch": 0.8032225094691277, "flos": 15450279431040.0, "grad_norm": 2.249237536768478, "language_loss": 0.79126102, "learning_rate": 3.924690636050225e-07, "loss": 0.81289721, "num_input_tokens_seen": 144140815, "step": 6680, "time_per_iteration": 2.5369789600372314 }, { "auxiliary_loss_clip": 0.01122208, "auxiliary_loss_mlp": 0.01043609, "balance_loss_clip": 1.04310822, "balance_loss_mlp": 1.02701545, "epoch": 0.8033427523597667, "flos": 26179202453760.0, "grad_norm": 1.9198066934335534, "language_loss": 0.73016697, "learning_rate": 3.9200573926611915e-07, "loss": 0.75182515, "num_input_tokens_seen": 144162230, "step": 6681, "time_per_iteration": 2.6509757041931152 }, { "auxiliary_loss_clip": 0.01120241, "auxiliary_loss_mlp": 0.01040656, "balance_loss_clip": 1.04650187, "balance_loss_mlp": 1.02537382, "epoch": 0.8034629952504058, "flos": 21324905809920.0, "grad_norm": 2.048105176239303, "language_loss": 0.72895962, "learning_rate": 3.9154265885866613e-07, "loss": 0.75056857, "num_input_tokens_seen": 144181540, "step": 6682, "time_per_iteration": 2.6030542850494385 }, { "auxiliary_loss_clip": 0.01118867, "auxiliary_loss_mlp": 0.01048179, "balance_loss_clip": 1.04241109, "balance_loss_mlp": 1.03228796, "epoch": 0.8035832381410449, "flos": 21651585027840.0, "grad_norm": 4.762852825230915, "language_loss": 0.75150621, "learning_rate": 3.9107982245291394e-07, "loss": 0.77317667, "num_input_tokens_seen": 144199665, "step": 6683, "time_per_iteration": 2.581136465072632 }, { "auxiliary_loss_clip": 0.01095278, "auxiliary_loss_mlp": 0.01044502, "balance_loss_clip": 1.04175878, "balance_loss_mlp": 1.02900445, "epoch": 0.803703481031684, "flos": 20518818744960.0, "grad_norm": 2.048172905851748, "language_loss": 0.77552134, "learning_rate": 3.9061723011907245e-07, "loss": 0.79691911, "num_input_tokens_seen": 144219020, "step": 6684, "time_per_iteration": 2.642716407775879 }, { "auxiliary_loss_clip": 0.01107292, "auxiliary_loss_mlp": 0.01044818, "balance_loss_clip": 1.03995144, "balance_loss_mlp": 1.02796185, "epoch": 0.803823723922323, "flos": 22854807838080.0, "grad_norm": 2.756538757973168, "language_loss": 0.79346013, "learning_rate": 3.901548819273179e-07, "loss": 0.81498122, "num_input_tokens_seen": 144239035, "step": 6685, "time_per_iteration": 2.615237236022949 }, { "auxiliary_loss_clip": 0.01121504, "auxiliary_loss_mlp": 0.01042767, "balance_loss_clip": 1.04308021, "balance_loss_mlp": 1.02670968, "epoch": 0.8039439668129622, "flos": 21362145235200.0, "grad_norm": 1.808382113045965, "language_loss": 0.69046438, "learning_rate": 3.896927779477881e-07, "loss": 0.71210712, "num_input_tokens_seen": 144258295, "step": 6686, "time_per_iteration": 2.6327855587005615 }, { "auxiliary_loss_clip": 0.01097677, "auxiliary_loss_mlp": 0.01037596, "balance_loss_clip": 1.04114485, "balance_loss_mlp": 1.02294493, "epoch": 0.8040642097036013, "flos": 23802382575360.0, "grad_norm": 2.166160896042557, "language_loss": 0.66860175, "learning_rate": 3.892309182505833e-07, "loss": 0.68995452, "num_input_tokens_seen": 144276110, "step": 6687, "time_per_iteration": 4.611279010772705 }, { "auxiliary_loss_clip": 0.01133385, "auxiliary_loss_mlp": 0.01043037, "balance_loss_clip": 1.04274046, "balance_loss_mlp": 1.02738452, "epoch": 0.8041844525942403, "flos": 25922046009600.0, "grad_norm": 2.4008993369311806, "language_loss": 0.85697782, "learning_rate": 3.887693029057675e-07, "loss": 0.8787421, "num_input_tokens_seen": 144295620, "step": 6688, "time_per_iteration": 2.6040267944335938 }, { "auxiliary_loss_clip": 0.01107023, "auxiliary_loss_mlp": 0.0104009, "balance_loss_clip": 1.04142094, "balance_loss_mlp": 1.02493858, "epoch": 0.8043046954848795, "flos": 25191120153600.0, "grad_norm": 1.8661901912492829, "language_loss": 0.81047916, "learning_rate": 3.8830793198336684e-07, "loss": 0.83195031, "num_input_tokens_seen": 144315210, "step": 6689, "time_per_iteration": 2.6799564361572266 }, { "auxiliary_loss_clip": 0.01122032, "auxiliary_loss_mlp": 0.01036915, "balance_loss_clip": 1.04072213, "balance_loss_mlp": 1.02108431, "epoch": 0.8044249383755185, "flos": 41719185123840.0, "grad_norm": 1.693170506598739, "language_loss": 0.70136249, "learning_rate": 3.878468055533721e-07, "loss": 0.72295189, "num_input_tokens_seen": 144337750, "step": 6690, "time_per_iteration": 2.7673134803771973 }, { "auxiliary_loss_clip": 0.01108485, "auxiliary_loss_mlp": 0.01044719, "balance_loss_clip": 1.0462141, "balance_loss_mlp": 1.02829218, "epoch": 0.8045451812661576, "flos": 20631434860800.0, "grad_norm": 2.8546740685315815, "language_loss": 0.8496927, "learning_rate": 3.8738592368573464e-07, "loss": 0.8712247, "num_input_tokens_seen": 144355305, "step": 6691, "time_per_iteration": 3.619758129119873 }, { "auxiliary_loss_clip": 0.01090138, "auxiliary_loss_mlp": 0.0103747, "balance_loss_clip": 1.03980958, "balance_loss_mlp": 1.02166319, "epoch": 0.8046654241567968, "flos": 29711806254720.0, "grad_norm": 2.342034555509035, "language_loss": 0.88122308, "learning_rate": 3.8692528645037137e-07, "loss": 0.9024992, "num_input_tokens_seen": 144374485, "step": 6692, "time_per_iteration": 2.702991485595703 }, { "auxiliary_loss_clip": 0.01131952, "auxiliary_loss_mlp": 0.01037543, "balance_loss_clip": 1.04343677, "balance_loss_mlp": 1.02249849, "epoch": 0.8047856670474358, "flos": 17671389851520.0, "grad_norm": 2.4010944279026067, "language_loss": 0.77810997, "learning_rate": 3.8646489391715907e-07, "loss": 0.79980493, "num_input_tokens_seen": 144388780, "step": 6693, "time_per_iteration": 2.546463966369629 }, { "auxiliary_loss_clip": 0.0111148, "auxiliary_loss_mlp": 0.01040677, "balance_loss_clip": 1.0437994, "balance_loss_mlp": 1.0246079, "epoch": 0.8049059099380749, "flos": 17120699464320.0, "grad_norm": 4.614123742660787, "language_loss": 0.88010538, "learning_rate": 3.8600474615593903e-07, "loss": 0.90162694, "num_input_tokens_seen": 144403395, "step": 6694, "time_per_iteration": 2.614762544631958 }, { "auxiliary_loss_clip": 0.01014527, "auxiliary_loss_mlp": 0.01002474, "balance_loss_clip": 1.01391101, "balance_loss_mlp": 1.00088811, "epoch": 0.805026152828714, "flos": 62212903240320.0, "grad_norm": 0.78174868610273, "language_loss": 0.59654438, "learning_rate": 3.8554484323651605e-07, "loss": 0.61671436, "num_input_tokens_seen": 144465265, "step": 6695, "time_per_iteration": 3.3998587131500244 }, { "auxiliary_loss_clip": 0.01118409, "auxiliary_loss_mlp": 0.00772329, "balance_loss_clip": 1.04239404, "balance_loss_mlp": 1.00047302, "epoch": 0.8051463957193531, "flos": 21688608971520.0, "grad_norm": 1.7806142846492996, "language_loss": 0.7959761, "learning_rate": 3.85085185228657e-07, "loss": 0.81488347, "num_input_tokens_seen": 144484235, "step": 6696, "time_per_iteration": 3.5694427490234375 }, { "auxiliary_loss_clip": 0.01104124, "auxiliary_loss_mlp": 0.01047932, "balance_loss_clip": 1.03976572, "balance_loss_mlp": 1.03093243, "epoch": 0.8052666386099921, "flos": 32051458535040.0, "grad_norm": 2.620301542081992, "language_loss": 0.7334742, "learning_rate": 3.8462577220209114e-07, "loss": 0.75499475, "num_input_tokens_seen": 144504610, "step": 6697, "time_per_iteration": 2.766374349594116 }, { "auxiliary_loss_clip": 0.01034725, "auxiliary_loss_mlp": 0.01001421, "balance_loss_clip": 1.00665379, "balance_loss_mlp": 0.99994832, "epoch": 0.8053868815006313, "flos": 67157875768320.0, "grad_norm": 0.7091990764815809, "language_loss": 0.58982003, "learning_rate": 3.8416660422651127e-07, "loss": 0.61018145, "num_input_tokens_seen": 144574260, "step": 6698, "time_per_iteration": 3.2423605918884277 }, { "auxiliary_loss_clip": 0.011018, "auxiliary_loss_mlp": 0.01037359, "balance_loss_clip": 1.04100895, "balance_loss_mlp": 1.02154016, "epoch": 0.8055071243912704, "flos": 23837000307840.0, "grad_norm": 1.8217339982037177, "language_loss": 0.68246609, "learning_rate": 3.837076813715723e-07, "loss": 0.70385766, "num_input_tokens_seen": 144594145, "step": 6699, "time_per_iteration": 2.682448625564575 }, { "auxiliary_loss_clip": 0.01095024, "auxiliary_loss_mlp": 0.01040638, "balance_loss_clip": 1.03931177, "balance_loss_mlp": 1.02320957, "epoch": 0.8056273672819094, "flos": 21324510760320.0, "grad_norm": 1.774119929863802, "language_loss": 0.7535274, "learning_rate": 3.832490037068941e-07, "loss": 0.77488399, "num_input_tokens_seen": 144612935, "step": 6700, "time_per_iteration": 2.68684720993042 }, { "auxiliary_loss_clip": 0.01064765, "auxiliary_loss_mlp": 0.01042877, "balance_loss_clip": 1.0351429, "balance_loss_mlp": 1.02726102, "epoch": 0.8057476101725486, "flos": 25768383626880.0, "grad_norm": 1.990990117709455, "language_loss": 0.76122653, "learning_rate": 3.827905713020554e-07, "loss": 0.78230298, "num_input_tokens_seen": 144630580, "step": 6701, "time_per_iteration": 2.763723850250244 }, { "auxiliary_loss_clip": 0.0109819, "auxiliary_loss_mlp": 0.01047793, "balance_loss_clip": 1.03647304, "balance_loss_mlp": 1.02924418, "epoch": 0.8058678530631876, "flos": 24535283679360.0, "grad_norm": 2.520785039025258, "language_loss": 0.68901896, "learning_rate": 3.823323842266017e-07, "loss": 0.71047878, "num_input_tokens_seen": 144649975, "step": 6702, "time_per_iteration": 2.7190322875976562 }, { "auxiliary_loss_clip": 0.01124154, "auxiliary_loss_mlp": 0.01038191, "balance_loss_clip": 1.04142356, "balance_loss_mlp": 1.02235985, "epoch": 0.8059880959538267, "flos": 24753728240640.0, "grad_norm": 2.484386532474656, "language_loss": 0.73139155, "learning_rate": 3.818744425500393e-07, "loss": 0.75301504, "num_input_tokens_seen": 144667990, "step": 6703, "time_per_iteration": 2.6191089153289795 }, { "auxiliary_loss_clip": 0.01090509, "auxiliary_loss_mlp": 0.01040794, "balance_loss_clip": 1.03816617, "balance_loss_mlp": 1.02533293, "epoch": 0.8061083388444659, "flos": 22196349671040.0, "grad_norm": 1.758999083758454, "language_loss": 0.80363762, "learning_rate": 3.8141674634183675e-07, "loss": 0.82495058, "num_input_tokens_seen": 144687020, "step": 6704, "time_per_iteration": 2.73856782913208 }, { "auxiliary_loss_clip": 0.01085364, "auxiliary_loss_mlp": 0.01038053, "balance_loss_clip": 1.04018998, "balance_loss_mlp": 1.02241302, "epoch": 0.8062285817351049, "flos": 30044195735040.0, "grad_norm": 2.3567367457559434, "language_loss": 0.66159803, "learning_rate": 3.809592956714278e-07, "loss": 0.68283224, "num_input_tokens_seen": 144710255, "step": 6705, "time_per_iteration": 2.776313543319702 }, { "auxiliary_loss_clip": 0.01127338, "auxiliary_loss_mlp": 0.01040068, "balance_loss_clip": 1.04530704, "balance_loss_mlp": 1.02415371, "epoch": 0.806348824625744, "flos": 22782591544320.0, "grad_norm": 2.0347685155896738, "language_loss": 0.74796128, "learning_rate": 3.805020906082057e-07, "loss": 0.76963544, "num_input_tokens_seen": 144728830, "step": 6706, "time_per_iteration": 2.6231062412261963 }, { "auxiliary_loss_clip": 0.01113643, "auxiliary_loss_mlp": 0.01034449, "balance_loss_clip": 1.04158735, "balance_loss_mlp": 1.01804602, "epoch": 0.8064690675163831, "flos": 23404600385280.0, "grad_norm": 2.221625893351594, "language_loss": 0.81064463, "learning_rate": 3.8004513122152917e-07, "loss": 0.83212554, "num_input_tokens_seen": 144747140, "step": 6707, "time_per_iteration": 2.623985767364502 }, { "auxiliary_loss_clip": 0.01094789, "auxiliary_loss_mlp": 0.01045395, "balance_loss_clip": 1.04033017, "balance_loss_mlp": 1.02973092, "epoch": 0.8065893104070222, "flos": 24060903736320.0, "grad_norm": 1.7806584678748039, "language_loss": 0.67524785, "learning_rate": 3.79588417580718e-07, "loss": 0.69664967, "num_input_tokens_seen": 144765250, "step": 6708, "time_per_iteration": 2.658468246459961 }, { "auxiliary_loss_clip": 0.0112139, "auxiliary_loss_mlp": 0.01041675, "balance_loss_clip": 1.04309344, "balance_loss_mlp": 1.02661252, "epoch": 0.8067095532976613, "flos": 22305410340480.0, "grad_norm": 1.957584361603761, "language_loss": 0.7682848, "learning_rate": 3.791319497550558e-07, "loss": 0.7899155, "num_input_tokens_seen": 144783080, "step": 6709, "time_per_iteration": 2.617478370666504 }, { "auxiliary_loss_clip": 0.01098995, "auxiliary_loss_mlp": 0.00772256, "balance_loss_clip": 1.04102826, "balance_loss_mlp": 1.00045776, "epoch": 0.8068297961883004, "flos": 17129498296320.0, "grad_norm": 4.021326670490137, "language_loss": 0.70798528, "learning_rate": 3.78675727813788e-07, "loss": 0.7266978, "num_input_tokens_seen": 144800645, "step": 6710, "time_per_iteration": 2.632863759994507 }, { "auxiliary_loss_clip": 0.01109543, "auxiliary_loss_mlp": 0.01044443, "balance_loss_clip": 1.04011416, "balance_loss_mlp": 1.02848113, "epoch": 0.8069500390789395, "flos": 22018843635840.0, "grad_norm": 1.8863327788595092, "language_loss": 0.73637056, "learning_rate": 3.782197518261225e-07, "loss": 0.75791043, "num_input_tokens_seen": 144820085, "step": 6711, "time_per_iteration": 2.659227132797241 }, { "auxiliary_loss_clip": 0.01113653, "auxiliary_loss_mlp": 0.01032923, "balance_loss_clip": 1.04296672, "balance_loss_mlp": 1.01637721, "epoch": 0.8070702819695785, "flos": 19244241567360.0, "grad_norm": 2.0403379512269755, "language_loss": 0.95624506, "learning_rate": 3.777640218612319e-07, "loss": 0.97771084, "num_input_tokens_seen": 144838070, "step": 6712, "time_per_iteration": 2.677574396133423 }, { "auxiliary_loss_clip": 0.01113682, "auxiliary_loss_mlp": 0.01035309, "balance_loss_clip": 1.04126787, "balance_loss_mlp": 1.0201633, "epoch": 0.8071905248602176, "flos": 21544320038400.0, "grad_norm": 2.6988731198250644, "language_loss": 0.721044, "learning_rate": 3.773085379882488e-07, "loss": 0.74253392, "num_input_tokens_seen": 144857125, "step": 6713, "time_per_iteration": 3.615785598754883 }, { "auxiliary_loss_clip": 0.01119644, "auxiliary_loss_mlp": 0.00772804, "balance_loss_clip": 1.04036641, "balance_loss_mlp": 1.00053465, "epoch": 0.8073107677508568, "flos": 37268309105280.0, "grad_norm": 1.9995365495964155, "language_loss": 0.76085806, "learning_rate": 3.768533002762715e-07, "loss": 0.77978253, "num_input_tokens_seen": 144880660, "step": 6714, "time_per_iteration": 3.6746304035186768 }, { "auxiliary_loss_clip": 0.01107768, "auxiliary_loss_mlp": 0.01035418, "balance_loss_clip": 1.04003596, "balance_loss_mlp": 1.02045703, "epoch": 0.8074310106414958, "flos": 28366269759360.0, "grad_norm": 1.8490481177067368, "language_loss": 0.76824212, "learning_rate": 3.763983087943572e-07, "loss": 0.78967392, "num_input_tokens_seen": 144900050, "step": 6715, "time_per_iteration": 2.7048709392547607 }, { "auxiliary_loss_clip": 0.0111111, "auxiliary_loss_mlp": 0.00771965, "balance_loss_clip": 1.0394156, "balance_loss_mlp": 1.00045657, "epoch": 0.8075512535321349, "flos": 24281646768000.0, "grad_norm": 1.851323684709084, "language_loss": 0.80987406, "learning_rate": 3.759435636115282e-07, "loss": 0.82870477, "num_input_tokens_seen": 144920835, "step": 6716, "time_per_iteration": 2.6286914348602295 }, { "auxiliary_loss_clip": 0.01064323, "auxiliary_loss_mlp": 0.00772927, "balance_loss_clip": 1.03844547, "balance_loss_mlp": 1.00049078, "epoch": 0.807671496422774, "flos": 26030855283840.0, "grad_norm": 1.7415377206250424, "language_loss": 0.73364228, "learning_rate": 3.7548906479676967e-07, "loss": 0.75201476, "num_input_tokens_seen": 144940430, "step": 6717, "time_per_iteration": 3.7447142601013184 }, { "auxiliary_loss_clip": 0.01124617, "auxiliary_loss_mlp": 0.01037816, "balance_loss_clip": 1.04165602, "balance_loss_mlp": 1.02153182, "epoch": 0.8077917393134131, "flos": 23730740899200.0, "grad_norm": 2.074084076205545, "language_loss": 0.71547717, "learning_rate": 3.7503481241902855e-07, "loss": 0.7371015, "num_input_tokens_seen": 144960405, "step": 6718, "time_per_iteration": 2.6597962379455566 }, { "auxiliary_loss_clip": 0.01106122, "auxiliary_loss_mlp": 0.0077176, "balance_loss_clip": 1.03983545, "balance_loss_mlp": 1.0005163, "epoch": 0.8079119822040521, "flos": 18402028398720.0, "grad_norm": 2.2516265133004656, "language_loss": 0.80289292, "learning_rate": 3.745808065472145e-07, "loss": 0.82167172, "num_input_tokens_seen": 144977700, "step": 6719, "time_per_iteration": 2.6293749809265137 }, { "auxiliary_loss_clip": 0.01114123, "auxiliary_loss_mlp": 0.01034438, "balance_loss_clip": 1.04397464, "balance_loss_mlp": 1.01991856, "epoch": 0.8080322250946913, "flos": 23621787970560.0, "grad_norm": 1.6275654694477437, "language_loss": 0.76387703, "learning_rate": 3.741270472501994e-07, "loss": 0.7853626, "num_input_tokens_seen": 144998340, "step": 6720, "time_per_iteration": 2.6274824142456055 }, { "auxiliary_loss_clip": 0.01109307, "auxiliary_loss_mlp": 0.0103957, "balance_loss_clip": 1.04311299, "balance_loss_mlp": 1.02273762, "epoch": 0.8081524679853304, "flos": 22820692896000.0, "grad_norm": 1.867146256961362, "language_loss": 0.72630185, "learning_rate": 3.736735345968183e-07, "loss": 0.74779058, "num_input_tokens_seen": 145017950, "step": 6721, "time_per_iteration": 3.5936121940612793 }, { "auxiliary_loss_clip": 0.011221, "auxiliary_loss_mlp": 0.01036512, "balance_loss_clip": 1.0447495, "balance_loss_mlp": 1.02189684, "epoch": 0.8082727108759694, "flos": 17640004343040.0, "grad_norm": 1.6435368351063493, "language_loss": 0.78842103, "learning_rate": 3.7322026865586986e-07, "loss": 0.81000715, "num_input_tokens_seen": 145036985, "step": 6722, "time_per_iteration": 2.5890097618103027 }, { "auxiliary_loss_clip": 0.01126175, "auxiliary_loss_mlp": 0.01034609, "balance_loss_clip": 1.04427528, "balance_loss_mlp": 1.01966, "epoch": 0.8083929537666086, "flos": 25958172113280.0, "grad_norm": 2.2585853084305483, "language_loss": 0.73218894, "learning_rate": 3.7276724949611206e-07, "loss": 0.75379676, "num_input_tokens_seen": 145057095, "step": 6723, "time_per_iteration": 2.6604058742523193 }, { "auxiliary_loss_clip": 0.01112874, "auxiliary_loss_mlp": 0.01039911, "balance_loss_clip": 1.0426662, "balance_loss_mlp": 1.02371001, "epoch": 0.8085131966572476, "flos": 27089178629760.0, "grad_norm": 1.8557021169020251, "language_loss": 0.75576651, "learning_rate": 3.723144771862694e-07, "loss": 0.77729428, "num_input_tokens_seen": 145077735, "step": 6724, "time_per_iteration": 2.6650390625 }, { "auxiliary_loss_clip": 0.01100099, "auxiliary_loss_mlp": 0.01044355, "balance_loss_clip": 1.03901184, "balance_loss_mlp": 1.02840495, "epoch": 0.8086334395478867, "flos": 23988543788160.0, "grad_norm": 1.6251113169542948, "language_loss": 0.7684139, "learning_rate": 3.718619517950263e-07, "loss": 0.78985846, "num_input_tokens_seen": 145098330, "step": 6725, "time_per_iteration": 2.7551732063293457 }, { "auxiliary_loss_clip": 0.01134232, "auxiliary_loss_mlp": 0.01046576, "balance_loss_clip": 1.04620504, "balance_loss_mlp": 1.03138876, "epoch": 0.8087536824385259, "flos": 20405879406720.0, "grad_norm": 2.20265553232724, "language_loss": 0.77304953, "learning_rate": 3.714096733910301e-07, "loss": 0.79485762, "num_input_tokens_seen": 145115855, "step": 6726, "time_per_iteration": 2.573746681213379 }, { "auxiliary_loss_clip": 0.01128599, "auxiliary_loss_mlp": 0.01037999, "balance_loss_clip": 1.04402709, "balance_loss_mlp": 1.02090478, "epoch": 0.8088739253291649, "flos": 25919639798400.0, "grad_norm": 2.239008776143707, "language_loss": 0.70309675, "learning_rate": 3.709576420428926e-07, "loss": 0.7247628, "num_input_tokens_seen": 145136655, "step": 6727, "time_per_iteration": 2.630256175994873 }, { "auxiliary_loss_clip": 0.01109084, "auxiliary_loss_mlp": 0.01043339, "balance_loss_clip": 1.03995061, "balance_loss_mlp": 1.02761579, "epoch": 0.808994168219804, "flos": 28402072640640.0, "grad_norm": 2.7079868937951064, "language_loss": 0.73694348, "learning_rate": 3.7050585781918463e-07, "loss": 0.75846767, "num_input_tokens_seen": 145156955, "step": 6728, "time_per_iteration": 2.7157726287841797 }, { "auxiliary_loss_clip": 0.01123641, "auxiliary_loss_mlp": 0.0104064, "balance_loss_clip": 1.0421567, "balance_loss_mlp": 1.02275848, "epoch": 0.8091144111104431, "flos": 17421056991360.0, "grad_norm": 3.298805414389983, "language_loss": 0.69200045, "learning_rate": 3.700543207884428e-07, "loss": 0.71364331, "num_input_tokens_seen": 145173865, "step": 6729, "time_per_iteration": 2.582913875579834 }, { "auxiliary_loss_clip": 0.0111737, "auxiliary_loss_mlp": 0.01042915, "balance_loss_clip": 1.04048991, "balance_loss_mlp": 1.02776313, "epoch": 0.8092346540010822, "flos": 32153803361280.0, "grad_norm": 2.1981931006503284, "language_loss": 0.711303, "learning_rate": 3.6960303101916466e-07, "loss": 0.73290586, "num_input_tokens_seen": 145193780, "step": 6730, "time_per_iteration": 2.6985130310058594 }, { "auxiliary_loss_clip": 0.01034755, "auxiliary_loss_mlp": 0.0075569, "balance_loss_clip": 1.00683618, "balance_loss_mlp": 1.00018167, "epoch": 0.8093548968917212, "flos": 58035093390720.0, "grad_norm": 0.7529014654381122, "language_loss": 0.55530763, "learning_rate": 3.6915198857981047e-07, "loss": 0.57321209, "num_input_tokens_seen": 145258980, "step": 6731, "time_per_iteration": 3.2154557704925537 }, { "auxiliary_loss_clip": 0.01092925, "auxiliary_loss_mlp": 0.01052128, "balance_loss_clip": 1.04089952, "balance_loss_mlp": 1.03411579, "epoch": 0.8094751397823604, "flos": 27381599251200.0, "grad_norm": 1.579398699299519, "language_loss": 0.6817925, "learning_rate": 3.687011935388027e-07, "loss": 0.70324302, "num_input_tokens_seen": 145281875, "step": 6732, "time_per_iteration": 2.732527732849121 }, { "auxiliary_loss_clip": 0.01120796, "auxiliary_loss_mlp": 0.01036632, "balance_loss_clip": 1.04211259, "balance_loss_mlp": 1.02120614, "epoch": 0.8095953826729995, "flos": 24061083304320.0, "grad_norm": 1.7583605450526214, "language_loss": 0.73018944, "learning_rate": 3.6825064596452646e-07, "loss": 0.75176376, "num_input_tokens_seen": 145302220, "step": 6733, "time_per_iteration": 2.637862205505371 }, { "auxiliary_loss_clip": 0.01122767, "auxiliary_loss_mlp": 0.01032634, "balance_loss_clip": 1.0439831, "balance_loss_mlp": 1.01792336, "epoch": 0.8097156255636385, "flos": 23951412103680.0, "grad_norm": 1.8950822761794117, "language_loss": 0.70227408, "learning_rate": 3.678003459253305e-07, "loss": 0.72382808, "num_input_tokens_seen": 145323070, "step": 6734, "time_per_iteration": 2.613429069519043 }, { "auxiliary_loss_clip": 0.01093712, "auxiliary_loss_mlp": 0.01038829, "balance_loss_clip": 1.03934371, "balance_loss_mlp": 1.0236063, "epoch": 0.8098358684542777, "flos": 21799142098560.0, "grad_norm": 2.039078833317999, "language_loss": 0.74192119, "learning_rate": 3.673502934895236e-07, "loss": 0.76324666, "num_input_tokens_seen": 145342575, "step": 6735, "time_per_iteration": 2.671333074569702 }, { "auxiliary_loss_clip": 0.01035111, "auxiliary_loss_mlp": 0.01001217, "balance_loss_clip": 1.00702477, "balance_loss_mlp": 0.99969149, "epoch": 0.8099561113449167, "flos": 68809515966720.0, "grad_norm": 0.6883218792366841, "language_loss": 0.57945549, "learning_rate": 3.669004887253802e-07, "loss": 0.59981883, "num_input_tokens_seen": 145408865, "step": 6736, "time_per_iteration": 3.306300401687622 }, { "auxiliary_loss_clip": 0.01114698, "auxiliary_loss_mlp": 0.01036073, "balance_loss_clip": 1.04676175, "balance_loss_mlp": 1.02159536, "epoch": 0.8100763542355558, "flos": 23586056916480.0, "grad_norm": 1.8112876815020247, "language_loss": 0.78936058, "learning_rate": 3.664509317011335e-07, "loss": 0.81086826, "num_input_tokens_seen": 145429200, "step": 6737, "time_per_iteration": 2.6816909313201904 }, { "auxiliary_loss_clip": 0.01119412, "auxiliary_loss_mlp": 0.01042741, "balance_loss_clip": 1.04341304, "balance_loss_mlp": 1.02490687, "epoch": 0.810196597126195, "flos": 31650408207360.0, "grad_norm": 2.3504951943646244, "language_loss": 0.7365098, "learning_rate": 3.6600162248498134e-07, "loss": 0.75813133, "num_input_tokens_seen": 145452830, "step": 6738, "time_per_iteration": 2.7296526432037354 }, { "auxiliary_loss_clip": 0.01052195, "auxiliary_loss_mlp": 0.01039352, "balance_loss_clip": 1.03213882, "balance_loss_mlp": 1.02238822, "epoch": 0.810316840016834, "flos": 24900459298560.0, "grad_norm": 1.7794083345361216, "language_loss": 0.7586078, "learning_rate": 3.6555256114508426e-07, "loss": 0.77952331, "num_input_tokens_seen": 145472625, "step": 6739, "time_per_iteration": 3.7230536937713623 }, { "auxiliary_loss_clip": 0.01105545, "auxiliary_loss_mlp": 0.01041364, "balance_loss_clip": 1.03776276, "balance_loss_mlp": 1.02517509, "epoch": 0.8104370829074731, "flos": 27965003950080.0, "grad_norm": 1.9062900152979947, "language_loss": 0.73116398, "learning_rate": 3.651037477495642e-07, "loss": 0.75263309, "num_input_tokens_seen": 145494075, "step": 6740, "time_per_iteration": 3.759427309036255 }, { "auxiliary_loss_clip": 0.01133945, "auxiliary_loss_mlp": 0.01036672, "balance_loss_clip": 1.04362082, "balance_loss_mlp": 1.02054322, "epoch": 0.8105573257981122, "flos": 24640752988800.0, "grad_norm": 2.2281976091455635, "language_loss": 0.68027908, "learning_rate": 3.6465518236650584e-07, "loss": 0.70198524, "num_input_tokens_seen": 145514220, "step": 6741, "time_per_iteration": 2.6177985668182373 }, { "auxiliary_loss_clip": 0.01094907, "auxiliary_loss_mlp": 0.0104011, "balance_loss_clip": 1.03913832, "balance_loss_mlp": 1.02496409, "epoch": 0.8106775686887513, "flos": 26358935132160.0, "grad_norm": 1.8173890083753603, "language_loss": 0.78514242, "learning_rate": 3.642068650639558e-07, "loss": 0.80649263, "num_input_tokens_seen": 145533965, "step": 6742, "time_per_iteration": 3.698071241378784 }, { "auxiliary_loss_clip": 0.01100365, "auxiliary_loss_mlp": 0.01037976, "balance_loss_clip": 1.03714156, "balance_loss_mlp": 1.02181125, "epoch": 0.8107978115793903, "flos": 27271892136960.0, "grad_norm": 2.268780225775951, "language_loss": 0.64663291, "learning_rate": 3.6375879590992334e-07, "loss": 0.66801631, "num_input_tokens_seen": 145554310, "step": 6743, "time_per_iteration": 2.6794357299804688 }, { "auxiliary_loss_clip": 0.0110455, "auxiliary_loss_mlp": 0.0104436, "balance_loss_clip": 1.0398829, "balance_loss_mlp": 1.02727699, "epoch": 0.8109180544700295, "flos": 24934322845440.0, "grad_norm": 2.0600186113054675, "language_loss": 0.80830866, "learning_rate": 3.6331097497238173e-07, "loss": 0.82979774, "num_input_tokens_seen": 145573755, "step": 6744, "time_per_iteration": 2.6836352348327637 }, { "auxiliary_loss_clip": 0.01091433, "auxiliary_loss_mlp": 0.01032218, "balance_loss_clip": 1.03801966, "balance_loss_mlp": 1.01731706, "epoch": 0.8110382973606686, "flos": 21105383840640.0, "grad_norm": 2.097611791434692, "language_loss": 0.80051315, "learning_rate": 3.628634023192627e-07, "loss": 0.82174969, "num_input_tokens_seen": 145594000, "step": 6745, "time_per_iteration": 2.7032926082611084 }, { "auxiliary_loss_clip": 0.01122491, "auxiliary_loss_mlp": 0.01040812, "balance_loss_clip": 1.04212368, "balance_loss_mlp": 1.02459931, "epoch": 0.8111585402513076, "flos": 15414081500160.0, "grad_norm": 2.898046696225491, "language_loss": 0.75684416, "learning_rate": 3.624160780184644e-07, "loss": 0.77847713, "num_input_tokens_seen": 145611215, "step": 6746, "time_per_iteration": 2.594031810760498 }, { "auxiliary_loss_clip": 0.01105484, "auxiliary_loss_mlp": 0.01034997, "balance_loss_clip": 1.04129505, "balance_loss_mlp": 1.02000022, "epoch": 0.8112787831419467, "flos": 24095736950400.0, "grad_norm": 1.95775667072361, "language_loss": 0.74354643, "learning_rate": 3.6196900213784496e-07, "loss": 0.76495123, "num_input_tokens_seen": 145630530, "step": 6747, "time_per_iteration": 2.632697820663452 }, { "auxiliary_loss_clip": 0.01121651, "auxiliary_loss_mlp": 0.01040832, "balance_loss_clip": 1.04379427, "balance_loss_mlp": 1.02625251, "epoch": 0.8113990260325858, "flos": 20483374999680.0, "grad_norm": 2.4160605419744514, "language_loss": 0.86807406, "learning_rate": 3.6152217474522527e-07, "loss": 0.88969886, "num_input_tokens_seen": 145647345, "step": 6748, "time_per_iteration": 3.563046455383301 }, { "auxiliary_loss_clip": 0.01119608, "auxiliary_loss_mlp": 0.01042279, "balance_loss_clip": 1.04462123, "balance_loss_mlp": 1.02679956, "epoch": 0.8115192689232249, "flos": 24901141656960.0, "grad_norm": 2.3342286929489, "language_loss": 0.73000705, "learning_rate": 3.6107559590838975e-07, "loss": 0.7516259, "num_input_tokens_seen": 145666330, "step": 6749, "time_per_iteration": 2.63252854347229 }, { "auxiliary_loss_clip": 0.0106525, "auxiliary_loss_mlp": 0.01044999, "balance_loss_clip": 1.03706312, "balance_loss_mlp": 1.02824998, "epoch": 0.811639511813864, "flos": 24057204635520.0, "grad_norm": 2.678474106216655, "language_loss": 0.66306889, "learning_rate": 3.606292656950822e-07, "loss": 0.68417132, "num_input_tokens_seen": 145684740, "step": 6750, "time_per_iteration": 2.7747693061828613 }, { "auxiliary_loss_clip": 0.01103518, "auxiliary_loss_mlp": 0.01042084, "balance_loss_clip": 1.03848565, "balance_loss_mlp": 1.02472746, "epoch": 0.8117597547045031, "flos": 23185150243200.0, "grad_norm": 1.9268653520654107, "language_loss": 0.86789119, "learning_rate": 3.601831841730121e-07, "loss": 0.8893472, "num_input_tokens_seen": 145702660, "step": 6751, "time_per_iteration": 2.6662449836730957 }, { "auxiliary_loss_clip": 0.01120382, "auxiliary_loss_mlp": 0.01038624, "balance_loss_clip": 1.04311013, "balance_loss_mlp": 1.02349651, "epoch": 0.8118799975951422, "flos": 23040250778880.0, "grad_norm": 1.7475789843181162, "language_loss": 0.72914815, "learning_rate": 3.5973735140984916e-07, "loss": 0.75073814, "num_input_tokens_seen": 145722830, "step": 6752, "time_per_iteration": 2.5885491371154785 }, { "auxiliary_loss_clip": 0.01078304, "auxiliary_loss_mlp": 0.00771885, "balance_loss_clip": 1.03523803, "balance_loss_mlp": 1.00038743, "epoch": 0.8120002404857812, "flos": 24639962889600.0, "grad_norm": 2.0822239907256868, "language_loss": 0.79857123, "learning_rate": 3.5929176747322607e-07, "loss": 0.81707311, "num_input_tokens_seen": 145741935, "step": 6753, "time_per_iteration": 2.7263782024383545 }, { "auxiliary_loss_clip": 0.01019208, "auxiliary_loss_mlp": 0.01000266, "balance_loss_clip": 1.00841904, "balance_loss_mlp": 0.99875212, "epoch": 0.8121204833764204, "flos": 57415742156160.0, "grad_norm": 0.8107918095277059, "language_loss": 0.56125367, "learning_rate": 3.588464324307372e-07, "loss": 0.58144844, "num_input_tokens_seen": 145805560, "step": 6754, "time_per_iteration": 3.2877438068389893 }, { "auxiliary_loss_clip": 0.01122557, "auxiliary_loss_mlp": 0.01031347, "balance_loss_clip": 1.04169035, "balance_loss_mlp": 1.01682711, "epoch": 0.8122407262670595, "flos": 19464589549440.0, "grad_norm": 2.8351580323749084, "language_loss": 0.75030828, "learning_rate": 3.584013463499391e-07, "loss": 0.77184737, "num_input_tokens_seen": 145824180, "step": 6755, "time_per_iteration": 2.588667631149292 }, { "auxiliary_loss_clip": 0.01021433, "auxiliary_loss_mlp": 0.01001012, "balance_loss_clip": 1.01217675, "balance_loss_mlp": 0.99950391, "epoch": 0.8123609691576985, "flos": 56425325472000.0, "grad_norm": 0.7306023737772632, "language_loss": 0.6442759, "learning_rate": 3.579565092983521e-07, "loss": 0.66450036, "num_input_tokens_seen": 145885300, "step": 6756, "time_per_iteration": 3.106384754180908 }, { "auxiliary_loss_clip": 0.01132295, "auxiliary_loss_mlp": 0.01037627, "balance_loss_clip": 1.04314613, "balance_loss_mlp": 1.0223918, "epoch": 0.8124812120483377, "flos": 20631973564800.0, "grad_norm": 1.953025539368285, "language_loss": 0.83941513, "learning_rate": 3.575119213434565e-07, "loss": 0.86111432, "num_input_tokens_seen": 145903815, "step": 6757, "time_per_iteration": 2.5264129638671875 }, { "auxiliary_loss_clip": 0.01116673, "auxiliary_loss_mlp": 0.01038145, "balance_loss_clip": 1.04109597, "balance_loss_mlp": 1.02261162, "epoch": 0.8126014549389767, "flos": 22492397566080.0, "grad_norm": 1.9863413196905737, "language_loss": 0.81866348, "learning_rate": 3.5706758255269765e-07, "loss": 0.84021163, "num_input_tokens_seen": 145922270, "step": 6758, "time_per_iteration": 2.620528221130371 }, { "auxiliary_loss_clip": 0.01116625, "auxiliary_loss_mlp": 0.01041617, "balance_loss_clip": 1.04225123, "balance_loss_mlp": 1.02397406, "epoch": 0.8127216978296158, "flos": 23287961946240.0, "grad_norm": 1.530945882253258, "language_loss": 0.69391745, "learning_rate": 3.566234929934795e-07, "loss": 0.71549982, "num_input_tokens_seen": 145941470, "step": 6759, "time_per_iteration": 2.6485345363616943 }, { "auxiliary_loss_clip": 0.01117709, "auxiliary_loss_mlp": 0.01036902, "balance_loss_clip": 1.04368603, "balance_loss_mlp": 1.02241254, "epoch": 0.812841940720255, "flos": 25154994049920.0, "grad_norm": 1.5780598672313784, "language_loss": 0.7153883, "learning_rate": 3.561796527331706e-07, "loss": 0.73693442, "num_input_tokens_seen": 145963145, "step": 6760, "time_per_iteration": 2.6637086868286133 }, { "auxiliary_loss_clip": 0.01099407, "auxiliary_loss_mlp": 0.01042194, "balance_loss_clip": 1.04034162, "balance_loss_mlp": 1.02548122, "epoch": 0.812962183610894, "flos": 26648446752000.0, "grad_norm": 1.944694623215099, "language_loss": 0.77738112, "learning_rate": 3.5573606183910163e-07, "loss": 0.79879719, "num_input_tokens_seen": 145983150, "step": 6761, "time_per_iteration": 2.696732997894287 }, { "auxiliary_loss_clip": 0.01128871, "auxiliary_loss_mlp": 0.01041467, "balance_loss_clip": 1.04259706, "balance_loss_mlp": 1.02477765, "epoch": 0.8130824265015331, "flos": 24966965329920.0, "grad_norm": 2.9287295968054163, "language_loss": 0.78730845, "learning_rate": 3.5529272037856493e-07, "loss": 0.80901182, "num_input_tokens_seen": 146001365, "step": 6762, "time_per_iteration": 2.6014204025268555 }, { "auxiliary_loss_clip": 0.00992483, "auxiliary_loss_mlp": 0.01007023, "balance_loss_clip": 1.00868964, "balance_loss_mlp": 1.00463855, "epoch": 0.8132026693921722, "flos": 67622918175360.0, "grad_norm": 0.7084731645910154, "language_loss": 0.53784096, "learning_rate": 3.548496284188149e-07, "loss": 0.55783594, "num_input_tokens_seen": 146061570, "step": 6763, "time_per_iteration": 3.335899591445923 }, { "auxiliary_loss_clip": 0.01076392, "auxiliary_loss_mlp": 0.01038532, "balance_loss_clip": 1.03731799, "balance_loss_mlp": 1.02397084, "epoch": 0.8133229122828113, "flos": 19495149045120.0, "grad_norm": 2.3043745169927887, "language_loss": 0.79436892, "learning_rate": 3.544067860270681e-07, "loss": 0.8155182, "num_input_tokens_seen": 146079145, "step": 6764, "time_per_iteration": 3.581223487854004 }, { "auxiliary_loss_clip": 0.01099383, "auxiliary_loss_mlp": 0.01042443, "balance_loss_clip": 1.04027092, "balance_loss_mlp": 1.02578974, "epoch": 0.8134431551734503, "flos": 20668135582080.0, "grad_norm": 1.8153251731462434, "language_loss": 0.70997208, "learning_rate": 3.539641932705029e-07, "loss": 0.73139036, "num_input_tokens_seen": 146097625, "step": 6765, "time_per_iteration": 2.683703660964966 }, { "auxiliary_loss_clip": 0.01137447, "auxiliary_loss_mlp": 0.01043618, "balance_loss_clip": 1.04362631, "balance_loss_mlp": 1.02750111, "epoch": 0.8135633980640895, "flos": 21507332008320.0, "grad_norm": 2.096651597688485, "language_loss": 0.77479368, "learning_rate": 3.53521850216262e-07, "loss": 0.79660434, "num_input_tokens_seen": 146117195, "step": 6766, "time_per_iteration": 3.5690267086029053 }, { "auxiliary_loss_clip": 0.01133786, "auxiliary_loss_mlp": 0.01034466, "balance_loss_clip": 1.04357827, "balance_loss_mlp": 1.01819348, "epoch": 0.8136836409547286, "flos": 20554442058240.0, "grad_norm": 1.8242747445264618, "language_loss": 0.77054131, "learning_rate": 3.530797569314461e-07, "loss": 0.79222381, "num_input_tokens_seen": 146136220, "step": 6767, "time_per_iteration": 2.5411455631256104 }, { "auxiliary_loss_clip": 0.01135259, "auxiliary_loss_mlp": 0.01039504, "balance_loss_clip": 1.0457232, "balance_loss_mlp": 1.02320838, "epoch": 0.8138038838453676, "flos": 20299045380480.0, "grad_norm": 1.8755102740194127, "language_loss": 0.77977228, "learning_rate": 3.5263791348312235e-07, "loss": 0.80151987, "num_input_tokens_seen": 146155415, "step": 6768, "time_per_iteration": 2.608734369277954 }, { "auxiliary_loss_clip": 0.01106114, "auxiliary_loss_mlp": 0.01032139, "balance_loss_clip": 1.04176998, "balance_loss_mlp": 1.0161171, "epoch": 0.8139241267360068, "flos": 29789840551680.0, "grad_norm": 2.235971948075485, "language_loss": 0.7081145, "learning_rate": 3.521963199383171e-07, "loss": 0.72949708, "num_input_tokens_seen": 146178370, "step": 6769, "time_per_iteration": 3.711984634399414 }, { "auxiliary_loss_clip": 0.01084707, "auxiliary_loss_mlp": 0.01050654, "balance_loss_clip": 1.03946674, "balance_loss_mlp": 1.03253448, "epoch": 0.8140443696266458, "flos": 19713270384000.0, "grad_norm": 2.511449273701424, "language_loss": 0.76842374, "learning_rate": 3.517549763640197e-07, "loss": 0.78977734, "num_input_tokens_seen": 146196010, "step": 6770, "time_per_iteration": 2.70823073387146 }, { "auxiliary_loss_clip": 0.01118128, "auxiliary_loss_mlp": 0.00771473, "balance_loss_clip": 1.04325318, "balance_loss_mlp": 1.00044513, "epoch": 0.8141646125172849, "flos": 27160568910720.0, "grad_norm": 1.9016157122772048, "language_loss": 0.7113654, "learning_rate": 3.513138828271829e-07, "loss": 0.73026145, "num_input_tokens_seen": 146215880, "step": 6771, "time_per_iteration": 2.648688316345215 }, { "auxiliary_loss_clip": 0.01088475, "auxiliary_loss_mlp": 0.01036791, "balance_loss_clip": 1.03889441, "balance_loss_mlp": 1.02116847, "epoch": 0.8142848554079241, "flos": 39673102700160.0, "grad_norm": 1.969426578512929, "language_loss": 0.70446765, "learning_rate": 3.508730393947179e-07, "loss": 0.72572029, "num_input_tokens_seen": 146239135, "step": 6772, "time_per_iteration": 2.810267925262451 }, { "auxiliary_loss_clip": 0.01094412, "auxiliary_loss_mlp": 0.0104179, "balance_loss_clip": 1.04001176, "balance_loss_mlp": 1.02611995, "epoch": 0.8144050982985631, "flos": 22237288197120.0, "grad_norm": 1.6369179342418843, "language_loss": 0.72207415, "learning_rate": 3.504324461335024e-07, "loss": 0.7434361, "num_input_tokens_seen": 146259245, "step": 6773, "time_per_iteration": 3.571075439453125 }, { "auxiliary_loss_clip": 0.01073737, "auxiliary_loss_mlp": 0.01048448, "balance_loss_clip": 1.03722405, "balance_loss_mlp": 1.03196168, "epoch": 0.8145253411892022, "flos": 23038239617280.0, "grad_norm": 11.755551189028195, "language_loss": 0.88401449, "learning_rate": 3.499921031103732e-07, "loss": 0.90523636, "num_input_tokens_seen": 146280015, "step": 6774, "time_per_iteration": 2.76239275932312 }, { "auxiliary_loss_clip": 0.01104169, "auxiliary_loss_mlp": 0.01042144, "balance_loss_clip": 1.04003096, "balance_loss_mlp": 1.02650356, "epoch": 0.8146455840798413, "flos": 24827668387200.0, "grad_norm": 1.6586076381347519, "language_loss": 0.78550452, "learning_rate": 3.4955201039212987e-07, "loss": 0.80696762, "num_input_tokens_seen": 146300935, "step": 6775, "time_per_iteration": 2.6865148544311523 }, { "auxiliary_loss_clip": 0.0112372, "auxiliary_loss_mlp": 0.0103866, "balance_loss_clip": 1.04245877, "balance_loss_mlp": 1.02232814, "epoch": 0.8147658269704804, "flos": 19974520978560.0, "grad_norm": 1.8213548118014529, "language_loss": 0.65462065, "learning_rate": 3.4911216804553465e-07, "loss": 0.67624444, "num_input_tokens_seen": 146319835, "step": 6776, "time_per_iteration": 2.6076407432556152 }, { "auxiliary_loss_clip": 0.01106475, "auxiliary_loss_mlp": 0.01041742, "balance_loss_clip": 1.03836751, "balance_loss_mlp": 1.02401555, "epoch": 0.8148860698611194, "flos": 21178031097600.0, "grad_norm": 2.188456520253319, "language_loss": 0.70560241, "learning_rate": 3.4867257613731017e-07, "loss": 0.72708452, "num_input_tokens_seen": 146339030, "step": 6777, "time_per_iteration": 2.6476001739501953 }, { "auxiliary_loss_clip": 0.01108402, "auxiliary_loss_mlp": 0.01045257, "balance_loss_clip": 1.03943253, "balance_loss_mlp": 1.03061795, "epoch": 0.8150063127517585, "flos": 19606903234560.0, "grad_norm": 1.6899402699618986, "language_loss": 0.85571468, "learning_rate": 3.4823323473414343e-07, "loss": 0.87725127, "num_input_tokens_seen": 146358550, "step": 6778, "time_per_iteration": 2.6293580532073975 }, { "auxiliary_loss_clip": 0.01098507, "auxiliary_loss_mlp": 0.01043848, "balance_loss_clip": 1.03873241, "balance_loss_mlp": 1.02636003, "epoch": 0.8151265556423977, "flos": 22638374438400.0, "grad_norm": 2.071889980959472, "language_loss": 0.76015955, "learning_rate": 3.477941439026812e-07, "loss": 0.78158313, "num_input_tokens_seen": 146376770, "step": 6779, "time_per_iteration": 2.688451051712036 }, { "auxiliary_loss_clip": 0.01107086, "auxiliary_loss_mlp": 0.0103694, "balance_loss_clip": 1.04151106, "balance_loss_mlp": 1.02270627, "epoch": 0.8152467985330367, "flos": 17968048277760.0, "grad_norm": 2.119756933070819, "language_loss": 0.72691572, "learning_rate": 3.473553037095349e-07, "loss": 0.74835593, "num_input_tokens_seen": 146395795, "step": 6780, "time_per_iteration": 2.628324508666992 }, { "auxiliary_loss_clip": 0.01101525, "auxiliary_loss_mlp": 0.01047068, "balance_loss_clip": 1.04026377, "balance_loss_mlp": 1.03080726, "epoch": 0.8153670414236758, "flos": 24969012405120.0, "grad_norm": 1.717177562709526, "language_loss": 0.83217537, "learning_rate": 3.469167142212743e-07, "loss": 0.8536613, "num_input_tokens_seen": 146417640, "step": 6781, "time_per_iteration": 2.6707260608673096 }, { "auxiliary_loss_clip": 0.0111945, "auxiliary_loss_mlp": 0.0104419, "balance_loss_clip": 1.041013, "balance_loss_mlp": 1.02819264, "epoch": 0.8154872843143149, "flos": 31066069754880.0, "grad_norm": 2.599862566701957, "language_loss": 0.63273144, "learning_rate": 3.4647837550443337e-07, "loss": 0.6543678, "num_input_tokens_seen": 146436205, "step": 6782, "time_per_iteration": 2.7065773010253906 }, { "auxiliary_loss_clip": 0.01102396, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.04222536, "balance_loss_mlp": 1.02281833, "epoch": 0.815607527204954, "flos": 19391654983680.0, "grad_norm": 2.168175530730793, "language_loss": 0.74881214, "learning_rate": 3.460402876255086e-07, "loss": 0.77022344, "num_input_tokens_seen": 146453595, "step": 6783, "time_per_iteration": 2.6727616786956787 }, { "auxiliary_loss_clip": 0.01123382, "auxiliary_loss_mlp": 0.01037398, "balance_loss_clip": 1.04327846, "balance_loss_mlp": 1.02154326, "epoch": 0.815727770095593, "flos": 26140418743680.0, "grad_norm": 2.164992756648806, "language_loss": 0.72011387, "learning_rate": 3.456024506509574e-07, "loss": 0.74172163, "num_input_tokens_seen": 146474515, "step": 6784, "time_per_iteration": 2.6739704608917236 }, { "auxiliary_loss_clip": 0.01120133, "auxiliary_loss_mlp": 0.00772469, "balance_loss_clip": 1.04447424, "balance_loss_mlp": 1.00042868, "epoch": 0.8158480129862322, "flos": 25337527989120.0, "grad_norm": 1.7637971870760245, "language_loss": 0.73946929, "learning_rate": 3.4516486464719873e-07, "loss": 0.75839531, "num_input_tokens_seen": 146493905, "step": 6785, "time_per_iteration": 2.6612491607666016 }, { "auxiliary_loss_clip": 0.01077289, "auxiliary_loss_mlp": 0.01047402, "balance_loss_clip": 1.03641295, "balance_loss_mlp": 1.03030717, "epoch": 0.8159682558768713, "flos": 34423645559040.0, "grad_norm": 1.6378145087810208, "language_loss": 0.62028968, "learning_rate": 3.4472752968061445e-07, "loss": 0.64153659, "num_input_tokens_seen": 146518335, "step": 6786, "time_per_iteration": 2.8167357444763184 }, { "auxiliary_loss_clip": 0.01118574, "auxiliary_loss_mlp": 0.01038781, "balance_loss_clip": 1.04048729, "balance_loss_mlp": 1.02351034, "epoch": 0.8160884987675103, "flos": 18653223185280.0, "grad_norm": 2.3889202149759834, "language_loss": 0.73624146, "learning_rate": 3.442904458175475e-07, "loss": 0.75781506, "num_input_tokens_seen": 146535655, "step": 6787, "time_per_iteration": 2.560810089111328 }, { "auxiliary_loss_clip": 0.01116853, "auxiliary_loss_mlp": 0.01045194, "balance_loss_clip": 1.03898346, "balance_loss_mlp": 1.02895725, "epoch": 0.8162087416581495, "flos": 31430527102080.0, "grad_norm": 1.8399478628807682, "language_loss": 0.76181936, "learning_rate": 3.438536131243044e-07, "loss": 0.78343987, "num_input_tokens_seen": 146556815, "step": 6788, "time_per_iteration": 2.709866523742676 }, { "auxiliary_loss_clip": 0.01111425, "auxiliary_loss_mlp": 0.01043243, "balance_loss_clip": 1.04129159, "balance_loss_mlp": 1.02612412, "epoch": 0.8163289845487885, "flos": 37593910915200.0, "grad_norm": 3.2868144298577655, "language_loss": 0.61461723, "learning_rate": 3.434170316671503e-07, "loss": 0.63616389, "num_input_tokens_seen": 146581845, "step": 6789, "time_per_iteration": 2.77622389793396 }, { "auxiliary_loss_clip": 0.01087839, "auxiliary_loss_mlp": 0.01039461, "balance_loss_clip": 1.03805256, "balance_loss_mlp": 1.0246675, "epoch": 0.8164492274394276, "flos": 13953989554560.0, "grad_norm": 2.4902101972470767, "language_loss": 0.89226764, "learning_rate": 3.4298070151231583e-07, "loss": 0.91354066, "num_input_tokens_seen": 146597245, "step": 6790, "time_per_iteration": 3.6053218841552734 }, { "auxiliary_loss_clip": 0.01114306, "auxiliary_loss_mlp": 0.01031678, "balance_loss_clip": 1.04511285, "balance_loss_mlp": 1.01579928, "epoch": 0.8165694703300668, "flos": 28986554747520.0, "grad_norm": 2.8417069648731403, "language_loss": 0.60582721, "learning_rate": 3.425446227259916e-07, "loss": 0.62728703, "num_input_tokens_seen": 146618210, "step": 6791, "time_per_iteration": 3.697826862335205 }, { "auxiliary_loss_clip": 0.01109287, "auxiliary_loss_mlp": 0.01040444, "balance_loss_clip": 1.04067659, "balance_loss_mlp": 1.02644849, "epoch": 0.8166897132207058, "flos": 25118365155840.0, "grad_norm": 1.8922524544497012, "language_loss": 0.82117975, "learning_rate": 3.421087953743296e-07, "loss": 0.84267706, "num_input_tokens_seen": 146637975, "step": 6792, "time_per_iteration": 2.6661412715911865 }, { "auxiliary_loss_clip": 0.01121121, "auxiliary_loss_mlp": 0.0104775, "balance_loss_clip": 1.03973663, "balance_loss_mlp": 1.03118038, "epoch": 0.8168099561113449, "flos": 23148593176320.0, "grad_norm": 4.818911813818845, "language_loss": 0.80367869, "learning_rate": 3.416732195234464e-07, "loss": 0.82536739, "num_input_tokens_seen": 146658030, "step": 6793, "time_per_iteration": 2.6404004096984863 }, { "auxiliary_loss_clip": 0.01122128, "auxiliary_loss_mlp": 0.01041053, "balance_loss_clip": 1.04404998, "balance_loss_mlp": 1.02611589, "epoch": 0.816930199001984, "flos": 18407666833920.0, "grad_norm": 1.52628695230925, "language_loss": 0.79524457, "learning_rate": 3.4123789523941613e-07, "loss": 0.81687641, "num_input_tokens_seen": 146677855, "step": 6794, "time_per_iteration": 2.5901012420654297 }, { "auxiliary_loss_clip": 0.01111691, "auxiliary_loss_mlp": 0.01041118, "balance_loss_clip": 1.03847945, "balance_loss_mlp": 1.02490544, "epoch": 0.8170504418926231, "flos": 21251324799360.0, "grad_norm": 1.496953102357601, "language_loss": 0.63594294, "learning_rate": 3.4080282258827884e-07, "loss": 0.657471, "num_input_tokens_seen": 146696230, "step": 6795, "time_per_iteration": 3.5823538303375244 }, { "auxiliary_loss_clip": 0.01121432, "auxiliary_loss_mlp": 0.01035278, "balance_loss_clip": 1.04168892, "balance_loss_mlp": 1.02068686, "epoch": 0.8171706847832622, "flos": 19099234362240.0, "grad_norm": 2.277380479339792, "language_loss": 0.72608638, "learning_rate": 3.403680016360342e-07, "loss": 0.74765348, "num_input_tokens_seen": 146714835, "step": 6796, "time_per_iteration": 2.6031718254089355 }, { "auxiliary_loss_clip": 0.01114746, "auxiliary_loss_mlp": 0.01047194, "balance_loss_clip": 1.04203904, "balance_loss_mlp": 1.03138733, "epoch": 0.8172909276739013, "flos": 21470128496640.0, "grad_norm": 2.221287855915607, "language_loss": 0.67748213, "learning_rate": 3.3993343244864403e-07, "loss": 0.69910157, "num_input_tokens_seen": 146734425, "step": 6797, "time_per_iteration": 2.563741683959961 }, { "auxiliary_loss_clip": 0.01119831, "auxiliary_loss_mlp": 0.01037065, "balance_loss_clip": 1.04157495, "balance_loss_mlp": 1.02243197, "epoch": 0.8174111705645404, "flos": 27599792417280.0, "grad_norm": 1.6507765123063327, "language_loss": 0.72999388, "learning_rate": 3.394991150920323e-07, "loss": 0.75156283, "num_input_tokens_seen": 146757545, "step": 6798, "time_per_iteration": 2.6893393993377686 }, { "auxiliary_loss_clip": 0.01087089, "auxiliary_loss_mlp": 0.00775133, "balance_loss_clip": 1.03957331, "balance_loss_mlp": 1.00048316, "epoch": 0.8175314134551794, "flos": 14064594508800.0, "grad_norm": 3.0959629734409155, "language_loss": 0.74266988, "learning_rate": 3.3906504963208396e-07, "loss": 0.7612921, "num_input_tokens_seen": 146774240, "step": 6799, "time_per_iteration": 2.6847188472747803 }, { "auxiliary_loss_clip": 0.01078999, "auxiliary_loss_mlp": 0.01041768, "balance_loss_clip": 1.03797436, "balance_loss_mlp": 1.02547169, "epoch": 0.8176516563458186, "flos": 22708076780160.0, "grad_norm": 1.9214162254826015, "language_loss": 0.6647976, "learning_rate": 3.3863123613464774e-07, "loss": 0.68600523, "num_input_tokens_seen": 146793140, "step": 6800, "time_per_iteration": 3.6148204803466797 }, { "auxiliary_loss_clip": 0.01108718, "auxiliary_loss_mlp": 0.01038006, "balance_loss_clip": 1.03962314, "balance_loss_mlp": 1.02235353, "epoch": 0.8177718992364577, "flos": 21945406279680.0, "grad_norm": 2.3749727157501788, "language_loss": 0.75228119, "learning_rate": 3.381976746655317e-07, "loss": 0.7737484, "num_input_tokens_seen": 146812895, "step": 6801, "time_per_iteration": 2.6448144912719727 }, { "auxiliary_loss_clip": 0.01075371, "auxiliary_loss_mlp": 0.01039863, "balance_loss_clip": 1.03943491, "balance_loss_mlp": 1.0249145, "epoch": 0.8178921421270967, "flos": 22017443005440.0, "grad_norm": 2.207963907849429, "language_loss": 0.67534161, "learning_rate": 3.3776436529050756e-07, "loss": 0.69649398, "num_input_tokens_seen": 146832445, "step": 6802, "time_per_iteration": 2.690122127532959 }, { "auxiliary_loss_clip": 0.01130053, "auxiliary_loss_mlp": 0.01035792, "balance_loss_clip": 1.042418, "balance_loss_mlp": 1.02091479, "epoch": 0.8180123850177359, "flos": 33183111496320.0, "grad_norm": 1.7512240117230087, "language_loss": 0.72693419, "learning_rate": 3.373313080753073e-07, "loss": 0.74859262, "num_input_tokens_seen": 146856505, "step": 6803, "time_per_iteration": 2.663404941558838 }, { "auxiliary_loss_clip": 0.01116807, "auxiliary_loss_mlp": 0.01043967, "balance_loss_clip": 1.03990674, "balance_loss_mlp": 1.02723038, "epoch": 0.8181326279083749, "flos": 22091167670400.0, "grad_norm": 2.0269909022224475, "language_loss": 0.77667665, "learning_rate": 3.3689850308562527e-07, "loss": 0.79828441, "num_input_tokens_seen": 146876950, "step": 6804, "time_per_iteration": 2.6145272254943848 }, { "auxiliary_loss_clip": 0.01071692, "auxiliary_loss_mlp": 0.01041856, "balance_loss_clip": 1.03909481, "balance_loss_mlp": 1.0273366, "epoch": 0.818252870799014, "flos": 15705747936000.0, "grad_norm": 1.849898782649762, "language_loss": 0.7766645, "learning_rate": 3.364659503871183e-07, "loss": 0.79779989, "num_input_tokens_seen": 146894885, "step": 6805, "time_per_iteration": 2.691316604614258 }, { "auxiliary_loss_clip": 0.01090603, "auxiliary_loss_mlp": 0.01029637, "balance_loss_clip": 1.03813219, "balance_loss_mlp": 1.0147841, "epoch": 0.8183731136896532, "flos": 18770687637120.0, "grad_norm": 2.1622531533475375, "language_loss": 0.83725333, "learning_rate": 3.3603365004540417e-07, "loss": 0.85845572, "num_input_tokens_seen": 146913180, "step": 6806, "time_per_iteration": 2.670590400695801 }, { "auxiliary_loss_clip": 0.01132277, "auxiliary_loss_mlp": 0.01035389, "balance_loss_clip": 1.04371905, "balance_loss_mlp": 1.02022576, "epoch": 0.8184933565802922, "flos": 26541792293760.0, "grad_norm": 2.0721910117298297, "language_loss": 0.77055132, "learning_rate": 3.356016021260624e-07, "loss": 0.79222804, "num_input_tokens_seen": 146933510, "step": 6807, "time_per_iteration": 2.62699556350708 }, { "auxiliary_loss_clip": 0.01123162, "auxiliary_loss_mlp": 0.0104988, "balance_loss_clip": 1.04329717, "balance_loss_mlp": 1.03230882, "epoch": 0.8186135994709313, "flos": 17530117660800.0, "grad_norm": 2.907746807473566, "language_loss": 0.65581036, "learning_rate": 3.35169806694634e-07, "loss": 0.67754078, "num_input_tokens_seen": 146951760, "step": 6808, "time_per_iteration": 2.580497980117798 }, { "auxiliary_loss_clip": 0.0100386, "auxiliary_loss_mlp": 0.01000857, "balance_loss_clip": 1.00824594, "balance_loss_mlp": 0.9993847, "epoch": 0.8187338423615703, "flos": 63480300675840.0, "grad_norm": 0.7166798222350615, "language_loss": 0.6062845, "learning_rate": 3.3473826381662186e-07, "loss": 0.62633169, "num_input_tokens_seen": 147022900, "step": 6809, "time_per_iteration": 3.3619697093963623 }, { "auxiliary_loss_clip": 0.01115539, "auxiliary_loss_mlp": 0.01039419, "balance_loss_clip": 1.04224133, "balance_loss_mlp": 1.02404118, "epoch": 0.8188540852522095, "flos": 17529974006400.0, "grad_norm": 2.0326086907331358, "language_loss": 0.82128119, "learning_rate": 3.3430697355749216e-07, "loss": 0.84283078, "num_input_tokens_seen": 147040590, "step": 6810, "time_per_iteration": 2.606898069381714 }, { "auxiliary_loss_clip": 0.01080231, "auxiliary_loss_mlp": 0.01047466, "balance_loss_clip": 1.03746331, "balance_loss_mlp": 1.0302763, "epoch": 0.8189743281428485, "flos": 14392530702720.0, "grad_norm": 1.8819470258303168, "language_loss": 0.75559175, "learning_rate": 3.3387593598266907e-07, "loss": 0.7768687, "num_input_tokens_seen": 147057200, "step": 6811, "time_per_iteration": 2.6896748542785645 }, { "auxiliary_loss_clip": 0.01086657, "auxiliary_loss_mlp": 0.01036015, "balance_loss_clip": 1.03706288, "balance_loss_mlp": 1.02037501, "epoch": 0.8190945710334876, "flos": 25080479285760.0, "grad_norm": 2.366112228458309, "language_loss": 0.78374839, "learning_rate": 3.3344515115754225e-07, "loss": 0.80497509, "num_input_tokens_seen": 147076180, "step": 6812, "time_per_iteration": 2.6860547065734863 }, { "auxiliary_loss_clip": 0.01097239, "auxiliary_loss_mlp": 0.01034149, "balance_loss_clip": 1.03742099, "balance_loss_mlp": 1.019629, "epoch": 0.8192148139241268, "flos": 21507152440320.0, "grad_norm": 12.948585452626448, "language_loss": 0.80122852, "learning_rate": 3.33014619147461e-07, "loss": 0.82254237, "num_input_tokens_seen": 147094205, "step": 6813, "time_per_iteration": 2.6620657444000244 }, { "auxiliary_loss_clip": 0.01109356, "auxiliary_loss_mlp": 0.01044922, "balance_loss_clip": 1.04219317, "balance_loss_mlp": 1.02977622, "epoch": 0.8193350568147658, "flos": 23952166289280.0, "grad_norm": 1.982758595073926, "language_loss": 0.7190358, "learning_rate": 3.325843400177362e-07, "loss": 0.74057859, "num_input_tokens_seen": 147115545, "step": 6814, "time_per_iteration": 2.666489601135254 }, { "auxiliary_loss_clip": 0.011229, "auxiliary_loss_mlp": 0.00772483, "balance_loss_clip": 1.04374897, "balance_loss_mlp": 1.00050378, "epoch": 0.8194552997054049, "flos": 20559469962240.0, "grad_norm": 3.657776094252052, "language_loss": 0.74053586, "learning_rate": 3.32154313833642e-07, "loss": 0.75948972, "num_input_tokens_seen": 147135700, "step": 6815, "time_per_iteration": 2.5849673748016357 }, { "auxiliary_loss_clip": 0.01136689, "auxiliary_loss_mlp": 0.01044044, "balance_loss_clip": 1.04435682, "balance_loss_mlp": 1.02685452, "epoch": 0.819575542596044, "flos": 26031753123840.0, "grad_norm": 1.9840024524848554, "language_loss": 0.59355867, "learning_rate": 3.3172454066041164e-07, "loss": 0.61536598, "num_input_tokens_seen": 147155205, "step": 6816, "time_per_iteration": 2.6309874057769775 }, { "auxiliary_loss_clip": 0.01068513, "auxiliary_loss_mlp": 0.00770378, "balance_loss_clip": 1.03999662, "balance_loss_mlp": 1.00055242, "epoch": 0.8196957854866831, "flos": 29096944220160.0, "grad_norm": 2.2687802594498065, "language_loss": 0.76173735, "learning_rate": 3.3129502056324234e-07, "loss": 0.78012621, "num_input_tokens_seen": 147176570, "step": 6817, "time_per_iteration": 4.653801679611206 }, { "auxiliary_loss_clip": 0.0098252, "auxiliary_loss_mlp": 0.01004556, "balance_loss_clip": 1.01168561, "balance_loss_mlp": 1.00276816, "epoch": 0.8198160283773221, "flos": 69033631898880.0, "grad_norm": 0.8019068172326513, "language_loss": 0.5972141, "learning_rate": 3.3086575360729165e-07, "loss": 0.61708486, "num_input_tokens_seen": 147234105, "step": 6818, "time_per_iteration": 3.218353033065796 }, { "auxiliary_loss_clip": 0.0110478, "auxiliary_loss_mlp": 0.01045753, "balance_loss_clip": 1.03929424, "balance_loss_mlp": 1.0298028, "epoch": 0.8199362712679613, "flos": 16618058496000.0, "grad_norm": 1.9690161596726574, "language_loss": 0.71468252, "learning_rate": 3.3043673985767906e-07, "loss": 0.73618782, "num_input_tokens_seen": 147253170, "step": 6819, "time_per_iteration": 2.8456790447235107 }, { "auxiliary_loss_clip": 0.01086656, "auxiliary_loss_mlp": 0.01051547, "balance_loss_clip": 1.03769505, "balance_loss_mlp": 1.03405929, "epoch": 0.8200565141586004, "flos": 21757664868480.0, "grad_norm": 1.7236273084993372, "language_loss": 0.77623343, "learning_rate": 3.3000797937948564e-07, "loss": 0.79761547, "num_input_tokens_seen": 147271465, "step": 6820, "time_per_iteration": 2.6051697731018066 }, { "auxiliary_loss_clip": 0.010075, "auxiliary_loss_mlp": 0.01000173, "balance_loss_clip": 1.00701499, "balance_loss_mlp": 0.9987545, "epoch": 0.8201767570492394, "flos": 69807112392960.0, "grad_norm": 0.9474443060841246, "language_loss": 0.65043712, "learning_rate": 3.295794722377534e-07, "loss": 0.67051387, "num_input_tokens_seen": 147335070, "step": 6821, "time_per_iteration": 4.0386433601379395 }, { "auxiliary_loss_clip": 0.01128966, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.04207242, "balance_loss_mlp": 1.02081919, "epoch": 0.8202969999398786, "flos": 23111892455040.0, "grad_norm": 1.9192131794691831, "language_loss": 0.80334795, "learning_rate": 3.291512184974876e-07, "loss": 0.82498538, "num_input_tokens_seen": 147355460, "step": 6822, "time_per_iteration": 2.5943093299865723 }, { "auxiliary_loss_clip": 0.01105125, "auxiliary_loss_mlp": 0.01039662, "balance_loss_clip": 1.03830385, "balance_loss_mlp": 1.02266264, "epoch": 0.8204172428305176, "flos": 28220616109440.0, "grad_norm": 1.7810495852782071, "language_loss": 0.66688365, "learning_rate": 3.2872321822365346e-07, "loss": 0.68833154, "num_input_tokens_seen": 147375675, "step": 6823, "time_per_iteration": 2.6596498489379883 }, { "auxiliary_loss_clip": 0.01118785, "auxiliary_loss_mlp": 0.01035871, "balance_loss_clip": 1.04218388, "balance_loss_mlp": 1.02102971, "epoch": 0.8205374857211567, "flos": 20887011106560.0, "grad_norm": 2.0977698746373408, "language_loss": 0.73437726, "learning_rate": 3.282954714811783e-07, "loss": 0.75592375, "num_input_tokens_seen": 147394580, "step": 6824, "time_per_iteration": 2.5933010578155518 }, { "auxiliary_loss_clip": 0.01096583, "auxiliary_loss_mlp": 0.01041946, "balance_loss_clip": 1.03798032, "balance_loss_mlp": 1.0250895, "epoch": 0.8206577286117959, "flos": 13152140294400.0, "grad_norm": 2.123952994968562, "language_loss": 0.71209335, "learning_rate": 3.2786797833495093e-07, "loss": 0.73347867, "num_input_tokens_seen": 147409935, "step": 6825, "time_per_iteration": 3.5162832736968994 }, { "auxiliary_loss_clip": 0.01130296, "auxiliary_loss_mlp": 0.01033761, "balance_loss_clip": 1.04129851, "balance_loss_mlp": 1.01891994, "epoch": 0.8207779715024349, "flos": 25265634917760.0, "grad_norm": 3.7438921875630164, "language_loss": 0.72792119, "learning_rate": 3.274407388498213e-07, "loss": 0.74956179, "num_input_tokens_seen": 147428065, "step": 6826, "time_per_iteration": 2.5624117851257324 }, { "auxiliary_loss_clip": 0.01092491, "auxiliary_loss_mlp": 0.0104122, "balance_loss_clip": 1.03964591, "balance_loss_mlp": 1.02674794, "epoch": 0.820898214393074, "flos": 19610243199360.0, "grad_norm": 2.3796478391210156, "language_loss": 0.74531823, "learning_rate": 3.270137530906021e-07, "loss": 0.76665533, "num_input_tokens_seen": 147447300, "step": 6827, "time_per_iteration": 2.616598606109619 }, { "auxiliary_loss_clip": 0.0107372, "auxiliary_loss_mlp": 0.01032526, "balance_loss_clip": 1.03791976, "balance_loss_mlp": 1.0177443, "epoch": 0.8210184572837131, "flos": 15596615439360.0, "grad_norm": 2.2393285184455625, "language_loss": 0.83170563, "learning_rate": 3.265870211220665e-07, "loss": 0.85276812, "num_input_tokens_seen": 147465135, "step": 6828, "time_per_iteration": 2.6420419216156006 }, { "auxiliary_loss_clip": 0.01090145, "auxiliary_loss_mlp": 0.01036858, "balance_loss_clip": 1.03926539, "balance_loss_mlp": 1.01839256, "epoch": 0.8211387001743522, "flos": 20813932886400.0, "grad_norm": 2.0638244450003667, "language_loss": 0.81634986, "learning_rate": 3.2616054300894934e-07, "loss": 0.8376199, "num_input_tokens_seen": 147484585, "step": 6829, "time_per_iteration": 2.645249366760254 }, { "auxiliary_loss_clip": 0.01094367, "auxiliary_loss_mlp": 0.01040403, "balance_loss_clip": 1.03903425, "balance_loss_mlp": 1.02494168, "epoch": 0.8212589430649913, "flos": 27704579368320.0, "grad_norm": 4.116142359798905, "language_loss": 0.84560752, "learning_rate": 3.2573431881594693e-07, "loss": 0.86695522, "num_input_tokens_seen": 147504130, "step": 6830, "time_per_iteration": 2.6333580017089844 }, { "auxiliary_loss_clip": 0.0107049, "auxiliary_loss_mlp": 0.01047105, "balance_loss_clip": 1.03507268, "balance_loss_mlp": 1.0314827, "epoch": 0.8213791859556304, "flos": 22455625017600.0, "grad_norm": 2.159228012330345, "language_loss": 0.65866792, "learning_rate": 3.2530834860771663e-07, "loss": 0.6798439, "num_input_tokens_seen": 147523510, "step": 6831, "time_per_iteration": 2.809767246246338 }, { "auxiliary_loss_clip": 0.01118517, "auxiliary_loss_mlp": 0.01039835, "balance_loss_clip": 1.0398283, "balance_loss_mlp": 1.02364659, "epoch": 0.8214994288462695, "flos": 16654471908480.0, "grad_norm": 1.909655114544382, "language_loss": 0.74239451, "learning_rate": 3.248826324488794e-07, "loss": 0.763978, "num_input_tokens_seen": 147540805, "step": 6832, "time_per_iteration": 2.547755718231201 }, { "auxiliary_loss_clip": 0.0113198, "auxiliary_loss_mlp": 0.01036505, "balance_loss_clip": 1.04455543, "balance_loss_mlp": 1.02211618, "epoch": 0.8216196717369085, "flos": 25221787390080.0, "grad_norm": 2.1360336141079634, "language_loss": 0.88098919, "learning_rate": 3.244571704040138e-07, "loss": 0.90267408, "num_input_tokens_seen": 147560965, "step": 6833, "time_per_iteration": 2.5782244205474854 }, { "auxiliary_loss_clip": 0.01115542, "auxiliary_loss_mlp": 0.01042344, "balance_loss_clip": 1.03829265, "balance_loss_mlp": 1.02641737, "epoch": 0.8217399146275477, "flos": 25371930240000.0, "grad_norm": 2.2031954115131795, "language_loss": 0.73738962, "learning_rate": 3.2403196253766374e-07, "loss": 0.75896847, "num_input_tokens_seen": 147580045, "step": 6834, "time_per_iteration": 2.6144776344299316 }, { "auxiliary_loss_clip": 0.01117508, "auxiliary_loss_mlp": 0.01040054, "balance_loss_clip": 1.04016638, "balance_loss_mlp": 1.022596, "epoch": 0.8218601575181868, "flos": 25629625388160.0, "grad_norm": 2.3346884053342336, "language_loss": 0.79022694, "learning_rate": 3.2360700891433254e-07, "loss": 0.81180263, "num_input_tokens_seen": 147599070, "step": 6835, "time_per_iteration": 2.6211705207824707 }, { "auxiliary_loss_clip": 0.01001567, "auxiliary_loss_mlp": 0.01002773, "balance_loss_clip": 1.01086593, "balance_loss_mlp": 1.00118756, "epoch": 0.8219804004088258, "flos": 67660229427840.0, "grad_norm": 0.7863303732843467, "language_loss": 0.57350385, "learning_rate": 3.231823095984847e-07, "loss": 0.59354722, "num_input_tokens_seen": 147653710, "step": 6836, "time_per_iteration": 3.1398673057556152 }, { "auxiliary_loss_clip": 0.01107763, "auxiliary_loss_mlp": 0.01037804, "balance_loss_clip": 1.04231048, "balance_loss_mlp": 1.0236299, "epoch": 0.822100643299465, "flos": 19464266327040.0, "grad_norm": 7.976673920422091, "language_loss": 0.76110715, "learning_rate": 3.2275786465454814e-07, "loss": 0.78256279, "num_input_tokens_seen": 147670360, "step": 6837, "time_per_iteration": 2.593585968017578 }, { "auxiliary_loss_clip": 0.01094483, "auxiliary_loss_mlp": 0.01043259, "balance_loss_clip": 1.03896856, "balance_loss_mlp": 1.02670074, "epoch": 0.822220886190104, "flos": 24681368292480.0, "grad_norm": 1.8660471424480793, "language_loss": 0.75406659, "learning_rate": 3.2233367414690917e-07, "loss": 0.77544397, "num_input_tokens_seen": 147692550, "step": 6838, "time_per_iteration": 2.642017126083374 }, { "auxiliary_loss_clip": 0.01089427, "auxiliary_loss_mlp": 0.01032349, "balance_loss_clip": 1.03659844, "balance_loss_mlp": 1.01748312, "epoch": 0.8223411290807431, "flos": 27819062991360.0, "grad_norm": 2.2420615111997777, "language_loss": 0.84751767, "learning_rate": 3.219097381399183e-07, "loss": 0.86873537, "num_input_tokens_seen": 147709725, "step": 6839, "time_per_iteration": 2.742370843887329 }, { "auxiliary_loss_clip": 0.01117761, "auxiliary_loss_mlp": 0.01039303, "balance_loss_clip": 1.0441395, "balance_loss_mlp": 1.02318609, "epoch": 0.8224613719713821, "flos": 23218546913280.0, "grad_norm": 1.7762671092385764, "language_loss": 0.81017715, "learning_rate": 3.2148605669788584e-07, "loss": 0.83174777, "num_input_tokens_seen": 147729615, "step": 6840, "time_per_iteration": 2.618436574935913 }, { "auxiliary_loss_clip": 0.01110025, "auxiliary_loss_mlp": 0.01036428, "balance_loss_clip": 1.04300833, "balance_loss_mlp": 1.02121663, "epoch": 0.8225816148620213, "flos": 15706250726400.0, "grad_norm": 2.6920398048696264, "language_loss": 0.77742994, "learning_rate": 3.2106262988508405e-07, "loss": 0.79889452, "num_input_tokens_seen": 147747665, "step": 6841, "time_per_iteration": 2.583183526992798 }, { "auxiliary_loss_clip": 0.01109509, "auxiliary_loss_mlp": 0.01043709, "balance_loss_clip": 1.03990126, "balance_loss_mlp": 1.02759194, "epoch": 0.8227018577526604, "flos": 18515111391360.0, "grad_norm": 3.6834328132856187, "language_loss": 0.7420845, "learning_rate": 3.206394577657465e-07, "loss": 0.76361668, "num_input_tokens_seen": 147765445, "step": 6842, "time_per_iteration": 3.5217652320861816 }, { "auxiliary_loss_clip": 0.0112897, "auxiliary_loss_mlp": 0.01046389, "balance_loss_clip": 1.04370439, "balance_loss_mlp": 1.02961576, "epoch": 0.8228221006432994, "flos": 22236785406720.0, "grad_norm": 2.6038301609576164, "language_loss": 0.72581398, "learning_rate": 3.202165404040675e-07, "loss": 0.74756753, "num_input_tokens_seen": 147783365, "step": 6843, "time_per_iteration": 2.5597496032714844 }, { "auxiliary_loss_clip": 0.01066404, "auxiliary_loss_mlp": 0.01042906, "balance_loss_clip": 1.03729296, "balance_loss_mlp": 1.02622867, "epoch": 0.8229423435339386, "flos": 24097532630400.0, "grad_norm": 2.280452326390035, "language_loss": 0.74614203, "learning_rate": 3.1979387786420396e-07, "loss": 0.76723516, "num_input_tokens_seen": 147803605, "step": 6844, "time_per_iteration": 3.644972801208496 }, { "auxiliary_loss_clip": 0.01108401, "auxiliary_loss_mlp": 0.01032681, "balance_loss_clip": 1.03997087, "balance_loss_mlp": 1.01692128, "epoch": 0.8230625864245776, "flos": 23878549365120.0, "grad_norm": 1.9765087828796222, "language_loss": 0.81947851, "learning_rate": 3.1937147021027346e-07, "loss": 0.84088933, "num_input_tokens_seen": 147822060, "step": 6845, "time_per_iteration": 2.615494728088379 }, { "auxiliary_loss_clip": 0.01117972, "auxiliary_loss_mlp": 0.01038535, "balance_loss_clip": 1.04189396, "balance_loss_mlp": 1.02303791, "epoch": 0.8231828293152167, "flos": 16581106379520.0, "grad_norm": 2.430666682253666, "language_loss": 0.76562822, "learning_rate": 3.189493175063547e-07, "loss": 0.7871933, "num_input_tokens_seen": 147839295, "step": 6846, "time_per_iteration": 2.5462231636047363 }, { "auxiliary_loss_clip": 0.01108652, "auxiliary_loss_mlp": 0.01041985, "balance_loss_clip": 1.04110539, "balance_loss_mlp": 1.02546287, "epoch": 0.8233030722058559, "flos": 18880071528960.0, "grad_norm": 2.023341415454135, "language_loss": 0.6738652, "learning_rate": 3.1852741981648776e-07, "loss": 0.69537163, "num_input_tokens_seen": 147857945, "step": 6847, "time_per_iteration": 3.487617015838623 }, { "auxiliary_loss_clip": 0.01084667, "auxiliary_loss_mlp": 0.01040415, "balance_loss_clip": 1.03981996, "balance_loss_mlp": 1.02454877, "epoch": 0.8234233150964949, "flos": 28439024757120.0, "grad_norm": 2.4770911903485993, "language_loss": 0.70394206, "learning_rate": 3.1810577720467404e-07, "loss": 0.7251929, "num_input_tokens_seen": 147879675, "step": 6848, "time_per_iteration": 2.694370985031128 }, { "auxiliary_loss_clip": 0.01112325, "auxiliary_loss_mlp": 0.0103827, "balance_loss_clip": 1.04196823, "balance_loss_mlp": 1.02268958, "epoch": 0.823543557987134, "flos": 33765941577600.0, "grad_norm": 1.516813721201424, "language_loss": 0.56659007, "learning_rate": 3.176843897348769e-07, "loss": 0.58809602, "num_input_tokens_seen": 147902870, "step": 6849, "time_per_iteration": 2.7181925773620605 }, { "auxiliary_loss_clip": 0.01103217, "auxiliary_loss_mlp": 0.01050405, "balance_loss_clip": 1.04035401, "balance_loss_mlp": 1.03346515, "epoch": 0.8236638008777731, "flos": 17092366611840.0, "grad_norm": 2.694828215531024, "language_loss": 0.75451124, "learning_rate": 3.1726325747102034e-07, "loss": 0.77604735, "num_input_tokens_seen": 147921245, "step": 6850, "time_per_iteration": 2.569387912750244 }, { "auxiliary_loss_clip": 0.01073921, "auxiliary_loss_mlp": 0.01046962, "balance_loss_clip": 1.03323913, "balance_loss_mlp": 1.03102386, "epoch": 0.8237840437684122, "flos": 61639982334720.0, "grad_norm": 1.5076337224378293, "language_loss": 0.64112186, "learning_rate": 3.1684238047698974e-07, "loss": 0.66233075, "num_input_tokens_seen": 147949515, "step": 6851, "time_per_iteration": 3.866121768951416 }, { "auxiliary_loss_clip": 0.01111129, "auxiliary_loss_mlp": 0.01040367, "balance_loss_clip": 1.04102349, "balance_loss_mlp": 1.02368939, "epoch": 0.8239042866590512, "flos": 27309023821440.0, "grad_norm": 2.1930133844239164, "language_loss": 0.53138757, "learning_rate": 3.1642175881663155e-07, "loss": 0.55290246, "num_input_tokens_seen": 147969245, "step": 6852, "time_per_iteration": 2.677950620651245 }, { "auxiliary_loss_clip": 0.01131446, "auxiliary_loss_mlp": 0.0103195, "balance_loss_clip": 1.04219389, "balance_loss_mlp": 1.0172869, "epoch": 0.8240245295496904, "flos": 21726351187200.0, "grad_norm": 2.0361527030323066, "language_loss": 0.83702481, "learning_rate": 3.160013925537537e-07, "loss": 0.85865879, "num_input_tokens_seen": 147990080, "step": 6853, "time_per_iteration": 2.5349340438842773 }, { "auxiliary_loss_clip": 0.01098287, "auxiliary_loss_mlp": 0.01035337, "balance_loss_clip": 1.03955877, "balance_loss_mlp": 1.01913619, "epoch": 0.8241447724403295, "flos": 20009318279040.0, "grad_norm": 2.6168036923873688, "language_loss": 0.75696576, "learning_rate": 3.155812817521266e-07, "loss": 0.77830195, "num_input_tokens_seen": 148010455, "step": 6854, "time_per_iteration": 2.6495137214660645 }, { "auxiliary_loss_clip": 0.01108682, "auxiliary_loss_mlp": 0.01040886, "balance_loss_clip": 1.04008675, "balance_loss_mlp": 1.02469778, "epoch": 0.8242650153309685, "flos": 22272983337600.0, "grad_norm": 1.928670910137128, "language_loss": 0.7807039, "learning_rate": 3.151614264754787e-07, "loss": 0.80219948, "num_input_tokens_seen": 148028400, "step": 6855, "time_per_iteration": 2.591064214706421 }, { "auxiliary_loss_clip": 0.01132292, "auxiliary_loss_mlp": 0.01037109, "balance_loss_clip": 1.04094028, "balance_loss_mlp": 1.0217545, "epoch": 0.8243852582216077, "flos": 22309971367680.0, "grad_norm": 2.7640612266151687, "language_loss": 0.79230654, "learning_rate": 3.147418267875035e-07, "loss": 0.81400055, "num_input_tokens_seen": 148046530, "step": 6856, "time_per_iteration": 2.5592894554138184 }, { "auxiliary_loss_clip": 0.01065672, "auxiliary_loss_mlp": 0.00773487, "balance_loss_clip": 1.03391147, "balance_loss_mlp": 1.00051296, "epoch": 0.8245055011122467, "flos": 24645421756800.0, "grad_norm": 2.549016462492462, "language_loss": 0.65765667, "learning_rate": 3.1432248275185315e-07, "loss": 0.67604822, "num_input_tokens_seen": 148067040, "step": 6857, "time_per_iteration": 2.7211947441101074 }, { "auxiliary_loss_clip": 0.01116513, "auxiliary_loss_mlp": 0.01040719, "balance_loss_clip": 1.03990924, "balance_loss_mlp": 1.02522159, "epoch": 0.8246257440028858, "flos": 17487275713920.0, "grad_norm": 2.123576456659765, "language_loss": 0.77459753, "learning_rate": 3.139033944321412e-07, "loss": 0.79616982, "num_input_tokens_seen": 148084400, "step": 6858, "time_per_iteration": 2.549250364303589 }, { "auxiliary_loss_clip": 0.01121406, "auxiliary_loss_mlp": 0.01040717, "balance_loss_clip": 1.04120636, "balance_loss_mlp": 1.02561355, "epoch": 0.824745986893525, "flos": 25010130499200.0, "grad_norm": 1.66696157663863, "language_loss": 0.79194176, "learning_rate": 3.1348456189194507e-07, "loss": 0.81356299, "num_input_tokens_seen": 148104860, "step": 6859, "time_per_iteration": 2.5978031158447266 }, { "auxiliary_loss_clip": 0.01084591, "auxiliary_loss_mlp": 0.01054249, "balance_loss_clip": 1.03690696, "balance_loss_mlp": 1.03746486, "epoch": 0.824866229784164, "flos": 18772698798720.0, "grad_norm": 1.6586278295647086, "language_loss": 0.8297925, "learning_rate": 3.1306598519479876e-07, "loss": 0.85118091, "num_input_tokens_seen": 148124680, "step": 6860, "time_per_iteration": 2.653796911239624 }, { "auxiliary_loss_clip": 0.01103064, "auxiliary_loss_mlp": 0.01038535, "balance_loss_clip": 1.04080379, "balance_loss_mlp": 1.02381873, "epoch": 0.8249864726748031, "flos": 23842171866240.0, "grad_norm": 1.7223058474242008, "language_loss": 0.77994597, "learning_rate": 3.1264766440420177e-07, "loss": 0.80136204, "num_input_tokens_seen": 148147150, "step": 6861, "time_per_iteration": 2.6273999214172363 }, { "auxiliary_loss_clip": 0.01118469, "auxiliary_loss_mlp": 0.01037051, "balance_loss_clip": 1.04265594, "balance_loss_mlp": 1.02201915, "epoch": 0.8251067155654422, "flos": 20303103617280.0, "grad_norm": 3.295002157658535, "language_loss": 0.69078732, "learning_rate": 3.122295995836124e-07, "loss": 0.7123425, "num_input_tokens_seen": 148167020, "step": 6862, "time_per_iteration": 2.613039493560791 }, { "auxiliary_loss_clip": 0.01121786, "auxiliary_loss_mlp": 0.01041681, "balance_loss_clip": 1.0399332, "balance_loss_mlp": 1.02545679, "epoch": 0.8252269584560813, "flos": 25009699536000.0, "grad_norm": 1.955739800685825, "language_loss": 0.77877903, "learning_rate": 3.118117907964508e-07, "loss": 0.80041367, "num_input_tokens_seen": 148188965, "step": 6863, "time_per_iteration": 2.6121139526367188 }, { "auxiliary_loss_clip": 0.01102725, "auxiliary_loss_mlp": 0.01047994, "balance_loss_clip": 1.04294097, "balance_loss_mlp": 1.03167415, "epoch": 0.8253472013467203, "flos": 17128564542720.0, "grad_norm": 1.8311527011032833, "language_loss": 0.80219918, "learning_rate": 3.1139423810609856e-07, "loss": 0.82370639, "num_input_tokens_seen": 148205660, "step": 6864, "time_per_iteration": 2.674448013305664 }, { "auxiliary_loss_clip": 0.01132741, "auxiliary_loss_mlp": 0.01040534, "balance_loss_clip": 1.04181004, "balance_loss_mlp": 1.02436876, "epoch": 0.8254674442373595, "flos": 22414794232320.0, "grad_norm": 2.3596654754149577, "language_loss": 0.7549628, "learning_rate": 3.1097694157589714e-07, "loss": 0.77669561, "num_input_tokens_seen": 148225545, "step": 6865, "time_per_iteration": 2.5908854007720947 }, { "auxiliary_loss_clip": 0.0111854, "auxiliary_loss_mlp": 0.01041033, "balance_loss_clip": 1.04298329, "balance_loss_mlp": 1.0246774, "epoch": 0.8255876871279986, "flos": 24786765774720.0, "grad_norm": 3.1604007645995686, "language_loss": 0.75829279, "learning_rate": 3.105599012691511e-07, "loss": 0.77988851, "num_input_tokens_seen": 148243975, "step": 6866, "time_per_iteration": 2.6206517219543457 }, { "auxiliary_loss_clip": 0.01117848, "auxiliary_loss_mlp": 0.01035436, "balance_loss_clip": 1.04201591, "balance_loss_mlp": 1.01977801, "epoch": 0.8257079300186376, "flos": 27455431656960.0, "grad_norm": 1.4701522647716312, "language_loss": 0.82411885, "learning_rate": 3.101431172491249e-07, "loss": 0.84565175, "num_input_tokens_seen": 148265520, "step": 6867, "time_per_iteration": 2.6194324493408203 }, { "auxiliary_loss_clip": 0.01097106, "auxiliary_loss_mlp": 0.00772346, "balance_loss_clip": 1.03672254, "balance_loss_mlp": 1.00048983, "epoch": 0.8258281729092768, "flos": 16471866142080.0, "grad_norm": 3.6152158666411007, "language_loss": 0.72089124, "learning_rate": 3.097265895790444e-07, "loss": 0.73958576, "num_input_tokens_seen": 148283730, "step": 6868, "time_per_iteration": 2.642716884613037 }, { "auxiliary_loss_clip": 0.01100406, "auxiliary_loss_mlp": 0.01039531, "balance_loss_clip": 1.04098368, "balance_loss_mlp": 1.02389073, "epoch": 0.8259484157999158, "flos": 21433822824960.0, "grad_norm": 2.495013834050026, "language_loss": 0.83848089, "learning_rate": 3.093103183220962e-07, "loss": 0.85988021, "num_input_tokens_seen": 148303775, "step": 6869, "time_per_iteration": 4.513405084609985 }, { "auxiliary_loss_clip": 0.0102635, "auxiliary_loss_mlp": 0.01004229, "balance_loss_clip": 1.00791121, "balance_loss_mlp": 1.00270295, "epoch": 0.8260686586905549, "flos": 58322342453760.0, "grad_norm": 0.8283092995514415, "language_loss": 0.59353423, "learning_rate": 3.0889430354142796e-07, "loss": 0.61384004, "num_input_tokens_seen": 148365285, "step": 6870, "time_per_iteration": 3.15983510017395 }, { "auxiliary_loss_clip": 0.01098725, "auxiliary_loss_mlp": 0.01031031, "balance_loss_clip": 1.03811038, "balance_loss_mlp": 1.01471114, "epoch": 0.826188901581194, "flos": 27527288814720.0, "grad_norm": 1.880522421447244, "language_loss": 0.70306766, "learning_rate": 3.084785453001497e-07, "loss": 0.72436523, "num_input_tokens_seen": 148386200, "step": 6871, "time_per_iteration": 2.69478178024292 }, { "auxiliary_loss_clip": 0.01104609, "auxiliary_loss_mlp": 0.00772008, "balance_loss_clip": 1.03947628, "balance_loss_mlp": 1.00049925, "epoch": 0.8263091444718331, "flos": 23696051339520.0, "grad_norm": 2.052814980433371, "language_loss": 0.81877834, "learning_rate": 3.080630436613314e-07, "loss": 0.83754456, "num_input_tokens_seen": 148403970, "step": 6872, "time_per_iteration": 2.6344425678253174 }, { "auxiliary_loss_clip": 0.01110663, "auxiliary_loss_mlp": 0.01037191, "balance_loss_clip": 1.03899491, "balance_loss_mlp": 1.02281451, "epoch": 0.8264293873624722, "flos": 17165157523200.0, "grad_norm": 3.872359046795957, "language_loss": 0.86138576, "learning_rate": 3.076477986880039e-07, "loss": 0.88286424, "num_input_tokens_seen": 148421765, "step": 6873, "time_per_iteration": 3.5499870777130127 }, { "auxiliary_loss_clip": 0.01108253, "auxiliary_loss_mlp": 0.01039219, "balance_loss_clip": 1.04153681, "balance_loss_mlp": 1.02450871, "epoch": 0.8265496302531112, "flos": 24098645952000.0, "grad_norm": 3.364525134245275, "language_loss": 0.6924811, "learning_rate": 3.0723281044315986e-07, "loss": 0.71395588, "num_input_tokens_seen": 148443720, "step": 6874, "time_per_iteration": 2.6912131309509277 }, { "auxiliary_loss_clip": 0.01126684, "auxiliary_loss_mlp": 0.01039658, "balance_loss_clip": 1.04003084, "balance_loss_mlp": 1.02495909, "epoch": 0.8266698731437504, "flos": 14099894599680.0, "grad_norm": 2.139142379990511, "language_loss": 0.76638418, "learning_rate": 3.068180789897521e-07, "loss": 0.78804755, "num_input_tokens_seen": 148462130, "step": 6875, "time_per_iteration": 2.5205986499786377 }, { "auxiliary_loss_clip": 0.01125159, "auxiliary_loss_mlp": 0.01038086, "balance_loss_clip": 1.04228854, "balance_loss_mlp": 1.02186179, "epoch": 0.8267901160343895, "flos": 30777563715840.0, "grad_norm": 1.4619582677548055, "language_loss": 0.81708902, "learning_rate": 3.064036043906966e-07, "loss": 0.83872145, "num_input_tokens_seen": 148485570, "step": 6876, "time_per_iteration": 2.7011711597442627 }, { "auxiliary_loss_clip": 0.01106565, "auxiliary_loss_mlp": 0.01042362, "balance_loss_clip": 1.04256976, "balance_loss_mlp": 1.02508903, "epoch": 0.8269103589250285, "flos": 40624915242240.0, "grad_norm": 2.161631861520018, "language_loss": 0.68031096, "learning_rate": 3.059893867088668e-07, "loss": 0.70180023, "num_input_tokens_seen": 148509715, "step": 6877, "time_per_iteration": 3.6993072032928467 }, { "auxiliary_loss_clip": 0.01116084, "auxiliary_loss_mlp": 0.01035891, "balance_loss_clip": 1.04087973, "balance_loss_mlp": 1.02115679, "epoch": 0.8270306018156677, "flos": 30263645877120.0, "grad_norm": 1.8297503433084004, "language_loss": 0.66990322, "learning_rate": 3.055754260071004e-07, "loss": 0.691423, "num_input_tokens_seen": 148532010, "step": 6878, "time_per_iteration": 2.6668550968170166 }, { "auxiliary_loss_clip": 0.0111807, "auxiliary_loss_mlp": 0.01034975, "balance_loss_clip": 1.04111958, "balance_loss_mlp": 1.0206995, "epoch": 0.8271508447063067, "flos": 25226599812480.0, "grad_norm": 2.8902020761806115, "language_loss": 0.73273396, "learning_rate": 3.051617223481948e-07, "loss": 0.75426441, "num_input_tokens_seen": 148553330, "step": 6879, "time_per_iteration": 2.625582218170166 }, { "auxiliary_loss_clip": 0.01105859, "auxiliary_loss_mlp": 0.01049747, "balance_loss_clip": 1.04108953, "balance_loss_mlp": 1.0305903, "epoch": 0.8272710875969458, "flos": 17566602900480.0, "grad_norm": 2.0440178624901457, "language_loss": 0.75145984, "learning_rate": 3.047482757949078e-07, "loss": 0.77301586, "num_input_tokens_seen": 148570960, "step": 6880, "time_per_iteration": 2.6208581924438477 }, { "auxiliary_loss_clip": 0.01090759, "auxiliary_loss_mlp": 0.00771139, "balance_loss_clip": 1.03847289, "balance_loss_mlp": 1.00047541, "epoch": 0.827391330487585, "flos": 19755465886080.0, "grad_norm": 1.9520061556444772, "language_loss": 0.85767829, "learning_rate": 3.043350864099605e-07, "loss": 0.87629735, "num_input_tokens_seen": 148589520, "step": 6881, "time_per_iteration": 2.6573901176452637 }, { "auxiliary_loss_clip": 0.01121483, "auxiliary_loss_mlp": 0.01031811, "balance_loss_clip": 1.04105043, "balance_loss_mlp": 1.01676702, "epoch": 0.827511573378224, "flos": 16835174254080.0, "grad_norm": 2.1854648481365215, "language_loss": 0.80691981, "learning_rate": 3.039221542560315e-07, "loss": 0.82845277, "num_input_tokens_seen": 148606085, "step": 6882, "time_per_iteration": 2.5346078872680664 }, { "auxiliary_loss_clip": 0.01116895, "auxiliary_loss_mlp": 0.01035933, "balance_loss_clip": 1.04095864, "balance_loss_mlp": 1.02013767, "epoch": 0.8276318162688631, "flos": 18369242259840.0, "grad_norm": 2.20073232210675, "language_loss": 0.73623097, "learning_rate": 3.0350947939576356e-07, "loss": 0.75775933, "num_input_tokens_seen": 148625240, "step": 6883, "time_per_iteration": 2.6017651557922363 }, { "auxiliary_loss_clip": 0.01124813, "auxiliary_loss_mlp": 0.01043354, "balance_loss_clip": 1.04280269, "balance_loss_mlp": 1.02779746, "epoch": 0.8277520591595022, "flos": 19352691705600.0, "grad_norm": 2.491622461786667, "language_loss": 0.72178745, "learning_rate": 3.0309706189175876e-07, "loss": 0.74346912, "num_input_tokens_seen": 148645075, "step": 6884, "time_per_iteration": 2.5595128536224365 }, { "auxiliary_loss_clip": 0.01017575, "auxiliary_loss_mlp": 0.01000793, "balance_loss_clip": 1.0081706, "balance_loss_mlp": 0.99933243, "epoch": 0.8278723020501413, "flos": 67918858329600.0, "grad_norm": 0.8162855665546345, "language_loss": 0.57328981, "learning_rate": 3.0268490180658045e-07, "loss": 0.59347349, "num_input_tokens_seen": 148707855, "step": 6885, "time_per_iteration": 3.2134711742401123 }, { "auxiliary_loss_clip": 0.0113396, "auxiliary_loss_mlp": 0.01036906, "balance_loss_clip": 1.04370189, "balance_loss_mlp": 1.02111042, "epoch": 0.8279925449407803, "flos": 18185738653440.0, "grad_norm": 2.2583352035531488, "language_loss": 0.79613549, "learning_rate": 3.0227299920275305e-07, "loss": 0.81784409, "num_input_tokens_seen": 148724170, "step": 6886, "time_per_iteration": 2.51492977142334 }, { "auxiliary_loss_clip": 0.01108464, "auxiliary_loss_mlp": 0.01044159, "balance_loss_clip": 1.04526281, "balance_loss_mlp": 1.02656388, "epoch": 0.8281127878314195, "flos": 20631434860800.0, "grad_norm": 2.3168675886122587, "language_loss": 0.85923105, "learning_rate": 3.018613541427613e-07, "loss": 0.88075727, "num_input_tokens_seen": 148743690, "step": 6887, "time_per_iteration": 2.628676652908325 }, { "auxiliary_loss_clip": 0.01131408, "auxiliary_loss_mlp": 0.01035937, "balance_loss_clip": 1.04340768, "balance_loss_mlp": 1.02087438, "epoch": 0.8282330307220586, "flos": 18004282122240.0, "grad_norm": 1.8124356870874718, "language_loss": 0.73861623, "learning_rate": 3.0144996668905243e-07, "loss": 0.76028967, "num_input_tokens_seen": 148761070, "step": 6888, "time_per_iteration": 2.532625913619995 }, { "auxiliary_loss_clip": 0.01074083, "auxiliary_loss_mlp": 0.00772506, "balance_loss_clip": 1.03716326, "balance_loss_mlp": 1.0004462, "epoch": 0.8283532736126976, "flos": 20084120352000.0, "grad_norm": 2.176961595350253, "language_loss": 0.8213948, "learning_rate": 3.010388369040331e-07, "loss": 0.83986074, "num_input_tokens_seen": 148779730, "step": 6889, "time_per_iteration": 2.7108848094940186 }, { "auxiliary_loss_clip": 0.01120431, "auxiliary_loss_mlp": 0.01040675, "balance_loss_clip": 1.04269195, "balance_loss_mlp": 1.02610815, "epoch": 0.8284735165033368, "flos": 31868421805440.0, "grad_norm": 2.6198873182328297, "language_loss": 0.82944304, "learning_rate": 3.0062796485007156e-07, "loss": 0.85105413, "num_input_tokens_seen": 148800670, "step": 6890, "time_per_iteration": 2.657606363296509 }, { "auxiliary_loss_clip": 0.01132873, "auxiliary_loss_mlp": 0.00772466, "balance_loss_clip": 1.04298878, "balance_loss_mlp": 1.00044, "epoch": 0.8285937593939758, "flos": 26651319840000.0, "grad_norm": 2.772072010649474, "language_loss": 0.65398961, "learning_rate": 3.002173505894965e-07, "loss": 0.67304301, "num_input_tokens_seen": 148819820, "step": 6891, "time_per_iteration": 2.589310646057129 }, { "auxiliary_loss_clip": 0.01122926, "auxiliary_loss_mlp": 0.01044882, "balance_loss_clip": 1.04150748, "balance_loss_mlp": 1.02787042, "epoch": 0.8287140022846149, "flos": 20193683811840.0, "grad_norm": 2.338786901770484, "language_loss": 0.62371498, "learning_rate": 2.998069941845973e-07, "loss": 0.64539307, "num_input_tokens_seen": 148838890, "step": 6892, "time_per_iteration": 2.577353000640869 }, { "auxiliary_loss_clip": 0.01034671, "auxiliary_loss_mlp": 0.01001457, "balance_loss_clip": 1.00688457, "balance_loss_mlp": 0.9999429, "epoch": 0.8288342451752541, "flos": 70755980019840.0, "grad_norm": 0.709744316935673, "language_loss": 0.57426465, "learning_rate": 2.993968956976258e-07, "loss": 0.59462595, "num_input_tokens_seen": 148906635, "step": 6893, "time_per_iteration": 3.2838265895843506 }, { "auxiliary_loss_clip": 0.01138734, "auxiliary_loss_mlp": 0.01037866, "balance_loss_clip": 1.04415298, "balance_loss_mlp": 1.02129579, "epoch": 0.8289544880658931, "flos": 24572235795840.0, "grad_norm": 2.569787970733151, "language_loss": 0.70253521, "learning_rate": 2.9898705519079313e-07, "loss": 0.72430122, "num_input_tokens_seen": 148925740, "step": 6894, "time_per_iteration": 2.6233842372894287 }, { "auxiliary_loss_clip": 0.01103215, "auxiliary_loss_mlp": 0.01039843, "balance_loss_clip": 1.04140949, "balance_loss_mlp": 1.02290297, "epoch": 0.8290747309565322, "flos": 22273378387200.0, "grad_norm": 1.8734338377472275, "language_loss": 0.75054681, "learning_rate": 2.985774727262715e-07, "loss": 0.77197737, "num_input_tokens_seen": 148944585, "step": 6895, "time_per_iteration": 4.520265817642212 }, { "auxiliary_loss_clip": 0.01129945, "auxiliary_loss_mlp": 0.01030661, "balance_loss_clip": 1.04357839, "balance_loss_mlp": 1.01624823, "epoch": 0.8291949738471713, "flos": 23255570856960.0, "grad_norm": 1.9310474493345693, "language_loss": 0.81498235, "learning_rate": 2.981681483661949e-07, "loss": 0.83658838, "num_input_tokens_seen": 148964170, "step": 6896, "time_per_iteration": 2.5745770931243896 }, { "auxiliary_loss_clip": 0.01121463, "auxiliary_loss_mlp": 0.01042473, "balance_loss_clip": 1.04262161, "balance_loss_mlp": 1.02823973, "epoch": 0.8293152167378104, "flos": 52555768185600.0, "grad_norm": 1.7016186568156644, "language_loss": 0.70911515, "learning_rate": 2.9775908217265633e-07, "loss": 0.73075449, "num_input_tokens_seen": 148989405, "step": 6897, "time_per_iteration": 2.857151508331299 }, { "auxiliary_loss_clip": 0.00989893, "auxiliary_loss_mlp": 0.01000577, "balance_loss_clip": 1.01167583, "balance_loss_mlp": 0.99895573, "epoch": 0.8294354596284494, "flos": 63356156294400.0, "grad_norm": 0.8372238059030559, "language_loss": 0.5035392, "learning_rate": 2.9735027420771253e-07, "loss": 0.52344394, "num_input_tokens_seen": 149049740, "step": 6898, "time_per_iteration": 3.3031742572784424 }, { "auxiliary_loss_clip": 0.0110048, "auxiliary_loss_mlp": 0.01038845, "balance_loss_clip": 1.04087365, "balance_loss_mlp": 1.02458787, "epoch": 0.8295557025190886, "flos": 24827021942400.0, "grad_norm": 5.1368341143645795, "language_loss": 0.71371222, "learning_rate": 2.969417245333774e-07, "loss": 0.73510551, "num_input_tokens_seen": 149069120, "step": 6899, "time_per_iteration": 3.9553093910217285 }, { "auxiliary_loss_clip": 0.0108979, "auxiliary_loss_mlp": 0.01040121, "balance_loss_clip": 1.03901315, "balance_loss_mlp": 1.02471948, "epoch": 0.8296759454097277, "flos": 25118580637440.0, "grad_norm": 4.307107973326613, "language_loss": 0.78229725, "learning_rate": 2.9653343321162915e-07, "loss": 0.80359638, "num_input_tokens_seen": 149088630, "step": 6900, "time_per_iteration": 2.694775104522705 }, { "auxiliary_loss_clip": 0.01097387, "auxiliary_loss_mlp": 0.01037674, "balance_loss_clip": 1.04102397, "balance_loss_mlp": 1.02175975, "epoch": 0.8297961883003667, "flos": 24132581326080.0, "grad_norm": 2.136639759767452, "language_loss": 0.64749378, "learning_rate": 2.9612540030440446e-07, "loss": 0.6688444, "num_input_tokens_seen": 149109175, "step": 6901, "time_per_iteration": 2.702730178833008 }, { "auxiliary_loss_clip": 0.01017003, "auxiliary_loss_mlp": 0.01001118, "balance_loss_clip": 1.00779343, "balance_loss_mlp": 0.99969339, "epoch": 0.8299164311910058, "flos": 67446561375360.0, "grad_norm": 0.8481378614189543, "language_loss": 0.64119899, "learning_rate": 2.9571762587360206e-07, "loss": 0.66138029, "num_input_tokens_seen": 149165560, "step": 6902, "time_per_iteration": 3.1322810649871826 }, { "auxiliary_loss_clip": 0.01079972, "auxiliary_loss_mlp": 0.01036759, "balance_loss_clip": 1.03572011, "balance_loss_mlp": 1.0213933, "epoch": 0.8300366740816449, "flos": 25228682801280.0, "grad_norm": 1.7917831461540412, "language_loss": 0.73683524, "learning_rate": 2.953101099810806e-07, "loss": 0.75800252, "num_input_tokens_seen": 149185165, "step": 6903, "time_per_iteration": 3.6422300338745117 }, { "auxiliary_loss_clip": 0.01112883, "auxiliary_loss_mlp": 0.01036759, "balance_loss_clip": 1.0415107, "balance_loss_mlp": 1.02170277, "epoch": 0.830156916972284, "flos": 18041018757120.0, "grad_norm": 2.066489057187632, "language_loss": 0.82495105, "learning_rate": 2.9490285268865965e-07, "loss": 0.84644747, "num_input_tokens_seen": 149202655, "step": 6904, "time_per_iteration": 2.5890591144561768 }, { "auxiliary_loss_clip": 0.01125467, "auxiliary_loss_mlp": 0.01041899, "balance_loss_clip": 1.04329181, "balance_loss_mlp": 1.02589536, "epoch": 0.830277159862923, "flos": 26322485806080.0, "grad_norm": 2.6173831685305697, "language_loss": 0.79449284, "learning_rate": 2.9449585405812085e-07, "loss": 0.81616646, "num_input_tokens_seen": 149220035, "step": 6905, "time_per_iteration": 2.7151780128479004 }, { "auxiliary_loss_clip": 0.01100352, "auxiliary_loss_mlp": 0.01039376, "balance_loss_clip": 1.04029906, "balance_loss_mlp": 1.02386642, "epoch": 0.8303974027535622, "flos": 19938861751680.0, "grad_norm": 1.8000924806327105, "language_loss": 0.73892343, "learning_rate": 2.940891141512043e-07, "loss": 0.76032072, "num_input_tokens_seen": 149238055, "step": 6906, "time_per_iteration": 2.611550807952881 }, { "auxiliary_loss_clip": 0.01106778, "auxiliary_loss_mlp": 0.01048269, "balance_loss_clip": 1.04058635, "balance_loss_mlp": 1.03056633, "epoch": 0.8305176456442013, "flos": 17165552572800.0, "grad_norm": 2.0713447475959335, "language_loss": 0.72151232, "learning_rate": 2.9368263302961385e-07, "loss": 0.74306273, "num_input_tokens_seen": 149256755, "step": 6907, "time_per_iteration": 2.5816762447357178 }, { "auxiliary_loss_clip": 0.01071244, "auxiliary_loss_mlp": 0.01038582, "balance_loss_clip": 1.03382564, "balance_loss_mlp": 1.0217731, "epoch": 0.8306378885348403, "flos": 25627614226560.0, "grad_norm": 2.1451373556730275, "language_loss": 0.79489726, "learning_rate": 2.9327641075501075e-07, "loss": 0.81599557, "num_input_tokens_seen": 149275745, "step": 6908, "time_per_iteration": 2.7671210765838623 }, { "auxiliary_loss_clip": 0.01100716, "auxiliary_loss_mlp": 0.01045756, "balance_loss_clip": 1.03718293, "balance_loss_mlp": 1.02886367, "epoch": 0.8307581314254795, "flos": 33947864985600.0, "grad_norm": 2.668644936064596, "language_loss": 0.66881317, "learning_rate": 2.9287044738901866e-07, "loss": 0.69027793, "num_input_tokens_seen": 149293730, "step": 6909, "time_per_iteration": 2.7942724227905273 }, { "auxiliary_loss_clip": 0.01121214, "auxiliary_loss_mlp": 0.00772312, "balance_loss_clip": 1.04133081, "balance_loss_mlp": 1.00055289, "epoch": 0.8308783743161186, "flos": 17562724231680.0, "grad_norm": 2.498262319933711, "language_loss": 0.9097743, "learning_rate": 2.9246474299322274e-07, "loss": 0.92870957, "num_input_tokens_seen": 149309290, "step": 6910, "time_per_iteration": 2.5782060623168945 }, { "auxiliary_loss_clip": 0.01004505, "auxiliary_loss_mlp": 0.01005317, "balance_loss_clip": 1.00701499, "balance_loss_mlp": 1.00389218, "epoch": 0.8309986172067576, "flos": 69412885649280.0, "grad_norm": 0.8907319102438216, "language_loss": 0.63152617, "learning_rate": 2.920592976291678e-07, "loss": 0.65162438, "num_input_tokens_seen": 149366620, "step": 6911, "time_per_iteration": 3.1458821296691895 }, { "auxiliary_loss_clip": 0.01118591, "auxiliary_loss_mlp": 0.01041793, "balance_loss_clip": 1.04074621, "balance_loss_mlp": 1.02617657, "epoch": 0.8311188600973968, "flos": 22309755886080.0, "grad_norm": 1.985393186734, "language_loss": 0.80895686, "learning_rate": 2.916541113583595e-07, "loss": 0.83056074, "num_input_tokens_seen": 149385120, "step": 6912, "time_per_iteration": 2.581996202468872 }, { "auxiliary_loss_clip": 0.01097763, "auxiliary_loss_mlp": 0.0103491, "balance_loss_clip": 1.0408175, "balance_loss_mlp": 1.01844764, "epoch": 0.8312391029880358, "flos": 18770077105920.0, "grad_norm": 2.8657050502177497, "language_loss": 0.66234338, "learning_rate": 2.912491842422642e-07, "loss": 0.68367016, "num_input_tokens_seen": 149402825, "step": 6913, "time_per_iteration": 2.624840021133423 }, { "auxiliary_loss_clip": 0.01120705, "auxiliary_loss_mlp": 0.01033777, "balance_loss_clip": 1.04213023, "balance_loss_mlp": 1.01869726, "epoch": 0.8313593458786749, "flos": 20376648714240.0, "grad_norm": 3.14617369136593, "language_loss": 0.71292078, "learning_rate": 2.9084451634230857e-07, "loss": 0.7344656, "num_input_tokens_seen": 149422125, "step": 6914, "time_per_iteration": 2.5975661277770996 }, { "auxiliary_loss_clip": 0.01094961, "auxiliary_loss_mlp": 0.01040215, "balance_loss_clip": 1.03700006, "balance_loss_mlp": 1.0251056, "epoch": 0.831479588769314, "flos": 32124069878400.0, "grad_norm": 2.2636871314285583, "language_loss": 0.70807153, "learning_rate": 2.9044010771988125e-07, "loss": 0.72942328, "num_input_tokens_seen": 149441940, "step": 6915, "time_per_iteration": 2.702092170715332 }, { "auxiliary_loss_clip": 0.01100254, "auxiliary_loss_mlp": 0.01042387, "balance_loss_clip": 1.03861475, "balance_loss_mlp": 1.02623439, "epoch": 0.8315998316599531, "flos": 45185929338240.0, "grad_norm": 3.1372241864599033, "language_loss": 0.72013927, "learning_rate": 2.900359584363303e-07, "loss": 0.7415657, "num_input_tokens_seen": 149465045, "step": 6916, "time_per_iteration": 2.8165130615234375 }, { "auxiliary_loss_clip": 0.01079842, "auxiliary_loss_mlp": 0.01059232, "balance_loss_clip": 1.03868723, "balance_loss_mlp": 1.04092121, "epoch": 0.8317200745505922, "flos": 18363747479040.0, "grad_norm": 2.2044271558230233, "language_loss": 0.84213167, "learning_rate": 2.8963206855296494e-07, "loss": 0.86352241, "num_input_tokens_seen": 149481285, "step": 6917, "time_per_iteration": 2.6177453994750977 }, { "auxiliary_loss_clip": 0.01123687, "auxiliary_loss_mlp": 0.01038148, "balance_loss_clip": 1.04398477, "balance_loss_mlp": 1.02274036, "epoch": 0.8318403174412313, "flos": 24206557386240.0, "grad_norm": 1.716974825673288, "language_loss": 0.76988542, "learning_rate": 2.892284381310548e-07, "loss": 0.79150379, "num_input_tokens_seen": 149502700, "step": 6918, "time_per_iteration": 2.6036696434020996 }, { "auxiliary_loss_clip": 0.01103756, "auxiliary_loss_mlp": 0.01034401, "balance_loss_clip": 1.03885448, "balance_loss_mlp": 1.01856399, "epoch": 0.8319605603318704, "flos": 22418780641920.0, "grad_norm": 2.621203589672019, "language_loss": 0.72005641, "learning_rate": 2.888250672318302e-07, "loss": 0.74143803, "num_input_tokens_seen": 149520100, "step": 6919, "time_per_iteration": 2.681666851043701 }, { "auxiliary_loss_clip": 0.01135764, "auxiliary_loss_mlp": 0.01046349, "balance_loss_clip": 1.04592776, "balance_loss_mlp": 1.03149545, "epoch": 0.8320808032225094, "flos": 37414501459200.0, "grad_norm": 1.8278478339640065, "language_loss": 0.68549663, "learning_rate": 2.884219559164831e-07, "loss": 0.70731771, "num_input_tokens_seen": 149543245, "step": 6920, "time_per_iteration": 2.693718433380127 }, { "auxiliary_loss_clip": 0.01119827, "auxiliary_loss_mlp": 0.01042495, "balance_loss_clip": 1.04166067, "balance_loss_mlp": 1.02625823, "epoch": 0.8322010461131486, "flos": 12787395638400.0, "grad_norm": 2.3209849440604455, "language_loss": 0.81245768, "learning_rate": 2.880191042461635e-07, "loss": 0.83408093, "num_input_tokens_seen": 149559185, "step": 6921, "time_per_iteration": 4.501172304153442 }, { "auxiliary_loss_clip": 0.0109023, "auxiliary_loss_mlp": 0.0103226, "balance_loss_clip": 1.04094803, "balance_loss_mlp": 1.01786566, "epoch": 0.8323212890037877, "flos": 15815455050240.0, "grad_norm": 1.6571851420572585, "language_loss": 0.80142474, "learning_rate": 2.876165122819849e-07, "loss": 0.8226496, "num_input_tokens_seen": 149577165, "step": 6922, "time_per_iteration": 2.67405366897583 }, { "auxiliary_loss_clip": 0.01129756, "auxiliary_loss_mlp": 0.01038371, "balance_loss_clip": 1.04212868, "balance_loss_mlp": 1.02243233, "epoch": 0.8324415318944267, "flos": 21719276208000.0, "grad_norm": 1.9747777693072206, "language_loss": 0.79384202, "learning_rate": 2.872141800850201e-07, "loss": 0.81552327, "num_input_tokens_seen": 149594340, "step": 6923, "time_per_iteration": 2.5540273189544678 }, { "auxiliary_loss_clip": 0.01129915, "auxiliary_loss_mlp": 0.01039164, "balance_loss_clip": 1.04201603, "balance_loss_mlp": 1.02457881, "epoch": 0.8325617747850659, "flos": 34198700636160.0, "grad_norm": 2.3970713227805778, "language_loss": 0.73215902, "learning_rate": 2.868121077163024e-07, "loss": 0.75384986, "num_input_tokens_seen": 149613895, "step": 6924, "time_per_iteration": 2.783729076385498 }, { "auxiliary_loss_clip": 0.011238, "auxiliary_loss_mlp": 0.01046794, "balance_loss_clip": 1.04217649, "balance_loss_mlp": 1.03106976, "epoch": 0.8326820176757049, "flos": 18369457741440.0, "grad_norm": 1.9828908127111384, "language_loss": 0.72632647, "learning_rate": 2.864102952368257e-07, "loss": 0.74803245, "num_input_tokens_seen": 149631820, "step": 6925, "time_per_iteration": 3.495678663253784 }, { "auxiliary_loss_clip": 0.0107341, "auxiliary_loss_mlp": 0.01042105, "balance_loss_clip": 1.03505576, "balance_loss_mlp": 1.02476025, "epoch": 0.832802260566344, "flos": 35991325716480.0, "grad_norm": 9.108188189852097, "language_loss": 0.59363717, "learning_rate": 2.860087427075444e-07, "loss": 0.61479229, "num_input_tokens_seen": 149656070, "step": 6926, "time_per_iteration": 2.799710750579834 }, { "auxiliary_loss_clip": 0.01101145, "auxiliary_loss_mlp": 0.01039425, "balance_loss_clip": 1.040488, "balance_loss_mlp": 1.02520299, "epoch": 0.8329225034569832, "flos": 14244434928000.0, "grad_norm": 2.43547878977336, "language_loss": 0.86009049, "learning_rate": 2.856074501893744e-07, "loss": 0.88149619, "num_input_tokens_seen": 149671270, "step": 6927, "time_per_iteration": 2.5775067806243896 }, { "auxiliary_loss_clip": 0.01125248, "auxiliary_loss_mlp": 0.01049747, "balance_loss_clip": 1.04415011, "balance_loss_mlp": 1.03253305, "epoch": 0.8330427463476222, "flos": 18077468083200.0, "grad_norm": 1.6816312590299065, "language_loss": 0.8165887, "learning_rate": 2.8520641774319054e-07, "loss": 0.83833861, "num_input_tokens_seen": 149689360, "step": 6928, "time_per_iteration": 2.5690033435821533 }, { "auxiliary_loss_clip": 0.01107661, "auxiliary_loss_mlp": 0.01039063, "balance_loss_clip": 1.03674972, "balance_loss_mlp": 1.02250445, "epoch": 0.8331629892382613, "flos": 18040839189120.0, "grad_norm": 2.187136152797467, "language_loss": 0.75614536, "learning_rate": 2.848056454298309e-07, "loss": 0.77761263, "num_input_tokens_seen": 149706685, "step": 6929, "time_per_iteration": 3.508729934692383 }, { "auxiliary_loss_clip": 0.01109793, "auxiliary_loss_mlp": 0.01038848, "balance_loss_clip": 1.04157758, "balance_loss_mlp": 1.02303529, "epoch": 0.8332832321289004, "flos": 17457398576640.0, "grad_norm": 2.6772108896908184, "language_loss": 0.65432519, "learning_rate": 2.844051333100905e-07, "loss": 0.67581159, "num_input_tokens_seen": 149724230, "step": 6930, "time_per_iteration": 2.6936237812042236 }, { "auxiliary_loss_clip": 0.01107003, "auxiliary_loss_mlp": 0.01030571, "balance_loss_clip": 1.04380655, "balance_loss_mlp": 1.01658201, "epoch": 0.8334034750195395, "flos": 15084852416640.0, "grad_norm": 1.8489239387029728, "language_loss": 0.83816099, "learning_rate": 2.840048814447269e-07, "loss": 0.85953677, "num_input_tokens_seen": 149742395, "step": 6931, "time_per_iteration": 2.5835530757904053 }, { "auxiliary_loss_clip": 0.01100548, "auxiliary_loss_mlp": 0.01054263, "balance_loss_clip": 1.03855693, "balance_loss_mlp": 1.0361315, "epoch": 0.8335237179101785, "flos": 19427170556160.0, "grad_norm": 2.44159450095801, "language_loss": 0.73619074, "learning_rate": 2.836048898944587e-07, "loss": 0.75773883, "num_input_tokens_seen": 149760820, "step": 6932, "time_per_iteration": 2.597675323486328 }, { "auxiliary_loss_clip": 0.01103974, "auxiliary_loss_mlp": 0.01047093, "balance_loss_clip": 1.03938055, "balance_loss_mlp": 1.03238225, "epoch": 0.8336439608008177, "flos": 21762046327680.0, "grad_norm": 3.9479949025164833, "language_loss": 0.72843927, "learning_rate": 2.832051587199642e-07, "loss": 0.74994993, "num_input_tokens_seen": 149778075, "step": 6933, "time_per_iteration": 2.5917842388153076 }, { "auxiliary_loss_clip": 0.01028654, "auxiliary_loss_mlp": 0.01000869, "balance_loss_clip": 1.00977504, "balance_loss_mlp": 0.99920028, "epoch": 0.8337642036914568, "flos": 59702783990400.0, "grad_norm": 0.8040735524420392, "language_loss": 0.57682776, "learning_rate": 2.828056879818821e-07, "loss": 0.59712303, "num_input_tokens_seen": 149837150, "step": 6934, "time_per_iteration": 3.1037023067474365 }, { "auxiliary_loss_clip": 0.010911, "auxiliary_loss_mlp": 0.01037997, "balance_loss_clip": 1.03579164, "balance_loss_mlp": 1.02208257, "epoch": 0.8338844465820958, "flos": 27162185022720.0, "grad_norm": 2.5383984786356337, "language_loss": 0.83689451, "learning_rate": 2.824064777408117e-07, "loss": 0.85818547, "num_input_tokens_seen": 149856940, "step": 6935, "time_per_iteration": 2.694138765335083 }, { "auxiliary_loss_clip": 0.01119944, "auxiliary_loss_mlp": 0.01045564, "balance_loss_clip": 1.04273009, "balance_loss_mlp": 1.0294708, "epoch": 0.8340046894727349, "flos": 30481264425600.0, "grad_norm": 2.0450801046195286, "language_loss": 0.75741673, "learning_rate": 2.8200752805731263e-07, "loss": 0.77907181, "num_input_tokens_seen": 149879930, "step": 6936, "time_per_iteration": 2.6572675704956055 }, { "auxiliary_loss_clip": 0.01120478, "auxiliary_loss_mlp": 0.01038296, "balance_loss_clip": 1.04222083, "balance_loss_mlp": 1.02277517, "epoch": 0.834124932363374, "flos": 27126166659840.0, "grad_norm": 1.5539976103768303, "language_loss": 0.81271249, "learning_rate": 2.8160883899190625e-07, "loss": 0.83430022, "num_input_tokens_seen": 149903200, "step": 6937, "time_per_iteration": 2.648481607437134 }, { "auxiliary_loss_clip": 0.01087448, "auxiliary_loss_mlp": 0.01038993, "balance_loss_clip": 1.03907228, "balance_loss_mlp": 1.02309084, "epoch": 0.8342451752540131, "flos": 24569865498240.0, "grad_norm": 8.455000637335676, "language_loss": 0.73190033, "learning_rate": 2.8121041060507234e-07, "loss": 0.75316477, "num_input_tokens_seen": 149922230, "step": 6938, "time_per_iteration": 2.6685030460357666 }, { "auxiliary_loss_clip": 0.01124693, "auxiliary_loss_mlp": 0.01037978, "balance_loss_clip": 1.04223716, "balance_loss_mlp": 1.02230191, "epoch": 0.8343654181446521, "flos": 26615085995520.0, "grad_norm": 1.8180328615988224, "language_loss": 0.7111361, "learning_rate": 2.808122429572528e-07, "loss": 0.73276281, "num_input_tokens_seen": 149942435, "step": 6939, "time_per_iteration": 2.6211118698120117 }, { "auxiliary_loss_clip": 0.0110428, "auxiliary_loss_mlp": 0.01036838, "balance_loss_clip": 1.04085207, "balance_loss_mlp": 1.01955247, "epoch": 0.8344856610352913, "flos": 20777268078720.0, "grad_norm": 3.803401365124802, "language_loss": 0.75880927, "learning_rate": 2.804143361088489e-07, "loss": 0.78022051, "num_input_tokens_seen": 149961615, "step": 6940, "time_per_iteration": 2.6399874687194824 }, { "auxiliary_loss_clip": 0.0110373, "auxiliary_loss_mlp": 0.01043997, "balance_loss_clip": 1.03945315, "balance_loss_mlp": 1.02766514, "epoch": 0.8346059039259304, "flos": 26095960684800.0, "grad_norm": 2.377836359487561, "language_loss": 0.77913785, "learning_rate": 2.8001669012022277e-07, "loss": 0.80061513, "num_input_tokens_seen": 149979585, "step": 6941, "time_per_iteration": 2.6471805572509766 }, { "auxiliary_loss_clip": 0.0112128, "auxiliary_loss_mlp": 0.01039765, "balance_loss_clip": 1.04378414, "balance_loss_mlp": 1.02436316, "epoch": 0.8347261468165694, "flos": 29027708755200.0, "grad_norm": 3.382352761388624, "language_loss": 0.69355249, "learning_rate": 2.7961930505169795e-07, "loss": 0.71516299, "num_input_tokens_seen": 150003830, "step": 6942, "time_per_iteration": 2.641244888305664 }, { "auxiliary_loss_clip": 0.011228, "auxiliary_loss_mlp": 0.00772728, "balance_loss_clip": 1.04278803, "balance_loss_mlp": 1.00048089, "epoch": 0.8348463897072086, "flos": 26396461866240.0, "grad_norm": 2.163803879100418, "language_loss": 0.76496738, "learning_rate": 2.792221809635558e-07, "loss": 0.78392267, "num_input_tokens_seen": 150024460, "step": 6943, "time_per_iteration": 2.6166279315948486 }, { "auxiliary_loss_clip": 0.01065126, "auxiliary_loss_mlp": 0.01041358, "balance_loss_clip": 1.0384798, "balance_loss_mlp": 1.02465689, "epoch": 0.8349666325978476, "flos": 23367720096000.0, "grad_norm": 2.1864384012386595, "language_loss": 0.74782133, "learning_rate": 2.788253179160411e-07, "loss": 0.76888621, "num_input_tokens_seen": 150045620, "step": 6944, "time_per_iteration": 2.7622852325439453 }, { "auxiliary_loss_clip": 0.01108655, "auxiliary_loss_mlp": 0.01039284, "balance_loss_clip": 1.04289198, "balance_loss_mlp": 1.02429366, "epoch": 0.8350868754884867, "flos": 12896528135040.0, "grad_norm": 1.8767414986502393, "language_loss": 0.64759755, "learning_rate": 2.7842871596935725e-07, "loss": 0.66907698, "num_input_tokens_seen": 150064135, "step": 6945, "time_per_iteration": 2.5984504222869873 }, { "auxiliary_loss_clip": 0.01120652, "auxiliary_loss_mlp": 0.01033435, "balance_loss_clip": 1.04002452, "balance_loss_mlp": 1.01825964, "epoch": 0.8352071183791259, "flos": 26505522535680.0, "grad_norm": 1.9946418942593802, "language_loss": 0.69172418, "learning_rate": 2.780323751836682e-07, "loss": 0.71326512, "num_input_tokens_seen": 150085350, "step": 6946, "time_per_iteration": 2.617356061935425 }, { "auxiliary_loss_clip": 0.01105937, "auxiliary_loss_mlp": 0.00771701, "balance_loss_clip": 1.03919077, "balance_loss_mlp": 1.00046647, "epoch": 0.8353273612697649, "flos": 20668063754880.0, "grad_norm": 1.5933088684780377, "language_loss": 0.78727931, "learning_rate": 2.7763629561909876e-07, "loss": 0.80605572, "num_input_tokens_seen": 150106180, "step": 6947, "time_per_iteration": 4.676600933074951 }, { "auxiliary_loss_clip": 0.01128992, "auxiliary_loss_mlp": 0.01037859, "balance_loss_clip": 1.04115129, "balance_loss_mlp": 1.02174199, "epoch": 0.835447604160404, "flos": 19754137082880.0, "grad_norm": 2.52276745620538, "language_loss": 0.77073538, "learning_rate": 2.772404773357335e-07, "loss": 0.79240388, "num_input_tokens_seen": 150125585, "step": 6948, "time_per_iteration": 2.5662055015563965 }, { "auxiliary_loss_clip": 0.01090023, "auxiliary_loss_mlp": 0.01040865, "balance_loss_clip": 1.03823018, "balance_loss_mlp": 1.02478361, "epoch": 0.8355678470510431, "flos": 23435842239360.0, "grad_norm": 2.3527953030692084, "language_loss": 0.78167641, "learning_rate": 2.7684492039361853e-07, "loss": 0.80298531, "num_input_tokens_seen": 150144810, "step": 6949, "time_per_iteration": 2.656048059463501 }, { "auxiliary_loss_clip": 0.01135445, "auxiliary_loss_mlp": 0.01036161, "balance_loss_clip": 1.0450393, "balance_loss_mlp": 1.02091408, "epoch": 0.8356880899416822, "flos": 21214588164480.0, "grad_norm": 1.9587955183673345, "language_loss": 0.83836114, "learning_rate": 2.764496248527586e-07, "loss": 0.8600772, "num_input_tokens_seen": 150163785, "step": 6950, "time_per_iteration": 2.576303720474243 }, { "auxiliary_loss_clip": 0.01099487, "auxiliary_loss_mlp": 0.01039017, "balance_loss_clip": 1.03785276, "balance_loss_mlp": 1.02320981, "epoch": 0.8358083328323213, "flos": 28037543466240.0, "grad_norm": 1.9032615609696426, "language_loss": 0.78688133, "learning_rate": 2.760545907731211e-07, "loss": 0.8082664, "num_input_tokens_seen": 150184360, "step": 6951, "time_per_iteration": 2.7257888317108154 }, { "auxiliary_loss_clip": 0.01119633, "auxiliary_loss_mlp": 0.01041984, "balance_loss_clip": 1.03985131, "balance_loss_mlp": 1.02493715, "epoch": 0.8359285757229604, "flos": 27783655159680.0, "grad_norm": 1.811851718094184, "language_loss": 0.67776382, "learning_rate": 2.75659818214631e-07, "loss": 0.69938004, "num_input_tokens_seen": 150205465, "step": 6952, "time_per_iteration": 3.5652763843536377 }, { "auxiliary_loss_clip": 0.01111015, "auxiliary_loss_mlp": 0.01036345, "balance_loss_clip": 1.04251087, "balance_loss_mlp": 1.0213604, "epoch": 0.8360488186135995, "flos": 21435115714560.0, "grad_norm": 1.7809635931507712, "language_loss": 0.78016728, "learning_rate": 2.752653072371749e-07, "loss": 0.80164087, "num_input_tokens_seen": 150224900, "step": 6953, "time_per_iteration": 2.643456220626831 }, { "auxiliary_loss_clip": 0.01091492, "auxiliary_loss_mlp": 0.01035784, "balance_loss_clip": 1.04083097, "balance_loss_mlp": 1.02143717, "epoch": 0.8361690615042385, "flos": 27632327160960.0, "grad_norm": 1.7522165059555554, "language_loss": 0.74465203, "learning_rate": 2.7487105790060105e-07, "loss": 0.76592475, "num_input_tokens_seen": 150244310, "step": 6954, "time_per_iteration": 2.779235601425171 }, { "auxiliary_loss_clip": 0.0112217, "auxiliary_loss_mlp": 0.01035873, "balance_loss_clip": 1.04279149, "balance_loss_mlp": 1.0218662, "epoch": 0.8362893043948777, "flos": 39202529598720.0, "grad_norm": 2.8264184531836922, "language_loss": 0.69207931, "learning_rate": 2.7447707026471587e-07, "loss": 0.71365976, "num_input_tokens_seen": 150267285, "step": 6955, "time_per_iteration": 3.5481295585632324 }, { "auxiliary_loss_clip": 0.01094247, "auxiliary_loss_mlp": 0.01036409, "balance_loss_clip": 1.03892374, "balance_loss_mlp": 1.02215195, "epoch": 0.8364095472855168, "flos": 24785329230720.0, "grad_norm": 2.37548056044082, "language_loss": 0.798352, "learning_rate": 2.740833443892874e-07, "loss": 0.81965858, "num_input_tokens_seen": 150285455, "step": 6956, "time_per_iteration": 2.6372313499450684 }, { "auxiliary_loss_clip": 0.01106432, "auxiliary_loss_mlp": 0.01036655, "balance_loss_clip": 1.03936601, "balance_loss_mlp": 1.02212369, "epoch": 0.8365297901761558, "flos": 22743412784640.0, "grad_norm": 1.9498899235479135, "language_loss": 0.79913378, "learning_rate": 2.7368988033404327e-07, "loss": 0.82056463, "num_input_tokens_seen": 150302970, "step": 6957, "time_per_iteration": 2.6206893920898438 }, { "auxiliary_loss_clip": 0.01098423, "auxiliary_loss_mlp": 0.01031079, "balance_loss_clip": 1.0426265, "balance_loss_mlp": 1.01655936, "epoch": 0.836650033066795, "flos": 28396003242240.0, "grad_norm": 1.588102650041385, "language_loss": 0.84940475, "learning_rate": 2.732966781586712e-07, "loss": 0.87069976, "num_input_tokens_seen": 150322715, "step": 6958, "time_per_iteration": 2.75056529045105 }, { "auxiliary_loss_clip": 0.0111648, "auxiliary_loss_mlp": 0.01037754, "balance_loss_clip": 1.03970695, "balance_loss_mlp": 1.02197123, "epoch": 0.836770275957434, "flos": 22236857233920.0, "grad_norm": 1.5954709294791793, "language_loss": 0.6649282, "learning_rate": 2.729037379228205e-07, "loss": 0.68647051, "num_input_tokens_seen": 150342900, "step": 6959, "time_per_iteration": 2.9296975135803223 }, { "auxiliary_loss_clip": 0.01108655, "auxiliary_loss_mlp": 0.01040862, "balance_loss_clip": 1.04094875, "balance_loss_mlp": 1.0256989, "epoch": 0.8368905188480731, "flos": 22491930689280.0, "grad_norm": 1.834832499394778, "language_loss": 0.80224717, "learning_rate": 2.725110596860998e-07, "loss": 0.82374233, "num_input_tokens_seen": 150363580, "step": 6960, "time_per_iteration": 2.6759305000305176 }, { "auxiliary_loss_clip": 0.01082722, "auxiliary_loss_mlp": 0.0104684, "balance_loss_clip": 1.04113829, "balance_loss_mlp": 1.02895844, "epoch": 0.8370107617387123, "flos": 13370405287680.0, "grad_norm": 1.9806247462833368, "language_loss": 0.70176768, "learning_rate": 2.7211864350807776e-07, "loss": 0.72306329, "num_input_tokens_seen": 150381780, "step": 6961, "time_per_iteration": 2.7631635665893555 }, { "auxiliary_loss_clip": 0.01132669, "auxiliary_loss_mlp": 0.01048922, "balance_loss_clip": 1.04318464, "balance_loss_mlp": 1.0329957, "epoch": 0.8371310046293513, "flos": 25261289372160.0, "grad_norm": 1.947569786647005, "language_loss": 0.73592842, "learning_rate": 2.717264894482836e-07, "loss": 0.75774431, "num_input_tokens_seen": 150402120, "step": 6962, "time_per_iteration": 2.6222782135009766 }, { "auxiliary_loss_clip": 0.01124661, "auxiliary_loss_mlp": 0.01039559, "balance_loss_clip": 1.04353118, "balance_loss_mlp": 1.02430046, "epoch": 0.8372512475199904, "flos": 19792705311360.0, "grad_norm": 3.120466678579458, "language_loss": 0.81301183, "learning_rate": 2.7133459756620646e-07, "loss": 0.83465397, "num_input_tokens_seen": 150419315, "step": 6963, "time_per_iteration": 2.6894326210021973 }, { "auxiliary_loss_clip": 0.01115917, "auxiliary_loss_mlp": 0.01048703, "balance_loss_clip": 1.0412457, "balance_loss_mlp": 1.03107238, "epoch": 0.8373714904106295, "flos": 19391224020480.0, "grad_norm": 2.1094594409630822, "language_loss": 0.73743933, "learning_rate": 2.7094296792129733e-07, "loss": 0.75908554, "num_input_tokens_seen": 150438915, "step": 6964, "time_per_iteration": 2.752983808517456 }, { "auxiliary_loss_clip": 0.01119287, "auxiliary_loss_mlp": 0.01035821, "balance_loss_clip": 1.04147422, "balance_loss_mlp": 1.02100348, "epoch": 0.8374917333012686, "flos": 14975935401600.0, "grad_norm": 1.7443070942668277, "language_loss": 0.75552607, "learning_rate": 2.7055160057296424e-07, "loss": 0.7770772, "num_input_tokens_seen": 150456155, "step": 6965, "time_per_iteration": 2.6527295112609863 }, { "auxiliary_loss_clip": 0.01096008, "auxiliary_loss_mlp": 0.01047009, "balance_loss_clip": 1.03916931, "balance_loss_mlp": 1.03031933, "epoch": 0.8376119761919076, "flos": 30331839847680.0, "grad_norm": 1.6527090239768338, "language_loss": 0.72356045, "learning_rate": 2.7016049558057896e-07, "loss": 0.74499059, "num_input_tokens_seen": 150478115, "step": 6966, "time_per_iteration": 2.7339866161346436 }, { "auxiliary_loss_clip": 0.01116942, "auxiliary_loss_mlp": 0.0103993, "balance_loss_clip": 1.04068542, "balance_loss_mlp": 1.02443862, "epoch": 0.8377322190825467, "flos": 29423336129280.0, "grad_norm": 1.7787925220450784, "language_loss": 0.71032619, "learning_rate": 2.6976965300347074e-07, "loss": 0.73189497, "num_input_tokens_seen": 150500725, "step": 6967, "time_per_iteration": 2.6551296710968018 }, { "auxiliary_loss_clip": 0.01102504, "auxiliary_loss_mlp": 0.01042164, "balance_loss_clip": 1.03779137, "balance_loss_mlp": 1.02653599, "epoch": 0.8378524619731859, "flos": 26687086807680.0, "grad_norm": 2.6041492311964727, "language_loss": 0.69335121, "learning_rate": 2.693790729009309e-07, "loss": 0.71479791, "num_input_tokens_seen": 150522335, "step": 6968, "time_per_iteration": 2.6695220470428467 }, { "auxiliary_loss_clip": 0.01103909, "auxiliary_loss_mlp": 0.01032512, "balance_loss_clip": 1.03867459, "balance_loss_mlp": 1.01787305, "epoch": 0.8379727048638249, "flos": 20703866636160.0, "grad_norm": 2.0660113200976506, "language_loss": 0.88434428, "learning_rate": 2.6898875533220946e-07, "loss": 0.90570843, "num_input_tokens_seen": 150541640, "step": 6969, "time_per_iteration": 2.6480023860931396 }, { "auxiliary_loss_clip": 0.01126781, "auxiliary_loss_mlp": 0.01031671, "balance_loss_clip": 1.04293215, "balance_loss_mlp": 1.0172888, "epoch": 0.838092947754464, "flos": 20084084438400.0, "grad_norm": 2.7793339621525863, "language_loss": 0.81710732, "learning_rate": 2.685987003565171e-07, "loss": 0.83869183, "num_input_tokens_seen": 150559680, "step": 6970, "time_per_iteration": 2.5843441486358643 }, { "auxiliary_loss_clip": 0.01086215, "auxiliary_loss_mlp": 0.0103469, "balance_loss_clip": 1.03951347, "balance_loss_mlp": 1.01906157, "epoch": 0.8382131906451031, "flos": 18113270964480.0, "grad_norm": 2.679979149177755, "language_loss": 0.75203091, "learning_rate": 2.6820890803302566e-07, "loss": 0.77323997, "num_input_tokens_seen": 150575205, "step": 6971, "time_per_iteration": 2.6297357082366943 }, { "auxiliary_loss_clip": 0.01102281, "auxiliary_loss_mlp": 0.01041251, "balance_loss_clip": 1.04149473, "balance_loss_mlp": 1.02565861, "epoch": 0.8383334335357422, "flos": 17092653920640.0, "grad_norm": 2.367719876426691, "language_loss": 0.81786537, "learning_rate": 2.6781937842086557e-07, "loss": 0.83930069, "num_input_tokens_seen": 150593995, "step": 6972, "time_per_iteration": 2.6103405952453613 }, { "auxiliary_loss_clip": 0.01123422, "auxiliary_loss_mlp": 0.01042033, "balance_loss_clip": 1.0434382, "balance_loss_mlp": 1.02666736, "epoch": 0.8384536764263812, "flos": 20704728562560.0, "grad_norm": 2.2775309282764726, "language_loss": 0.67307281, "learning_rate": 2.6743011157912933e-07, "loss": 0.69472742, "num_input_tokens_seen": 150613715, "step": 6973, "time_per_iteration": 4.4627439975738525 }, { "auxiliary_loss_clip": 0.01081664, "auxiliary_loss_mlp": 0.01039517, "balance_loss_clip": 1.03545141, "balance_loss_mlp": 1.02423429, "epoch": 0.8385739193170204, "flos": 28986842056320.0, "grad_norm": 3.758195465652083, "language_loss": 0.65376461, "learning_rate": 2.6704110756686725e-07, "loss": 0.67497635, "num_input_tokens_seen": 150634540, "step": 6974, "time_per_iteration": 2.7232487201690674 }, { "auxiliary_loss_clip": 0.01105109, "auxiliary_loss_mlp": 0.00772671, "balance_loss_clip": 1.03886533, "balance_loss_mlp": 1.00044584, "epoch": 0.8386941622076595, "flos": 23438068882560.0, "grad_norm": 1.6953882387164423, "language_loss": 0.83783031, "learning_rate": 2.6665236644309085e-07, "loss": 0.85660815, "num_input_tokens_seen": 150654850, "step": 6975, "time_per_iteration": 2.6776657104492188 }, { "auxiliary_loss_clip": 0.01115872, "auxiliary_loss_mlp": 0.01034054, "balance_loss_clip": 1.0400573, "balance_loss_mlp": 1.01884258, "epoch": 0.8388144050982985, "flos": 23002724044800.0, "grad_norm": 2.7073394634500865, "language_loss": 0.79477096, "learning_rate": 2.662638882667727e-07, "loss": 0.81627023, "num_input_tokens_seen": 150673790, "step": 6976, "time_per_iteration": 2.6245181560516357 }, { "auxiliary_loss_clip": 0.01136089, "auxiliary_loss_mlp": 0.01035563, "balance_loss_clip": 1.04402483, "balance_loss_mlp": 1.01910067, "epoch": 0.8389346479889377, "flos": 24280353878400.0, "grad_norm": 2.0617465897625458, "language_loss": 0.73042142, "learning_rate": 2.658756730968443e-07, "loss": 0.75213796, "num_input_tokens_seen": 150692255, "step": 6977, "time_per_iteration": 3.593930721282959 }, { "auxiliary_loss_clip": 0.01111879, "auxiliary_loss_mlp": 0.01037045, "balance_loss_clip": 1.04300761, "balance_loss_mlp": 1.02169085, "epoch": 0.8390548908795767, "flos": 21215019127680.0, "grad_norm": 2.1515286957927917, "language_loss": 0.88563842, "learning_rate": 2.654877209921975e-07, "loss": 0.90712762, "num_input_tokens_seen": 150709790, "step": 6978, "time_per_iteration": 2.618345260620117 }, { "auxiliary_loss_clip": 0.01091808, "auxiliary_loss_mlp": 0.01049207, "balance_loss_clip": 1.0380336, "balance_loss_mlp": 1.03074169, "epoch": 0.8391751337702158, "flos": 35627299332480.0, "grad_norm": 2.576660171299506, "language_loss": 0.63040888, "learning_rate": 2.651000320116843e-07, "loss": 0.65181905, "num_input_tokens_seen": 150730675, "step": 6979, "time_per_iteration": 2.7949745655059814 }, { "auxiliary_loss_clip": 0.01092219, "auxiliary_loss_mlp": 0.00774371, "balance_loss_clip": 1.03932989, "balance_loss_mlp": 1.00049329, "epoch": 0.839295376660855, "flos": 21325229032320.0, "grad_norm": 2.0524261022451262, "language_loss": 0.76081944, "learning_rate": 2.647126062141163e-07, "loss": 0.77948534, "num_input_tokens_seen": 150749750, "step": 6980, "time_per_iteration": 3.5129997730255127 }, { "auxiliary_loss_clip": 0.01110822, "auxiliary_loss_mlp": 0.01041583, "balance_loss_clip": 1.03972077, "balance_loss_mlp": 1.02491713, "epoch": 0.839415619551494, "flos": 18442535961600.0, "grad_norm": 1.8745065349196623, "language_loss": 0.83983946, "learning_rate": 2.643254436582669e-07, "loss": 0.86136353, "num_input_tokens_seen": 150769240, "step": 6981, "time_per_iteration": 2.6270315647125244 }, { "auxiliary_loss_clip": 0.01085198, "auxiliary_loss_mlp": 0.01034548, "balance_loss_clip": 1.03934896, "balance_loss_mlp": 1.01977777, "epoch": 0.8395358624421331, "flos": 23221958705280.0, "grad_norm": 2.883683991189819, "language_loss": 0.827461, "learning_rate": 2.6393854440286743e-07, "loss": 0.8486585, "num_input_tokens_seen": 150788410, "step": 6982, "time_per_iteration": 2.674412488937378 }, { "auxiliary_loss_clip": 0.0112961, "auxiliary_loss_mlp": 0.01041303, "balance_loss_clip": 1.04192519, "balance_loss_mlp": 1.02482867, "epoch": 0.8396561053327722, "flos": 24381657210240.0, "grad_norm": 2.055761793964538, "language_loss": 0.70789635, "learning_rate": 2.6355190850661045e-07, "loss": 0.72960556, "num_input_tokens_seen": 150805245, "step": 6983, "time_per_iteration": 2.5659725666046143 }, { "auxiliary_loss_clip": 0.01102663, "auxiliary_loss_mlp": 0.01035086, "balance_loss_clip": 1.04020798, "balance_loss_mlp": 1.01941013, "epoch": 0.8397763482234113, "flos": 22237755073920.0, "grad_norm": 1.6383885621082108, "language_loss": 0.86680412, "learning_rate": 2.631655360281486e-07, "loss": 0.88818157, "num_input_tokens_seen": 150824920, "step": 6984, "time_per_iteration": 2.6451950073242188 }, { "auxiliary_loss_clip": 0.0112425, "auxiliary_loss_mlp": 0.00772655, "balance_loss_clip": 1.04196048, "balance_loss_mlp": 1.00048172, "epoch": 0.8398965911140504, "flos": 22163743100160.0, "grad_norm": 1.9067031650737005, "language_loss": 0.65776736, "learning_rate": 2.6277942702609323e-07, "loss": 0.67673641, "num_input_tokens_seen": 150844400, "step": 6985, "time_per_iteration": 2.5917410850524902 }, { "auxiliary_loss_clip": 0.01101412, "auxiliary_loss_mlp": 0.01042268, "balance_loss_clip": 1.04203677, "balance_loss_mlp": 1.02722383, "epoch": 0.8400168340046895, "flos": 21542775753600.0, "grad_norm": 1.8480967850874332, "language_loss": 0.8727361, "learning_rate": 2.623935815590186e-07, "loss": 0.89417291, "num_input_tokens_seen": 150862780, "step": 6986, "time_per_iteration": 2.6386172771453857 }, { "auxiliary_loss_clip": 0.01110841, "auxiliary_loss_mlp": 0.01044992, "balance_loss_clip": 1.0441767, "balance_loss_mlp": 1.02902985, "epoch": 0.8401370768953286, "flos": 22491966602880.0, "grad_norm": 1.7881098870009602, "language_loss": 0.81314933, "learning_rate": 2.6200799968545516e-07, "loss": 0.83470768, "num_input_tokens_seen": 150883075, "step": 6987, "time_per_iteration": 2.651825428009033 }, { "auxiliary_loss_clip": 0.01012144, "auxiliary_loss_mlp": 0.01005803, "balance_loss_clip": 1.00756991, "balance_loss_mlp": 1.00434828, "epoch": 0.8402573197859676, "flos": 59238890818560.0, "grad_norm": 0.7873049732504186, "language_loss": 0.56372923, "learning_rate": 2.616226814638969e-07, "loss": 0.58390874, "num_input_tokens_seen": 150948180, "step": 6988, "time_per_iteration": 3.210655927658081 }, { "auxiliary_loss_clip": 0.01106165, "auxiliary_loss_mlp": 0.01041005, "balance_loss_clip": 1.03829718, "balance_loss_mlp": 1.02612138, "epoch": 0.8403775626766068, "flos": 22674608282880.0, "grad_norm": 2.0026891861761875, "language_loss": 0.77407044, "learning_rate": 2.612376269527954e-07, "loss": 0.79554212, "num_input_tokens_seen": 150967885, "step": 6989, "time_per_iteration": 2.6944284439086914 }, { "auxiliary_loss_clip": 0.011067, "auxiliary_loss_mlp": 0.0103634, "balance_loss_clip": 1.04201007, "balance_loss_mlp": 1.02056873, "epoch": 0.8404978055672458, "flos": 19609704495360.0, "grad_norm": 2.426651861830187, "language_loss": 0.67500651, "learning_rate": 2.608528362105635e-07, "loss": 0.69643688, "num_input_tokens_seen": 150987255, "step": 6990, "time_per_iteration": 2.6049282550811768 }, { "auxiliary_loss_clip": 0.01094761, "auxiliary_loss_mlp": 0.01041386, "balance_loss_clip": 1.03725362, "balance_loss_mlp": 1.02683032, "epoch": 0.8406180484578849, "flos": 27526929678720.0, "grad_norm": 2.1325080879895197, "language_loss": 0.73094296, "learning_rate": 2.6046830929557374e-07, "loss": 0.75230443, "num_input_tokens_seen": 151006905, "step": 6991, "time_per_iteration": 2.679955244064331 }, { "auxiliary_loss_clip": 0.01095027, "auxiliary_loss_mlp": 0.01034569, "balance_loss_clip": 1.04159904, "balance_loss_mlp": 1.01926255, "epoch": 0.8407382913485241, "flos": 22127473342080.0, "grad_norm": 4.813937446877248, "language_loss": 0.8515709, "learning_rate": 2.6008404626615776e-07, "loss": 0.87286687, "num_input_tokens_seen": 151025405, "step": 6992, "time_per_iteration": 2.6530864238739014 }, { "auxiliary_loss_clip": 0.01123783, "auxiliary_loss_mlp": 0.01044769, "balance_loss_clip": 1.04273796, "balance_loss_mlp": 1.02884245, "epoch": 0.8408585342391631, "flos": 13918473982080.0, "grad_norm": 2.4994651105086576, "language_loss": 0.73868012, "learning_rate": 2.597000471806092e-07, "loss": 0.76036561, "num_input_tokens_seen": 151041970, "step": 6993, "time_per_iteration": 2.539548873901367 }, { "auxiliary_loss_clip": 0.01102206, "auxiliary_loss_mlp": 0.01045234, "balance_loss_clip": 1.04091752, "balance_loss_mlp": 1.02843714, "epoch": 0.8409787771298022, "flos": 20187865808640.0, "grad_norm": 2.1530533673987966, "language_loss": 0.73496437, "learning_rate": 2.593163120971793e-07, "loss": 0.75643879, "num_input_tokens_seen": 151060835, "step": 6994, "time_per_iteration": 2.581820011138916 }, { "auxiliary_loss_clip": 0.01072493, "auxiliary_loss_mlp": 0.01036274, "balance_loss_clip": 1.03369355, "balance_loss_mlp": 1.02105665, "epoch": 0.8410990200204413, "flos": 23142523777920.0, "grad_norm": 2.1162262284787423, "language_loss": 0.68919146, "learning_rate": 2.5893284107408165e-07, "loss": 0.71027911, "num_input_tokens_seen": 151078205, "step": 6995, "time_per_iteration": 2.676968812942505 }, { "auxiliary_loss_clip": 0.01078585, "auxiliary_loss_mlp": 0.01044958, "balance_loss_clip": 1.03772402, "balance_loss_mlp": 1.02993739, "epoch": 0.8412192629110804, "flos": 24027219757440.0, "grad_norm": 1.7186477689165662, "language_loss": 0.78074217, "learning_rate": 2.5854963416948726e-07, "loss": 0.80197763, "num_input_tokens_seen": 151100470, "step": 6996, "time_per_iteration": 2.7140963077545166 }, { "auxiliary_loss_clip": 0.01080327, "auxiliary_loss_mlp": 0.01038213, "balance_loss_clip": 1.03436732, "balance_loss_mlp": 1.02196431, "epoch": 0.8413395058017195, "flos": 25591703604480.0, "grad_norm": 1.6540465310460448, "language_loss": 0.69386452, "learning_rate": 2.5816669144152816e-07, "loss": 0.71504992, "num_input_tokens_seen": 151121650, "step": 6997, "time_per_iteration": 2.710735559463501 }, { "auxiliary_loss_clip": 0.01034095, "auxiliary_loss_mlp": 0.01000397, "balance_loss_clip": 1.00628901, "balance_loss_mlp": 0.99885958, "epoch": 0.8414597486923585, "flos": 63635396624640.0, "grad_norm": 0.8476333640847987, "language_loss": 0.66307563, "learning_rate": 2.5778401294829777e-07, "loss": 0.68342054, "num_input_tokens_seen": 151180390, "step": 6998, "time_per_iteration": 3.2000086307525635 }, { "auxiliary_loss_clip": 0.01115253, "auxiliary_loss_mlp": 0.00771452, "balance_loss_clip": 1.03990006, "balance_loss_mlp": 1.00044739, "epoch": 0.8415799915829977, "flos": 19098731571840.0, "grad_norm": 1.875616195478323, "language_loss": 0.64906216, "learning_rate": 2.574015987478473e-07, "loss": 0.66792929, "num_input_tokens_seen": 151198520, "step": 6999, "time_per_iteration": 4.467142581939697 }, { "auxiliary_loss_clip": 0.01114792, "auxiliary_loss_mlp": 0.01037096, "balance_loss_clip": 1.04285693, "balance_loss_mlp": 1.01909518, "epoch": 0.8417002344736367, "flos": 19821612781440.0, "grad_norm": 2.087310110124947, "language_loss": 0.86915779, "learning_rate": 2.570194488981887e-07, "loss": 0.89067674, "num_input_tokens_seen": 151215065, "step": 7000, "time_per_iteration": 2.607395887374878 }, { "auxiliary_loss_clip": 0.01034014, "auxiliary_loss_mlp": 0.01003929, "balance_loss_clip": 1.00617635, "balance_loss_mlp": 1.00236726, "epoch": 0.8418204773642758, "flos": 62161516834560.0, "grad_norm": 0.8467788650397627, "language_loss": 0.60298461, "learning_rate": 2.566375634572939e-07, "loss": 0.62336403, "num_input_tokens_seen": 151275705, "step": 7001, "time_per_iteration": 3.100372791290283 }, { "auxiliary_loss_clip": 0.01103336, "auxiliary_loss_mlp": 0.01035415, "balance_loss_clip": 1.04043055, "balance_loss_mlp": 1.01974511, "epoch": 0.841940720254915, "flos": 17092905315840.0, "grad_norm": 2.8458014325983094, "language_loss": 0.76075947, "learning_rate": 2.562559424830943e-07, "loss": 0.78214693, "num_input_tokens_seen": 151293665, "step": 7002, "time_per_iteration": 2.6811342239379883 }, { "auxiliary_loss_clip": 0.01102506, "auxiliary_loss_mlp": 0.01044134, "balance_loss_clip": 1.03945923, "balance_loss_mlp": 1.02633631, "epoch": 0.842060963145554, "flos": 16283586026880.0, "grad_norm": 3.3627038531118765, "language_loss": 0.70104253, "learning_rate": 2.5587458603348256e-07, "loss": 0.72250897, "num_input_tokens_seen": 151310955, "step": 7003, "time_per_iteration": 3.4912853240966797 }, { "auxiliary_loss_clip": 0.01087565, "auxiliary_loss_mlp": 0.01046974, "balance_loss_clip": 1.03787816, "balance_loss_mlp": 1.02806771, "epoch": 0.8421812060361931, "flos": 21908238681600.0, "grad_norm": 1.9670197963696685, "language_loss": 0.83948314, "learning_rate": 2.554934941663085e-07, "loss": 0.86082852, "num_input_tokens_seen": 151328490, "step": 7004, "time_per_iteration": 2.6785454750061035 }, { "auxiliary_loss_clip": 0.01094501, "auxiliary_loss_mlp": 0.01036494, "balance_loss_clip": 1.0386982, "balance_loss_mlp": 1.01937521, "epoch": 0.8423014489268322, "flos": 27777693502080.0, "grad_norm": 2.7636197996601544, "language_loss": 0.73127609, "learning_rate": 2.5511266693938484e-07, "loss": 0.75258607, "num_input_tokens_seen": 151346950, "step": 7005, "time_per_iteration": 2.660115957260132 }, { "auxiliary_loss_clip": 0.01106035, "auxiliary_loss_mlp": 0.01041562, "balance_loss_clip": 1.04272223, "balance_loss_mlp": 1.02574325, "epoch": 0.8424216918174713, "flos": 25117610970240.0, "grad_norm": 1.7290320428456711, "language_loss": 0.7778157, "learning_rate": 2.547321044104822e-07, "loss": 0.79929161, "num_input_tokens_seen": 151368445, "step": 7006, "time_per_iteration": 2.704606294631958 }, { "auxiliary_loss_clip": 0.01134891, "auxiliary_loss_mlp": 0.01041077, "balance_loss_clip": 1.04415035, "balance_loss_mlp": 1.02542484, "epoch": 0.8425419347081103, "flos": 24748448941440.0, "grad_norm": 1.8399102798841325, "language_loss": 0.76951885, "learning_rate": 2.5435180663733113e-07, "loss": 0.79127848, "num_input_tokens_seen": 151388745, "step": 7007, "time_per_iteration": 3.464968204498291 }, { "auxiliary_loss_clip": 0.01088066, "auxiliary_loss_mlp": 0.01037403, "balance_loss_clip": 1.03650808, "balance_loss_mlp": 1.02179801, "epoch": 0.8426621775987495, "flos": 24820916630400.0, "grad_norm": 6.1789851764479815, "language_loss": 0.71889561, "learning_rate": 2.539717736776241e-07, "loss": 0.74015033, "num_input_tokens_seen": 151404970, "step": 7008, "time_per_iteration": 2.670654535293579 }, { "auxiliary_loss_clip": 0.01114586, "auxiliary_loss_mlp": 0.01041549, "balance_loss_clip": 1.04018939, "balance_loss_mlp": 1.02619529, "epoch": 0.8427824204893886, "flos": 23550074467200.0, "grad_norm": 4.6589605876113716, "language_loss": 0.76199806, "learning_rate": 2.535920055890097e-07, "loss": 0.78355932, "num_input_tokens_seen": 151426265, "step": 7009, "time_per_iteration": 2.5959270000457764 }, { "auxiliary_loss_clip": 0.01076784, "auxiliary_loss_mlp": 0.01045024, "balance_loss_clip": 1.03695488, "balance_loss_mlp": 1.02878737, "epoch": 0.8429026633800276, "flos": 16143858120960.0, "grad_norm": 2.1616135901893645, "language_loss": 0.64740765, "learning_rate": 2.5321250242910006e-07, "loss": 0.66862571, "num_input_tokens_seen": 151444180, "step": 7010, "time_per_iteration": 2.6819674968719482 }, { "auxiliary_loss_clip": 0.01130009, "auxiliary_loss_mlp": 0.01039961, "balance_loss_clip": 1.04248273, "balance_loss_mlp": 1.02504754, "epoch": 0.8430229062706668, "flos": 22198540400640.0, "grad_norm": 1.8256839022286766, "language_loss": 0.86331737, "learning_rate": 2.5283326425546493e-07, "loss": 0.88501704, "num_input_tokens_seen": 151463290, "step": 7011, "time_per_iteration": 2.5473763942718506 }, { "auxiliary_loss_clip": 0.01088324, "auxiliary_loss_mlp": 0.01037317, "balance_loss_clip": 1.04091537, "balance_loss_mlp": 1.02285659, "epoch": 0.8431431491613058, "flos": 35330317683840.0, "grad_norm": 2.8255101313688424, "language_loss": 0.69713348, "learning_rate": 2.5245429112563443e-07, "loss": 0.71838987, "num_input_tokens_seen": 151483965, "step": 7012, "time_per_iteration": 2.7627291679382324 }, { "auxiliary_loss_clip": 0.01123114, "auxiliary_loss_mlp": 0.01038491, "balance_loss_clip": 1.04473114, "balance_loss_mlp": 1.02359009, "epoch": 0.8432633920519449, "flos": 25812374808960.0, "grad_norm": 2.0568507254857487, "language_loss": 0.81783628, "learning_rate": 2.5207558309709865e-07, "loss": 0.83945239, "num_input_tokens_seen": 151503700, "step": 7013, "time_per_iteration": 2.601665735244751 }, { "auxiliary_loss_clip": 0.01009377, "auxiliary_loss_mlp": 0.0075594, "balance_loss_clip": 1.00746202, "balance_loss_mlp": 1.00017095, "epoch": 0.8433836349425841, "flos": 64959531592320.0, "grad_norm": 0.657182332531354, "language_loss": 0.56269294, "learning_rate": 2.516971402273065e-07, "loss": 0.58034611, "num_input_tokens_seen": 151569765, "step": 7014, "time_per_iteration": 3.198518753051758 }, { "auxiliary_loss_clip": 0.01107178, "auxiliary_loss_mlp": 0.01035441, "balance_loss_clip": 1.03865588, "balance_loss_mlp": 1.01915669, "epoch": 0.8435038778332231, "flos": 20229989483520.0, "grad_norm": 2.14056233646691, "language_loss": 0.67647982, "learning_rate": 2.513189625736687e-07, "loss": 0.69790596, "num_input_tokens_seen": 151586660, "step": 7015, "time_per_iteration": 2.5859644412994385 }, { "auxiliary_loss_clip": 0.01099542, "auxiliary_loss_mlp": 0.01044838, "balance_loss_clip": 1.03874278, "balance_loss_mlp": 1.0281961, "epoch": 0.8436241207238622, "flos": 20992229020800.0, "grad_norm": 2.4527473972351483, "language_loss": 0.71537745, "learning_rate": 2.509410501935534e-07, "loss": 0.73682123, "num_input_tokens_seen": 151602295, "step": 7016, "time_per_iteration": 2.6409502029418945 }, { "auxiliary_loss_clip": 0.01113446, "auxiliary_loss_mlp": 0.01041253, "balance_loss_clip": 1.04416966, "balance_loss_mlp": 1.02386069, "epoch": 0.8437443636145013, "flos": 14682257804160.0, "grad_norm": 3.7577827230248984, "language_loss": 0.7543065, "learning_rate": 2.5056340314429116e-07, "loss": 0.77585346, "num_input_tokens_seen": 151619760, "step": 7017, "time_per_iteration": 2.600891351699829 }, { "auxiliary_loss_clip": 0.01088745, "auxiliary_loss_mlp": 0.01040241, "balance_loss_clip": 1.03937757, "balance_loss_mlp": 1.0235641, "epoch": 0.8438646065051404, "flos": 21608814908160.0, "grad_norm": 2.3133948253056897, "language_loss": 0.80533826, "learning_rate": 2.5018602148316904e-07, "loss": 0.82662809, "num_input_tokens_seen": 151635795, "step": 7018, "time_per_iteration": 2.675050973892212 }, { "auxiliary_loss_clip": 0.01129286, "auxiliary_loss_mlp": 0.01038988, "balance_loss_clip": 1.04437447, "balance_loss_mlp": 1.02378905, "epoch": 0.8439848493957794, "flos": 23289937194240.0, "grad_norm": 2.3517406128300347, "language_loss": 0.801498, "learning_rate": 2.498089052674359e-07, "loss": 0.82318079, "num_input_tokens_seen": 151653770, "step": 7019, "time_per_iteration": 2.552461624145508 }, { "auxiliary_loss_clip": 0.01121185, "auxiliary_loss_mlp": 0.01044249, "balance_loss_clip": 1.04236341, "balance_loss_mlp": 1.02854931, "epoch": 0.8441050922864186, "flos": 19719339782400.0, "grad_norm": 1.9033257278321993, "language_loss": 0.75469708, "learning_rate": 2.494320545543007e-07, "loss": 0.77635139, "num_input_tokens_seen": 151673340, "step": 7020, "time_per_iteration": 2.5789706707000732 }, { "auxiliary_loss_clip": 0.01133518, "auxiliary_loss_mlp": 0.01045693, "balance_loss_clip": 1.04137301, "balance_loss_mlp": 1.02905154, "epoch": 0.8442253351770577, "flos": 21835268202240.0, "grad_norm": 3.240918375593518, "language_loss": 0.66870099, "learning_rate": 2.490554694009308e-07, "loss": 0.69049311, "num_input_tokens_seen": 151694205, "step": 7021, "time_per_iteration": 2.58245849609375 }, { "auxiliary_loss_clip": 0.0112325, "auxiliary_loss_mlp": 0.01039992, "balance_loss_clip": 1.04193568, "balance_loss_mlp": 1.02460766, "epoch": 0.8443455780676967, "flos": 34346365447680.0, "grad_norm": 1.6928800845259153, "language_loss": 0.78186965, "learning_rate": 2.4867914986445426e-07, "loss": 0.80350202, "num_input_tokens_seen": 151716595, "step": 7022, "time_per_iteration": 2.6801648139953613 }, { "auxiliary_loss_clip": 0.01108569, "auxiliary_loss_mlp": 0.01033258, "balance_loss_clip": 1.03874981, "balance_loss_mlp": 1.01922143, "epoch": 0.8444658209583359, "flos": 48214599281280.0, "grad_norm": 2.0072984494485597, "language_loss": 0.71355188, "learning_rate": 2.483030960019581e-07, "loss": 0.73497021, "num_input_tokens_seen": 151740525, "step": 7023, "time_per_iteration": 2.8166542053222656 }, { "auxiliary_loss_clip": 0.00998986, "auxiliary_loss_mlp": 0.01003113, "balance_loss_clip": 1.00813222, "balance_loss_mlp": 1.00156355, "epoch": 0.8445860638489749, "flos": 68484773105280.0, "grad_norm": 0.7322505593136655, "language_loss": 0.5549109, "learning_rate": 2.479273078704891e-07, "loss": 0.57493186, "num_input_tokens_seen": 151793890, "step": 7024, "time_per_iteration": 4.0660645961761475 }, { "auxiliary_loss_clip": 0.0099786, "auxiliary_loss_mlp": 0.01002435, "balance_loss_clip": 1.01461864, "balance_loss_mlp": 1.00078964, "epoch": 0.844706306739614, "flos": 62833331882880.0, "grad_norm": 0.7909544611588264, "language_loss": 0.64679742, "learning_rate": 2.475517855270552e-07, "loss": 0.66680038, "num_input_tokens_seen": 151853970, "step": 7025, "time_per_iteration": 4.182530641555786 }, { "auxiliary_loss_clip": 0.01130445, "auxiliary_loss_mlp": 0.01036259, "balance_loss_clip": 1.04270601, "balance_loss_mlp": 1.0208925, "epoch": 0.8448265496302532, "flos": 14976114969600.0, "grad_norm": 2.4114548850086486, "language_loss": 0.72522897, "learning_rate": 2.4717652902862143e-07, "loss": 0.74689603, "num_input_tokens_seen": 151872945, "step": 7026, "time_per_iteration": 2.548893928527832 }, { "auxiliary_loss_clip": 0.01111487, "auxiliary_loss_mlp": 0.01040082, "balance_loss_clip": 1.04107165, "balance_loss_mlp": 1.0234282, "epoch": 0.8449467925208922, "flos": 23441265192960.0, "grad_norm": 2.237334653974893, "language_loss": 0.81570542, "learning_rate": 2.4680153843211495e-07, "loss": 0.83722103, "num_input_tokens_seen": 151892875, "step": 7027, "time_per_iteration": 2.633105754852295 }, { "auxiliary_loss_clip": 0.01109185, "auxiliary_loss_mlp": 0.01039947, "balance_loss_clip": 1.04362869, "balance_loss_mlp": 1.02392578, "epoch": 0.8450670354115313, "flos": 22748045639040.0, "grad_norm": 1.891109952719004, "language_loss": 0.72417343, "learning_rate": 2.464268137944212e-07, "loss": 0.74566472, "num_input_tokens_seen": 151914170, "step": 7028, "time_per_iteration": 2.6131362915039062 }, { "auxiliary_loss_clip": 0.01073657, "auxiliary_loss_mlp": 0.01050231, "balance_loss_clip": 1.03823829, "balance_loss_mlp": 1.03134811, "epoch": 0.8451872783021703, "flos": 29825571605760.0, "grad_norm": 5.824893918380644, "language_loss": 0.78384435, "learning_rate": 2.46052355172385e-07, "loss": 0.80508327, "num_input_tokens_seen": 151932210, "step": 7029, "time_per_iteration": 3.690305471420288 }, { "auxiliary_loss_clip": 0.01132401, "auxiliary_loss_mlp": 0.01042192, "balance_loss_clip": 1.04179239, "balance_loss_mlp": 1.02586031, "epoch": 0.8453075211928095, "flos": 21870029589120.0, "grad_norm": 1.7357909123401798, "language_loss": 0.75058484, "learning_rate": 2.456781626228128e-07, "loss": 0.77233076, "num_input_tokens_seen": 151951715, "step": 7030, "time_per_iteration": 2.5406887531280518 }, { "auxiliary_loss_clip": 0.0099879, "auxiliary_loss_mlp": 0.00756591, "balance_loss_clip": 1.00795436, "balance_loss_mlp": 1.00012767, "epoch": 0.8454277640834486, "flos": 58751869288320.0, "grad_norm": 0.9107590400377121, "language_loss": 0.66249037, "learning_rate": 2.453042362024675e-07, "loss": 0.68004417, "num_input_tokens_seen": 152004960, "step": 7031, "time_per_iteration": 3.360600233078003 }, { "auxiliary_loss_clip": 0.01130589, "auxiliary_loss_mlp": 0.01038732, "balance_loss_clip": 1.04308438, "balance_loss_mlp": 1.02360451, "epoch": 0.8455480069740876, "flos": 27090076469760.0, "grad_norm": 2.0431730415508014, "language_loss": 0.73248869, "learning_rate": 2.449305759680751e-07, "loss": 0.75418186, "num_input_tokens_seen": 152026285, "step": 7032, "time_per_iteration": 2.797147512435913 }, { "auxiliary_loss_clip": 0.01096337, "auxiliary_loss_mlp": 0.01038618, "balance_loss_clip": 1.0412972, "balance_loss_mlp": 1.02444983, "epoch": 0.8456682498647268, "flos": 27198670262400.0, "grad_norm": 1.5383951829541818, "language_loss": 0.75267929, "learning_rate": 2.445571819763188e-07, "loss": 0.77402878, "num_input_tokens_seen": 152048585, "step": 7033, "time_per_iteration": 3.6221368312835693 }, { "auxiliary_loss_clip": 0.01130869, "auxiliary_loss_mlp": 0.01039439, "balance_loss_clip": 1.0428524, "balance_loss_mlp": 1.02337551, "epoch": 0.8457884927553658, "flos": 20631901737600.0, "grad_norm": 1.9750905240721521, "language_loss": 0.58269978, "learning_rate": 2.4418405428384227e-07, "loss": 0.60440284, "num_input_tokens_seen": 152068795, "step": 7034, "time_per_iteration": 2.555551767349243 }, { "auxiliary_loss_clip": 0.0112941, "auxiliary_loss_mlp": 0.00772179, "balance_loss_clip": 1.04208946, "balance_loss_mlp": 1.0004921, "epoch": 0.8459087356460049, "flos": 15299023259520.0, "grad_norm": 1.7058461589366818, "language_loss": 0.71820164, "learning_rate": 2.4381119294724864e-07, "loss": 0.73721755, "num_input_tokens_seen": 152086240, "step": 7035, "time_per_iteration": 2.541100025177002 }, { "auxiliary_loss_clip": 0.01132273, "auxiliary_loss_mlp": 0.01043947, "balance_loss_clip": 1.04371071, "balance_loss_mlp": 1.02822316, "epoch": 0.846028978536644, "flos": 18843155326080.0, "grad_norm": 2.255693470645833, "language_loss": 0.53835487, "learning_rate": 2.434385980231004e-07, "loss": 0.56011707, "num_input_tokens_seen": 152105080, "step": 7036, "time_per_iteration": 2.5465638637542725 }, { "auxiliary_loss_clip": 0.01119844, "auxiliary_loss_mlp": 0.010376, "balance_loss_clip": 1.04288948, "balance_loss_mlp": 1.02298439, "epoch": 0.8461492214272831, "flos": 52661740285440.0, "grad_norm": 1.471520365790157, "language_loss": 0.65660405, "learning_rate": 2.4306626956792043e-07, "loss": 0.67817843, "num_input_tokens_seen": 152130025, "step": 7037, "time_per_iteration": 2.844440460205078 }, { "auxiliary_loss_clip": 0.01121623, "auxiliary_loss_mlp": 0.01033785, "balance_loss_clip": 1.04073429, "balance_loss_mlp": 1.01814485, "epoch": 0.8462694643179222, "flos": 18588405093120.0, "grad_norm": 5.098675198417054, "language_loss": 0.75721407, "learning_rate": 2.4269420763819017e-07, "loss": 0.77876818, "num_input_tokens_seen": 152148070, "step": 7038, "time_per_iteration": 2.577016830444336 }, { "auxiliary_loss_clip": 0.01115064, "auxiliary_loss_mlp": 0.0103487, "balance_loss_clip": 1.0385592, "balance_loss_mlp": 1.01998067, "epoch": 0.8463897072085613, "flos": 24387080163840.0, "grad_norm": 3.106583010243065, "language_loss": 0.83785599, "learning_rate": 2.4232241229035223e-07, "loss": 0.85935533, "num_input_tokens_seen": 152165825, "step": 7039, "time_per_iteration": 2.58161997795105 }, { "auxiliary_loss_clip": 0.01025929, "auxiliary_loss_mlp": 0.01004616, "balance_loss_clip": 1.0069685, "balance_loss_mlp": 1.00294685, "epoch": 0.8465099500992004, "flos": 68702140258560.0, "grad_norm": 0.7606623037735825, "language_loss": 0.5668965, "learning_rate": 2.419508835808064e-07, "loss": 0.58720195, "num_input_tokens_seen": 152222380, "step": 7040, "time_per_iteration": 3.121087074279785 }, { "auxiliary_loss_clip": 0.01109943, "auxiliary_loss_mlp": 0.01032214, "balance_loss_clip": 1.041031, "balance_loss_mlp": 1.01562619, "epoch": 0.8466301929898394, "flos": 13735724561280.0, "grad_norm": 2.4196752122258496, "language_loss": 0.62889332, "learning_rate": 2.415796215659134e-07, "loss": 0.65031493, "num_input_tokens_seen": 152239085, "step": 7041, "time_per_iteration": 2.5723235607147217 }, { "auxiliary_loss_clip": 0.01096702, "auxiliary_loss_mlp": 0.01044057, "balance_loss_clip": 1.03804851, "balance_loss_mlp": 1.02764761, "epoch": 0.8467504358804786, "flos": 19241260738560.0, "grad_norm": 2.3048487814170393, "language_loss": 0.7756322, "learning_rate": 2.412086263019939e-07, "loss": 0.79703975, "num_input_tokens_seen": 152257110, "step": 7042, "time_per_iteration": 2.602121114730835 }, { "auxiliary_loss_clip": 0.01127617, "auxiliary_loss_mlp": 0.01037365, "balance_loss_clip": 1.04364932, "balance_loss_mlp": 1.02244592, "epoch": 0.8468706787711177, "flos": 21324115710720.0, "grad_norm": 1.6916662424991176, "language_loss": 0.80361623, "learning_rate": 2.408378978453276e-07, "loss": 0.82526606, "num_input_tokens_seen": 152277230, "step": 7043, "time_per_iteration": 2.5478732585906982 }, { "auxiliary_loss_clip": 0.01029926, "auxiliary_loss_mlp": 0.01002967, "balance_loss_clip": 1.01064587, "balance_loss_mlp": 1.00127459, "epoch": 0.8469909216617567, "flos": 64877439058560.0, "grad_norm": 0.8124875564853481, "language_loss": 0.63924819, "learning_rate": 2.404674362521533e-07, "loss": 0.65957713, "num_input_tokens_seen": 152335725, "step": 7044, "time_per_iteration": 3.0447895526885986 }, { "auxiliary_loss_clip": 0.011219, "auxiliary_loss_mlp": 0.01043257, "balance_loss_clip": 1.04403591, "balance_loss_mlp": 1.02770042, "epoch": 0.8471111645523959, "flos": 19280583152640.0, "grad_norm": 3.3390462839254953, "language_loss": 0.74326897, "learning_rate": 2.4009724157866997e-07, "loss": 0.76492053, "num_input_tokens_seen": 152352785, "step": 7045, "time_per_iteration": 2.6188268661499023 }, { "auxiliary_loss_clip": 0.01129816, "auxiliary_loss_mlp": 0.01030788, "balance_loss_clip": 1.04237163, "balance_loss_mlp": 1.0155766, "epoch": 0.8472314074430349, "flos": 22015826893440.0, "grad_norm": 4.92896847602267, "language_loss": 0.76530981, "learning_rate": 2.3972731388103564e-07, "loss": 0.78691578, "num_input_tokens_seen": 152371265, "step": 7046, "time_per_iteration": 2.5797431468963623 }, { "auxiliary_loss_clip": 0.00988872, "auxiliary_loss_mlp": 0.01002934, "balance_loss_clip": 1.00929737, "balance_loss_mlp": 1.00137234, "epoch": 0.847351650333674, "flos": 57882580243200.0, "grad_norm": 0.8001777609120245, "language_loss": 0.62374127, "learning_rate": 2.393576532153687e-07, "loss": 0.64365923, "num_input_tokens_seen": 152435050, "step": 7047, "time_per_iteration": 3.3337881565093994 }, { "auxiliary_loss_clip": 0.01025868, "auxiliary_loss_mlp": 0.01001857, "balance_loss_clip": 1.00760818, "balance_loss_mlp": 1.00029528, "epoch": 0.8474718932243132, "flos": 41284238313600.0, "grad_norm": 0.928553996430936, "language_loss": 0.57803881, "learning_rate": 2.389882596377453e-07, "loss": 0.59831601, "num_input_tokens_seen": 152489315, "step": 7048, "time_per_iteration": 3.08443021774292 }, { "auxiliary_loss_clip": 0.01129786, "auxiliary_loss_mlp": 0.01036844, "balance_loss_clip": 1.04118443, "balance_loss_mlp": 1.02134705, "epoch": 0.8475921361149522, "flos": 38180906974080.0, "grad_norm": 1.8572002068152174, "language_loss": 0.76336062, "learning_rate": 2.386191332042031e-07, "loss": 0.78502691, "num_input_tokens_seen": 152511210, "step": 7049, "time_per_iteration": 2.6864538192749023 }, { "auxiliary_loss_clip": 0.01140981, "auxiliary_loss_mlp": 0.01041363, "balance_loss_clip": 1.04576278, "balance_loss_mlp": 1.02507854, "epoch": 0.8477123790055913, "flos": 25375054723200.0, "grad_norm": 1.8299224677243455, "language_loss": 0.72816807, "learning_rate": 2.3825027397073794e-07, "loss": 0.74999154, "num_input_tokens_seen": 152531685, "step": 7050, "time_per_iteration": 3.5888874530792236 }, { "auxiliary_loss_clip": 0.01115775, "auxiliary_loss_mlp": 0.0103481, "balance_loss_clip": 1.04221582, "balance_loss_mlp": 1.01907444, "epoch": 0.8478326218962304, "flos": 30225185389440.0, "grad_norm": 2.980361518822021, "language_loss": 0.66902816, "learning_rate": 2.3788168199330515e-07, "loss": 0.690534, "num_input_tokens_seen": 152553245, "step": 7051, "time_per_iteration": 3.4657514095306396 }, { "auxiliary_loss_clip": 0.01094489, "auxiliary_loss_mlp": 0.01041876, "balance_loss_clip": 1.03789639, "balance_loss_mlp": 1.02591431, "epoch": 0.8479528647868695, "flos": 38213800853760.0, "grad_norm": 2.296867505416219, "language_loss": 0.72683632, "learning_rate": 2.3751335732782074e-07, "loss": 0.7482, "num_input_tokens_seen": 152574505, "step": 7052, "time_per_iteration": 2.7354559898376465 }, { "auxiliary_loss_clip": 0.01115571, "auxiliary_loss_mlp": 0.01037372, "balance_loss_clip": 1.04208839, "balance_loss_mlp": 1.0220952, "epoch": 0.8480731076775085, "flos": 20957790856320.0, "grad_norm": 1.830878946815438, "language_loss": 0.79776824, "learning_rate": 2.371453000301582e-07, "loss": 0.81929761, "num_input_tokens_seen": 152593190, "step": 7053, "time_per_iteration": 2.5591940879821777 }, { "auxiliary_loss_clip": 0.0109238, "auxiliary_loss_mlp": 0.01036018, "balance_loss_clip": 1.03788877, "balance_loss_mlp": 1.02013898, "epoch": 0.8481933505681477, "flos": 32596510487040.0, "grad_norm": 1.847297985581084, "language_loss": 0.74352139, "learning_rate": 2.3677751015615222e-07, "loss": 0.76480544, "num_input_tokens_seen": 152615265, "step": 7054, "time_per_iteration": 2.741766929626465 }, { "auxiliary_loss_clip": 0.01096161, "auxiliary_loss_mlp": 0.01043968, "balance_loss_clip": 1.03750229, "balance_loss_mlp": 1.025419, "epoch": 0.8483135934587868, "flos": 20741177888640.0, "grad_norm": 1.897452132897269, "language_loss": 0.85273194, "learning_rate": 2.3640998776159593e-07, "loss": 0.87413317, "num_input_tokens_seen": 152632770, "step": 7055, "time_per_iteration": 3.8144426345825195 }, { "auxiliary_loss_clip": 0.01111276, "auxiliary_loss_mlp": 0.0103322, "balance_loss_clip": 1.04577756, "balance_loss_mlp": 1.01917672, "epoch": 0.8484338363494258, "flos": 21653057485440.0, "grad_norm": 1.8960947962179555, "language_loss": 0.81039405, "learning_rate": 2.3604273290224253e-07, "loss": 0.83183897, "num_input_tokens_seen": 152653485, "step": 7056, "time_per_iteration": 2.6471948623657227 }, { "auxiliary_loss_clip": 0.01111566, "auxiliary_loss_mlp": 0.01045864, "balance_loss_clip": 1.0419327, "balance_loss_mlp": 1.02982986, "epoch": 0.848554079240065, "flos": 15013964926080.0, "grad_norm": 2.925527542322381, "language_loss": 0.74712992, "learning_rate": 2.356757456338039e-07, "loss": 0.76870418, "num_input_tokens_seen": 152670970, "step": 7057, "time_per_iteration": 2.6301116943359375 }, { "auxiliary_loss_clip": 0.01013357, "auxiliary_loss_mlp": 0.01002092, "balance_loss_clip": 1.00831711, "balance_loss_mlp": 1.00051832, "epoch": 0.848674322130704, "flos": 68060453742720.0, "grad_norm": 0.7540575823383826, "language_loss": 0.58985829, "learning_rate": 2.3530902601195147e-07, "loss": 0.61001277, "num_input_tokens_seen": 152739460, "step": 7058, "time_per_iteration": 4.1708033084869385 }, { "auxiliary_loss_clip": 0.0112016, "auxiliary_loss_mlp": 0.01036207, "balance_loss_clip": 1.04194331, "balance_loss_mlp": 1.01933861, "epoch": 0.8487945650213431, "flos": 18475788977280.0, "grad_norm": 2.218247060663678, "language_loss": 0.78572732, "learning_rate": 2.34942574092317e-07, "loss": 0.80729097, "num_input_tokens_seen": 152754710, "step": 7059, "time_per_iteration": 2.531593084335327 }, { "auxiliary_loss_clip": 0.01119959, "auxiliary_loss_mlp": 0.01035635, "balance_loss_clip": 1.04001856, "balance_loss_mlp": 1.02035213, "epoch": 0.8489148079119821, "flos": 23473189405440.0, "grad_norm": 2.1464260514610176, "language_loss": 0.76799822, "learning_rate": 2.3457638993049045e-07, "loss": 0.78955412, "num_input_tokens_seen": 152772700, "step": 7060, "time_per_iteration": 2.599346160888672 }, { "auxiliary_loss_clip": 0.01077366, "auxiliary_loss_mlp": 0.01037778, "balance_loss_clip": 1.03944778, "balance_loss_mlp": 1.02142262, "epoch": 0.8490350508026213, "flos": 19937604775680.0, "grad_norm": 1.9775480939899928, "language_loss": 0.64425004, "learning_rate": 2.3421047358202252e-07, "loss": 0.66540146, "num_input_tokens_seen": 152791550, "step": 7061, "time_per_iteration": 2.690124988555908 }, { "auxiliary_loss_clip": 0.01123973, "auxiliary_loss_mlp": 0.01035839, "balance_loss_clip": 1.04500699, "balance_loss_mlp": 1.02035356, "epoch": 0.8491552936932604, "flos": 24279958828800.0, "grad_norm": 2.5574323015795253, "language_loss": 0.82842106, "learning_rate": 2.3384482510242144e-07, "loss": 0.85001916, "num_input_tokens_seen": 152809410, "step": 7062, "time_per_iteration": 2.6357922554016113 }, { "auxiliary_loss_clip": 0.01131038, "auxiliary_loss_mlp": 0.0104171, "balance_loss_clip": 1.04194355, "balance_loss_mlp": 1.02531862, "epoch": 0.8492755365838994, "flos": 22522526098560.0, "grad_norm": 2.3953465535716334, "language_loss": 0.77144277, "learning_rate": 2.3347944454715575e-07, "loss": 0.79317033, "num_input_tokens_seen": 152825800, "step": 7063, "time_per_iteration": 2.552643060684204 }, { "auxiliary_loss_clip": 0.01135965, "auxiliary_loss_mlp": 0.01041862, "balance_loss_clip": 1.04359722, "balance_loss_mlp": 1.02493429, "epoch": 0.8493957794745386, "flos": 26980441182720.0, "grad_norm": 2.544100547366568, "language_loss": 0.6732111, "learning_rate": 2.331143319716542e-07, "loss": 0.69498932, "num_input_tokens_seen": 152845330, "step": 7064, "time_per_iteration": 2.61112380027771 }, { "auxiliary_loss_clip": 0.01099003, "auxiliary_loss_mlp": 0.01040848, "balance_loss_clip": 1.04209161, "balance_loss_mlp": 1.02514839, "epoch": 0.8495160223651776, "flos": 29861985018240.0, "grad_norm": 3.9560621785895043, "language_loss": 0.65690851, "learning_rate": 2.3274948743130363e-07, "loss": 0.678307, "num_input_tokens_seen": 152865165, "step": 7065, "time_per_iteration": 2.7045342922210693 }, { "auxiliary_loss_clip": 0.01130817, "auxiliary_loss_mlp": 0.01042617, "balance_loss_clip": 1.04107916, "balance_loss_mlp": 1.02723873, "epoch": 0.8496362652558167, "flos": 23075443128960.0, "grad_norm": 1.7014471118640262, "language_loss": 0.7934131, "learning_rate": 2.3238491098145085e-07, "loss": 0.8151474, "num_input_tokens_seen": 152884695, "step": 7066, "time_per_iteration": 2.566624641418457 }, { "auxiliary_loss_clip": 0.01118468, "auxiliary_loss_mlp": 0.01039647, "balance_loss_clip": 1.04108858, "balance_loss_mlp": 1.02405441, "epoch": 0.8497565081464559, "flos": 14609107756800.0, "grad_norm": 7.034794365481129, "language_loss": 0.73536444, "learning_rate": 2.3202060267740141e-07, "loss": 0.75694561, "num_input_tokens_seen": 152902220, "step": 7067, "time_per_iteration": 2.542302370071411 }, { "auxiliary_loss_clip": 0.01075862, "auxiliary_loss_mlp": 0.01036225, "balance_loss_clip": 1.03556275, "balance_loss_mlp": 1.01991725, "epoch": 0.8498767510370949, "flos": 21136446126720.0, "grad_norm": 2.443994978312799, "language_loss": 0.77844262, "learning_rate": 2.3165656257442044e-07, "loss": 0.79956347, "num_input_tokens_seen": 152920740, "step": 7068, "time_per_iteration": 2.681278705596924 }, { "auxiliary_loss_clip": 0.01117112, "auxiliary_loss_mlp": 0.01039534, "balance_loss_clip": 1.04096079, "balance_loss_mlp": 1.02493072, "epoch": 0.849996993927734, "flos": 23654538195840.0, "grad_norm": 2.0661943557376543, "language_loss": 0.90080321, "learning_rate": 2.31292790727734e-07, "loss": 0.92236972, "num_input_tokens_seen": 152938305, "step": 7069, "time_per_iteration": 2.617527484893799 }, { "auxiliary_loss_clip": 0.01125847, "auxiliary_loss_mlp": 0.01039749, "balance_loss_clip": 1.03893781, "balance_loss_mlp": 1.02418017, "epoch": 0.8501172368183731, "flos": 20558069331840.0, "grad_norm": 2.291652831076221, "language_loss": 0.79981565, "learning_rate": 2.3092928719252392e-07, "loss": 0.82147157, "num_input_tokens_seen": 152956705, "step": 7070, "time_per_iteration": 2.560732841491699 }, { "auxiliary_loss_clip": 0.01117253, "auxiliary_loss_mlp": 0.01038104, "balance_loss_clip": 1.0402019, "balance_loss_mlp": 1.02193952, "epoch": 0.8502374797090122, "flos": 22272624201600.0, "grad_norm": 2.6464070205137404, "language_loss": 0.78503275, "learning_rate": 2.3056605202393475e-07, "loss": 0.80658638, "num_input_tokens_seen": 152974265, "step": 7071, "time_per_iteration": 2.569023609161377 }, { "auxiliary_loss_clip": 0.01115611, "auxiliary_loss_mlp": 0.00773582, "balance_loss_clip": 1.03937268, "balance_loss_mlp": 1.00042033, "epoch": 0.8503577225996513, "flos": 23659817495040.0, "grad_norm": 2.5886640019024103, "language_loss": 0.66628981, "learning_rate": 2.3020308527706888e-07, "loss": 0.68518174, "num_input_tokens_seen": 152993680, "step": 7072, "time_per_iteration": 2.6377480030059814 }, { "auxiliary_loss_clip": 0.01112289, "auxiliary_loss_mlp": 0.01034167, "balance_loss_clip": 1.0407033, "balance_loss_mlp": 1.0188489, "epoch": 0.8504779654902904, "flos": 26758513002240.0, "grad_norm": 1.9601399254550993, "language_loss": 0.88509214, "learning_rate": 2.2984038700698715e-07, "loss": 0.90655673, "num_input_tokens_seen": 153012990, "step": 7073, "time_per_iteration": 2.657952070236206 }, { "auxiliary_loss_clip": 0.01116401, "auxiliary_loss_mlp": 0.01035728, "balance_loss_clip": 1.0427829, "balance_loss_mlp": 1.02050543, "epoch": 0.8505982083809295, "flos": 26468247196800.0, "grad_norm": 1.5618173604425758, "language_loss": 0.78893316, "learning_rate": 2.2947795726871222e-07, "loss": 0.81045443, "num_input_tokens_seen": 153034015, "step": 7074, "time_per_iteration": 2.624690294265747 }, { "auxiliary_loss_clip": 0.01116306, "auxiliary_loss_mlp": 0.0077195, "balance_loss_clip": 1.04427266, "balance_loss_mlp": 1.00040293, "epoch": 0.8507184512715685, "flos": 20303390926080.0, "grad_norm": 1.655281256938517, "language_loss": 0.8598882, "learning_rate": 2.2911579611722253e-07, "loss": 0.87877071, "num_input_tokens_seen": 153053160, "step": 7075, "time_per_iteration": 2.5808792114257812 }, { "auxiliary_loss_clip": 0.01103637, "auxiliary_loss_mlp": 0.01039278, "balance_loss_clip": 1.04070902, "balance_loss_mlp": 1.02381063, "epoch": 0.8508386941622077, "flos": 19025186474880.0, "grad_norm": 1.7363477171916277, "language_loss": 0.87110263, "learning_rate": 2.2875390360745905e-07, "loss": 0.89253181, "num_input_tokens_seen": 153072565, "step": 7076, "time_per_iteration": 2.6042697429656982 }, { "auxiliary_loss_clip": 0.0110149, "auxiliary_loss_mlp": 0.01036157, "balance_loss_clip": 1.04140544, "balance_loss_mlp": 1.01906192, "epoch": 0.8509589370528468, "flos": 16433405654400.0, "grad_norm": 2.1423693525644714, "language_loss": 0.77642918, "learning_rate": 2.2839227979432008e-07, "loss": 0.79780561, "num_input_tokens_seen": 153090215, "step": 7077, "time_per_iteration": 3.5159387588500977 }, { "auxiliary_loss_clip": 0.01106987, "auxiliary_loss_mlp": 0.01036195, "balance_loss_clip": 1.03764629, "balance_loss_mlp": 1.01970792, "epoch": 0.8510791799434858, "flos": 18259714713600.0, "grad_norm": 2.1419032914789042, "language_loss": 0.85284519, "learning_rate": 2.2803092473266373e-07, "loss": 0.874277, "num_input_tokens_seen": 153107740, "step": 7078, "time_per_iteration": 3.452287435531616 }, { "auxiliary_loss_clip": 0.01131992, "auxiliary_loss_mlp": 0.01037068, "balance_loss_clip": 1.04294193, "balance_loss_mlp": 1.02219081, "epoch": 0.851199422834125, "flos": 23441372933760.0, "grad_norm": 4.812640753365149, "language_loss": 0.86767942, "learning_rate": 2.2766983847730724e-07, "loss": 0.88937002, "num_input_tokens_seen": 153127410, "step": 7079, "time_per_iteration": 2.5383522510528564 }, { "auxiliary_loss_clip": 0.01106162, "auxiliary_loss_mlp": 0.01048007, "balance_loss_clip": 1.04147911, "balance_loss_mlp": 1.03032815, "epoch": 0.851319665724764, "flos": 16289404030080.0, "grad_norm": 2.1664387524895745, "language_loss": 0.66650486, "learning_rate": 2.2730902108302663e-07, "loss": 0.68804657, "num_input_tokens_seen": 153144325, "step": 7080, "time_per_iteration": 2.6131749153137207 }, { "auxiliary_loss_clip": 0.01099055, "auxiliary_loss_mlp": 0.01048153, "balance_loss_clip": 1.0371685, "balance_loss_mlp": 1.03028297, "epoch": 0.8514399086154031, "flos": 18989347680000.0, "grad_norm": 3.80237254216847, "language_loss": 0.68855399, "learning_rate": 2.269484726045583e-07, "loss": 0.71002603, "num_input_tokens_seen": 153163240, "step": 7081, "time_per_iteration": 3.5689897537231445 }, { "auxiliary_loss_clip": 0.01097907, "auxiliary_loss_mlp": 0.01035597, "balance_loss_clip": 1.04025602, "balance_loss_mlp": 1.02173889, "epoch": 0.8515601515060423, "flos": 24571194301440.0, "grad_norm": 2.2280816326490256, "language_loss": 0.79203606, "learning_rate": 2.2658819309659672e-07, "loss": 0.81337106, "num_input_tokens_seen": 153183440, "step": 7082, "time_per_iteration": 2.659566879272461 }, { "auxiliary_loss_clip": 0.01099068, "auxiliary_loss_mlp": 0.01038757, "balance_loss_clip": 1.03994966, "balance_loss_mlp": 1.0243926, "epoch": 0.8516803943966813, "flos": 19529443555200.0, "grad_norm": 1.9053031091600643, "language_loss": 0.85102451, "learning_rate": 2.2622818261379706e-07, "loss": 0.87240279, "num_input_tokens_seen": 153200460, "step": 7083, "time_per_iteration": 2.5949759483337402 }, { "auxiliary_loss_clip": 0.01106392, "auxiliary_loss_mlp": 0.01047883, "balance_loss_clip": 1.04187369, "balance_loss_mlp": 1.03128862, "epoch": 0.8518006372873204, "flos": 20265792364800.0, "grad_norm": 1.7301108280711073, "language_loss": 0.75144315, "learning_rate": 2.2586844121077142e-07, "loss": 0.77298594, "num_input_tokens_seen": 153218970, "step": 7084, "time_per_iteration": 3.572504758834839 }, { "auxiliary_loss_clip": 0.01081434, "auxiliary_loss_mlp": 0.01040557, "balance_loss_clip": 1.0369761, "balance_loss_mlp": 1.0247376, "epoch": 0.8519208801779595, "flos": 24133227770880.0, "grad_norm": 2.3465758723719943, "language_loss": 0.72026294, "learning_rate": 2.2550896894209215e-07, "loss": 0.74148285, "num_input_tokens_seen": 153238485, "step": 7085, "time_per_iteration": 2.715344190597534 }, { "auxiliary_loss_clip": 0.00994512, "auxiliary_loss_mlp": 0.01002684, "balance_loss_clip": 1.01328111, "balance_loss_mlp": 1.0009793, "epoch": 0.8520411230685986, "flos": 63035223252480.0, "grad_norm": 0.6812284930511692, "language_loss": 0.56610137, "learning_rate": 2.2514976586229184e-07, "loss": 0.58607328, "num_input_tokens_seen": 153306430, "step": 7086, "time_per_iteration": 3.4357619285583496 }, { "auxiliary_loss_clip": 0.01024702, "auxiliary_loss_mlp": 0.0099948, "balance_loss_clip": 1.00623405, "balance_loss_mlp": 0.99800819, "epoch": 0.8521613659592376, "flos": 65836865283840.0, "grad_norm": 0.7528834178974255, "language_loss": 0.54618943, "learning_rate": 2.247908320258609e-07, "loss": 0.56643116, "num_input_tokens_seen": 153366520, "step": 7087, "time_per_iteration": 3.342059373855591 }, { "auxiliary_loss_clip": 0.01077421, "auxiliary_loss_mlp": 0.01040855, "balance_loss_clip": 1.038028, "balance_loss_mlp": 1.02511907, "epoch": 0.8522816088498768, "flos": 23112323418240.0, "grad_norm": 2.523789519873095, "language_loss": 0.79713714, "learning_rate": 2.2443216748724914e-07, "loss": 0.81831992, "num_input_tokens_seen": 153387230, "step": 7088, "time_per_iteration": 2.7062764167785645 }, { "auxiliary_loss_clip": 0.01121819, "auxiliary_loss_mlp": 0.00771754, "balance_loss_clip": 1.04284561, "balance_loss_mlp": 1.00050712, "epoch": 0.8524018517405159, "flos": 31758140073600.0, "grad_norm": 2.2157629412059223, "language_loss": 0.74700367, "learning_rate": 2.2407377230086588e-07, "loss": 0.76593941, "num_input_tokens_seen": 153409585, "step": 7089, "time_per_iteration": 2.710710048675537 }, { "auxiliary_loss_clip": 0.01090998, "auxiliary_loss_mlp": 0.01042803, "balance_loss_clip": 1.04005682, "balance_loss_mlp": 1.02734137, "epoch": 0.8525220946311549, "flos": 18690318956160.0, "grad_norm": 1.8991726192185883, "language_loss": 0.83764732, "learning_rate": 2.23715646521079e-07, "loss": 0.85898525, "num_input_tokens_seen": 153427105, "step": 7090, "time_per_iteration": 2.6191751956939697 }, { "auxiliary_loss_clip": 0.01124766, "auxiliary_loss_mlp": 0.00772453, "balance_loss_clip": 1.0424819, "balance_loss_mlp": 1.00046325, "epoch": 0.852642337521794, "flos": 21793216354560.0, "grad_norm": 1.9348023696186953, "language_loss": 0.84037006, "learning_rate": 2.2335779020221724e-07, "loss": 0.85934222, "num_input_tokens_seen": 153443725, "step": 7091, "time_per_iteration": 2.6011464595794678 }, { "auxiliary_loss_clip": 0.01021605, "auxiliary_loss_mlp": 0.01006862, "balance_loss_clip": 1.00868785, "balance_loss_mlp": 1.0050019, "epoch": 0.8527625804124331, "flos": 69040132260480.0, "grad_norm": 0.7982426961950438, "language_loss": 0.56423855, "learning_rate": 2.2300020339856497e-07, "loss": 0.5845232, "num_input_tokens_seen": 153506410, "step": 7092, "time_per_iteration": 3.193592071533203 }, { "auxiliary_loss_clip": 0.01104067, "auxiliary_loss_mlp": 0.01043043, "balance_loss_clip": 1.03867471, "balance_loss_mlp": 1.02786231, "epoch": 0.8528828233030722, "flos": 26979399688320.0, "grad_norm": 2.5162047531460887, "language_loss": 0.78374636, "learning_rate": 2.2264288616436966e-07, "loss": 0.8052175, "num_input_tokens_seen": 153526665, "step": 7093, "time_per_iteration": 2.636892318725586 }, { "auxiliary_loss_clip": 0.01102934, "auxiliary_loss_mlp": 0.01035061, "balance_loss_clip": 1.03918767, "balance_loss_mlp": 1.01764429, "epoch": 0.8530030661937112, "flos": 17487598936320.0, "grad_norm": 1.989106250689838, "language_loss": 0.72604531, "learning_rate": 2.222858385538351e-07, "loss": 0.74742532, "num_input_tokens_seen": 153543465, "step": 7094, "time_per_iteration": 2.61769962310791 }, { "auxiliary_loss_clip": 0.01115984, "auxiliary_loss_mlp": 0.01034529, "balance_loss_clip": 1.03996432, "balance_loss_mlp": 1.01963961, "epoch": 0.8531233090843504, "flos": 22160798184960.0, "grad_norm": 2.139210501531769, "language_loss": 0.67992973, "learning_rate": 2.2192906062112527e-07, "loss": 0.70143491, "num_input_tokens_seen": 153563340, "step": 7095, "time_per_iteration": 2.5885183811187744 }, { "auxiliary_loss_clip": 0.0113206, "auxiliary_loss_mlp": 0.01042473, "balance_loss_clip": 1.04279542, "balance_loss_mlp": 1.02647495, "epoch": 0.8532435519749895, "flos": 37635388145280.0, "grad_norm": 1.728557593678196, "language_loss": 0.70755011, "learning_rate": 2.2157255242036377e-07, "loss": 0.72929549, "num_input_tokens_seen": 153587005, "step": 7096, "time_per_iteration": 2.735605239868164 }, { "auxiliary_loss_clip": 0.01089799, "auxiliary_loss_mlp": 0.01042831, "balance_loss_clip": 1.03821826, "balance_loss_mlp": 1.02715516, "epoch": 0.8533637948656285, "flos": 21398163598080.0, "grad_norm": 1.6816005709749045, "language_loss": 0.74405032, "learning_rate": 2.2121631400563135e-07, "loss": 0.76537669, "num_input_tokens_seen": 153606835, "step": 7097, "time_per_iteration": 2.6500766277313232 }, { "auxiliary_loss_clip": 0.01021909, "auxiliary_loss_mlp": 0.01003726, "balance_loss_clip": 1.00734353, "balance_loss_mlp": 1.00214005, "epoch": 0.8534840377562677, "flos": 53345122490880.0, "grad_norm": 0.7779380052639815, "language_loss": 0.5293541, "learning_rate": 2.208603454309701e-07, "loss": 0.54961038, "num_input_tokens_seen": 153664925, "step": 7098, "time_per_iteration": 3.14404296875 }, { "auxiliary_loss_clip": 0.01084723, "auxiliary_loss_mlp": 0.01041808, "balance_loss_clip": 1.03772438, "balance_loss_mlp": 1.02591765, "epoch": 0.8536042806469067, "flos": 20814148368000.0, "grad_norm": 2.4400753571369997, "language_loss": 0.71234381, "learning_rate": 2.2050464675037994e-07, "loss": 0.7336092, "num_input_tokens_seen": 153683550, "step": 7099, "time_per_iteration": 2.683250665664673 }, { "auxiliary_loss_clip": 0.01110432, "auxiliary_loss_mlp": 0.01036618, "balance_loss_clip": 1.0418812, "balance_loss_mlp": 1.01995218, "epoch": 0.8537245235375458, "flos": 24681368292480.0, "grad_norm": 2.8667074100872245, "language_loss": 0.72828293, "learning_rate": 2.2014921801782016e-07, "loss": 0.74975336, "num_input_tokens_seen": 153703040, "step": 7100, "time_per_iteration": 2.6249380111694336 }, { "auxiliary_loss_clip": 0.01105139, "auxiliary_loss_mlp": 0.01037797, "balance_loss_clip": 1.03693104, "balance_loss_mlp": 1.02246666, "epoch": 0.853844766428185, "flos": 24384817607040.0, "grad_norm": 2.5586511415533555, "language_loss": 0.74281257, "learning_rate": 2.1979405928720872e-07, "loss": 0.76424193, "num_input_tokens_seen": 153722695, "step": 7101, "time_per_iteration": 2.6297473907470703 }, { "auxiliary_loss_clip": 0.01107853, "auxiliary_loss_mlp": 0.01034801, "balance_loss_clip": 1.04069626, "balance_loss_mlp": 1.01976871, "epoch": 0.853965009318824, "flos": 20955707867520.0, "grad_norm": 1.6560849907259678, "language_loss": 0.79405296, "learning_rate": 2.1943917061242257e-07, "loss": 0.81547952, "num_input_tokens_seen": 153742550, "step": 7102, "time_per_iteration": 2.626906156539917 }, { "auxiliary_loss_clip": 0.01129067, "auxiliary_loss_mlp": 0.00774057, "balance_loss_clip": 1.0433439, "balance_loss_mlp": 1.00052083, "epoch": 0.8540852522094631, "flos": 24201816791040.0, "grad_norm": 1.8255459637068499, "language_loss": 0.66481984, "learning_rate": 2.1908455204729903e-07, "loss": 0.683851, "num_input_tokens_seen": 153761700, "step": 7103, "time_per_iteration": 3.5221986770629883 }, { "auxiliary_loss_clip": 0.01107765, "auxiliary_loss_mlp": 0.01046047, "balance_loss_clip": 1.03991818, "balance_loss_mlp": 1.02852345, "epoch": 0.8542054951001022, "flos": 25082921410560.0, "grad_norm": 2.9083830004626416, "language_loss": 0.78374457, "learning_rate": 2.1873020364563265e-07, "loss": 0.80528265, "num_input_tokens_seen": 153780765, "step": 7104, "time_per_iteration": 3.903028726577759 }, { "auxiliary_loss_clip": 0.01116833, "auxiliary_loss_mlp": 0.0103907, "balance_loss_clip": 1.04249167, "balance_loss_mlp": 1.02528346, "epoch": 0.8543257379907413, "flos": 24316551809280.0, "grad_norm": 3.1473218252458732, "language_loss": 0.76050431, "learning_rate": 2.183761254611789e-07, "loss": 0.78206331, "num_input_tokens_seen": 153801090, "step": 7105, "time_per_iteration": 2.620671272277832 }, { "auxiliary_loss_clip": 0.01116891, "auxiliary_loss_mlp": 0.01034709, "balance_loss_clip": 1.04117131, "balance_loss_mlp": 1.01974773, "epoch": 0.8544459808813804, "flos": 55286630467200.0, "grad_norm": 1.8658457983863916, "language_loss": 0.70192897, "learning_rate": 2.1802231754764987e-07, "loss": 0.72344494, "num_input_tokens_seen": 153826530, "step": 7106, "time_per_iteration": 2.8792591094970703 }, { "auxiliary_loss_clip": 0.01107199, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 1.03871107, "balance_loss_mlp": 1.01927662, "epoch": 0.8545662237720195, "flos": 25776248705280.0, "grad_norm": 1.8959653205583609, "language_loss": 0.76323253, "learning_rate": 2.17668779958718e-07, "loss": 0.78464937, "num_input_tokens_seen": 153849110, "step": 7107, "time_per_iteration": 3.6179778575897217 }, { "auxiliary_loss_clip": 0.01132973, "auxiliary_loss_mlp": 0.01038276, "balance_loss_clip": 1.044595, "balance_loss_mlp": 1.02249312, "epoch": 0.8546864666626586, "flos": 11108320427520.0, "grad_norm": 2.260880459285346, "language_loss": 0.80814993, "learning_rate": 2.1731551274801553e-07, "loss": 0.82986242, "num_input_tokens_seen": 153865550, "step": 7108, "time_per_iteration": 2.5171101093292236 }, { "auxiliary_loss_clip": 0.01104983, "auxiliary_loss_mlp": 0.01038445, "balance_loss_clip": 1.04141748, "balance_loss_mlp": 1.02298379, "epoch": 0.8548067095532976, "flos": 25520169669120.0, "grad_norm": 2.353071155071555, "language_loss": 0.62207484, "learning_rate": 2.169625159691324e-07, "loss": 0.64350915, "num_input_tokens_seen": 153885425, "step": 7109, "time_per_iteration": 2.617616891860962 }, { "auxiliary_loss_clip": 0.01090749, "auxiliary_loss_mlp": 0.01036834, "balance_loss_clip": 1.03951311, "balance_loss_mlp": 1.02132499, "epoch": 0.8549269524439368, "flos": 24717853532160.0, "grad_norm": 2.7307417978061124, "language_loss": 0.74090964, "learning_rate": 2.1660978967561784e-07, "loss": 0.76218545, "num_input_tokens_seen": 153904760, "step": 7110, "time_per_iteration": 3.5980632305145264 }, { "auxiliary_loss_clip": 0.01128841, "auxiliary_loss_mlp": 0.01034003, "balance_loss_clip": 1.04009032, "balance_loss_mlp": 1.01858938, "epoch": 0.8550471953345758, "flos": 19825599191040.0, "grad_norm": 2.5753498185723314, "language_loss": 0.78744984, "learning_rate": 2.1625733392098035e-07, "loss": 0.8090784, "num_input_tokens_seen": 153920370, "step": 7111, "time_per_iteration": 2.524782180786133 }, { "auxiliary_loss_clip": 0.01130005, "auxiliary_loss_mlp": 0.01038968, "balance_loss_clip": 1.04274821, "balance_loss_mlp": 1.02467477, "epoch": 0.8551674382252149, "flos": 22820441500800.0, "grad_norm": 1.8389425310552543, "language_loss": 0.79610872, "learning_rate": 2.159051487586867e-07, "loss": 0.81779838, "num_input_tokens_seen": 153940500, "step": 7112, "time_per_iteration": 2.547837257385254 }, { "auxiliary_loss_clip": 0.01115784, "auxiliary_loss_mlp": 0.01042682, "balance_loss_clip": 1.04612219, "balance_loss_mlp": 1.02652955, "epoch": 0.8552876811158541, "flos": 20631255292800.0, "grad_norm": 2.0994311934370335, "language_loss": 0.72850078, "learning_rate": 2.155532342421642e-07, "loss": 0.75008547, "num_input_tokens_seen": 153958500, "step": 7113, "time_per_iteration": 2.612776279449463 }, { "auxiliary_loss_clip": 0.01122747, "auxiliary_loss_mlp": 0.01043245, "balance_loss_clip": 1.04282987, "balance_loss_mlp": 1.0275805, "epoch": 0.8554079240064931, "flos": 23112359331840.0, "grad_norm": 1.6514437141885125, "language_loss": 0.78162414, "learning_rate": 2.1520159042479636e-07, "loss": 0.80328405, "num_input_tokens_seen": 153976790, "step": 7114, "time_per_iteration": 2.5818116664886475 }, { "auxiliary_loss_clip": 0.01120675, "auxiliary_loss_mlp": 0.0103427, "balance_loss_clip": 1.04345632, "balance_loss_mlp": 1.01877308, "epoch": 0.8555281668971322, "flos": 22128047959680.0, "grad_norm": 2.3090261859933405, "language_loss": 0.71011031, "learning_rate": 2.148502173599287e-07, "loss": 0.73165977, "num_input_tokens_seen": 153994930, "step": 7115, "time_per_iteration": 2.5853898525238037 }, { "auxiliary_loss_clip": 0.01101392, "auxiliary_loss_mlp": 0.01034816, "balance_loss_clip": 1.03982544, "balance_loss_mlp": 1.01962852, "epoch": 0.8556484097877713, "flos": 31139040234240.0, "grad_norm": 2.0218600853265505, "language_loss": 0.65772402, "learning_rate": 2.1449911510086372e-07, "loss": 0.67908609, "num_input_tokens_seen": 154014400, "step": 7116, "time_per_iteration": 2.688267469406128 }, { "auxiliary_loss_clip": 0.01118687, "auxiliary_loss_mlp": 0.01035147, "balance_loss_clip": 1.04249096, "balance_loss_mlp": 1.0201509, "epoch": 0.8557686526784104, "flos": 24316551809280.0, "grad_norm": 1.7983334597549856, "language_loss": 0.77072799, "learning_rate": 2.141482837008628e-07, "loss": 0.79226637, "num_input_tokens_seen": 154034940, "step": 7117, "time_per_iteration": 2.6166365146636963 }, { "auxiliary_loss_clip": 0.01113029, "auxiliary_loss_mlp": 0.01035464, "balance_loss_clip": 1.04069018, "balance_loss_mlp": 1.01846457, "epoch": 0.8558888955690495, "flos": 17712723427200.0, "grad_norm": 2.734275876950648, "language_loss": 0.72060513, "learning_rate": 2.1379772321314826e-07, "loss": 0.74209011, "num_input_tokens_seen": 154052985, "step": 7118, "time_per_iteration": 2.534334421157837 }, { "auxiliary_loss_clip": 0.01059256, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.03518736, "balance_loss_mlp": 1.02874553, "epoch": 0.8560091384596886, "flos": 19171702051200.0, "grad_norm": 2.3399502358141206, "language_loss": 0.81573725, "learning_rate": 2.1344743369089802e-07, "loss": 0.8367728, "num_input_tokens_seen": 154068765, "step": 7119, "time_per_iteration": 2.6881940364837646 }, { "auxiliary_loss_clip": 0.01105423, "auxiliary_loss_mlp": 0.01035788, "balance_loss_clip": 1.04204369, "balance_loss_mlp": 1.02174485, "epoch": 0.8561293813503277, "flos": 23914855036800.0, "grad_norm": 1.6629519188622166, "language_loss": 0.81995511, "learning_rate": 2.130974151872522e-07, "loss": 0.84136724, "num_input_tokens_seen": 154089100, "step": 7120, "time_per_iteration": 2.6050844192504883 }, { "auxiliary_loss_clip": 0.01093743, "auxiliary_loss_mlp": 0.01036736, "balance_loss_clip": 1.03909421, "balance_loss_mlp": 1.0217278, "epoch": 0.8562496242409667, "flos": 22529206028160.0, "grad_norm": 1.6236826693379534, "language_loss": 0.787305, "learning_rate": 2.1274766775530773e-07, "loss": 0.80860972, "num_input_tokens_seen": 154108965, "step": 7121, "time_per_iteration": 2.618372678756714 }, { "auxiliary_loss_clip": 0.01136036, "auxiliary_loss_mlp": 0.01035112, "balance_loss_clip": 1.04276502, "balance_loss_mlp": 1.01943576, "epoch": 0.8563698671316058, "flos": 14712745472640.0, "grad_norm": 2.8014417704336747, "language_loss": 0.79559094, "learning_rate": 2.1239819144812077e-07, "loss": 0.81730241, "num_input_tokens_seen": 154123425, "step": 7122, "time_per_iteration": 2.5356321334838867 }, { "auxiliary_loss_clip": 0.01089261, "auxiliary_loss_mlp": 0.01045782, "balance_loss_clip": 1.03634143, "balance_loss_mlp": 1.02930701, "epoch": 0.856490110022245, "flos": 39167768211840.0, "grad_norm": 3.804194984359947, "language_loss": 0.69861197, "learning_rate": 2.1204898631870716e-07, "loss": 0.71996236, "num_input_tokens_seen": 154148315, "step": 7123, "time_per_iteration": 2.788025140762329 }, { "auxiliary_loss_clip": 0.01107525, "auxiliary_loss_mlp": 0.01032431, "balance_loss_clip": 1.04209781, "balance_loss_mlp": 1.01687407, "epoch": 0.856610352912884, "flos": 29059345658880.0, "grad_norm": 1.7896089941813667, "language_loss": 0.76263654, "learning_rate": 2.1170005242004006e-07, "loss": 0.7840361, "num_input_tokens_seen": 154169665, "step": 7124, "time_per_iteration": 2.704422950744629 }, { "auxiliary_loss_clip": 0.01110368, "auxiliary_loss_mlp": 0.01041401, "balance_loss_clip": 1.04175806, "balance_loss_mlp": 1.02729273, "epoch": 0.8567305958035231, "flos": 23878333883520.0, "grad_norm": 1.808750189392322, "language_loss": 0.77873302, "learning_rate": 2.1135138980505384e-07, "loss": 0.80025065, "num_input_tokens_seen": 154190335, "step": 7125, "time_per_iteration": 2.7131729125976562 }, { "auxiliary_loss_clip": 0.01104646, "auxiliary_loss_mlp": 0.01041906, "balance_loss_clip": 1.04210556, "balance_loss_mlp": 1.02595568, "epoch": 0.8568508386941622, "flos": 22200120599040.0, "grad_norm": 1.672942663623604, "language_loss": 0.72609806, "learning_rate": 2.110029985266395e-07, "loss": 0.74756366, "num_input_tokens_seen": 154210040, "step": 7126, "time_per_iteration": 2.7302894592285156 }, { "auxiliary_loss_clip": 0.0110852, "auxiliary_loss_mlp": 0.01035514, "balance_loss_clip": 1.0410254, "balance_loss_mlp": 1.02015972, "epoch": 0.8569710815848013, "flos": 17307507121920.0, "grad_norm": 1.883922238866923, "language_loss": 0.7357012, "learning_rate": 2.1065487863764787e-07, "loss": 0.75714147, "num_input_tokens_seen": 154228385, "step": 7127, "time_per_iteration": 2.7225289344787598 }, { "auxiliary_loss_clip": 0.01071717, "auxiliary_loss_mlp": 0.01047571, "balance_loss_clip": 1.03305721, "balance_loss_mlp": 1.03067875, "epoch": 0.8570913244754403, "flos": 23732285184000.0, "grad_norm": 1.605936288471791, "language_loss": 0.85698748, "learning_rate": 2.1030703019088846e-07, "loss": 0.87818038, "num_input_tokens_seen": 154249015, "step": 7128, "time_per_iteration": 2.8170900344848633 }, { "auxiliary_loss_clip": 0.01113305, "auxiliary_loss_mlp": 0.01036554, "balance_loss_clip": 1.04038334, "balance_loss_mlp": 1.02212977, "epoch": 0.8572115673660795, "flos": 20048748433920.0, "grad_norm": 1.7635111822684828, "language_loss": 0.70970118, "learning_rate": 2.099594532391291e-07, "loss": 0.73119974, "num_input_tokens_seen": 154267700, "step": 7129, "time_per_iteration": 3.6766552925109863 }, { "auxiliary_loss_clip": 0.01110406, "auxiliary_loss_mlp": 0.01037636, "balance_loss_clip": 1.03995347, "balance_loss_mlp": 1.0222466, "epoch": 0.8573318102567186, "flos": 27160389342720.0, "grad_norm": 2.452613251400858, "language_loss": 0.79164809, "learning_rate": 2.0961214783509806e-07, "loss": 0.81312853, "num_input_tokens_seen": 154290580, "step": 7130, "time_per_iteration": 3.582226514816284 }, { "auxiliary_loss_clip": 0.01112229, "auxiliary_loss_mlp": 0.01038285, "balance_loss_clip": 1.04133677, "balance_loss_mlp": 1.02350307, "epoch": 0.8574520531473576, "flos": 24936585402240.0, "grad_norm": 1.8683859182233975, "language_loss": 0.74672198, "learning_rate": 2.0926511403148051e-07, "loss": 0.7682271, "num_input_tokens_seen": 154309545, "step": 7131, "time_per_iteration": 2.709423065185547 }, { "auxiliary_loss_clip": 0.01102474, "auxiliary_loss_mlp": 0.01037277, "balance_loss_clip": 1.04295361, "balance_loss_mlp": 1.0215894, "epoch": 0.8575722960379968, "flos": 18771154513920.0, "grad_norm": 8.508402314256506, "language_loss": 0.75930023, "learning_rate": 2.0891835188092143e-07, "loss": 0.78069776, "num_input_tokens_seen": 154326545, "step": 7132, "time_per_iteration": 2.6331567764282227 }, { "auxiliary_loss_clip": 0.01100034, "auxiliary_loss_mlp": 0.01038347, "balance_loss_clip": 1.04067338, "balance_loss_mlp": 1.0225637, "epoch": 0.8576925389286358, "flos": 22200300167040.0, "grad_norm": 2.001605777589468, "language_loss": 0.81182986, "learning_rate": 2.0857186143602434e-07, "loss": 0.83321369, "num_input_tokens_seen": 154345190, "step": 7133, "time_per_iteration": 3.691842794418335 }, { "auxiliary_loss_clip": 0.01087276, "auxiliary_loss_mlp": 0.01032781, "balance_loss_clip": 1.03724337, "balance_loss_mlp": 1.01658034, "epoch": 0.8578127818192749, "flos": 22894345733760.0, "grad_norm": 1.8355351772525097, "language_loss": 0.67445534, "learning_rate": 2.0822564274935094e-07, "loss": 0.69565594, "num_input_tokens_seen": 154364615, "step": 7134, "time_per_iteration": 2.6681854724884033 }, { "auxiliary_loss_clip": 0.01107273, "auxiliary_loss_mlp": 0.01038418, "balance_loss_clip": 1.04250932, "balance_loss_mlp": 1.02162123, "epoch": 0.8579330247099141, "flos": 34824839541120.0, "grad_norm": 3.072716144652211, "language_loss": 0.67143869, "learning_rate": 2.078796958734239e-07, "loss": 0.69289559, "num_input_tokens_seen": 154387335, "step": 7135, "time_per_iteration": 2.7311651706695557 }, { "auxiliary_loss_clip": 0.01120676, "auxiliary_loss_mlp": 0.01036017, "balance_loss_clip": 1.04205275, "balance_loss_mlp": 1.02073479, "epoch": 0.8580532676005531, "flos": 19755681367680.0, "grad_norm": 3.6128643142467762, "language_loss": 0.75086272, "learning_rate": 2.0753402086072124e-07, "loss": 0.77242959, "num_input_tokens_seen": 154405965, "step": 7136, "time_per_iteration": 3.4854001998901367 }, { "auxiliary_loss_clip": 0.01074758, "auxiliary_loss_mlp": 0.01044969, "balance_loss_clip": 1.04252589, "balance_loss_mlp": 1.02818465, "epoch": 0.8581735104911922, "flos": 22739318634240.0, "grad_norm": 2.1499885538317822, "language_loss": 0.74932277, "learning_rate": 2.071886177636828e-07, "loss": 0.77052009, "num_input_tokens_seen": 154422750, "step": 7137, "time_per_iteration": 2.890674591064453 }, { "auxiliary_loss_clip": 0.01116883, "auxiliary_loss_mlp": 0.01035018, "balance_loss_clip": 1.04065156, "balance_loss_mlp": 1.01950884, "epoch": 0.8582937533818313, "flos": 23149131880320.0, "grad_norm": 1.8257326706697923, "language_loss": 0.83078408, "learning_rate": 2.0684348663470575e-07, "loss": 0.85230303, "num_input_tokens_seen": 154442930, "step": 7138, "time_per_iteration": 2.7575361728668213 }, { "auxiliary_loss_clip": 0.01104775, "auxiliary_loss_mlp": 0.01044446, "balance_loss_clip": 1.03661656, "balance_loss_mlp": 1.02730346, "epoch": 0.8584139962724704, "flos": 19498668577920.0, "grad_norm": 1.7982136426697672, "language_loss": 0.61808032, "learning_rate": 2.0649862752614555e-07, "loss": 0.6395725, "num_input_tokens_seen": 154461640, "step": 7139, "time_per_iteration": 2.6532833576202393 }, { "auxiliary_loss_clip": 0.01017645, "auxiliary_loss_mlp": 0.01004551, "balance_loss_clip": 1.00802708, "balance_loss_mlp": 1.00271499, "epoch": 0.8585342391631094, "flos": 71276577788160.0, "grad_norm": 0.7506033594499383, "language_loss": 0.57020563, "learning_rate": 2.0615404049031838e-07, "loss": 0.59042764, "num_input_tokens_seen": 154518610, "step": 7140, "time_per_iteration": 3.228053569793701 }, { "auxiliary_loss_clip": 0.01120128, "auxiliary_loss_mlp": 0.01047086, "balance_loss_clip": 1.04079318, "balance_loss_mlp": 1.02890682, "epoch": 0.8586544820537486, "flos": 10815432929280.0, "grad_norm": 3.49987114951155, "language_loss": 0.77865738, "learning_rate": 2.0580972557949616e-07, "loss": 0.80032957, "num_input_tokens_seen": 154533700, "step": 7141, "time_per_iteration": 2.5874032974243164 }, { "auxiliary_loss_clip": 0.0102604, "auxiliary_loss_mlp": 0.01024709, "balance_loss_clip": 1.00681639, "balance_loss_mlp": 1.0227778, "epoch": 0.8587747249443877, "flos": 64811184422400.0, "grad_norm": 0.8093554922184739, "language_loss": 0.54262727, "learning_rate": 2.054656828459125e-07, "loss": 0.56313479, "num_input_tokens_seen": 154597810, "step": 7142, "time_per_iteration": 3.1967601776123047 }, { "auxiliary_loss_clip": 0.0107742, "auxiliary_loss_mlp": 0.01043129, "balance_loss_clip": 1.03591299, "balance_loss_mlp": 1.02591562, "epoch": 0.8588949678350267, "flos": 26834607964800.0, "grad_norm": 2.434577577383918, "language_loss": 0.77515018, "learning_rate": 2.051219123417578e-07, "loss": 0.79635566, "num_input_tokens_seen": 154617870, "step": 7143, "time_per_iteration": 2.7115607261657715 }, { "auxiliary_loss_clip": 0.01134085, "auxiliary_loss_mlp": 0.01038284, "balance_loss_clip": 1.04209483, "balance_loss_mlp": 1.02221513, "epoch": 0.8590152107256659, "flos": 26104256726400.0, "grad_norm": 1.902887472533197, "language_loss": 0.60244244, "learning_rate": 2.0477841411918196e-07, "loss": 0.62416613, "num_input_tokens_seen": 154637395, "step": 7144, "time_per_iteration": 2.595773696899414 }, { "auxiliary_loss_clip": 0.01115505, "auxiliary_loss_mlp": 0.01035728, "balance_loss_clip": 1.04056156, "balance_loss_mlp": 1.02090979, "epoch": 0.859135453616305, "flos": 26140885620480.0, "grad_norm": 2.0701689100629195, "language_loss": 0.74533594, "learning_rate": 2.0443518823029326e-07, "loss": 0.76684821, "num_input_tokens_seen": 154657935, "step": 7145, "time_per_iteration": 2.6725573539733887 }, { "auxiliary_loss_clip": 0.01090934, "auxiliary_loss_mlp": 0.01036478, "balance_loss_clip": 1.03957105, "balance_loss_mlp": 1.02062941, "epoch": 0.859255696506944, "flos": 12969319046400.0, "grad_norm": 2.0746407403422937, "language_loss": 0.76503474, "learning_rate": 2.0409223472715854e-07, "loss": 0.78630883, "num_input_tokens_seen": 154675080, "step": 7146, "time_per_iteration": 2.6191794872283936 }, { "auxiliary_loss_clip": 0.0109521, "auxiliary_loss_mlp": 0.00771137, "balance_loss_clip": 1.03899133, "balance_loss_mlp": 1.00051045, "epoch": 0.8593759393975832, "flos": 18475753063680.0, "grad_norm": 2.2898942685208765, "language_loss": 0.74876189, "learning_rate": 2.0374955366180434e-07, "loss": 0.7674253, "num_input_tokens_seen": 154692720, "step": 7147, "time_per_iteration": 2.6667253971099854 }, { "auxiliary_loss_clip": 0.01098026, "auxiliary_loss_mlp": 0.01033782, "balance_loss_clip": 1.03812242, "balance_loss_mlp": 1.01797521, "epoch": 0.8594961822882222, "flos": 22200156512640.0, "grad_norm": 1.717506504278959, "language_loss": 0.72692341, "learning_rate": 2.034071450862147e-07, "loss": 0.74824148, "num_input_tokens_seen": 154710190, "step": 7148, "time_per_iteration": 2.7057619094848633 }, { "auxiliary_loss_clip": 0.01108897, "auxiliary_loss_mlp": 0.01047747, "balance_loss_clip": 1.03850722, "balance_loss_mlp": 1.03141499, "epoch": 0.8596164251788613, "flos": 23294749616640.0, "grad_norm": 1.681093397907211, "language_loss": 0.76645595, "learning_rate": 2.030650090523327e-07, "loss": 0.7880224, "num_input_tokens_seen": 154729380, "step": 7149, "time_per_iteration": 2.6645264625549316 }, { "auxiliary_loss_clip": 0.01093057, "auxiliary_loss_mlp": 0.01045046, "balance_loss_clip": 1.03858757, "balance_loss_mlp": 1.02963209, "epoch": 0.8597366680695004, "flos": 31649905416960.0, "grad_norm": 1.6737015278815415, "language_loss": 0.59296238, "learning_rate": 2.0272314561205995e-07, "loss": 0.6143434, "num_input_tokens_seen": 154749775, "step": 7150, "time_per_iteration": 2.7556798458099365 }, { "auxiliary_loss_clip": 0.01087498, "auxiliary_loss_mlp": 0.01036484, "balance_loss_clip": 1.03543222, "balance_loss_mlp": 1.02101016, "epoch": 0.8598569109601395, "flos": 21287738211840.0, "grad_norm": 1.867874098313425, "language_loss": 0.73002833, "learning_rate": 2.023815548172567e-07, "loss": 0.75126815, "num_input_tokens_seen": 154769845, "step": 7151, "time_per_iteration": 2.6801021099090576 }, { "auxiliary_loss_clip": 0.01121862, "auxiliary_loss_mlp": 0.010414, "balance_loss_clip": 1.0413301, "balance_loss_mlp": 1.02667761, "epoch": 0.8599771538507786, "flos": 25447809720960.0, "grad_norm": 2.2826804101788802, "language_loss": 0.65970004, "learning_rate": 2.0204023671974267e-07, "loss": 0.68133271, "num_input_tokens_seen": 154789230, "step": 7152, "time_per_iteration": 2.640028715133667 }, { "auxiliary_loss_clip": 0.01111392, "auxiliary_loss_mlp": 0.01038117, "balance_loss_clip": 1.03722346, "balance_loss_mlp": 1.02337074, "epoch": 0.8600973967414177, "flos": 16723958768640.0, "grad_norm": 2.168192173259634, "language_loss": 0.81119037, "learning_rate": 2.0169919137129532e-07, "loss": 0.83268547, "num_input_tokens_seen": 154807670, "step": 7153, "time_per_iteration": 2.5771901607513428 }, { "auxiliary_loss_clip": 0.01123022, "auxiliary_loss_mlp": 0.01040514, "balance_loss_clip": 1.04334784, "balance_loss_mlp": 1.02448058, "epoch": 0.8602176396320568, "flos": 25227928615680.0, "grad_norm": 2.7887111704606293, "language_loss": 0.71015751, "learning_rate": 2.013584188236508e-07, "loss": 0.73179293, "num_input_tokens_seen": 154825575, "step": 7154, "time_per_iteration": 2.6128268241882324 }, { "auxiliary_loss_clip": 0.01130485, "auxiliary_loss_mlp": 0.01036967, "balance_loss_clip": 1.04000235, "balance_loss_mlp": 1.0213387, "epoch": 0.8603378825226958, "flos": 20412236113920.0, "grad_norm": 2.5721079999810073, "language_loss": 0.79355049, "learning_rate": 2.0101791912850396e-07, "loss": 0.81522501, "num_input_tokens_seen": 154845115, "step": 7155, "time_per_iteration": 3.534764289855957 }, { "auxiliary_loss_clip": 0.01111214, "auxiliary_loss_mlp": 0.01041522, "balance_loss_clip": 1.04373467, "balance_loss_mlp": 1.02515435, "epoch": 0.8604581254133349, "flos": 34930201109760.0, "grad_norm": 39.68429475881568, "language_loss": 0.6374557, "learning_rate": 2.006776923375082e-07, "loss": 0.65898311, "num_input_tokens_seen": 154866770, "step": 7156, "time_per_iteration": 3.6770918369293213 }, { "auxiliary_loss_clip": 0.01131124, "auxiliary_loss_mlp": 0.01033439, "balance_loss_clip": 1.04278696, "balance_loss_mlp": 1.01841879, "epoch": 0.860578368303974, "flos": 22596538072320.0, "grad_norm": 1.600464661609129, "language_loss": 0.71401763, "learning_rate": 2.003377385022764e-07, "loss": 0.73566324, "num_input_tokens_seen": 154885595, "step": 7157, "time_per_iteration": 2.5483059883117676 }, { "auxiliary_loss_clip": 0.01109975, "auxiliary_loss_mlp": 0.01035495, "balance_loss_clip": 1.04171622, "balance_loss_mlp": 1.01979494, "epoch": 0.8606986111946131, "flos": 21324331192320.0, "grad_norm": 2.035999675442914, "language_loss": 0.77381474, "learning_rate": 1.9999805767437826e-07, "loss": 0.79526937, "num_input_tokens_seen": 154904485, "step": 7158, "time_per_iteration": 2.6064648628234863 }, { "auxiliary_loss_clip": 0.01101466, "auxiliary_loss_mlp": 0.01054114, "balance_loss_clip": 1.03906512, "balance_loss_mlp": 1.03790784, "epoch": 0.8608188540852522, "flos": 28877206769280.0, "grad_norm": 2.105834114996182, "language_loss": 0.71565706, "learning_rate": 1.9965864990534386e-07, "loss": 0.7372129, "num_input_tokens_seen": 154925010, "step": 7159, "time_per_iteration": 3.6684021949768066 }, { "auxiliary_loss_clip": 0.0108632, "auxiliary_loss_mlp": 0.01044931, "balance_loss_clip": 1.03512907, "balance_loss_mlp": 1.02832532, "epoch": 0.8609390969758913, "flos": 29716187713920.0, "grad_norm": 1.876124280537077, "language_loss": 0.77937371, "learning_rate": 1.9931951524666092e-07, "loss": 0.80068624, "num_input_tokens_seen": 154946100, "step": 7160, "time_per_iteration": 2.7041330337524414 }, { "auxiliary_loss_clip": 0.01123539, "auxiliary_loss_mlp": 0.00772181, "balance_loss_clip": 1.04346526, "balance_loss_mlp": 1.00040245, "epoch": 0.8610593398665304, "flos": 21249349551360.0, "grad_norm": 1.6937474278943283, "language_loss": 0.81235278, "learning_rate": 1.9898065374977534e-07, "loss": 0.83130991, "num_input_tokens_seen": 154966305, "step": 7161, "time_per_iteration": 2.6082406044006348 }, { "auxiliary_loss_clip": 0.01088912, "auxiliary_loss_mlp": 0.01031611, "balance_loss_clip": 1.03897119, "balance_loss_mlp": 1.0179143, "epoch": 0.8611795827571694, "flos": 14830102183680.0, "grad_norm": 2.2170117936855624, "language_loss": 0.73406029, "learning_rate": 1.9864206546609342e-07, "loss": 0.75526547, "num_input_tokens_seen": 154985145, "step": 7162, "time_per_iteration": 2.671131134033203 }, { "auxiliary_loss_clip": 0.01131415, "auxiliary_loss_mlp": 0.01034363, "balance_loss_clip": 1.04134059, "balance_loss_mlp": 1.02017689, "epoch": 0.8612998256478086, "flos": 24243258107520.0, "grad_norm": 9.47091693734626, "language_loss": 0.84243619, "learning_rate": 1.983037504469771e-07, "loss": 0.86409402, "num_input_tokens_seen": 155003855, "step": 7163, "time_per_iteration": 3.4753878116607666 }, { "auxiliary_loss_clip": 0.0112077, "auxiliary_loss_mlp": 0.01045715, "balance_loss_clip": 1.04207325, "balance_loss_mlp": 1.02938378, "epoch": 0.8614200685384477, "flos": 21252653602560.0, "grad_norm": 1.7032770026207016, "language_loss": 0.6662038, "learning_rate": 1.9796570874374984e-07, "loss": 0.6878686, "num_input_tokens_seen": 155023960, "step": 7164, "time_per_iteration": 2.6069252490997314 }, { "auxiliary_loss_clip": 0.0110909, "auxiliary_loss_mlp": 0.01042649, "balance_loss_clip": 1.03927052, "balance_loss_mlp": 1.02750909, "epoch": 0.8615403114290867, "flos": 20007738080640.0, "grad_norm": 1.6945010132928533, "language_loss": 0.77667677, "learning_rate": 1.976279404076917e-07, "loss": 0.79819417, "num_input_tokens_seen": 155043360, "step": 7165, "time_per_iteration": 2.627722978591919 }, { "auxiliary_loss_clip": 0.01091768, "auxiliary_loss_mlp": 0.01042098, "balance_loss_clip": 1.03786993, "balance_loss_mlp": 1.02658939, "epoch": 0.8616605543197259, "flos": 29789373674880.0, "grad_norm": 1.8627266595627485, "language_loss": 0.76301664, "learning_rate": 1.9729044549004193e-07, "loss": 0.78435534, "num_input_tokens_seen": 155064745, "step": 7166, "time_per_iteration": 2.7046399116516113 }, { "auxiliary_loss_clip": 0.01116608, "auxiliary_loss_mlp": 0.01035006, "balance_loss_clip": 1.0411135, "balance_loss_mlp": 1.02045035, "epoch": 0.8617807972103649, "flos": 28911609020160.0, "grad_norm": 1.826363213610305, "language_loss": 0.70536262, "learning_rate": 1.9695322404199822e-07, "loss": 0.72687876, "num_input_tokens_seen": 155086790, "step": 7167, "time_per_iteration": 2.631249189376831 }, { "auxiliary_loss_clip": 0.01107525, "auxiliary_loss_mlp": 0.01043926, "balance_loss_clip": 1.04288459, "balance_loss_mlp": 1.02745163, "epoch": 0.861901040101004, "flos": 27673804391040.0, "grad_norm": 1.8306169315346883, "language_loss": 0.8216598, "learning_rate": 1.9661627611471654e-07, "loss": 0.84317422, "num_input_tokens_seen": 155106585, "step": 7168, "time_per_iteration": 2.6725635528564453 }, { "auxiliary_loss_clip": 0.01114172, "auxiliary_loss_mlp": 0.01041184, "balance_loss_clip": 1.04248452, "balance_loss_mlp": 1.02480495, "epoch": 0.8620212829916432, "flos": 49748056755840.0, "grad_norm": 1.9045560340380507, "language_loss": 0.70245922, "learning_rate": 1.9627960175931246e-07, "loss": 0.72401273, "num_input_tokens_seen": 155131285, "step": 7169, "time_per_iteration": 2.8599181175231934 }, { "auxiliary_loss_clip": 0.01117566, "auxiliary_loss_mlp": 0.01033424, "balance_loss_clip": 1.04201436, "balance_loss_mlp": 1.01846313, "epoch": 0.8621415258822822, "flos": 21138672769920.0, "grad_norm": 2.036589645635603, "language_loss": 0.74520695, "learning_rate": 1.9594320102685847e-07, "loss": 0.76671684, "num_input_tokens_seen": 155150555, "step": 7170, "time_per_iteration": 2.6033823490142822 }, { "auxiliary_loss_clip": 0.01096144, "auxiliary_loss_mlp": 0.00772848, "balance_loss_clip": 1.03577566, "balance_loss_mlp": 1.00043976, "epoch": 0.8622617687729213, "flos": 21689039934720.0, "grad_norm": 1.8676471570347013, "language_loss": 0.63699907, "learning_rate": 1.956070739683864e-07, "loss": 0.655689, "num_input_tokens_seen": 155169890, "step": 7171, "time_per_iteration": 2.619507074356079 }, { "auxiliary_loss_clip": 0.01078019, "auxiliary_loss_mlp": 0.01050145, "balance_loss_clip": 1.03465343, "balance_loss_mlp": 1.03276467, "epoch": 0.8623820116635604, "flos": 26250592734720.0, "grad_norm": 1.5073458069377348, "language_loss": 0.74315894, "learning_rate": 1.9527122063488678e-07, "loss": 0.7644406, "num_input_tokens_seen": 155191005, "step": 7172, "time_per_iteration": 2.7011616230010986 }, { "auxiliary_loss_clip": 0.01103671, "auxiliary_loss_mlp": 0.01034833, "balance_loss_clip": 1.0367626, "balance_loss_mlp": 1.01987219, "epoch": 0.8625022545541995, "flos": 19647554451840.0, "grad_norm": 1.804351309355161, "language_loss": 0.80528629, "learning_rate": 1.9493564107730755e-07, "loss": 0.8266713, "num_input_tokens_seen": 155211005, "step": 7173, "time_per_iteration": 2.6471378803253174 }, { "auxiliary_loss_clip": 0.01099204, "auxiliary_loss_mlp": 0.01049687, "balance_loss_clip": 1.0366441, "balance_loss_mlp": 1.03324747, "epoch": 0.8626224974448385, "flos": 21908382336000.0, "grad_norm": 2.2532291901036423, "language_loss": 0.61394936, "learning_rate": 1.9460033534655684e-07, "loss": 0.63543826, "num_input_tokens_seen": 155230365, "step": 7174, "time_per_iteration": 2.6055116653442383 }, { "auxiliary_loss_clip": 0.01100398, "auxiliary_loss_mlp": 0.01048719, "balance_loss_clip": 1.03670597, "balance_loss_mlp": 1.03225636, "epoch": 0.8627427403354777, "flos": 23331198942720.0, "grad_norm": 1.5797539100208522, "language_loss": 0.8437109, "learning_rate": 1.9426530349349978e-07, "loss": 0.86520213, "num_input_tokens_seen": 155250815, "step": 7175, "time_per_iteration": 2.639782667160034 }, { "auxiliary_loss_clip": 0.01114902, "auxiliary_loss_mlp": 0.0077201, "balance_loss_clip": 1.03994751, "balance_loss_mlp": 1.00042605, "epoch": 0.8628629832261168, "flos": 16362877299840.0, "grad_norm": 2.788556449390427, "language_loss": 0.64699507, "learning_rate": 1.9393054556896038e-07, "loss": 0.66586417, "num_input_tokens_seen": 155268515, "step": 7176, "time_per_iteration": 2.5577712059020996 }, { "auxiliary_loss_clip": 0.01090254, "auxiliary_loss_mlp": 0.01042977, "balance_loss_clip": 1.03652608, "balance_loss_mlp": 1.0263052, "epoch": 0.8629832261167558, "flos": 28103941756800.0, "grad_norm": 6.142269393231166, "language_loss": 0.69201165, "learning_rate": 1.9359606162372133e-07, "loss": 0.71334398, "num_input_tokens_seen": 155290120, "step": 7177, "time_per_iteration": 2.7101712226867676 }, { "auxiliary_loss_clip": 0.01130586, "auxiliary_loss_mlp": 0.0103747, "balance_loss_clip": 1.04391718, "balance_loss_mlp": 1.0217936, "epoch": 0.863103469007395, "flos": 20230061310720.0, "grad_norm": 1.614741939867725, "language_loss": 0.70831192, "learning_rate": 1.9326185170852293e-07, "loss": 0.72999245, "num_input_tokens_seen": 155309085, "step": 7178, "time_per_iteration": 2.554351806640625 }, { "auxiliary_loss_clip": 0.01119401, "auxiliary_loss_mlp": 0.01036528, "balance_loss_clip": 1.04350507, "balance_loss_mlp": 1.0214237, "epoch": 0.863223711898034, "flos": 24498547044480.0, "grad_norm": 8.328876338057638, "language_loss": 0.72029781, "learning_rate": 1.9292791587406598e-07, "loss": 0.74185711, "num_input_tokens_seen": 155327945, "step": 7179, "time_per_iteration": 2.6110994815826416 }, { "auxiliary_loss_clip": 0.01117338, "auxiliary_loss_mlp": 0.0077227, "balance_loss_clip": 1.04085875, "balance_loss_mlp": 1.00047529, "epoch": 0.8633439547886731, "flos": 17675376261120.0, "grad_norm": 2.6792556659199773, "language_loss": 0.87023747, "learning_rate": 1.9259425417100661e-07, "loss": 0.88913357, "num_input_tokens_seen": 155344060, "step": 7180, "time_per_iteration": 3.5982019901275635 }, { "auxiliary_loss_clip": 0.01065568, "auxiliary_loss_mlp": 0.01038655, "balance_loss_clip": 1.032107, "balance_loss_mlp": 1.0222522, "epoch": 0.8634641976793123, "flos": 12895055677440.0, "grad_norm": 2.531084565288279, "language_loss": 0.74989533, "learning_rate": 1.9226086664996234e-07, "loss": 0.77093756, "num_input_tokens_seen": 155362305, "step": 7181, "time_per_iteration": 2.6562695503234863 }, { "auxiliary_loss_clip": 0.01110941, "auxiliary_loss_mlp": 0.01043334, "balance_loss_clip": 1.04258609, "balance_loss_mlp": 1.02807522, "epoch": 0.8635844405699513, "flos": 23878980328320.0, "grad_norm": 1.9359290207848288, "language_loss": 0.74332023, "learning_rate": 1.9192775336150712e-07, "loss": 0.76486295, "num_input_tokens_seen": 155382605, "step": 7182, "time_per_iteration": 3.601742744445801 }, { "auxiliary_loss_clip": 0.01025367, "auxiliary_loss_mlp": 0.01002922, "balance_loss_clip": 1.00733113, "balance_loss_mlp": 1.00132465, "epoch": 0.8637046834605904, "flos": 60453387521280.0, "grad_norm": 0.7642684834495499, "language_loss": 0.56214309, "learning_rate": 1.915949143561739e-07, "loss": 0.58242595, "num_input_tokens_seen": 155437280, "step": 7183, "time_per_iteration": 3.1445319652557373 }, { "auxiliary_loss_clip": 0.01120365, "auxiliary_loss_mlp": 0.01039215, "balance_loss_clip": 1.04202437, "balance_loss_mlp": 1.02409923, "epoch": 0.8638249263512295, "flos": 20558751690240.0, "grad_norm": 1.8287950081483366, "language_loss": 0.78113937, "learning_rate": 1.9126234968445498e-07, "loss": 0.80273515, "num_input_tokens_seen": 155456970, "step": 7184, "time_per_iteration": 3.5955541133880615 }, { "auxiliary_loss_clip": 0.01133418, "auxiliary_loss_mlp": 0.01039273, "balance_loss_clip": 1.04252064, "balance_loss_mlp": 1.02385902, "epoch": 0.8639451692418686, "flos": 26615768353920.0, "grad_norm": 1.5065213467673564, "language_loss": 0.67535901, "learning_rate": 1.9093005939679884e-07, "loss": 0.69708586, "num_input_tokens_seen": 155478925, "step": 7185, "time_per_iteration": 2.589970827102661 }, { "auxiliary_loss_clip": 0.01121967, "auxiliary_loss_mlp": 0.01039221, "balance_loss_clip": 1.04389608, "balance_loss_mlp": 1.02386689, "epoch": 0.8640654121325076, "flos": 15122450977920.0, "grad_norm": 1.8174492162323135, "language_loss": 0.76589352, "learning_rate": 1.9059804354361452e-07, "loss": 0.78750539, "num_input_tokens_seen": 155496700, "step": 7186, "time_per_iteration": 2.558562755584717 }, { "auxiliary_loss_clip": 0.01099601, "auxiliary_loss_mlp": 0.01044047, "balance_loss_clip": 1.03740585, "balance_loss_mlp": 1.02628529, "epoch": 0.8641856550231467, "flos": 31869068250240.0, "grad_norm": 1.7324168482276066, "language_loss": 0.70538843, "learning_rate": 1.902663021752684e-07, "loss": 0.72682488, "num_input_tokens_seen": 155518130, "step": 7187, "time_per_iteration": 2.6899216175079346 }, { "auxiliary_loss_clip": 0.01134881, "auxiliary_loss_mlp": 0.01038031, "balance_loss_clip": 1.04394937, "balance_loss_mlp": 1.02284324, "epoch": 0.8643058979137859, "flos": 14976545932800.0, "grad_norm": 2.0695644057636224, "language_loss": 0.8251279, "learning_rate": 1.8993483534208556e-07, "loss": 0.84685701, "num_input_tokens_seen": 155537040, "step": 7188, "time_per_iteration": 3.5296406745910645 }, { "auxiliary_loss_clip": 0.01098852, "auxiliary_loss_mlp": 0.0104969, "balance_loss_clip": 1.03938413, "balance_loss_mlp": 1.03253531, "epoch": 0.8644261408044249, "flos": 13115726881920.0, "grad_norm": 6.639933967800243, "language_loss": 0.7494818, "learning_rate": 1.8960364309434884e-07, "loss": 0.77096719, "num_input_tokens_seen": 155554535, "step": 7189, "time_per_iteration": 2.6894466876983643 }, { "auxiliary_loss_clip": 0.01067316, "auxiliary_loss_mlp": 0.00772295, "balance_loss_clip": 1.03629756, "balance_loss_mlp": 1.00052476, "epoch": 0.864546383695064, "flos": 20850920916480.0, "grad_norm": 1.876410074102933, "language_loss": 0.78721917, "learning_rate": 1.8927272548229967e-07, "loss": 0.80561531, "num_input_tokens_seen": 155574225, "step": 7190, "time_per_iteration": 2.779158592224121 }, { "auxiliary_loss_clip": 0.0108184, "auxiliary_loss_mlp": 0.01042518, "balance_loss_clip": 1.03821385, "balance_loss_mlp": 1.02566218, "epoch": 0.8646666265857031, "flos": 21324582587520.0, "grad_norm": 1.5439479777131526, "language_loss": 0.83328784, "learning_rate": 1.8894208255613876e-07, "loss": 0.85453141, "num_input_tokens_seen": 155593540, "step": 7191, "time_per_iteration": 2.794210910797119 }, { "auxiliary_loss_clip": 0.01128609, "auxiliary_loss_mlp": 0.0103009, "balance_loss_clip": 1.04098845, "balance_loss_mlp": 1.01599336, "epoch": 0.8647868694763422, "flos": 19750833031680.0, "grad_norm": 2.135611927507686, "language_loss": 0.77509475, "learning_rate": 1.8861171436602397e-07, "loss": 0.7966817, "num_input_tokens_seen": 155610655, "step": 7192, "time_per_iteration": 2.645883083343506 }, { "auxiliary_loss_clip": 0.01125965, "auxiliary_loss_mlp": 0.01040479, "balance_loss_clip": 1.04608881, "balance_loss_mlp": 1.02432036, "epoch": 0.8649071123669813, "flos": 26176760328960.0, "grad_norm": 2.2620809841062766, "language_loss": 0.80375409, "learning_rate": 1.882816209620719e-07, "loss": 0.82541853, "num_input_tokens_seen": 155627365, "step": 7193, "time_per_iteration": 2.7467398643493652 }, { "auxiliary_loss_clip": 0.01105528, "auxiliary_loss_mlp": 0.0103616, "balance_loss_clip": 1.04152119, "balance_loss_mlp": 1.01919627, "epoch": 0.8650273552576204, "flos": 20302888135680.0, "grad_norm": 2.006120138779823, "language_loss": 0.76821351, "learning_rate": 1.8795180239435738e-07, "loss": 0.78963041, "num_input_tokens_seen": 155646220, "step": 7194, "time_per_iteration": 2.704257011413574 }, { "auxiliary_loss_clip": 0.01114931, "auxiliary_loss_mlp": 0.01040598, "balance_loss_clip": 1.04431438, "balance_loss_mlp": 1.02455246, "epoch": 0.8651475981482595, "flos": 23951088881280.0, "grad_norm": 5.238061332671681, "language_loss": 0.7580691, "learning_rate": 1.8762225871291348e-07, "loss": 0.7796244, "num_input_tokens_seen": 155662095, "step": 7195, "time_per_iteration": 2.7159109115600586 }, { "auxiliary_loss_clip": 0.01130086, "auxiliary_loss_mlp": 0.00771621, "balance_loss_clip": 1.04077148, "balance_loss_mlp": 1.00046897, "epoch": 0.8652678410388985, "flos": 21684622561920.0, "grad_norm": 1.6130617954086144, "language_loss": 0.8097207, "learning_rate": 1.8729298996773201e-07, "loss": 0.82873785, "num_input_tokens_seen": 155680845, "step": 7196, "time_per_iteration": 2.5948100090026855 }, { "auxiliary_loss_clip": 0.01022456, "auxiliary_loss_mlp": 0.01002497, "balance_loss_clip": 1.00650847, "balance_loss_mlp": 1.00103664, "epoch": 0.8653880839295377, "flos": 65224660855680.0, "grad_norm": 0.8296803275357598, "language_loss": 0.60904723, "learning_rate": 1.8696399620876301e-07, "loss": 0.62929678, "num_input_tokens_seen": 155737875, "step": 7197, "time_per_iteration": 3.139230728149414 }, { "auxiliary_loss_clip": 0.01090353, "auxiliary_loss_mlp": 0.01044632, "balance_loss_clip": 1.03538322, "balance_loss_mlp": 1.02801394, "epoch": 0.8655083268201768, "flos": 17749172753280.0, "grad_norm": 2.4308876790422276, "language_loss": 0.78956544, "learning_rate": 1.866352774859141e-07, "loss": 0.81091529, "num_input_tokens_seen": 155753100, "step": 7198, "time_per_iteration": 2.6474201679229736 }, { "auxiliary_loss_clip": 0.01095671, "auxiliary_loss_mlp": 0.01038362, "balance_loss_clip": 1.03783512, "balance_loss_mlp": 1.02331758, "epoch": 0.8656285697108158, "flos": 20703974376960.0, "grad_norm": 2.819344609978143, "language_loss": 0.69299608, "learning_rate": 1.8630683384905188e-07, "loss": 0.7143364, "num_input_tokens_seen": 155772430, "step": 7199, "time_per_iteration": 2.804597854614258 }, { "auxiliary_loss_clip": 0.01129528, "auxiliary_loss_mlp": 0.00771453, "balance_loss_clip": 1.04186201, "balance_loss_mlp": 1.00045061, "epoch": 0.865748812601455, "flos": 18653833716480.0, "grad_norm": 2.3468468450098805, "language_loss": 0.88811827, "learning_rate": 1.8597866534800045e-07, "loss": 0.9071281, "num_input_tokens_seen": 155787545, "step": 7200, "time_per_iteration": 2.5421528816223145 }, { "auxiliary_loss_clip": 0.01120458, "auxiliary_loss_mlp": 0.00772211, "balance_loss_clip": 1.04141021, "balance_loss_mlp": 1.00053239, "epoch": 0.865869055492094, "flos": 70652554807680.0, "grad_norm": 2.5027668431301358, "language_loss": 0.74261791, "learning_rate": 1.8565077203254398e-07, "loss": 0.76154459, "num_input_tokens_seen": 155813005, "step": 7201, "time_per_iteration": 3.029658794403076 }, { "auxiliary_loss_clip": 0.01099942, "auxiliary_loss_mlp": 0.01039697, "balance_loss_clip": 1.04140615, "balance_loss_mlp": 1.02247119, "epoch": 0.8659892983827331, "flos": 17383961220480.0, "grad_norm": 3.4942507450271254, "language_loss": 0.72742844, "learning_rate": 1.8532315395242203e-07, "loss": 0.74882478, "num_input_tokens_seen": 155829455, "step": 7202, "time_per_iteration": 2.671085834503174 }, { "auxiliary_loss_clip": 0.01100251, "auxiliary_loss_mlp": 0.01036499, "balance_loss_clip": 1.04231429, "balance_loss_mlp": 1.02191353, "epoch": 0.8661095412733723, "flos": 17895221452800.0, "grad_norm": 2.3794797033532253, "language_loss": 0.72059369, "learning_rate": 1.849958111573353e-07, "loss": 0.74196124, "num_input_tokens_seen": 155848060, "step": 7203, "time_per_iteration": 2.658627986907959 }, { "auxiliary_loss_clip": 0.0112915, "auxiliary_loss_mlp": 0.0103593, "balance_loss_clip": 1.04152715, "balance_loss_mlp": 1.02059972, "epoch": 0.8662297841640113, "flos": 18224163227520.0, "grad_norm": 1.6317082844105826, "language_loss": 0.63781655, "learning_rate": 1.8466874369694074e-07, "loss": 0.6594674, "num_input_tokens_seen": 155865755, "step": 7204, "time_per_iteration": 2.5763072967529297 }, { "auxiliary_loss_clip": 0.01092233, "auxiliary_loss_mlp": 0.01039016, "balance_loss_clip": 1.03610468, "balance_loss_mlp": 1.02450848, "epoch": 0.8663500270546504, "flos": 16362159027840.0, "grad_norm": 3.310581833117034, "language_loss": 0.70134956, "learning_rate": 1.843419516208542e-07, "loss": 0.72266197, "num_input_tokens_seen": 155882680, "step": 7205, "time_per_iteration": 2.6582415103912354 }, { "auxiliary_loss_clip": 0.01119655, "auxiliary_loss_mlp": 0.01043322, "balance_loss_clip": 1.04216123, "balance_loss_mlp": 1.02691901, "epoch": 0.8664702699452895, "flos": 17894431353600.0, "grad_norm": 3.8992103637534314, "language_loss": 0.7948218, "learning_rate": 1.8401543497865047e-07, "loss": 0.81645155, "num_input_tokens_seen": 155900680, "step": 7206, "time_per_iteration": 3.66129469871521 }, { "auxiliary_loss_clip": 0.01122618, "auxiliary_loss_mlp": 0.00771941, "balance_loss_clip": 1.04373264, "balance_loss_mlp": 1.00051665, "epoch": 0.8665905128359286, "flos": 30736373794560.0, "grad_norm": 2.4025197969537984, "language_loss": 0.64770293, "learning_rate": 1.836891938198608e-07, "loss": 0.66664851, "num_input_tokens_seen": 155921105, "step": 7207, "time_per_iteration": 2.674778699874878 }, { "auxiliary_loss_clip": 0.01105275, "auxiliary_loss_mlp": 0.01034253, "balance_loss_clip": 1.03913057, "balance_loss_mlp": 1.01772428, "epoch": 0.8667107557265676, "flos": 18656419495680.0, "grad_norm": 2.156933431642956, "language_loss": 0.71104741, "learning_rate": 1.8336322819397677e-07, "loss": 0.73244274, "num_input_tokens_seen": 155938640, "step": 7208, "time_per_iteration": 3.568701982498169 }, { "auxiliary_loss_clip": 0.0109987, "auxiliary_loss_mlp": 0.01038126, "balance_loss_clip": 1.03905654, "balance_loss_mlp": 1.02157998, "epoch": 0.8668309986172068, "flos": 20083725302400.0, "grad_norm": 1.975289721208252, "language_loss": 0.62971425, "learning_rate": 1.8303753815044654e-07, "loss": 0.65109426, "num_input_tokens_seen": 155957945, "step": 7209, "time_per_iteration": 2.6304664611816406 }, { "auxiliary_loss_clip": 0.01120489, "auxiliary_loss_mlp": 0.01048066, "balance_loss_clip": 1.0432297, "balance_loss_mlp": 1.03035164, "epoch": 0.8669512415078459, "flos": 21615099788160.0, "grad_norm": 2.3193530658573644, "language_loss": 0.70655787, "learning_rate": 1.827121237386773e-07, "loss": 0.72824347, "num_input_tokens_seen": 155975390, "step": 7210, "time_per_iteration": 2.6112523078918457 }, { "auxiliary_loss_clip": 0.01111013, "auxiliary_loss_mlp": 0.01040046, "balance_loss_clip": 1.04104972, "balance_loss_mlp": 1.02446544, "epoch": 0.8670714843984849, "flos": 17703601372800.0, "grad_norm": 2.3159545101205157, "language_loss": 0.75172275, "learning_rate": 1.8238698500803374e-07, "loss": 0.77323329, "num_input_tokens_seen": 155988155, "step": 7211, "time_per_iteration": 3.6661975383758545 }, { "auxiliary_loss_clip": 0.01025609, "auxiliary_loss_mlp": 0.01000253, "balance_loss_clip": 1.00643158, "balance_loss_mlp": 0.99867994, "epoch": 0.8671917272891241, "flos": 60705483125760.0, "grad_norm": 0.7175344586049346, "language_loss": 0.56235445, "learning_rate": 1.820621220078391e-07, "loss": 0.58261311, "num_input_tokens_seen": 156052065, "step": 7212, "time_per_iteration": 3.2458295822143555 }, { "auxiliary_loss_clip": 0.01130756, "auxiliary_loss_mlp": 0.01042007, "balance_loss_clip": 1.0416801, "balance_loss_mlp": 1.02677155, "epoch": 0.8673119701797631, "flos": 20451881750400.0, "grad_norm": 1.9470204003884266, "language_loss": 0.68217033, "learning_rate": 1.8173753478737553e-07, "loss": 0.70389795, "num_input_tokens_seen": 156072500, "step": 7213, "time_per_iteration": 2.538577079772949 }, { "auxiliary_loss_clip": 0.01133542, "auxiliary_loss_mlp": 0.01037369, "balance_loss_clip": 1.04374695, "balance_loss_mlp": 1.02191913, "epoch": 0.8674322130704022, "flos": 19647410797440.0, "grad_norm": 3.7126395141131656, "language_loss": 0.8019197, "learning_rate": 1.8141322339588205e-07, "loss": 0.82362878, "num_input_tokens_seen": 156089840, "step": 7214, "time_per_iteration": 3.473756790161133 }, { "auxiliary_loss_clip": 0.01132574, "auxiliary_loss_mlp": 0.01038211, "balance_loss_clip": 1.04378283, "balance_loss_mlp": 1.02300024, "epoch": 0.8675524559610414, "flos": 26025001367040.0, "grad_norm": 1.9884558098016485, "language_loss": 0.70215702, "learning_rate": 1.810891878825569e-07, "loss": 0.72386479, "num_input_tokens_seen": 156109815, "step": 7215, "time_per_iteration": 2.6223278045654297 }, { "auxiliary_loss_clip": 0.01104641, "auxiliary_loss_mlp": 0.01037917, "balance_loss_clip": 1.03891683, "balance_loss_mlp": 1.02299166, "epoch": 0.8676726988516804, "flos": 15049444584960.0, "grad_norm": 2.550218534468949, "language_loss": 0.71785444, "learning_rate": 1.8076542829655561e-07, "loss": 0.73927999, "num_input_tokens_seen": 156128620, "step": 7216, "time_per_iteration": 2.615525007247925 }, { "auxiliary_loss_clip": 0.01106878, "auxiliary_loss_mlp": 0.01040916, "balance_loss_clip": 1.04115975, "balance_loss_mlp": 1.02452493, "epoch": 0.8677929417423195, "flos": 16288111140480.0, "grad_norm": 4.903697696905391, "language_loss": 0.79519761, "learning_rate": 1.8044194468699203e-07, "loss": 0.81667554, "num_input_tokens_seen": 156145930, "step": 7217, "time_per_iteration": 2.637502908706665 }, { "auxiliary_loss_clip": 0.011011, "auxiliary_loss_mlp": 0.01034923, "balance_loss_clip": 1.03966856, "balance_loss_mlp": 1.01863873, "epoch": 0.8679131846329585, "flos": 18844160906880.0, "grad_norm": 3.0984802008305645, "language_loss": 0.75543189, "learning_rate": 1.8011873710293912e-07, "loss": 0.77679205, "num_input_tokens_seen": 156164435, "step": 7218, "time_per_iteration": 2.601958990097046 }, { "auxiliary_loss_clip": 0.0111486, "auxiliary_loss_mlp": 0.01035598, "balance_loss_clip": 1.04085541, "balance_loss_mlp": 1.02012503, "epoch": 0.8680334275235977, "flos": 33620718890880.0, "grad_norm": 2.096636658483931, "language_loss": 0.69788504, "learning_rate": 1.7979580559342677e-07, "loss": 0.71938962, "num_input_tokens_seen": 156185165, "step": 7219, "time_per_iteration": 2.735542058944702 }, { "auxiliary_loss_clip": 0.01103988, "auxiliary_loss_mlp": 0.0104265, "balance_loss_clip": 1.04049921, "balance_loss_mlp": 1.02678287, "epoch": 0.8681536704142367, "flos": 24681152810880.0, "grad_norm": 2.308900178346739, "language_loss": 0.6681881, "learning_rate": 1.7947315020744358e-07, "loss": 0.68965459, "num_input_tokens_seen": 156206260, "step": 7220, "time_per_iteration": 2.6562721729278564 }, { "auxiliary_loss_clip": 0.01105065, "auxiliary_loss_mlp": 0.01033304, "balance_loss_clip": 1.03896475, "balance_loss_mlp": 1.01740193, "epoch": 0.8682739133048758, "flos": 20011042131840.0, "grad_norm": 1.8630975185897023, "language_loss": 0.80120414, "learning_rate": 1.7915077099393594e-07, "loss": 0.82258785, "num_input_tokens_seen": 156222860, "step": 7221, "time_per_iteration": 2.6387486457824707 }, { "auxiliary_loss_clip": 0.01120657, "auxiliary_loss_mlp": 0.0103602, "balance_loss_clip": 1.04038072, "balance_loss_mlp": 1.01997447, "epoch": 0.868394156195515, "flos": 16654759217280.0, "grad_norm": 1.9943681352495826, "language_loss": 0.73058373, "learning_rate": 1.788286680018083e-07, "loss": 0.75215048, "num_input_tokens_seen": 156241570, "step": 7222, "time_per_iteration": 2.6617465019226074 }, { "auxiliary_loss_clip": 0.01109715, "auxiliary_loss_mlp": 0.01034225, "balance_loss_clip": 1.04165912, "balance_loss_mlp": 1.01945496, "epoch": 0.868514399086154, "flos": 28001381448960.0, "grad_norm": 1.5838533082412813, "language_loss": 0.72302955, "learning_rate": 1.7850684127992443e-07, "loss": 0.74446893, "num_input_tokens_seen": 156261315, "step": 7223, "time_per_iteration": 2.6895253658294678 }, { "auxiliary_loss_clip": 0.01094948, "auxiliary_loss_mlp": 0.01039423, "balance_loss_clip": 1.04119015, "balance_loss_mlp": 1.02514207, "epoch": 0.8686346419767931, "flos": 20084587228800.0, "grad_norm": 1.8773060550412748, "language_loss": 0.70479894, "learning_rate": 1.7818529087710378e-07, "loss": 0.72614264, "num_input_tokens_seen": 156281670, "step": 7224, "time_per_iteration": 2.672795295715332 }, { "auxiliary_loss_clip": 0.01118257, "auxiliary_loss_mlp": 0.00772604, "balance_loss_clip": 1.04113901, "balance_loss_mlp": 1.00049639, "epoch": 0.8687548848674322, "flos": 18223516782720.0, "grad_norm": 1.9493350750820657, "language_loss": 0.84359717, "learning_rate": 1.7786401684212637e-07, "loss": 0.86250579, "num_input_tokens_seen": 156300500, "step": 7225, "time_per_iteration": 2.601618766784668 }, { "auxiliary_loss_clip": 0.01003627, "auxiliary_loss_mlp": 0.00999876, "balance_loss_clip": 1.00838256, "balance_loss_mlp": 0.99824315, "epoch": 0.8688751277580713, "flos": 70457885049600.0, "grad_norm": 0.7279382707348966, "language_loss": 0.55939555, "learning_rate": 1.7754301922372883e-07, "loss": 0.57943058, "num_input_tokens_seen": 156350145, "step": 7226, "time_per_iteration": 3.0995912551879883 }, { "auxiliary_loss_clip": 0.01074238, "auxiliary_loss_mlp": 0.01035988, "balance_loss_clip": 1.03802848, "balance_loss_mlp": 1.02044284, "epoch": 0.8689953706487104, "flos": 26906788344960.0, "grad_norm": 2.5651522909561884, "language_loss": 0.80958056, "learning_rate": 1.7722229807060617e-07, "loss": 0.83068281, "num_input_tokens_seen": 156368725, "step": 7227, "time_per_iteration": 2.791450262069702 }, { "auxiliary_loss_clip": 0.01083006, "auxiliary_loss_mlp": 0.0103616, "balance_loss_clip": 1.03584278, "balance_loss_mlp": 1.02118182, "epoch": 0.8691156135393495, "flos": 34637385438720.0, "grad_norm": 2.1066192058623665, "language_loss": 0.81918752, "learning_rate": 1.7690185343141172e-07, "loss": 0.84037912, "num_input_tokens_seen": 156388640, "step": 7228, "time_per_iteration": 2.76886248588562 }, { "auxiliary_loss_clip": 0.01103926, "auxiliary_loss_mlp": 0.01033731, "balance_loss_clip": 1.03874326, "balance_loss_mlp": 1.01916933, "epoch": 0.8692358564299886, "flos": 18989814556800.0, "grad_norm": 2.3572734210955475, "language_loss": 0.69973457, "learning_rate": 1.7658168535475615e-07, "loss": 0.72111112, "num_input_tokens_seen": 156406425, "step": 7229, "time_per_iteration": 2.6081321239471436 }, { "auxiliary_loss_clip": 0.01111642, "auxiliary_loss_mlp": 0.01043328, "balance_loss_clip": 1.04146004, "balance_loss_mlp": 1.02656746, "epoch": 0.8693560993206276, "flos": 30370839039360.0, "grad_norm": 1.5356456650662906, "language_loss": 0.64606845, "learning_rate": 1.7626179388920948e-07, "loss": 0.66761822, "num_input_tokens_seen": 156427705, "step": 7230, "time_per_iteration": 2.7514257431030273 }, { "auxiliary_loss_clip": 0.0111116, "auxiliary_loss_mlp": 0.00772721, "balance_loss_clip": 1.04371297, "balance_loss_mlp": 1.00044811, "epoch": 0.8694763422112668, "flos": 27200430028800.0, "grad_norm": 1.973704586180253, "language_loss": 0.80624914, "learning_rate": 1.7594217908329866e-07, "loss": 0.8250879, "num_input_tokens_seen": 156449890, "step": 7231, "time_per_iteration": 2.676588535308838 }, { "auxiliary_loss_clip": 0.01101661, "auxiliary_loss_mlp": 0.01038148, "balance_loss_clip": 1.0411377, "balance_loss_mlp": 1.02350914, "epoch": 0.8695965851019059, "flos": 26139161767680.0, "grad_norm": 1.7758066272927362, "language_loss": 0.74003452, "learning_rate": 1.7562284098550895e-07, "loss": 0.76143265, "num_input_tokens_seen": 156469600, "step": 7232, "time_per_iteration": 3.628725290298462 }, { "auxiliary_loss_clip": 0.01012532, "auxiliary_loss_mlp": 0.01002407, "balance_loss_clip": 1.00794339, "balance_loss_mlp": 1.00091124, "epoch": 0.8697168279925449, "flos": 67332616456320.0, "grad_norm": 0.8316199073933741, "language_loss": 0.62264419, "learning_rate": 1.753037796442838e-07, "loss": 0.64279366, "num_input_tokens_seen": 156529040, "step": 7233, "time_per_iteration": 3.1793856620788574 }, { "auxiliary_loss_clip": 0.0113223, "auxiliary_loss_mlp": 0.01050516, "balance_loss_clip": 1.04198575, "balance_loss_mlp": 1.03464901, "epoch": 0.8698370708831841, "flos": 19718693337600.0, "grad_norm": 2.1069988936922237, "language_loss": 0.7507785, "learning_rate": 1.74984995108024e-07, "loss": 0.77260596, "num_input_tokens_seen": 156546970, "step": 7234, "time_per_iteration": 3.4971277713775635 }, { "auxiliary_loss_clip": 0.0112172, "auxiliary_loss_mlp": 0.01038857, "balance_loss_clip": 1.04253602, "balance_loss_mlp": 1.02375352, "epoch": 0.8699573137738231, "flos": 12859971068160.0, "grad_norm": 2.1385282618319863, "language_loss": 0.82933521, "learning_rate": 1.7466648742508981e-07, "loss": 0.850941, "num_input_tokens_seen": 156563155, "step": 7235, "time_per_iteration": 2.5990982055664062 }, { "auxiliary_loss_clip": 0.01106807, "auxiliary_loss_mlp": 0.01034224, "balance_loss_clip": 1.04051232, "balance_loss_mlp": 1.01988268, "epoch": 0.8700775566644622, "flos": 17420733768960.0, "grad_norm": 1.9984630915262684, "language_loss": 0.84442627, "learning_rate": 1.7434825664379837e-07, "loss": 0.86583662, "num_input_tokens_seen": 156581660, "step": 7236, "time_per_iteration": 2.5926051139831543 }, { "auxiliary_loss_clip": 0.01124709, "auxiliary_loss_mlp": 0.01040105, "balance_loss_clip": 1.04427063, "balance_loss_mlp": 1.02453613, "epoch": 0.8701977995551013, "flos": 13735221770880.0, "grad_norm": 8.762963015187731, "language_loss": 0.86069787, "learning_rate": 1.740303028124246e-07, "loss": 0.88234597, "num_input_tokens_seen": 156597720, "step": 7237, "time_per_iteration": 3.5445854663848877 }, { "auxiliary_loss_clip": 0.01058802, "auxiliary_loss_mlp": 0.01038798, "balance_loss_clip": 1.03530896, "balance_loss_mlp": 1.02178705, "epoch": 0.8703180424457404, "flos": 30555707362560.0, "grad_norm": 3.5441265712803736, "language_loss": 0.75482196, "learning_rate": 1.7371262597920212e-07, "loss": 0.77579796, "num_input_tokens_seen": 156619780, "step": 7238, "time_per_iteration": 2.804854393005371 }, { "auxiliary_loss_clip": 0.010796, "auxiliary_loss_mlp": 0.01040976, "balance_loss_clip": 1.03873181, "balance_loss_mlp": 1.02528834, "epoch": 0.8704382853363795, "flos": 19608986223360.0, "grad_norm": 1.5450894763473448, "language_loss": 0.76423061, "learning_rate": 1.7339522619232195e-07, "loss": 0.78543639, "num_input_tokens_seen": 156638160, "step": 7239, "time_per_iteration": 2.6747071743011475 }, { "auxiliary_loss_clip": 0.01113957, "auxiliary_loss_mlp": 0.01040612, "balance_loss_clip": 1.03957403, "balance_loss_mlp": 1.02391052, "epoch": 0.8705585282270186, "flos": 26613900846720.0, "grad_norm": 2.688968588632523, "language_loss": 0.75694966, "learning_rate": 1.730781034999338e-07, "loss": 0.77849543, "num_input_tokens_seen": 156659740, "step": 7240, "time_per_iteration": 3.5701096057891846 }, { "auxiliary_loss_clip": 0.01128578, "auxiliary_loss_mlp": 0.01033154, "balance_loss_clip": 1.04382217, "balance_loss_mlp": 1.01801419, "epoch": 0.8706787711176577, "flos": 34090465979520.0, "grad_norm": 2.209313944351178, "language_loss": 0.73696184, "learning_rate": 1.7276125795014497e-07, "loss": 0.75857913, "num_input_tokens_seen": 156678190, "step": 7241, "time_per_iteration": 2.64005446434021 }, { "auxiliary_loss_clip": 0.01112086, "auxiliary_loss_mlp": 0.01034946, "balance_loss_clip": 1.04024625, "balance_loss_mlp": 1.01860261, "epoch": 0.8707990140082967, "flos": 14611513968000.0, "grad_norm": 2.7260169653978594, "language_loss": 0.6767652, "learning_rate": 1.7244468959102054e-07, "loss": 0.69823551, "num_input_tokens_seen": 156695245, "step": 7242, "time_per_iteration": 2.614370822906494 }, { "auxiliary_loss_clip": 0.01119061, "auxiliary_loss_mlp": 0.01034087, "balance_loss_clip": 1.04131508, "balance_loss_mlp": 1.01863766, "epoch": 0.8709192568989359, "flos": 20084156265600.0, "grad_norm": 2.130672552460741, "language_loss": 0.85626829, "learning_rate": 1.7212839847058348e-07, "loss": 0.87779975, "num_input_tokens_seen": 156710375, "step": 7243, "time_per_iteration": 2.574098825454712 }, { "auxiliary_loss_clip": 0.01073437, "auxiliary_loss_mlp": 0.01035596, "balance_loss_clip": 1.03601933, "balance_loss_mlp": 1.02055144, "epoch": 0.871039499789575, "flos": 16727083251840.0, "grad_norm": 2.03780919933395, "language_loss": 0.73670942, "learning_rate": 1.718123846368147e-07, "loss": 0.75779974, "num_input_tokens_seen": 156729420, "step": 7244, "time_per_iteration": 2.7168049812316895 }, { "auxiliary_loss_clip": 0.01106331, "auxiliary_loss_mlp": 0.00770884, "balance_loss_clip": 1.04318762, "balance_loss_mlp": 1.00046992, "epoch": 0.871159742680214, "flos": 21068790860160.0, "grad_norm": 2.1881424923550243, "language_loss": 0.71690744, "learning_rate": 1.714966481376543e-07, "loss": 0.73567963, "num_input_tokens_seen": 156746100, "step": 7245, "time_per_iteration": 2.6480209827423096 }, { "auxiliary_loss_clip": 0.01118357, "auxiliary_loss_mlp": 0.01036708, "balance_loss_clip": 1.04014373, "balance_loss_mlp": 1.02094865, "epoch": 0.8712799855708532, "flos": 28256526731520.0, "grad_norm": 2.5764789959817147, "language_loss": 0.83161342, "learning_rate": 1.7118118902099797e-07, "loss": 0.85316408, "num_input_tokens_seen": 156764185, "step": 7246, "time_per_iteration": 2.6340129375457764 }, { "auxiliary_loss_clip": 0.01119994, "auxiliary_loss_mlp": 0.01040593, "balance_loss_clip": 1.04214287, "balance_loss_mlp": 1.02480936, "epoch": 0.8714002284614922, "flos": 22236677665920.0, "grad_norm": 2.0004928691477124, "language_loss": 0.80895603, "learning_rate": 1.7086600733470146e-07, "loss": 0.83056188, "num_input_tokens_seen": 156784855, "step": 7247, "time_per_iteration": 2.596378803253174 }, { "auxiliary_loss_clip": 0.01115612, "auxiliary_loss_mlp": 0.0103456, "balance_loss_clip": 1.040797, "balance_loss_mlp": 1.02061272, "epoch": 0.8715204713521313, "flos": 21431919404160.0, "grad_norm": 1.901456414793158, "language_loss": 0.77193522, "learning_rate": 1.7055110312657738e-07, "loss": 0.79343694, "num_input_tokens_seen": 156804350, "step": 7248, "time_per_iteration": 2.613128900527954 }, { "auxiliary_loss_clip": 0.0110335, "auxiliary_loss_mlp": 0.0104267, "balance_loss_clip": 1.04041696, "balance_loss_mlp": 1.0267911, "epoch": 0.8716407142427703, "flos": 23440439180160.0, "grad_norm": 2.7794759833247853, "language_loss": 0.74232054, "learning_rate": 1.702364764443962e-07, "loss": 0.76378071, "num_input_tokens_seen": 156823425, "step": 7249, "time_per_iteration": 2.6522908210754395 }, { "auxiliary_loss_clip": 0.01068699, "auxiliary_loss_mlp": 0.0104499, "balance_loss_clip": 1.03497171, "balance_loss_mlp": 1.02810979, "epoch": 0.8717609571334095, "flos": 27958683156480.0, "grad_norm": 2.034563017302588, "language_loss": 0.72420478, "learning_rate": 1.6992212733588685e-07, "loss": 0.74534172, "num_input_tokens_seen": 156843090, "step": 7250, "time_per_iteration": 2.7444283962249756 }, { "auxiliary_loss_clip": 0.01101319, "auxiliary_loss_mlp": 0.01043458, "balance_loss_clip": 1.03788161, "balance_loss_mlp": 1.02823496, "epoch": 0.8718812000240486, "flos": 25479482538240.0, "grad_norm": 1.8606513708554206, "language_loss": 0.7485342, "learning_rate": 1.6960805584873538e-07, "loss": 0.76998198, "num_input_tokens_seen": 156861090, "step": 7251, "time_per_iteration": 2.666282892227173 }, { "auxiliary_loss_clip": 0.01085993, "auxiliary_loss_mlp": 0.01038042, "balance_loss_clip": 1.03795004, "balance_loss_mlp": 1.02359438, "epoch": 0.8720014429146876, "flos": 23403056100480.0, "grad_norm": 1.7490402741302493, "language_loss": 0.78339314, "learning_rate": 1.6929426203058684e-07, "loss": 0.8046335, "num_input_tokens_seen": 156881515, "step": 7252, "time_per_iteration": 2.6810529232025146 }, { "auxiliary_loss_clip": 0.0113561, "auxiliary_loss_mlp": 0.00773657, "balance_loss_clip": 1.04250598, "balance_loss_mlp": 1.00047302, "epoch": 0.8721216858053268, "flos": 24352821567360.0, "grad_norm": 2.1825804903668042, "language_loss": 0.80105758, "learning_rate": 1.689807459290431e-07, "loss": 0.8201502, "num_input_tokens_seen": 156900170, "step": 7253, "time_per_iteration": 2.569796562194824 }, { "auxiliary_loss_clip": 0.01104034, "auxiliary_loss_mlp": 0.01038088, "balance_loss_clip": 1.04077029, "balance_loss_mlp": 1.02328861, "epoch": 0.8722419286959658, "flos": 33869687034240.0, "grad_norm": 2.3807881177985855, "language_loss": 0.71350521, "learning_rate": 1.6866750759166437e-07, "loss": 0.73492646, "num_input_tokens_seen": 156920150, "step": 7254, "time_per_iteration": 2.716684341430664 }, { "auxiliary_loss_clip": 0.01091614, "auxiliary_loss_mlp": 0.01037611, "balance_loss_clip": 1.03589892, "balance_loss_mlp": 1.0220542, "epoch": 0.8723621715866049, "flos": 18369385914240.0, "grad_norm": 2.8207180466226163, "language_loss": 0.77381611, "learning_rate": 1.6835454706596865e-07, "loss": 0.79510832, "num_input_tokens_seen": 156937980, "step": 7255, "time_per_iteration": 2.66377592086792 }, { "auxiliary_loss_clip": 0.01134659, "auxiliary_loss_mlp": 0.01044503, "balance_loss_clip": 1.04477561, "balance_loss_mlp": 1.02924395, "epoch": 0.8724824144772441, "flos": 22013348855040.0, "grad_norm": 1.6974332177431934, "language_loss": 0.73857874, "learning_rate": 1.680418643994317e-07, "loss": 0.76037037, "num_input_tokens_seen": 156956550, "step": 7256, "time_per_iteration": 2.5710442066192627 }, { "auxiliary_loss_clip": 0.01034079, "auxiliary_loss_mlp": 0.01001267, "balance_loss_clip": 1.00633836, "balance_loss_mlp": 0.99970573, "epoch": 0.8726026573678831, "flos": 66698720213760.0, "grad_norm": 0.8806635245580998, "language_loss": 0.64470774, "learning_rate": 1.6772945963948738e-07, "loss": 0.66506124, "num_input_tokens_seen": 157014715, "step": 7257, "time_per_iteration": 3.1724679470062256 }, { "auxiliary_loss_clip": 0.01105723, "auxiliary_loss_mlp": 0.01039362, "balance_loss_clip": 1.04055858, "balance_loss_mlp": 1.02279162, "epoch": 0.8727229002585222, "flos": 13370908078080.0, "grad_norm": 3.2741084324913046, "language_loss": 0.77460808, "learning_rate": 1.6741733283352733e-07, "loss": 0.79605889, "num_input_tokens_seen": 157032320, "step": 7258, "time_per_iteration": 3.5473270416259766 }, { "auxiliary_loss_clip": 0.01086116, "auxiliary_loss_mlp": 0.01036087, "balance_loss_clip": 1.03727388, "balance_loss_mlp": 1.02016664, "epoch": 0.8728431431491613, "flos": 21796987282560.0, "grad_norm": 1.596141304045407, "language_loss": 0.83806813, "learning_rate": 1.6710548402890102e-07, "loss": 0.85929024, "num_input_tokens_seen": 157052845, "step": 7259, "time_per_iteration": 2.6882338523864746 }, { "auxiliary_loss_clip": 0.01137627, "auxiliary_loss_mlp": 0.01036639, "balance_loss_clip": 1.04339409, "balance_loss_mlp": 1.0203073, "epoch": 0.8729633860398004, "flos": 36173823742080.0, "grad_norm": 1.8532296486326145, "language_loss": 0.66354871, "learning_rate": 1.6679391327291527e-07, "loss": 0.68529129, "num_input_tokens_seen": 157074050, "step": 7260, "time_per_iteration": 3.4371683597564697 }, { "auxiliary_loss_clip": 0.01103167, "auxiliary_loss_mlp": 0.01036991, "balance_loss_clip": 1.03697824, "balance_loss_mlp": 1.0207901, "epoch": 0.8730836289304394, "flos": 16359680989440.0, "grad_norm": 2.463546536519267, "language_loss": 0.68078959, "learning_rate": 1.6648262061283492e-07, "loss": 0.70219123, "num_input_tokens_seen": 157089350, "step": 7261, "time_per_iteration": 2.5805532932281494 }, { "auxiliary_loss_clip": 0.01095937, "auxiliary_loss_mlp": 0.01030643, "balance_loss_clip": 1.03875864, "balance_loss_mlp": 1.01640964, "epoch": 0.8732038718210786, "flos": 21215126868480.0, "grad_norm": 2.2640580560935244, "language_loss": 0.73472488, "learning_rate": 1.6617160609588353e-07, "loss": 0.75599062, "num_input_tokens_seen": 157108525, "step": 7262, "time_per_iteration": 2.710148811340332 }, { "auxiliary_loss_clip": 0.01112409, "auxiliary_loss_mlp": 0.01037915, "balance_loss_clip": 1.04172277, "balance_loss_mlp": 1.02288854, "epoch": 0.8733241147117177, "flos": 16610696208000.0, "grad_norm": 2.586135977059674, "language_loss": 0.71919072, "learning_rate": 1.6586086976924163e-07, "loss": 0.74069393, "num_input_tokens_seen": 157124025, "step": 7263, "time_per_iteration": 3.596269130706787 }, { "auxiliary_loss_clip": 0.01117655, "auxiliary_loss_mlp": 0.01034555, "balance_loss_clip": 1.04051638, "balance_loss_mlp": 1.01842618, "epoch": 0.8734443576023567, "flos": 20193935207040.0, "grad_norm": 1.7842215005466353, "language_loss": 0.78503907, "learning_rate": 1.6555041168004747e-07, "loss": 0.80656123, "num_input_tokens_seen": 157143345, "step": 7264, "time_per_iteration": 2.5681164264678955 }, { "auxiliary_loss_clip": 0.01102361, "auxiliary_loss_mlp": 0.01039297, "balance_loss_clip": 1.0398078, "balance_loss_mlp": 1.02487898, "epoch": 0.8735646004929959, "flos": 18041162411520.0, "grad_norm": 1.8542917118842521, "language_loss": 0.69176739, "learning_rate": 1.6524023187539715e-07, "loss": 0.713184, "num_input_tokens_seen": 157161630, "step": 7265, "time_per_iteration": 3.505629301071167 }, { "auxiliary_loss_clip": 0.01110752, "auxiliary_loss_mlp": 0.01042122, "balance_loss_clip": 1.04223061, "balance_loss_mlp": 1.02619553, "epoch": 0.873684843383635, "flos": 20262344659200.0, "grad_norm": 3.3711171642043003, "language_loss": 0.74669141, "learning_rate": 1.649303304023446e-07, "loss": 0.76822013, "num_input_tokens_seen": 157181385, "step": 7266, "time_per_iteration": 2.630825996398926 }, { "auxiliary_loss_clip": 0.01088155, "auxiliary_loss_mlp": 0.01037919, "balance_loss_clip": 1.03747845, "balance_loss_mlp": 1.02363777, "epoch": 0.873805086274274, "flos": 16947287579520.0, "grad_norm": 1.6058993987930412, "language_loss": 0.79024869, "learning_rate": 1.6462070730790246e-07, "loss": 0.81150943, "num_input_tokens_seen": 157200545, "step": 7267, "time_per_iteration": 2.646606206893921 }, { "auxiliary_loss_clip": 0.01101327, "auxiliary_loss_mlp": 0.0104002, "balance_loss_clip": 1.03811383, "balance_loss_mlp": 1.02294922, "epoch": 0.8739253291649132, "flos": 18041270152320.0, "grad_norm": 2.366679030516387, "language_loss": 0.78696775, "learning_rate": 1.6431136263903912e-07, "loss": 0.8083812, "num_input_tokens_seen": 157219545, "step": 7268, "time_per_iteration": 2.5891804695129395 }, { "auxiliary_loss_clip": 0.01123566, "auxiliary_loss_mlp": 0.00772059, "balance_loss_clip": 1.04181552, "balance_loss_mlp": 1.00047886, "epoch": 0.8740455720555522, "flos": 21325085377920.0, "grad_norm": 1.97798079323948, "language_loss": 0.73596323, "learning_rate": 1.6400229644268282e-07, "loss": 0.75491941, "num_input_tokens_seen": 157237900, "step": 7269, "time_per_iteration": 2.6262402534484863 }, { "auxiliary_loss_clip": 0.01085697, "auxiliary_loss_mlp": 0.01036263, "balance_loss_clip": 1.0386076, "balance_loss_mlp": 1.02103412, "epoch": 0.8741658149461913, "flos": 15158684822400.0, "grad_norm": 2.002710438718593, "language_loss": 0.81360245, "learning_rate": 1.6369350876571852e-07, "loss": 0.83482206, "num_input_tokens_seen": 157256055, "step": 7270, "time_per_iteration": 2.623185157775879 }, { "auxiliary_loss_clip": 0.01078634, "auxiliary_loss_mlp": 0.01038685, "balance_loss_clip": 1.03763413, "balance_loss_mlp": 1.02278233, "epoch": 0.8742860578368304, "flos": 23039855729280.0, "grad_norm": 2.4206003988143325, "language_loss": 0.81767118, "learning_rate": 1.6338499965498874e-07, "loss": 0.83884442, "num_input_tokens_seen": 157274785, "step": 7271, "time_per_iteration": 2.6725409030914307 }, { "auxiliary_loss_clip": 0.01088067, "auxiliary_loss_mlp": 0.01040213, "balance_loss_clip": 1.03847599, "balance_loss_mlp": 1.02511489, "epoch": 0.8744063007274695, "flos": 28145347159680.0, "grad_norm": 2.4367025374106035, "language_loss": 0.77482998, "learning_rate": 1.630767691572943e-07, "loss": 0.79611278, "num_input_tokens_seen": 157294805, "step": 7272, "time_per_iteration": 2.706489086151123 }, { "auxiliary_loss_clip": 0.01017215, "auxiliary_loss_mlp": 0.01002351, "balance_loss_clip": 1.00745928, "balance_loss_mlp": 1.00089097, "epoch": 0.8745265436181086, "flos": 64034076654720.0, "grad_norm": 0.7399161773061764, "language_loss": 0.53502727, "learning_rate": 1.6276881731939306e-07, "loss": 0.55522287, "num_input_tokens_seen": 157356695, "step": 7273, "time_per_iteration": 3.2427799701690674 }, { "auxiliary_loss_clip": 0.01115947, "auxiliary_loss_mlp": 0.01037817, "balance_loss_clip": 1.04111934, "balance_loss_mlp": 1.02245688, "epoch": 0.8746467865087477, "flos": 28658618553600.0, "grad_norm": 2.2391521114834245, "language_loss": 0.75351727, "learning_rate": 1.6246114418800193e-07, "loss": 0.77505481, "num_input_tokens_seen": 157376975, "step": 7274, "time_per_iteration": 2.667034387588501 }, { "auxiliary_loss_clip": 0.01114933, "auxiliary_loss_mlp": 0.0103422, "balance_loss_clip": 1.04129171, "balance_loss_mlp": 1.01883066, "epoch": 0.8747670293993868, "flos": 23985850268160.0, "grad_norm": 1.824507853730724, "language_loss": 0.76485944, "learning_rate": 1.6215374980979423e-07, "loss": 0.78635097, "num_input_tokens_seen": 157397385, "step": 7275, "time_per_iteration": 2.590433359146118 }, { "auxiliary_loss_clip": 0.01113741, "auxiliary_loss_mlp": 0.01035676, "balance_loss_clip": 1.04073274, "balance_loss_mlp": 1.02039933, "epoch": 0.8748872722900258, "flos": 45221624478720.0, "grad_norm": 1.991005383273348, "language_loss": 0.68268484, "learning_rate": 1.6184663423140133e-07, "loss": 0.70417899, "num_input_tokens_seen": 157417685, "step": 7276, "time_per_iteration": 2.8005073070526123 }, { "auxiliary_loss_clip": 0.01081964, "auxiliary_loss_mlp": 0.01039822, "balance_loss_clip": 1.03787494, "balance_loss_mlp": 1.02374053, "epoch": 0.875007515180665, "flos": 19754280737280.0, "grad_norm": 2.188446372817707, "language_loss": 0.63978207, "learning_rate": 1.615397974994126e-07, "loss": 0.66099989, "num_input_tokens_seen": 157435490, "step": 7277, "time_per_iteration": 2.642016887664795 }, { "auxiliary_loss_clip": 0.01128042, "auxiliary_loss_mlp": 0.01034283, "balance_loss_clip": 1.04245853, "balance_loss_mlp": 1.01929808, "epoch": 0.875127758071304, "flos": 22710734386560.0, "grad_norm": 1.7707257831871257, "language_loss": 0.80731219, "learning_rate": 1.6123323966037438e-07, "loss": 0.82893538, "num_input_tokens_seen": 157454010, "step": 7278, "time_per_iteration": 2.5831055641174316 }, { "auxiliary_loss_clip": 0.01132502, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 1.04286575, "balance_loss_mlp": 1.01722693, "epoch": 0.8752480009619431, "flos": 23403846199680.0, "grad_norm": 1.9713728141610392, "language_loss": 0.78424716, "learning_rate": 1.6092696076079216e-07, "loss": 0.80589187, "num_input_tokens_seen": 157472385, "step": 7279, "time_per_iteration": 2.6115832328796387 }, { "auxiliary_loss_clip": 0.01086865, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.03866231, "balance_loss_mlp": 1.02481413, "epoch": 0.8753682438525822, "flos": 26213101914240.0, "grad_norm": 2.6344722891135786, "language_loss": 0.73572296, "learning_rate": 1.6062096084712785e-07, "loss": 0.7569927, "num_input_tokens_seen": 157493735, "step": 7280, "time_per_iteration": 2.7021124362945557 }, { "auxiliary_loss_clip": 0.01101126, "auxiliary_loss_mlp": 0.0077473, "balance_loss_clip": 1.03889894, "balance_loss_mlp": 1.00053275, "epoch": 0.8754884867432213, "flos": 23326745656320.0, "grad_norm": 1.862959403261925, "language_loss": 0.70848143, "learning_rate": 1.6031523996580098e-07, "loss": 0.72723997, "num_input_tokens_seen": 157511295, "step": 7281, "time_per_iteration": 2.621258497238159 }, { "auxiliary_loss_clip": 0.01104976, "auxiliary_loss_mlp": 0.01036109, "balance_loss_clip": 1.04171515, "balance_loss_mlp": 1.02018309, "epoch": 0.8756087296338604, "flos": 12495226412160.0, "grad_norm": 6.323172681861116, "language_loss": 0.66205692, "learning_rate": 1.6000979816318981e-07, "loss": 0.68346781, "num_input_tokens_seen": 157529760, "step": 7282, "time_per_iteration": 2.6454966068267822 }, { "auxiliary_loss_clip": 0.01115541, "auxiliary_loss_mlp": 0.01039185, "balance_loss_clip": 1.04205275, "balance_loss_mlp": 1.02434373, "epoch": 0.8757289725244994, "flos": 18952898353920.0, "grad_norm": 7.574248322075997, "language_loss": 0.74946249, "learning_rate": 1.5970463548562886e-07, "loss": 0.7710098, "num_input_tokens_seen": 157548915, "step": 7283, "time_per_iteration": 2.554758310317993 }, { "auxiliary_loss_clip": 0.01107375, "auxiliary_loss_mlp": 0.01037595, "balance_loss_clip": 1.04086351, "balance_loss_mlp": 1.02235997, "epoch": 0.8758492154151386, "flos": 25265958140160.0, "grad_norm": 1.86111937906789, "language_loss": 0.7121911, "learning_rate": 1.5939975197941192e-07, "loss": 0.73364085, "num_input_tokens_seen": 157570570, "step": 7284, "time_per_iteration": 2.6520192623138428 }, { "auxiliary_loss_clip": 0.01016892, "auxiliary_loss_mlp": 0.01002906, "balance_loss_clip": 1.00730252, "balance_loss_mlp": 1.00135612, "epoch": 0.8759694583057777, "flos": 65571664193280.0, "grad_norm": 0.8095514419230354, "language_loss": 0.53327072, "learning_rate": 1.5909514769078892e-07, "loss": 0.5534687, "num_input_tokens_seen": 157635675, "step": 7285, "time_per_iteration": 4.194343090057373 }, { "auxiliary_loss_clip": 0.01085106, "auxiliary_loss_mlp": 0.01032126, "balance_loss_clip": 1.04002357, "balance_loss_mlp": 1.01708817, "epoch": 0.8760897011964167, "flos": 25446193608960.0, "grad_norm": 1.5769582547514207, "language_loss": 0.77789772, "learning_rate": 1.5879082266596867e-07, "loss": 0.79907006, "num_input_tokens_seen": 157657015, "step": 7286, "time_per_iteration": 3.5953192710876465 }, { "auxiliary_loss_clip": 0.01100286, "auxiliary_loss_mlp": 0.01030945, "balance_loss_clip": 1.03643179, "balance_loss_mlp": 1.01661646, "epoch": 0.8762099440870559, "flos": 28984830894720.0, "grad_norm": 1.6714752862925455, "language_loss": 0.71756816, "learning_rate": 1.5848677695111645e-07, "loss": 0.73888052, "num_input_tokens_seen": 157678615, "step": 7287, "time_per_iteration": 2.662255048751831 }, { "auxiliary_loss_clip": 0.01103637, "auxiliary_loss_mlp": 0.0104813, "balance_loss_clip": 1.04175365, "balance_loss_mlp": 1.03086901, "epoch": 0.8763301869776949, "flos": 21609461352960.0, "grad_norm": 2.207045646112015, "language_loss": 0.69421732, "learning_rate": 1.5818301059235562e-07, "loss": 0.71573502, "num_input_tokens_seen": 157693790, "step": 7288, "time_per_iteration": 2.6211562156677246 }, { "auxiliary_loss_clip": 0.01107941, "auxiliary_loss_mlp": 0.01039418, "balance_loss_clip": 1.03942442, "balance_loss_mlp": 1.02257395, "epoch": 0.876450429868334, "flos": 24644416176000.0, "grad_norm": 2.5565338245923224, "language_loss": 0.8124876, "learning_rate": 1.578795236357684e-07, "loss": 0.83396119, "num_input_tokens_seen": 157715255, "step": 7289, "time_per_iteration": 3.6846532821655273 }, { "auxiliary_loss_clip": 0.01109048, "auxiliary_loss_mlp": 0.01036908, "balance_loss_clip": 1.04349852, "balance_loss_mlp": 1.02170241, "epoch": 0.8765706727589732, "flos": 20260046188800.0, "grad_norm": 2.8663370879119183, "language_loss": 0.85394579, "learning_rate": 1.5757631612739218e-07, "loss": 0.87540531, "num_input_tokens_seen": 157728800, "step": 7290, "time_per_iteration": 2.619459390640259 }, { "auxiliary_loss_clip": 0.0103423, "auxiliary_loss_mlp": 0.0100209, "balance_loss_clip": 1.0063827, "balance_loss_mlp": 1.00045729, "epoch": 0.8766909156496122, "flos": 71371165276800.0, "grad_norm": 0.7770792174516027, "language_loss": 0.61398888, "learning_rate": 1.572733881132242e-07, "loss": 0.63435209, "num_input_tokens_seen": 157789445, "step": 7291, "time_per_iteration": 4.095687389373779 }, { "auxiliary_loss_clip": 0.01007493, "auxiliary_loss_mlp": 0.01002778, "balance_loss_clip": 1.01244915, "balance_loss_mlp": 1.00121617, "epoch": 0.8768111585402513, "flos": 69523490603520.0, "grad_norm": 0.7807669341959483, "language_loss": 0.58535212, "learning_rate": 1.5697073963921814e-07, "loss": 0.6054548, "num_input_tokens_seen": 157848685, "step": 7292, "time_per_iteration": 3.181441068649292 }, { "auxiliary_loss_clip": 0.01119426, "auxiliary_loss_mlp": 0.01040055, "balance_loss_clip": 1.04082561, "balance_loss_mlp": 1.02384257, "epoch": 0.8769314014308904, "flos": 18838558385280.0, "grad_norm": 2.1684442812352094, "language_loss": 0.85087472, "learning_rate": 1.566683707512857e-07, "loss": 0.87246954, "num_input_tokens_seen": 157866360, "step": 7293, "time_per_iteration": 2.5832743644714355 }, { "auxiliary_loss_clip": 0.01104094, "auxiliary_loss_mlp": 0.01036445, "balance_loss_clip": 1.0396812, "balance_loss_mlp": 1.01994658, "epoch": 0.8770516443215295, "flos": 14976402278400.0, "grad_norm": 1.9799157550633504, "language_loss": 0.79370642, "learning_rate": 1.5636628149529553e-07, "loss": 0.81511188, "num_input_tokens_seen": 157884150, "step": 7294, "time_per_iteration": 2.626389503479004 }, { "auxiliary_loss_clip": 0.0110442, "auxiliary_loss_mlp": 0.01039189, "balance_loss_clip": 1.03902864, "balance_loss_mlp": 1.02305961, "epoch": 0.8771718872121685, "flos": 31649654021760.0, "grad_norm": 1.9260918988108746, "language_loss": 0.79538816, "learning_rate": 1.560644719170743e-07, "loss": 0.81682426, "num_input_tokens_seen": 157905020, "step": 7295, "time_per_iteration": 2.677565336227417 }, { "auxiliary_loss_clip": 0.01095715, "auxiliary_loss_mlp": 0.01042773, "balance_loss_clip": 1.03816724, "balance_loss_mlp": 1.02499914, "epoch": 0.8772921301028077, "flos": 36095466222720.0, "grad_norm": 2.0203394551246516, "language_loss": 0.72483552, "learning_rate": 1.5576294206240692e-07, "loss": 0.74622041, "num_input_tokens_seen": 157924545, "step": 7296, "time_per_iteration": 2.725494623184204 }, { "auxiliary_loss_clip": 0.01102884, "auxiliary_loss_mlp": 0.01043247, "balance_loss_clip": 1.04108644, "balance_loss_mlp": 1.02826214, "epoch": 0.8774123729934468, "flos": 57116961849600.0, "grad_norm": 1.895981441111914, "language_loss": 0.67747557, "learning_rate": 1.5546169197703507e-07, "loss": 0.69893688, "num_input_tokens_seen": 157950820, "step": 7297, "time_per_iteration": 2.949920892715454 }, { "auxiliary_loss_clip": 0.01113933, "auxiliary_loss_mlp": 0.01039993, "balance_loss_clip": 1.04082465, "balance_loss_mlp": 1.02423978, "epoch": 0.8775326158840858, "flos": 23914495900800.0, "grad_norm": 2.5051854224622443, "language_loss": 0.770863, "learning_rate": 1.5516072170665774e-07, "loss": 0.79240227, "num_input_tokens_seen": 157968790, "step": 7298, "time_per_iteration": 2.6137728691101074 }, { "auxiliary_loss_clip": 0.01119069, "auxiliary_loss_mlp": 0.0103473, "balance_loss_clip": 1.04231453, "balance_loss_mlp": 1.0196625, "epoch": 0.877652858774725, "flos": 17123285243520.0, "grad_norm": 1.897415910834055, "language_loss": 0.86923093, "learning_rate": 1.5486003129693214e-07, "loss": 0.89076889, "num_input_tokens_seen": 157986155, "step": 7299, "time_per_iteration": 2.546532154083252 }, { "auxiliary_loss_clip": 0.01120709, "auxiliary_loss_mlp": 0.0104189, "balance_loss_clip": 1.0403254, "balance_loss_mlp": 1.02706099, "epoch": 0.877773101665364, "flos": 16508961912960.0, "grad_norm": 1.9271317974580582, "language_loss": 0.78022599, "learning_rate": 1.545596207934725e-07, "loss": 0.80185199, "num_input_tokens_seen": 158004640, "step": 7300, "time_per_iteration": 2.575183868408203 }, { "auxiliary_loss_clip": 0.01099399, "auxiliary_loss_mlp": 0.01036724, "balance_loss_clip": 1.03974605, "balance_loss_mlp": 1.02194238, "epoch": 0.8778933445560031, "flos": 22053209973120.0, "grad_norm": 1.9078084561684305, "language_loss": 0.77642149, "learning_rate": 1.5425949024185147e-07, "loss": 0.79778278, "num_input_tokens_seen": 158024665, "step": 7301, "time_per_iteration": 2.648778200149536 }, { "auxiliary_loss_clip": 0.01107414, "auxiliary_loss_mlp": 0.01037597, "balance_loss_clip": 1.03882837, "balance_loss_mlp": 1.02189755, "epoch": 0.8780135874466423, "flos": 22564757514240.0, "grad_norm": 2.0355696790106985, "language_loss": 0.67667389, "learning_rate": 1.5395963968759818e-07, "loss": 0.69812399, "num_input_tokens_seen": 158044940, "step": 7302, "time_per_iteration": 2.635641098022461 }, { "auxiliary_loss_clip": 0.01104788, "auxiliary_loss_mlp": 0.01035624, "balance_loss_clip": 1.03876495, "balance_loss_mlp": 1.0200907, "epoch": 0.8781338303372813, "flos": 61531999073280.0, "grad_norm": 1.8344042139193995, "language_loss": 0.6483407, "learning_rate": 1.536600691761998e-07, "loss": 0.66974485, "num_input_tokens_seen": 158070770, "step": 7303, "time_per_iteration": 2.9688467979431152 }, { "auxiliary_loss_clip": 0.01097537, "auxiliary_loss_mlp": 0.01037645, "balance_loss_clip": 1.04005182, "balance_loss_mlp": 1.02291036, "epoch": 0.8782540732279204, "flos": 22674751937280.0, "grad_norm": 1.8003368423008588, "language_loss": 0.71455097, "learning_rate": 1.5336077875310084e-07, "loss": 0.73590279, "num_input_tokens_seen": 158089995, "step": 7304, "time_per_iteration": 2.647489070892334 }, { "auxiliary_loss_clip": 0.01084845, "auxiliary_loss_mlp": 0.01039561, "balance_loss_clip": 1.03712702, "balance_loss_mlp": 1.02467155, "epoch": 0.8783743161185595, "flos": 16070348937600.0, "grad_norm": 2.2161250186251755, "language_loss": 0.73886812, "learning_rate": 1.5306176846370321e-07, "loss": 0.76011217, "num_input_tokens_seen": 158108140, "step": 7305, "time_per_iteration": 2.6730105876922607 }, { "auxiliary_loss_clip": 0.01114145, "auxiliary_loss_mlp": 0.01043911, "balance_loss_clip": 1.04244375, "balance_loss_mlp": 1.027722, "epoch": 0.8784945590091986, "flos": 26067879227520.0, "grad_norm": 2.216678086530904, "language_loss": 0.74254018, "learning_rate": 1.5276303835336712e-07, "loss": 0.76412076, "num_input_tokens_seen": 158128680, "step": 7306, "time_per_iteration": 2.704253673553467 }, { "auxiliary_loss_clip": 0.01025575, "auxiliary_loss_mlp": 0.01001671, "balance_loss_clip": 1.00677466, "balance_loss_mlp": 1.0001328, "epoch": 0.8786148018998376, "flos": 62720643939840.0, "grad_norm": 0.7634265751836548, "language_loss": 0.53506261, "learning_rate": 1.524645884674094e-07, "loss": 0.55533504, "num_input_tokens_seen": 158185610, "step": 7307, "time_per_iteration": 3.197124481201172 }, { "auxiliary_loss_clip": 0.01134087, "auxiliary_loss_mlp": 0.00772739, "balance_loss_clip": 1.04360449, "balance_loss_mlp": 1.00049782, "epoch": 0.8787350447904768, "flos": 21652734263040.0, "grad_norm": 2.2971554887941545, "language_loss": 0.79068255, "learning_rate": 1.521664188511047e-07, "loss": 0.8097508, "num_input_tokens_seen": 158205635, "step": 7308, "time_per_iteration": 2.579627275466919 }, { "auxiliary_loss_clip": 0.01102159, "auxiliary_loss_mlp": 0.00770518, "balance_loss_clip": 1.04131854, "balance_loss_mlp": 1.00052238, "epoch": 0.8788552876811159, "flos": 25478476957440.0, "grad_norm": 1.8990008810015275, "language_loss": 0.80328214, "learning_rate": 1.518685295496851e-07, "loss": 0.82200891, "num_input_tokens_seen": 158223495, "step": 7309, "time_per_iteration": 2.7031407356262207 }, { "auxiliary_loss_clip": 0.01118987, "auxiliary_loss_mlp": 0.01041981, "balance_loss_clip": 1.04197717, "balance_loss_mlp": 1.02743769, "epoch": 0.8789755305717549, "flos": 22310222762880.0, "grad_norm": 1.94953290013351, "language_loss": 0.85574281, "learning_rate": 1.5157092060833975e-07, "loss": 0.8773526, "num_input_tokens_seen": 158243145, "step": 7310, "time_per_iteration": 3.5226893424987793 }, { "auxiliary_loss_clip": 0.01102589, "auxiliary_loss_mlp": 0.01042063, "balance_loss_clip": 1.03809166, "balance_loss_mlp": 1.02456295, "epoch": 0.879095773462394, "flos": 29310971408640.0, "grad_norm": 1.7186496440396364, "language_loss": 0.66149861, "learning_rate": 1.5127359207221658e-07, "loss": 0.68294513, "num_input_tokens_seen": 158262625, "step": 7311, "time_per_iteration": 2.6583919525146484 }, { "auxiliary_loss_clip": 0.01060016, "auxiliary_loss_mlp": 0.01051596, "balance_loss_clip": 1.03343737, "balance_loss_mlp": 1.03192651, "epoch": 0.8792160163530331, "flos": 16690023394560.0, "grad_norm": 2.0805522079602627, "language_loss": 0.73220193, "learning_rate": 1.5097654398641923e-07, "loss": 0.75331807, "num_input_tokens_seen": 158280530, "step": 7312, "time_per_iteration": 3.69272518157959 }, { "auxiliary_loss_clip": 0.01122017, "auxiliary_loss_mlp": 0.01037408, "balance_loss_clip": 1.04315615, "balance_loss_mlp": 1.02227998, "epoch": 0.8793362592436722, "flos": 24499301230080.0, "grad_norm": 1.4558607954185991, "language_loss": 0.73031187, "learning_rate": 1.5067977639601014e-07, "loss": 0.7519061, "num_input_tokens_seen": 158303290, "step": 7313, "time_per_iteration": 2.6256887912750244 }, { "auxiliary_loss_clip": 0.01103658, "auxiliary_loss_mlp": 0.01031991, "balance_loss_clip": 1.04187965, "balance_loss_mlp": 1.01767397, "epoch": 0.8794565021343113, "flos": 14538399834240.0, "grad_norm": 2.280886154437622, "language_loss": 0.71017087, "learning_rate": 1.5038328934600864e-07, "loss": 0.73152733, "num_input_tokens_seen": 158319925, "step": 7314, "time_per_iteration": 2.611132860183716 }, { "auxiliary_loss_clip": 0.01110201, "auxiliary_loss_mlp": 0.01038959, "balance_loss_clip": 1.04032314, "balance_loss_mlp": 1.02404046, "epoch": 0.8795767450249504, "flos": 39530286224640.0, "grad_norm": 2.4794254455663807, "language_loss": 0.70220679, "learning_rate": 1.5008708288139161e-07, "loss": 0.72369844, "num_input_tokens_seen": 158342285, "step": 7315, "time_per_iteration": 3.751861810684204 }, { "auxiliary_loss_clip": 0.01115265, "auxiliary_loss_mlp": 0.01042397, "balance_loss_clip": 1.04139221, "balance_loss_mlp": 1.0270431, "epoch": 0.8796969879155895, "flos": 22960672197120.0, "grad_norm": 1.8719418944332302, "language_loss": 0.73473787, "learning_rate": 1.497911570470931e-07, "loss": 0.75631458, "num_input_tokens_seen": 158362290, "step": 7316, "time_per_iteration": 2.6303274631500244 }, { "auxiliary_loss_clip": 0.01086786, "auxiliary_loss_mlp": 0.01034625, "balance_loss_clip": 1.03833699, "balance_loss_mlp": 1.01936567, "epoch": 0.8798172308062285, "flos": 28362427004160.0, "grad_norm": 1.6923450888366742, "language_loss": 0.8553279, "learning_rate": 1.494955118880048e-07, "loss": 0.87654203, "num_input_tokens_seen": 158383275, "step": 7317, "time_per_iteration": 3.563977003097534 }, { "auxiliary_loss_clip": 0.01119227, "auxiliary_loss_mlp": 0.010358, "balance_loss_clip": 1.04226351, "balance_loss_mlp": 1.02075005, "epoch": 0.8799374736968677, "flos": 23988974751360.0, "grad_norm": 1.6462625697431819, "language_loss": 0.72853673, "learning_rate": 1.4920014744897634e-07, "loss": 0.75008702, "num_input_tokens_seen": 158402690, "step": 7318, "time_per_iteration": 2.600872039794922 }, { "auxiliary_loss_clip": 0.01099265, "auxiliary_loss_mlp": 0.01052139, "balance_loss_clip": 1.0394814, "balance_loss_mlp": 1.03528249, "epoch": 0.8800577165875068, "flos": 25630271832960.0, "grad_norm": 2.0214248894191686, "language_loss": 0.86444414, "learning_rate": 1.4890506377481392e-07, "loss": 0.88595819, "num_input_tokens_seen": 158421780, "step": 7319, "time_per_iteration": 2.6423888206481934 }, { "auxiliary_loss_clip": 0.01061846, "auxiliary_loss_mlp": 0.01042148, "balance_loss_clip": 1.03722262, "balance_loss_mlp": 1.02829039, "epoch": 0.8801779594781458, "flos": 23440331439360.0, "grad_norm": 1.4701457566819935, "language_loss": 0.64089561, "learning_rate": 1.486102609102815e-07, "loss": 0.66193557, "num_input_tokens_seen": 158442330, "step": 7320, "time_per_iteration": 2.737551689147949 }, { "auxiliary_loss_clip": 0.01101802, "auxiliary_loss_mlp": 0.01034457, "balance_loss_clip": 1.03942263, "balance_loss_mlp": 1.01981211, "epoch": 0.880298202368785, "flos": 11508580656000.0, "grad_norm": 3.191253645637348, "language_loss": 0.85616338, "learning_rate": 1.483157389001004e-07, "loss": 0.87752599, "num_input_tokens_seen": 158459890, "step": 7321, "time_per_iteration": 2.6437106132507324 }, { "auxiliary_loss_clip": 0.01104496, "auxiliary_loss_mlp": 0.01038267, "balance_loss_clip": 1.0370822, "balance_loss_mlp": 1.02151847, "epoch": 0.880418445259424, "flos": 22671447886080.0, "grad_norm": 2.1428655185882457, "language_loss": 0.78740597, "learning_rate": 1.4802149778894933e-07, "loss": 0.8088336, "num_input_tokens_seen": 158478680, "step": 7322, "time_per_iteration": 2.6262383460998535 }, { "auxiliary_loss_clip": 0.01111068, "auxiliary_loss_mlp": 0.0104128, "balance_loss_clip": 1.03780329, "balance_loss_mlp": 1.02666521, "epoch": 0.8805386881500631, "flos": 20522158709760.0, "grad_norm": 1.6757307709180134, "language_loss": 0.87423772, "learning_rate": 1.4772753762146484e-07, "loss": 0.89576125, "num_input_tokens_seen": 158497935, "step": 7323, "time_per_iteration": 2.640324115753174 }, { "auxiliary_loss_clip": 0.01114555, "auxiliary_loss_mlp": 0.0103668, "balance_loss_clip": 1.04000258, "balance_loss_mlp": 1.02057517, "epoch": 0.8806589310407023, "flos": 36538891620480.0, "grad_norm": 1.8408452204558592, "language_loss": 0.7053107, "learning_rate": 1.474338584422401e-07, "loss": 0.72682309, "num_input_tokens_seen": 158523145, "step": 7324, "time_per_iteration": 2.7122819423675537 }, { "auxiliary_loss_clip": 0.0111516, "auxiliary_loss_mlp": 0.01038279, "balance_loss_clip": 1.0410428, "balance_loss_mlp": 1.02403927, "epoch": 0.8807791739313413, "flos": 23440187784960.0, "grad_norm": 3.0475143034512975, "language_loss": 0.76108861, "learning_rate": 1.4714046029582595e-07, "loss": 0.78262293, "num_input_tokens_seen": 158542210, "step": 7325, "time_per_iteration": 2.614825963973999 }, { "auxiliary_loss_clip": 0.01094021, "auxiliary_loss_mlp": 0.01038927, "balance_loss_clip": 1.03855896, "balance_loss_mlp": 1.02345347, "epoch": 0.8808994168219804, "flos": 25956843310080.0, "grad_norm": 1.700400328530948, "language_loss": 0.75825906, "learning_rate": 1.46847343226731e-07, "loss": 0.77958846, "num_input_tokens_seen": 158563250, "step": 7326, "time_per_iteration": 2.7005200386047363 }, { "auxiliary_loss_clip": 0.01123429, "auxiliary_loss_mlp": 0.010392, "balance_loss_clip": 1.0427897, "balance_loss_mlp": 1.02372682, "epoch": 0.8810196597126195, "flos": 17092079303040.0, "grad_norm": 2.4308206934900487, "language_loss": 0.69639874, "learning_rate": 1.465545072794203e-07, "loss": 0.71802503, "num_input_tokens_seen": 158581125, "step": 7327, "time_per_iteration": 2.572843074798584 }, { "auxiliary_loss_clip": 0.01074683, "auxiliary_loss_mlp": 0.01039224, "balance_loss_clip": 1.03796506, "balance_loss_mlp": 1.02386427, "epoch": 0.8811399026032586, "flos": 23002831785600.0, "grad_norm": 3.733652943002584, "language_loss": 0.75861013, "learning_rate": 1.4626195249831774e-07, "loss": 0.77974921, "num_input_tokens_seen": 158602025, "step": 7328, "time_per_iteration": 2.760357141494751 }, { "auxiliary_loss_clip": 0.01114491, "auxiliary_loss_mlp": 0.0103663, "balance_loss_clip": 1.03858757, "balance_loss_mlp": 1.02202702, "epoch": 0.8812601454938976, "flos": 14463813242880.0, "grad_norm": 2.005794369063549, "language_loss": 0.71674877, "learning_rate": 1.4596967892780244e-07, "loss": 0.73825997, "num_input_tokens_seen": 158618355, "step": 7329, "time_per_iteration": 2.5753488540649414 }, { "auxiliary_loss_clip": 0.01129321, "auxiliary_loss_mlp": 0.01038493, "balance_loss_clip": 1.04227805, "balance_loss_mlp": 1.02361548, "epoch": 0.8813803883845368, "flos": 22493223578880.0, "grad_norm": 1.883954629252848, "language_loss": 0.74972785, "learning_rate": 1.4567768661221314e-07, "loss": 0.77140599, "num_input_tokens_seen": 158638925, "step": 7330, "time_per_iteration": 2.56113862991333 }, { "auxiliary_loss_clip": 0.01125721, "auxiliary_loss_mlp": 0.00772333, "balance_loss_clip": 1.04355729, "balance_loss_mlp": 1.00058556, "epoch": 0.8815006312751759, "flos": 21506901045120.0, "grad_norm": 2.049868743295714, "language_loss": 0.74479115, "learning_rate": 1.4538597559584442e-07, "loss": 0.76377165, "num_input_tokens_seen": 158656715, "step": 7331, "time_per_iteration": 2.597254991531372 }, { "auxiliary_loss_clip": 0.01102744, "auxiliary_loss_mlp": 0.01035882, "balance_loss_clip": 1.03826928, "balance_loss_mlp": 1.02015829, "epoch": 0.8816208741658149, "flos": 22784566792320.0, "grad_norm": 1.851551728287007, "language_loss": 0.78896493, "learning_rate": 1.4509454592294823e-07, "loss": 0.81035119, "num_input_tokens_seen": 158677200, "step": 7332, "time_per_iteration": 2.6507151126861572 }, { "auxiliary_loss_clip": 0.01092338, "auxiliary_loss_mlp": 0.0077252, "balance_loss_clip": 1.03768063, "balance_loss_mlp": 1.00050712, "epoch": 0.8817411170564541, "flos": 17779409026560.0, "grad_norm": 3.0846528444896917, "language_loss": 0.79037243, "learning_rate": 1.448033976377354e-07, "loss": 0.80902106, "num_input_tokens_seen": 158692185, "step": 7333, "time_per_iteration": 2.6454052925109863 }, { "auxiliary_loss_clip": 0.01120092, "auxiliary_loss_mlp": 0.01036701, "balance_loss_clip": 1.04027629, "balance_loss_mlp": 1.02199042, "epoch": 0.8818613599470931, "flos": 18551812112640.0, "grad_norm": 1.9220452386130102, "language_loss": 0.74233603, "learning_rate": 1.445125307843713e-07, "loss": 0.76390398, "num_input_tokens_seen": 158710410, "step": 7334, "time_per_iteration": 2.722554922103882 }, { "auxiliary_loss_clip": 0.01116077, "auxiliary_loss_mlp": 0.01037718, "balance_loss_clip": 1.04027891, "balance_loss_mlp": 1.023103, "epoch": 0.8819816028377322, "flos": 27599792417280.0, "grad_norm": 2.101626408526927, "language_loss": 0.75901365, "learning_rate": 1.442219454069813e-07, "loss": 0.78055167, "num_input_tokens_seen": 158731435, "step": 7335, "time_per_iteration": 2.6539385318756104 }, { "auxiliary_loss_clip": 0.01083987, "auxiliary_loss_mlp": 0.01034634, "balance_loss_clip": 1.03799605, "balance_loss_mlp": 1.0188508, "epoch": 0.8821018457283714, "flos": 23404600385280.0, "grad_norm": 2.2673497309609676, "language_loss": 0.67098194, "learning_rate": 1.4393164154964676e-07, "loss": 0.69216812, "num_input_tokens_seen": 158750965, "step": 7336, "time_per_iteration": 3.7242367267608643 }, { "auxiliary_loss_clip": 0.01116438, "auxiliary_loss_mlp": 0.01036915, "balance_loss_clip": 1.04316497, "balance_loss_mlp": 1.02275252, "epoch": 0.8822220886190104, "flos": 29132459792640.0, "grad_norm": 2.0150170448077143, "language_loss": 0.94078648, "learning_rate": 1.4364161925640649e-07, "loss": 0.96232003, "num_input_tokens_seen": 158772365, "step": 7337, "time_per_iteration": 2.641629219055176 }, { "auxiliary_loss_clip": 0.01131977, "auxiliary_loss_mlp": 0.01037283, "balance_loss_clip": 1.04389262, "balance_loss_mlp": 1.02226841, "epoch": 0.8823423315096495, "flos": 20485422074880.0, "grad_norm": 2.0197281138497254, "language_loss": 0.85229242, "learning_rate": 1.4335187857125663e-07, "loss": 0.87398499, "num_input_tokens_seen": 158791065, "step": 7338, "time_per_iteration": 3.4561753273010254 }, { "auxiliary_loss_clip": 0.01121202, "auxiliary_loss_mlp": 0.01038198, "balance_loss_clip": 1.04182982, "balance_loss_mlp": 1.02315927, "epoch": 0.8824625744002886, "flos": 24206377818240.0, "grad_norm": 1.860043846533202, "language_loss": 0.75623441, "learning_rate": 1.4306241953815023e-07, "loss": 0.77782845, "num_input_tokens_seen": 158812125, "step": 7339, "time_per_iteration": 2.65486216545105 }, { "auxiliary_loss_clip": 0.01117483, "auxiliary_loss_mlp": 0.010354, "balance_loss_clip": 1.03920639, "balance_loss_mlp": 1.02010512, "epoch": 0.8825828172909277, "flos": 24679500785280.0, "grad_norm": 1.61587208996169, "language_loss": 0.70802999, "learning_rate": 1.4277324220099862e-07, "loss": 0.72955883, "num_input_tokens_seen": 158834035, "step": 7340, "time_per_iteration": 2.5913074016571045 }, { "auxiliary_loss_clip": 0.01088323, "auxiliary_loss_mlp": 0.01035891, "balance_loss_clip": 1.0370928, "balance_loss_mlp": 1.02083516, "epoch": 0.8827030601815667, "flos": 22456163721600.0, "grad_norm": 1.9799686928004028, "language_loss": 0.74572271, "learning_rate": 1.4248434660366938e-07, "loss": 0.76696485, "num_input_tokens_seen": 158853510, "step": 7341, "time_per_iteration": 3.5450665950775146 }, { "auxiliary_loss_clip": 0.01107545, "auxiliary_loss_mlp": 0.01039999, "balance_loss_clip": 1.04061437, "balance_loss_mlp": 1.02458489, "epoch": 0.8828233030722058, "flos": 19865639877120.0, "grad_norm": 2.0184609998220444, "language_loss": 0.70772052, "learning_rate": 1.4219573278998808e-07, "loss": 0.72919595, "num_input_tokens_seen": 158871970, "step": 7342, "time_per_iteration": 2.6144421100616455 }, { "auxiliary_loss_clip": 0.01106142, "auxiliary_loss_mlp": 0.01052347, "balance_loss_clip": 1.03822434, "balance_loss_mlp": 1.03439403, "epoch": 0.882943545962845, "flos": 39347213581440.0, "grad_norm": 2.113187411542641, "language_loss": 0.6510365, "learning_rate": 1.4190740080373685e-07, "loss": 0.67262137, "num_input_tokens_seen": 158892250, "step": 7343, "time_per_iteration": 3.5686962604522705 }, { "auxiliary_loss_clip": 0.01082265, "auxiliary_loss_mlp": 0.01041564, "balance_loss_clip": 1.03857088, "balance_loss_mlp": 1.02420723, "epoch": 0.883063788853484, "flos": 19054524908160.0, "grad_norm": 2.3025238338047953, "language_loss": 0.84218544, "learning_rate": 1.4161935068865538e-07, "loss": 0.86342371, "num_input_tokens_seen": 158907395, "step": 7344, "time_per_iteration": 2.6736865043640137 }, { "auxiliary_loss_clip": 0.01133007, "auxiliary_loss_mlp": 0.01045329, "balance_loss_clip": 1.04268873, "balance_loss_mlp": 1.02980781, "epoch": 0.8831840317441231, "flos": 18733196816640.0, "grad_norm": 1.8174570049272092, "language_loss": 0.75747383, "learning_rate": 1.4133158248844113e-07, "loss": 0.77925718, "num_input_tokens_seen": 158926300, "step": 7345, "time_per_iteration": 2.53194260597229 }, { "auxiliary_loss_clip": 0.01097797, "auxiliary_loss_mlp": 0.01033722, "balance_loss_clip": 1.03891015, "balance_loss_mlp": 1.01786721, "epoch": 0.8833042746347622, "flos": 26827712553600.0, "grad_norm": 2.0011721207722273, "language_loss": 0.73463261, "learning_rate": 1.4104409624674785e-07, "loss": 0.75594777, "num_input_tokens_seen": 158946085, "step": 7346, "time_per_iteration": 2.6595072746276855 }, { "auxiliary_loss_clip": 0.01122048, "auxiliary_loss_mlp": 0.01043116, "balance_loss_clip": 1.04540098, "balance_loss_mlp": 1.02717817, "epoch": 0.8834245175254013, "flos": 26104077158400.0, "grad_norm": 1.7088233192541868, "language_loss": 0.78750867, "learning_rate": 1.407568920071873e-07, "loss": 0.80916035, "num_input_tokens_seen": 158964950, "step": 7347, "time_per_iteration": 2.6188271045684814 }, { "auxiliary_loss_clip": 0.01136567, "auxiliary_loss_mlp": 0.0103914, "balance_loss_clip": 1.04347038, "balance_loss_mlp": 1.02272499, "epoch": 0.8835447604160404, "flos": 30629036977920.0, "grad_norm": 2.865976727022638, "language_loss": 0.68560201, "learning_rate": 1.4046996981332782e-07, "loss": 0.70735908, "num_input_tokens_seen": 158984835, "step": 7348, "time_per_iteration": 2.6425631046295166 }, { "auxiliary_loss_clip": 0.01095948, "auxiliary_loss_mlp": 0.01037832, "balance_loss_clip": 1.03765357, "balance_loss_mlp": 1.02130961, "epoch": 0.8836650033066795, "flos": 24718356322560.0, "grad_norm": 2.183211180328889, "language_loss": 0.78318167, "learning_rate": 1.4018332970869516e-07, "loss": 0.80451953, "num_input_tokens_seen": 159002775, "step": 7349, "time_per_iteration": 2.6488282680511475 }, { "auxiliary_loss_clip": 0.01097292, "auxiliary_loss_mlp": 0.01036908, "balance_loss_clip": 1.03961992, "balance_loss_mlp": 1.02243626, "epoch": 0.8837852461973186, "flos": 25413371556480.0, "grad_norm": 1.7363038349988922, "language_loss": 0.84923881, "learning_rate": 1.398969717367733e-07, "loss": 0.87058079, "num_input_tokens_seen": 159024100, "step": 7350, "time_per_iteration": 2.7232770919799805 }, { "auxiliary_loss_clip": 0.0108004, "auxiliary_loss_mlp": 0.01032242, "balance_loss_clip": 1.03768253, "balance_loss_mlp": 1.01751399, "epoch": 0.8839054890879576, "flos": 17822574195840.0, "grad_norm": 1.9498282549102364, "language_loss": 0.76302397, "learning_rate": 1.396108959410014e-07, "loss": 0.78414673, "num_input_tokens_seen": 159043315, "step": 7351, "time_per_iteration": 2.6454007625579834 }, { "auxiliary_loss_clip": 0.01118412, "auxiliary_loss_mlp": 0.00772162, "balance_loss_clip": 1.04067755, "balance_loss_mlp": 1.00046778, "epoch": 0.8840257319785968, "flos": 23769021818880.0, "grad_norm": 1.5840562394224613, "language_loss": 0.81455636, "learning_rate": 1.3932510236477745e-07, "loss": 0.83346218, "num_input_tokens_seen": 159063985, "step": 7352, "time_per_iteration": 2.6372780799865723 }, { "auxiliary_loss_clip": 0.01118833, "auxiliary_loss_mlp": 0.01043886, "balance_loss_clip": 1.03896236, "balance_loss_mlp": 1.02754843, "epoch": 0.8841459748692359, "flos": 29059776622080.0, "grad_norm": 1.810705304240065, "language_loss": 0.55932629, "learning_rate": 1.3903959105145636e-07, "loss": 0.58095342, "num_input_tokens_seen": 159084475, "step": 7353, "time_per_iteration": 2.635404586791992 }, { "auxiliary_loss_clip": 0.01130647, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.04316723, "balance_loss_mlp": 1.02192688, "epoch": 0.8842662177598749, "flos": 24311523905280.0, "grad_norm": 2.019048051423795, "language_loss": 0.8306753, "learning_rate": 1.387543620443492e-07, "loss": 0.85235149, "num_input_tokens_seen": 159101320, "step": 7354, "time_per_iteration": 2.601066827774048 }, { "auxiliary_loss_clip": 0.01130996, "auxiliary_loss_mlp": 0.01047665, "balance_loss_clip": 1.04414761, "balance_loss_mlp": 1.03248978, "epoch": 0.8843864606505141, "flos": 25007867942400.0, "grad_norm": 5.060184295299158, "language_loss": 0.84428287, "learning_rate": 1.3846941538672606e-07, "loss": 0.8660695, "num_input_tokens_seen": 159120025, "step": 7355, "time_per_iteration": 2.6493759155273438 }, { "auxiliary_loss_clip": 0.01083642, "auxiliary_loss_mlp": 0.01041072, "balance_loss_clip": 1.03943622, "balance_loss_mlp": 1.02551532, "epoch": 0.8845067035411531, "flos": 28183915388160.0, "grad_norm": 2.3737118508242308, "language_loss": 0.80925906, "learning_rate": 1.3818475112181193e-07, "loss": 0.83050627, "num_input_tokens_seen": 159138820, "step": 7356, "time_per_iteration": 2.71610689163208 }, { "auxiliary_loss_clip": 0.011062, "auxiliary_loss_mlp": 0.01038331, "balance_loss_clip": 1.04124129, "balance_loss_mlp": 1.02372169, "epoch": 0.8846269464317922, "flos": 12853219311360.0, "grad_norm": 1.928461742071664, "language_loss": 0.79168147, "learning_rate": 1.3790036929279091e-07, "loss": 0.8131268, "num_input_tokens_seen": 159155975, "step": 7357, "time_per_iteration": 2.567143440246582 }, { "auxiliary_loss_clip": 0.01122758, "auxiliary_loss_mlp": 0.00771951, "balance_loss_clip": 1.0439086, "balance_loss_mlp": 1.00051594, "epoch": 0.8847471893224313, "flos": 18624351628800.0, "grad_norm": 2.5561493919020175, "language_loss": 0.59321713, "learning_rate": 1.3761626994280363e-07, "loss": 0.61216426, "num_input_tokens_seen": 159173445, "step": 7358, "time_per_iteration": 2.578068733215332 }, { "auxiliary_loss_clip": 0.01099268, "auxiliary_loss_mlp": 0.01032472, "balance_loss_clip": 1.03949523, "balance_loss_mlp": 1.01721919, "epoch": 0.8848674322130704, "flos": 35769433449600.0, "grad_norm": 2.318791121573648, "language_loss": 0.73371154, "learning_rate": 1.3733245311494735e-07, "loss": 0.75502896, "num_input_tokens_seen": 159196100, "step": 7359, "time_per_iteration": 2.7978103160858154 }, { "auxiliary_loss_clip": 0.01119629, "auxiliary_loss_mlp": 0.01044974, "balance_loss_clip": 1.04221439, "balance_loss_mlp": 1.0294466, "epoch": 0.8849876751037095, "flos": 24243760897920.0, "grad_norm": 2.2036736644010837, "language_loss": 0.70813847, "learning_rate": 1.3704891885227676e-07, "loss": 0.72978449, "num_input_tokens_seen": 159216145, "step": 7360, "time_per_iteration": 2.6084091663360596 }, { "auxiliary_loss_clip": 0.0109329, "auxiliary_loss_mlp": 0.0104498, "balance_loss_clip": 1.0365926, "balance_loss_mlp": 1.02805221, "epoch": 0.8851079179943486, "flos": 21500580251520.0, "grad_norm": 1.8681431703209963, "language_loss": 0.77916074, "learning_rate": 1.367656671978037e-07, "loss": 0.80054343, "num_input_tokens_seen": 159233610, "step": 7361, "time_per_iteration": 2.646267890930176 }, { "auxiliary_loss_clip": 0.01114263, "auxiliary_loss_mlp": 0.01037952, "balance_loss_clip": 1.04279637, "balance_loss_mlp": 1.0242784, "epoch": 0.8852281608849877, "flos": 15300711198720.0, "grad_norm": 1.9768570424652474, "language_loss": 0.73745966, "learning_rate": 1.36482698194498e-07, "loss": 0.75898182, "num_input_tokens_seen": 159250155, "step": 7362, "time_per_iteration": 2.5868828296661377 }, { "auxiliary_loss_clip": 0.01107113, "auxiliary_loss_mlp": 0.01037274, "balance_loss_clip": 1.03916836, "balance_loss_mlp": 1.02076304, "epoch": 0.8853484037756267, "flos": 23295719283840.0, "grad_norm": 1.7573451061661924, "language_loss": 0.71785557, "learning_rate": 1.3620001188528506e-07, "loss": 0.73929942, "num_input_tokens_seen": 159270875, "step": 7363, "time_per_iteration": 3.6036438941955566 }, { "auxiliary_loss_clip": 0.01126385, "auxiliary_loss_mlp": 0.01044065, "balance_loss_clip": 1.0425303, "balance_loss_mlp": 1.0270896, "epoch": 0.8854686466662659, "flos": 25114773795840.0, "grad_norm": 2.195985361553838, "language_loss": 0.73981416, "learning_rate": 1.3591760831304865e-07, "loss": 0.76151872, "num_input_tokens_seen": 159288565, "step": 7364, "time_per_iteration": 3.5481393337249756 }, { "auxiliary_loss_clip": 0.01132925, "auxiliary_loss_mlp": 0.01040784, "balance_loss_clip": 1.04519141, "balance_loss_mlp": 1.02448785, "epoch": 0.885588889556905, "flos": 21390873137280.0, "grad_norm": 1.773478176902408, "language_loss": 0.79354274, "learning_rate": 1.356354875206287e-07, "loss": 0.81527984, "num_input_tokens_seen": 159306400, "step": 7365, "time_per_iteration": 2.5552713871002197 }, { "auxiliary_loss_clip": 0.01096051, "auxiliary_loss_mlp": 0.01037879, "balance_loss_clip": 1.04120934, "balance_loss_mlp": 1.02331185, "epoch": 0.885709132447544, "flos": 26906752431360.0, "grad_norm": 1.9164001302032272, "language_loss": 0.69707215, "learning_rate": 1.3535364955082296e-07, "loss": 0.71841145, "num_input_tokens_seen": 159326250, "step": 7366, "time_per_iteration": 2.6932761669158936 }, { "auxiliary_loss_clip": 0.01129657, "auxiliary_loss_mlp": 0.0103411, "balance_loss_clip": 1.04300976, "balance_loss_mlp": 1.02045441, "epoch": 0.8858293753381832, "flos": 26103394800000.0, "grad_norm": 1.6816932030525649, "language_loss": 0.64518714, "learning_rate": 1.3507209444638613e-07, "loss": 0.66682482, "num_input_tokens_seen": 159348250, "step": 7367, "time_per_iteration": 3.508488893508911 }, { "auxiliary_loss_clip": 0.01120776, "auxiliary_loss_mlp": 0.01033977, "balance_loss_clip": 1.04179943, "balance_loss_mlp": 1.01878953, "epoch": 0.8859496182288222, "flos": 23292810282240.0, "grad_norm": 2.2896085746471257, "language_loss": 0.73937011, "learning_rate": 1.347908222500298e-07, "loss": 0.76091754, "num_input_tokens_seen": 159368325, "step": 7368, "time_per_iteration": 3.481381416320801 }, { "auxiliary_loss_clip": 0.01077412, "auxiliary_loss_mlp": 0.01039755, "balance_loss_clip": 1.03631783, "balance_loss_mlp": 1.02524686, "epoch": 0.8860698611194613, "flos": 16872916469760.0, "grad_norm": 1.8761247149758133, "language_loss": 0.69584537, "learning_rate": 1.3450983300442276e-07, "loss": 0.71701699, "num_input_tokens_seen": 159387555, "step": 7369, "time_per_iteration": 2.617832899093628 }, { "auxiliary_loss_clip": 0.01122377, "auxiliary_loss_mlp": 0.01037465, "balance_loss_clip": 1.04349804, "balance_loss_mlp": 1.02236092, "epoch": 0.8861901040101005, "flos": 24681404206080.0, "grad_norm": 2.388293894364235, "language_loss": 0.73703897, "learning_rate": 1.3422912675219068e-07, "loss": 0.75863743, "num_input_tokens_seen": 159407310, "step": 7370, "time_per_iteration": 2.6811234951019287 }, { "auxiliary_loss_clip": 0.01128945, "auxiliary_loss_mlp": 0.01038585, "balance_loss_clip": 1.04261839, "balance_loss_mlp": 1.02376127, "epoch": 0.8863103469007395, "flos": 24423026699520.0, "grad_norm": 1.7131263865141648, "language_loss": 0.79374802, "learning_rate": 1.339487035359166e-07, "loss": 0.81542337, "num_input_tokens_seen": 159427680, "step": 7371, "time_per_iteration": 2.6225569248199463 }, { "auxiliary_loss_clip": 0.01106944, "auxiliary_loss_mlp": 0.00770916, "balance_loss_clip": 1.04189467, "balance_loss_mlp": 1.00049877, "epoch": 0.8864305897913786, "flos": 22053964158720.0, "grad_norm": 1.522108000417417, "language_loss": 0.85095537, "learning_rate": 1.336685633981409e-07, "loss": 0.86973399, "num_input_tokens_seen": 159448765, "step": 7372, "time_per_iteration": 2.6214191913604736 }, { "auxiliary_loss_clip": 0.01119371, "auxiliary_loss_mlp": 0.01043954, "balance_loss_clip": 1.04191482, "balance_loss_mlp": 1.02842724, "epoch": 0.8865508326820177, "flos": 19099449843840.0, "grad_norm": 1.96308330673137, "language_loss": 0.75194615, "learning_rate": 1.333887063813597e-07, "loss": 0.77357936, "num_input_tokens_seen": 159466870, "step": 7373, "time_per_iteration": 2.6351611614227295 }, { "auxiliary_loss_clip": 0.0111084, "auxiliary_loss_mlp": 0.01036138, "balance_loss_clip": 1.04128408, "balance_loss_mlp": 1.02186275, "epoch": 0.8866710755726568, "flos": 15414189240960.0, "grad_norm": 1.846280005795187, "language_loss": 0.66935062, "learning_rate": 1.331091325280278e-07, "loss": 0.6908204, "num_input_tokens_seen": 159485840, "step": 7374, "time_per_iteration": 2.658923864364624 }, { "auxiliary_loss_clip": 0.01070654, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.03377593, "balance_loss_mlp": 1.01859808, "epoch": 0.8867913184632958, "flos": 20083689388800.0, "grad_norm": 1.6987374647379716, "language_loss": 0.78620422, "learning_rate": 1.3282984188055625e-07, "loss": 0.80726284, "num_input_tokens_seen": 159505630, "step": 7375, "time_per_iteration": 2.6648170948028564 }, { "auxiliary_loss_clip": 0.01132107, "auxiliary_loss_mlp": 0.01040449, "balance_loss_clip": 1.04258418, "balance_loss_mlp": 1.02570307, "epoch": 0.8869115613539349, "flos": 23365852588800.0, "grad_norm": 1.8312336914386196, "language_loss": 0.79581332, "learning_rate": 1.3255083448131288e-07, "loss": 0.81753892, "num_input_tokens_seen": 159524675, "step": 7376, "time_per_iteration": 2.5504703521728516 }, { "auxiliary_loss_clip": 0.01123765, "auxiliary_loss_mlp": 0.01036202, "balance_loss_clip": 1.04215527, "balance_loss_mlp": 1.02038336, "epoch": 0.8870318042445741, "flos": 21286840371840.0, "grad_norm": 2.094191167897575, "language_loss": 0.78832364, "learning_rate": 1.3227211037262365e-07, "loss": 0.80992329, "num_input_tokens_seen": 159541915, "step": 7377, "time_per_iteration": 2.597095489501953 }, { "auxiliary_loss_clip": 0.01083562, "auxiliary_loss_mlp": 0.01041809, "balance_loss_clip": 1.03603053, "balance_loss_mlp": 1.0243566, "epoch": 0.8871520471352131, "flos": 20010862563840.0, "grad_norm": 2.024470256749543, "language_loss": 0.85032618, "learning_rate": 1.319936695967696e-07, "loss": 0.87157989, "num_input_tokens_seen": 159559740, "step": 7378, "time_per_iteration": 2.7176618576049805 }, { "auxiliary_loss_clip": 0.01139701, "auxiliary_loss_mlp": 0.01038101, "balance_loss_clip": 1.04338288, "balance_loss_mlp": 1.02013612, "epoch": 0.8872722900258522, "flos": 22601422321920.0, "grad_norm": 2.1804802674206805, "language_loss": 0.81942511, "learning_rate": 1.3171551219599097e-07, "loss": 0.84120309, "num_input_tokens_seen": 159578265, "step": 7379, "time_per_iteration": 2.564918041229248 }, { "auxiliary_loss_clip": 0.01130601, "auxiliary_loss_mlp": 0.01039391, "balance_loss_clip": 1.04325795, "balance_loss_mlp": 1.0237447, "epoch": 0.8873925329164913, "flos": 22163276223360.0, "grad_norm": 1.9874238697703084, "language_loss": 0.78316337, "learning_rate": 1.3143763821248377e-07, "loss": 0.80486327, "num_input_tokens_seen": 159595350, "step": 7380, "time_per_iteration": 2.552262544631958 }, { "auxiliary_loss_clip": 0.01132699, "auxiliary_loss_mlp": 0.01038487, "balance_loss_clip": 1.04576159, "balance_loss_mlp": 1.02430129, "epoch": 0.8875127758071304, "flos": 19208223204480.0, "grad_norm": 1.7547755989035114, "language_loss": 0.71943641, "learning_rate": 1.3116004768840118e-07, "loss": 0.74114823, "num_input_tokens_seen": 159613725, "step": 7381, "time_per_iteration": 2.5606746673583984 }, { "auxiliary_loss_clip": 0.01130025, "auxiliary_loss_mlp": 0.01036713, "balance_loss_clip": 1.04124415, "balance_loss_mlp": 1.01991701, "epoch": 0.8876330186977694, "flos": 18110900666880.0, "grad_norm": 1.6804735428709234, "language_loss": 0.74152792, "learning_rate": 1.3088274066585348e-07, "loss": 0.76319528, "num_input_tokens_seen": 159631335, "step": 7382, "time_per_iteration": 2.60630202293396 }, { "auxiliary_loss_clip": 0.01097523, "auxiliary_loss_mlp": 0.01036288, "balance_loss_clip": 1.03911543, "balance_loss_mlp": 1.02133918, "epoch": 0.8877532615884086, "flos": 22009434272640.0, "grad_norm": 2.7280663009766077, "language_loss": 0.90333581, "learning_rate": 1.3060571718690749e-07, "loss": 0.92467391, "num_input_tokens_seen": 159648830, "step": 7383, "time_per_iteration": 2.6912636756896973 }, { "auxiliary_loss_clip": 0.0100929, "auxiliary_loss_mlp": 0.0075569, "balance_loss_clip": 1.00831866, "balance_loss_mlp": 1.00021446, "epoch": 0.8878735044790477, "flos": 72136924346880.0, "grad_norm": 0.7471066215444299, "language_loss": 0.56894654, "learning_rate": 1.3032897729358805e-07, "loss": 0.58659637, "num_input_tokens_seen": 159709785, "step": 7384, "time_per_iteration": 3.2681572437286377 }, { "auxiliary_loss_clip": 0.01061104, "auxiliary_loss_mlp": 0.00774213, "balance_loss_clip": 1.03463805, "balance_loss_mlp": 1.00050688, "epoch": 0.8879937473696867, "flos": 27526355061120.0, "grad_norm": 1.8344991962544324, "language_loss": 0.79929656, "learning_rate": 1.3005252102787645e-07, "loss": 0.81764972, "num_input_tokens_seen": 159728725, "step": 7385, "time_per_iteration": 2.855015516281128 }, { "auxiliary_loss_clip": 0.01122429, "auxiliary_loss_mlp": 0.01039215, "balance_loss_clip": 1.04349065, "balance_loss_mlp": 1.02492213, "epoch": 0.8881139902603259, "flos": 22234091886720.0, "grad_norm": 1.6701916175968068, "language_loss": 0.73579073, "learning_rate": 1.297763484317105e-07, "loss": 0.75740719, "num_input_tokens_seen": 159747020, "step": 7386, "time_per_iteration": 2.6420700550079346 }, { "auxiliary_loss_clip": 0.01080254, "auxiliary_loss_mlp": 0.00775272, "balance_loss_clip": 1.0360831, "balance_loss_mlp": 1.00054431, "epoch": 0.888234233150965, "flos": 20299548170880.0, "grad_norm": 3.1814643195472545, "language_loss": 0.7093541, "learning_rate": 1.2950045954698551e-07, "loss": 0.72790939, "num_input_tokens_seen": 159764855, "step": 7387, "time_per_iteration": 2.648761510848999 }, { "auxiliary_loss_clip": 0.01084173, "auxiliary_loss_mlp": 0.01033458, "balance_loss_clip": 1.03737617, "balance_loss_mlp": 1.0179553, "epoch": 0.888354476041604, "flos": 18147996437760.0, "grad_norm": 1.7395039347100927, "language_loss": 0.75141621, "learning_rate": 1.2922485441555343e-07, "loss": 0.77259254, "num_input_tokens_seen": 159783935, "step": 7388, "time_per_iteration": 3.6181931495666504 }, { "auxiliary_loss_clip": 0.01130419, "auxiliary_loss_mlp": 0.0103617, "balance_loss_clip": 1.04086733, "balance_loss_mlp": 1.02068448, "epoch": 0.8884747189322432, "flos": 22014282608640.0, "grad_norm": 1.7627841778331788, "language_loss": 0.81755251, "learning_rate": 1.2894953307922363e-07, "loss": 0.83921838, "num_input_tokens_seen": 159802895, "step": 7389, "time_per_iteration": 2.561823844909668 }, { "auxiliary_loss_clip": 0.01092644, "auxiliary_loss_mlp": 0.01052772, "balance_loss_clip": 1.03991294, "balance_loss_mlp": 1.03479481, "epoch": 0.8885949618228822, "flos": 19786779567360.0, "grad_norm": 1.9432147043737664, "language_loss": 0.84182334, "learning_rate": 1.2867449557976208e-07, "loss": 0.86327749, "num_input_tokens_seen": 159820995, "step": 7390, "time_per_iteration": 3.5941567420959473 }, { "auxiliary_loss_clip": 0.01118343, "auxiliary_loss_mlp": 0.01041004, "balance_loss_clip": 1.04131293, "balance_loss_mlp": 1.02523851, "epoch": 0.8887152047135213, "flos": 20047599198720.0, "grad_norm": 3.0535290228395238, "language_loss": 0.75654328, "learning_rate": 1.283997419588916e-07, "loss": 0.77813673, "num_input_tokens_seen": 159840465, "step": 7391, "time_per_iteration": 2.6669201850891113 }, { "auxiliary_loss_clip": 0.0112529, "auxiliary_loss_mlp": 0.01039612, "balance_loss_clip": 1.04329133, "balance_loss_mlp": 1.02300668, "epoch": 0.8888354476041604, "flos": 18588117784320.0, "grad_norm": 2.1154668707621496, "language_loss": 0.6168071, "learning_rate": 1.2812527225829216e-07, "loss": 0.63845617, "num_input_tokens_seen": 159858690, "step": 7392, "time_per_iteration": 2.578238010406494 }, { "auxiliary_loss_clip": 0.01126776, "auxiliary_loss_mlp": 0.01039008, "balance_loss_clip": 1.04389739, "balance_loss_mlp": 1.02228332, "epoch": 0.8889556904947995, "flos": 21689794120320.0, "grad_norm": 1.9184643708313158, "language_loss": 0.76392484, "learning_rate": 1.2785108651960052e-07, "loss": 0.78558266, "num_input_tokens_seen": 159880325, "step": 7393, "time_per_iteration": 3.613314628601074 }, { "auxiliary_loss_clip": 0.01122829, "auxiliary_loss_mlp": 0.01041378, "balance_loss_clip": 1.04237127, "balance_loss_mlp": 1.02558279, "epoch": 0.8890759333854386, "flos": 27381204201600.0, "grad_norm": 1.8222329338845318, "language_loss": 0.80344516, "learning_rate": 1.2757718478441094e-07, "loss": 0.82508719, "num_input_tokens_seen": 159901070, "step": 7394, "time_per_iteration": 3.584043502807617 }, { "auxiliary_loss_clip": 0.01105051, "auxiliary_loss_mlp": 0.01035111, "balance_loss_clip": 1.0400002, "balance_loss_mlp": 1.01996529, "epoch": 0.8891961762760777, "flos": 24498834353280.0, "grad_norm": 2.5341791892084635, "language_loss": 0.77394426, "learning_rate": 1.2730356709427302e-07, "loss": 0.79534584, "num_input_tokens_seen": 159919750, "step": 7395, "time_per_iteration": 2.6309659481048584 }, { "auxiliary_loss_clip": 0.01115473, "auxiliary_loss_mlp": 0.01041006, "balance_loss_clip": 1.04215646, "balance_loss_mlp": 1.02497292, "epoch": 0.8893164191667168, "flos": 41499770895360.0, "grad_norm": 1.5794590530397659, "language_loss": 0.59835458, "learning_rate": 1.2703023349069542e-07, "loss": 0.61991936, "num_input_tokens_seen": 159944600, "step": 7396, "time_per_iteration": 2.8026349544525146 }, { "auxiliary_loss_clip": 0.01113006, "auxiliary_loss_mlp": 0.01038867, "balance_loss_clip": 1.03970373, "balance_loss_mlp": 1.02354908, "epoch": 0.8894366620573558, "flos": 33583623120000.0, "grad_norm": 2.960477463756351, "language_loss": 0.61730832, "learning_rate": 1.2675718401514223e-07, "loss": 0.63882703, "num_input_tokens_seen": 159968780, "step": 7397, "time_per_iteration": 2.667705535888672 }, { "auxiliary_loss_clip": 0.01105275, "auxiliary_loss_mlp": 0.01051369, "balance_loss_clip": 1.0394417, "balance_loss_mlp": 1.03603864, "epoch": 0.889556904947995, "flos": 16909832672640.0, "grad_norm": 2.303400758115368, "language_loss": 0.74771631, "learning_rate": 1.264844187090346e-07, "loss": 0.7692827, "num_input_tokens_seen": 159985905, "step": 7398, "time_per_iteration": 2.634392261505127 }, { "auxiliary_loss_clip": 0.01102824, "auxiliary_loss_mlp": 0.01032708, "balance_loss_clip": 1.04036963, "balance_loss_mlp": 1.01785421, "epoch": 0.889677147838634, "flos": 26030855283840.0, "grad_norm": 1.9570358571765443, "language_loss": 0.75045532, "learning_rate": 1.262119376137516e-07, "loss": 0.77181065, "num_input_tokens_seen": 160006965, "step": 7399, "time_per_iteration": 2.649169445037842 }, { "auxiliary_loss_clip": 0.01112274, "auxiliary_loss_mlp": 0.01040386, "balance_loss_clip": 1.04116297, "balance_loss_mlp": 1.02571154, "epoch": 0.8897973907292731, "flos": 26468283110400.0, "grad_norm": 1.540200169774773, "language_loss": 0.8512395, "learning_rate": 1.2593974077062707e-07, "loss": 0.87276614, "num_input_tokens_seen": 160028585, "step": 7400, "time_per_iteration": 2.6282472610473633 }, { "auxiliary_loss_clip": 0.0108603, "auxiliary_loss_mlp": 0.01044333, "balance_loss_clip": 1.03739095, "balance_loss_mlp": 1.02751219, "epoch": 0.8899176336199123, "flos": 26249694894720.0, "grad_norm": 1.9843273429471182, "language_loss": 0.63713008, "learning_rate": 1.2566782822095423e-07, "loss": 0.65843368, "num_input_tokens_seen": 160048840, "step": 7401, "time_per_iteration": 2.6899964809417725 }, { "auxiliary_loss_clip": 0.01099735, "auxiliary_loss_mlp": 0.0103785, "balance_loss_clip": 1.04103887, "balance_loss_mlp": 1.02185249, "epoch": 0.8900378765105513, "flos": 20811742156800.0, "grad_norm": 7.573481125242811, "language_loss": 0.71413982, "learning_rate": 1.2539620000598162e-07, "loss": 0.73551571, "num_input_tokens_seen": 160068175, "step": 7402, "time_per_iteration": 2.7222633361816406 }, { "auxiliary_loss_clip": 0.01132314, "auxiliary_loss_mlp": 0.01042362, "balance_loss_clip": 1.04278803, "balance_loss_mlp": 1.0275383, "epoch": 0.8901581194011904, "flos": 16472333018880.0, "grad_norm": 1.8025404574861943, "language_loss": 0.79782367, "learning_rate": 1.2512485616691492e-07, "loss": 0.81957042, "num_input_tokens_seen": 160085230, "step": 7403, "time_per_iteration": 2.648036003112793 }, { "auxiliary_loss_clip": 0.01098379, "auxiliary_loss_mlp": 0.01046561, "balance_loss_clip": 1.03926706, "balance_loss_mlp": 1.02974105, "epoch": 0.8902783622918296, "flos": 35155253773440.0, "grad_norm": 1.5018162523856395, "language_loss": 0.81042165, "learning_rate": 1.2485379674491681e-07, "loss": 0.83187109, "num_input_tokens_seen": 160111425, "step": 7404, "time_per_iteration": 2.779477119445801 }, { "auxiliary_loss_clip": 0.01109035, "auxiliary_loss_mlp": 0.01043548, "balance_loss_clip": 1.04081023, "balance_loss_mlp": 1.02845621, "epoch": 0.8903986051824686, "flos": 17201068145280.0, "grad_norm": 2.1256761483009146, "language_loss": 0.79454863, "learning_rate": 1.2458302178110657e-07, "loss": 0.81607443, "num_input_tokens_seen": 160129790, "step": 7405, "time_per_iteration": 2.615752696990967 }, { "auxiliary_loss_clip": 0.0108322, "auxiliary_loss_mlp": 0.0103456, "balance_loss_clip": 1.03551579, "balance_loss_mlp": 1.01897895, "epoch": 0.8905188480731077, "flos": 25483863997440.0, "grad_norm": 1.9675203056333939, "language_loss": 0.82658195, "learning_rate": 1.2431253131656118e-07, "loss": 0.84775972, "num_input_tokens_seen": 160149265, "step": 7406, "time_per_iteration": 2.6709837913513184 }, { "auxiliary_loss_clip": 0.01099015, "auxiliary_loss_mlp": 0.01037413, "balance_loss_clip": 1.03794599, "balance_loss_mlp": 1.0214864, "epoch": 0.8906390909637467, "flos": 23365888502400.0, "grad_norm": 1.8064249644601929, "language_loss": 0.76716113, "learning_rate": 1.240423253923133e-07, "loss": 0.7885254, "num_input_tokens_seen": 160168870, "step": 7407, "time_per_iteration": 2.6093826293945312 }, { "auxiliary_loss_clip": 0.0112233, "auxiliary_loss_mlp": 0.01039668, "balance_loss_clip": 1.04235351, "balance_loss_mlp": 1.02275205, "epoch": 0.8907593338543859, "flos": 21068790860160.0, "grad_norm": 2.5035639704529116, "language_loss": 0.69684702, "learning_rate": 1.237724040493533e-07, "loss": 0.718467, "num_input_tokens_seen": 160187495, "step": 7408, "time_per_iteration": 2.576629638671875 }, { "auxiliary_loss_clip": 0.0113766, "auxiliary_loss_mlp": 0.01041635, "balance_loss_clip": 1.04390848, "balance_loss_mlp": 1.02535069, "epoch": 0.8908795767450249, "flos": 21869562712320.0, "grad_norm": 2.229516936645328, "language_loss": 0.73013127, "learning_rate": 1.2350276732862773e-07, "loss": 0.75192416, "num_input_tokens_seen": 160208520, "step": 7409, "time_per_iteration": 2.593661308288574 }, { "auxiliary_loss_clip": 0.01026614, "auxiliary_loss_mlp": 0.0100356, "balance_loss_clip": 1.00759387, "balance_loss_mlp": 1.00199819, "epoch": 0.890999819635664, "flos": 66307869348480.0, "grad_norm": 0.8319412835978098, "language_loss": 0.56636834, "learning_rate": 1.2323341527103993e-07, "loss": 0.58667004, "num_input_tokens_seen": 160263720, "step": 7410, "time_per_iteration": 3.078294277191162 }, { "auxiliary_loss_clip": 0.01130651, "auxiliary_loss_mlp": 0.01034606, "balance_loss_clip": 1.04198456, "balance_loss_mlp": 1.01953745, "epoch": 0.8911200625263032, "flos": 26869908055680.0, "grad_norm": 2.319144126904598, "language_loss": 0.8523156, "learning_rate": 1.2296434791745135e-07, "loss": 0.87396818, "num_input_tokens_seen": 160282170, "step": 7411, "time_per_iteration": 2.5576791763305664 }, { "auxiliary_loss_clip": 0.0112349, "auxiliary_loss_mlp": 0.01037107, "balance_loss_clip": 1.04275036, "balance_loss_mlp": 1.0215143, "epoch": 0.8912403054169422, "flos": 20885825957760.0, "grad_norm": 1.6122073381868673, "language_loss": 0.7682929, "learning_rate": 1.2269556530867875e-07, "loss": 0.78989887, "num_input_tokens_seen": 160300725, "step": 7412, "time_per_iteration": 2.5706558227539062 }, { "auxiliary_loss_clip": 0.01139093, "auxiliary_loss_mlp": 0.01043903, "balance_loss_clip": 1.04478407, "balance_loss_mlp": 1.02628374, "epoch": 0.8913605483075813, "flos": 27016567286400.0, "grad_norm": 1.9072255978422306, "language_loss": 0.82205296, "learning_rate": 1.2242706748549614e-07, "loss": 0.84388292, "num_input_tokens_seen": 160318720, "step": 7413, "time_per_iteration": 2.576345443725586 }, { "auxiliary_loss_clip": 0.01106041, "auxiliary_loss_mlp": 0.01041031, "balance_loss_clip": 1.03618813, "balance_loss_mlp": 1.02492595, "epoch": 0.8914807911982204, "flos": 23621500661760.0, "grad_norm": 2.1920005700038683, "language_loss": 0.82386911, "learning_rate": 1.2215885448863473e-07, "loss": 0.84533989, "num_input_tokens_seen": 160339595, "step": 7414, "time_per_iteration": 3.5472934246063232 }, { "auxiliary_loss_clip": 0.01109163, "auxiliary_loss_mlp": 0.01039619, "balance_loss_clip": 1.04156303, "balance_loss_mlp": 1.0252068, "epoch": 0.8916010340888595, "flos": 24462277286400.0, "grad_norm": 2.5736148717645895, "language_loss": 0.80331337, "learning_rate": 1.2189092635878152e-07, "loss": 0.82480121, "num_input_tokens_seen": 160361045, "step": 7415, "time_per_iteration": 2.63930344581604 }, { "auxiliary_loss_clip": 0.01086333, "auxiliary_loss_mlp": 0.0104259, "balance_loss_clip": 1.03764963, "balance_loss_mlp": 1.02631807, "epoch": 0.8917212769794985, "flos": 21215773313280.0, "grad_norm": 1.7167632546824572, "language_loss": 0.77536905, "learning_rate": 1.216232831365822e-07, "loss": 0.79665828, "num_input_tokens_seen": 160379990, "step": 7416, "time_per_iteration": 3.517529010772705 }, { "auxiliary_loss_clip": 0.01113579, "auxiliary_loss_mlp": 0.01042254, "balance_loss_clip": 1.04365039, "balance_loss_mlp": 1.02682853, "epoch": 0.8918415198701377, "flos": 25513992529920.0, "grad_norm": 1.9275669749845883, "language_loss": 0.80836868, "learning_rate": 1.2135592486263678e-07, "loss": 0.82992709, "num_input_tokens_seen": 160399240, "step": 7417, "time_per_iteration": 2.6647849082946777 }, { "auxiliary_loss_clip": 0.01104556, "auxiliary_loss_mlp": 0.01033589, "balance_loss_clip": 1.03871703, "balance_loss_mlp": 1.0186044, "epoch": 0.8919617627607768, "flos": 37853006693760.0, "grad_norm": 1.7125354649754316, "language_loss": 0.61081564, "learning_rate": 1.2108885157750415e-07, "loss": 0.63219714, "num_input_tokens_seen": 160421600, "step": 7418, "time_per_iteration": 2.7385401725769043 }, { "auxiliary_loss_clip": 0.01092486, "auxiliary_loss_mlp": 0.00772607, "balance_loss_clip": 1.04070997, "balance_loss_mlp": 1.00054002, "epoch": 0.8920820056514158, "flos": 26213676531840.0, "grad_norm": 1.9762525628317305, "language_loss": 0.80555964, "learning_rate": 1.2082206332169897e-07, "loss": 0.82421052, "num_input_tokens_seen": 160441695, "step": 7419, "time_per_iteration": 3.593848466873169 }, { "auxiliary_loss_clip": 0.01100369, "auxiliary_loss_mlp": 0.01036921, "balance_loss_clip": 1.0382179, "balance_loss_mlp": 1.02200246, "epoch": 0.892202248542055, "flos": 17383135207680.0, "grad_norm": 2.2502734373205024, "language_loss": 0.73018098, "learning_rate": 1.2055556013569225e-07, "loss": 0.75155389, "num_input_tokens_seen": 160457205, "step": 7420, "time_per_iteration": 3.5172836780548096 }, { "auxiliary_loss_clip": 0.01106911, "auxiliary_loss_mlp": 0.01036506, "balance_loss_clip": 1.03969574, "balance_loss_mlp": 1.02204013, "epoch": 0.892322491432694, "flos": 21324223451520.0, "grad_norm": 1.6416534843437398, "language_loss": 0.82103384, "learning_rate": 1.2028934205991315e-07, "loss": 0.8424679, "num_input_tokens_seen": 160476525, "step": 7421, "time_per_iteration": 2.715169906616211 }, { "auxiliary_loss_clip": 0.01118439, "auxiliary_loss_mlp": 0.01040637, "balance_loss_clip": 1.03990626, "balance_loss_mlp": 1.02567029, "epoch": 0.8924427343233331, "flos": 24029374573440.0, "grad_norm": 1.4523568292551077, "language_loss": 0.7675072, "learning_rate": 1.2002340913474607e-07, "loss": 0.78909791, "num_input_tokens_seen": 160500160, "step": 7422, "time_per_iteration": 2.633115530014038 }, { "auxiliary_loss_clip": 0.01133521, "auxiliary_loss_mlp": 0.01041465, "balance_loss_clip": 1.04179978, "balance_loss_mlp": 1.02553821, "epoch": 0.8925629772139723, "flos": 30008069631360.0, "grad_norm": 3.8739541048027113, "language_loss": 0.73945129, "learning_rate": 1.1975776140053317e-07, "loss": 0.76120114, "num_input_tokens_seen": 160520130, "step": 7423, "time_per_iteration": 2.640716314315796 }, { "auxiliary_loss_clip": 0.0108667, "auxiliary_loss_mlp": 0.01039713, "balance_loss_clip": 1.03875947, "balance_loss_mlp": 1.02358413, "epoch": 0.8926832201046113, "flos": 22601709630720.0, "grad_norm": 2.1335829683696184, "language_loss": 0.73665398, "learning_rate": 1.194923988975729e-07, "loss": 0.75791782, "num_input_tokens_seen": 160539730, "step": 7424, "time_per_iteration": 2.667790174484253 }, { "auxiliary_loss_clip": 0.01092841, "auxiliary_loss_mlp": 0.0103181, "balance_loss_clip": 1.03778613, "balance_loss_mlp": 1.01628923, "epoch": 0.8928034629952504, "flos": 13297722117120.0, "grad_norm": 2.4081410263334297, "language_loss": 0.73216712, "learning_rate": 1.192273216661206e-07, "loss": 0.75341368, "num_input_tokens_seen": 160557820, "step": 7425, "time_per_iteration": 2.6410839557647705 }, { "auxiliary_loss_clip": 0.00990801, "auxiliary_loss_mlp": 0.01000906, "balance_loss_clip": 1.00878465, "balance_loss_mlp": 0.99945796, "epoch": 0.8929237058858895, "flos": 54854556744960.0, "grad_norm": 0.762089765282259, "language_loss": 0.57523954, "learning_rate": 1.189625297463881e-07, "loss": 0.59515667, "num_input_tokens_seen": 160619510, "step": 7426, "time_per_iteration": 3.2035017013549805 }, { "auxiliary_loss_clip": 0.01061582, "auxiliary_loss_mlp": 0.01036644, "balance_loss_clip": 1.03257322, "balance_loss_mlp": 1.02126646, "epoch": 0.8930439487765286, "flos": 28883850785280.0, "grad_norm": 1.5489961875184912, "language_loss": 0.796538, "learning_rate": 1.1869802317854394e-07, "loss": 0.81752032, "num_input_tokens_seen": 160643295, "step": 7427, "time_per_iteration": 2.822542190551758 }, { "auxiliary_loss_clip": 0.01088031, "auxiliary_loss_mlp": 0.01037841, "balance_loss_clip": 1.03881681, "balance_loss_mlp": 1.02233231, "epoch": 0.8931641916671677, "flos": 22419283432320.0, "grad_norm": 2.033004131036545, "language_loss": 0.72259724, "learning_rate": 1.1843380200271425e-07, "loss": 0.74385595, "num_input_tokens_seen": 160662495, "step": 7428, "time_per_iteration": 2.685197591781616 }, { "auxiliary_loss_clip": 0.01089856, "auxiliary_loss_mlp": 0.01037804, "balance_loss_clip": 1.03845429, "balance_loss_mlp": 1.02140069, "epoch": 0.8932844345578068, "flos": 25843149786240.0, "grad_norm": 1.7692170372796872, "language_loss": 0.80498302, "learning_rate": 1.181698662589805e-07, "loss": 0.82625961, "num_input_tokens_seen": 160682080, "step": 7429, "time_per_iteration": 2.643235921859741 }, { "auxiliary_loss_clip": 0.01119256, "auxiliary_loss_mlp": 0.01033591, "balance_loss_clip": 1.04213452, "balance_loss_mlp": 1.01781964, "epoch": 0.8934046774484459, "flos": 22925803069440.0, "grad_norm": 1.7754927444858786, "language_loss": 0.76469159, "learning_rate": 1.1790621598738249e-07, "loss": 0.78622007, "num_input_tokens_seen": 160700395, "step": 7430, "time_per_iteration": 2.61458158493042 }, { "auxiliary_loss_clip": 0.01129506, "auxiliary_loss_mlp": 0.01030711, "balance_loss_clip": 1.04429257, "balance_loss_mlp": 1.01705003, "epoch": 0.8935249203390849, "flos": 24462097718400.0, "grad_norm": 2.1068016414967405, "language_loss": 0.75172287, "learning_rate": 1.1764285122791461e-07, "loss": 0.77332503, "num_input_tokens_seen": 160721115, "step": 7431, "time_per_iteration": 2.6005334854125977 }, { "auxiliary_loss_clip": 0.01117042, "auxiliary_loss_mlp": 0.01039736, "balance_loss_clip": 1.03890872, "balance_loss_mlp": 1.02396417, "epoch": 0.8936451632297241, "flos": 15742735966080.0, "grad_norm": 1.8738739043702917, "language_loss": 0.76875043, "learning_rate": 1.173797720205294e-07, "loss": 0.79031819, "num_input_tokens_seen": 160739150, "step": 7432, "time_per_iteration": 2.583085775375366 }, { "auxiliary_loss_clip": 0.0112442, "auxiliary_loss_mlp": 0.01046841, "balance_loss_clip": 1.04337716, "balance_loss_mlp": 1.02975798, "epoch": 0.8937654061203631, "flos": 35115500396160.0, "grad_norm": 4.099395579943122, "language_loss": 0.72011733, "learning_rate": 1.1711697840513602e-07, "loss": 0.74182993, "num_input_tokens_seen": 160758585, "step": 7433, "time_per_iteration": 2.715348243713379 }, { "auxiliary_loss_clip": 0.0111149, "auxiliary_loss_mlp": 0.01032745, "balance_loss_clip": 1.03886294, "balance_loss_mlp": 1.01813018, "epoch": 0.8938856490110022, "flos": 16107444708480.0, "grad_norm": 2.0445231186484167, "language_loss": 0.70966029, "learning_rate": 1.1685447042160012e-07, "loss": 0.73110259, "num_input_tokens_seen": 160776620, "step": 7434, "time_per_iteration": 2.534189224243164 }, { "auxiliary_loss_clip": 0.01133542, "auxiliary_loss_mlp": 0.01035636, "balance_loss_clip": 1.04307008, "balance_loss_mlp": 1.02057993, "epoch": 0.8940058919016414, "flos": 20704189858560.0, "grad_norm": 1.7086277034830295, "language_loss": 0.71673131, "learning_rate": 1.1659224810974367e-07, "loss": 0.73842311, "num_input_tokens_seen": 160796580, "step": 7435, "time_per_iteration": 2.544344663619995 }, { "auxiliary_loss_clip": 0.0110621, "auxiliary_loss_mlp": 0.01033588, "balance_loss_clip": 1.04171443, "balance_loss_mlp": 1.01884758, "epoch": 0.8941261347922804, "flos": 25229041937280.0, "grad_norm": 1.4754226955369618, "language_loss": 0.68775207, "learning_rate": 1.1633031150934591e-07, "loss": 0.70915008, "num_input_tokens_seen": 160819610, "step": 7436, "time_per_iteration": 2.6973302364349365 }, { "auxiliary_loss_clip": 0.01123471, "auxiliary_loss_mlp": 0.01041333, "balance_loss_clip": 1.04390371, "balance_loss_mlp": 1.0254426, "epoch": 0.8942463776829195, "flos": 19537236806400.0, "grad_norm": 2.8402228567530696, "language_loss": 0.8050698, "learning_rate": 1.1606866066014176e-07, "loss": 0.82671779, "num_input_tokens_seen": 160838660, "step": 7437, "time_per_iteration": 2.563600778579712 }, { "auxiliary_loss_clip": 0.01095382, "auxiliary_loss_mlp": 0.01035436, "balance_loss_clip": 1.03960073, "balance_loss_mlp": 1.01940215, "epoch": 0.8943666205735585, "flos": 22301567585280.0, "grad_norm": 2.2996394082021347, "language_loss": 0.75687754, "learning_rate": 1.1580729560182434e-07, "loss": 0.77818567, "num_input_tokens_seen": 160854515, "step": 7438, "time_per_iteration": 2.6273810863494873 }, { "auxiliary_loss_clip": 0.01129436, "auxiliary_loss_mlp": 0.00771887, "balance_loss_clip": 1.04174018, "balance_loss_mlp": 1.00054073, "epoch": 0.8944868634641977, "flos": 18912893581440.0, "grad_norm": 1.6744626039293578, "language_loss": 0.70922804, "learning_rate": 1.1554621637404171e-07, "loss": 0.72824126, "num_input_tokens_seen": 160872605, "step": 7439, "time_per_iteration": 3.42702579498291 }, { "auxiliary_loss_clip": 0.01120031, "auxiliary_loss_mlp": 0.01039101, "balance_loss_clip": 1.04152107, "balance_loss_mlp": 1.02458131, "epoch": 0.8946071063548368, "flos": 14460904241280.0, "grad_norm": 2.3162938150949666, "language_loss": 0.60878301, "learning_rate": 1.1528542301639999e-07, "loss": 0.63037431, "num_input_tokens_seen": 160889395, "step": 7440, "time_per_iteration": 2.524797201156616 }, { "auxiliary_loss_clip": 0.01099555, "auxiliary_loss_mlp": 0.0104003, "balance_loss_clip": 1.03789747, "balance_loss_mlp": 1.0233283, "epoch": 0.8947273492454758, "flos": 20084084438400.0, "grad_norm": 2.6892141070257067, "language_loss": 0.82527578, "learning_rate": 1.1502491556846105e-07, "loss": 0.84667158, "num_input_tokens_seen": 160907890, "step": 7441, "time_per_iteration": 2.6638636589050293 }, { "auxiliary_loss_clip": 0.01106743, "auxiliary_loss_mlp": 0.01038843, "balance_loss_clip": 1.04023409, "balance_loss_mlp": 1.0237633, "epoch": 0.894847592136115, "flos": 18550555136640.0, "grad_norm": 2.467745822364877, "language_loss": 0.81600982, "learning_rate": 1.1476469406974331e-07, "loss": 0.83746564, "num_input_tokens_seen": 160923490, "step": 7442, "time_per_iteration": 3.4819626808166504 }, { "auxiliary_loss_clip": 0.0113105, "auxiliary_loss_mlp": 0.01032156, "balance_loss_clip": 1.04312611, "balance_loss_mlp": 1.01770222, "epoch": 0.894967835026754, "flos": 23478468704640.0, "grad_norm": 1.7097085781668233, "language_loss": 0.76877809, "learning_rate": 1.1450475855972341e-07, "loss": 0.79041016, "num_input_tokens_seen": 160944280, "step": 7443, "time_per_iteration": 2.5758817195892334 }, { "auxiliary_loss_clip": 0.01106283, "auxiliary_loss_mlp": 0.00772982, "balance_loss_clip": 1.0395689, "balance_loss_mlp": 1.00050306, "epoch": 0.8950880779173931, "flos": 15188310564480.0, "grad_norm": 2.1293313038381507, "language_loss": 0.70929253, "learning_rate": 1.1424510907783158e-07, "loss": 0.72808516, "num_input_tokens_seen": 160961560, "step": 7444, "time_per_iteration": 2.5945000648498535 }, { "auxiliary_loss_clip": 0.01109589, "auxiliary_loss_mlp": 0.01038538, "balance_loss_clip": 1.03891432, "balance_loss_mlp": 1.02305293, "epoch": 0.8952083208080323, "flos": 22091957769600.0, "grad_norm": 1.6061638952504143, "language_loss": 0.82957971, "learning_rate": 1.1398574566345787e-07, "loss": 0.85106099, "num_input_tokens_seen": 160982195, "step": 7445, "time_per_iteration": 3.6192986965179443 }, { "auxiliary_loss_clip": 0.01113493, "auxiliary_loss_mlp": 0.01034063, "balance_loss_clip": 1.04061174, "balance_loss_mlp": 1.01801693, "epoch": 0.8953285636986713, "flos": 23254026572160.0, "grad_norm": 2.177452476732466, "language_loss": 0.82495928, "learning_rate": 1.1372666835594702e-07, "loss": 0.84643483, "num_input_tokens_seen": 161000520, "step": 7446, "time_per_iteration": 3.518263578414917 }, { "auxiliary_loss_clip": 0.01106769, "auxiliary_loss_mlp": 0.01033292, "balance_loss_clip": 1.04107773, "balance_loss_mlp": 1.01850998, "epoch": 0.8954488065893104, "flos": 16362661818240.0, "grad_norm": 2.017604440411937, "language_loss": 0.71892071, "learning_rate": 1.1346787719460071e-07, "loss": 0.74032128, "num_input_tokens_seen": 161019405, "step": 7447, "time_per_iteration": 2.581836462020874 }, { "auxiliary_loss_clip": 0.01107901, "auxiliary_loss_mlp": 0.01032271, "balance_loss_clip": 1.04085755, "balance_loss_mlp": 1.01702476, "epoch": 0.8955690494799495, "flos": 18257883120000.0, "grad_norm": 2.095534418364601, "language_loss": 0.72484225, "learning_rate": 1.1320937221867732e-07, "loss": 0.74624401, "num_input_tokens_seen": 161036985, "step": 7448, "time_per_iteration": 2.6225860118865967 }, { "auxiliary_loss_clip": 0.0110559, "auxiliary_loss_mlp": 0.01039385, "balance_loss_clip": 1.04097927, "balance_loss_mlp": 1.0244894, "epoch": 0.8956892923705886, "flos": 25447486498560.0, "grad_norm": 1.9118009094127046, "language_loss": 0.79750043, "learning_rate": 1.1295115346739192e-07, "loss": 0.81895018, "num_input_tokens_seen": 161056985, "step": 7449, "time_per_iteration": 2.6303555965423584 }, { "auxiliary_loss_clip": 0.01111942, "auxiliary_loss_mlp": 0.01032789, "balance_loss_clip": 1.04234052, "balance_loss_mlp": 1.01702976, "epoch": 0.8958095352612276, "flos": 52661883939840.0, "grad_norm": 1.9792457064463171, "language_loss": 0.72567403, "learning_rate": 1.1269322097991629e-07, "loss": 0.74712133, "num_input_tokens_seen": 161080270, "step": 7450, "time_per_iteration": 2.880657434463501 }, { "auxiliary_loss_clip": 0.01125272, "auxiliary_loss_mlp": 0.01047337, "balance_loss_clip": 1.04237616, "balance_loss_mlp": 1.03173804, "epoch": 0.8959297781518668, "flos": 23186335392000.0, "grad_norm": 2.2364671190674716, "language_loss": 0.67740226, "learning_rate": 1.1243557479537846e-07, "loss": 0.69912827, "num_input_tokens_seen": 161100160, "step": 7451, "time_per_iteration": 2.5865073204040527 }, { "auxiliary_loss_clip": 0.01133902, "auxiliary_loss_mlp": 0.01040183, "balance_loss_clip": 1.0430994, "balance_loss_mlp": 1.02372015, "epoch": 0.8960500210425059, "flos": 20334309557760.0, "grad_norm": 2.232232135123505, "language_loss": 0.68841803, "learning_rate": 1.121782149528634e-07, "loss": 0.71015888, "num_input_tokens_seen": 161117260, "step": 7452, "time_per_iteration": 2.559363842010498 }, { "auxiliary_loss_clip": 0.01109544, "auxiliary_loss_mlp": 0.01040017, "balance_loss_clip": 1.04264808, "balance_loss_mlp": 1.02492476, "epoch": 0.8961702639331449, "flos": 19901694153600.0, "grad_norm": 2.009994213446035, "language_loss": 0.78668773, "learning_rate": 1.1192114149141208e-07, "loss": 0.80818331, "num_input_tokens_seen": 161136895, "step": 7453, "time_per_iteration": 2.588094472885132 }, { "auxiliary_loss_clip": 0.01112555, "auxiliary_loss_mlp": 0.01044985, "balance_loss_clip": 1.04096818, "balance_loss_mlp": 1.02785444, "epoch": 0.8962905068237841, "flos": 12896348567040.0, "grad_norm": 2.7654633741511003, "language_loss": 0.65086657, "learning_rate": 1.1166435445002197e-07, "loss": 0.67244196, "num_input_tokens_seen": 161154565, "step": 7454, "time_per_iteration": 2.581671714782715 }, { "auxiliary_loss_clip": 0.0112375, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.04446125, "balance_loss_mlp": 1.01917052, "epoch": 0.8964107497144231, "flos": 23440331439360.0, "grad_norm": 2.2958580332588983, "language_loss": 0.68728048, "learning_rate": 1.1140785386764818e-07, "loss": 0.70886248, "num_input_tokens_seen": 161173265, "step": 7455, "time_per_iteration": 2.5955355167388916 }, { "auxiliary_loss_clip": 0.01115479, "auxiliary_loss_mlp": 0.01046098, "balance_loss_clip": 1.04110575, "balance_loss_mlp": 1.03076732, "epoch": 0.8965309926050622, "flos": 19500176949120.0, "grad_norm": 2.5743361271440732, "language_loss": 0.69483811, "learning_rate": 1.1115163978320153e-07, "loss": 0.71645391, "num_input_tokens_seen": 161191995, "step": 7456, "time_per_iteration": 2.564971685409546 }, { "auxiliary_loss_clip": 0.01124188, "auxiliary_loss_mlp": 0.00772459, "balance_loss_clip": 1.04269838, "balance_loss_mlp": 1.00046492, "epoch": 0.8966512354957014, "flos": 28658008022400.0, "grad_norm": 1.7994907368149933, "language_loss": 0.82705998, "learning_rate": 1.1089571223554917e-07, "loss": 0.84602642, "num_input_tokens_seen": 161212880, "step": 7457, "time_per_iteration": 2.6331920623779297 }, { "auxiliary_loss_clip": 0.01120413, "auxiliary_loss_mlp": 0.01038477, "balance_loss_clip": 1.03952599, "balance_loss_mlp": 1.02222872, "epoch": 0.8967714783863404, "flos": 23370916406400.0, "grad_norm": 1.7176140074390767, "language_loss": 0.85684448, "learning_rate": 1.1064007126351537e-07, "loss": 0.87843335, "num_input_tokens_seen": 161233595, "step": 7458, "time_per_iteration": 2.5985658168792725 }, { "auxiliary_loss_clip": 0.01103739, "auxiliary_loss_mlp": 0.01037737, "balance_loss_clip": 1.0392642, "balance_loss_mlp": 1.02209663, "epoch": 0.8968917212769795, "flos": 24535175938560.0, "grad_norm": 2.0775255470095906, "language_loss": 0.76366997, "learning_rate": 1.1038471690588003e-07, "loss": 0.78508472, "num_input_tokens_seen": 161252740, "step": 7459, "time_per_iteration": 2.646660327911377 }, { "auxiliary_loss_clip": 0.01078896, "auxiliary_loss_mlp": 0.01035288, "balance_loss_clip": 1.03853238, "balance_loss_mlp": 1.02047062, "epoch": 0.8970119641676186, "flos": 23475416048640.0, "grad_norm": 2.0114994553174363, "language_loss": 0.79886425, "learning_rate": 1.1012964920138145e-07, "loss": 0.82000613, "num_input_tokens_seen": 161272325, "step": 7460, "time_per_iteration": 2.6946380138397217 }, { "auxiliary_loss_clip": 0.01102368, "auxiliary_loss_mlp": 0.01036203, "balance_loss_clip": 1.03820133, "balance_loss_mlp": 1.0213083, "epoch": 0.8971322070582577, "flos": 24538192680960.0, "grad_norm": 1.7091020237941992, "language_loss": 0.75637501, "learning_rate": 1.0987486818871205e-07, "loss": 0.77776074, "num_input_tokens_seen": 161295915, "step": 7461, "time_per_iteration": 2.727133274078369 }, { "auxiliary_loss_clip": 0.01122513, "auxiliary_loss_mlp": 0.00771547, "balance_loss_clip": 1.04301631, "balance_loss_mlp": 1.00055707, "epoch": 0.8972524499488967, "flos": 21797454159360.0, "grad_norm": 3.5265850274866097, "language_loss": 0.73016298, "learning_rate": 1.0962037390652245e-07, "loss": 0.74910361, "num_input_tokens_seen": 161314935, "step": 7462, "time_per_iteration": 2.5480618476867676 }, { "auxiliary_loss_clip": 0.01105825, "auxiliary_loss_mlp": 0.01042106, "balance_loss_clip": 1.04092443, "balance_loss_mlp": 1.02749097, "epoch": 0.8973726928395359, "flos": 21726243446400.0, "grad_norm": 1.629519591250985, "language_loss": 0.71980405, "learning_rate": 1.0936616639341911e-07, "loss": 0.7412833, "num_input_tokens_seen": 161335225, "step": 7463, "time_per_iteration": 2.619614601135254 }, { "auxiliary_loss_clip": 0.01021115, "auxiliary_loss_mlp": 0.01002249, "balance_loss_clip": 1.00806046, "balance_loss_mlp": 1.00083029, "epoch": 0.897492935730175, "flos": 53837100097920.0, "grad_norm": 0.7414846802488789, "language_loss": 0.54711837, "learning_rate": 1.0911224568796473e-07, "loss": 0.567352, "num_input_tokens_seen": 161393420, "step": 7464, "time_per_iteration": 3.231066942214966 }, { "auxiliary_loss_clip": 0.011195, "auxiliary_loss_mlp": 0.01041537, "balance_loss_clip": 1.04333544, "balance_loss_mlp": 1.02654672, "epoch": 0.897613178620814, "flos": 18290346036480.0, "grad_norm": 2.7650324794724206, "language_loss": 0.71019012, "learning_rate": 1.0885861182867984e-07, "loss": 0.73180056, "num_input_tokens_seen": 161411525, "step": 7465, "time_per_iteration": 3.475783348083496 }, { "auxiliary_loss_clip": 0.01111152, "auxiliary_loss_mlp": 0.01035933, "balance_loss_clip": 1.04168856, "balance_loss_mlp": 1.02031624, "epoch": 0.8977334215114532, "flos": 32993718059520.0, "grad_norm": 2.0095518558486996, "language_loss": 0.7087003, "learning_rate": 1.0860526485403942e-07, "loss": 0.73017114, "num_input_tokens_seen": 161432800, "step": 7466, "time_per_iteration": 2.7014217376708984 }, { "auxiliary_loss_clip": 0.01132293, "auxiliary_loss_mlp": 0.01042634, "balance_loss_clip": 1.04408312, "balance_loss_mlp": 1.02738714, "epoch": 0.8978536644020922, "flos": 15195636938880.0, "grad_norm": 2.660015534788873, "language_loss": 0.7695148, "learning_rate": 1.0835220480247675e-07, "loss": 0.79126406, "num_input_tokens_seen": 161451295, "step": 7467, "time_per_iteration": 2.5274899005889893 }, { "auxiliary_loss_clip": 0.01102751, "auxiliary_loss_mlp": 0.01033732, "balance_loss_clip": 1.03982449, "balance_loss_mlp": 1.01865184, "epoch": 0.8979739072927313, "flos": 18004389863040.0, "grad_norm": 2.0822230817859277, "language_loss": 0.83810854, "learning_rate": 1.0809943171238067e-07, "loss": 0.85947335, "num_input_tokens_seen": 161469220, "step": 7468, "time_per_iteration": 3.5192928314208984 }, { "auxiliary_loss_clip": 0.01117477, "auxiliary_loss_mlp": 0.01043642, "balance_loss_clip": 1.04250324, "balance_loss_mlp": 1.02608299, "epoch": 0.8980941501833704, "flos": 22271546793600.0, "grad_norm": 2.709605320647839, "language_loss": 0.62967694, "learning_rate": 1.078469456220965e-07, "loss": 0.65128803, "num_input_tokens_seen": 161489375, "step": 7469, "time_per_iteration": 2.595796585083008 }, { "auxiliary_loss_clip": 0.01120172, "auxiliary_loss_mlp": 0.01039146, "balance_loss_clip": 1.04215908, "balance_loss_mlp": 1.02400684, "epoch": 0.8982143930740095, "flos": 37560729726720.0, "grad_norm": 2.1313839518220132, "language_loss": 0.69621575, "learning_rate": 1.0759474656992606e-07, "loss": 0.7178089, "num_input_tokens_seen": 161512145, "step": 7470, "time_per_iteration": 2.7418928146362305 }, { "auxiliary_loss_clip": 0.01113527, "auxiliary_loss_mlp": 0.01039815, "balance_loss_clip": 1.04163361, "balance_loss_mlp": 1.0238049, "epoch": 0.8983346359646486, "flos": 18076893465600.0, "grad_norm": 2.4252219615154367, "language_loss": 0.7787894, "learning_rate": 1.0734283459412785e-07, "loss": 0.80032283, "num_input_tokens_seen": 161528995, "step": 7471, "time_per_iteration": 3.5754916667938232 }, { "auxiliary_loss_clip": 0.0108905, "auxiliary_loss_mlp": 0.01039755, "balance_loss_clip": 1.03783548, "balance_loss_mlp": 1.02221966, "epoch": 0.8984548788552876, "flos": 20558895344640.0, "grad_norm": 1.9107459368444328, "language_loss": 0.8057009, "learning_rate": 1.0709120973291707e-07, "loss": 0.826989, "num_input_tokens_seen": 161548775, "step": 7472, "time_per_iteration": 3.584552526473999 }, { "auxiliary_loss_clip": 0.01136659, "auxiliary_loss_mlp": 0.01045309, "balance_loss_clip": 1.04457116, "balance_loss_mlp": 1.02870357, "epoch": 0.8985751217459268, "flos": 17785442511360.0, "grad_norm": 2.0317963317144345, "language_loss": 0.77358651, "learning_rate": 1.0683987202446475e-07, "loss": 0.79540622, "num_input_tokens_seen": 161566960, "step": 7473, "time_per_iteration": 2.5325264930725098 }, { "auxiliary_loss_clip": 0.01125344, "auxiliary_loss_mlp": 0.01035736, "balance_loss_clip": 1.04357481, "balance_loss_mlp": 1.02000022, "epoch": 0.8986953646365659, "flos": 21617003208960.0, "grad_norm": 3.308117611143968, "language_loss": 0.69805527, "learning_rate": 1.0658882150689862e-07, "loss": 0.71966612, "num_input_tokens_seen": 161585820, "step": 7474, "time_per_iteration": 2.560615062713623 }, { "auxiliary_loss_clip": 0.0109805, "auxiliary_loss_mlp": 0.01042807, "balance_loss_clip": 1.03869224, "balance_loss_mlp": 1.02665424, "epoch": 0.8988156075272049, "flos": 14027355083520.0, "grad_norm": 2.82861503733695, "language_loss": 0.78402114, "learning_rate": 1.0633805821830288e-07, "loss": 0.8054297, "num_input_tokens_seen": 161602505, "step": 7475, "time_per_iteration": 2.6633081436157227 }, { "auxiliary_loss_clip": 0.01105549, "auxiliary_loss_mlp": 0.01035179, "balance_loss_clip": 1.03882265, "balance_loss_mlp": 1.01885891, "epoch": 0.8989358504178441, "flos": 29059202004480.0, "grad_norm": 2.688896041379201, "language_loss": 0.8296724, "learning_rate": 1.0608758219671753e-07, "loss": 0.8510797, "num_input_tokens_seen": 161621545, "step": 7476, "time_per_iteration": 2.65986704826355 }, { "auxiliary_loss_clip": 0.01114749, "auxiliary_loss_mlp": 0.01040736, "balance_loss_clip": 1.04330587, "balance_loss_mlp": 1.02498877, "epoch": 0.8990560933084831, "flos": 20230420446720.0, "grad_norm": 1.5732393910132925, "language_loss": 0.7035082, "learning_rate": 1.0583739348014065e-07, "loss": 0.72506309, "num_input_tokens_seen": 161642630, "step": 7477, "time_per_iteration": 2.625563859939575 }, { "auxiliary_loss_clip": 0.01133603, "auxiliary_loss_mlp": 0.01038827, "balance_loss_clip": 1.04545379, "balance_loss_mlp": 1.02318692, "epoch": 0.8991763361991222, "flos": 25520672459520.0, "grad_norm": 1.921672452761505, "language_loss": 0.84558797, "learning_rate": 1.0558749210652518e-07, "loss": 0.86731225, "num_input_tokens_seen": 161662560, "step": 7478, "time_per_iteration": 2.5989606380462646 }, { "auxiliary_loss_clip": 0.01100118, "auxiliary_loss_mlp": 0.01043514, "balance_loss_clip": 1.04217052, "balance_loss_mlp": 1.02699196, "epoch": 0.8992965790897613, "flos": 25119191168640.0, "grad_norm": 1.6853184740444742, "language_loss": 0.85515368, "learning_rate": 1.053378781137808e-07, "loss": 0.87659001, "num_input_tokens_seen": 161683480, "step": 7479, "time_per_iteration": 2.6786441802978516 }, { "auxiliary_loss_clip": 0.01113789, "auxiliary_loss_mlp": 0.01042088, "balance_loss_clip": 1.04192019, "balance_loss_mlp": 1.02554131, "epoch": 0.8994168219804004, "flos": 16070815814400.0, "grad_norm": 1.857513641282441, "language_loss": 0.7777428, "learning_rate": 1.0508855153977392e-07, "loss": 0.79930151, "num_input_tokens_seen": 161699945, "step": 7480, "time_per_iteration": 2.6067557334899902 }, { "auxiliary_loss_clip": 0.01119359, "auxiliary_loss_mlp": 0.01041215, "balance_loss_clip": 1.04014039, "balance_loss_mlp": 1.02507412, "epoch": 0.8995370648710395, "flos": 24825764966400.0, "grad_norm": 2.44950296825059, "language_loss": 0.66707069, "learning_rate": 1.0483951242232669e-07, "loss": 0.68867648, "num_input_tokens_seen": 161720420, "step": 7481, "time_per_iteration": 2.614071846008301 }, { "auxiliary_loss_clip": 0.01034401, "auxiliary_loss_mlp": 0.01002478, "balance_loss_clip": 1.00662112, "balance_loss_mlp": 1.00098825, "epoch": 0.8996573077616786, "flos": 63116238378240.0, "grad_norm": 0.9758648608800627, "language_loss": 0.57687324, "learning_rate": 1.0459076079921936e-07, "loss": 0.59724206, "num_input_tokens_seen": 161773080, "step": 7482, "time_per_iteration": 3.190530776977539 }, { "auxiliary_loss_clip": 0.01100317, "auxiliary_loss_mlp": 0.01035195, "balance_loss_clip": 1.03822327, "balance_loss_mlp": 1.01976979, "epoch": 0.8997775506523177, "flos": 18219674027520.0, "grad_norm": 2.1732485478062182, "language_loss": 0.84975111, "learning_rate": 1.0434229670818618e-07, "loss": 0.87110627, "num_input_tokens_seen": 161789755, "step": 7483, "time_per_iteration": 2.5595076084136963 }, { "auxiliary_loss_clip": 0.01099917, "auxiliary_loss_mlp": 0.01037607, "balance_loss_clip": 1.03877079, "balance_loss_mlp": 1.02410638, "epoch": 0.8998977935429567, "flos": 24166768095360.0, "grad_norm": 1.5799990606417804, "language_loss": 0.80045259, "learning_rate": 1.0409412018691944e-07, "loss": 0.82182777, "num_input_tokens_seen": 161810220, "step": 7484, "time_per_iteration": 2.625555992126465 }, { "auxiliary_loss_clip": 0.01103989, "auxiliary_loss_mlp": 0.01050593, "balance_loss_clip": 1.03937662, "balance_loss_mlp": 1.03292656, "epoch": 0.9000180364335959, "flos": 20773030273920.0, "grad_norm": 1.89078940555331, "language_loss": 0.75033224, "learning_rate": 1.0384623127306724e-07, "loss": 0.771878, "num_input_tokens_seen": 161827565, "step": 7485, "time_per_iteration": 2.59694766998291 }, { "auxiliary_loss_clip": 0.01094351, "auxiliary_loss_mlp": 0.01054332, "balance_loss_clip": 1.03950274, "balance_loss_mlp": 1.03734493, "epoch": 0.900138279324235, "flos": 19205745166080.0, "grad_norm": 1.8122621715372127, "language_loss": 0.79484391, "learning_rate": 1.0359863000423397e-07, "loss": 0.81633073, "num_input_tokens_seen": 161845700, "step": 7486, "time_per_iteration": 2.621103048324585 }, { "auxiliary_loss_clip": 0.01131466, "auxiliary_loss_mlp": 0.01036214, "balance_loss_clip": 1.04300141, "balance_loss_mlp": 1.02183747, "epoch": 0.900258522214874, "flos": 28731158069760.0, "grad_norm": 1.6057049323078556, "language_loss": 0.71848428, "learning_rate": 1.0335131641798112e-07, "loss": 0.74016112, "num_input_tokens_seen": 161867660, "step": 7487, "time_per_iteration": 2.7700183391571045 }, { "auxiliary_loss_clip": 0.01017062, "auxiliary_loss_mlp": 0.01003524, "balance_loss_clip": 1.00801563, "balance_loss_mlp": 1.0018189, "epoch": 0.9003787651055132, "flos": 58280685655680.0, "grad_norm": 0.8021786555882662, "language_loss": 0.55578274, "learning_rate": 1.0310429055182512e-07, "loss": 0.57598859, "num_input_tokens_seen": 161921980, "step": 7488, "time_per_iteration": 3.03324031829834 }, { "auxiliary_loss_clip": 0.01098514, "auxiliary_loss_mlp": 0.01036766, "balance_loss_clip": 1.03859615, "balance_loss_mlp": 1.02012467, "epoch": 0.9004990079961522, "flos": 25556475340800.0, "grad_norm": 3.205361230568548, "language_loss": 0.74089992, "learning_rate": 1.0285755244324024e-07, "loss": 0.76225275, "num_input_tokens_seen": 161942725, "step": 7489, "time_per_iteration": 2.708324670791626 }, { "auxiliary_loss_clip": 0.01109362, "auxiliary_loss_mlp": 0.00771558, "balance_loss_clip": 1.04082143, "balance_loss_mlp": 1.00046778, "epoch": 0.9006192508867913, "flos": 23335185352320.0, "grad_norm": 1.6427465516260613, "language_loss": 0.6878221, "learning_rate": 1.0261110212965629e-07, "loss": 0.7066313, "num_input_tokens_seen": 161964520, "step": 7490, "time_per_iteration": 2.6498587131500244 }, { "auxiliary_loss_clip": 0.01107221, "auxiliary_loss_mlp": 0.01032978, "balance_loss_clip": 1.04109645, "balance_loss_mlp": 1.0180769, "epoch": 0.9007394937774305, "flos": 18040300485120.0, "grad_norm": 2.0653525245957596, "language_loss": 0.7918632, "learning_rate": 1.023649396484596e-07, "loss": 0.8132652, "num_input_tokens_seen": 161983575, "step": 7491, "time_per_iteration": 2.582794427871704 }, { "auxiliary_loss_clip": 0.01131405, "auxiliary_loss_mlp": 0.01034595, "balance_loss_clip": 1.04190743, "balance_loss_mlp": 1.01897812, "epoch": 0.9008597366680695, "flos": 43068456633600.0, "grad_norm": 3.002467617013114, "language_loss": 0.67492735, "learning_rate": 1.0211906503699275e-07, "loss": 0.69658738, "num_input_tokens_seen": 162006550, "step": 7492, "time_per_iteration": 3.66552996635437 }, { "auxiliary_loss_clip": 0.0112419, "auxiliary_loss_mlp": 0.01038368, "balance_loss_clip": 1.04401755, "balance_loss_mlp": 1.02194071, "epoch": 0.9009799795587086, "flos": 14939055112320.0, "grad_norm": 4.756661155373055, "language_loss": 0.82086748, "learning_rate": 1.0187347833255455e-07, "loss": 0.84249312, "num_input_tokens_seen": 162022455, "step": 7493, "time_per_iteration": 2.561998128890991 }, { "auxiliary_loss_clip": 0.01130391, "auxiliary_loss_mlp": 0.0103734, "balance_loss_clip": 1.04448986, "balance_loss_mlp": 1.0218904, "epoch": 0.9011002224493477, "flos": 21579584215680.0, "grad_norm": 1.7403110217418059, "language_loss": 0.79100853, "learning_rate": 1.0162817957240056e-07, "loss": 0.81268585, "num_input_tokens_seen": 162042350, "step": 7494, "time_per_iteration": 3.5099377632141113 }, { "auxiliary_loss_clip": 0.01025367, "auxiliary_loss_mlp": 0.00999845, "balance_loss_clip": 1.00704896, "balance_loss_mlp": 0.99827147, "epoch": 0.9012204653399868, "flos": 71166367883520.0, "grad_norm": 0.8859556159323954, "language_loss": 0.62954807, "learning_rate": 1.0138316879374253e-07, "loss": 0.64980018, "num_input_tokens_seen": 162111640, "step": 7495, "time_per_iteration": 3.334649085998535 }, { "auxiliary_loss_clip": 0.01110152, "auxiliary_loss_mlp": 0.01036298, "balance_loss_clip": 1.04186296, "balance_loss_mlp": 1.0211823, "epoch": 0.9013407082306258, "flos": 15594963413760.0, "grad_norm": 3.6821479989605814, "language_loss": 0.74235803, "learning_rate": 1.0113844603374833e-07, "loss": 0.76382244, "num_input_tokens_seen": 162128165, "step": 7496, "time_per_iteration": 2.5705668926239014 }, { "auxiliary_loss_clip": 0.01108069, "auxiliary_loss_mlp": 0.01038375, "balance_loss_clip": 1.03856373, "balance_loss_mlp": 1.02215075, "epoch": 0.901460951121265, "flos": 15049157276160.0, "grad_norm": 3.3171933664479125, "language_loss": 0.71878213, "learning_rate": 1.0089401132954178e-07, "loss": 0.74024653, "num_input_tokens_seen": 162146145, "step": 7497, "time_per_iteration": 3.6192004680633545 }, { "auxiliary_loss_clip": 0.01105399, "auxiliary_loss_mlp": 0.01033916, "balance_loss_clip": 1.03966343, "balance_loss_mlp": 1.01844215, "epoch": 0.9015811940119041, "flos": 22236857233920.0, "grad_norm": 1.9443810821321534, "language_loss": 0.72552609, "learning_rate": 1.006498647182037e-07, "loss": 0.74691921, "num_input_tokens_seen": 162164800, "step": 7498, "time_per_iteration": 3.4666028022766113 }, { "auxiliary_loss_clip": 0.01064, "auxiliary_loss_mlp": 0.01042725, "balance_loss_clip": 1.03525674, "balance_loss_mlp": 1.02614331, "epoch": 0.9017014369025431, "flos": 24973824827520.0, "grad_norm": 2.043198925168171, "language_loss": 0.71645707, "learning_rate": 1.004060062367713e-07, "loss": 0.73752439, "num_input_tokens_seen": 162185895, "step": 7499, "time_per_iteration": 2.7294366359710693 }, { "auxiliary_loss_clip": 0.01121539, "auxiliary_loss_mlp": 0.01040262, "balance_loss_clip": 1.04270148, "balance_loss_mlp": 1.02427626, "epoch": 0.9018216797931822, "flos": 18114168804480.0, "grad_norm": 1.8467507964171486, "language_loss": 0.6991598, "learning_rate": 1.0016243592223728e-07, "loss": 0.72077781, "num_input_tokens_seen": 162206295, "step": 7500, "time_per_iteration": 2.6025989055633545 }, { "auxiliary_loss_clip": 0.01069959, "auxiliary_loss_mlp": 0.01043178, "balance_loss_clip": 1.03634214, "balance_loss_mlp": 1.02718568, "epoch": 0.9019419226838213, "flos": 37268452759680.0, "grad_norm": 2.109874234898489, "language_loss": 0.65735072, "learning_rate": 9.991915381155114e-08, "loss": 0.67848212, "num_input_tokens_seen": 162229275, "step": 7501, "time_per_iteration": 2.9598000049591064 }, { "auxiliary_loss_clip": 0.01121498, "auxiliary_loss_mlp": 0.01042292, "balance_loss_clip": 1.04069281, "balance_loss_mlp": 1.0273788, "epoch": 0.9020621655744604, "flos": 23441121538560.0, "grad_norm": 2.519198248812778, "language_loss": 0.74932039, "learning_rate": 9.967615994161871e-08, "loss": 0.7709583, "num_input_tokens_seen": 162248935, "step": 7502, "time_per_iteration": 2.767174005508423 }, { "auxiliary_loss_clip": 0.01129237, "auxiliary_loss_mlp": 0.01046638, "balance_loss_clip": 1.04236722, "balance_loss_mlp": 1.03160572, "epoch": 0.9021824084650995, "flos": 22857465444480.0, "grad_norm": 1.962412950748376, "language_loss": 0.78316724, "learning_rate": 9.943345434930161e-08, "loss": 0.80492598, "num_input_tokens_seen": 162269185, "step": 7503, "time_per_iteration": 2.5659868717193604 }, { "auxiliary_loss_clip": 0.01094658, "auxiliary_loss_mlp": 0.01036785, "balance_loss_clip": 1.03963101, "balance_loss_mlp": 1.02184844, "epoch": 0.9023026513557386, "flos": 22127581082880.0, "grad_norm": 2.0864973277319967, "language_loss": 0.68676025, "learning_rate": 9.919103707141885e-08, "loss": 0.70807469, "num_input_tokens_seen": 162288065, "step": 7504, "time_per_iteration": 2.6329033374786377 }, { "auxiliary_loss_clip": 0.01121522, "auxiliary_loss_mlp": 0.01034264, "balance_loss_clip": 1.04219449, "balance_loss_mlp": 1.01775336, "epoch": 0.9024228942463777, "flos": 24199087357440.0, "grad_norm": 1.9504237165661849, "language_loss": 0.76311755, "learning_rate": 9.89489081447441e-08, "loss": 0.78467536, "num_input_tokens_seen": 162305265, "step": 7505, "time_per_iteration": 2.5523993968963623 }, { "auxiliary_loss_clip": 0.01106904, "auxiliary_loss_mlp": 0.01038782, "balance_loss_clip": 1.03941298, "balance_loss_mlp": 1.02177072, "epoch": 0.9025431371370167, "flos": 25008262992000.0, "grad_norm": 1.9728235906901987, "language_loss": 0.83076787, "learning_rate": 9.870706760600844e-08, "loss": 0.85222465, "num_input_tokens_seen": 162325215, "step": 7506, "time_per_iteration": 2.5885610580444336 }, { "auxiliary_loss_clip": 0.01090958, "auxiliary_loss_mlp": 0.01040959, "balance_loss_clip": 1.04373252, "balance_loss_mlp": 1.02513957, "epoch": 0.9026633800276559, "flos": 18952862440320.0, "grad_norm": 1.8596193895419033, "language_loss": 0.72760677, "learning_rate": 9.846551549189918e-08, "loss": 0.74892592, "num_input_tokens_seen": 162344820, "step": 7507, "time_per_iteration": 2.582798719406128 }, { "auxiliary_loss_clip": 0.01107086, "auxiliary_loss_mlp": 0.01037443, "balance_loss_clip": 1.04000771, "balance_loss_mlp": 1.02163029, "epoch": 0.902783622918295, "flos": 32416059536640.0, "grad_norm": 1.9127323136984635, "language_loss": 0.68139893, "learning_rate": 9.822425183905902e-08, "loss": 0.70284426, "num_input_tokens_seen": 162365345, "step": 7508, "time_per_iteration": 2.6769137382507324 }, { "auxiliary_loss_clip": 0.01007712, "auxiliary_loss_mlp": 0.01001528, "balance_loss_clip": 1.00803912, "balance_loss_mlp": 0.99994296, "epoch": 0.902903865808934, "flos": 63717453244800.0, "grad_norm": 0.9060682651666506, "language_loss": 0.75109792, "learning_rate": 9.798327668408823e-08, "loss": 0.77119035, "num_input_tokens_seen": 162426980, "step": 7509, "time_per_iteration": 3.318697690963745 }, { "auxiliary_loss_clip": 0.01135889, "auxiliary_loss_mlp": 0.01041351, "balance_loss_clip": 1.04389369, "balance_loss_mlp": 1.02464938, "epoch": 0.9030241086995732, "flos": 23804034600960.0, "grad_norm": 1.8901055360826537, "language_loss": 0.68781918, "learning_rate": 9.774259006354158e-08, "loss": 0.70959163, "num_input_tokens_seen": 162447050, "step": 7510, "time_per_iteration": 2.562059164047241 }, { "auxiliary_loss_clip": 0.01109426, "auxiliary_loss_mlp": 0.01036332, "balance_loss_clip": 1.04035544, "balance_loss_mlp": 1.02081072, "epoch": 0.9031443515902122, "flos": 26395887248640.0, "grad_norm": 2.424562366217091, "language_loss": 0.7639848, "learning_rate": 9.750219201393184e-08, "loss": 0.78544241, "num_input_tokens_seen": 162467015, "step": 7511, "time_per_iteration": 2.634453773498535 }, { "auxiliary_loss_clip": 0.01119072, "auxiliary_loss_mlp": 0.01034183, "balance_loss_clip": 1.04060507, "balance_loss_mlp": 1.01921058, "epoch": 0.9032645944808513, "flos": 24939350749440.0, "grad_norm": 2.47242957293164, "language_loss": 0.77994633, "learning_rate": 9.726208257172697e-08, "loss": 0.80147886, "num_input_tokens_seen": 162488710, "step": 7512, "time_per_iteration": 2.609092950820923 }, { "auxiliary_loss_clip": 0.01130729, "auxiliary_loss_mlp": 0.01039924, "balance_loss_clip": 1.04322505, "balance_loss_mlp": 1.02465367, "epoch": 0.9033848373714904, "flos": 21178821196800.0, "grad_norm": 2.716627836099085, "language_loss": 0.74681377, "learning_rate": 9.702226177335115e-08, "loss": 0.7685203, "num_input_tokens_seen": 162507205, "step": 7513, "time_per_iteration": 2.5224156379699707 }, { "auxiliary_loss_clip": 0.01103374, "auxiliary_loss_mlp": 0.01036487, "balance_loss_clip": 1.03996158, "balance_loss_mlp": 1.02231884, "epoch": 0.9035050802621295, "flos": 26286359702400.0, "grad_norm": 1.757748655558513, "language_loss": 0.72388387, "learning_rate": 9.67827296551853e-08, "loss": 0.74528253, "num_input_tokens_seen": 162528490, "step": 7514, "time_per_iteration": 2.6333062648773193 }, { "auxiliary_loss_clip": 0.01101793, "auxiliary_loss_mlp": 0.00772442, "balance_loss_clip": 1.04018712, "balance_loss_mlp": 1.00050688, "epoch": 0.9036253231527686, "flos": 24204546224640.0, "grad_norm": 2.30800601201535, "language_loss": 0.68350714, "learning_rate": 9.65434862535659e-08, "loss": 0.70224947, "num_input_tokens_seen": 162547860, "step": 7515, "time_per_iteration": 2.641780138015747 }, { "auxiliary_loss_clip": 0.0111461, "auxiliary_loss_mlp": 0.0104563, "balance_loss_clip": 1.04318523, "balance_loss_mlp": 1.03037131, "epoch": 0.9037455660434077, "flos": 18072655660800.0, "grad_norm": 2.6682189843179187, "language_loss": 0.65919, "learning_rate": 9.630453160478635e-08, "loss": 0.68079245, "num_input_tokens_seen": 162563215, "step": 7516, "time_per_iteration": 2.5481972694396973 }, { "auxiliary_loss_clip": 0.01081649, "auxiliary_loss_mlp": 0.01040919, "balance_loss_clip": 1.03682971, "balance_loss_mlp": 1.02571356, "epoch": 0.9038658089340468, "flos": 24060795995520.0, "grad_norm": 1.8564757699988919, "language_loss": 0.82488233, "learning_rate": 9.60658657450959e-08, "loss": 0.84610796, "num_input_tokens_seen": 162583515, "step": 7517, "time_per_iteration": 2.662357807159424 }, { "auxiliary_loss_clip": 0.01094705, "auxiliary_loss_mlp": 0.0103537, "balance_loss_clip": 1.03757799, "balance_loss_mlp": 1.01956344, "epoch": 0.9039860518246858, "flos": 21834298535040.0, "grad_norm": 1.6060150314423312, "language_loss": 0.79343069, "learning_rate": 9.582748871069979e-08, "loss": 0.81473148, "num_input_tokens_seen": 162602955, "step": 7518, "time_per_iteration": 3.516942024230957 }, { "auxiliary_loss_clip": 0.01109166, "auxiliary_loss_mlp": 0.00770769, "balance_loss_clip": 1.04007947, "balance_loss_mlp": 1.00055635, "epoch": 0.904106294715325, "flos": 26614870513920.0, "grad_norm": 1.8931318729790814, "language_loss": 0.83261722, "learning_rate": 9.558940053775954e-08, "loss": 0.85141659, "num_input_tokens_seen": 162621595, "step": 7519, "time_per_iteration": 2.6116552352905273 }, { "auxiliary_loss_clip": 0.01119913, "auxiliary_loss_mlp": 0.01041698, "balance_loss_clip": 1.04115546, "balance_loss_mlp": 1.02577138, "epoch": 0.904226537605964, "flos": 17785693906560.0, "grad_norm": 1.8669650787961258, "language_loss": 0.67512405, "learning_rate": 9.535160126239294e-08, "loss": 0.69674015, "num_input_tokens_seen": 162638220, "step": 7520, "time_per_iteration": 3.963500738143921 }, { "auxiliary_loss_clip": 0.01117754, "auxiliary_loss_mlp": 0.01035139, "balance_loss_clip": 1.04180503, "balance_loss_mlp": 1.02023184, "epoch": 0.9043467804966031, "flos": 24790428961920.0, "grad_norm": 1.571540611461625, "language_loss": 0.7059468, "learning_rate": 9.511409092067424e-08, "loss": 0.7274757, "num_input_tokens_seen": 162658575, "step": 7521, "time_per_iteration": 2.606954336166382 }, { "auxiliary_loss_clip": 0.01107068, "auxiliary_loss_mlp": 0.01044763, "balance_loss_clip": 1.03954649, "balance_loss_mlp": 1.02826405, "epoch": 0.9044670233872423, "flos": 22632125472000.0, "grad_norm": 1.869072776061624, "language_loss": 0.66992784, "learning_rate": 9.487686954863327e-08, "loss": 0.69144613, "num_input_tokens_seen": 162678295, "step": 7522, "time_per_iteration": 3.6758947372436523 }, { "auxiliary_loss_clip": 0.0111989, "auxiliary_loss_mlp": 0.01036968, "balance_loss_clip": 1.04190969, "balance_loss_mlp": 1.02056527, "epoch": 0.9045872662778813, "flos": 23771320289280.0, "grad_norm": 2.1998638988548493, "language_loss": 0.77522445, "learning_rate": 9.46399371822566e-08, "loss": 0.79679298, "num_input_tokens_seen": 162698070, "step": 7523, "time_per_iteration": 2.5839085578918457 }, { "auxiliary_loss_clip": 0.01132701, "auxiliary_loss_mlp": 0.01036519, "balance_loss_clip": 1.04323411, "balance_loss_mlp": 1.02167702, "epoch": 0.9047075091685204, "flos": 15191039998080.0, "grad_norm": 4.03503063498935, "language_loss": 0.72473621, "learning_rate": 9.440329385748657e-08, "loss": 0.74642843, "num_input_tokens_seen": 162715140, "step": 7524, "time_per_iteration": 3.346681833267212 }, { "auxiliary_loss_clip": 0.01096458, "auxiliary_loss_mlp": 0.0103468, "balance_loss_clip": 1.0408839, "balance_loss_mlp": 1.02069688, "epoch": 0.9048277520591596, "flos": 18003707504640.0, "grad_norm": 1.824401827592673, "language_loss": 0.7086432, "learning_rate": 9.416693961022137e-08, "loss": 0.7299546, "num_input_tokens_seen": 162733390, "step": 7525, "time_per_iteration": 2.5769920349121094 }, { "auxiliary_loss_clip": 0.01059013, "auxiliary_loss_mlp": 0.01039717, "balance_loss_clip": 1.03488398, "balance_loss_mlp": 1.02438712, "epoch": 0.9049479949497986, "flos": 21872471713920.0, "grad_norm": 4.000088270649937, "language_loss": 0.77035934, "learning_rate": 9.393087447631654e-08, "loss": 0.79134661, "num_input_tokens_seen": 162751670, "step": 7526, "time_per_iteration": 2.6813197135925293 }, { "auxiliary_loss_clip": 0.01102036, "auxiliary_loss_mlp": 0.01038917, "balance_loss_clip": 1.03853774, "balance_loss_mlp": 1.0247606, "epoch": 0.9050682378404377, "flos": 20773928113920.0, "grad_norm": 21.0206406597369, "language_loss": 0.7313363, "learning_rate": 9.36950984915823e-08, "loss": 0.75274575, "num_input_tokens_seen": 162770025, "step": 7527, "time_per_iteration": 2.5809216499328613 }, { "auxiliary_loss_clip": 0.01133717, "auxiliary_loss_mlp": 0.01041063, "balance_loss_clip": 1.04458523, "balance_loss_mlp": 1.02629256, "epoch": 0.9051884807310768, "flos": 21580015178880.0, "grad_norm": 2.45279319089842, "language_loss": 0.69494367, "learning_rate": 9.345961169178607e-08, "loss": 0.71669143, "num_input_tokens_seen": 162789710, "step": 7528, "time_per_iteration": 2.5322208404541016 }, { "auxiliary_loss_clip": 0.01078844, "auxiliary_loss_mlp": 0.01042139, "balance_loss_clip": 1.03779113, "balance_loss_mlp": 1.02640367, "epoch": 0.9053087236217159, "flos": 21908059113600.0, "grad_norm": 7.826351243642248, "language_loss": 0.72659755, "learning_rate": 9.322441411265081e-08, "loss": 0.74780738, "num_input_tokens_seen": 162810695, "step": 7529, "time_per_iteration": 2.6271135807037354 }, { "auxiliary_loss_clip": 0.01105457, "auxiliary_loss_mlp": 0.01041738, "balance_loss_clip": 1.04180121, "balance_loss_mlp": 1.02392793, "epoch": 0.9054289665123549, "flos": 17055809544960.0, "grad_norm": 2.266323388562252, "language_loss": 0.7331388, "learning_rate": 9.298950578985554e-08, "loss": 0.75461078, "num_input_tokens_seen": 162827770, "step": 7530, "time_per_iteration": 2.5584678649902344 }, { "auxiliary_loss_clip": 0.01116503, "auxiliary_loss_mlp": 0.007722, "balance_loss_clip": 1.04110217, "balance_loss_mlp": 1.0004878, "epoch": 0.905549209402994, "flos": 20777268078720.0, "grad_norm": 1.7780197543685567, "language_loss": 0.70922863, "learning_rate": 9.275488675903665e-08, "loss": 0.72811568, "num_input_tokens_seen": 162846715, "step": 7531, "time_per_iteration": 2.595008134841919 }, { "auxiliary_loss_clip": 0.01076626, "auxiliary_loss_mlp": 0.01038066, "balance_loss_clip": 1.03760207, "balance_loss_mlp": 1.02273619, "epoch": 0.9056694522936332, "flos": 21686813291520.0, "grad_norm": 2.7163797050901346, "language_loss": 0.74050391, "learning_rate": 9.252055705578454e-08, "loss": 0.76165086, "num_input_tokens_seen": 162866215, "step": 7532, "time_per_iteration": 2.640134334564209 }, { "auxiliary_loss_clip": 0.01119107, "auxiliary_loss_mlp": 0.01036134, "balance_loss_clip": 1.04194975, "balance_loss_mlp": 1.02083325, "epoch": 0.9057896951842722, "flos": 29569133433600.0, "grad_norm": 2.1687205332027104, "language_loss": 0.71982968, "learning_rate": 9.228651671564747e-08, "loss": 0.74138206, "num_input_tokens_seen": 162888245, "step": 7533, "time_per_iteration": 2.595341682434082 }, { "auxiliary_loss_clip": 0.01075414, "auxiliary_loss_mlp": 0.01037057, "balance_loss_clip": 1.03819752, "balance_loss_mlp": 1.02214372, "epoch": 0.9059099380749113, "flos": 27892248952320.0, "grad_norm": 1.5702695766761274, "language_loss": 0.78008437, "learning_rate": 9.205276577412901e-08, "loss": 0.80120903, "num_input_tokens_seen": 162911025, "step": 7534, "time_per_iteration": 2.666510820388794 }, { "auxiliary_loss_clip": 0.01116915, "auxiliary_loss_mlp": 0.00772378, "balance_loss_clip": 1.043015, "balance_loss_mlp": 1.00049758, "epoch": 0.9060301809655504, "flos": 17748993185280.0, "grad_norm": 2.6817351817096813, "language_loss": 0.76910359, "learning_rate": 9.181930426668905e-08, "loss": 0.78799653, "num_input_tokens_seen": 162927820, "step": 7535, "time_per_iteration": 2.572688102722168 }, { "auxiliary_loss_clip": 0.01074887, "auxiliary_loss_mlp": 0.01048679, "balance_loss_clip": 1.03696024, "balance_loss_mlp": 1.03078604, "epoch": 0.9061504238561895, "flos": 31759432963200.0, "grad_norm": 1.725021206801983, "language_loss": 0.67775428, "learning_rate": 9.158613222874346e-08, "loss": 0.69898993, "num_input_tokens_seen": 162949445, "step": 7536, "time_per_iteration": 2.705944061279297 }, { "auxiliary_loss_clip": 0.01106633, "auxiliary_loss_mlp": 0.0103507, "balance_loss_clip": 1.0388844, "balance_loss_mlp": 1.01977587, "epoch": 0.9062706667468285, "flos": 20048066075520.0, "grad_norm": 1.798885744388151, "language_loss": 0.8195864, "learning_rate": 9.135324969566394e-08, "loss": 0.84100348, "num_input_tokens_seen": 162968945, "step": 7537, "time_per_iteration": 2.577208995819092 }, { "auxiliary_loss_clip": 0.01124353, "auxiliary_loss_mlp": 0.01043469, "balance_loss_clip": 1.04288554, "balance_loss_mlp": 1.02858019, "epoch": 0.9063909096374677, "flos": 18437292576000.0, "grad_norm": 6.689544401895218, "language_loss": 0.75692421, "learning_rate": 9.112065670277913e-08, "loss": 0.77860236, "num_input_tokens_seen": 162985310, "step": 7538, "time_per_iteration": 2.507398843765259 }, { "auxiliary_loss_clip": 0.01101056, "auxiliary_loss_mlp": 0.01036901, "balance_loss_clip": 1.03850293, "balance_loss_mlp": 1.02316809, "epoch": 0.9065111525281068, "flos": 33547353361920.0, "grad_norm": 1.7387082109974574, "language_loss": 0.72961414, "learning_rate": 9.088835328537303e-08, "loss": 0.75099367, "num_input_tokens_seen": 163006900, "step": 7539, "time_per_iteration": 2.6965830326080322 }, { "auxiliary_loss_clip": 0.011094, "auxiliary_loss_mlp": 0.01038333, "balance_loss_clip": 1.04014659, "balance_loss_mlp": 1.02288377, "epoch": 0.9066313954187458, "flos": 23367863750400.0, "grad_norm": 2.5226082404031054, "language_loss": 0.71849877, "learning_rate": 9.065633947868568e-08, "loss": 0.73997611, "num_input_tokens_seen": 163026505, "step": 7540, "time_per_iteration": 2.723970890045166 }, { "auxiliary_loss_clip": 0.01096267, "auxiliary_loss_mlp": 0.0077135, "balance_loss_clip": 1.04115713, "balance_loss_mlp": 1.00054348, "epoch": 0.906751638309385, "flos": 26249623067520.0, "grad_norm": 2.3480641946545826, "language_loss": 0.80016768, "learning_rate": 9.042461531791379e-08, "loss": 0.81884384, "num_input_tokens_seen": 163044925, "step": 7541, "time_per_iteration": 2.6525418758392334 }, { "auxiliary_loss_clip": 0.01126745, "auxiliary_loss_mlp": 0.01035308, "balance_loss_clip": 1.04152179, "balance_loss_mlp": 1.02043629, "epoch": 0.906871881200024, "flos": 16544477485440.0, "grad_norm": 1.9977426081201501, "language_loss": 0.78162313, "learning_rate": 9.019318083820903e-08, "loss": 0.80324364, "num_input_tokens_seen": 163063505, "step": 7542, "time_per_iteration": 2.53196382522583 }, { "auxiliary_loss_clip": 0.01120439, "auxiliary_loss_mlp": 0.01046551, "balance_loss_clip": 1.04329932, "balance_loss_mlp": 1.03082716, "epoch": 0.9069921240906631, "flos": 24605129675520.0, "grad_norm": 1.7255554695049613, "language_loss": 0.855286, "learning_rate": 8.996203607468045e-08, "loss": 0.87695587, "num_input_tokens_seen": 163082505, "step": 7543, "time_per_iteration": 2.615114450454712 }, { "auxiliary_loss_clip": 0.01114795, "auxiliary_loss_mlp": 0.01044666, "balance_loss_clip": 1.03806126, "balance_loss_mlp": 1.028108, "epoch": 0.9071123669813023, "flos": 25374731500800.0, "grad_norm": 1.6909949828035846, "language_loss": 0.75469768, "learning_rate": 8.973118106239241e-08, "loss": 0.77629232, "num_input_tokens_seen": 163105110, "step": 7544, "time_per_iteration": 3.611196994781494 }, { "auxiliary_loss_clip": 0.01070323, "auxiliary_loss_mlp": 0.0104389, "balance_loss_clip": 1.03588915, "balance_loss_mlp": 1.02699792, "epoch": 0.9072326098719413, "flos": 26725798690560.0, "grad_norm": 2.8392145386977217, "language_loss": 0.94739199, "learning_rate": 8.95006158363656e-08, "loss": 0.96853411, "num_input_tokens_seen": 163125295, "step": 7545, "time_per_iteration": 2.7250144481658936 }, { "auxiliary_loss_clip": 0.01117176, "auxiliary_loss_mlp": 0.01045945, "balance_loss_clip": 1.04069424, "balance_loss_mlp": 1.02879095, "epoch": 0.9073528527625804, "flos": 23878800760320.0, "grad_norm": 2.167136162365301, "language_loss": 0.77597171, "learning_rate": 8.9270340431576e-08, "loss": 0.79760289, "num_input_tokens_seen": 163144385, "step": 7546, "time_per_iteration": 3.4949698448181152 }, { "auxiliary_loss_clip": 0.01121434, "auxiliary_loss_mlp": 0.01039775, "balance_loss_clip": 1.04221106, "balance_loss_mlp": 1.02550554, "epoch": 0.9074730956532195, "flos": 37852144767360.0, "grad_norm": 7.222470323305482, "language_loss": 0.73456705, "learning_rate": 8.904035488295658e-08, "loss": 0.75617915, "num_input_tokens_seen": 163163885, "step": 7547, "time_per_iteration": 2.6906521320343018 }, { "auxiliary_loss_clip": 0.01025552, "auxiliary_loss_mlp": 0.00755672, "balance_loss_clip": 1.00710297, "balance_loss_mlp": 1.00022697, "epoch": 0.9075933385438586, "flos": 65173307385600.0, "grad_norm": 0.6585228200165562, "language_loss": 0.53254008, "learning_rate": 8.881065922539632e-08, "loss": 0.55035233, "num_input_tokens_seen": 163224325, "step": 7548, "time_per_iteration": 3.9478564262390137 }, { "auxiliary_loss_clip": 0.01092058, "auxiliary_loss_mlp": 0.01029943, "balance_loss_clip": 1.04059184, "balance_loss_mlp": 1.01532841, "epoch": 0.9077135814344977, "flos": 19931571290880.0, "grad_norm": 3.249687319213386, "language_loss": 0.73402643, "learning_rate": 8.85812534937389e-08, "loss": 0.7552464, "num_input_tokens_seen": 163242425, "step": 7549, "time_per_iteration": 2.6134560108184814 }, { "auxiliary_loss_clip": 0.01127343, "auxiliary_loss_mlp": 0.01044168, "balance_loss_clip": 1.0448215, "balance_loss_mlp": 1.02854002, "epoch": 0.9078338243251368, "flos": 17529650784000.0, "grad_norm": 2.5367160094280257, "language_loss": 0.67895603, "learning_rate": 8.835213772278583e-08, "loss": 0.7006712, "num_input_tokens_seen": 163259280, "step": 7550, "time_per_iteration": 3.5402472019195557 }, { "auxiliary_loss_clip": 0.01083624, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.03775954, "balance_loss_mlp": 1.02099955, "epoch": 0.9079540672157759, "flos": 28803410277120.0, "grad_norm": 1.8255844208123566, "language_loss": 0.79160154, "learning_rate": 8.812331194729373e-08, "loss": 0.8127858, "num_input_tokens_seen": 163278925, "step": 7551, "time_per_iteration": 2.683251142501831 }, { "auxiliary_loss_clip": 0.01137007, "auxiliary_loss_mlp": 0.01047892, "balance_loss_clip": 1.04562616, "balance_loss_mlp": 1.03053474, "epoch": 0.9080743101064149, "flos": 23513840622720.0, "grad_norm": 2.8501283155713093, "language_loss": 0.7210983, "learning_rate": 8.789477620197461e-08, "loss": 0.74294728, "num_input_tokens_seen": 163298450, "step": 7552, "time_per_iteration": 2.548905611038208 }, { "auxiliary_loss_clip": 0.01104044, "auxiliary_loss_mlp": 0.01041501, "balance_loss_clip": 1.03933811, "balance_loss_mlp": 1.02535999, "epoch": 0.9081945529970541, "flos": 22778102344320.0, "grad_norm": 2.2671540273381385, "language_loss": 0.78672928, "learning_rate": 8.766653052149831e-08, "loss": 0.80818468, "num_input_tokens_seen": 163313635, "step": 7553, "time_per_iteration": 2.6045172214508057 }, { "auxiliary_loss_clip": 0.01108669, "auxiliary_loss_mlp": 0.01039462, "balance_loss_clip": 1.04018831, "balance_loss_mlp": 1.02260542, "epoch": 0.9083147958876931, "flos": 18873714821760.0, "grad_norm": 2.393854204792172, "language_loss": 0.7485255, "learning_rate": 8.743857494048823e-08, "loss": 0.7700069, "num_input_tokens_seen": 163330450, "step": 7554, "time_per_iteration": 2.566908359527588 }, { "auxiliary_loss_clip": 0.01092565, "auxiliary_loss_mlp": 0.01034998, "balance_loss_clip": 1.03799963, "balance_loss_mlp": 1.02012682, "epoch": 0.9084350387783322, "flos": 18909374048640.0, "grad_norm": 2.3277374006866816, "language_loss": 0.62880588, "learning_rate": 8.721090949352605e-08, "loss": 0.65008152, "num_input_tokens_seen": 163346690, "step": 7555, "time_per_iteration": 2.5951783657073975 }, { "auxiliary_loss_clip": 0.01128643, "auxiliary_loss_mlp": 0.01041287, "balance_loss_clip": 1.04379988, "balance_loss_mlp": 1.02451444, "epoch": 0.9085552816689714, "flos": 20595488325120.0, "grad_norm": 5.2527822875801675, "language_loss": 0.73023653, "learning_rate": 8.698353421514793e-08, "loss": 0.75193584, "num_input_tokens_seen": 163365065, "step": 7556, "time_per_iteration": 2.5535247325897217 }, { "auxiliary_loss_clip": 0.01117734, "auxiliary_loss_mlp": 0.0103757, "balance_loss_clip": 1.04261184, "balance_loss_mlp": 1.02284169, "epoch": 0.9086755245596104, "flos": 18113163223680.0, "grad_norm": 2.3540855777994563, "language_loss": 0.80341542, "learning_rate": 8.67564491398467e-08, "loss": 0.82496846, "num_input_tokens_seen": 163382070, "step": 7557, "time_per_iteration": 2.5509915351867676 }, { "auxiliary_loss_clip": 0.0112341, "auxiliary_loss_mlp": 0.01041739, "balance_loss_clip": 1.04344344, "balance_loss_mlp": 1.02636099, "epoch": 0.9087957674502495, "flos": 19129793857920.0, "grad_norm": 2.0122863761652914, "language_loss": 0.73436528, "learning_rate": 8.652965430207104e-08, "loss": 0.75601673, "num_input_tokens_seen": 163399975, "step": 7558, "time_per_iteration": 2.5373928546905518 }, { "auxiliary_loss_clip": 0.01125656, "auxiliary_loss_mlp": 0.01038219, "balance_loss_clip": 1.04340744, "balance_loss_mlp": 1.02119613, "epoch": 0.9089160103408886, "flos": 18109930999680.0, "grad_norm": 2.0525662155680946, "language_loss": 0.65465987, "learning_rate": 8.630314973622521e-08, "loss": 0.67629856, "num_input_tokens_seen": 163417520, "step": 7559, "time_per_iteration": 2.5369489192962646 }, { "auxiliary_loss_clip": 0.01116356, "auxiliary_loss_mlp": 0.01038008, "balance_loss_clip": 1.04239726, "balance_loss_mlp": 1.0231545, "epoch": 0.9090362532315277, "flos": 33364855336320.0, "grad_norm": 1.8916812300658232, "language_loss": 0.71084583, "learning_rate": 8.607693547666995e-08, "loss": 0.73238951, "num_input_tokens_seen": 163440060, "step": 7560, "time_per_iteration": 2.6775081157684326 }, { "auxiliary_loss_clip": 0.01008999, "auxiliary_loss_mlp": 0.01004126, "balance_loss_clip": 1.00832248, "balance_loss_mlp": 1.00267816, "epoch": 0.9091564961221668, "flos": 71480585082240.0, "grad_norm": 0.907737424981443, "language_loss": 0.57947713, "learning_rate": 8.585101155772201e-08, "loss": 0.59960836, "num_input_tokens_seen": 163502180, "step": 7561, "time_per_iteration": 3.28187894821167 }, { "auxiliary_loss_clip": 0.01099139, "auxiliary_loss_mlp": 0.01040513, "balance_loss_clip": 1.03653669, "balance_loss_mlp": 1.02410972, "epoch": 0.9092767390128058, "flos": 24712574232960.0, "grad_norm": 2.5640661513293788, "language_loss": 0.68643248, "learning_rate": 8.562537801365377e-08, "loss": 0.70782894, "num_input_tokens_seen": 163521915, "step": 7562, "time_per_iteration": 2.630805253982544 }, { "auxiliary_loss_clip": 0.01131989, "auxiliary_loss_mlp": 0.01043474, "balance_loss_clip": 1.04238915, "balance_loss_mlp": 1.026564, "epoch": 0.909396981903445, "flos": 23586487879680.0, "grad_norm": 1.7200736914505455, "language_loss": 0.69730008, "learning_rate": 8.540003487869362e-08, "loss": 0.7190547, "num_input_tokens_seen": 163543585, "step": 7563, "time_per_iteration": 2.5504183769226074 }, { "auxiliary_loss_clip": 0.010837, "auxiliary_loss_mlp": 0.01032678, "balance_loss_clip": 1.03654957, "balance_loss_mlp": 1.01701355, "epoch": 0.909517224794084, "flos": 23404169422080.0, "grad_norm": 4.201591045718172, "language_loss": 0.79816663, "learning_rate": 8.517498218702557e-08, "loss": 0.81933045, "num_input_tokens_seen": 163561515, "step": 7564, "time_per_iteration": 2.730095148086548 }, { "auxiliary_loss_clip": 0.01090891, "auxiliary_loss_mlp": 0.01035722, "balance_loss_clip": 1.03811288, "balance_loss_mlp": 1.02084494, "epoch": 0.9096374676847231, "flos": 19208618254080.0, "grad_norm": 1.6215219399909633, "language_loss": 0.69236565, "learning_rate": 8.49502199727905e-08, "loss": 0.71363175, "num_input_tokens_seen": 163579540, "step": 7565, "time_per_iteration": 2.6650288105010986 }, { "auxiliary_loss_clip": 0.01117939, "auxiliary_loss_mlp": 0.0103867, "balance_loss_clip": 1.04040456, "balance_loss_mlp": 1.02199268, "epoch": 0.9097577105753623, "flos": 33292495388160.0, "grad_norm": 2.2043899882359463, "language_loss": 0.66326892, "learning_rate": 8.472574827008428e-08, "loss": 0.68483496, "num_input_tokens_seen": 163600425, "step": 7566, "time_per_iteration": 2.6660704612731934 }, { "auxiliary_loss_clip": 0.01117276, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.04108298, "balance_loss_mlp": 1.01971459, "epoch": 0.9098779534660013, "flos": 21906443001600.0, "grad_norm": 1.8894705236388043, "language_loss": 0.83845282, "learning_rate": 8.450156711295942e-08, "loss": 0.85997206, "num_input_tokens_seen": 163620595, "step": 7567, "time_per_iteration": 2.6613919734954834 }, { "auxiliary_loss_clip": 0.01105436, "auxiliary_loss_mlp": 0.01036984, "balance_loss_clip": 1.03989148, "balance_loss_mlp": 1.02183211, "epoch": 0.9099981963566404, "flos": 25730354102400.0, "grad_norm": 2.200846399780803, "language_loss": 0.86608589, "learning_rate": 8.427767653542383e-08, "loss": 0.88751006, "num_input_tokens_seen": 163635765, "step": 7568, "time_per_iteration": 2.6091389656066895 }, { "auxiliary_loss_clip": 0.01076544, "auxiliary_loss_mlp": 0.01037382, "balance_loss_clip": 1.03371406, "balance_loss_mlp": 1.02186728, "epoch": 0.9101184392472795, "flos": 21069437304960.0, "grad_norm": 4.413619331366024, "language_loss": 0.7013588, "learning_rate": 8.405407657144125e-08, "loss": 0.722498, "num_input_tokens_seen": 163654925, "step": 7569, "time_per_iteration": 3.5797104835510254 }, { "auxiliary_loss_clip": 0.0110107, "auxiliary_loss_mlp": 0.01039163, "balance_loss_clip": 1.03901708, "balance_loss_mlp": 1.02319503, "epoch": 0.9102386821379186, "flos": 24752614919040.0, "grad_norm": 2.1870915643373667, "language_loss": 0.72597134, "learning_rate": 8.383076725493232e-08, "loss": 0.7473737, "num_input_tokens_seen": 163672245, "step": 7570, "time_per_iteration": 2.6074798107147217 }, { "auxiliary_loss_clip": 0.0112053, "auxiliary_loss_mlp": 0.01038549, "balance_loss_clip": 1.04209113, "balance_loss_mlp": 1.02370739, "epoch": 0.9103589250285576, "flos": 22562818179840.0, "grad_norm": 15.235998378650564, "language_loss": 0.6826458, "learning_rate": 8.360774861977216e-08, "loss": 0.70423663, "num_input_tokens_seen": 163691365, "step": 7571, "time_per_iteration": 2.551008462905884 }, { "auxiliary_loss_clip": 0.01103671, "auxiliary_loss_mlp": 0.01036016, "balance_loss_clip": 1.03704739, "balance_loss_mlp": 1.02104378, "epoch": 0.9104791679191968, "flos": 25373474524800.0, "grad_norm": 2.007183676098876, "language_loss": 0.74271023, "learning_rate": 8.338502069979281e-08, "loss": 0.76410705, "num_input_tokens_seen": 163711675, "step": 7572, "time_per_iteration": 3.6015307903289795 }, { "auxiliary_loss_clip": 0.01121316, "auxiliary_loss_mlp": 0.01036053, "balance_loss_clip": 1.04135597, "balance_loss_mlp": 1.02043676, "epoch": 0.9105994108098359, "flos": 14426681558400.0, "grad_norm": 3.4191105421915267, "language_loss": 0.79975867, "learning_rate": 8.316258352878214e-08, "loss": 0.82133234, "num_input_tokens_seen": 163728095, "step": 7573, "time_per_iteration": 2.5168936252593994 }, { "auxiliary_loss_clip": 0.01125031, "auxiliary_loss_mlp": 0.01042031, "balance_loss_clip": 1.04216802, "balance_loss_mlp": 1.0251627, "epoch": 0.9107196537004749, "flos": 26718292748160.0, "grad_norm": 1.9145688544291386, "language_loss": 0.71222079, "learning_rate": 8.294043714048338e-08, "loss": 0.73389137, "num_input_tokens_seen": 163747175, "step": 7574, "time_per_iteration": 2.587355375289917 }, { "auxiliary_loss_clip": 0.01017768, "auxiliary_loss_mlp": 0.01002576, "balance_loss_clip": 1.0080893, "balance_loss_mlp": 1.00099075, "epoch": 0.9108398965911141, "flos": 66532634703360.0, "grad_norm": 0.7511678495037798, "language_loss": 0.60428077, "learning_rate": 8.271858156859624e-08, "loss": 0.62448418, "num_input_tokens_seen": 163812545, "step": 7575, "time_per_iteration": 4.255435943603516 }, { "auxiliary_loss_clip": 0.01132501, "auxiliary_loss_mlp": 0.01035699, "balance_loss_clip": 1.04291606, "balance_loss_mlp": 1.01962972, "epoch": 0.9109601394817531, "flos": 25411073086080.0, "grad_norm": 1.6065788524809896, "language_loss": 0.7386893, "learning_rate": 8.249701684677557e-08, "loss": 0.76037133, "num_input_tokens_seen": 163833870, "step": 7576, "time_per_iteration": 3.4329066276550293 }, { "auxiliary_loss_clip": 0.01120954, "auxiliary_loss_mlp": 0.01036225, "balance_loss_clip": 1.04450107, "balance_loss_mlp": 1.02063227, "epoch": 0.9110803823723922, "flos": 22747794243840.0, "grad_norm": 1.7937189589660611, "language_loss": 0.80918896, "learning_rate": 8.227574300863294e-08, "loss": 0.83076072, "num_input_tokens_seen": 163854040, "step": 7577, "time_per_iteration": 2.561797618865967 }, { "auxiliary_loss_clip": 0.01112381, "auxiliary_loss_mlp": 0.01037273, "balance_loss_clip": 1.04143429, "balance_loss_mlp": 1.02168584, "epoch": 0.9112006252630314, "flos": 48469924131840.0, "grad_norm": 1.6909621975976334, "language_loss": 0.69654477, "learning_rate": 8.205476008773548e-08, "loss": 0.7180413, "num_input_tokens_seen": 163878040, "step": 7578, "time_per_iteration": 2.797194004058838 }, { "auxiliary_loss_clip": 0.0109026, "auxiliary_loss_mlp": 0.01036712, "balance_loss_clip": 1.03993511, "balance_loss_mlp": 1.02079749, "epoch": 0.9113208681536704, "flos": 30009649829760.0, "grad_norm": 2.809225713210951, "language_loss": 0.82605064, "learning_rate": 8.183406811760596e-08, "loss": 0.84732038, "num_input_tokens_seen": 163897770, "step": 7579, "time_per_iteration": 2.6825623512268066 }, { "auxiliary_loss_clip": 0.01078351, "auxiliary_loss_mlp": 0.01045236, "balance_loss_clip": 1.03467917, "balance_loss_mlp": 1.02882099, "epoch": 0.9114411110443095, "flos": 25594971742080.0, "grad_norm": 1.6549579156438636, "language_loss": 0.74222291, "learning_rate": 8.161366713172313e-08, "loss": 0.76345873, "num_input_tokens_seen": 163920160, "step": 7580, "time_per_iteration": 2.6671454906463623 }, { "auxiliary_loss_clip": 0.01100126, "auxiliary_loss_mlp": 0.01043028, "balance_loss_clip": 1.03993988, "balance_loss_mlp": 1.02463377, "epoch": 0.9115613539349486, "flos": 18399729928320.0, "grad_norm": 2.6484617142676545, "language_loss": 0.84025252, "learning_rate": 8.139355716352137e-08, "loss": 0.86168396, "num_input_tokens_seen": 163935000, "step": 7581, "time_per_iteration": 2.5457475185394287 }, { "auxiliary_loss_clip": 0.01107227, "auxiliary_loss_mlp": 0.01037491, "balance_loss_clip": 1.03810811, "balance_loss_mlp": 1.02248216, "epoch": 0.9116815968255877, "flos": 21726171619200.0, "grad_norm": 1.8602615755712202, "language_loss": 0.69901705, "learning_rate": 8.117373824639196e-08, "loss": 0.72046423, "num_input_tokens_seen": 163955265, "step": 7582, "time_per_iteration": 2.578579902648926 }, { "auxiliary_loss_clip": 0.01034335, "auxiliary_loss_mlp": 0.01000986, "balance_loss_clip": 1.00637507, "balance_loss_mlp": 0.99940008, "epoch": 0.9118018397162267, "flos": 65363526835200.0, "grad_norm": 0.7271569984390479, "language_loss": 0.59261954, "learning_rate": 8.095421041368067e-08, "loss": 0.61297274, "num_input_tokens_seen": 164014680, "step": 7583, "time_per_iteration": 3.039661169052124 }, { "auxiliary_loss_clip": 0.01104892, "auxiliary_loss_mlp": 0.00771435, "balance_loss_clip": 1.04026985, "balance_loss_mlp": 1.00056052, "epoch": 0.9119220826068659, "flos": 20922885815040.0, "grad_norm": 1.9863574154560473, "language_loss": 0.70438516, "learning_rate": 8.073497369868999e-08, "loss": 0.72314847, "num_input_tokens_seen": 164033140, "step": 7584, "time_per_iteration": 2.5935449600219727 }, { "auxiliary_loss_clip": 0.0111863, "auxiliary_loss_mlp": 0.01035629, "balance_loss_clip": 1.04327679, "balance_loss_mlp": 1.01840341, "epoch": 0.912042325497505, "flos": 28366449327360.0, "grad_norm": 1.6048819021064322, "language_loss": 0.75624275, "learning_rate": 8.051602813467772e-08, "loss": 0.7777853, "num_input_tokens_seen": 164054995, "step": 7585, "time_per_iteration": 2.6445348262786865 }, { "auxiliary_loss_clip": 0.01119603, "auxiliary_loss_mlp": 0.01038522, "balance_loss_clip": 1.04208052, "balance_loss_mlp": 1.02423525, "epoch": 0.912162568388144, "flos": 17566782468480.0, "grad_norm": 3.1882187932307025, "language_loss": 0.71519375, "learning_rate": 8.029737375485756e-08, "loss": 0.73677498, "num_input_tokens_seen": 164074225, "step": 7586, "time_per_iteration": 2.522033929824829 }, { "auxiliary_loss_clip": 0.01131376, "auxiliary_loss_mlp": 0.01038533, "balance_loss_clip": 1.04305625, "balance_loss_mlp": 1.02369189, "epoch": 0.9122828112787832, "flos": 19827897661440.0, "grad_norm": 1.8206262199642547, "language_loss": 0.72446656, "learning_rate": 8.007901059239986e-08, "loss": 0.74616569, "num_input_tokens_seen": 164093505, "step": 7587, "time_per_iteration": 2.5202083587646484 }, { "auxiliary_loss_clip": 0.01109567, "auxiliary_loss_mlp": 0.01034599, "balance_loss_clip": 1.04052114, "balance_loss_mlp": 1.01980484, "epoch": 0.9124030541694222, "flos": 20813789232000.0, "grad_norm": 1.6857043467269168, "language_loss": 0.80383193, "learning_rate": 7.986093868042964e-08, "loss": 0.82527357, "num_input_tokens_seen": 164113750, "step": 7588, "time_per_iteration": 2.6000592708587646 }, { "auxiliary_loss_clip": 0.01117514, "auxiliary_loss_mlp": 0.01043178, "balance_loss_clip": 1.04040885, "balance_loss_mlp": 1.02837205, "epoch": 0.9125232970600613, "flos": 25192305302400.0, "grad_norm": 1.8047580459458163, "language_loss": 0.67929953, "learning_rate": 7.964315805202826e-08, "loss": 0.70090652, "num_input_tokens_seen": 164134330, "step": 7589, "time_per_iteration": 2.6792094707489014 }, { "auxiliary_loss_clip": 0.01106775, "auxiliary_loss_mlp": 0.01037454, "balance_loss_clip": 1.04223394, "balance_loss_mlp": 1.02280354, "epoch": 0.9126435399507005, "flos": 19719591177600.0, "grad_norm": 7.054579791771631, "language_loss": 0.73564339, "learning_rate": 7.942566874023304e-08, "loss": 0.75708574, "num_input_tokens_seen": 164153515, "step": 7590, "time_per_iteration": 2.580855131149292 }, { "auxiliary_loss_clip": 0.01102718, "auxiliary_loss_mlp": 0.01036268, "balance_loss_clip": 1.03721428, "balance_loss_mlp": 1.01976919, "epoch": 0.9127637828413395, "flos": 19573614305280.0, "grad_norm": 2.399538326329254, "language_loss": 0.6953606, "learning_rate": 7.920847077803649e-08, "loss": 0.7167505, "num_input_tokens_seen": 164171305, "step": 7591, "time_per_iteration": 2.564979314804077 }, { "auxiliary_loss_clip": 0.01069283, "auxiliary_loss_mlp": 0.01047717, "balance_loss_clip": 1.03326762, "balance_loss_mlp": 1.03068233, "epoch": 0.9128840257319786, "flos": 20230635928320.0, "grad_norm": 1.7950545494082415, "language_loss": 0.82462305, "learning_rate": 7.899156419838826e-08, "loss": 0.84579301, "num_input_tokens_seen": 164190275, "step": 7592, "time_per_iteration": 2.6292672157287598 }, { "auxiliary_loss_clip": 0.01094196, "auxiliary_loss_mlp": 0.01040611, "balance_loss_clip": 1.03979623, "balance_loss_mlp": 1.02504253, "epoch": 0.9130042686226177, "flos": 24858658846080.0, "grad_norm": 1.807740725022418, "language_loss": 0.65363193, "learning_rate": 7.87749490341918e-08, "loss": 0.67498004, "num_input_tokens_seen": 164210550, "step": 7593, "time_per_iteration": 2.651707887649536 }, { "auxiliary_loss_clip": 0.01134277, "auxiliary_loss_mlp": 0.01039313, "balance_loss_clip": 1.04262877, "balance_loss_mlp": 1.02268362, "epoch": 0.9131245115132568, "flos": 23581747284480.0, "grad_norm": 2.2072762021885914, "language_loss": 0.83571422, "learning_rate": 7.855862531830836e-08, "loss": 0.85745007, "num_input_tokens_seen": 164226660, "step": 7594, "time_per_iteration": 2.5162885189056396 }, { "auxiliary_loss_clip": 0.01118202, "auxiliary_loss_mlp": 0.01037221, "balance_loss_clip": 1.04072249, "balance_loss_mlp": 1.02178383, "epoch": 0.9132447544038959, "flos": 19931607204480.0, "grad_norm": 1.5855142068526817, "language_loss": 0.72678638, "learning_rate": 7.834259308355373e-08, "loss": 0.74834061, "num_input_tokens_seen": 164245425, "step": 7595, "time_per_iteration": 2.6124775409698486 }, { "auxiliary_loss_clip": 0.01054716, "auxiliary_loss_mlp": 0.01043123, "balance_loss_clip": 1.03311265, "balance_loss_mlp": 1.02524114, "epoch": 0.9133649972945349, "flos": 21981747864960.0, "grad_norm": 3.914033609495429, "language_loss": 0.7503739, "learning_rate": 7.812685236269989e-08, "loss": 0.77135229, "num_input_tokens_seen": 164264085, "step": 7596, "time_per_iteration": 3.664252996444702 }, { "auxiliary_loss_clip": 0.01004563, "auxiliary_loss_mlp": 0.01002794, "balance_loss_clip": 1.00969541, "balance_loss_mlp": 1.00126839, "epoch": 0.9134852401851741, "flos": 71240523511680.0, "grad_norm": 0.7900632077514843, "language_loss": 0.58667099, "learning_rate": 7.791140318847445e-08, "loss": 0.60674453, "num_input_tokens_seen": 164322220, "step": 7597, "time_per_iteration": 4.113935232162476 }, { "auxiliary_loss_clip": 0.01101417, "auxiliary_loss_mlp": 0.01038191, "balance_loss_clip": 1.04090261, "balance_loss_mlp": 1.02319455, "epoch": 0.9136054830758131, "flos": 23626923615360.0, "grad_norm": 2.0014915871041175, "language_loss": 0.80403638, "learning_rate": 7.769624559356081e-08, "loss": 0.82543242, "num_input_tokens_seen": 164345615, "step": 7598, "time_per_iteration": 2.707937479019165 }, { "auxiliary_loss_clip": 0.011192, "auxiliary_loss_mlp": 0.01039391, "balance_loss_clip": 1.0413816, "balance_loss_mlp": 1.02376246, "epoch": 0.9137257259664522, "flos": 23438858981760.0, "grad_norm": 3.092879090101599, "language_loss": 0.75472105, "learning_rate": 7.748137961059842e-08, "loss": 0.77630699, "num_input_tokens_seen": 164359595, "step": 7599, "time_per_iteration": 2.5543086528778076 }, { "auxiliary_loss_clip": 0.01125508, "auxiliary_loss_mlp": 0.0103729, "balance_loss_clip": 1.04181075, "balance_loss_mlp": 1.02278209, "epoch": 0.9138459688570914, "flos": 19127854523520.0, "grad_norm": 2.3129401781319894, "language_loss": 0.66055346, "learning_rate": 7.726680527218211e-08, "loss": 0.68218148, "num_input_tokens_seen": 164376635, "step": 7600, "time_per_iteration": 3.511993885040283 }, { "auxiliary_loss_clip": 0.01129317, "auxiliary_loss_mlp": 0.01038719, "balance_loss_clip": 1.03959131, "balance_loss_mlp": 1.02323329, "epoch": 0.9139662117477304, "flos": 46281240714240.0, "grad_norm": 1.7589406935871144, "language_loss": 0.75552666, "learning_rate": 7.70525226108627e-08, "loss": 0.77720702, "num_input_tokens_seen": 164400305, "step": 7601, "time_per_iteration": 2.745330333709717 }, { "auxiliary_loss_clip": 0.01119969, "auxiliary_loss_mlp": 0.01037155, "balance_loss_clip": 1.04295945, "balance_loss_mlp": 1.02139533, "epoch": 0.9140864546383695, "flos": 22273198819200.0, "grad_norm": 2.3834010838804915, "language_loss": 0.79891938, "learning_rate": 7.683853165914666e-08, "loss": 0.8204906, "num_input_tokens_seen": 164418075, "step": 7602, "time_per_iteration": 3.483818531036377 }, { "auxiliary_loss_clip": 0.01082249, "auxiliary_loss_mlp": 0.0104109, "balance_loss_clip": 1.03617191, "balance_loss_mlp": 1.02525938, "epoch": 0.9142066975290086, "flos": 17530009920000.0, "grad_norm": 1.8746366602559734, "language_loss": 0.77083713, "learning_rate": 7.662483244949602e-08, "loss": 0.79207051, "num_input_tokens_seen": 164435335, "step": 7603, "time_per_iteration": 2.619119882583618 }, { "auxiliary_loss_clip": 0.01089363, "auxiliary_loss_mlp": 0.01036906, "balance_loss_clip": 1.03951287, "balance_loss_mlp": 1.0219574, "epoch": 0.9143269404196477, "flos": 17712148809600.0, "grad_norm": 2.3909661578870725, "language_loss": 0.80539691, "learning_rate": 7.641142501432951e-08, "loss": 0.82665956, "num_input_tokens_seen": 164451530, "step": 7604, "time_per_iteration": 2.6109859943389893 }, { "auxiliary_loss_clip": 0.01104061, "auxiliary_loss_mlp": 0.01039037, "balance_loss_clip": 1.03940177, "balance_loss_mlp": 1.02395725, "epoch": 0.9144471833102867, "flos": 33323414019840.0, "grad_norm": 1.6330370155378255, "language_loss": 0.73959553, "learning_rate": 7.619830938602013e-08, "loss": 0.7610265, "num_input_tokens_seen": 164472755, "step": 7605, "time_per_iteration": 2.6715996265411377 }, { "auxiliary_loss_clip": 0.01115337, "auxiliary_loss_mlp": 0.01048936, "balance_loss_clip": 1.04224586, "balance_loss_mlp": 1.03349829, "epoch": 0.9145674262009259, "flos": 21068970428160.0, "grad_norm": 3.2424096235017803, "language_loss": 0.82800376, "learning_rate": 7.598548559689777e-08, "loss": 0.84964657, "num_input_tokens_seen": 164491155, "step": 7606, "time_per_iteration": 2.554414749145508 }, { "auxiliary_loss_clip": 0.01089041, "auxiliary_loss_mlp": 0.01039495, "balance_loss_clip": 1.03794301, "balance_loss_mlp": 1.02378917, "epoch": 0.914687669091565, "flos": 16800269212800.0, "grad_norm": 2.158862304727693, "language_loss": 0.813411, "learning_rate": 7.577295367924751e-08, "loss": 0.83469635, "num_input_tokens_seen": 164507555, "step": 7607, "time_per_iteration": 2.6158642768859863 }, { "auxiliary_loss_clip": 0.01110455, "auxiliary_loss_mlp": 0.01042634, "balance_loss_clip": 1.04095089, "balance_loss_mlp": 1.02613521, "epoch": 0.914807911982204, "flos": 25773627012480.0, "grad_norm": 2.4420796061838272, "language_loss": 0.82400179, "learning_rate": 7.556071366531002e-08, "loss": 0.84553266, "num_input_tokens_seen": 164528525, "step": 7608, "time_per_iteration": 2.6007003784179688 }, { "auxiliary_loss_clip": 0.01119692, "auxiliary_loss_mlp": 0.01045604, "balance_loss_clip": 1.04218006, "balance_loss_mlp": 1.02800822, "epoch": 0.9149281548728432, "flos": 19208043636480.0, "grad_norm": 1.9021099485594317, "language_loss": 0.79113066, "learning_rate": 7.53487655872822e-08, "loss": 0.8127836, "num_input_tokens_seen": 164547695, "step": 7609, "time_per_iteration": 2.5440378189086914 }, { "auxiliary_loss_clip": 0.01081732, "auxiliary_loss_mlp": 0.01035064, "balance_loss_clip": 1.0347296, "balance_loss_mlp": 1.01911974, "epoch": 0.9150483977634822, "flos": 26870554500480.0, "grad_norm": 1.9986744701741244, "language_loss": 0.74131948, "learning_rate": 7.513710947731656e-08, "loss": 0.76248741, "num_input_tokens_seen": 164568905, "step": 7610, "time_per_iteration": 2.739794969558716 }, { "auxiliary_loss_clip": 0.01098884, "auxiliary_loss_mlp": 0.0104079, "balance_loss_clip": 1.03953815, "balance_loss_mlp": 1.02509058, "epoch": 0.9151686406541213, "flos": 21908956953600.0, "grad_norm": 1.7351119962407966, "language_loss": 0.85232615, "learning_rate": 7.492574536752095e-08, "loss": 0.87372291, "num_input_tokens_seen": 164588895, "step": 7611, "time_per_iteration": 2.6169869899749756 }, { "auxiliary_loss_clip": 0.0111689, "auxiliary_loss_mlp": 0.01039053, "balance_loss_clip": 1.04176712, "balance_loss_mlp": 1.0237112, "epoch": 0.9152888835447605, "flos": 27308556944640.0, "grad_norm": 2.01975080168641, "language_loss": 0.78446686, "learning_rate": 7.471467328995907e-08, "loss": 0.80602628, "num_input_tokens_seen": 164607705, "step": 7612, "time_per_iteration": 2.611229419708252 }, { "auxiliary_loss_clip": 0.01052794, "auxiliary_loss_mlp": 0.0103385, "balance_loss_clip": 1.03371406, "balance_loss_mlp": 1.01672006, "epoch": 0.9154091264353995, "flos": 13370728510080.0, "grad_norm": 2.6947484856111683, "language_loss": 0.61035049, "learning_rate": 7.450389327665018e-08, "loss": 0.631217, "num_input_tokens_seen": 164625540, "step": 7613, "time_per_iteration": 2.8756628036499023 }, { "auxiliary_loss_clip": 0.01096695, "auxiliary_loss_mlp": 0.01039141, "balance_loss_clip": 1.04281747, "balance_loss_mlp": 1.02288675, "epoch": 0.9155293693260386, "flos": 20193037367040.0, "grad_norm": 2.366579886905588, "language_loss": 0.67496991, "learning_rate": 7.429340535957029e-08, "loss": 0.69632828, "num_input_tokens_seen": 164640735, "step": 7614, "time_per_iteration": 2.7918379306793213 }, { "auxiliary_loss_clip": 0.01111723, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.04260123, "balance_loss_mlp": 1.01986074, "epoch": 0.9156496122166777, "flos": 19354990176000.0, "grad_norm": 2.887119688945359, "language_loss": 0.70972651, "learning_rate": 7.40832095706494e-08, "loss": 0.73119175, "num_input_tokens_seen": 164657430, "step": 7615, "time_per_iteration": 2.584702730178833 }, { "auxiliary_loss_clip": 0.01100789, "auxiliary_loss_mlp": 0.0104761, "balance_loss_clip": 1.04160929, "balance_loss_mlp": 1.03196931, "epoch": 0.9157698551073168, "flos": 21107287261440.0, "grad_norm": 2.115291458244918, "language_loss": 0.80245006, "learning_rate": 7.387330594177443e-08, "loss": 0.82393402, "num_input_tokens_seen": 164679505, "step": 7616, "time_per_iteration": 2.6376657485961914 }, { "auxiliary_loss_clip": 0.01086355, "auxiliary_loss_mlp": 0.01043145, "balance_loss_clip": 1.03820336, "balance_loss_mlp": 1.02705169, "epoch": 0.9158900979979558, "flos": 25193167228800.0, "grad_norm": 1.862748890179539, "language_loss": 0.79374397, "learning_rate": 7.366369450478749e-08, "loss": 0.81503898, "num_input_tokens_seen": 164700615, "step": 7617, "time_per_iteration": 2.683318853378296 }, { "auxiliary_loss_clip": 0.01090202, "auxiliary_loss_mlp": 0.01042164, "balance_loss_clip": 1.03710127, "balance_loss_mlp": 1.02683318, "epoch": 0.916010340888595, "flos": 30146648302080.0, "grad_norm": 2.14551654536141, "language_loss": 0.66267979, "learning_rate": 7.345437529148646e-08, "loss": 0.68400347, "num_input_tokens_seen": 164719625, "step": 7618, "time_per_iteration": 2.691344976425171 }, { "auxiliary_loss_clip": 0.01093966, "auxiliary_loss_mlp": 0.01040912, "balance_loss_clip": 1.03854299, "balance_loss_mlp": 1.02573705, "epoch": 0.9161305837792341, "flos": 17091827907840.0, "grad_norm": 1.8813891288917017, "language_loss": 0.73130673, "learning_rate": 7.324534833362483e-08, "loss": 0.75265551, "num_input_tokens_seen": 164737200, "step": 7619, "time_per_iteration": 2.6344170570373535 }, { "auxiliary_loss_clip": 0.01106585, "auxiliary_loss_mlp": 0.01039207, "balance_loss_clip": 1.04056811, "balance_loss_mlp": 1.02468145, "epoch": 0.9162508266698731, "flos": 22893699288960.0, "grad_norm": 2.0715861387399124, "language_loss": 0.68555009, "learning_rate": 7.303661366291192e-08, "loss": 0.707008, "num_input_tokens_seen": 164757870, "step": 7620, "time_per_iteration": 2.644702672958374 }, { "auxiliary_loss_clip": 0.01079511, "auxiliary_loss_mlp": 0.01036185, "balance_loss_clip": 1.03791237, "balance_loss_mlp": 1.02102113, "epoch": 0.9163710695605123, "flos": 19974808287360.0, "grad_norm": 1.8400855902297242, "language_loss": 0.81266451, "learning_rate": 7.28281713110126e-08, "loss": 0.83382142, "num_input_tokens_seen": 164775945, "step": 7621, "time_per_iteration": 3.6353869438171387 }, { "auxiliary_loss_clip": 0.01100494, "auxiliary_loss_mlp": 0.01038942, "balance_loss_clip": 1.04122174, "balance_loss_mlp": 1.02470827, "epoch": 0.9164913124511513, "flos": 22783812606720.0, "grad_norm": 1.9817681279383894, "language_loss": 0.76976353, "learning_rate": 7.262002130954759e-08, "loss": 0.7911579, "num_input_tokens_seen": 164794400, "step": 7622, "time_per_iteration": 2.6601028442382812 }, { "auxiliary_loss_clip": 0.01087001, "auxiliary_loss_mlp": 0.01042772, "balance_loss_clip": 1.04119658, "balance_loss_mlp": 1.02748895, "epoch": 0.9166115553417904, "flos": 24900854348160.0, "grad_norm": 1.696732098003622, "language_loss": 0.7903235, "learning_rate": 7.241216369009296e-08, "loss": 0.81162119, "num_input_tokens_seen": 164814585, "step": 7623, "time_per_iteration": 4.282405138015747 }, { "auxiliary_loss_clip": 0.01130091, "auxiliary_loss_mlp": 0.01039011, "balance_loss_clip": 1.04157686, "balance_loss_mlp": 1.02369261, "epoch": 0.9167317982324296, "flos": 25702919089920.0, "grad_norm": 1.81700042431151, "language_loss": 0.66694266, "learning_rate": 7.220459848418037e-08, "loss": 0.68863368, "num_input_tokens_seen": 164834660, "step": 7624, "time_per_iteration": 2.59200119972229 }, { "auxiliary_loss_clip": 0.01130303, "auxiliary_loss_mlp": 0.0103799, "balance_loss_clip": 1.04269159, "balance_loss_mlp": 1.02278495, "epoch": 0.9168520411230686, "flos": 15632813370240.0, "grad_norm": 1.8451695179827983, "language_loss": 0.79516935, "learning_rate": 7.199732572329708e-08, "loss": 0.81685227, "num_input_tokens_seen": 164852560, "step": 7625, "time_per_iteration": 2.5173449516296387 }, { "auxiliary_loss_clip": 0.01093807, "auxiliary_loss_mlp": 0.01040169, "balance_loss_clip": 1.03973281, "balance_loss_mlp": 1.02461243, "epoch": 0.9169722840137077, "flos": 30258151096320.0, "grad_norm": 2.112324859821759, "language_loss": 0.76216179, "learning_rate": 7.179034543888684e-08, "loss": 0.78350157, "num_input_tokens_seen": 164872065, "step": 7626, "time_per_iteration": 3.65061092376709 }, { "auxiliary_loss_clip": 0.01121232, "auxiliary_loss_mlp": 0.01034556, "balance_loss_clip": 1.03995383, "balance_loss_mlp": 1.01830745, "epoch": 0.9170925269043467, "flos": 22491643380480.0, "grad_norm": 2.78354646777761, "language_loss": 0.77593261, "learning_rate": 7.158365766234808e-08, "loss": 0.79749048, "num_input_tokens_seen": 164890915, "step": 7627, "time_per_iteration": 2.589111804962158 }, { "auxiliary_loss_clip": 0.01086457, "auxiliary_loss_mlp": 0.0104139, "balance_loss_clip": 1.03558075, "balance_loss_mlp": 1.02479625, "epoch": 0.9172127697949859, "flos": 22893914770560.0, "grad_norm": 5.329210686111031, "language_loss": 0.72288835, "learning_rate": 7.137726242503527e-08, "loss": 0.74416685, "num_input_tokens_seen": 164909835, "step": 7628, "time_per_iteration": 3.4980714321136475 }, { "auxiliary_loss_clip": 0.0111902, "auxiliary_loss_mlp": 0.00771725, "balance_loss_clip": 1.0409348, "balance_loss_mlp": 1.00058055, "epoch": 0.917333012685625, "flos": 17451867882240.0, "grad_norm": 3.545282177277409, "language_loss": 0.78194433, "learning_rate": 7.11711597582585e-08, "loss": 0.80085182, "num_input_tokens_seen": 164927195, "step": 7629, "time_per_iteration": 2.539980411529541 }, { "auxiliary_loss_clip": 0.01095079, "auxiliary_loss_mlp": 0.01036003, "balance_loss_clip": 1.0374645, "balance_loss_mlp": 1.02094698, "epoch": 0.917453255576264, "flos": 14318949692160.0, "grad_norm": 1.673074940702643, "language_loss": 0.79746449, "learning_rate": 7.096534969328271e-08, "loss": 0.81877536, "num_input_tokens_seen": 164944640, "step": 7630, "time_per_iteration": 2.6036438941955566 }, { "auxiliary_loss_clip": 0.01109061, "auxiliary_loss_mlp": 0.01041215, "balance_loss_clip": 1.03833425, "balance_loss_mlp": 1.02596247, "epoch": 0.9175734984669032, "flos": 20741177888640.0, "grad_norm": 2.010460624067119, "language_loss": 0.84600139, "learning_rate": 7.075983226132987e-08, "loss": 0.86750412, "num_input_tokens_seen": 164963570, "step": 7631, "time_per_iteration": 2.661820650100708 }, { "auxiliary_loss_clip": 0.01115838, "auxiliary_loss_mlp": 0.00772885, "balance_loss_clip": 1.0430181, "balance_loss_mlp": 1.00053763, "epoch": 0.9176937413575422, "flos": 14830497233280.0, "grad_norm": 2.605368436121334, "language_loss": 0.79253238, "learning_rate": 7.055460749357656e-08, "loss": 0.81141961, "num_input_tokens_seen": 164979850, "step": 7632, "time_per_iteration": 2.5694921016693115 }, { "auxiliary_loss_clip": 0.01106306, "auxiliary_loss_mlp": 0.01037185, "balance_loss_clip": 1.04070652, "balance_loss_mlp": 1.02137733, "epoch": 0.9178139842481813, "flos": 18474603828480.0, "grad_norm": 1.815787486285109, "language_loss": 0.70289284, "learning_rate": 7.034967542115521e-08, "loss": 0.7243278, "num_input_tokens_seen": 164998115, "step": 7633, "time_per_iteration": 2.569690227508545 }, { "auxiliary_loss_clip": 0.01109642, "auxiliary_loss_mlp": 0.00772133, "balance_loss_clip": 1.03902984, "balance_loss_mlp": 1.00047147, "epoch": 0.9179342271388204, "flos": 20047455544320.0, "grad_norm": 5.631584206154938, "language_loss": 0.75770438, "learning_rate": 7.014503607515388e-08, "loss": 0.77652222, "num_input_tokens_seen": 165017420, "step": 7634, "time_per_iteration": 2.5496134757995605 }, { "auxiliary_loss_clip": 0.01113371, "auxiliary_loss_mlp": 0.01039175, "balance_loss_clip": 1.04473281, "balance_loss_mlp": 1.02446485, "epoch": 0.9180544700294595, "flos": 24676232647680.0, "grad_norm": 2.0393232258978733, "language_loss": 0.68105185, "learning_rate": 6.994068948661592e-08, "loss": 0.70257729, "num_input_tokens_seen": 165035575, "step": 7635, "time_per_iteration": 2.6036229133605957 }, { "auxiliary_loss_clip": 0.01119739, "auxiliary_loss_mlp": 0.01045347, "balance_loss_clip": 1.03975725, "balance_loss_mlp": 1.02720356, "epoch": 0.9181747129200986, "flos": 16727478301440.0, "grad_norm": 2.1884862729064123, "language_loss": 0.76766682, "learning_rate": 6.973663568654142e-08, "loss": 0.78931773, "num_input_tokens_seen": 165053280, "step": 7636, "time_per_iteration": 2.533658742904663 }, { "auxiliary_loss_clip": 0.01131834, "auxiliary_loss_mlp": 0.01040129, "balance_loss_clip": 1.0438782, "balance_loss_mlp": 1.02459645, "epoch": 0.9182949558107377, "flos": 24271626873600.0, "grad_norm": 6.24958266665986, "language_loss": 0.65411085, "learning_rate": 6.953287470588386e-08, "loss": 0.67583048, "num_input_tokens_seen": 165071235, "step": 7637, "time_per_iteration": 2.525547504425049 }, { "auxiliary_loss_clip": 0.01122575, "auxiliary_loss_mlp": 0.01040545, "balance_loss_clip": 1.0411005, "balance_loss_mlp": 1.02474976, "epoch": 0.9184151987013768, "flos": 22082117443200.0, "grad_norm": 2.3376635658343736, "language_loss": 0.85987341, "learning_rate": 6.932940657555452e-08, "loss": 0.8815046, "num_input_tokens_seen": 165087365, "step": 7638, "time_per_iteration": 2.5544543266296387 }, { "auxiliary_loss_clip": 0.01123992, "auxiliary_loss_mlp": 0.01035484, "balance_loss_clip": 1.04135942, "balance_loss_mlp": 1.02101207, "epoch": 0.9185354415920158, "flos": 32166732257280.0, "grad_norm": 2.3118244345950485, "language_loss": 0.76484931, "learning_rate": 6.912623132641938e-08, "loss": 0.78644407, "num_input_tokens_seen": 165112455, "step": 7639, "time_per_iteration": 2.6458230018615723 }, { "auxiliary_loss_clip": 0.01108233, "auxiliary_loss_mlp": 0.01037371, "balance_loss_clip": 1.04072821, "balance_loss_mlp": 1.01907229, "epoch": 0.918655684482655, "flos": 20997831542400.0, "grad_norm": 2.051546699564596, "language_loss": 0.76660484, "learning_rate": 6.892334898929952e-08, "loss": 0.7880609, "num_input_tokens_seen": 165132700, "step": 7640, "time_per_iteration": 2.7058863639831543 }, { "auxiliary_loss_clip": 0.01115364, "auxiliary_loss_mlp": 0.01036285, "balance_loss_clip": 1.04169059, "balance_loss_mlp": 1.02094281, "epoch": 0.918775927373294, "flos": 15560704817280.0, "grad_norm": 1.9853604595056968, "language_loss": 0.84754616, "learning_rate": 6.872075959497236e-08, "loss": 0.86906266, "num_input_tokens_seen": 165151475, "step": 7641, "time_per_iteration": 2.532227039337158 }, { "auxiliary_loss_clip": 0.01120652, "auxiliary_loss_mlp": 0.01035066, "balance_loss_clip": 1.04102969, "balance_loss_mlp": 1.01943791, "epoch": 0.9188961702639331, "flos": 29934057657600.0, "grad_norm": 2.192394919670545, "language_loss": 0.8284173, "learning_rate": 6.85184631741702e-08, "loss": 0.84997451, "num_input_tokens_seen": 165172040, "step": 7642, "time_per_iteration": 2.6057727336883545 }, { "auxiliary_loss_clip": 0.01119094, "auxiliary_loss_mlp": 0.01033517, "balance_loss_clip": 1.04084086, "balance_loss_mlp": 1.01832962, "epoch": 0.9190164131545723, "flos": 20701244943360.0, "grad_norm": 2.375500470608307, "language_loss": 0.77319646, "learning_rate": 6.831645975758161e-08, "loss": 0.79472256, "num_input_tokens_seen": 165189980, "step": 7643, "time_per_iteration": 2.536284923553467 }, { "auxiliary_loss_clip": 0.01099548, "auxiliary_loss_mlp": 0.01042227, "balance_loss_clip": 1.03912568, "balance_loss_mlp": 1.02478623, "epoch": 0.9191366560452113, "flos": 25629912696960.0, "grad_norm": 2.4986382896040156, "language_loss": 0.67471385, "learning_rate": 6.811474937585026e-08, "loss": 0.69613159, "num_input_tokens_seen": 165209770, "step": 7644, "time_per_iteration": 2.6198456287384033 }, { "auxiliary_loss_clip": 0.01092179, "auxiliary_loss_mlp": 0.01034531, "balance_loss_clip": 1.04057097, "balance_loss_mlp": 1.01925993, "epoch": 0.9192568989358504, "flos": 21434325615360.0, "grad_norm": 3.2044782693564446, "language_loss": 0.790016, "learning_rate": 6.79133320595755e-08, "loss": 0.81128311, "num_input_tokens_seen": 165229690, "step": 7645, "time_per_iteration": 2.6127867698669434 }, { "auxiliary_loss_clip": 0.01107202, "auxiliary_loss_mlp": 0.01028613, "balance_loss_clip": 1.03982186, "balance_loss_mlp": 1.01350284, "epoch": 0.9193771418264896, "flos": 23185078416000.0, "grad_norm": 2.2399671052655132, "language_loss": 0.75360525, "learning_rate": 6.771220783931198e-08, "loss": 0.77496338, "num_input_tokens_seen": 165249850, "step": 7646, "time_per_iteration": 2.581983804702759 }, { "auxiliary_loss_clip": 0.00966601, "auxiliary_loss_mlp": 0.00755612, "balance_loss_clip": 1.01223397, "balance_loss_mlp": 1.00031114, "epoch": 0.9194973847171286, "flos": 70582963184640.0, "grad_norm": 0.8900538233203427, "language_loss": 0.64594924, "learning_rate": 6.751137674556994e-08, "loss": 0.66317135, "num_input_tokens_seen": 165310235, "step": 7647, "time_per_iteration": 4.5941972732543945 }, { "auxiliary_loss_clip": 0.01122131, "auxiliary_loss_mlp": 0.01037055, "balance_loss_clip": 1.04022336, "balance_loss_mlp": 1.02169466, "epoch": 0.9196176276077677, "flos": 14720682378240.0, "grad_norm": 2.3690141838311316, "language_loss": 0.77753961, "learning_rate": 6.731083880881572e-08, "loss": 0.79913145, "num_input_tokens_seen": 165326455, "step": 7648, "time_per_iteration": 4.063644647598267 }, { "auxiliary_loss_clip": 0.01108301, "auxiliary_loss_mlp": 0.01042799, "balance_loss_clip": 1.04244161, "balance_loss_mlp": 1.02821326, "epoch": 0.9197378704984068, "flos": 23294893271040.0, "grad_norm": 2.2352053232846782, "language_loss": 0.80674142, "learning_rate": 6.711059405947072e-08, "loss": 0.82825238, "num_input_tokens_seen": 165344645, "step": 7649, "time_per_iteration": 3.6012814044952393 }, { "auxiliary_loss_clip": 0.0109185, "auxiliary_loss_mlp": 0.01039856, "balance_loss_clip": 1.03829408, "balance_loss_mlp": 1.02514565, "epoch": 0.9198581133890459, "flos": 20302564913280.0, "grad_norm": 2.0140657524696417, "language_loss": 0.77136922, "learning_rate": 6.691064252791156e-08, "loss": 0.79268634, "num_input_tokens_seen": 165364120, "step": 7650, "time_per_iteration": 2.651825189590454 }, { "auxiliary_loss_clip": 0.01073988, "auxiliary_loss_mlp": 0.01027673, "balance_loss_clip": 1.03571904, "balance_loss_mlp": 1.01216364, "epoch": 0.9199783562796849, "flos": 17675663569920.0, "grad_norm": 2.531817341408685, "language_loss": 0.77998817, "learning_rate": 6.67109842444713e-08, "loss": 0.80100477, "num_input_tokens_seen": 165383050, "step": 7651, "time_per_iteration": 2.6896371841430664 }, { "auxiliary_loss_clip": 0.01114975, "auxiliary_loss_mlp": 0.00771881, "balance_loss_clip": 1.04202199, "balance_loss_mlp": 1.00046194, "epoch": 0.9200985991703241, "flos": 17676022705920.0, "grad_norm": 2.017665856286669, "language_loss": 0.76806438, "learning_rate": 6.651161923943704e-08, "loss": 0.78693295, "num_input_tokens_seen": 165400955, "step": 7652, "time_per_iteration": 3.4463582038879395 }, { "auxiliary_loss_clip": 0.01115611, "auxiliary_loss_mlp": 0.010396, "balance_loss_clip": 1.04026437, "balance_loss_mlp": 1.02375066, "epoch": 0.9202188420609632, "flos": 20996574566400.0, "grad_norm": 3.2762603798070082, "language_loss": 0.77008927, "learning_rate": 6.631254754305326e-08, "loss": 0.79164141, "num_input_tokens_seen": 165420415, "step": 7653, "time_per_iteration": 2.5919673442840576 }, { "auxiliary_loss_clip": 0.01132907, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.04269242, "balance_loss_mlp": 1.022228, "epoch": 0.9203390849516022, "flos": 13918222586880.0, "grad_norm": 2.928695991329291, "language_loss": 0.78131926, "learning_rate": 6.611376918551848e-08, "loss": 0.80302286, "num_input_tokens_seen": 165439200, "step": 7654, "time_per_iteration": 3.3852405548095703 }, { "auxiliary_loss_clip": 0.01088881, "auxiliary_loss_mlp": 0.00771983, "balance_loss_clip": 1.03660274, "balance_loss_mlp": 1.00057435, "epoch": 0.9204593278422414, "flos": 21175912195200.0, "grad_norm": 2.312705937008987, "language_loss": 0.79493725, "learning_rate": 6.591528419698744e-08, "loss": 0.81354588, "num_input_tokens_seen": 165458985, "step": 7655, "time_per_iteration": 2.604051113128662 }, { "auxiliary_loss_clip": 0.0110804, "auxiliary_loss_mlp": 0.01035092, "balance_loss_clip": 1.03929508, "balance_loss_mlp": 1.02077496, "epoch": 0.9205795707328804, "flos": 14501375890560.0, "grad_norm": 2.284559896528584, "language_loss": 0.83490801, "learning_rate": 6.571709260756986e-08, "loss": 0.85633934, "num_input_tokens_seen": 165475630, "step": 7656, "time_per_iteration": 2.574018716812134 }, { "auxiliary_loss_clip": 0.01124992, "auxiliary_loss_mlp": 0.01041367, "balance_loss_clip": 1.04601383, "balance_loss_mlp": 1.02562499, "epoch": 0.9206998136235195, "flos": 22417559579520.0, "grad_norm": 2.263105976191466, "language_loss": 0.77011853, "learning_rate": 6.551919444733122e-08, "loss": 0.79178214, "num_input_tokens_seen": 165493445, "step": 7657, "time_per_iteration": 2.702686071395874 }, { "auxiliary_loss_clip": 0.01104093, "auxiliary_loss_mlp": 0.01037538, "balance_loss_clip": 1.04045522, "balance_loss_mlp": 1.02148092, "epoch": 0.9208200565141585, "flos": 53358407544960.0, "grad_norm": 2.0722632504685294, "language_loss": 0.65393376, "learning_rate": 6.53215897462931e-08, "loss": 0.67535007, "num_input_tokens_seen": 165517200, "step": 7658, "time_per_iteration": 2.8725125789642334 }, { "auxiliary_loss_clip": 0.01117683, "auxiliary_loss_mlp": 0.01041751, "balance_loss_clip": 1.04136157, "balance_loss_mlp": 1.02602744, "epoch": 0.9209402994047977, "flos": 30589139946240.0, "grad_norm": 2.016198650119853, "language_loss": 0.75107557, "learning_rate": 6.512427853443103e-08, "loss": 0.77266991, "num_input_tokens_seen": 165539280, "step": 7659, "time_per_iteration": 2.6382384300231934 }, { "auxiliary_loss_clip": 0.01119149, "auxiliary_loss_mlp": 0.01035795, "balance_loss_clip": 1.04136848, "balance_loss_mlp": 1.0196898, "epoch": 0.9210605422954368, "flos": 29132711187840.0, "grad_norm": 2.006586929147814, "language_loss": 0.75524563, "learning_rate": 6.492726084167799e-08, "loss": 0.77679503, "num_input_tokens_seen": 165561395, "step": 7660, "time_per_iteration": 2.6426961421966553 }, { "auxiliary_loss_clip": 0.01034045, "auxiliary_loss_mlp": 0.01001734, "balance_loss_clip": 1.00623202, "balance_loss_mlp": 1.00014865, "epoch": 0.9211807851860758, "flos": 54853838472960.0, "grad_norm": 0.779294726775801, "language_loss": 0.57496601, "learning_rate": 6.473053669792072e-08, "loss": 0.5953238, "num_input_tokens_seen": 165616085, "step": 7661, "time_per_iteration": 2.9955005645751953 }, { "auxiliary_loss_clip": 0.01118972, "auxiliary_loss_mlp": 0.01035259, "balance_loss_clip": 1.04074168, "balance_loss_mlp": 1.0199945, "epoch": 0.921301028076715, "flos": 19201974238080.0, "grad_norm": 2.3670090154372327, "language_loss": 0.72871804, "learning_rate": 6.453410613300248e-08, "loss": 0.75026035, "num_input_tokens_seen": 165634015, "step": 7662, "time_per_iteration": 2.550071954727173 }, { "auxiliary_loss_clip": 0.01071658, "auxiliary_loss_mlp": 0.01038154, "balance_loss_clip": 1.03936028, "balance_loss_mlp": 1.02331281, "epoch": 0.921421270967354, "flos": 27526893765120.0, "grad_norm": 1.577310144173202, "language_loss": 0.58220261, "learning_rate": 6.43379691767214e-08, "loss": 0.60330075, "num_input_tokens_seen": 165653220, "step": 7663, "time_per_iteration": 2.7394936084747314 }, { "auxiliary_loss_clip": 0.00997595, "auxiliary_loss_mlp": 0.01000526, "balance_loss_clip": 1.00651503, "balance_loss_mlp": 0.99902976, "epoch": 0.9215415138579931, "flos": 70209311955840.0, "grad_norm": 0.7168733502509561, "language_loss": 0.55060577, "learning_rate": 6.414212585883105e-08, "loss": 0.57058692, "num_input_tokens_seen": 165715850, "step": 7664, "time_per_iteration": 3.2906477451324463 }, { "auxiliary_loss_clip": 0.01108524, "auxiliary_loss_mlp": 0.01035918, "balance_loss_clip": 1.03891265, "balance_loss_mlp": 1.02015901, "epoch": 0.9216617567486323, "flos": 35553107790720.0, "grad_norm": 4.634632108964287, "language_loss": 0.69790095, "learning_rate": 6.394657620904143e-08, "loss": 0.71934533, "num_input_tokens_seen": 165738960, "step": 7665, "time_per_iteration": 2.73087739944458 }, { "auxiliary_loss_clip": 0.01136518, "auxiliary_loss_mlp": 0.01034306, "balance_loss_clip": 1.04400826, "balance_loss_mlp": 1.01982188, "epoch": 0.9217819996392713, "flos": 29533330552320.0, "grad_norm": 1.7923776522784958, "language_loss": 0.71756339, "learning_rate": 6.375132025701657e-08, "loss": 0.7392717, "num_input_tokens_seen": 165761260, "step": 7666, "time_per_iteration": 2.579145669937134 }, { "auxiliary_loss_clip": 0.0113586, "auxiliary_loss_mlp": 0.01034584, "balance_loss_clip": 1.04513121, "balance_loss_mlp": 1.01784682, "epoch": 0.9219022425299104, "flos": 14574669592320.0, "grad_norm": 2.381305006723731, "language_loss": 0.69245583, "learning_rate": 6.355635803237724e-08, "loss": 0.71416026, "num_input_tokens_seen": 165776960, "step": 7667, "time_per_iteration": 2.492323160171509 }, { "auxiliary_loss_clip": 0.01120366, "auxiliary_loss_mlp": 0.01036714, "balance_loss_clip": 1.04154801, "balance_loss_mlp": 1.02018583, "epoch": 0.9220224854205495, "flos": 18077503996800.0, "grad_norm": 2.007677286647104, "language_loss": 0.79409301, "learning_rate": 6.336168956469867e-08, "loss": 0.81566381, "num_input_tokens_seen": 165795435, "step": 7668, "time_per_iteration": 2.5447187423706055 }, { "auxiliary_loss_clip": 0.01101016, "auxiliary_loss_mlp": 0.01035801, "balance_loss_clip": 1.04033375, "balance_loss_mlp": 1.02011299, "epoch": 0.9221427283111886, "flos": 24790464875520.0, "grad_norm": 2.0834887013292374, "language_loss": 0.71811724, "learning_rate": 6.316731488351168e-08, "loss": 0.73948538, "num_input_tokens_seen": 165816625, "step": 7669, "time_per_iteration": 2.5991458892822266 }, { "auxiliary_loss_clip": 0.0111753, "auxiliary_loss_mlp": 0.01036332, "balance_loss_clip": 1.04076767, "balance_loss_mlp": 1.02107286, "epoch": 0.9222629712018277, "flos": 13845036625920.0, "grad_norm": 1.8849334131715663, "language_loss": 0.63688385, "learning_rate": 6.297323401830334e-08, "loss": 0.65842247, "num_input_tokens_seen": 165835410, "step": 7670, "time_per_iteration": 2.532257080078125 }, { "auxiliary_loss_clip": 0.0112064, "auxiliary_loss_mlp": 0.01042322, "balance_loss_clip": 1.04092932, "balance_loss_mlp": 1.02693248, "epoch": 0.9223832140924668, "flos": 21616177196160.0, "grad_norm": 2.4421060183361285, "language_loss": 0.68802309, "learning_rate": 6.277944699851523e-08, "loss": 0.70965272, "num_input_tokens_seen": 165854930, "step": 7671, "time_per_iteration": 2.5612270832061768 }, { "auxiliary_loss_clip": 0.01132624, "auxiliary_loss_mlp": 0.0103843, "balance_loss_clip": 1.04089522, "balance_loss_mlp": 1.02206302, "epoch": 0.9225034569831059, "flos": 21142084561920.0, "grad_norm": 2.067759101436839, "language_loss": 0.73580551, "learning_rate": 6.25859538535447e-08, "loss": 0.75751603, "num_input_tokens_seen": 165875725, "step": 7672, "time_per_iteration": 2.5304672718048096 }, { "auxiliary_loss_clip": 0.01110434, "auxiliary_loss_mlp": 0.01033694, "balance_loss_clip": 1.04126406, "balance_loss_mlp": 1.01819718, "epoch": 0.9226236998737449, "flos": 12495046844160.0, "grad_norm": 2.6398019366226873, "language_loss": 0.77942669, "learning_rate": 6.239275461274474e-08, "loss": 0.80086792, "num_input_tokens_seen": 165892100, "step": 7673, "time_per_iteration": 3.5318803787231445 }, { "auxiliary_loss_clip": 0.01120142, "auxiliary_loss_mlp": 0.01043512, "balance_loss_clip": 1.04234672, "balance_loss_mlp": 1.0283246, "epoch": 0.9227439427643841, "flos": 26214071581440.0, "grad_norm": 1.6745647820559828, "language_loss": 0.85817236, "learning_rate": 6.219984930542299e-08, "loss": 0.8798089, "num_input_tokens_seen": 165912840, "step": 7674, "time_per_iteration": 2.5981087684631348 }, { "auxiliary_loss_clip": 0.01120033, "auxiliary_loss_mlp": 0.01035682, "balance_loss_clip": 1.0406189, "balance_loss_mlp": 1.01992261, "epoch": 0.9228641856550232, "flos": 17967581400960.0, "grad_norm": 2.0307099830176325, "language_loss": 0.75858748, "learning_rate": 6.200723796084383e-08, "loss": 0.78014457, "num_input_tokens_seen": 165930935, "step": 7675, "time_per_iteration": 3.297394275665283 }, { "auxiliary_loss_clip": 0.01012387, "auxiliary_loss_mlp": 0.01008497, "balance_loss_clip": 1.01100755, "balance_loss_mlp": 1.00611234, "epoch": 0.9229844285456622, "flos": 70420609710720.0, "grad_norm": 0.7644078419071234, "language_loss": 0.63008267, "learning_rate": 6.181492060822546e-08, "loss": 0.6502915, "num_input_tokens_seen": 165991110, "step": 7676, "time_per_iteration": 3.1658427715301514 }, { "auxiliary_loss_clip": 0.01079558, "auxiliary_loss_mlp": 0.01051575, "balance_loss_clip": 1.03842258, "balance_loss_mlp": 1.03453994, "epoch": 0.9231046714363014, "flos": 17967832796160.0, "grad_norm": 2.263935284147124, "language_loss": 0.81988442, "learning_rate": 6.162289727674274e-08, "loss": 0.8411957, "num_input_tokens_seen": 166008790, "step": 7677, "time_per_iteration": 2.6747381687164307 }, { "auxiliary_loss_clip": 0.01091797, "auxiliary_loss_mlp": 0.01031498, "balance_loss_clip": 1.03704417, "balance_loss_mlp": 1.01681697, "epoch": 0.9232249143269404, "flos": 17858233422720.0, "grad_norm": 3.4359043709324, "language_loss": 0.876423, "learning_rate": 6.143116799552527e-08, "loss": 0.89765596, "num_input_tokens_seen": 166025035, "step": 7678, "time_per_iteration": 3.5828616619110107 }, { "auxiliary_loss_clip": 0.01122755, "auxiliary_loss_mlp": 0.01037049, "balance_loss_clip": 1.04293621, "balance_loss_mlp": 1.02187324, "epoch": 0.9233451572175795, "flos": 23404384903680.0, "grad_norm": 2.6514072470903955, "language_loss": 0.5614472, "learning_rate": 6.123973279365802e-08, "loss": 0.58304518, "num_input_tokens_seen": 166044010, "step": 7679, "time_per_iteration": 3.485483169555664 }, { "auxiliary_loss_clip": 0.01124222, "auxiliary_loss_mlp": 0.01039018, "balance_loss_clip": 1.04506385, "balance_loss_mlp": 1.02303243, "epoch": 0.9234654001082186, "flos": 17999326045440.0, "grad_norm": 2.340198305125168, "language_loss": 0.77959353, "learning_rate": 6.10485917001824e-08, "loss": 0.80122602, "num_input_tokens_seen": 166061865, "step": 7680, "time_per_iteration": 2.553597927093506 }, { "auxiliary_loss_clip": 0.01106542, "auxiliary_loss_mlp": 0.010359, "balance_loss_clip": 1.03856385, "balance_loss_mlp": 1.02108812, "epoch": 0.9235856429988577, "flos": 24750747411840.0, "grad_norm": 1.644059734579622, "language_loss": 0.80757368, "learning_rate": 6.085774474409322e-08, "loss": 0.82899809, "num_input_tokens_seen": 166082425, "step": 7681, "time_per_iteration": 2.670522451400757 }, { "auxiliary_loss_clip": 0.01109614, "auxiliary_loss_mlp": 0.01047188, "balance_loss_clip": 1.04418457, "balance_loss_mlp": 1.03177404, "epoch": 0.9237058858894968, "flos": 14099894599680.0, "grad_norm": 2.0420980121584615, "language_loss": 0.70076114, "learning_rate": 6.066719195434267e-08, "loss": 0.72232914, "num_input_tokens_seen": 166100225, "step": 7682, "time_per_iteration": 2.5982494354248047 }, { "auxiliary_loss_clip": 0.01125866, "auxiliary_loss_mlp": 0.01045564, "balance_loss_clip": 1.04316235, "balance_loss_mlp": 1.02898169, "epoch": 0.9238261287801359, "flos": 28694529175680.0, "grad_norm": 2.292748117544631, "language_loss": 0.66545683, "learning_rate": 6.047693335983717e-08, "loss": 0.6871711, "num_input_tokens_seen": 166122570, "step": 7683, "time_per_iteration": 2.635314702987671 }, { "auxiliary_loss_clip": 0.01122619, "auxiliary_loss_mlp": 0.0103753, "balance_loss_clip": 1.0412575, "balance_loss_mlp": 1.02129376, "epoch": 0.923946371670775, "flos": 23111856541440.0, "grad_norm": 2.8966731618781774, "language_loss": 0.8254751, "learning_rate": 6.028696898943853e-08, "loss": 0.84707654, "num_input_tokens_seen": 166141630, "step": 7684, "time_per_iteration": 2.571524143218994 }, { "auxiliary_loss_clip": 0.0110894, "auxiliary_loss_mlp": 0.00772454, "balance_loss_clip": 1.03975797, "balance_loss_mlp": 1.00047374, "epoch": 0.924066614561414, "flos": 21867120587520.0, "grad_norm": 4.8530179966204825, "language_loss": 0.70827323, "learning_rate": 6.00972988719648e-08, "loss": 0.72708714, "num_input_tokens_seen": 166159865, "step": 7685, "time_per_iteration": 2.646796464920044 }, { "auxiliary_loss_clip": 0.01098417, "auxiliary_loss_mlp": 0.00772473, "balance_loss_clip": 1.03898561, "balance_loss_mlp": 1.00054383, "epoch": 0.9241868574520532, "flos": 28511887495680.0, "grad_norm": 2.2498828369399653, "language_loss": 0.70957547, "learning_rate": 5.990792303618807e-08, "loss": 0.72828436, "num_input_tokens_seen": 166179445, "step": 7686, "time_per_iteration": 2.667088747024536 }, { "auxiliary_loss_clip": 0.01090566, "auxiliary_loss_mlp": 0.01037102, "balance_loss_clip": 1.03982854, "balance_loss_mlp": 1.02198625, "epoch": 0.9243071003426923, "flos": 30518324282880.0, "grad_norm": 1.6759575220253082, "language_loss": 0.69745541, "learning_rate": 5.971884151083695e-08, "loss": 0.71873212, "num_input_tokens_seen": 166201855, "step": 7687, "time_per_iteration": 2.739445209503174 }, { "auxiliary_loss_clip": 0.011091, "auxiliary_loss_mlp": 0.01033494, "balance_loss_clip": 1.04056692, "balance_loss_mlp": 1.01869965, "epoch": 0.9244273432333313, "flos": 28658331244800.0, "grad_norm": 2.9585259074767087, "language_loss": 0.74421918, "learning_rate": 5.9530054324595124e-08, "loss": 0.76564515, "num_input_tokens_seen": 166221970, "step": 7688, "time_per_iteration": 2.663602113723755 }, { "auxiliary_loss_clip": 0.01021282, "auxiliary_loss_mlp": 0.00755292, "balance_loss_clip": 1.00544262, "balance_loss_mlp": 1.00020909, "epoch": 0.9245475861239704, "flos": 66230589237120.0, "grad_norm": 0.7467045688598678, "language_loss": 0.57481182, "learning_rate": 5.934156150610103e-08, "loss": 0.59257758, "num_input_tokens_seen": 166279335, "step": 7689, "time_per_iteration": 3.224452495574951 }, { "auxiliary_loss_clip": 0.01108698, "auxiliary_loss_mlp": 0.01036899, "balance_loss_clip": 1.04157662, "balance_loss_mlp": 1.02276087, "epoch": 0.9246678290146095, "flos": 24239918142720.0, "grad_norm": 2.1491815901890376, "language_loss": 0.79076451, "learning_rate": 5.915336308394914e-08, "loss": 0.81222045, "num_input_tokens_seen": 166298170, "step": 7690, "time_per_iteration": 2.6522634029388428 }, { "auxiliary_loss_clip": 0.0111242, "auxiliary_loss_mlp": 0.01035855, "balance_loss_clip": 1.03931522, "balance_loss_mlp": 1.02153778, "epoch": 0.9247880719052486, "flos": 18988808976000.0, "grad_norm": 1.8266636358821122, "language_loss": 0.77083063, "learning_rate": 5.89654590866886e-08, "loss": 0.79231334, "num_input_tokens_seen": 166317670, "step": 7691, "time_per_iteration": 2.5970005989074707 }, { "auxiliary_loss_clip": 0.01071279, "auxiliary_loss_mlp": 0.01044537, "balance_loss_clip": 1.03873897, "balance_loss_mlp": 1.02890921, "epoch": 0.9249083147958876, "flos": 24024095274240.0, "grad_norm": 2.0332027882937873, "language_loss": 0.88642669, "learning_rate": 5.877784954282483e-08, "loss": 0.90758491, "num_input_tokens_seen": 166337010, "step": 7692, "time_per_iteration": 2.827533721923828 }, { "auxiliary_loss_clip": 0.01125579, "auxiliary_loss_mlp": 0.01043434, "balance_loss_clip": 1.04357529, "balance_loss_mlp": 1.02687621, "epoch": 0.9250285576865268, "flos": 30773972355840.0, "grad_norm": 1.8789599612013508, "language_loss": 0.72465956, "learning_rate": 5.8590534480817963e-08, "loss": 0.74634969, "num_input_tokens_seen": 166358735, "step": 7693, "time_per_iteration": 2.94244647026062 }, { "auxiliary_loss_clip": 0.01131708, "auxiliary_loss_mlp": 0.01039973, "balance_loss_clip": 1.0438596, "balance_loss_mlp": 1.02461874, "epoch": 0.9251488005771659, "flos": 10633581348480.0, "grad_norm": 2.7443104578293975, "language_loss": 0.72689509, "learning_rate": 5.840351392908349e-08, "loss": 0.74861187, "num_input_tokens_seen": 166374455, "step": 7694, "time_per_iteration": 2.534994125366211 }, { "auxiliary_loss_clip": 0.01111658, "auxiliary_loss_mlp": 0.00771886, "balance_loss_clip": 1.03951442, "balance_loss_mlp": 1.00054145, "epoch": 0.9252690434678049, "flos": 23586416052480.0, "grad_norm": 3.723615296445046, "language_loss": 0.70558512, "learning_rate": 5.821678791599205e-08, "loss": 0.72442055, "num_input_tokens_seen": 166393900, "step": 7695, "time_per_iteration": 2.683648109436035 }, { "auxiliary_loss_clip": 0.01106644, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.04139829, "balance_loss_mlp": 1.02131176, "epoch": 0.9253892863584441, "flos": 21469158829440.0, "grad_norm": 1.8131421826505834, "language_loss": 0.81008911, "learning_rate": 5.803035646986965e-08, "loss": 0.83151305, "num_input_tokens_seen": 166413235, "step": 7696, "time_per_iteration": 2.678309202194214 }, { "auxiliary_loss_clip": 0.01135768, "auxiliary_loss_mlp": 0.01041996, "balance_loss_clip": 1.04323101, "balance_loss_mlp": 1.02616489, "epoch": 0.9255095292490831, "flos": 17456680304640.0, "grad_norm": 2.329183668342185, "language_loss": 0.67369837, "learning_rate": 5.7844219618998766e-08, "loss": 0.695476, "num_input_tokens_seen": 166427560, "step": 7697, "time_per_iteration": 2.612008810043335 }, { "auxiliary_loss_clip": 0.01080993, "auxiliary_loss_mlp": 0.01055147, "balance_loss_clip": 1.03401995, "balance_loss_mlp": 1.03755128, "epoch": 0.9256297721397222, "flos": 24750675584640.0, "grad_norm": 2.2372948276821396, "language_loss": 0.71349192, "learning_rate": 5.765837739161505e-08, "loss": 0.73485339, "num_input_tokens_seen": 166446680, "step": 7698, "time_per_iteration": 3.642641305923462 }, { "auxiliary_loss_clip": 0.01096162, "auxiliary_loss_mlp": 0.01037413, "balance_loss_clip": 1.038728, "balance_loss_mlp": 1.02180278, "epoch": 0.9257500150303614, "flos": 23112215677440.0, "grad_norm": 1.621902739338201, "language_loss": 0.74554342, "learning_rate": 5.7472829815911504e-08, "loss": 0.7668792, "num_input_tokens_seen": 166465505, "step": 7699, "time_per_iteration": 2.7229087352752686 }, { "auxiliary_loss_clip": 0.01103472, "auxiliary_loss_mlp": 0.01046374, "balance_loss_clip": 1.04032755, "balance_loss_mlp": 1.02913618, "epoch": 0.9258702579210004, "flos": 22564685687040.0, "grad_norm": 1.7210085913293671, "language_loss": 0.81583369, "learning_rate": 5.7287576920035164e-08, "loss": 0.83733213, "num_input_tokens_seen": 166484520, "step": 7700, "time_per_iteration": 2.6350317001342773 }, { "auxiliary_loss_clip": 0.0109143, "auxiliary_loss_mlp": 0.01042555, "balance_loss_clip": 1.03740418, "balance_loss_mlp": 1.02667642, "epoch": 0.9259905008116395, "flos": 30004298703360.0, "grad_norm": 1.776755851186421, "language_loss": 0.7695443, "learning_rate": 5.7102618732088435e-08, "loss": 0.79088414, "num_input_tokens_seen": 166503850, "step": 7701, "time_per_iteration": 3.6731410026550293 }, { "auxiliary_loss_clip": 0.01110015, "auxiliary_loss_mlp": 0.01038358, "balance_loss_clip": 1.04080033, "balance_loss_mlp": 1.02401686, "epoch": 0.9261107437022786, "flos": 24572128055040.0, "grad_norm": 1.7838935021967497, "language_loss": 0.74587142, "learning_rate": 5.6917955280130216e-08, "loss": 0.7673552, "num_input_tokens_seen": 166525330, "step": 7702, "time_per_iteration": 2.6798768043518066 }, { "auxiliary_loss_clip": 0.01114525, "auxiliary_loss_mlp": 0.01038305, "balance_loss_clip": 1.03897619, "balance_loss_mlp": 1.02274811, "epoch": 0.9262309865929177, "flos": 22018448586240.0, "grad_norm": 2.4580361251772604, "language_loss": 0.71746361, "learning_rate": 5.6733586592172755e-08, "loss": 0.73899198, "num_input_tokens_seen": 166544825, "step": 7703, "time_per_iteration": 2.603663206100464 }, { "auxiliary_loss_clip": 0.01100188, "auxiliary_loss_mlp": 0.00771125, "balance_loss_clip": 1.03717709, "balance_loss_mlp": 1.00050378, "epoch": 0.9263512294835567, "flos": 20339481116160.0, "grad_norm": 1.809412571483433, "language_loss": 0.80280745, "learning_rate": 5.6549512696185244e-08, "loss": 0.82152057, "num_input_tokens_seen": 166563325, "step": 7704, "time_per_iteration": 3.652066230773926 }, { "auxiliary_loss_clip": 0.01128765, "auxiliary_loss_mlp": 0.01043038, "balance_loss_clip": 1.04241109, "balance_loss_mlp": 1.02812517, "epoch": 0.9264714723741959, "flos": 21215378263680.0, "grad_norm": 1.6016095811621307, "language_loss": 0.68542969, "learning_rate": 5.636573362009156e-08, "loss": 0.70714772, "num_input_tokens_seen": 166583385, "step": 7705, "time_per_iteration": 3.559211254119873 }, { "auxiliary_loss_clip": 0.01133285, "auxiliary_loss_mlp": 0.01040774, "balance_loss_clip": 1.04076576, "balance_loss_mlp": 1.02353001, "epoch": 0.926591715264835, "flos": 18004964480640.0, "grad_norm": 2.252395238296381, "language_loss": 0.76743138, "learning_rate": 5.618224939177074e-08, "loss": 0.78917193, "num_input_tokens_seen": 166601290, "step": 7706, "time_per_iteration": 2.6128554344177246 }, { "auxiliary_loss_clip": 0.01095018, "auxiliary_loss_mlp": 0.0104213, "balance_loss_clip": 1.0370127, "balance_loss_mlp": 1.02639413, "epoch": 0.926711958155474, "flos": 36167969825280.0, "grad_norm": 1.867277977001426, "language_loss": 0.7046628, "learning_rate": 5.599906003905719e-08, "loss": 0.72603428, "num_input_tokens_seen": 166623835, "step": 7707, "time_per_iteration": 2.7494802474975586 }, { "auxiliary_loss_clip": 0.01114239, "auxiliary_loss_mlp": 0.01042847, "balance_loss_clip": 1.04391932, "balance_loss_mlp": 1.0246675, "epoch": 0.9268322010461132, "flos": 21032736583680.0, "grad_norm": 2.8086256138930406, "language_loss": 0.81760383, "learning_rate": 5.581616558974023e-08, "loss": 0.83917469, "num_input_tokens_seen": 166642400, "step": 7708, "time_per_iteration": 2.6078574657440186 }, { "auxiliary_loss_clip": 0.01124769, "auxiliary_loss_mlp": 0.00773039, "balance_loss_clip": 1.04131317, "balance_loss_mlp": 1.00061131, "epoch": 0.9269524439367522, "flos": 22964838174720.0, "grad_norm": 1.7349101554221087, "language_loss": 0.79418188, "learning_rate": 5.5633566071565444e-08, "loss": 0.81315994, "num_input_tokens_seen": 166661640, "step": 7709, "time_per_iteration": 2.6153011322021484 }, { "auxiliary_loss_clip": 0.01075662, "auxiliary_loss_mlp": 0.0103197, "balance_loss_clip": 1.03871489, "balance_loss_mlp": 1.01717615, "epoch": 0.9270726868273913, "flos": 41975551468800.0, "grad_norm": 2.0642017696043, "language_loss": 0.70873189, "learning_rate": 5.5451261512232896e-08, "loss": 0.72980821, "num_input_tokens_seen": 166684320, "step": 7710, "time_per_iteration": 2.874163866043091 }, { "auxiliary_loss_clip": 0.01125723, "auxiliary_loss_mlp": 0.01036764, "balance_loss_clip": 1.04202807, "balance_loss_mlp": 1.02119565, "epoch": 0.9271929297180305, "flos": 19791771557760.0, "grad_norm": 1.8801766887836076, "language_loss": 0.62493169, "learning_rate": 5.5269251939397576e-08, "loss": 0.64655662, "num_input_tokens_seen": 166703835, "step": 7711, "time_per_iteration": 2.641228675842285 }, { "auxiliary_loss_clip": 0.01091902, "auxiliary_loss_mlp": 0.01042472, "balance_loss_clip": 1.03483629, "balance_loss_mlp": 1.02560353, "epoch": 0.9273131726086695, "flos": 19968343839360.0, "grad_norm": 5.8625029004899805, "language_loss": 0.76334256, "learning_rate": 5.508753738067073e-08, "loss": 0.78468633, "num_input_tokens_seen": 166723375, "step": 7712, "time_per_iteration": 2.6589267253875732 }, { "auxiliary_loss_clip": 0.01121808, "auxiliary_loss_mlp": 0.01033906, "balance_loss_clip": 1.04077208, "balance_loss_mlp": 1.01850367, "epoch": 0.9274334154993086, "flos": 23258587599360.0, "grad_norm": 6.299727603258576, "language_loss": 0.79382026, "learning_rate": 5.4906117863617875e-08, "loss": 0.81537735, "num_input_tokens_seen": 166742760, "step": 7713, "time_per_iteration": 2.648252487182617 }, { "auxiliary_loss_clip": 0.01089029, "auxiliary_loss_mlp": 0.01036495, "balance_loss_clip": 1.03768754, "balance_loss_mlp": 1.0212599, "epoch": 0.9275536583899477, "flos": 31795343585280.0, "grad_norm": 5.132715030065583, "language_loss": 0.78219014, "learning_rate": 5.4724993415760533e-08, "loss": 0.8034454, "num_input_tokens_seen": 166761115, "step": 7714, "time_per_iteration": 2.826324462890625 }, { "auxiliary_loss_clip": 0.01102182, "auxiliary_loss_mlp": 0.00772319, "balance_loss_clip": 1.03863204, "balance_loss_mlp": 1.00056374, "epoch": 0.9276739012805868, "flos": 18696998885760.0, "grad_norm": 4.272295335898867, "language_loss": 0.74740958, "learning_rate": 5.454416406457496e-08, "loss": 0.76615453, "num_input_tokens_seen": 166780210, "step": 7715, "time_per_iteration": 2.8372626304626465 }, { "auxiliary_loss_clip": 0.01118064, "auxiliary_loss_mlp": 0.01032562, "balance_loss_clip": 1.04081273, "balance_loss_mlp": 1.0190022, "epoch": 0.9277941441712259, "flos": 13879079740800.0, "grad_norm": 9.914762458090156, "language_loss": 0.7364192, "learning_rate": 5.436362983749299e-08, "loss": 0.75792539, "num_input_tokens_seen": 166795380, "step": 7716, "time_per_iteration": 2.6410884857177734 }, { "auxiliary_loss_clip": 0.01088305, "auxiliary_loss_mlp": 0.01033142, "balance_loss_clip": 1.03906035, "balance_loss_mlp": 1.01858079, "epoch": 0.927914387061865, "flos": 23258659426560.0, "grad_norm": 2.19869466016322, "language_loss": 0.64352232, "learning_rate": 5.418339076190137e-08, "loss": 0.66473675, "num_input_tokens_seen": 166814890, "step": 7717, "time_per_iteration": 2.7407631874084473 }, { "auxiliary_loss_clip": 0.01100069, "auxiliary_loss_mlp": 0.01040484, "balance_loss_clip": 1.03993356, "balance_loss_mlp": 1.02350903, "epoch": 0.9280346299525041, "flos": 18073733068800.0, "grad_norm": 1.8283221590816232, "language_loss": 0.88769352, "learning_rate": 5.400344686514202e-08, "loss": 0.90909898, "num_input_tokens_seen": 166832475, "step": 7718, "time_per_iteration": 2.754718065261841 }, { "auxiliary_loss_clip": 0.01118166, "auxiliary_loss_mlp": 0.01033957, "balance_loss_clip": 1.04234648, "balance_loss_mlp": 1.01865017, "epoch": 0.9281548728431431, "flos": 22342901160960.0, "grad_norm": 2.036193939091202, "language_loss": 0.67029822, "learning_rate": 5.38237981745131e-08, "loss": 0.69181943, "num_input_tokens_seen": 166850590, "step": 7719, "time_per_iteration": 2.6301209926605225 }, { "auxiliary_loss_clip": 0.01120252, "auxiliary_loss_mlp": 0.00771725, "balance_loss_clip": 1.0417316, "balance_loss_mlp": 1.00052476, "epoch": 0.9282751157337822, "flos": 18843765857280.0, "grad_norm": 1.7116547911678215, "language_loss": 0.81478125, "learning_rate": 5.364444471726592e-08, "loss": 0.83370101, "num_input_tokens_seen": 166869795, "step": 7720, "time_per_iteration": 2.5603349208831787 }, { "auxiliary_loss_clip": 0.01117372, "auxiliary_loss_mlp": 0.01031269, "balance_loss_clip": 1.03918624, "balance_loss_mlp": 1.01583159, "epoch": 0.9283953586244214, "flos": 25556834476800.0, "grad_norm": 2.4358831191055375, "language_loss": 0.80123043, "learning_rate": 5.346538652060939e-08, "loss": 0.82271683, "num_input_tokens_seen": 166891150, "step": 7721, "time_per_iteration": 2.681347131729126 }, { "auxiliary_loss_clip": 0.01101478, "auxiliary_loss_mlp": 0.0103494, "balance_loss_clip": 1.04035091, "balance_loss_mlp": 1.02044988, "epoch": 0.9285156015150604, "flos": 18223480869120.0, "grad_norm": 2.458305968273703, "language_loss": 0.70217264, "learning_rate": 5.3286623611705994e-08, "loss": 0.72353685, "num_input_tokens_seen": 166909195, "step": 7722, "time_per_iteration": 2.6091806888580322 }, { "auxiliary_loss_clip": 0.01034156, "auxiliary_loss_mlp": 0.01001767, "balance_loss_clip": 1.00647199, "balance_loss_mlp": 1.00018108, "epoch": 0.9286358444056995, "flos": 66400017690240.0, "grad_norm": 0.8122147242832688, "language_loss": 0.60528553, "learning_rate": 5.3108156017673824e-08, "loss": 0.6256448, "num_input_tokens_seen": 166970955, "step": 7723, "time_per_iteration": 3.2158896923065186 }, { "auxiliary_loss_clip": 0.01112274, "auxiliary_loss_mlp": 0.0104539, "balance_loss_clip": 1.04005599, "balance_loss_mlp": 1.02874875, "epoch": 0.9287560872963386, "flos": 22345630594560.0, "grad_norm": 1.623230345913631, "language_loss": 0.71560663, "learning_rate": 5.2929983765586775e-08, "loss": 0.73718327, "num_input_tokens_seen": 166989735, "step": 7724, "time_per_iteration": 3.5882651805877686 }, { "auxiliary_loss_clip": 0.01132287, "auxiliary_loss_mlp": 0.01041027, "balance_loss_clip": 1.04331374, "balance_loss_mlp": 1.0246954, "epoch": 0.9288763301869777, "flos": 25700225569920.0, "grad_norm": 2.023946386582334, "language_loss": 0.62719965, "learning_rate": 5.275210688247278e-08, "loss": 0.64893281, "num_input_tokens_seen": 167010060, "step": 7725, "time_per_iteration": 2.682077169418335 }, { "auxiliary_loss_clip": 0.01077089, "auxiliary_loss_mlp": 0.01037379, "balance_loss_clip": 1.03761292, "balance_loss_mlp": 1.02333009, "epoch": 0.9289965730776167, "flos": 12312046028160.0, "grad_norm": 1.9437481847263334, "language_loss": 0.85470283, "learning_rate": 5.257452539531604e-08, "loss": 0.87584746, "num_input_tokens_seen": 167027130, "step": 7726, "time_per_iteration": 2.6709187030792236 }, { "auxiliary_loss_clip": 0.01118042, "auxiliary_loss_mlp": 0.01038273, "balance_loss_clip": 1.03930306, "balance_loss_mlp": 1.02297854, "epoch": 0.9291168159682559, "flos": 26685973486080.0, "grad_norm": 2.280624954124475, "language_loss": 0.68309915, "learning_rate": 5.2397239331055445e-08, "loss": 0.70466238, "num_input_tokens_seen": 167049130, "step": 7727, "time_per_iteration": 3.590982675552368 }, { "auxiliary_loss_clip": 0.01103598, "auxiliary_loss_mlp": 0.01038942, "balance_loss_clip": 1.04042494, "balance_loss_mlp": 1.02311683, "epoch": 0.929237058858895, "flos": 14538256179840.0, "grad_norm": 2.723611255946577, "language_loss": 0.8122946, "learning_rate": 5.2220248716585036e-08, "loss": 0.83371997, "num_input_tokens_seen": 167066810, "step": 7728, "time_per_iteration": 2.620022773742676 }, { "auxiliary_loss_clip": 0.01112823, "auxiliary_loss_mlp": 0.01039254, "balance_loss_clip": 1.0395509, "balance_loss_mlp": 1.02301788, "epoch": 0.929357301749534, "flos": 23835456023040.0, "grad_norm": 2.233718056602646, "language_loss": 0.75527602, "learning_rate": 5.204355357875445e-08, "loss": 0.77679682, "num_input_tokens_seen": 167085155, "step": 7729, "time_per_iteration": 2.620633125305176 }, { "auxiliary_loss_clip": 0.01105302, "auxiliary_loss_mlp": 0.01043419, "balance_loss_clip": 1.0390203, "balance_loss_mlp": 1.02450073, "epoch": 0.9294775446401732, "flos": 12969319046400.0, "grad_norm": 3.1025402446998758, "language_loss": 0.7082026, "learning_rate": 5.1867153944367584e-08, "loss": 0.72968984, "num_input_tokens_seen": 167101545, "step": 7730, "time_per_iteration": 3.5147311687469482 }, { "auxiliary_loss_clip": 0.01100177, "auxiliary_loss_mlp": 0.01041013, "balance_loss_clip": 1.04013133, "balance_loss_mlp": 1.02511096, "epoch": 0.9295977875308122, "flos": 26211809024640.0, "grad_norm": 1.708395820807, "language_loss": 0.73702013, "learning_rate": 5.16910498401848e-08, "loss": 0.75843209, "num_input_tokens_seen": 167120995, "step": 7731, "time_per_iteration": 3.7102153301239014 }, { "auxiliary_loss_clip": 0.01131086, "auxiliary_loss_mlp": 0.0104168, "balance_loss_clip": 1.04361272, "balance_loss_mlp": 1.02649295, "epoch": 0.9297180304214513, "flos": 16472297105280.0, "grad_norm": 2.919763933662627, "language_loss": 0.83561552, "learning_rate": 5.151524129292073e-08, "loss": 0.8573432, "num_input_tokens_seen": 167138890, "step": 7732, "time_per_iteration": 2.5372326374053955 }, { "auxiliary_loss_clip": 0.01119963, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.04145718, "balance_loss_mlp": 1.01897717, "epoch": 0.9298382733120905, "flos": 24060436859520.0, "grad_norm": 1.9776336459413368, "language_loss": 0.6685282, "learning_rate": 5.1339728329245155e-08, "loss": 0.69006801, "num_input_tokens_seen": 167159455, "step": 7733, "time_per_iteration": 2.605053424835205 }, { "auxiliary_loss_clip": 0.01140742, "auxiliary_loss_mlp": 0.01053288, "balance_loss_clip": 1.04641795, "balance_loss_mlp": 1.03582418, "epoch": 0.9299585162027295, "flos": 22127652910080.0, "grad_norm": 2.2940607519546576, "language_loss": 0.79489595, "learning_rate": 5.116451097578367e-08, "loss": 0.8168363, "num_input_tokens_seen": 167178495, "step": 7734, "time_per_iteration": 2.5878899097442627 }, { "auxiliary_loss_clip": 0.01092, "auxiliary_loss_mlp": 0.01039576, "balance_loss_clip": 1.03715861, "balance_loss_mlp": 1.02389967, "epoch": 0.9300787590933686, "flos": 21471780522240.0, "grad_norm": 1.7566300727722641, "language_loss": 0.74596655, "learning_rate": 5.0989589259115895e-08, "loss": 0.76728237, "num_input_tokens_seen": 167199380, "step": 7735, "time_per_iteration": 2.748227834701538 }, { "auxiliary_loss_clip": 0.01121077, "auxiliary_loss_mlp": 0.01047648, "balance_loss_clip": 1.04189301, "balance_loss_mlp": 1.03002918, "epoch": 0.9301990019840077, "flos": 17779588594560.0, "grad_norm": 2.2771497164831787, "language_loss": 0.7138924, "learning_rate": 5.081496320577816e-08, "loss": 0.73557961, "num_input_tokens_seen": 167216500, "step": 7736, "time_per_iteration": 2.61633038520813 }, { "auxiliary_loss_clip": 0.01015183, "auxiliary_loss_mlp": 0.01002157, "balance_loss_clip": 1.01078606, "balance_loss_mlp": 1.00046396, "epoch": 0.9303192448746468, "flos": 58896122307840.0, "grad_norm": 0.9123659007341022, "language_loss": 0.61205089, "learning_rate": 5.0640632842260835e-08, "loss": 0.63222432, "num_input_tokens_seen": 167276760, "step": 7737, "time_per_iteration": 3.2802083492279053 }, { "auxiliary_loss_clip": 0.01091121, "auxiliary_loss_mlp": 0.00771929, "balance_loss_clip": 1.03787708, "balance_loss_mlp": 1.00045347, "epoch": 0.9304394877652858, "flos": 57663522172800.0, "grad_norm": 2.668767585222153, "language_loss": 0.72624087, "learning_rate": 5.0466598195009426e-08, "loss": 0.74487138, "num_input_tokens_seen": 167303630, "step": 7738, "time_per_iteration": 3.0026094913482666 }, { "auxiliary_loss_clip": 0.01091205, "auxiliary_loss_mlp": 0.01034414, "balance_loss_clip": 1.03876948, "balance_loss_mlp": 1.02032328, "epoch": 0.930559730655925, "flos": 20996143603200.0, "grad_norm": 2.519314441061351, "language_loss": 0.70283616, "learning_rate": 5.0292859290425036e-08, "loss": 0.7240923, "num_input_tokens_seen": 167321500, "step": 7739, "time_per_iteration": 2.6452414989471436 }, { "auxiliary_loss_clip": 0.01128723, "auxiliary_loss_mlp": 0.01034947, "balance_loss_clip": 1.04283071, "balance_loss_mlp": 1.02014077, "epoch": 0.9306799735465641, "flos": 23258264376960.0, "grad_norm": 2.506487968161249, "language_loss": 0.78087974, "learning_rate": 5.011941615486348e-08, "loss": 0.8025164, "num_input_tokens_seen": 167340615, "step": 7740, "time_per_iteration": 2.551326274871826 }, { "auxiliary_loss_clip": 0.01129659, "auxiliary_loss_mlp": 0.01035024, "balance_loss_clip": 1.04167891, "balance_loss_mlp": 1.02059972, "epoch": 0.9308002164372031, "flos": 15231547560960.0, "grad_norm": 2.0905524565419613, "language_loss": 0.8482399, "learning_rate": 4.994626881463659e-08, "loss": 0.86988676, "num_input_tokens_seen": 167356870, "step": 7741, "time_per_iteration": 2.5465331077575684 }, { "auxiliary_loss_clip": 0.01070939, "auxiliary_loss_mlp": 0.01038338, "balance_loss_clip": 1.03619063, "balance_loss_mlp": 1.0234313, "epoch": 0.9309204593278423, "flos": 30847481539200.0, "grad_norm": 1.7052981143695627, "language_loss": 0.7091403, "learning_rate": 4.9773417296009814e-08, "loss": 0.73023307, "num_input_tokens_seen": 167378390, "step": 7742, "time_per_iteration": 2.7308907508850098 }, { "auxiliary_loss_clip": 0.0112359, "auxiliary_loss_mlp": 0.01039035, "balance_loss_clip": 1.0418973, "balance_loss_mlp": 1.02167773, "epoch": 0.9310407022184813, "flos": 23037269950080.0, "grad_norm": 2.4934122421871994, "language_loss": 0.65549982, "learning_rate": 4.960086162520527e-08, "loss": 0.67712605, "num_input_tokens_seen": 167398480, "step": 7743, "time_per_iteration": 2.663390874862671 }, { "auxiliary_loss_clip": 0.01086089, "auxiliary_loss_mlp": 0.01041584, "balance_loss_clip": 1.0380969, "balance_loss_mlp": 1.02627134, "epoch": 0.9311609451091204, "flos": 22127976132480.0, "grad_norm": 2.2152804776830934, "language_loss": 0.82660151, "learning_rate": 4.942860182839936e-08, "loss": 0.84787828, "num_input_tokens_seen": 167416825, "step": 7744, "time_per_iteration": 2.689169406890869 }, { "auxiliary_loss_clip": 0.0110927, "auxiliary_loss_mlp": 0.0104606, "balance_loss_clip": 1.04127097, "balance_loss_mlp": 1.03034222, "epoch": 0.9312811879997596, "flos": 21099206701440.0, "grad_norm": 1.7889820882543028, "language_loss": 0.79712784, "learning_rate": 4.925663793172341e-08, "loss": 0.81868106, "num_input_tokens_seen": 167434785, "step": 7745, "time_per_iteration": 2.6117756366729736 }, { "auxiliary_loss_clip": 0.01016464, "auxiliary_loss_mlp": 0.00755607, "balance_loss_clip": 1.00817752, "balance_loss_mlp": 1.00021482, "epoch": 0.9314014308903986, "flos": 67148179096320.0, "grad_norm": 0.7867307472959242, "language_loss": 0.56488758, "learning_rate": 4.908496996126477e-08, "loss": 0.58260828, "num_input_tokens_seen": 167498245, "step": 7746, "time_per_iteration": 3.284797191619873 }, { "auxiliary_loss_clip": 0.01117129, "auxiliary_loss_mlp": 0.01040815, "balance_loss_clip": 1.04409266, "balance_loss_mlp": 1.02530622, "epoch": 0.9315216737810377, "flos": 22565583527040.0, "grad_norm": 2.0332198455149544, "language_loss": 0.76600981, "learning_rate": 4.89135979430646e-08, "loss": 0.78758931, "num_input_tokens_seen": 167518290, "step": 7747, "time_per_iteration": 2.629307985305786 }, { "auxiliary_loss_clip": 0.0113079, "auxiliary_loss_mlp": 0.01041499, "balance_loss_clip": 1.04242301, "balance_loss_mlp": 1.02594829, "epoch": 0.9316419166716768, "flos": 23984054588160.0, "grad_norm": 1.6925990499514525, "language_loss": 0.85495979, "learning_rate": 4.874252190312078e-08, "loss": 0.87668264, "num_input_tokens_seen": 167538675, "step": 7748, "time_per_iteration": 2.5968210697174072 }, { "auxiliary_loss_clip": 0.01123807, "auxiliary_loss_mlp": 0.01042119, "balance_loss_clip": 1.04247761, "balance_loss_mlp": 1.02575171, "epoch": 0.9317621595623159, "flos": 30230464688640.0, "grad_norm": 1.6140753713556466, "language_loss": 0.6490522, "learning_rate": 4.857174186738477e-08, "loss": 0.67071146, "num_input_tokens_seen": 167562025, "step": 7749, "time_per_iteration": 2.6832098960876465 }, { "auxiliary_loss_clip": 0.01135784, "auxiliary_loss_mlp": 0.01045885, "balance_loss_clip": 1.04511261, "balance_loss_mlp": 1.03059006, "epoch": 0.931882402452955, "flos": 15742735966080.0, "grad_norm": 2.283914420812945, "language_loss": 0.73332751, "learning_rate": 4.840125786176408e-08, "loss": 0.75514424, "num_input_tokens_seen": 167578230, "step": 7750, "time_per_iteration": 3.469165563583374 }, { "auxiliary_loss_clip": 0.01102841, "auxiliary_loss_mlp": 0.01040003, "balance_loss_clip": 1.0404253, "balance_loss_mlp": 1.02500653, "epoch": 0.932002645343594, "flos": 28366521154560.0, "grad_norm": 1.9211124011393947, "language_loss": 0.77334368, "learning_rate": 4.823106991212067e-08, "loss": 0.79477209, "num_input_tokens_seen": 167597470, "step": 7751, "time_per_iteration": 2.681335926055908 }, { "auxiliary_loss_clip": 0.01122359, "auxiliary_loss_mlp": 0.01031139, "balance_loss_clip": 1.04286528, "balance_loss_mlp": 1.01615405, "epoch": 0.9321228882342332, "flos": 15341146934400.0, "grad_norm": 4.640907344953197, "language_loss": 0.83246785, "learning_rate": 4.806117804427212e-08, "loss": 0.85400283, "num_input_tokens_seen": 167615405, "step": 7752, "time_per_iteration": 2.5450901985168457 }, { "auxiliary_loss_clip": 0.01115636, "auxiliary_loss_mlp": 0.01057218, "balance_loss_clip": 1.04024196, "balance_loss_mlp": 1.03967011, "epoch": 0.9322431311248722, "flos": 17895365107200.0, "grad_norm": 1.9480619073053973, "language_loss": 0.63945353, "learning_rate": 4.7891582283990926e-08, "loss": 0.66118205, "num_input_tokens_seen": 167634130, "step": 7753, "time_per_iteration": 3.4563302993774414 }, { "auxiliary_loss_clip": 0.01094077, "auxiliary_loss_mlp": 0.01035317, "balance_loss_clip": 1.03861427, "balance_loss_mlp": 1.02051699, "epoch": 0.9323633740155113, "flos": 24169713010560.0, "grad_norm": 1.6443971779499005, "language_loss": 0.73014289, "learning_rate": 4.772228265700473e-08, "loss": 0.75143683, "num_input_tokens_seen": 167654990, "step": 7754, "time_per_iteration": 2.6913931369781494 }, { "auxiliary_loss_clip": 0.01121999, "auxiliary_loss_mlp": 0.01037499, "balance_loss_clip": 1.04208601, "balance_loss_mlp": 1.02187049, "epoch": 0.9324836169061504, "flos": 15043482927360.0, "grad_norm": 2.3085693437962242, "language_loss": 0.75866264, "learning_rate": 4.75532791889961e-08, "loss": 0.78025758, "num_input_tokens_seen": 167671690, "step": 7755, "time_per_iteration": 2.5426344871520996 }, { "auxiliary_loss_clip": 0.01117772, "auxiliary_loss_mlp": 0.01042802, "balance_loss_clip": 1.03964806, "balance_loss_mlp": 1.02786493, "epoch": 0.9326038597967895, "flos": 18624890332800.0, "grad_norm": 1.7714077580823733, "language_loss": 0.65716547, "learning_rate": 4.738457190560252e-08, "loss": 0.6787712, "num_input_tokens_seen": 167690800, "step": 7756, "time_per_iteration": 3.576225996017456 }, { "auxiliary_loss_clip": 0.01080859, "auxiliary_loss_mlp": 0.01044366, "balance_loss_clip": 1.03888488, "balance_loss_mlp": 1.02911949, "epoch": 0.9327241026874286, "flos": 18952646958720.0, "grad_norm": 3.3256986636437382, "language_loss": 0.78928846, "learning_rate": 4.721616083241664e-08, "loss": 0.81054068, "num_input_tokens_seen": 167709055, "step": 7757, "time_per_iteration": 2.644023895263672 }, { "auxiliary_loss_clip": 0.01115001, "auxiliary_loss_mlp": 0.01039139, "balance_loss_clip": 1.04160023, "balance_loss_mlp": 1.02187753, "epoch": 0.9328443455780677, "flos": 29570282668800.0, "grad_norm": 2.981750256426982, "language_loss": 0.77811575, "learning_rate": 4.7048045994986684e-08, "loss": 0.79965711, "num_input_tokens_seen": 167729915, "step": 7758, "time_per_iteration": 3.572547674179077 }, { "auxiliary_loss_clip": 0.01124854, "auxiliary_loss_mlp": 0.01045467, "balance_loss_clip": 1.04438972, "balance_loss_mlp": 1.02993417, "epoch": 0.9329645884687068, "flos": 30081722469120.0, "grad_norm": 2.1692403898750645, "language_loss": 0.91100371, "learning_rate": 4.688022741881559e-08, "loss": 0.93270689, "num_input_tokens_seen": 167750440, "step": 7759, "time_per_iteration": 2.6434872150421143 }, { "auxiliary_loss_clip": 0.01115715, "auxiliary_loss_mlp": 0.01033335, "balance_loss_clip": 1.04084969, "balance_loss_mlp": 1.01815915, "epoch": 0.9330848313593458, "flos": 21867982513920.0, "grad_norm": 2.475200198993296, "language_loss": 0.75023043, "learning_rate": 4.671270512936076e-08, "loss": 0.77172089, "num_input_tokens_seen": 167769600, "step": 7760, "time_per_iteration": 2.58634352684021 }, { "auxiliary_loss_clip": 0.01086941, "auxiliary_loss_mlp": 0.01033409, "balance_loss_clip": 1.03710544, "balance_loss_mlp": 1.01922941, "epoch": 0.933205074249985, "flos": 22127221946880.0, "grad_norm": 1.8344310734379397, "language_loss": 0.83015567, "learning_rate": 4.6545479152035884e-08, "loss": 0.85135913, "num_input_tokens_seen": 167788770, "step": 7761, "time_per_iteration": 2.6515140533447266 }, { "auxiliary_loss_clip": 0.01119257, "auxiliary_loss_mlp": 0.01039101, "balance_loss_clip": 1.04152679, "balance_loss_mlp": 1.0249033, "epoch": 0.9333253171406241, "flos": 15341254675200.0, "grad_norm": 2.5129193774718064, "language_loss": 0.76249248, "learning_rate": 4.637854951220821e-08, "loss": 0.78407609, "num_input_tokens_seen": 167805555, "step": 7762, "time_per_iteration": 2.5325682163238525 }, { "auxiliary_loss_clip": 0.01086719, "auxiliary_loss_mlp": 0.01035915, "balance_loss_clip": 1.03477216, "balance_loss_mlp": 1.02067423, "epoch": 0.9334455600312631, "flos": 15706142985600.0, "grad_norm": 2.051581562593121, "language_loss": 0.74706376, "learning_rate": 4.621191623520171e-08, "loss": 0.76829016, "num_input_tokens_seen": 167823985, "step": 7763, "time_per_iteration": 2.563926935195923 }, { "auxiliary_loss_clip": 0.01082173, "auxiliary_loss_mlp": 0.01036178, "balance_loss_clip": 1.04011726, "balance_loss_mlp": 1.02136064, "epoch": 0.9335658029219023, "flos": 22163563532160.0, "grad_norm": 2.5401111990431304, "language_loss": 0.8456651, "learning_rate": 4.604557934629372e-08, "loss": 0.86684859, "num_input_tokens_seen": 167843060, "step": 7764, "time_per_iteration": 2.7351503372192383 }, { "auxiliary_loss_clip": 0.01102313, "auxiliary_loss_mlp": 0.01044436, "balance_loss_clip": 1.03982949, "balance_loss_mlp": 1.02881408, "epoch": 0.9336860458125413, "flos": 20266833859200.0, "grad_norm": 1.7344459079873085, "language_loss": 0.80624402, "learning_rate": 4.587953887071805e-08, "loss": 0.82771146, "num_input_tokens_seen": 167862880, "step": 7765, "time_per_iteration": 2.6383626461029053 }, { "auxiliary_loss_clip": 0.01101577, "auxiliary_loss_mlp": 0.01036914, "balance_loss_clip": 1.03698289, "balance_loss_mlp": 1.02195311, "epoch": 0.9338062887031804, "flos": 20919689504640.0, "grad_norm": 2.0690606279630077, "language_loss": 0.85710615, "learning_rate": 4.5713794833662554e-08, "loss": 0.87849104, "num_input_tokens_seen": 167882095, "step": 7766, "time_per_iteration": 2.6360464096069336 }, { "auxiliary_loss_clip": 0.0113562, "auxiliary_loss_mlp": 0.01036042, "balance_loss_clip": 1.04459643, "balance_loss_mlp": 1.01954305, "epoch": 0.9339265315938196, "flos": 23221635482880.0, "grad_norm": 1.6336675995413712, "language_loss": 0.63250536, "learning_rate": 4.5548347260270236e-08, "loss": 0.65422195, "num_input_tokens_seen": 167901385, "step": 7767, "time_per_iteration": 2.585683822631836 }, { "auxiliary_loss_clip": 0.01086559, "auxiliary_loss_mlp": 0.01033827, "balance_loss_clip": 1.03648686, "balance_loss_mlp": 1.0188539, "epoch": 0.9340467744844586, "flos": 22820261932800.0, "grad_norm": 1.694682347860546, "language_loss": 0.69244897, "learning_rate": 4.538319617564012e-08, "loss": 0.71365285, "num_input_tokens_seen": 167920405, "step": 7768, "time_per_iteration": 2.666774034500122 }, { "auxiliary_loss_clip": 0.01107241, "auxiliary_loss_mlp": 0.01040968, "balance_loss_clip": 1.03906441, "balance_loss_mlp": 1.02499366, "epoch": 0.9341670173750977, "flos": 23660428026240.0, "grad_norm": 1.9714214927763023, "language_loss": 0.74839973, "learning_rate": 4.521834160482485e-08, "loss": 0.76988184, "num_input_tokens_seen": 167939145, "step": 7769, "time_per_iteration": 2.6919362545013428 }, { "auxiliary_loss_clip": 0.01124612, "auxiliary_loss_mlp": 0.01031407, "balance_loss_clip": 1.04583478, "balance_loss_mlp": 1.01694107, "epoch": 0.9342872602657368, "flos": 24824256595200.0, "grad_norm": 2.886796840581371, "language_loss": 0.82142091, "learning_rate": 4.5053783572832846e-08, "loss": 0.8429811, "num_input_tokens_seen": 167959325, "step": 7770, "time_per_iteration": 2.655081033706665 }, { "auxiliary_loss_clip": 0.0112015, "auxiliary_loss_mlp": 0.01051561, "balance_loss_clip": 1.04190755, "balance_loss_mlp": 1.0359211, "epoch": 0.9344075031563759, "flos": 25771831332480.0, "grad_norm": 1.753937417430389, "language_loss": 0.7604233, "learning_rate": 4.488952210462771e-08, "loss": 0.78214037, "num_input_tokens_seen": 167979530, "step": 7771, "time_per_iteration": 2.6348111629486084 }, { "auxiliary_loss_clip": 0.01131322, "auxiliary_loss_mlp": 0.01038461, "balance_loss_clip": 1.04310811, "balance_loss_mlp": 1.02371478, "epoch": 0.9345277460470149, "flos": 25551303782400.0, "grad_norm": 2.206018903287478, "language_loss": 0.86021996, "learning_rate": 4.4725557225127495e-08, "loss": 0.88191772, "num_input_tokens_seen": 167997870, "step": 7772, "time_per_iteration": 2.60090708732605 }, { "auxiliary_loss_clip": 0.01124054, "auxiliary_loss_mlp": 0.01032852, "balance_loss_clip": 1.04707384, "balance_loss_mlp": 1.01843989, "epoch": 0.9346479889376541, "flos": 34313112432000.0, "grad_norm": 1.7836559765255093, "language_loss": 0.79544222, "learning_rate": 4.456188895920565e-08, "loss": 0.81701124, "num_input_tokens_seen": 168019625, "step": 7773, "time_per_iteration": 2.6794986724853516 }, { "auxiliary_loss_clip": 0.01131259, "auxiliary_loss_mlp": 0.010374, "balance_loss_clip": 1.04240108, "balance_loss_mlp": 1.02074623, "epoch": 0.9347682318282932, "flos": 19093739581440.0, "grad_norm": 2.1894158736731084, "language_loss": 0.85249174, "learning_rate": 4.439851733169031e-08, "loss": 0.87417829, "num_input_tokens_seen": 168037415, "step": 7774, "time_per_iteration": 2.5354912281036377 }, { "auxiliary_loss_clip": 0.01096799, "auxiliary_loss_mlp": 0.01035862, "balance_loss_clip": 1.03987932, "balance_loss_mlp": 1.02086568, "epoch": 0.9348884747189322, "flos": 26249587153920.0, "grad_norm": 2.237663804694614, "language_loss": 0.69105327, "learning_rate": 4.4235442367365204e-08, "loss": 0.71237987, "num_input_tokens_seen": 168057725, "step": 7775, "time_per_iteration": 2.725287914276123 }, { "auxiliary_loss_clip": 0.01102054, "auxiliary_loss_mlp": 0.0104304, "balance_loss_clip": 1.03762603, "balance_loss_mlp": 1.02782845, "epoch": 0.9350087176095714, "flos": 18333080242560.0, "grad_norm": 1.985905538443413, "language_loss": 0.79480767, "learning_rate": 4.4072664090968545e-08, "loss": 0.81625861, "num_input_tokens_seen": 168076110, "step": 7776, "time_per_iteration": 3.4994845390319824 }, { "auxiliary_loss_clip": 0.0110901, "auxiliary_loss_mlp": 0.01038191, "balance_loss_clip": 1.04022121, "balance_loss_mlp": 1.02274179, "epoch": 0.9351289605002104, "flos": 19318253541120.0, "grad_norm": 2.0160202615164273, "language_loss": 0.84522796, "learning_rate": 4.391018252719347e-08, "loss": 0.86669993, "num_input_tokens_seen": 168095905, "step": 7777, "time_per_iteration": 2.7119882106781006 }, { "auxiliary_loss_clip": 0.01109239, "auxiliary_loss_mlp": 0.01041085, "balance_loss_clip": 1.03993845, "balance_loss_mlp": 1.0248723, "epoch": 0.9352492033908495, "flos": 18799990156800.0, "grad_norm": 2.131901654078361, "language_loss": 0.69190401, "learning_rate": 4.374799770068849e-08, "loss": 0.71340722, "num_input_tokens_seen": 168112580, "step": 7778, "time_per_iteration": 3.4106979370117188 }, { "auxiliary_loss_clip": 0.0111904, "auxiliary_loss_mlp": 0.01038094, "balance_loss_clip": 1.04179275, "balance_loss_mlp": 1.02282333, "epoch": 0.9353694462814887, "flos": 29530134241920.0, "grad_norm": 2.401207359392618, "language_loss": 0.74532962, "learning_rate": 4.358610963605658e-08, "loss": 0.76690102, "num_input_tokens_seen": 168133030, "step": 7779, "time_per_iteration": 2.6235482692718506 }, { "auxiliary_loss_clip": 0.01136083, "auxiliary_loss_mlp": 0.01049643, "balance_loss_clip": 1.04384589, "balance_loss_mlp": 1.03381753, "epoch": 0.9354896891721277, "flos": 30665450390400.0, "grad_norm": 2.0338395948049115, "language_loss": 0.68480903, "learning_rate": 4.342451835785677e-08, "loss": 0.70666629, "num_input_tokens_seen": 168153940, "step": 7780, "time_per_iteration": 2.604187250137329 }, { "auxiliary_loss_clip": 0.01101592, "auxiliary_loss_mlp": 0.01033578, "balance_loss_clip": 1.03695714, "balance_loss_mlp": 1.01827145, "epoch": 0.9356099320627668, "flos": 19463907191040.0, "grad_norm": 1.6460711804394754, "language_loss": 0.75131935, "learning_rate": 4.3263223890601665e-08, "loss": 0.77267104, "num_input_tokens_seen": 168172650, "step": 7781, "time_per_iteration": 2.5897436141967773 }, { "auxiliary_loss_clip": 0.01115452, "auxiliary_loss_mlp": 0.00771302, "balance_loss_clip": 1.04363811, "balance_loss_mlp": 1.00047851, "epoch": 0.9357301749534058, "flos": 19098156954240.0, "grad_norm": 2.501054156225377, "language_loss": 0.79435873, "learning_rate": 4.31022262587597e-08, "loss": 0.81322622, "num_input_tokens_seen": 168191325, "step": 7782, "time_per_iteration": 3.602820873260498 }, { "auxiliary_loss_clip": 0.01123879, "auxiliary_loss_mlp": 0.01036675, "balance_loss_clip": 1.04456019, "balance_loss_mlp": 1.02027154, "epoch": 0.935850417844045, "flos": 23550361776000.0, "grad_norm": 2.0147941798454982, "language_loss": 0.65923834, "learning_rate": 4.2941525486754225e-08, "loss": 0.68084383, "num_input_tokens_seen": 168211645, "step": 7783, "time_per_iteration": 3.452695608139038 }, { "auxiliary_loss_clip": 0.01086933, "auxiliary_loss_mlp": 0.0103434, "balance_loss_clip": 1.0386281, "balance_loss_mlp": 1.02028513, "epoch": 0.935970660734684, "flos": 18588333265920.0, "grad_norm": 1.860953123704173, "language_loss": 0.79602027, "learning_rate": 4.278112159896286e-08, "loss": 0.81723303, "num_input_tokens_seen": 168229485, "step": 7784, "time_per_iteration": 2.630155324935913 }, { "auxiliary_loss_clip": 0.01095834, "auxiliary_loss_mlp": 0.01037235, "balance_loss_clip": 1.03476548, "balance_loss_mlp": 1.02222645, "epoch": 0.9360909036253231, "flos": 20631255292800.0, "grad_norm": 1.9043215601039474, "language_loss": 0.67891884, "learning_rate": 4.2621014619719896e-08, "loss": 0.70024949, "num_input_tokens_seen": 168247250, "step": 7785, "time_per_iteration": 2.5821380615234375 }, { "auxiliary_loss_clip": 0.01017486, "auxiliary_loss_mlp": 0.01000252, "balance_loss_clip": 1.00841796, "balance_loss_mlp": 0.99876231, "epoch": 0.9362111465159623, "flos": 61791421052160.0, "grad_norm": 0.720646664093661, "language_loss": 0.58621788, "learning_rate": 4.246120457331215e-08, "loss": 0.60639524, "num_input_tokens_seen": 168309425, "step": 7786, "time_per_iteration": 3.216625452041626 }, { "auxiliary_loss_clip": 0.01100765, "auxiliary_loss_mlp": 0.01046667, "balance_loss_clip": 1.04122734, "balance_loss_mlp": 1.03002596, "epoch": 0.9363313894066013, "flos": 24170395368960.0, "grad_norm": 2.1310692615964753, "language_loss": 0.72350025, "learning_rate": 4.2301691483983325e-08, "loss": 0.74497455, "num_input_tokens_seen": 168329545, "step": 7787, "time_per_iteration": 2.6348001956939697 }, { "auxiliary_loss_clip": 0.01121572, "auxiliary_loss_mlp": 0.01034833, "balance_loss_clip": 1.04097509, "balance_loss_mlp": 1.0200386, "epoch": 0.9364516322972404, "flos": 20120354196480.0, "grad_norm": 2.502994564265273, "language_loss": 0.75953329, "learning_rate": 4.214247537593163e-08, "loss": 0.78109735, "num_input_tokens_seen": 168348795, "step": 7788, "time_per_iteration": 2.567028045654297 }, { "auxiliary_loss_clip": 0.01106553, "auxiliary_loss_mlp": 0.01039349, "balance_loss_clip": 1.03958905, "balance_loss_mlp": 1.02366138, "epoch": 0.9365718751878795, "flos": 20703758895360.0, "grad_norm": 3.4528415322833896, "language_loss": 0.80541122, "learning_rate": 4.1983556273309293e-08, "loss": 0.8268702, "num_input_tokens_seen": 168367545, "step": 7789, "time_per_iteration": 2.6236555576324463 }, { "auxiliary_loss_clip": 0.01138911, "auxiliary_loss_mlp": 0.01044438, "balance_loss_clip": 1.04523873, "balance_loss_mlp": 1.02869058, "epoch": 0.9366921180785186, "flos": 18655270260480.0, "grad_norm": 2.4104707107621963, "language_loss": 0.69165784, "learning_rate": 4.182493420022526e-08, "loss": 0.71349132, "num_input_tokens_seen": 168383215, "step": 7790, "time_per_iteration": 2.5130364894866943 }, { "auxiliary_loss_clip": 0.01089221, "auxiliary_loss_mlp": 0.01032188, "balance_loss_clip": 1.03737879, "balance_loss_mlp": 1.01785934, "epoch": 0.9368123609691577, "flos": 25774955815680.0, "grad_norm": 1.8544197598544088, "language_loss": 0.78595817, "learning_rate": 4.166660918074139e-08, "loss": 0.8071723, "num_input_tokens_seen": 168403120, "step": 7791, "time_per_iteration": 2.6734936237335205 }, { "auxiliary_loss_clip": 0.01090387, "auxiliary_loss_mlp": 0.01035056, "balance_loss_clip": 1.03917766, "balance_loss_mlp": 1.01964211, "epoch": 0.9369326038597968, "flos": 25553386771200.0, "grad_norm": 1.5022278844512111, "language_loss": 0.73608667, "learning_rate": 4.15085812388758e-08, "loss": 0.75734115, "num_input_tokens_seen": 168425340, "step": 7792, "time_per_iteration": 2.685014009475708 }, { "auxiliary_loss_clip": 0.01106082, "auxiliary_loss_mlp": 0.01037412, "balance_loss_clip": 1.04050803, "balance_loss_mlp": 1.02136683, "epoch": 0.9370528467504359, "flos": 23220019370880.0, "grad_norm": 1.8852299301830304, "language_loss": 0.78502351, "learning_rate": 4.135085039860153e-08, "loss": 0.80645841, "num_input_tokens_seen": 168444740, "step": 7793, "time_per_iteration": 2.59375262260437 }, { "auxiliary_loss_clip": 0.01103301, "auxiliary_loss_mlp": 0.01040992, "balance_loss_clip": 1.0403862, "balance_loss_mlp": 1.02430296, "epoch": 0.9371730896410749, "flos": 24967468120320.0, "grad_norm": 2.1214537455925515, "language_loss": 0.78608549, "learning_rate": 4.1193416683845906e-08, "loss": 0.80752838, "num_input_tokens_seen": 168463670, "step": 7794, "time_per_iteration": 2.623032569885254 }, { "auxiliary_loss_clip": 0.0109253, "auxiliary_loss_mlp": 0.01033406, "balance_loss_clip": 1.03939748, "balance_loss_mlp": 1.01988792, "epoch": 0.9372933325317141, "flos": 15553091134080.0, "grad_norm": 2.7214514417032123, "language_loss": 0.83484054, "learning_rate": 4.103628011849136e-08, "loss": 0.85609984, "num_input_tokens_seen": 168479030, "step": 7795, "time_per_iteration": 2.59783673286438 }, { "auxiliary_loss_clip": 0.01106555, "auxiliary_loss_mlp": 0.01038885, "balance_loss_clip": 1.04105461, "balance_loss_mlp": 1.02297115, "epoch": 0.9374135754223532, "flos": 21871861182720.0, "grad_norm": 2.5023430844212675, "language_loss": 0.75856435, "learning_rate": 4.0879440726375506e-08, "loss": 0.78001881, "num_input_tokens_seen": 168496815, "step": 7796, "time_per_iteration": 2.5763795375823975 }, { "auxiliary_loss_clip": 0.01105721, "auxiliary_loss_mlp": 0.01034625, "balance_loss_clip": 1.03927112, "balance_loss_mlp": 1.01722026, "epoch": 0.9375338183129922, "flos": 22631048064000.0, "grad_norm": 2.6269558203417804, "language_loss": 0.55916852, "learning_rate": 4.0722898531291074e-08, "loss": 0.58057201, "num_input_tokens_seen": 168514055, "step": 7797, "time_per_iteration": 2.6080756187438965 }, { "auxiliary_loss_clip": 0.011158, "auxiliary_loss_mlp": 0.01041861, "balance_loss_clip": 1.04330587, "balance_loss_mlp": 1.02666211, "epoch": 0.9376540612036314, "flos": 26104292640000.0, "grad_norm": 1.7649302056443856, "language_loss": 0.76912457, "learning_rate": 4.0566653556985295e-08, "loss": 0.79070121, "num_input_tokens_seen": 168534600, "step": 7798, "time_per_iteration": 2.6569011211395264 }, { "auxiliary_loss_clip": 0.0106334, "auxiliary_loss_mlp": 0.01034774, "balance_loss_clip": 1.03619576, "balance_loss_mlp": 1.01773906, "epoch": 0.9377743040942704, "flos": 19717580016000.0, "grad_norm": 2.234245229198971, "language_loss": 0.81532598, "learning_rate": 4.0410705827159886e-08, "loss": 0.83630705, "num_input_tokens_seen": 168551895, "step": 7799, "time_per_iteration": 2.7733795642852783 }, { "auxiliary_loss_clip": 0.01104026, "auxiliary_loss_mlp": 0.01040887, "balance_loss_clip": 1.03975534, "balance_loss_mlp": 1.02550948, "epoch": 0.9378945469849095, "flos": 15267530010240.0, "grad_norm": 2.3184076274510854, "language_loss": 0.71438706, "learning_rate": 4.0255055365472356e-08, "loss": 0.73583621, "num_input_tokens_seen": 168569990, "step": 7800, "time_per_iteration": 2.8025941848754883 }, { "auxiliary_loss_clip": 0.01067054, "auxiliary_loss_mlp": 0.01040844, "balance_loss_clip": 1.03277433, "balance_loss_mlp": 1.02571607, "epoch": 0.9380147898755486, "flos": 20591394174720.0, "grad_norm": 2.181379827418888, "language_loss": 0.75061238, "learning_rate": 4.009970219553471e-08, "loss": 0.77169132, "num_input_tokens_seen": 168586940, "step": 7801, "time_per_iteration": 2.716731071472168 }, { "auxiliary_loss_clip": 0.01125045, "auxiliary_loss_mlp": 0.01038873, "balance_loss_clip": 1.04144621, "balance_loss_mlp": 1.02292275, "epoch": 0.9381350327661877, "flos": 26281116316800.0, "grad_norm": 3.562564612940431, "language_loss": 0.76973104, "learning_rate": 3.99446463409141e-08, "loss": 0.79137027, "num_input_tokens_seen": 168604795, "step": 7802, "time_per_iteration": 3.497694969177246 }, { "auxiliary_loss_clip": 0.01125439, "auxiliary_loss_mlp": 0.01038048, "balance_loss_clip": 1.04122078, "balance_loss_mlp": 1.02107275, "epoch": 0.9382552756568268, "flos": 23586344225280.0, "grad_norm": 2.074774402513637, "language_loss": 0.6893456, "learning_rate": 3.978988782513215e-08, "loss": 0.71098042, "num_input_tokens_seen": 168622290, "step": 7803, "time_per_iteration": 2.7167747020721436 }, { "auxiliary_loss_clip": 0.01125502, "auxiliary_loss_mlp": 0.01036545, "balance_loss_clip": 1.04357648, "balance_loss_mlp": 1.02076197, "epoch": 0.9383755185474659, "flos": 28438809275520.0, "grad_norm": 1.7756711901521278, "language_loss": 0.76304048, "learning_rate": 3.963542667166586e-08, "loss": 0.784661, "num_input_tokens_seen": 168642395, "step": 7804, "time_per_iteration": 3.5914554595947266 }, { "auxiliary_loss_clip": 0.01093164, "auxiliary_loss_mlp": 0.0103504, "balance_loss_clip": 1.03956091, "balance_loss_mlp": 1.01985288, "epoch": 0.938495761438105, "flos": 20449583280000.0, "grad_norm": 2.9015330848630634, "language_loss": 0.68171751, "learning_rate": 3.9481262903946486e-08, "loss": 0.70299959, "num_input_tokens_seen": 168661840, "step": 7805, "time_per_iteration": 2.639127492904663 }, { "auxiliary_loss_clip": 0.0100008, "auxiliary_loss_mlp": 0.01007991, "balance_loss_clip": 1.00892067, "balance_loss_mlp": 1.00559521, "epoch": 0.938616004328744, "flos": 69302711658240.0, "grad_norm": 0.7821690795478928, "language_loss": 0.54455853, "learning_rate": 3.932739654536066e-08, "loss": 0.56463927, "num_input_tokens_seen": 168724540, "step": 7806, "time_per_iteration": 3.2498860359191895 }, { "auxiliary_loss_clip": 0.01116662, "auxiliary_loss_mlp": 0.01037422, "balance_loss_clip": 1.04080033, "balance_loss_mlp": 1.02343309, "epoch": 0.9387362472193832, "flos": 18911636605440.0, "grad_norm": 3.7945461175401283, "language_loss": 0.74111104, "learning_rate": 3.917382761925014e-08, "loss": 0.76265192, "num_input_tokens_seen": 168740375, "step": 7807, "time_per_iteration": 2.57177734375 }, { "auxiliary_loss_clip": 0.0111665, "auxiliary_loss_mlp": 0.01035129, "balance_loss_clip": 1.04111016, "balance_loss_mlp": 1.02096128, "epoch": 0.9388564901100223, "flos": 26501967089280.0, "grad_norm": 1.6981216820448104, "language_loss": 0.79316199, "learning_rate": 3.9020556148910754e-08, "loss": 0.8146798, "num_input_tokens_seen": 168759730, "step": 7808, "time_per_iteration": 3.626173496246338 }, { "auxiliary_loss_clip": 0.01017568, "auxiliary_loss_mlp": 0.01003386, "balance_loss_clip": 1.00808573, "balance_loss_mlp": 1.00164568, "epoch": 0.9389767330006613, "flos": 58941083157120.0, "grad_norm": 0.7053104264717684, "language_loss": 0.56592822, "learning_rate": 3.8867582157593895e-08, "loss": 0.58613777, "num_input_tokens_seen": 168813935, "step": 7809, "time_per_iteration": 4.148309230804443 }, { "auxiliary_loss_clip": 0.01119381, "auxiliary_loss_mlp": 0.01042312, "balance_loss_clip": 1.04332566, "balance_loss_mlp": 1.02737522, "epoch": 0.9390969758913005, "flos": 31102554994560.0, "grad_norm": 1.8740544667465355, "language_loss": 0.76251847, "learning_rate": 3.871490566850544e-08, "loss": 0.7841354, "num_input_tokens_seen": 168838145, "step": 7810, "time_per_iteration": 2.6526012420654297 }, { "auxiliary_loss_clip": 0.01103171, "auxiliary_loss_mlp": 0.01039492, "balance_loss_clip": 1.0390209, "balance_loss_mlp": 1.023417, "epoch": 0.9392172187819395, "flos": 22419391173120.0, "grad_norm": 1.7232999299175755, "language_loss": 0.70539737, "learning_rate": 3.856252670480642e-08, "loss": 0.72682393, "num_input_tokens_seen": 168856805, "step": 7811, "time_per_iteration": 2.603651285171509 }, { "auxiliary_loss_clip": 0.01106492, "auxiliary_loss_mlp": 0.01046429, "balance_loss_clip": 1.04014277, "balance_loss_mlp": 1.02985895, "epoch": 0.9393374616725786, "flos": 19719483436800.0, "grad_norm": 1.788100137434685, "language_loss": 0.81118095, "learning_rate": 3.841044528961279e-08, "loss": 0.83271021, "num_input_tokens_seen": 168874600, "step": 7812, "time_per_iteration": 2.597054958343506 }, { "auxiliary_loss_clip": 0.01131875, "auxiliary_loss_mlp": 0.01037372, "balance_loss_clip": 1.0412488, "balance_loss_mlp": 1.02070642, "epoch": 0.9394577045632178, "flos": 24170215800960.0, "grad_norm": 2.084665189923938, "language_loss": 0.78669548, "learning_rate": 3.825866144599477e-08, "loss": 0.80838794, "num_input_tokens_seen": 168893655, "step": 7813, "time_per_iteration": 2.5442352294921875 }, { "auxiliary_loss_clip": 0.01107306, "auxiliary_loss_mlp": 0.01042576, "balance_loss_clip": 1.03958046, "balance_loss_mlp": 1.02709103, "epoch": 0.9395779474538568, "flos": 19023929498880.0, "grad_norm": 1.9751196865486222, "language_loss": 0.75799936, "learning_rate": 3.8107175196978145e-08, "loss": 0.77949816, "num_input_tokens_seen": 168909960, "step": 7814, "time_per_iteration": 2.5227203369140625 }, { "auxiliary_loss_clip": 0.01089549, "auxiliary_loss_mlp": 0.01035686, "balance_loss_clip": 1.03912914, "balance_loss_mlp": 1.02097583, "epoch": 0.9396981903444959, "flos": 14319129260160.0, "grad_norm": 5.688696907064441, "language_loss": 0.76331794, "learning_rate": 3.7955986565542996e-08, "loss": 0.78457034, "num_input_tokens_seen": 168928040, "step": 7815, "time_per_iteration": 2.5927069187164307 }, { "auxiliary_loss_clip": 0.01093892, "auxiliary_loss_mlp": 0.0103713, "balance_loss_clip": 1.03784919, "balance_loss_mlp": 1.0217582, "epoch": 0.9398184332351349, "flos": 34787564202240.0, "grad_norm": 2.555248792708676, "language_loss": 0.68124592, "learning_rate": 3.780509557462497e-08, "loss": 0.70255613, "num_input_tokens_seen": 168948240, "step": 7816, "time_per_iteration": 2.761021614074707 }, { "auxiliary_loss_clip": 0.01104043, "auxiliary_loss_mlp": 0.01040291, "balance_loss_clip": 1.03815281, "balance_loss_mlp": 1.02387583, "epoch": 0.9399386761257741, "flos": 25372253462400.0, "grad_norm": 1.866712516180499, "language_loss": 0.75753945, "learning_rate": 3.765450224711375e-08, "loss": 0.77898276, "num_input_tokens_seen": 168968745, "step": 7817, "time_per_iteration": 2.627181053161621 }, { "auxiliary_loss_clip": 0.01100446, "auxiliary_loss_mlp": 0.01034712, "balance_loss_clip": 1.0390389, "balance_loss_mlp": 1.01939368, "epoch": 0.9400589190164131, "flos": 27304965584640.0, "grad_norm": 2.1036352894045427, "language_loss": 0.79853588, "learning_rate": 3.750420660585396e-08, "loss": 0.81988752, "num_input_tokens_seen": 168990685, "step": 7818, "time_per_iteration": 2.6678082942962646 }, { "auxiliary_loss_clip": 0.01131231, "auxiliary_loss_mlp": 0.01046708, "balance_loss_clip": 1.04255176, "balance_loss_mlp": 1.03111577, "epoch": 0.9401791619070522, "flos": 23399859790080.0, "grad_norm": 1.8521495167181752, "language_loss": 0.79843283, "learning_rate": 3.735420867364603e-08, "loss": 0.82021213, "num_input_tokens_seen": 169011665, "step": 7819, "time_per_iteration": 2.5660929679870605 }, { "auxiliary_loss_clip": 0.01059523, "auxiliary_loss_mlp": 0.01043728, "balance_loss_clip": 1.03379643, "balance_loss_mlp": 1.0276227, "epoch": 0.9402994047976914, "flos": 35881403120640.0, "grad_norm": 1.8041236955571172, "language_loss": 0.61529899, "learning_rate": 3.7204508473244186e-08, "loss": 0.6363315, "num_input_tokens_seen": 169035290, "step": 7820, "time_per_iteration": 2.834975242614746 }, { "auxiliary_loss_clip": 0.01053269, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.03380847, "balance_loss_mlp": 1.02002168, "epoch": 0.9404196476883304, "flos": 22236821320320.0, "grad_norm": 2.0370997731166405, "language_loss": 0.69376349, "learning_rate": 3.7055106027357395e-08, "loss": 0.71464407, "num_input_tokens_seen": 169055155, "step": 7821, "time_per_iteration": 2.778611898422241 }, { "auxiliary_loss_clip": 0.01115441, "auxiliary_loss_mlp": 0.01040573, "balance_loss_clip": 1.04080582, "balance_loss_mlp": 1.02424169, "epoch": 0.9405398905789695, "flos": 18915802583040.0, "grad_norm": 2.1055195096508035, "language_loss": 0.71835905, "learning_rate": 3.690600135865063e-08, "loss": 0.73991919, "num_input_tokens_seen": 169072080, "step": 7822, "time_per_iteration": 2.53450870513916 }, { "auxiliary_loss_clip": 0.00999635, "auxiliary_loss_mlp": 0.01003461, "balance_loss_clip": 1.00882518, "balance_loss_mlp": 1.00200105, "epoch": 0.9406601334696086, "flos": 70274130048000.0, "grad_norm": 0.7840889962080955, "language_loss": 0.58039451, "learning_rate": 3.675719448974246e-08, "loss": 0.60042548, "num_input_tokens_seen": 169137170, "step": 7823, "time_per_iteration": 3.3166911602020264 }, { "auxiliary_loss_clip": 0.01078036, "auxiliary_loss_mlp": 0.00771999, "balance_loss_clip": 1.03928328, "balance_loss_mlp": 1.00051022, "epoch": 0.9407803763602477, "flos": 22165071903360.0, "grad_norm": 1.8788716254534017, "language_loss": 0.60001117, "learning_rate": 3.6608685443207054e-08, "loss": 0.6185115, "num_input_tokens_seen": 169156320, "step": 7824, "time_per_iteration": 2.698448419570923 }, { "auxiliary_loss_clip": 0.01094116, "auxiliary_loss_mlp": 0.01041258, "balance_loss_clip": 1.03988528, "balance_loss_mlp": 1.02490234, "epoch": 0.9409006192508867, "flos": 18879496911360.0, "grad_norm": 2.525701996614479, "language_loss": 0.66881502, "learning_rate": 3.646047424157306e-08, "loss": 0.69016874, "num_input_tokens_seen": 169173295, "step": 7825, "time_per_iteration": 2.6367034912109375 }, { "auxiliary_loss_clip": 0.01106091, "auxiliary_loss_mlp": 0.01040969, "balance_loss_clip": 1.03958011, "balance_loss_mlp": 1.02352881, "epoch": 0.9410208621415259, "flos": 23368258800000.0, "grad_norm": 2.4950450122047902, "language_loss": 0.68850362, "learning_rate": 3.631256090732382e-08, "loss": 0.70997423, "num_input_tokens_seen": 169193755, "step": 7826, "time_per_iteration": 2.6175217628479004 }, { "auxiliary_loss_clip": 0.01094317, "auxiliary_loss_mlp": 0.01027349, "balance_loss_clip": 1.04224277, "balance_loss_mlp": 1.01352119, "epoch": 0.941141105032165, "flos": 22742227635840.0, "grad_norm": 2.0740906957821292, "language_loss": 0.8281827, "learning_rate": 3.6164945462897833e-08, "loss": 0.84939933, "num_input_tokens_seen": 169213045, "step": 7827, "time_per_iteration": 2.6624367237091064 }, { "auxiliary_loss_clip": 0.01118353, "auxiliary_loss_mlp": 0.00771173, "balance_loss_clip": 1.04250348, "balance_loss_mlp": 1.00050235, "epoch": 0.941261347922804, "flos": 20704908130560.0, "grad_norm": 1.7503564538995067, "language_loss": 0.75727439, "learning_rate": 3.6017627930687856e-08, "loss": 0.77616966, "num_input_tokens_seen": 169232870, "step": 7828, "time_per_iteration": 3.5583178997039795 }, { "auxiliary_loss_clip": 0.01077476, "auxiliary_loss_mlp": 0.01045383, "balance_loss_clip": 1.03413773, "balance_loss_mlp": 1.0285027, "epoch": 0.9413815908134432, "flos": 19421998997760.0, "grad_norm": 2.557273030791945, "language_loss": 0.77259445, "learning_rate": 3.587060833304267e-08, "loss": 0.793823, "num_input_tokens_seen": 169251060, "step": 7829, "time_per_iteration": 2.645986318588257 }, { "auxiliary_loss_clip": 0.01121383, "auxiliary_loss_mlp": 0.01037696, "balance_loss_clip": 1.04044497, "balance_loss_mlp": 1.02221107, "epoch": 0.9415018337040822, "flos": 17493452853120.0, "grad_norm": 2.091110033412988, "language_loss": 0.64118499, "learning_rate": 3.5723886692264225e-08, "loss": 0.66277581, "num_input_tokens_seen": 169268600, "step": 7830, "time_per_iteration": 3.4655044078826904 }, { "auxiliary_loss_clip": 0.0110001, "auxiliary_loss_mlp": 0.01036401, "balance_loss_clip": 1.03565693, "balance_loss_mlp": 1.02208376, "epoch": 0.9416220765947213, "flos": 31831613343360.0, "grad_norm": 4.238112741068993, "language_loss": 0.61690664, "learning_rate": 3.557746303061071e-08, "loss": 0.63827074, "num_input_tokens_seen": 169290355, "step": 7831, "time_per_iteration": 2.692826986312866 }, { "auxiliary_loss_clip": 0.01105639, "auxiliary_loss_mlp": 0.01031659, "balance_loss_clip": 1.03973603, "balance_loss_mlp": 1.01661432, "epoch": 0.9417423194853605, "flos": 23511973115520.0, "grad_norm": 1.8965106563366931, "language_loss": 0.72446334, "learning_rate": 3.543133737029391e-08, "loss": 0.74583626, "num_input_tokens_seen": 169310865, "step": 7832, "time_per_iteration": 2.6399104595184326 }, { "auxiliary_loss_clip": 0.01121291, "auxiliary_loss_mlp": 0.01041309, "balance_loss_clip": 1.03974271, "balance_loss_mlp": 1.02485847, "epoch": 0.9418625623759995, "flos": 23915106432000.0, "grad_norm": 2.3189301780592397, "language_loss": 0.69325447, "learning_rate": 3.5285509733481214e-08, "loss": 0.71488047, "num_input_tokens_seen": 169330590, "step": 7833, "time_per_iteration": 3.647958278656006 }, { "auxiliary_loss_clip": 0.01115491, "auxiliary_loss_mlp": 0.01053152, "balance_loss_clip": 1.03931546, "balance_loss_mlp": 1.03510344, "epoch": 0.9419828052666386, "flos": 18076965292800.0, "grad_norm": 1.8723917183832883, "language_loss": 0.76432693, "learning_rate": 3.513998014229469e-08, "loss": 0.78601336, "num_input_tokens_seen": 169349540, "step": 7834, "time_per_iteration": 2.573639154434204 }, { "auxiliary_loss_clip": 0.01105659, "auxiliary_loss_mlp": 0.01037325, "balance_loss_clip": 1.03978252, "balance_loss_mlp": 1.02400875, "epoch": 0.9421030481572777, "flos": 17712328377600.0, "grad_norm": 2.2210641531897695, "language_loss": 0.86114466, "learning_rate": 3.499474861881069e-08, "loss": 0.88257456, "num_input_tokens_seen": 169366765, "step": 7835, "time_per_iteration": 3.473971366882324 }, { "auxiliary_loss_clip": 0.01072444, "auxiliary_loss_mlp": 0.01042742, "balance_loss_clip": 1.03832996, "balance_loss_mlp": 1.02682722, "epoch": 0.9422232910479168, "flos": 20194114775040.0, "grad_norm": 2.530367012762834, "language_loss": 0.68230474, "learning_rate": 3.4849815185061136e-08, "loss": 0.70345664, "num_input_tokens_seen": 169386655, "step": 7836, "time_per_iteration": 2.7020628452301025 }, { "auxiliary_loss_clip": 0.0111808, "auxiliary_loss_mlp": 0.01036584, "balance_loss_clip": 1.04095626, "balance_loss_mlp": 1.02162302, "epoch": 0.9423435339385559, "flos": 18442571875200.0, "grad_norm": 2.152247597984999, "language_loss": 0.76303339, "learning_rate": 3.470517986303223e-08, "loss": 0.78457999, "num_input_tokens_seen": 169405640, "step": 7837, "time_per_iteration": 2.568153142929077 }, { "auxiliary_loss_clip": 0.01093426, "auxiliary_loss_mlp": 0.01042463, "balance_loss_clip": 1.03970718, "balance_loss_mlp": 1.02757359, "epoch": 0.942463776829195, "flos": 20080636732800.0, "grad_norm": 1.9212980919927718, "language_loss": 0.79315609, "learning_rate": 3.4560842674664856e-08, "loss": 0.81451499, "num_input_tokens_seen": 169424155, "step": 7838, "time_per_iteration": 2.587451219558716 }, { "auxiliary_loss_clip": 0.01120517, "auxiliary_loss_mlp": 0.01038193, "balance_loss_clip": 1.04107368, "balance_loss_mlp": 1.0232619, "epoch": 0.9425840197198341, "flos": 22636255536000.0, "grad_norm": 2.1186987155714356, "language_loss": 0.75492913, "learning_rate": 3.441680364185506e-08, "loss": 0.7765162, "num_input_tokens_seen": 169444025, "step": 7839, "time_per_iteration": 2.6090543270111084 }, { "auxiliary_loss_clip": 0.01108622, "auxiliary_loss_mlp": 0.0103618, "balance_loss_clip": 1.04150724, "balance_loss_mlp": 1.0202899, "epoch": 0.9427042626104731, "flos": 19937892084480.0, "grad_norm": 2.625793162873439, "language_loss": 0.74501866, "learning_rate": 3.427306278645314e-08, "loss": 0.76646674, "num_input_tokens_seen": 169462480, "step": 7840, "time_per_iteration": 2.611586809158325 }, { "auxiliary_loss_clip": 0.01086755, "auxiliary_loss_mlp": 0.01036065, "balance_loss_clip": 1.04054701, "balance_loss_mlp": 1.02131844, "epoch": 0.9428245055011123, "flos": 22856998567680.0, "grad_norm": 2.091213113641089, "language_loss": 0.72824246, "learning_rate": 3.4129620130264767e-08, "loss": 0.74947065, "num_input_tokens_seen": 169480840, "step": 7841, "time_per_iteration": 2.6751534938812256 }, { "auxiliary_loss_clip": 0.01111445, "auxiliary_loss_mlp": 0.00771666, "balance_loss_clip": 1.04291868, "balance_loss_mlp": 1.00049913, "epoch": 0.9429447483917514, "flos": 20951757371520.0, "grad_norm": 2.1144913263531264, "language_loss": 0.778584, "learning_rate": 3.398647569505009e-08, "loss": 0.79741508, "num_input_tokens_seen": 169498265, "step": 7842, "time_per_iteration": 2.624650478363037 }, { "auxiliary_loss_clip": 0.01098412, "auxiliary_loss_mlp": 0.0103649, "balance_loss_clip": 1.03864241, "balance_loss_mlp": 1.02108204, "epoch": 0.9430649912823904, "flos": 18843658116480.0, "grad_norm": 2.3902861072622383, "language_loss": 0.75293446, "learning_rate": 3.384362950252373e-08, "loss": 0.77428347, "num_input_tokens_seen": 169515235, "step": 7843, "time_per_iteration": 2.607121706008911 }, { "auxiliary_loss_clip": 0.0110133, "auxiliary_loss_mlp": 0.01035897, "balance_loss_clip": 1.03663397, "balance_loss_mlp": 1.01954186, "epoch": 0.9431852341730296, "flos": 32556038837760.0, "grad_norm": 2.0962699174020307, "language_loss": 0.57255059, "learning_rate": 3.3701081574355473e-08, "loss": 0.59392285, "num_input_tokens_seen": 169537195, "step": 7844, "time_per_iteration": 2.695784568786621 }, { "auxiliary_loss_clip": 0.01021351, "auxiliary_loss_mlp": 0.01001127, "balance_loss_clip": 1.01177979, "balance_loss_mlp": 0.99954146, "epoch": 0.9433054770636686, "flos": 66904490252160.0, "grad_norm": 0.6439252239743686, "language_loss": 0.51646078, "learning_rate": 3.3558831932169796e-08, "loss": 0.53668553, "num_input_tokens_seen": 169605865, "step": 7845, "time_per_iteration": 3.264209032058716 }, { "auxiliary_loss_clip": 0.01115356, "auxiliary_loss_mlp": 0.01036136, "balance_loss_clip": 1.03959596, "balance_loss_mlp": 1.02021003, "epoch": 0.9434257199543077, "flos": 26140346916480.0, "grad_norm": 2.1663211847256942, "language_loss": 0.88333899, "learning_rate": 3.341688059754588e-08, "loss": 0.90485394, "num_input_tokens_seen": 169621520, "step": 7846, "time_per_iteration": 2.6157071590423584 }, { "auxiliary_loss_clip": 0.01101587, "auxiliary_loss_mlp": 0.00771808, "balance_loss_clip": 1.04024911, "balance_loss_mlp": 1.00048554, "epoch": 0.9435459628449467, "flos": 25003486483200.0, "grad_norm": 2.19843543822489, "language_loss": 0.77946502, "learning_rate": 3.327522759201762e-08, "loss": 0.798199, "num_input_tokens_seen": 169641390, "step": 7847, "time_per_iteration": 2.658938407897949 }, { "auxiliary_loss_clip": 0.01093402, "auxiliary_loss_mlp": 0.01038305, "balance_loss_clip": 1.03879809, "balance_loss_mlp": 1.02285552, "epoch": 0.9436662057355859, "flos": 22163240309760.0, "grad_norm": 4.8136352580087625, "language_loss": 0.66733885, "learning_rate": 3.313387293707359e-08, "loss": 0.68865591, "num_input_tokens_seen": 169660095, "step": 7848, "time_per_iteration": 2.662187337875366 }, { "auxiliary_loss_clip": 0.01089631, "auxiliary_loss_mlp": 0.01044299, "balance_loss_clip": 1.04005361, "balance_loss_mlp": 1.02863479, "epoch": 0.943786448626225, "flos": 20118522602880.0, "grad_norm": 2.0094135418458743, "language_loss": 0.68585902, "learning_rate": 3.29928166541571e-08, "loss": 0.70719832, "num_input_tokens_seen": 169679050, "step": 7849, "time_per_iteration": 2.639256238937378 }, { "auxiliary_loss_clip": 0.01097612, "auxiliary_loss_mlp": 0.01046978, "balance_loss_clip": 1.03901744, "balance_loss_mlp": 1.02939451, "epoch": 0.943906691516864, "flos": 22090808534400.0, "grad_norm": 2.168851818378768, "language_loss": 0.80484188, "learning_rate": 3.2852058764666346e-08, "loss": 0.82628775, "num_input_tokens_seen": 169698150, "step": 7850, "time_per_iteration": 2.709074020385742 }, { "auxiliary_loss_clip": 0.01080957, "auxiliary_loss_mlp": 0.01037164, "balance_loss_clip": 1.03889453, "balance_loss_mlp": 1.02244174, "epoch": 0.9440269344075032, "flos": 35298501212160.0, "grad_norm": 2.3764257275974203, "language_loss": 0.68605149, "learning_rate": 3.2711599289954264e-08, "loss": 0.70723271, "num_input_tokens_seen": 169722185, "step": 7851, "time_per_iteration": 2.7661192417144775 }, { "auxiliary_loss_clip": 0.01073495, "auxiliary_loss_mlp": 0.01036506, "balance_loss_clip": 1.03930974, "balance_loss_mlp": 1.02152109, "epoch": 0.9441471772981422, "flos": 19238136255360.0, "grad_norm": 1.8338927235234688, "language_loss": 0.77676117, "learning_rate": 3.257143825132847e-08, "loss": 0.7978611, "num_input_tokens_seen": 169740355, "step": 7852, "time_per_iteration": 2.700671911239624 }, { "auxiliary_loss_clip": 0.01104429, "auxiliary_loss_mlp": 0.01034449, "balance_loss_clip": 1.03914952, "balance_loss_mlp": 1.0198164, "epoch": 0.9442674201887813, "flos": 25739799379200.0, "grad_norm": 1.7576552474054277, "language_loss": 0.76170743, "learning_rate": 3.243157567005106e-08, "loss": 0.78309625, "num_input_tokens_seen": 169758535, "step": 7853, "time_per_iteration": 3.6129581928253174 }, { "auxiliary_loss_clip": 0.01139044, "auxiliary_loss_mlp": 0.01047727, "balance_loss_clip": 1.04649127, "balance_loss_mlp": 1.03114533, "epoch": 0.9443876630794205, "flos": 15523321737600.0, "grad_norm": 2.3772726745722492, "language_loss": 0.64128661, "learning_rate": 3.2292011567339296e-08, "loss": 0.6631543, "num_input_tokens_seen": 169776340, "step": 7854, "time_per_iteration": 2.5442206859588623 }, { "auxiliary_loss_clip": 0.01116269, "auxiliary_loss_mlp": 0.00771719, "balance_loss_clip": 1.03863454, "balance_loss_mlp": 1.00054193, "epoch": 0.9445079059700595, "flos": 13400821128960.0, "grad_norm": 2.0115190463955814, "language_loss": 0.55908799, "learning_rate": 3.21527459643649e-08, "loss": 0.57796782, "num_input_tokens_seen": 169793225, "step": 7855, "time_per_iteration": 2.5912530422210693 }, { "auxiliary_loss_clip": 0.01125984, "auxiliary_loss_mlp": 0.01051719, "balance_loss_clip": 1.04601073, "balance_loss_mlp": 1.03640127, "epoch": 0.9446281488606986, "flos": 23659242877440.0, "grad_norm": 2.125113222987502, "language_loss": 0.73946267, "learning_rate": 3.2013778882254536e-08, "loss": 0.76123971, "num_input_tokens_seen": 169812020, "step": 7856, "time_per_iteration": 3.5616838932037354 }, { "auxiliary_loss_clip": 0.01110441, "auxiliary_loss_mlp": 0.01032358, "balance_loss_clip": 1.0394733, "balance_loss_mlp": 1.01762331, "epoch": 0.9447483917513377, "flos": 25557337267200.0, "grad_norm": 1.6806995678847658, "language_loss": 0.75724912, "learning_rate": 3.1875110342088676e-08, "loss": 0.77867711, "num_input_tokens_seen": 169833470, "step": 7857, "time_per_iteration": 2.646317481994629 }, { "auxiliary_loss_clip": 0.0110046, "auxiliary_loss_mlp": 0.01035047, "balance_loss_clip": 1.0400095, "balance_loss_mlp": 1.02002096, "epoch": 0.9448686346419768, "flos": 24535463247360.0, "grad_norm": 1.6316847092061773, "language_loss": 0.65481144, "learning_rate": 3.1736740364904035e-08, "loss": 0.67616647, "num_input_tokens_seen": 169854000, "step": 7858, "time_per_iteration": 2.650815010070801 }, { "auxiliary_loss_clip": 0.01082401, "auxiliary_loss_mlp": 0.00772469, "balance_loss_clip": 1.03787053, "balance_loss_mlp": 1.00048959, "epoch": 0.9449888775326158, "flos": 14721256995840.0, "grad_norm": 1.9842065722887203, "language_loss": 0.77271354, "learning_rate": 3.159866897169094e-08, "loss": 0.79126227, "num_input_tokens_seen": 169872200, "step": 7859, "time_per_iteration": 3.652684450149536 }, { "auxiliary_loss_clip": 0.0109583, "auxiliary_loss_mlp": 0.01046771, "balance_loss_clip": 1.03871131, "balance_loss_mlp": 1.03096414, "epoch": 0.945109120423255, "flos": 15447873219840.0, "grad_norm": 1.8944497656787593, "language_loss": 0.75735402, "learning_rate": 3.146089618339487e-08, "loss": 0.77878004, "num_input_tokens_seen": 169889055, "step": 7860, "time_per_iteration": 2.6113228797912598 }, { "auxiliary_loss_clip": 0.01097443, "auxiliary_loss_mlp": 0.01037456, "balance_loss_clip": 1.03991389, "balance_loss_mlp": 1.02259064, "epoch": 0.9452293633138941, "flos": 25448097029760.0, "grad_norm": 1.8627612629658952, "language_loss": 0.68449545, "learning_rate": 3.132342202091554e-08, "loss": 0.7058444, "num_input_tokens_seen": 169909280, "step": 7861, "time_per_iteration": 3.581724166870117 }, { "auxiliary_loss_clip": 0.01133697, "auxiliary_loss_mlp": 0.0103903, "balance_loss_clip": 1.04382372, "balance_loss_mlp": 1.0230794, "epoch": 0.9453496062045331, "flos": 21215342350080.0, "grad_norm": 2.29131378961794, "language_loss": 0.68974507, "learning_rate": 3.1186246505107595e-08, "loss": 0.71147233, "num_input_tokens_seen": 169928420, "step": 7862, "time_per_iteration": 2.565478801727295 }, { "auxiliary_loss_clip": 0.01121625, "auxiliary_loss_mlp": 0.010389, "balance_loss_clip": 1.04416227, "balance_loss_mlp": 1.02435637, "epoch": 0.9454698490951723, "flos": 20010898477440.0, "grad_norm": 1.6901027284189771, "language_loss": 0.83645368, "learning_rate": 3.104936965678084e-08, "loss": 0.85805893, "num_input_tokens_seen": 169946750, "step": 7863, "time_per_iteration": 2.563459873199463 }, { "auxiliary_loss_clip": 0.01117672, "auxiliary_loss_mlp": 0.01044776, "balance_loss_clip": 1.03952336, "balance_loss_mlp": 1.02884936, "epoch": 0.9455900919858113, "flos": 21069652786560.0, "grad_norm": 1.8814541397243965, "language_loss": 0.8205055, "learning_rate": 3.091279149669956e-08, "loss": 0.84212995, "num_input_tokens_seen": 169965540, "step": 7864, "time_per_iteration": 2.602414608001709 }, { "auxiliary_loss_clip": 0.01118159, "auxiliary_loss_mlp": 0.0077091, "balance_loss_clip": 1.04176617, "balance_loss_mlp": 1.00048113, "epoch": 0.9457103348764504, "flos": 20740854666240.0, "grad_norm": 2.195574239214671, "language_loss": 0.74094248, "learning_rate": 3.0776512045581624e-08, "loss": 0.7598331, "num_input_tokens_seen": 169984330, "step": 7865, "time_per_iteration": 2.573914051055908 }, { "auxiliary_loss_clip": 0.01099649, "auxiliary_loss_mlp": 0.01043221, "balance_loss_clip": 1.03899837, "balance_loss_mlp": 1.02717519, "epoch": 0.9458305777670896, "flos": 21428363957760.0, "grad_norm": 2.095404520337665, "language_loss": 0.7788918, "learning_rate": 3.0640531324101384e-08, "loss": 0.80032039, "num_input_tokens_seen": 170002095, "step": 7866, "time_per_iteration": 2.62931752204895 }, { "auxiliary_loss_clip": 0.01122518, "auxiliary_loss_mlp": 0.01040338, "balance_loss_clip": 1.04520965, "balance_loss_mlp": 1.02389884, "epoch": 0.9459508206577286, "flos": 20011185786240.0, "grad_norm": 1.557556691803448, "language_loss": 0.76026869, "learning_rate": 3.0504849352886554e-08, "loss": 0.78189725, "num_input_tokens_seen": 170020240, "step": 7867, "time_per_iteration": 2.5756494998931885 }, { "auxiliary_loss_clip": 0.0111542, "auxiliary_loss_mlp": 0.01035549, "balance_loss_clip": 1.04071033, "balance_loss_mlp": 1.02035022, "epoch": 0.9460710635483677, "flos": 12166428291840.0, "grad_norm": 2.3151633990628646, "language_loss": 0.71737623, "learning_rate": 3.036946615252023e-08, "loss": 0.738886, "num_input_tokens_seen": 170035770, "step": 7868, "time_per_iteration": 2.545886993408203 }, { "auxiliary_loss_clip": 0.01110606, "auxiliary_loss_mlp": 0.01042757, "balance_loss_clip": 1.04226124, "balance_loss_mlp": 1.0268662, "epoch": 0.9461913064390068, "flos": 34276196229120.0, "grad_norm": 2.444563595269437, "language_loss": 0.668046, "learning_rate": 3.0234381743539984e-08, "loss": 0.68957967, "num_input_tokens_seen": 170053385, "step": 7869, "time_per_iteration": 2.709019660949707 }, { "auxiliary_loss_clip": 0.01112109, "auxiliary_loss_mlp": 0.01036758, "balance_loss_clip": 1.04164648, "balance_loss_mlp": 1.02073622, "epoch": 0.9463115493296459, "flos": 19463763536640.0, "grad_norm": 2.1397966750165183, "language_loss": 0.80177116, "learning_rate": 3.0099596146437863e-08, "loss": 0.82325983, "num_input_tokens_seen": 170070490, "step": 7870, "time_per_iteration": 2.592294454574585 }, { "auxiliary_loss_clip": 0.01034254, "auxiliary_loss_mlp": 0.0100078, "balance_loss_clip": 1.00649226, "balance_loss_mlp": 0.99925995, "epoch": 0.946431792220285, "flos": 70570824387840.0, "grad_norm": 0.7723057513319534, "language_loss": 0.59947097, "learning_rate": 2.996510938166086e-08, "loss": 0.61982131, "num_input_tokens_seen": 170133465, "step": 7871, "time_per_iteration": 3.230888605117798 }, { "auxiliary_loss_clip": 0.01119954, "auxiliary_loss_mlp": 0.01040082, "balance_loss_clip": 1.0424459, "balance_loss_mlp": 1.02501416, "epoch": 0.9465520351109241, "flos": 18947906363520.0, "grad_norm": 3.482660002798613, "language_loss": 0.73588639, "learning_rate": 2.983092146960997e-08, "loss": 0.75748676, "num_input_tokens_seen": 170150810, "step": 7872, "time_per_iteration": 2.5460166931152344 }, { "auxiliary_loss_clip": 0.01108953, "auxiliary_loss_mlp": 0.01047688, "balance_loss_clip": 1.03942633, "balance_loss_mlp": 1.03062892, "epoch": 0.9466722780015632, "flos": 19135647774720.0, "grad_norm": 2.9302975550360655, "language_loss": 0.80058479, "learning_rate": 2.9697032430642256e-08, "loss": 0.82215118, "num_input_tokens_seen": 170169025, "step": 7873, "time_per_iteration": 2.612640142440796 }, { "auxiliary_loss_clip": 0.01127066, "auxiliary_loss_mlp": 0.0103879, "balance_loss_clip": 1.04099441, "balance_loss_mlp": 1.02514601, "epoch": 0.9467925208922022, "flos": 17237912520960.0, "grad_norm": 2.405415337347492, "language_loss": 0.73486936, "learning_rate": 2.9563442285067906e-08, "loss": 0.7565279, "num_input_tokens_seen": 170186070, "step": 7874, "time_per_iteration": 2.501575469970703 }, { "auxiliary_loss_clip": 0.01119346, "auxiliary_loss_mlp": 0.010439, "balance_loss_clip": 1.04152393, "balance_loss_mlp": 1.0273422, "epoch": 0.9469127637828414, "flos": 29169016859520.0, "grad_norm": 2.2325353747708228, "language_loss": 0.79418474, "learning_rate": 2.943015105315294e-08, "loss": 0.81581718, "num_input_tokens_seen": 170206265, "step": 7875, "time_per_iteration": 2.6629652976989746 }, { "auxiliary_loss_clip": 0.01086696, "auxiliary_loss_mlp": 0.0104879, "balance_loss_clip": 1.03802705, "balance_loss_mlp": 1.03186238, "epoch": 0.9470330066734804, "flos": 26030460234240.0, "grad_norm": 2.44965897317827, "language_loss": 0.66766155, "learning_rate": 2.929715875511718e-08, "loss": 0.68901646, "num_input_tokens_seen": 170225300, "step": 7876, "time_per_iteration": 2.7052767276763916 }, { "auxiliary_loss_clip": 0.01119279, "auxiliary_loss_mlp": 0.01045222, "balance_loss_clip": 1.04007208, "balance_loss_mlp": 1.02992797, "epoch": 0.9471532495641195, "flos": 23440906056960.0, "grad_norm": 2.080771676655339, "language_loss": 0.70182097, "learning_rate": 2.9164465411135375e-08, "loss": 0.72346592, "num_input_tokens_seen": 170245070, "step": 7877, "time_per_iteration": 2.5935137271881104 }, { "auxiliary_loss_clip": 0.01120029, "auxiliary_loss_mlp": 0.01035163, "balance_loss_clip": 1.04308903, "balance_loss_mlp": 1.02088141, "epoch": 0.9472734924547586, "flos": 15815850099840.0, "grad_norm": 2.16030268040037, "language_loss": 0.81007564, "learning_rate": 2.9032071041337426e-08, "loss": 0.83162761, "num_input_tokens_seen": 170263305, "step": 7878, "time_per_iteration": 2.5381157398223877 }, { "auxiliary_loss_clip": 0.01097816, "auxiliary_loss_mlp": 0.01038089, "balance_loss_clip": 1.03843951, "balance_loss_mlp": 1.02265191, "epoch": 0.9473937353453977, "flos": 11181793697280.0, "grad_norm": 1.9297605969430909, "language_loss": 0.72968197, "learning_rate": 2.889997566580704e-08, "loss": 0.75104105, "num_input_tokens_seen": 170281460, "step": 7879, "time_per_iteration": 3.599179267883301 }, { "auxiliary_loss_clip": 0.01132182, "auxiliary_loss_mlp": 0.01036811, "balance_loss_clip": 1.04289472, "balance_loss_mlp": 1.02043211, "epoch": 0.9475139782360368, "flos": 25775530433280.0, "grad_norm": 2.2038716526554873, "language_loss": 0.70343179, "learning_rate": 2.8768179304583086e-08, "loss": 0.72512174, "num_input_tokens_seen": 170303515, "step": 7880, "time_per_iteration": 2.578864812850952 }, { "auxiliary_loss_clip": 0.01093295, "auxiliary_loss_mlp": 0.0103618, "balance_loss_clip": 1.04044008, "balance_loss_mlp": 1.02139187, "epoch": 0.9476342211266758, "flos": 22820046451200.0, "grad_norm": 1.7345526307945587, "language_loss": 0.73782623, "learning_rate": 2.8636681977659117e-08, "loss": 0.759121, "num_input_tokens_seen": 170323165, "step": 7881, "time_per_iteration": 3.5534775257110596 }, { "auxiliary_loss_clip": 0.01081133, "auxiliary_loss_mlp": 0.01040015, "balance_loss_clip": 1.0395906, "balance_loss_mlp": 1.02412462, "epoch": 0.947754464017315, "flos": 20193611984640.0, "grad_norm": 2.5240556813353305, "language_loss": 0.783696, "learning_rate": 2.850548370498318e-08, "loss": 0.8049075, "num_input_tokens_seen": 170341005, "step": 7882, "time_per_iteration": 2.6625704765319824 }, { "auxiliary_loss_clip": 0.01116843, "auxiliary_loss_mlp": 0.01034928, "balance_loss_clip": 1.04009569, "balance_loss_mlp": 1.02024174, "epoch": 0.9478747069079541, "flos": 24717925359360.0, "grad_norm": 1.5813263606017065, "language_loss": 0.7166841, "learning_rate": 2.8374584506457798e-08, "loss": 0.73820186, "num_input_tokens_seen": 170362280, "step": 7883, "time_per_iteration": 2.6038599014282227 }, { "auxiliary_loss_clip": 0.01105857, "auxiliary_loss_mlp": 0.01033473, "balance_loss_clip": 1.03929043, "balance_loss_mlp": 1.01826179, "epoch": 0.9479949497985931, "flos": 21361355136000.0, "grad_norm": 2.304436111638488, "language_loss": 0.67498767, "learning_rate": 2.824398440193998e-08, "loss": 0.69638097, "num_input_tokens_seen": 170381080, "step": 7884, "time_per_iteration": 2.636472463607788 }, { "auxiliary_loss_clip": 0.01074332, "auxiliary_loss_mlp": 0.01044083, "balance_loss_clip": 1.03601611, "balance_loss_mlp": 1.02705991, "epoch": 0.9481151926892323, "flos": 18148606968960.0, "grad_norm": 2.0545796349101564, "language_loss": 0.7154398, "learning_rate": 2.811368341124232e-08, "loss": 0.736624, "num_input_tokens_seen": 170400150, "step": 7885, "time_per_iteration": 3.6006689071655273 }, { "auxiliary_loss_clip": 0.01109982, "auxiliary_loss_mlp": 0.01046106, "balance_loss_clip": 1.04362726, "balance_loss_mlp": 1.03059649, "epoch": 0.9482354355798713, "flos": 22128012046080.0, "grad_norm": 2.449889946991696, "language_loss": 0.68196809, "learning_rate": 2.7983681554131222e-08, "loss": 0.703529, "num_input_tokens_seen": 170420410, "step": 7886, "time_per_iteration": 2.6590871810913086 }, { "auxiliary_loss_clip": 0.01105202, "auxiliary_loss_mlp": 0.01041347, "balance_loss_clip": 1.03860545, "balance_loss_mlp": 1.02539659, "epoch": 0.9483556784705104, "flos": 19063072344960.0, "grad_norm": 2.2047857344564132, "language_loss": 0.69848919, "learning_rate": 2.7853978850327365e-08, "loss": 0.71995461, "num_input_tokens_seen": 170439580, "step": 7887, "time_per_iteration": 3.5409631729125977 }, { "auxiliary_loss_clip": 0.0109666, "auxiliary_loss_mlp": 0.01037347, "balance_loss_clip": 1.04381216, "balance_loss_mlp": 1.02283335, "epoch": 0.9484759213611496, "flos": 25777110631680.0, "grad_norm": 1.7846367361617907, "language_loss": 0.87246025, "learning_rate": 2.7724575319507225e-08, "loss": 0.89380026, "num_input_tokens_seen": 170459290, "step": 7888, "time_per_iteration": 2.6799895763397217 }, { "auxiliary_loss_clip": 0.01115797, "auxiliary_loss_mlp": 0.01038718, "balance_loss_clip": 1.04108286, "balance_loss_mlp": 1.02484846, "epoch": 0.9485961642517886, "flos": 20667740532480.0, "grad_norm": 2.0632903195595875, "language_loss": 0.76800025, "learning_rate": 2.759547098130044e-08, "loss": 0.78954536, "num_input_tokens_seen": 170478020, "step": 7889, "time_per_iteration": 2.562155246734619 }, { "auxiliary_loss_clip": 0.0112994, "auxiliary_loss_mlp": 0.01036332, "balance_loss_clip": 1.042593, "balance_loss_mlp": 1.02138305, "epoch": 0.9487164071424277, "flos": 22674069578880.0, "grad_norm": 1.938905801989093, "language_loss": 0.76921922, "learning_rate": 2.746666585529267e-08, "loss": 0.79088193, "num_input_tokens_seen": 170498295, "step": 7890, "time_per_iteration": 2.547713041305542 }, { "auxiliary_loss_clip": 0.01113375, "auxiliary_loss_mlp": 0.01046124, "balance_loss_clip": 1.04185867, "balance_loss_mlp": 1.03049541, "epoch": 0.9488366500330668, "flos": 38726461716480.0, "grad_norm": 2.5107675443062996, "language_loss": 0.74322838, "learning_rate": 2.73381599610234e-08, "loss": 0.76482338, "num_input_tokens_seen": 170518695, "step": 7891, "time_per_iteration": 2.740668535232544 }, { "auxiliary_loss_clip": 0.01116054, "auxiliary_loss_mlp": 0.01035004, "balance_loss_clip": 1.03921771, "balance_loss_mlp": 1.01964355, "epoch": 0.9489568929237059, "flos": 27890920149120.0, "grad_norm": 2.2785576173681124, "language_loss": 0.71638978, "learning_rate": 2.7209953317987033e-08, "loss": 0.73790044, "num_input_tokens_seen": 170539735, "step": 7892, "time_per_iteration": 2.6219706535339355 }, { "auxiliary_loss_clip": 0.01115295, "auxiliary_loss_mlp": 0.01038662, "balance_loss_clip": 1.04041314, "balance_loss_mlp": 1.02380824, "epoch": 0.9490771358143449, "flos": 33580642291200.0, "grad_norm": 2.0580992703272902, "language_loss": 0.78241599, "learning_rate": 2.7082045945631793e-08, "loss": 0.80395561, "num_input_tokens_seen": 170561950, "step": 7893, "time_per_iteration": 2.6741695404052734 }, { "auxiliary_loss_clip": 0.01085831, "auxiliary_loss_mlp": 0.01037747, "balance_loss_clip": 1.03687096, "balance_loss_mlp": 1.02267861, "epoch": 0.9491973787049841, "flos": 14793796512000.0, "grad_norm": 2.648105628140745, "language_loss": 0.69430435, "learning_rate": 2.6954437863361712e-08, "loss": 0.71554011, "num_input_tokens_seen": 170579865, "step": 7894, "time_per_iteration": 2.623335599899292 }, { "auxiliary_loss_clip": 0.01073173, "auxiliary_loss_mlp": 0.01031876, "balance_loss_clip": 1.03866339, "balance_loss_mlp": 1.0181613, "epoch": 0.9493176215956232, "flos": 25332535998720.0, "grad_norm": 2.0036560206263543, "language_loss": 0.71258426, "learning_rate": 2.6827129090534862e-08, "loss": 0.73363477, "num_input_tokens_seen": 170600165, "step": 7895, "time_per_iteration": 2.6959502696990967 }, { "auxiliary_loss_clip": 0.01103902, "auxiliary_loss_mlp": 0.01032339, "balance_loss_clip": 1.04211414, "balance_loss_mlp": 1.01638913, "epoch": 0.9494378644862622, "flos": 21029971236480.0, "grad_norm": 1.9690891901971832, "language_loss": 0.77944577, "learning_rate": 2.670011964646335e-08, "loss": 0.80080819, "num_input_tokens_seen": 170618845, "step": 7896, "time_per_iteration": 2.6434295177459717 }, { "auxiliary_loss_clip": 0.01063213, "auxiliary_loss_mlp": 0.01038871, "balance_loss_clip": 1.03352916, "balance_loss_mlp": 1.02036917, "epoch": 0.9495581073769014, "flos": 15195134148480.0, "grad_norm": 1.7632448731585568, "language_loss": 0.68374288, "learning_rate": 2.657340955041487e-08, "loss": 0.70476365, "num_input_tokens_seen": 170637620, "step": 7897, "time_per_iteration": 2.709559679031372 }, { "auxiliary_loss_clip": 0.01105809, "auxiliary_loss_mlp": 0.01033583, "balance_loss_clip": 1.04191065, "balance_loss_mlp": 1.01728749, "epoch": 0.9496783502675404, "flos": 28616566705920.0, "grad_norm": 2.22390214601537, "language_loss": 0.71803021, "learning_rate": 2.6446998821611167e-08, "loss": 0.73942417, "num_input_tokens_seen": 170657815, "step": 7898, "time_per_iteration": 2.6650478839874268 }, { "auxiliary_loss_clip": 0.01084462, "auxiliary_loss_mlp": 0.01042137, "balance_loss_clip": 1.03791618, "balance_loss_mlp": 1.02685475, "epoch": 0.9497985931581795, "flos": 14866874732160.0, "grad_norm": 2.3088423960180773, "language_loss": 0.71411014, "learning_rate": 2.6320887479228228e-08, "loss": 0.73537612, "num_input_tokens_seen": 170674415, "step": 7899, "time_per_iteration": 2.620633363723755 }, { "auxiliary_loss_clip": 0.01111238, "auxiliary_loss_mlp": 0.01039159, "balance_loss_clip": 1.04028916, "balance_loss_mlp": 1.02435327, "epoch": 0.9499188360488187, "flos": 27193319136000.0, "grad_norm": 2.208765384756234, "language_loss": 0.73242533, "learning_rate": 2.619507554239786e-08, "loss": 0.75392926, "num_input_tokens_seen": 170692975, "step": 7900, "time_per_iteration": 2.6554253101348877 }, { "auxiliary_loss_clip": 0.01102072, "auxiliary_loss_mlp": 0.01039384, "balance_loss_clip": 1.0404762, "balance_loss_mlp": 1.02314782, "epoch": 0.9500390789394577, "flos": 24316479982080.0, "grad_norm": 1.71847051506088, "language_loss": 0.69983929, "learning_rate": 2.606956303020502e-08, "loss": 0.72125387, "num_input_tokens_seen": 170713780, "step": 7901, "time_per_iteration": 2.6323304176330566 }, { "auxiliary_loss_clip": 0.0112195, "auxiliary_loss_mlp": 0.01044263, "balance_loss_clip": 1.04369712, "balance_loss_mlp": 1.02919483, "epoch": 0.9501593218300968, "flos": 14354752573440.0, "grad_norm": 1.835805814946531, "language_loss": 0.84149158, "learning_rate": 2.5944349961690036e-08, "loss": 0.8631537, "num_input_tokens_seen": 170730800, "step": 7902, "time_per_iteration": 2.568775177001953 }, { "auxiliary_loss_clip": 0.01092345, "auxiliary_loss_mlp": 0.01038515, "balance_loss_clip": 1.03768229, "balance_loss_mlp": 1.02354264, "epoch": 0.9502795647207359, "flos": 38728113742080.0, "grad_norm": 1.7375443006054343, "language_loss": 0.73487985, "learning_rate": 2.581943635584749e-08, "loss": 0.75618845, "num_input_tokens_seen": 170753630, "step": 7903, "time_per_iteration": 2.7601513862609863 }, { "auxiliary_loss_clip": 0.01100264, "auxiliary_loss_mlp": 0.01039889, "balance_loss_clip": 1.04024768, "balance_loss_mlp": 1.02464771, "epoch": 0.950399807611375, "flos": 40808023799040.0, "grad_norm": 1.5744890705352999, "language_loss": 0.65636289, "learning_rate": 2.569482223162689e-08, "loss": 0.67776442, "num_input_tokens_seen": 170777605, "step": 7904, "time_per_iteration": 2.800163507461548 }, { "auxiliary_loss_clip": 0.01120124, "auxiliary_loss_mlp": 0.0104002, "balance_loss_clip": 1.0405525, "balance_loss_mlp": 1.02310443, "epoch": 0.950520050502014, "flos": 23440403266560.0, "grad_norm": 1.6396711507314095, "language_loss": 0.72636032, "learning_rate": 2.5570507607932e-08, "loss": 0.74796176, "num_input_tokens_seen": 170797520, "step": 7905, "time_per_iteration": 3.612704038619995 }, { "auxiliary_loss_clip": 0.01124518, "auxiliary_loss_mlp": 0.01038329, "balance_loss_clip": 1.0429287, "balance_loss_mlp": 1.02180648, "epoch": 0.9506402933926532, "flos": 17783718658560.0, "grad_norm": 2.6126750449986766, "language_loss": 0.63465726, "learning_rate": 2.54464925036213e-08, "loss": 0.65628576, "num_input_tokens_seen": 170814810, "step": 7906, "time_per_iteration": 2.6652252674102783 }, { "auxiliary_loss_clip": 0.01119311, "auxiliary_loss_mlp": 0.01037068, "balance_loss_clip": 1.04151082, "balance_loss_mlp": 1.02227378, "epoch": 0.9507605362832923, "flos": 32561928668160.0, "grad_norm": 2.671828766268199, "language_loss": 0.61031139, "learning_rate": 2.532277693750773e-08, "loss": 0.6318751, "num_input_tokens_seen": 170835735, "step": 7907, "time_per_iteration": 3.9705193042755127 }, { "auxiliary_loss_clip": 0.01078649, "auxiliary_loss_mlp": 0.01034535, "balance_loss_clip": 1.04171038, "balance_loss_mlp": 1.01859641, "epoch": 0.9508807791739313, "flos": 19602054898560.0, "grad_norm": 2.089342420485846, "language_loss": 0.76005995, "learning_rate": 2.5199360928358948e-08, "loss": 0.78119183, "num_input_tokens_seen": 170852970, "step": 7908, "time_per_iteration": 2.684417486190796 }, { "auxiliary_loss_clip": 0.0110821, "auxiliary_loss_mlp": 0.00773011, "balance_loss_clip": 1.03945196, "balance_loss_mlp": 1.00047326, "epoch": 0.9510010220645704, "flos": 21471852349440.0, "grad_norm": 1.843470068788851, "language_loss": 0.87073666, "learning_rate": 2.507624449489665e-08, "loss": 0.8895489, "num_input_tokens_seen": 170871600, "step": 7909, "time_per_iteration": 2.6110591888427734 }, { "auxiliary_loss_clip": 0.0111064, "auxiliary_loss_mlp": 0.0103784, "balance_loss_clip": 1.04078782, "balance_loss_mlp": 1.02220011, "epoch": 0.9511212649552095, "flos": 18879999701760.0, "grad_norm": 2.0345720372645517, "language_loss": 0.65370452, "learning_rate": 2.495342765579811e-08, "loss": 0.67518932, "num_input_tokens_seen": 170890260, "step": 7910, "time_per_iteration": 2.605804920196533 }, { "auxiliary_loss_clip": 0.01080815, "auxiliary_loss_mlp": 0.0103345, "balance_loss_clip": 1.03909647, "balance_loss_mlp": 1.01838779, "epoch": 0.9512415078458486, "flos": 20810521094400.0, "grad_norm": 2.0902980155649074, "language_loss": 0.71076536, "learning_rate": 2.4830910429693984e-08, "loss": 0.73190802, "num_input_tokens_seen": 170910220, "step": 7911, "time_per_iteration": 3.664048194885254 }, { "auxiliary_loss_clip": 0.01132553, "auxiliary_loss_mlp": 0.01038741, "balance_loss_clip": 1.04330945, "balance_loss_mlp": 1.02281475, "epoch": 0.9513617507364877, "flos": 18369565482240.0, "grad_norm": 2.0347451817253623, "language_loss": 0.80215704, "learning_rate": 2.470869283517052e-08, "loss": 0.82387, "num_input_tokens_seen": 170928255, "step": 7912, "time_per_iteration": 2.5634162425994873 }, { "auxiliary_loss_clip": 0.01113857, "auxiliary_loss_mlp": 0.01041367, "balance_loss_clip": 1.04111135, "balance_loss_mlp": 1.02768135, "epoch": 0.9514819936271268, "flos": 25010166412800.0, "grad_norm": 1.7099229565623133, "language_loss": 0.77169901, "learning_rate": 2.458677489076777e-08, "loss": 0.79325128, "num_input_tokens_seen": 170949265, "step": 7913, "time_per_iteration": 3.5416762828826904 }, { "auxiliary_loss_clip": 0.0111257, "auxiliary_loss_mlp": 0.0103755, "balance_loss_clip": 1.04144287, "balance_loss_mlp": 1.02281594, "epoch": 0.9516022365177659, "flos": 18662129758080.0, "grad_norm": 1.623701745299572, "language_loss": 0.83008587, "learning_rate": 2.446515661498072e-08, "loss": 0.85158706, "num_input_tokens_seen": 170968595, "step": 7914, "time_per_iteration": 2.5614101886749268 }, { "auxiliary_loss_clip": 0.01064508, "auxiliary_loss_mlp": 0.01034153, "balance_loss_clip": 1.03602016, "balance_loss_mlp": 1.01895952, "epoch": 0.9517224794084049, "flos": 25372109808000.0, "grad_norm": 2.067770953216116, "language_loss": 0.74371654, "learning_rate": 2.434383802625861e-08, "loss": 0.76470315, "num_input_tokens_seen": 170987550, "step": 7915, "time_per_iteration": 2.797429084777832 }, { "auxiliary_loss_clip": 0.01096073, "auxiliary_loss_mlp": 0.01039722, "balance_loss_clip": 1.03952336, "balance_loss_mlp": 1.0239861, "epoch": 0.9518427222990441, "flos": 21470918595840.0, "grad_norm": 2.1656579050056926, "language_loss": 0.74024796, "learning_rate": 2.4222819143005168e-08, "loss": 0.76160586, "num_input_tokens_seen": 171007145, "step": 7916, "time_per_iteration": 2.659337043762207 }, { "auxiliary_loss_clip": 0.0113087, "auxiliary_loss_mlp": 0.01037583, "balance_loss_clip": 1.04348648, "balance_loss_mlp": 1.02365947, "epoch": 0.9519629651896832, "flos": 21033634423680.0, "grad_norm": 1.7376601324465226, "language_loss": 0.80786955, "learning_rate": 2.4102099983579706e-08, "loss": 0.82955408, "num_input_tokens_seen": 171026295, "step": 7917, "time_per_iteration": 2.544494867324829 }, { "auxiliary_loss_clip": 0.01118491, "auxiliary_loss_mlp": 0.01041248, "balance_loss_clip": 1.04026747, "balance_loss_mlp": 1.02430272, "epoch": 0.9520832080803222, "flos": 21689219502720.0, "grad_norm": 1.9317882598905582, "language_loss": 0.77350914, "learning_rate": 2.3981680566294236e-08, "loss": 0.79510653, "num_input_tokens_seen": 171045895, "step": 7918, "time_per_iteration": 2.553666114807129 }, { "auxiliary_loss_clip": 0.0113004, "auxiliary_loss_mlp": 0.01037541, "balance_loss_clip": 1.04311371, "balance_loss_mlp": 1.02324235, "epoch": 0.9522034509709614, "flos": 23145289125120.0, "grad_norm": 2.1278120365409334, "language_loss": 0.73437953, "learning_rate": 2.3861560909416822e-08, "loss": 0.75605536, "num_input_tokens_seen": 171065445, "step": 7919, "time_per_iteration": 2.5905888080596924 }, { "auxiliary_loss_clip": 0.01081787, "auxiliary_loss_mlp": 0.01043116, "balance_loss_clip": 1.03836703, "balance_loss_mlp": 1.02715421, "epoch": 0.9523236938616004, "flos": 24679428958080.0, "grad_norm": 1.7496407990904905, "language_loss": 0.82198548, "learning_rate": 2.3741741031169325e-08, "loss": 0.84323448, "num_input_tokens_seen": 171085015, "step": 7920, "time_per_iteration": 2.6880741119384766 }, { "auxiliary_loss_clip": 0.01080135, "auxiliary_loss_mlp": 0.0105577, "balance_loss_clip": 1.03716266, "balance_loss_mlp": 1.03951001, "epoch": 0.9524439367522395, "flos": 22672309812480.0, "grad_norm": 2.603761804955696, "language_loss": 0.7184903, "learning_rate": 2.3622220949728544e-08, "loss": 0.73984933, "num_input_tokens_seen": 171103900, "step": 7921, "time_per_iteration": 2.680107831954956 }, { "auxiliary_loss_clip": 0.0110972, "auxiliary_loss_mlp": 0.01038691, "balance_loss_clip": 1.03720546, "balance_loss_mlp": 1.02226377, "epoch": 0.9525641796428787, "flos": 34055525024640.0, "grad_norm": 3.673199930467985, "language_loss": 0.61737913, "learning_rate": 2.3503000683225526e-08, "loss": 0.63886321, "num_input_tokens_seen": 171121615, "step": 7922, "time_per_iteration": 2.660815715789795 }, { "auxiliary_loss_clip": 0.0113116, "auxiliary_loss_mlp": 0.01033899, "balance_loss_clip": 1.04165542, "balance_loss_mlp": 1.01799679, "epoch": 0.9526844225335177, "flos": 16727083251840.0, "grad_norm": 1.9970859059855277, "language_loss": 0.84276181, "learning_rate": 2.3384080249745585e-08, "loss": 0.86441243, "num_input_tokens_seen": 171139505, "step": 7923, "time_per_iteration": 2.492594003677368 }, { "auxiliary_loss_clip": 0.01087544, "auxiliary_loss_mlp": 0.01035195, "balance_loss_clip": 1.04023409, "balance_loss_mlp": 1.01987672, "epoch": 0.9528046654241568, "flos": 36939367330560.0, "grad_norm": 3.3982039064811635, "language_loss": 0.8270607, "learning_rate": 2.3265459667329178e-08, "loss": 0.84828818, "num_input_tokens_seen": 171158995, "step": 7924, "time_per_iteration": 2.7693512439727783 }, { "auxiliary_loss_clip": 0.01108687, "auxiliary_loss_mlp": 0.01029768, "balance_loss_clip": 1.04042792, "balance_loss_mlp": 1.01492643, "epoch": 0.9529249083147959, "flos": 18255010032000.0, "grad_norm": 2.213451527975505, "language_loss": 0.85868943, "learning_rate": 2.31471389539708e-08, "loss": 0.88007402, "num_input_tokens_seen": 171176120, "step": 7925, "time_per_iteration": 2.6103410720825195 }, { "auxiliary_loss_clip": 0.01124996, "auxiliary_loss_mlp": 0.00771729, "balance_loss_clip": 1.04716778, "balance_loss_mlp": 1.00049567, "epoch": 0.953045151205435, "flos": 28658438985600.0, "grad_norm": 2.933858428944579, "language_loss": 0.72612369, "learning_rate": 2.3029118127619872e-08, "loss": 0.74509096, "num_input_tokens_seen": 171195835, "step": 7926, "time_per_iteration": 2.604706287384033 }, { "auxiliary_loss_clip": 0.01099001, "auxiliary_loss_mlp": 0.01038841, "balance_loss_clip": 1.03806758, "balance_loss_mlp": 1.02327204, "epoch": 0.953165394096074, "flos": 21835232288640.0, "grad_norm": 2.205398929266094, "language_loss": 0.8722465, "learning_rate": 2.2911397206179628e-08, "loss": 0.89362496, "num_input_tokens_seen": 171212585, "step": 7927, "time_per_iteration": 2.658250331878662 }, { "auxiliary_loss_clip": 0.01130589, "auxiliary_loss_mlp": 0.01039928, "balance_loss_clip": 1.04264855, "balance_loss_mlp": 1.02524734, "epoch": 0.9532856369867132, "flos": 19975059682560.0, "grad_norm": 2.2814021195364735, "language_loss": 0.63095033, "learning_rate": 2.279397620750845e-08, "loss": 0.65265548, "num_input_tokens_seen": 171231630, "step": 7928, "time_per_iteration": 2.536576747894287 }, { "auxiliary_loss_clip": 0.01102365, "auxiliary_loss_mlp": 0.010372, "balance_loss_clip": 1.0387125, "balance_loss_mlp": 1.0222156, "epoch": 0.9534058798773523, "flos": 15049588239360.0, "grad_norm": 4.7796316770765825, "language_loss": 0.79238898, "learning_rate": 2.2676855149419195e-08, "loss": 0.8137846, "num_input_tokens_seen": 171248800, "step": 7929, "time_per_iteration": 2.5937137603759766 }, { "auxiliary_loss_clip": 0.01101538, "auxiliary_loss_mlp": 0.01037358, "balance_loss_clip": 1.04208982, "balance_loss_mlp": 1.02206397, "epoch": 0.9535261227679913, "flos": 17602800831360.0, "grad_norm": 2.310671280352261, "language_loss": 0.75675309, "learning_rate": 2.2560034049678988e-08, "loss": 0.77814198, "num_input_tokens_seen": 171263150, "step": 7930, "time_per_iteration": 2.576406478881836 }, { "auxiliary_loss_clip": 0.01133378, "auxiliary_loss_mlp": 0.01036074, "balance_loss_clip": 1.0431149, "balance_loss_mlp": 1.02099454, "epoch": 0.9536463656586305, "flos": 23142954741120.0, "grad_norm": 1.7058073787207602, "language_loss": 0.75580472, "learning_rate": 2.2443512926008988e-08, "loss": 0.7774992, "num_input_tokens_seen": 171282480, "step": 7931, "time_per_iteration": 3.5366170406341553 }, { "auxiliary_loss_clip": 0.01095027, "auxiliary_loss_mlp": 0.01041702, "balance_loss_clip": 1.03799748, "balance_loss_mlp": 1.02598429, "epoch": 0.9537666085492695, "flos": 18625033987200.0, "grad_norm": 2.313245725123639, "language_loss": 0.69676787, "learning_rate": 2.2327291796085946e-08, "loss": 0.71813518, "num_input_tokens_seen": 171300840, "step": 7932, "time_per_iteration": 2.6198363304138184 }, { "auxiliary_loss_clip": 0.01132975, "auxiliary_loss_mlp": 0.01040982, "balance_loss_clip": 1.04327595, "balance_loss_mlp": 1.02423275, "epoch": 0.9538868514399086, "flos": 18989347680000.0, "grad_norm": 3.0009767288450107, "language_loss": 0.77513123, "learning_rate": 2.2211370677540197e-08, "loss": 0.79687071, "num_input_tokens_seen": 171317365, "step": 7933, "time_per_iteration": 3.433176040649414 }, { "auxiliary_loss_clip": 0.01133511, "auxiliary_loss_mlp": 0.01039812, "balance_loss_clip": 1.04396439, "balance_loss_mlp": 1.02470827, "epoch": 0.9540070943305478, "flos": 16800556521600.0, "grad_norm": 5.883722924586901, "language_loss": 0.77864695, "learning_rate": 2.2095749587957012e-08, "loss": 0.80038017, "num_input_tokens_seen": 171335270, "step": 7934, "time_per_iteration": 2.555558443069458 }, { "auxiliary_loss_clip": 0.01103421, "auxiliary_loss_mlp": 0.01037652, "balance_loss_clip": 1.03867745, "balance_loss_mlp": 1.02227449, "epoch": 0.9541273372211868, "flos": 20156911263360.0, "grad_norm": 1.8735048293921626, "language_loss": 0.69780791, "learning_rate": 2.1980428544876138e-08, "loss": 0.71921861, "num_input_tokens_seen": 171353910, "step": 7935, "time_per_iteration": 2.7951865196228027 }, { "auxiliary_loss_clip": 0.01072778, "auxiliary_loss_mlp": 0.01048555, "balance_loss_clip": 1.03428471, "balance_loss_mlp": 1.02958894, "epoch": 0.9542475801118259, "flos": 26725511381760.0, "grad_norm": 1.8318459038157981, "language_loss": 0.74204594, "learning_rate": 2.1865407565791584e-08, "loss": 0.76325929, "num_input_tokens_seen": 171375480, "step": 7936, "time_per_iteration": 2.7936294078826904 }, { "auxiliary_loss_clip": 0.01105767, "auxiliary_loss_mlp": 0.01038636, "balance_loss_clip": 1.03768647, "balance_loss_mlp": 1.02363944, "epoch": 0.954367823002465, "flos": 23330911633920.0, "grad_norm": 1.9971884044797537, "language_loss": 0.77219218, "learning_rate": 2.175068666815183e-08, "loss": 0.79363614, "num_input_tokens_seen": 171396320, "step": 7937, "time_per_iteration": 3.6934661865234375 }, { "auxiliary_loss_clip": 0.01097494, "auxiliary_loss_mlp": 0.0104557, "balance_loss_clip": 1.04013562, "balance_loss_mlp": 1.02808213, "epoch": 0.9544880658931041, "flos": 14902713527040.0, "grad_norm": 2.190994434595177, "language_loss": 0.79273319, "learning_rate": 2.163626586935985e-08, "loss": 0.8141638, "num_input_tokens_seen": 171412860, "step": 7938, "time_per_iteration": 2.5993690490722656 }, { "auxiliary_loss_clip": 0.0111608, "auxiliary_loss_mlp": 0.01056761, "balance_loss_clip": 1.04095769, "balance_loss_mlp": 1.03889203, "epoch": 0.9546083087837431, "flos": 29095902725760.0, "grad_norm": 2.505449257761337, "language_loss": 0.6287626, "learning_rate": 2.1522145186773755e-08, "loss": 0.650491, "num_input_tokens_seen": 171431780, "step": 7939, "time_per_iteration": 3.5347800254821777 }, { "auxiliary_loss_clip": 0.01105789, "auxiliary_loss_mlp": 0.01034297, "balance_loss_clip": 1.03975391, "balance_loss_mlp": 1.01991487, "epoch": 0.9547285516743822, "flos": 21142335957120.0, "grad_norm": 2.7212828009845302, "language_loss": 0.85575151, "learning_rate": 2.140832463770481e-08, "loss": 0.87715244, "num_input_tokens_seen": 171450975, "step": 7940, "time_per_iteration": 2.6140127182006836 }, { "auxiliary_loss_clip": 0.01107049, "auxiliary_loss_mlp": 0.01036084, "balance_loss_clip": 1.03957188, "balance_loss_mlp": 1.02058649, "epoch": 0.9548487945650214, "flos": 27490157130240.0, "grad_norm": 2.145841013070649, "language_loss": 0.76254189, "learning_rate": 2.129480423941987e-08, "loss": 0.78397322, "num_input_tokens_seen": 171467645, "step": 7941, "time_per_iteration": 2.686358690261841 }, { "auxiliary_loss_clip": 0.01106204, "auxiliary_loss_mlp": 0.01038892, "balance_loss_clip": 1.03896761, "balance_loss_mlp": 1.02399111, "epoch": 0.9549690374556604, "flos": 22273198819200.0, "grad_norm": 1.7124781006411838, "language_loss": 0.80345345, "learning_rate": 2.1181584009140052e-08, "loss": 0.82490444, "num_input_tokens_seen": 171487185, "step": 7942, "time_per_iteration": 2.63545560836792 }, { "auxiliary_loss_clip": 0.01103561, "auxiliary_loss_mlp": 0.01030748, "balance_loss_clip": 1.04232478, "balance_loss_mlp": 1.01707435, "epoch": 0.9550892803462995, "flos": 17595294888960.0, "grad_norm": 2.1622043750586957, "language_loss": 0.84154731, "learning_rate": 2.10686639640405e-08, "loss": 0.86289036, "num_input_tokens_seen": 171501275, "step": 7943, "time_per_iteration": 2.6833107471466064 }, { "auxiliary_loss_clip": 0.01119533, "auxiliary_loss_mlp": 0.01035124, "balance_loss_clip": 1.04118586, "balance_loss_mlp": 1.01919794, "epoch": 0.9552095232369386, "flos": 24353144789760.0, "grad_norm": 1.627242324525162, "language_loss": 0.8131972, "learning_rate": 2.0956044121251294e-08, "loss": 0.8347438, "num_input_tokens_seen": 171520060, "step": 7944, "time_per_iteration": 2.707801103591919 }, { "auxiliary_loss_clip": 0.01093934, "auxiliary_loss_mlp": 0.01039882, "balance_loss_clip": 1.04117298, "balance_loss_mlp": 1.02275181, "epoch": 0.9553297661275777, "flos": 22746860490240.0, "grad_norm": 1.6876686424600906, "language_loss": 0.8103562, "learning_rate": 2.084372449785654e-08, "loss": 0.83169436, "num_input_tokens_seen": 171539895, "step": 7945, "time_per_iteration": 2.6697328090667725 }, { "auxiliary_loss_clip": 0.0110255, "auxiliary_loss_mlp": 0.01045933, "balance_loss_clip": 1.03980446, "balance_loss_mlp": 1.02892208, "epoch": 0.9554500090182168, "flos": 15413866018560.0, "grad_norm": 1.691324681555355, "language_loss": 0.69127923, "learning_rate": 2.0731705110895282e-08, "loss": 0.71276414, "num_input_tokens_seen": 171557385, "step": 7946, "time_per_iteration": 2.592062473297119 }, { "auxiliary_loss_clip": 0.0112155, "auxiliary_loss_mlp": 0.0104928, "balance_loss_clip": 1.04150295, "balance_loss_mlp": 1.03299642, "epoch": 0.9555702519088559, "flos": 23513517400320.0, "grad_norm": 1.9253125733419463, "language_loss": 0.86581957, "learning_rate": 2.0619985977360587e-08, "loss": 0.88752788, "num_input_tokens_seen": 171575705, "step": 7947, "time_per_iteration": 2.6049044132232666 }, { "auxiliary_loss_clip": 0.01077294, "auxiliary_loss_mlp": 0.01034349, "balance_loss_clip": 1.03460312, "balance_loss_mlp": 1.01973367, "epoch": 0.955690494799495, "flos": 22962072827520.0, "grad_norm": 1.70143855434608, "language_loss": 0.77086484, "learning_rate": 2.0508567114200237e-08, "loss": 0.79198122, "num_input_tokens_seen": 171595620, "step": 7948, "time_per_iteration": 2.716525077819824 }, { "auxiliary_loss_clip": 0.01105575, "auxiliary_loss_mlp": 0.01030766, "balance_loss_clip": 1.03978872, "balance_loss_mlp": 1.01656783, "epoch": 0.955810737690134, "flos": 26031250333440.0, "grad_norm": 2.7027528570401373, "language_loss": 0.7872169, "learning_rate": 2.0397448538316485e-08, "loss": 0.80858028, "num_input_tokens_seen": 171616660, "step": 7949, "time_per_iteration": 2.6721038818359375 }, { "auxiliary_loss_clip": 0.01093049, "auxiliary_loss_mlp": 0.01039391, "balance_loss_clip": 1.04122257, "balance_loss_mlp": 1.02441812, "epoch": 0.9559309805807732, "flos": 20849951249280.0, "grad_norm": 2.1980065659021966, "language_loss": 0.66685051, "learning_rate": 2.028663026656563e-08, "loss": 0.6881749, "num_input_tokens_seen": 171635515, "step": 7950, "time_per_iteration": 2.694375514984131 }, { "auxiliary_loss_clip": 0.01129611, "auxiliary_loss_mlp": 0.00771497, "balance_loss_clip": 1.04186761, "balance_loss_mlp": 1.00056314, "epoch": 0.9560512234714122, "flos": 21578219498880.0, "grad_norm": 2.4437589811574316, "language_loss": 0.71714622, "learning_rate": 2.0176112315758885e-08, "loss": 0.7361573, "num_input_tokens_seen": 171653305, "step": 7951, "time_per_iteration": 2.5431177616119385 }, { "auxiliary_loss_clip": 0.01094493, "auxiliary_loss_mlp": 0.01038901, "balance_loss_clip": 1.0419147, "balance_loss_mlp": 1.02291536, "epoch": 0.9561714663620513, "flos": 17450144029440.0, "grad_norm": 2.609114064481778, "language_loss": 0.69402331, "learning_rate": 2.0065894702661957e-08, "loss": 0.71535724, "num_input_tokens_seen": 171669980, "step": 7952, "time_per_iteration": 2.6426589488983154 }, { "auxiliary_loss_clip": 0.01084212, "auxiliary_loss_mlp": 0.00772536, "balance_loss_clip": 1.03519654, "balance_loss_mlp": 1.00043297, "epoch": 0.9562917092526905, "flos": 26098510550400.0, "grad_norm": 1.7818962830725724, "language_loss": 0.77871454, "learning_rate": 1.9955977443994577e-08, "loss": 0.79728198, "num_input_tokens_seen": 171689970, "step": 7953, "time_per_iteration": 2.691837787628174 }, { "auxiliary_loss_clip": 0.01105798, "auxiliary_loss_mlp": 0.01052053, "balance_loss_clip": 1.03802204, "balance_loss_mlp": 1.03333771, "epoch": 0.9564119521433295, "flos": 24096742531200.0, "grad_norm": 2.7580228469619796, "language_loss": 0.62289786, "learning_rate": 1.9846360556430965e-08, "loss": 0.64447635, "num_input_tokens_seen": 171708270, "step": 7954, "time_per_iteration": 2.6494550704956055 }, { "auxiliary_loss_clip": 0.01130317, "auxiliary_loss_mlp": 0.01036129, "balance_loss_clip": 1.04185545, "balance_loss_mlp": 1.02149618, "epoch": 0.9565321950339686, "flos": 32008903896960.0, "grad_norm": 2.290648712264064, "language_loss": 0.6166414, "learning_rate": 1.973704405660004e-08, "loss": 0.6383059, "num_input_tokens_seen": 171729385, "step": 7955, "time_per_iteration": 2.620361804962158 }, { "auxiliary_loss_clip": 0.01067953, "auxiliary_loss_mlp": 0.01036227, "balance_loss_clip": 1.03499007, "balance_loss_mlp": 1.02171898, "epoch": 0.9566524379246077, "flos": 23588642695680.0, "grad_norm": 1.4556098127392876, "language_loss": 0.78042334, "learning_rate": 1.9628027961085203e-08, "loss": 0.80146515, "num_input_tokens_seen": 171752615, "step": 7956, "time_per_iteration": 2.8054845333099365 }, { "auxiliary_loss_clip": 0.01083321, "auxiliary_loss_mlp": 0.01039993, "balance_loss_clip": 1.03560495, "balance_loss_mlp": 1.02275538, "epoch": 0.9567726808152468, "flos": 38067716240640.0, "grad_norm": 2.0130170775754315, "language_loss": 0.84495175, "learning_rate": 1.9519312286423894e-08, "loss": 0.86618495, "num_input_tokens_seen": 171775810, "step": 7957, "time_per_iteration": 3.756032705307007 }, { "auxiliary_loss_clip": 0.01116839, "auxiliary_loss_mlp": 0.0103889, "balance_loss_clip": 1.04094315, "balance_loss_mlp": 1.02407253, "epoch": 0.9568929237058859, "flos": 22744059229440.0, "grad_norm": 1.7562371869230307, "language_loss": 0.77805686, "learning_rate": 1.9410897049108255e-08, "loss": 0.79961413, "num_input_tokens_seen": 171795090, "step": 7958, "time_per_iteration": 2.590355396270752 }, { "auxiliary_loss_clip": 0.01138942, "auxiliary_loss_mlp": 0.01041476, "balance_loss_clip": 1.04689515, "balance_loss_mlp": 1.02610958, "epoch": 0.957013166596525, "flos": 23841633162240.0, "grad_norm": 1.8544546658360175, "language_loss": 0.91452193, "learning_rate": 1.9302782265584905e-08, "loss": 0.93632615, "num_input_tokens_seen": 171815755, "step": 7959, "time_per_iteration": 3.514275550842285 }, { "auxiliary_loss_clip": 0.01072135, "auxiliary_loss_mlp": 0.01037017, "balance_loss_clip": 1.03739119, "balance_loss_mlp": 1.02237856, "epoch": 0.9571334094871641, "flos": 17639286071040.0, "grad_norm": 2.0763479017361157, "language_loss": 0.87231219, "learning_rate": 1.9194967952254282e-08, "loss": 0.89340371, "num_input_tokens_seen": 171834330, "step": 7960, "time_per_iteration": 2.632399797439575 }, { "auxiliary_loss_clip": 0.01117655, "auxiliary_loss_mlp": 0.01042229, "balance_loss_clip": 1.04095411, "balance_loss_mlp": 1.02596903, "epoch": 0.9572536523778031, "flos": 15369623441280.0, "grad_norm": 2.250552145229752, "language_loss": 0.803985, "learning_rate": 1.9087454125472635e-08, "loss": 0.82558382, "num_input_tokens_seen": 171848805, "step": 7961, "time_per_iteration": 2.5527966022491455 }, { "auxiliary_loss_clip": 0.01133512, "auxiliary_loss_mlp": 0.01044268, "balance_loss_clip": 1.04279697, "balance_loss_mlp": 1.02809107, "epoch": 0.9573738952684423, "flos": 24969838417920.0, "grad_norm": 2.041422926719756, "language_loss": 0.78300023, "learning_rate": 1.8980240801548696e-08, "loss": 0.80477804, "num_input_tokens_seen": 171867995, "step": 7962, "time_per_iteration": 2.6041460037231445 }, { "auxiliary_loss_clip": 0.01104602, "auxiliary_loss_mlp": 0.01036828, "balance_loss_clip": 1.0408715, "balance_loss_mlp": 1.02186096, "epoch": 0.9574941381590814, "flos": 25769461034880.0, "grad_norm": 1.7393712433400865, "language_loss": 0.74033177, "learning_rate": 1.8873327996747458e-08, "loss": 0.76174605, "num_input_tokens_seen": 171886495, "step": 7963, "time_per_iteration": 3.6484532356262207 }, { "auxiliary_loss_clip": 0.01122501, "auxiliary_loss_mlp": 0.01040698, "balance_loss_clip": 1.04217362, "balance_loss_mlp": 1.02517736, "epoch": 0.9576143810497204, "flos": 32307178435200.0, "grad_norm": 1.888345540234925, "language_loss": 0.66092384, "learning_rate": 1.8766715727287053e-08, "loss": 0.68255579, "num_input_tokens_seen": 171908200, "step": 7964, "time_per_iteration": 2.6834254264831543 }, { "auxiliary_loss_clip": 0.01123915, "auxiliary_loss_mlp": 0.0077197, "balance_loss_clip": 1.04205751, "balance_loss_mlp": 1.0005157, "epoch": 0.9577346239403596, "flos": 27745733376000.0, "grad_norm": 1.8540340508442534, "language_loss": 0.79379791, "learning_rate": 1.8660404009340546e-08, "loss": 0.81275678, "num_input_tokens_seen": 171928650, "step": 7965, "time_per_iteration": 3.503925323486328 }, { "auxiliary_loss_clip": 0.01026174, "auxiliary_loss_mlp": 0.01000421, "balance_loss_clip": 1.00738919, "balance_loss_mlp": 0.99881202, "epoch": 0.9578548668309986, "flos": 57468313710720.0, "grad_norm": 0.8720838027659498, "language_loss": 0.59533, "learning_rate": 1.8554392859035485e-08, "loss": 0.615596, "num_input_tokens_seen": 171986400, "step": 7966, "time_per_iteration": 3.153413772583008 }, { "auxiliary_loss_clip": 0.01060742, "auxiliary_loss_mlp": 0.01043401, "balance_loss_clip": 1.03499246, "balance_loss_mlp": 1.02749801, "epoch": 0.9579751097216377, "flos": 19756040503680.0, "grad_norm": 1.706466865576179, "language_loss": 0.78979409, "learning_rate": 1.8448682292453444e-08, "loss": 0.8108356, "num_input_tokens_seen": 172005475, "step": 7967, "time_per_iteration": 2.868413209915161 }, { "auxiliary_loss_clip": 0.01133984, "auxiliary_loss_mlp": 0.01035233, "balance_loss_clip": 1.04541206, "balance_loss_mlp": 1.02016509, "epoch": 0.9580953526122769, "flos": 18041270152320.0, "grad_norm": 4.533876903033421, "language_loss": 0.66439331, "learning_rate": 1.8343272325631154e-08, "loss": 0.68608546, "num_input_tokens_seen": 172024420, "step": 7968, "time_per_iteration": 3.039902687072754 }, { "auxiliary_loss_clip": 0.01064178, "auxiliary_loss_mlp": 0.00772783, "balance_loss_clip": 1.03775644, "balance_loss_mlp": 1.00052667, "epoch": 0.9582155955029159, "flos": 24270154416000.0, "grad_norm": 2.1324029005512903, "language_loss": 0.78030401, "learning_rate": 1.8238162974558492e-08, "loss": 0.79867357, "num_input_tokens_seen": 172038350, "step": 7969, "time_per_iteration": 2.8701062202453613 }, { "auxiliary_loss_clip": 0.01102346, "auxiliary_loss_mlp": 0.01034775, "balance_loss_clip": 1.03991389, "balance_loss_mlp": 1.02060151, "epoch": 0.958335838393555, "flos": 22783309816320.0, "grad_norm": 2.036688013554339, "language_loss": 0.7493757, "learning_rate": 1.8133354255181144e-08, "loss": 0.77074695, "num_input_tokens_seen": 172058665, "step": 7970, "time_per_iteration": 2.632467031478882 }, { "auxiliary_loss_clip": 0.01111801, "auxiliary_loss_mlp": 0.0104204, "balance_loss_clip": 1.03901291, "balance_loss_mlp": 1.02655506, "epoch": 0.958456081284194, "flos": 16911484698240.0, "grad_norm": 2.636104157898433, "language_loss": 0.74757951, "learning_rate": 1.802884618339795e-08, "loss": 0.76911795, "num_input_tokens_seen": 172077470, "step": 7971, "time_per_iteration": 2.582977533340454 }, { "auxiliary_loss_clip": 0.01120322, "auxiliary_loss_mlp": 0.01036277, "balance_loss_clip": 1.04171157, "balance_loss_mlp": 1.02101254, "epoch": 0.9585763241748332, "flos": 19974951941760.0, "grad_norm": 1.989370649544554, "language_loss": 0.80983281, "learning_rate": 1.7924638775062894e-08, "loss": 0.83139879, "num_input_tokens_seen": 172096590, "step": 7972, "time_per_iteration": 2.587050676345825 }, { "auxiliary_loss_clip": 0.01088068, "auxiliary_loss_mlp": 0.01032474, "balance_loss_clip": 1.03838134, "balance_loss_mlp": 1.01772833, "epoch": 0.9586965670654722, "flos": 21395649646080.0, "grad_norm": 2.1576529298530858, "language_loss": 0.81342924, "learning_rate": 1.7820732045984444e-08, "loss": 0.83463466, "num_input_tokens_seen": 172116735, "step": 7973, "time_per_iteration": 2.8377528190612793 }, { "auxiliary_loss_clip": 0.01117573, "auxiliary_loss_mlp": 0.01038422, "balance_loss_clip": 1.04080749, "balance_loss_mlp": 1.02192366, "epoch": 0.9588168099561113, "flos": 21435115714560.0, "grad_norm": 1.7428556404648483, "language_loss": 0.73934889, "learning_rate": 1.7717126011924655e-08, "loss": 0.76090884, "num_input_tokens_seen": 172138320, "step": 7974, "time_per_iteration": 2.673259735107422 }, { "auxiliary_loss_clip": 0.01074207, "auxiliary_loss_mlp": 0.01046514, "balance_loss_clip": 1.03411162, "balance_loss_mlp": 1.02989602, "epoch": 0.9589370528467505, "flos": 11763761852160.0, "grad_norm": 2.5101611371528048, "language_loss": 0.76303029, "learning_rate": 1.7613820688600957e-08, "loss": 0.7842375, "num_input_tokens_seen": 172154225, "step": 7975, "time_per_iteration": 2.6330525875091553 }, { "auxiliary_loss_clip": 0.01112418, "auxiliary_loss_mlp": 0.01032278, "balance_loss_clip": 1.0423466, "balance_loss_mlp": 1.01780558, "epoch": 0.9590572957373895, "flos": 23441516588160.0, "grad_norm": 1.7564310756791668, "language_loss": 0.78483415, "learning_rate": 1.7510816091684588e-08, "loss": 0.80628109, "num_input_tokens_seen": 172174150, "step": 7976, "time_per_iteration": 2.653733968734741 }, { "auxiliary_loss_clip": 0.01112689, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.04252434, "balance_loss_mlp": 1.0212816, "epoch": 0.9591775386280286, "flos": 22528272274560.0, "grad_norm": 2.961760277138041, "language_loss": 0.78655434, "learning_rate": 1.740811223680083e-08, "loss": 0.80805516, "num_input_tokens_seen": 172191005, "step": 7977, "time_per_iteration": 2.623176097869873 }, { "auxiliary_loss_clip": 0.01133062, "auxiliary_loss_mlp": 0.01033257, "balance_loss_clip": 1.0447998, "balance_loss_mlp": 1.01842093, "epoch": 0.9592977815186677, "flos": 18186959715840.0, "grad_norm": 3.7510546472931114, "language_loss": 0.73601294, "learning_rate": 1.7305709139530334e-08, "loss": 0.75767612, "num_input_tokens_seen": 172209785, "step": 7978, "time_per_iteration": 2.5622000694274902 }, { "auxiliary_loss_clip": 0.01111548, "auxiliary_loss_mlp": 0.01038121, "balance_loss_clip": 1.03846073, "balance_loss_mlp": 1.02243328, "epoch": 0.9594180244093068, "flos": 16537797555840.0, "grad_norm": 4.292342068313159, "language_loss": 0.75140142, "learning_rate": 1.7203606815407334e-08, "loss": 0.77289814, "num_input_tokens_seen": 172224380, "step": 7979, "time_per_iteration": 2.5375185012817383 }, { "auxiliary_loss_clip": 0.01114776, "auxiliary_loss_mlp": 0.01045876, "balance_loss_clip": 1.04312587, "balance_loss_mlp": 1.02878094, "epoch": 0.9595382672999458, "flos": 20554334317440.0, "grad_norm": 2.0274934555653923, "language_loss": 0.79671723, "learning_rate": 1.7101805279920557e-08, "loss": 0.81832373, "num_input_tokens_seen": 172242540, "step": 7980, "time_per_iteration": 2.6401479244232178 }, { "auxiliary_loss_clip": 0.01133162, "auxiliary_loss_mlp": 0.01046221, "balance_loss_clip": 1.04342103, "balance_loss_mlp": 1.0302465, "epoch": 0.959658510190585, "flos": 22638266697600.0, "grad_norm": 2.1973867346235973, "language_loss": 0.81233525, "learning_rate": 1.7000304548513643e-08, "loss": 0.8341291, "num_input_tokens_seen": 172262645, "step": 7981, "time_per_iteration": 2.57069993019104 }, { "auxiliary_loss_clip": 0.01089318, "auxiliary_loss_mlp": 0.01052041, "balance_loss_clip": 1.03567946, "balance_loss_mlp": 1.03408766, "epoch": 0.9597787530812241, "flos": 19135252725120.0, "grad_norm": 2.3709192349290347, "language_loss": 0.82957947, "learning_rate": 1.6899104636583394e-08, "loss": 0.85099304, "num_input_tokens_seen": 172280695, "step": 7982, "time_per_iteration": 2.654653310775757 }, { "auxiliary_loss_clip": 0.01026354, "auxiliary_loss_mlp": 0.01000654, "balance_loss_clip": 1.00741863, "balance_loss_mlp": 0.99878263, "epoch": 0.9598989959718631, "flos": 60098124055680.0, "grad_norm": 0.7287235852327131, "language_loss": 0.61925417, "learning_rate": 1.6798205559482638e-08, "loss": 0.63952428, "num_input_tokens_seen": 172343075, "step": 7983, "time_per_iteration": 4.29177451133728 }, { "auxiliary_loss_clip": 0.01095444, "auxiliary_loss_mlp": 0.01037705, "balance_loss_clip": 1.04081988, "balance_loss_mlp": 1.02255392, "epoch": 0.9600192388625023, "flos": 20886795624960.0, "grad_norm": 1.7689094266534706, "language_loss": 0.76696098, "learning_rate": 1.669760733251713e-08, "loss": 0.78829247, "num_input_tokens_seen": 172361950, "step": 7984, "time_per_iteration": 2.659034252166748 }, { "auxiliary_loss_clip": 0.01079195, "auxiliary_loss_mlp": 0.01038673, "balance_loss_clip": 1.03997624, "balance_loss_mlp": 1.02353299, "epoch": 0.9601394817531413, "flos": 20445740524800.0, "grad_norm": 1.7655056543160963, "language_loss": 0.82435679, "learning_rate": 1.659730997094755e-08, "loss": 0.8455354, "num_input_tokens_seen": 172380440, "step": 7985, "time_per_iteration": 3.5996079444885254 }, { "auxiliary_loss_clip": 0.01109217, "auxiliary_loss_mlp": 0.0103686, "balance_loss_clip": 1.03826225, "balance_loss_mlp": 1.02232218, "epoch": 0.9602597246437804, "flos": 21507152440320.0, "grad_norm": 2.9638371062656255, "language_loss": 0.6237787, "learning_rate": 1.6497313489989283e-08, "loss": 0.64523947, "num_input_tokens_seen": 172400265, "step": 7986, "time_per_iteration": 2.6073286533355713 }, { "auxiliary_loss_clip": 0.01079365, "auxiliary_loss_mlp": 0.01036743, "balance_loss_clip": 1.033131, "balance_loss_mlp": 1.02032793, "epoch": 0.9603799675344196, "flos": 29935099152000.0, "grad_norm": 2.4858745712983854, "language_loss": 0.69946468, "learning_rate": 1.639761790481131e-08, "loss": 0.72062576, "num_input_tokens_seen": 172421145, "step": 7987, "time_per_iteration": 2.7755627632141113 }, { "auxiliary_loss_clip": 0.01121745, "auxiliary_loss_mlp": 0.01040243, "balance_loss_clip": 1.04307699, "balance_loss_mlp": 1.02500844, "epoch": 0.9605002104250586, "flos": 28001525103360.0, "grad_norm": 2.1057384231519967, "language_loss": 0.79766846, "learning_rate": 1.6298223230537754e-08, "loss": 0.81928837, "num_input_tokens_seen": 172438945, "step": 7988, "time_per_iteration": 2.6449177265167236 }, { "auxiliary_loss_clip": 0.01104993, "auxiliary_loss_mlp": 0.0077261, "balance_loss_clip": 1.03902149, "balance_loss_mlp": 1.000471, "epoch": 0.9606204533156977, "flos": 35590490870400.0, "grad_norm": 2.3636301951827483, "language_loss": 0.69844121, "learning_rate": 1.619912948224611e-08, "loss": 0.71721721, "num_input_tokens_seen": 172460150, "step": 7989, "time_per_iteration": 3.769843816757202 }, { "auxiliary_loss_clip": 0.01088491, "auxiliary_loss_mlp": 0.0103813, "balance_loss_clip": 1.03911626, "balance_loss_mlp": 1.02268016, "epoch": 0.9607406962063368, "flos": 26574614346240.0, "grad_norm": 4.521793240431231, "language_loss": 0.60878134, "learning_rate": 1.6100336674969682e-08, "loss": 0.63004756, "num_input_tokens_seen": 172478990, "step": 7990, "time_per_iteration": 2.684624433517456 }, { "auxiliary_loss_clip": 0.01084372, "auxiliary_loss_mlp": 0.01047732, "balance_loss_clip": 1.03613651, "balance_loss_mlp": 1.03222919, "epoch": 0.9608609390969759, "flos": 25331781813120.0, "grad_norm": 2.286883235130231, "language_loss": 0.76368588, "learning_rate": 1.600184482369449e-08, "loss": 0.78500688, "num_input_tokens_seen": 172498905, "step": 7991, "time_per_iteration": 3.9401352405548096 }, { "auxiliary_loss_clip": 0.01095873, "auxiliary_loss_mlp": 0.01040462, "balance_loss_clip": 1.03806138, "balance_loss_mlp": 1.02396333, "epoch": 0.960981181987615, "flos": 21069114082560.0, "grad_norm": 4.1808353041487285, "language_loss": 0.89180028, "learning_rate": 1.5903653943362126e-08, "loss": 0.91316354, "num_input_tokens_seen": 172517900, "step": 7992, "time_per_iteration": 2.6580567359924316 }, { "auxiliary_loss_clip": 0.01109659, "auxiliary_loss_mlp": 0.01038661, "balance_loss_clip": 1.04353893, "balance_loss_mlp": 1.02395034, "epoch": 0.9611014248782541, "flos": 17823256554240.0, "grad_norm": 1.8303069449095413, "language_loss": 0.77180994, "learning_rate": 1.580576404886802e-08, "loss": 0.79329312, "num_input_tokens_seen": 172536430, "step": 7993, "time_per_iteration": 2.669532299041748 }, { "auxiliary_loss_clip": 0.01118556, "auxiliary_loss_mlp": 0.01036879, "balance_loss_clip": 1.04179645, "balance_loss_mlp": 1.02281225, "epoch": 0.9612216677688932, "flos": 19354631040000.0, "grad_norm": 2.1648720606798215, "language_loss": 0.79851341, "learning_rate": 1.570817515506162e-08, "loss": 0.82006776, "num_input_tokens_seen": 172555120, "step": 7994, "time_per_iteration": 2.5989232063293457 }, { "auxiliary_loss_clip": 0.01130608, "auxiliary_loss_mlp": 0.01032462, "balance_loss_clip": 1.04326487, "balance_loss_mlp": 1.0180614, "epoch": 0.9613419106595322, "flos": 15808739207040.0, "grad_norm": 2.067365597874101, "language_loss": 0.81476712, "learning_rate": 1.561088727674753e-08, "loss": 0.83639783, "num_input_tokens_seen": 172569330, "step": 7995, "time_per_iteration": 2.5867979526519775 }, { "auxiliary_loss_clip": 0.01096922, "auxiliary_loss_mlp": 0.01040664, "balance_loss_clip": 1.04068732, "balance_loss_mlp": 1.02293158, "epoch": 0.9614621535501714, "flos": 25702488126720.0, "grad_norm": 2.135350073664584, "language_loss": 0.71159923, "learning_rate": 1.551390042868417e-08, "loss": 0.73297513, "num_input_tokens_seen": 172591100, "step": 7996, "time_per_iteration": 2.7760233879089355 }, { "auxiliary_loss_clip": 0.0111681, "auxiliary_loss_mlp": 0.010454, "balance_loss_clip": 1.04042149, "balance_loss_mlp": 1.02951527, "epoch": 0.9615823964408104, "flos": 17819054663040.0, "grad_norm": 2.7679348333988303, "language_loss": 0.70957279, "learning_rate": 1.5417214625584207e-08, "loss": 0.73119491, "num_input_tokens_seen": 172608755, "step": 7997, "time_per_iteration": 2.7035932540893555 }, { "auxiliary_loss_clip": 0.01116345, "auxiliary_loss_mlp": 0.01040216, "balance_loss_clip": 1.04120874, "balance_loss_mlp": 1.02430177, "epoch": 0.9617026393314495, "flos": 20190020624640.0, "grad_norm": 1.6007401019381589, "language_loss": 0.85032403, "learning_rate": 1.5320829882114806e-08, "loss": 0.87188965, "num_input_tokens_seen": 172626830, "step": 7998, "time_per_iteration": 2.5835189819335938 }, { "auxiliary_loss_clip": 0.01130713, "auxiliary_loss_mlp": 0.01040163, "balance_loss_clip": 1.04109883, "balance_loss_mlp": 1.02523756, "epoch": 0.9618228822220887, "flos": 20267013427200.0, "grad_norm": 2.04198908084535, "language_loss": 0.79185653, "learning_rate": 1.5224746212897378e-08, "loss": 0.81356525, "num_input_tokens_seen": 172646125, "step": 7999, "time_per_iteration": 2.5777060985565186 }, { "auxiliary_loss_clip": 0.01131295, "auxiliary_loss_mlp": 0.01034217, "balance_loss_clip": 1.04466844, "balance_loss_mlp": 1.01876783, "epoch": 0.9619431251127277, "flos": 21031300039680.0, "grad_norm": 1.6511907599347038, "language_loss": 0.77127916, "learning_rate": 1.512896363250804e-08, "loss": 0.7929343, "num_input_tokens_seen": 172666235, "step": 8000, "time_per_iteration": 2.7284576892852783 }, { "auxiliary_loss_clip": 0.01120252, "auxiliary_loss_mlp": 0.01031979, "balance_loss_clip": 1.0409615, "balance_loss_mlp": 1.01784062, "epoch": 0.9620633680033668, "flos": 22382654538240.0, "grad_norm": 2.030575749544029, "language_loss": 0.75481546, "learning_rate": 1.503348215547673e-08, "loss": 0.7763378, "num_input_tokens_seen": 172687325, "step": 8001, "time_per_iteration": 2.619903087615967 }, { "auxiliary_loss_clip": 0.01102783, "auxiliary_loss_mlp": 0.01039957, "balance_loss_clip": 1.0385232, "balance_loss_mlp": 1.02536523, "epoch": 0.962183610894006, "flos": 18471730740480.0, "grad_norm": 2.0113153149968364, "language_loss": 0.80620241, "learning_rate": 1.4938301796288078e-08, "loss": 0.8276298, "num_input_tokens_seen": 172703895, "step": 8002, "time_per_iteration": 2.5963945388793945 }, { "auxiliary_loss_clip": 0.0113326, "auxiliary_loss_mlp": 0.01042238, "balance_loss_clip": 1.04384565, "balance_loss_mlp": 1.02570379, "epoch": 0.962303853784645, "flos": 18435245500800.0, "grad_norm": 4.414878921077993, "language_loss": 0.81601977, "learning_rate": 1.4843422569380537e-08, "loss": 0.83777475, "num_input_tokens_seen": 172720650, "step": 8003, "time_per_iteration": 2.6078860759735107 }, { "auxiliary_loss_clip": 0.01077932, "auxiliary_loss_mlp": 0.01038677, "balance_loss_clip": 1.03638387, "balance_loss_mlp": 1.02335906, "epoch": 0.9624240966752841, "flos": 26391074826240.0, "grad_norm": 1.8863291588162456, "language_loss": 0.82772815, "learning_rate": 1.4748844489147483e-08, "loss": 0.84889424, "num_input_tokens_seen": 172737640, "step": 8004, "time_per_iteration": 2.6950292587280273 }, { "auxiliary_loss_clip": 0.01104715, "auxiliary_loss_mlp": 0.01041142, "balance_loss_clip": 1.03670907, "balance_loss_mlp": 1.02600217, "epoch": 0.9625443395659231, "flos": 14647675985280.0, "grad_norm": 2.11345430696814, "language_loss": 0.70470613, "learning_rate": 1.4654567569936326e-08, "loss": 0.7261647, "num_input_tokens_seen": 172755215, "step": 8005, "time_per_iteration": 2.6132659912109375 }, { "auxiliary_loss_clip": 0.01080813, "auxiliary_loss_mlp": 0.01049043, "balance_loss_clip": 1.03797007, "balance_loss_mlp": 1.03381395, "epoch": 0.9626645824565623, "flos": 18367626147840.0, "grad_norm": 2.9523616858513977, "language_loss": 0.83205736, "learning_rate": 1.456059182604874e-08, "loss": 0.85335588, "num_input_tokens_seen": 172774020, "step": 8006, "time_per_iteration": 2.675915241241455 }, { "auxiliary_loss_clip": 0.01133597, "auxiliary_loss_mlp": 0.01034659, "balance_loss_clip": 1.0442524, "balance_loss_mlp": 1.01904857, "epoch": 0.9627848253472013, "flos": 16580424021120.0, "grad_norm": 2.0606185818463154, "language_loss": 0.7660827, "learning_rate": 1.4466917271740653e-08, "loss": 0.78776526, "num_input_tokens_seen": 172792220, "step": 8007, "time_per_iteration": 2.5314505100250244 }, { "auxiliary_loss_clip": 0.01105476, "auxiliary_loss_mlp": 0.01043277, "balance_loss_clip": 1.03973198, "balance_loss_mlp": 1.02518153, "epoch": 0.9629050682378404, "flos": 20886867452160.0, "grad_norm": 2.4612614050751, "language_loss": 0.67753494, "learning_rate": 1.4373543921222697e-08, "loss": 0.69902247, "num_input_tokens_seen": 172811805, "step": 8008, "time_per_iteration": 2.624328136444092 }, { "auxiliary_loss_clip": 0.01104535, "auxiliary_loss_mlp": 0.0103525, "balance_loss_clip": 1.04070759, "balance_loss_mlp": 1.01941895, "epoch": 0.9630253111284796, "flos": 17019252478080.0, "grad_norm": 1.9732791565997208, "language_loss": 0.78002405, "learning_rate": 1.428047178865932e-08, "loss": 0.80142188, "num_input_tokens_seen": 172828595, "step": 8009, "time_per_iteration": 3.5531809329986572 }, { "auxiliary_loss_clip": 0.01101881, "auxiliary_loss_mlp": 0.01045865, "balance_loss_clip": 1.03681791, "balance_loss_mlp": 1.02862728, "epoch": 0.9631455540191186, "flos": 20338942412160.0, "grad_norm": 1.5726223641535948, "language_loss": 0.74389756, "learning_rate": 1.4187700888169451e-08, "loss": 0.76537502, "num_input_tokens_seen": 172847770, "step": 8010, "time_per_iteration": 2.666123390197754 }, { "auxiliary_loss_clip": 0.01026369, "auxiliary_loss_mlp": 0.01002088, "balance_loss_clip": 1.00812876, "balance_loss_mlp": 1.00057971, "epoch": 0.9632657969097577, "flos": 65956700033280.0, "grad_norm": 0.7526188061306595, "language_loss": 0.56972289, "learning_rate": 1.40952312338265e-08, "loss": 0.59000742, "num_input_tokens_seen": 172912415, "step": 8011, "time_per_iteration": 4.182179689407349 }, { "auxiliary_loss_clip": 0.01094466, "auxiliary_loss_mlp": 0.01035217, "balance_loss_clip": 1.0371238, "balance_loss_mlp": 1.01979733, "epoch": 0.9633860398003968, "flos": 44419523823360.0, "grad_norm": 1.8443017358874207, "language_loss": 0.68681729, "learning_rate": 1.4003062839657909e-08, "loss": 0.70811415, "num_input_tokens_seen": 172934895, "step": 8012, "time_per_iteration": 2.8391261100769043 }, { "auxiliary_loss_clip": 0.01096416, "auxiliary_loss_mlp": 0.01040704, "balance_loss_clip": 1.03893256, "balance_loss_mlp": 1.02639937, "epoch": 0.9635062826910359, "flos": 24827704300800.0, "grad_norm": 1.69529860072822, "language_loss": 0.80176622, "learning_rate": 1.391119571964583e-08, "loss": 0.8231374, "num_input_tokens_seen": 172955835, "step": 8013, "time_per_iteration": 2.8155720233917236 }, { "auxiliary_loss_clip": 0.01115923, "auxiliary_loss_mlp": 0.01035263, "balance_loss_clip": 1.0402801, "balance_loss_mlp": 1.02112448, "epoch": 0.9636265255816749, "flos": 15961360095360.0, "grad_norm": 2.0175515582414305, "language_loss": 0.72933066, "learning_rate": 1.3819629887726225e-08, "loss": 0.75084245, "num_input_tokens_seen": 172973925, "step": 8014, "time_per_iteration": 2.5727059841156006 }, { "auxiliary_loss_clip": 0.01110078, "auxiliary_loss_mlp": 0.0103884, "balance_loss_clip": 1.04051816, "balance_loss_mlp": 1.02236545, "epoch": 0.9637467684723141, "flos": 22601781457920.0, "grad_norm": 2.3340906347238533, "language_loss": 0.76455194, "learning_rate": 1.3728365357789317e-08, "loss": 0.78604108, "num_input_tokens_seen": 172993290, "step": 8015, "time_per_iteration": 3.749282121658325 }, { "auxiliary_loss_clip": 0.01061148, "auxiliary_loss_mlp": 0.01042668, "balance_loss_clip": 1.03572202, "balance_loss_mlp": 1.02652073, "epoch": 0.9638670113629532, "flos": 17565812801280.0, "grad_norm": 2.5665720070448232, "language_loss": 0.76974797, "learning_rate": 1.3637402143680254e-08, "loss": 0.79078615, "num_input_tokens_seen": 173008190, "step": 8016, "time_per_iteration": 2.699835777282715 }, { "auxiliary_loss_clip": 0.01003392, "auxiliary_loss_mlp": 0.01002705, "balance_loss_clip": 1.00862741, "balance_loss_mlp": 1.00117958, "epoch": 0.9639872542535922, "flos": 55072139379840.0, "grad_norm": 0.7224480238750998, "language_loss": 0.55011094, "learning_rate": 1.3546740259197998e-08, "loss": 0.57017189, "num_input_tokens_seen": 173061000, "step": 8017, "time_per_iteration": 4.126639366149902 }, { "auxiliary_loss_clip": 0.01108014, "auxiliary_loss_mlp": 0.01034155, "balance_loss_clip": 1.03947687, "balance_loss_mlp": 1.01863384, "epoch": 0.9641074971442314, "flos": 24134484746880.0, "grad_norm": 2.0958040623927237, "language_loss": 0.70379698, "learning_rate": 1.3456379718095989e-08, "loss": 0.72521865, "num_input_tokens_seen": 173081415, "step": 8018, "time_per_iteration": 2.634667158126831 }, { "auxiliary_loss_clip": 0.01013062, "auxiliary_loss_mlp": 0.01011754, "balance_loss_clip": 1.00581384, "balance_loss_mlp": 1.00985873, "epoch": 0.9642277400348704, "flos": 66747416077440.0, "grad_norm": 0.842536709233796, "language_loss": 0.61998093, "learning_rate": 1.3366320534081487e-08, "loss": 0.64022911, "num_input_tokens_seen": 173144095, "step": 8019, "time_per_iteration": 3.2495622634887695 }, { "auxiliary_loss_clip": 0.011189, "auxiliary_loss_mlp": 0.01040108, "balance_loss_clip": 1.04192209, "balance_loss_mlp": 1.02406228, "epoch": 0.9643479829255095, "flos": 30920272450560.0, "grad_norm": 2.1130191471252067, "language_loss": 0.7534017, "learning_rate": 1.3276562720816675e-08, "loss": 0.77499175, "num_input_tokens_seen": 173165605, "step": 8020, "time_per_iteration": 2.7347187995910645 }, { "auxiliary_loss_clip": 0.01134508, "auxiliary_loss_mlp": 0.01040389, "balance_loss_clip": 1.04357076, "balance_loss_mlp": 1.02502286, "epoch": 0.9644682258161487, "flos": 20048245643520.0, "grad_norm": 2.7691750444494567, "language_loss": 0.82336318, "learning_rate": 1.3187106291917549e-08, "loss": 0.8451122, "num_input_tokens_seen": 173182595, "step": 8021, "time_per_iteration": 2.5906660556793213 }, { "auxiliary_loss_clip": 0.01113047, "auxiliary_loss_mlp": 0.01039899, "balance_loss_clip": 1.04058802, "balance_loss_mlp": 1.02540267, "epoch": 0.9645884687067877, "flos": 21178713456000.0, "grad_norm": 1.9970080913945618, "language_loss": 0.7036227, "learning_rate": 1.309795126095503e-08, "loss": 0.72515213, "num_input_tokens_seen": 173200895, "step": 8022, "time_per_iteration": 2.5938432216644287 }, { "auxiliary_loss_clip": 0.01052019, "auxiliary_loss_mlp": 0.01033786, "balance_loss_clip": 1.03416038, "balance_loss_mlp": 1.01859868, "epoch": 0.9647087115974268, "flos": 18945967029120.0, "grad_norm": 2.0408350402684063, "language_loss": 0.80647421, "learning_rate": 1.3009097641453192e-08, "loss": 0.82733226, "num_input_tokens_seen": 173218745, "step": 8023, "time_per_iteration": 2.7771852016448975 }, { "auxiliary_loss_clip": 0.01103913, "auxiliary_loss_mlp": 0.01036304, "balance_loss_clip": 1.04034233, "balance_loss_mlp": 1.02137923, "epoch": 0.9648289544880659, "flos": 16545088016640.0, "grad_norm": 2.5917943319642647, "language_loss": 0.76254779, "learning_rate": 1.2920545446891474e-08, "loss": 0.78394991, "num_input_tokens_seen": 173235465, "step": 8024, "time_per_iteration": 2.7829432487487793 }, { "auxiliary_loss_clip": 0.01109274, "auxiliary_loss_mlp": 0.01040793, "balance_loss_clip": 1.04141104, "balance_loss_mlp": 1.02408004, "epoch": 0.964949197378705, "flos": 24057527857920.0, "grad_norm": 1.6481560112851865, "language_loss": 0.70504755, "learning_rate": 1.2832294690703127e-08, "loss": 0.72654819, "num_input_tokens_seen": 173254440, "step": 8025, "time_per_iteration": 2.7276368141174316 }, { "auxiliary_loss_clip": 0.01116166, "auxiliary_loss_mlp": 0.01036091, "balance_loss_clip": 1.04145801, "balance_loss_mlp": 1.02086759, "epoch": 0.965069440269344, "flos": 23365565280000.0, "grad_norm": 2.196159461088921, "language_loss": 0.77523619, "learning_rate": 1.2744345386275668e-08, "loss": 0.79675877, "num_input_tokens_seen": 173273980, "step": 8026, "time_per_iteration": 2.6205668449401855 }, { "auxiliary_loss_clip": 0.01112824, "auxiliary_loss_mlp": 0.01044472, "balance_loss_clip": 1.04206812, "balance_loss_mlp": 1.02938008, "epoch": 0.9651896831599832, "flos": 25374875155200.0, "grad_norm": 1.5753160790335072, "language_loss": 0.78597021, "learning_rate": 1.265669754695109e-08, "loss": 0.80754316, "num_input_tokens_seen": 173293550, "step": 8027, "time_per_iteration": 2.7077879905700684 }, { "auxiliary_loss_clip": 0.01072079, "auxiliary_loss_mlp": 0.0104025, "balance_loss_clip": 1.03730905, "balance_loss_mlp": 1.02396631, "epoch": 0.9653099260506223, "flos": 22272875596800.0, "grad_norm": 2.053405626095139, "language_loss": 0.8203482, "learning_rate": 1.2569351186025201e-08, "loss": 0.84147149, "num_input_tokens_seen": 173312005, "step": 8028, "time_per_iteration": 2.714824914932251 }, { "auxiliary_loss_clip": 0.01081826, "auxiliary_loss_mlp": 0.01042186, "balance_loss_clip": 1.03461325, "balance_loss_mlp": 1.02684367, "epoch": 0.9654301689412613, "flos": 26760847386240.0, "grad_norm": 2.0262475003018445, "language_loss": 0.75408471, "learning_rate": 1.2482306316748737e-08, "loss": 0.77532482, "num_input_tokens_seen": 173332450, "step": 8029, "time_per_iteration": 2.6640658378601074 }, { "auxiliary_loss_clip": 0.0112164, "auxiliary_loss_mlp": 0.01032003, "balance_loss_clip": 1.03945172, "balance_loss_mlp": 1.0168041, "epoch": 0.9655504118319005, "flos": 17412689122560.0, "grad_norm": 1.9538238610585592, "language_loss": 0.78441113, "learning_rate": 1.2395562952326021e-08, "loss": 0.80594754, "num_input_tokens_seen": 173349610, "step": 8030, "time_per_iteration": 2.566842794418335 }, { "auxiliary_loss_clip": 0.01116449, "auxiliary_loss_mlp": 0.01040342, "balance_loss_clip": 1.0419755, "balance_loss_mlp": 1.02349758, "epoch": 0.9656706547225395, "flos": 22126970551680.0, "grad_norm": 5.252796976150356, "language_loss": 0.81105113, "learning_rate": 1.2309121105916309e-08, "loss": 0.83261907, "num_input_tokens_seen": 173367900, "step": 8031, "time_per_iteration": 2.6625759601593018 }, { "auxiliary_loss_clip": 0.011207, "auxiliary_loss_mlp": 0.01035697, "balance_loss_clip": 1.04139006, "balance_loss_mlp": 1.02073658, "epoch": 0.9657908976131786, "flos": 37049289926400.0, "grad_norm": 1.8656347273964256, "language_loss": 0.6938324, "learning_rate": 1.222298079063222e-08, "loss": 0.7153964, "num_input_tokens_seen": 173389040, "step": 8032, "time_per_iteration": 2.7568910121917725 }, { "auxiliary_loss_clip": 0.01117169, "auxiliary_loss_mlp": 0.01035785, "balance_loss_clip": 1.0406518, "balance_loss_mlp": 1.02080059, "epoch": 0.9659111405038178, "flos": 24389809597440.0, "grad_norm": 1.9591163870719117, "language_loss": 0.72665465, "learning_rate": 1.2137142019541524e-08, "loss": 0.7481842, "num_input_tokens_seen": 173407595, "step": 8033, "time_per_iteration": 2.5884830951690674 }, { "auxiliary_loss_clip": 0.01110865, "auxiliary_loss_mlp": 0.01044234, "balance_loss_clip": 1.04055381, "balance_loss_mlp": 1.02849841, "epoch": 0.9660313833944568, "flos": 25009412227200.0, "grad_norm": 2.30018066001037, "language_loss": 0.7322185, "learning_rate": 1.2051604805666027e-08, "loss": 0.75376952, "num_input_tokens_seen": 173424720, "step": 8034, "time_per_iteration": 2.6516377925872803 }, { "auxiliary_loss_clip": 0.01130463, "auxiliary_loss_mlp": 0.00771851, "balance_loss_clip": 1.04265094, "balance_loss_mlp": 1.00058031, "epoch": 0.9661516262850959, "flos": 11801575895040.0, "grad_norm": 3.42275312239587, "language_loss": 0.78512877, "learning_rate": 1.196636916198135e-08, "loss": 0.80415189, "num_input_tokens_seen": 173442260, "step": 8035, "time_per_iteration": 3.4884033203125 }, { "auxiliary_loss_clip": 0.01134493, "auxiliary_loss_mlp": 0.01033501, "balance_loss_clip": 1.04300249, "balance_loss_mlp": 1.01865947, "epoch": 0.9662718691757349, "flos": 20047778766720.0, "grad_norm": 2.14705389655608, "language_loss": 0.77045274, "learning_rate": 1.1881435101418036e-08, "loss": 0.79213262, "num_input_tokens_seen": 173461675, "step": 8036, "time_per_iteration": 2.6737258434295654 }, { "auxiliary_loss_clip": 0.01021245, "auxiliary_loss_mlp": 0.01001829, "balance_loss_clip": 1.01243556, "balance_loss_mlp": 1.00032711, "epoch": 0.9663921120663741, "flos": 68027703517440.0, "grad_norm": 0.72158725940516, "language_loss": 0.65537298, "learning_rate": 1.1796802636860003e-08, "loss": 0.67560375, "num_input_tokens_seen": 173530205, "step": 8037, "time_per_iteration": 4.118625640869141 }, { "auxiliary_loss_clip": 0.01132572, "auxiliary_loss_mlp": 0.0104419, "balance_loss_clip": 1.04284775, "balance_loss_mlp": 1.02810264, "epoch": 0.9665123549570132, "flos": 26322916769280.0, "grad_norm": 1.9214798436304183, "language_loss": 0.73829079, "learning_rate": 1.1712471781146316e-08, "loss": 0.7600584, "num_input_tokens_seen": 173549540, "step": 8038, "time_per_iteration": 2.5546295642852783 }, { "auxiliary_loss_clip": 0.01127682, "auxiliary_loss_mlp": 0.01038457, "balance_loss_clip": 1.03996253, "balance_loss_mlp": 1.02366948, "epoch": 0.9666325978476522, "flos": 43941121557120.0, "grad_norm": 1.8075225007332736, "language_loss": 0.66987592, "learning_rate": 1.1628442547069628e-08, "loss": 0.69153726, "num_input_tokens_seen": 173571740, "step": 8039, "time_per_iteration": 2.931417465209961 }, { "auxiliary_loss_clip": 0.01121831, "auxiliary_loss_mlp": 0.00772471, "balance_loss_clip": 1.04100323, "balance_loss_mlp": 1.00057054, "epoch": 0.9667528407382914, "flos": 21543422198400.0, "grad_norm": 3.0181622219087125, "language_loss": 0.77334309, "learning_rate": 1.1544714947377521e-08, "loss": 0.7922861, "num_input_tokens_seen": 173589425, "step": 8040, "time_per_iteration": 2.566993474960327 }, { "auxiliary_loss_clip": 0.01134148, "auxiliary_loss_mlp": 0.01040897, "balance_loss_clip": 1.04388523, "balance_loss_mlp": 1.02491689, "epoch": 0.9668730836289304, "flos": 23878585278720.0, "grad_norm": 2.72887729939636, "language_loss": 0.69916344, "learning_rate": 1.1461288994770945e-08, "loss": 0.72091389, "num_input_tokens_seen": 173608500, "step": 8041, "time_per_iteration": 2.626894950866699 }, { "auxiliary_loss_clip": 0.01133545, "auxiliary_loss_mlp": 0.01041784, "balance_loss_clip": 1.04207349, "balance_loss_mlp": 1.02587008, "epoch": 0.9669933265195695, "flos": 28293011971200.0, "grad_norm": 2.0538885047039197, "language_loss": 0.77492654, "learning_rate": 1.1378164701906002e-08, "loss": 0.79667985, "num_input_tokens_seen": 173630265, "step": 8042, "time_per_iteration": 3.580273389816284 }, { "auxiliary_loss_clip": 0.01135853, "auxiliary_loss_mlp": 0.01037977, "balance_loss_clip": 1.04448712, "balance_loss_mlp": 1.02225327, "epoch": 0.9671135694102087, "flos": 22454763091200.0, "grad_norm": 2.4114066171181703, "language_loss": 0.66738981, "learning_rate": 1.1295342081392156e-08, "loss": 0.6891281, "num_input_tokens_seen": 173649625, "step": 8043, "time_per_iteration": 2.6029698848724365 }, { "auxiliary_loss_clip": 0.01105855, "auxiliary_loss_mlp": 0.01040579, "balance_loss_clip": 1.03962994, "balance_loss_mlp": 1.02552283, "epoch": 0.9672338123008477, "flos": 20155941596160.0, "grad_norm": 1.7968405098167084, "language_loss": 0.69226551, "learning_rate": 1.1212821145793804e-08, "loss": 0.71372986, "num_input_tokens_seen": 173669240, "step": 8044, "time_per_iteration": 3.5167393684387207 }, { "auxiliary_loss_clip": 0.01105885, "auxiliary_loss_mlp": 0.01040682, "balance_loss_clip": 1.03933346, "balance_loss_mlp": 1.02550626, "epoch": 0.9673540551914868, "flos": 16977487939200.0, "grad_norm": 3.7163068820389076, "language_loss": 0.7882539, "learning_rate": 1.1130601907629156e-08, "loss": 0.80971956, "num_input_tokens_seen": 173686970, "step": 8045, "time_per_iteration": 2.5991618633270264 }, { "auxiliary_loss_clip": 0.01025782, "auxiliary_loss_mlp": 0.01007841, "balance_loss_clip": 1.00661659, "balance_loss_mlp": 1.00574327, "epoch": 0.9674742980821259, "flos": 61892903952000.0, "grad_norm": 0.8170524571419776, "language_loss": 0.64776683, "learning_rate": 1.1048684379370899e-08, "loss": 0.6681031, "num_input_tokens_seen": 173747655, "step": 8046, "time_per_iteration": 3.182100296020508 }, { "auxiliary_loss_clip": 0.01100251, "auxiliary_loss_mlp": 0.01038712, "balance_loss_clip": 1.04146397, "balance_loss_mlp": 1.0224514, "epoch": 0.967594540972765, "flos": 18697824898560.0, "grad_norm": 3.030821130113495, "language_loss": 0.74698818, "learning_rate": 1.0967068573445759e-08, "loss": 0.76837784, "num_input_tokens_seen": 173765140, "step": 8047, "time_per_iteration": 2.594604730606079 }, { "auxiliary_loss_clip": 0.01105709, "auxiliary_loss_mlp": 0.0103767, "balance_loss_clip": 1.03789496, "balance_loss_mlp": 1.02228057, "epoch": 0.967714783863404, "flos": 20777411733120.0, "grad_norm": 2.353717337743101, "language_loss": 0.65366626, "learning_rate": 1.0885754502234945e-08, "loss": 0.67510003, "num_input_tokens_seen": 173784800, "step": 8048, "time_per_iteration": 2.6981911659240723 }, { "auxiliary_loss_clip": 0.01089454, "auxiliary_loss_mlp": 0.01036974, "balance_loss_clip": 1.03694749, "balance_loss_mlp": 1.0226686, "epoch": 0.9678350267540432, "flos": 23185473465600.0, "grad_norm": 3.24884715658889, "language_loss": 0.78228915, "learning_rate": 1.08047421780737e-08, "loss": 0.8035534, "num_input_tokens_seen": 173803990, "step": 8049, "time_per_iteration": 2.6604392528533936 }, { "auxiliary_loss_clip": 0.01111939, "auxiliary_loss_mlp": 0.007714, "balance_loss_clip": 1.04056907, "balance_loss_mlp": 1.00053167, "epoch": 0.9679552696446823, "flos": 21726063878400.0, "grad_norm": 4.089497408936063, "language_loss": 0.74152327, "learning_rate": 1.0724031613251305e-08, "loss": 0.76035661, "num_input_tokens_seen": 173821890, "step": 8050, "time_per_iteration": 2.630350112915039 }, { "auxiliary_loss_clip": 0.01124936, "auxiliary_loss_mlp": 0.01043482, "balance_loss_clip": 1.04344082, "balance_loss_mlp": 1.02715063, "epoch": 0.9680755125353213, "flos": 26869046129280.0, "grad_norm": 2.194119777700457, "language_loss": 0.66116011, "learning_rate": 1.0643622820011744e-08, "loss": 0.68284428, "num_input_tokens_seen": 173842945, "step": 8051, "time_per_iteration": 2.766296625137329 }, { "auxiliary_loss_clip": 0.01133816, "auxiliary_loss_mlp": 0.01040604, "balance_loss_clip": 1.04226279, "balance_loss_mlp": 1.02333069, "epoch": 0.9681957554259605, "flos": 28325008010880.0, "grad_norm": 3.72112466946071, "language_loss": 0.68246895, "learning_rate": 1.0563515810552814e-08, "loss": 0.70421308, "num_input_tokens_seen": 173859915, "step": 8052, "time_per_iteration": 2.577796220779419 }, { "auxiliary_loss_clip": 0.01134163, "auxiliary_loss_mlp": 0.01042094, "balance_loss_clip": 1.04562712, "balance_loss_mlp": 1.02619195, "epoch": 0.9683159983165995, "flos": 20557674282240.0, "grad_norm": 1.6797956794051738, "language_loss": 0.73275101, "learning_rate": 1.0483710597026795e-08, "loss": 0.75451356, "num_input_tokens_seen": 173879775, "step": 8053, "time_per_iteration": 2.599215507507324 }, { "auxiliary_loss_clip": 0.01094927, "auxiliary_loss_mlp": 0.01041922, "balance_loss_clip": 1.0409584, "balance_loss_mlp": 1.02679443, "epoch": 0.9684362412072386, "flos": 24207958016640.0, "grad_norm": 1.9865291464235297, "language_loss": 0.73957658, "learning_rate": 1.0404207191540227e-08, "loss": 0.76094508, "num_input_tokens_seen": 173900230, "step": 8054, "time_per_iteration": 2.6684300899505615 }, { "auxiliary_loss_clip": 0.01130023, "auxiliary_loss_mlp": 0.01032884, "balance_loss_clip": 1.04151535, "balance_loss_mlp": 1.01855516, "epoch": 0.9685564840978778, "flos": 22346241125760.0, "grad_norm": 1.9736257014302194, "language_loss": 0.74417567, "learning_rate": 1.0325005606153236e-08, "loss": 0.76580477, "num_input_tokens_seen": 173919690, "step": 8055, "time_per_iteration": 2.720109462738037 }, { "auxiliary_loss_clip": 0.01082862, "auxiliary_loss_mlp": 0.01044136, "balance_loss_clip": 1.03627086, "balance_loss_mlp": 1.02604032, "epoch": 0.9686767269885168, "flos": 14386389477120.0, "grad_norm": 2.5827372992488504, "language_loss": 0.79457474, "learning_rate": 1.0246105852881104e-08, "loss": 0.81584471, "num_input_tokens_seen": 173934790, "step": 8056, "time_per_iteration": 2.7000174522399902 }, { "auxiliary_loss_clip": 0.01134766, "auxiliary_loss_mlp": 0.01040373, "balance_loss_clip": 1.04288197, "balance_loss_mlp": 1.02348089, "epoch": 0.9687969698791559, "flos": 21287630471040.0, "grad_norm": 2.604596865241862, "language_loss": 0.78664017, "learning_rate": 1.0167507943692476e-08, "loss": 0.80839157, "num_input_tokens_seen": 173953875, "step": 8057, "time_per_iteration": 2.6178441047668457 }, { "auxiliary_loss_clip": 0.01114156, "auxiliary_loss_mlp": 0.01039654, "balance_loss_clip": 1.04094005, "balance_loss_mlp": 1.02398944, "epoch": 0.968917212769795, "flos": 19828328624640.0, "grad_norm": 2.024886505442676, "language_loss": 0.71484685, "learning_rate": 1.008921189051093e-08, "loss": 0.73638499, "num_input_tokens_seen": 173971220, "step": 8058, "time_per_iteration": 2.642523765563965 }, { "auxiliary_loss_clip": 0.0113386, "auxiliary_loss_mlp": 0.01038834, "balance_loss_clip": 1.04474521, "balance_loss_mlp": 1.02373588, "epoch": 0.9690374556604341, "flos": 21681749473920.0, "grad_norm": 2.0036252988376027, "language_loss": 0.77296448, "learning_rate": 1.0011217705213848e-08, "loss": 0.79469144, "num_input_tokens_seen": 173989095, "step": 8059, "time_per_iteration": 2.6241044998168945 }, { "auxiliary_loss_clip": 0.01116803, "auxiliary_loss_mlp": 0.01039465, "balance_loss_clip": 1.040622, "balance_loss_mlp": 1.02492142, "epoch": 0.9691576985510731, "flos": 32635437851520.0, "grad_norm": 3.3401432380670784, "language_loss": 0.74370933, "learning_rate": 9.933525399632658e-09, "loss": 0.76527202, "num_input_tokens_seen": 174007330, "step": 8060, "time_per_iteration": 2.7150321006774902 }, { "auxiliary_loss_clip": 0.0110255, "auxiliary_loss_mlp": 0.01041279, "balance_loss_clip": 1.03889537, "balance_loss_mlp": 1.02528143, "epoch": 0.9692779414417123, "flos": 35663174040960.0, "grad_norm": 1.8500027196123876, "language_loss": 0.6539216, "learning_rate": 9.856134985553488e-09, "loss": 0.67535985, "num_input_tokens_seen": 174027055, "step": 8061, "time_per_iteration": 3.6533730030059814 }, { "auxiliary_loss_clip": 0.01129186, "auxiliary_loss_mlp": 0.01038369, "balance_loss_clip": 1.04070795, "balance_loss_mlp": 1.02359247, "epoch": 0.9693981843323514, "flos": 28366952117760.0, "grad_norm": 1.752986459114171, "language_loss": 0.73461193, "learning_rate": 9.77904647471628e-09, "loss": 0.75628746, "num_input_tokens_seen": 174050235, "step": 8062, "time_per_iteration": 2.6063101291656494 }, { "auxiliary_loss_clip": 0.01077512, "auxiliary_loss_mlp": 0.01043091, "balance_loss_clip": 1.03666925, "balance_loss_mlp": 1.02722383, "epoch": 0.9695184272229904, "flos": 23622865378560.0, "grad_norm": 1.6140888898864854, "language_loss": 0.74179512, "learning_rate": 9.702259878815454e-09, "loss": 0.76300114, "num_input_tokens_seen": 174070560, "step": 8063, "time_per_iteration": 2.69455885887146 }, { "auxiliary_loss_clip": 0.0112449, "auxiliary_loss_mlp": 0.01045568, "balance_loss_clip": 1.04320538, "balance_loss_mlp": 1.02834189, "epoch": 0.9696386701136296, "flos": 23294677789440.0, "grad_norm": 2.1995178815383465, "language_loss": 0.74732685, "learning_rate": 9.625775209499254e-09, "loss": 0.76902741, "num_input_tokens_seen": 174090565, "step": 8064, "time_per_iteration": 3.530904531478882 }, { "auxiliary_loss_clip": 0.01087161, "auxiliary_loss_mlp": 0.0103473, "balance_loss_clip": 1.03635538, "balance_loss_mlp": 1.01901221, "epoch": 0.9697589130042686, "flos": 15121876360320.0, "grad_norm": 2.531794827380236, "language_loss": 0.73987198, "learning_rate": 9.549592478370172e-09, "loss": 0.76109087, "num_input_tokens_seen": 174108745, "step": 8065, "time_per_iteration": 2.6188368797302246 }, { "auxiliary_loss_clip": 0.01120249, "auxiliary_loss_mlp": 0.01037852, "balance_loss_clip": 1.04203999, "balance_loss_mlp": 1.02357054, "epoch": 0.9698791558949077, "flos": 18879532824960.0, "grad_norm": 1.7897181422500767, "language_loss": 0.79355842, "learning_rate": 9.473711696985632e-09, "loss": 0.81513941, "num_input_tokens_seen": 174128075, "step": 8066, "time_per_iteration": 2.5983691215515137 }, { "auxiliary_loss_clip": 0.01105124, "auxiliary_loss_mlp": 0.01044365, "balance_loss_clip": 1.03856981, "balance_loss_mlp": 1.02859402, "epoch": 0.9699993987855468, "flos": 17931455297280.0, "grad_norm": 4.2353066684525835, "language_loss": 0.76452827, "learning_rate": 9.398132876856201e-09, "loss": 0.78602326, "num_input_tokens_seen": 174147040, "step": 8067, "time_per_iteration": 2.548022985458374 }, { "auxiliary_loss_clip": 0.01002945, "auxiliary_loss_mlp": 0.0100104, "balance_loss_clip": 1.01281452, "balance_loss_mlp": 0.99956745, "epoch": 0.9701196416761859, "flos": 67182186297600.0, "grad_norm": 0.7732745413410399, "language_loss": 0.60781002, "learning_rate": 9.322856029447379e-09, "loss": 0.62784982, "num_input_tokens_seen": 174208225, "step": 8068, "time_per_iteration": 4.203613758087158 }, { "auxiliary_loss_clip": 0.01128033, "auxiliary_loss_mlp": 0.01041224, "balance_loss_clip": 1.04321361, "balance_loss_mlp": 1.02564287, "epoch": 0.970239884566825, "flos": 24277804012800.0, "grad_norm": 2.0038211137175366, "language_loss": 0.80037051, "learning_rate": 9.247881166178695e-09, "loss": 0.82206309, "num_input_tokens_seen": 174226935, "step": 8069, "time_per_iteration": 2.5952279567718506 }, { "auxiliary_loss_clip": 0.01102627, "auxiliary_loss_mlp": 0.01036, "balance_loss_clip": 1.0419836, "balance_loss_mlp": 1.02039504, "epoch": 0.970360127457464, "flos": 25301689194240.0, "grad_norm": 2.132554701298488, "language_loss": 0.76465487, "learning_rate": 9.173208298423274e-09, "loss": 0.78604114, "num_input_tokens_seen": 174248140, "step": 8070, "time_per_iteration": 3.6394495964050293 }, { "auxiliary_loss_clip": 0.01075576, "auxiliary_loss_mlp": 0.00771684, "balance_loss_clip": 1.03559542, "balance_loss_mlp": 1.00043333, "epoch": 0.9704803703481032, "flos": 29572473398400.0, "grad_norm": 1.619633233514422, "language_loss": 0.76383561, "learning_rate": 9.09883743750961e-09, "loss": 0.78230822, "num_input_tokens_seen": 174271030, "step": 8071, "time_per_iteration": 2.780792474746704 }, { "auxiliary_loss_clip": 0.01107617, "auxiliary_loss_mlp": 0.01034374, "balance_loss_clip": 1.04163587, "balance_loss_mlp": 1.01961589, "epoch": 0.9706006132387422, "flos": 17380046638080.0, "grad_norm": 2.8771117556830053, "language_loss": 0.84149742, "learning_rate": 9.024768594719124e-09, "loss": 0.8629173, "num_input_tokens_seen": 174289410, "step": 8072, "time_per_iteration": 2.6289174556732178 }, { "auxiliary_loss_clip": 0.01092732, "auxiliary_loss_mlp": 0.0103387, "balance_loss_clip": 1.0394671, "balance_loss_mlp": 1.01788366, "epoch": 0.9707208561293813, "flos": 18186421011840.0, "grad_norm": 2.04928984266301, "language_loss": 0.72477651, "learning_rate": 8.95100178128816e-09, "loss": 0.74604255, "num_input_tokens_seen": 174308550, "step": 8073, "time_per_iteration": 2.6298813819885254 }, { "auxiliary_loss_clip": 0.01105953, "auxiliary_loss_mlp": 0.01038464, "balance_loss_clip": 1.03886724, "balance_loss_mlp": 1.02251351, "epoch": 0.9708410990200205, "flos": 31248388212480.0, "grad_norm": 1.8281518976310018, "language_loss": 0.70160711, "learning_rate": 8.877537008407321e-09, "loss": 0.72305131, "num_input_tokens_seen": 174328600, "step": 8074, "time_per_iteration": 2.687105894088745 }, { "auxiliary_loss_clip": 0.01111699, "auxiliary_loss_mlp": 0.01037406, "balance_loss_clip": 1.04286408, "balance_loss_mlp": 1.02275562, "epoch": 0.9709613419106595, "flos": 30554450386560.0, "grad_norm": 1.9752178249427013, "language_loss": 0.68671161, "learning_rate": 8.804374287221028e-09, "loss": 0.70820266, "num_input_tokens_seen": 174349835, "step": 8075, "time_per_iteration": 2.6924264430999756 }, { "auxiliary_loss_clip": 0.01086756, "auxiliary_loss_mlp": 0.01038372, "balance_loss_clip": 1.03623295, "balance_loss_mlp": 1.02229095, "epoch": 0.9710815848012986, "flos": 23730166281600.0, "grad_norm": 2.1524205648160346, "language_loss": 0.84655678, "learning_rate": 8.731513628827958e-09, "loss": 0.86780804, "num_input_tokens_seen": 174369200, "step": 8076, "time_per_iteration": 2.6716041564941406 }, { "auxiliary_loss_clip": 0.01122625, "auxiliary_loss_mlp": 0.01033332, "balance_loss_clip": 1.04167819, "balance_loss_mlp": 1.01806116, "epoch": 0.9712018276919377, "flos": 23761875012480.0, "grad_norm": 1.890032787029834, "language_loss": 0.82457948, "learning_rate": 8.658955044280825e-09, "loss": 0.84613901, "num_input_tokens_seen": 174388125, "step": 8077, "time_per_iteration": 2.609658718109131 }, { "auxiliary_loss_clip": 0.01114644, "auxiliary_loss_mlp": 0.01038911, "balance_loss_clip": 1.04272199, "balance_loss_mlp": 1.02443933, "epoch": 0.9713220705825768, "flos": 23330983461120.0, "grad_norm": 1.5303628671366478, "language_loss": 0.7745446, "learning_rate": 8.586698544587268e-09, "loss": 0.79608011, "num_input_tokens_seen": 174409735, "step": 8078, "time_per_iteration": 2.6534268856048584 }, { "auxiliary_loss_clip": 0.01099879, "auxiliary_loss_mlp": 0.01042701, "balance_loss_clip": 1.04004681, "balance_loss_mlp": 1.02485561, "epoch": 0.9714423134732159, "flos": 22200946611840.0, "grad_norm": 1.8495130349499924, "language_loss": 0.74241996, "learning_rate": 8.514744140707853e-09, "loss": 0.7638458, "num_input_tokens_seen": 174428875, "step": 8079, "time_per_iteration": 2.617509126663208 }, { "auxiliary_loss_clip": 0.0112939, "auxiliary_loss_mlp": 0.0103921, "balance_loss_clip": 1.04274011, "balance_loss_mlp": 1.02529776, "epoch": 0.971562556363855, "flos": 20229917656320.0, "grad_norm": 1.756894508417807, "language_loss": 0.76543903, "learning_rate": 8.443091843558515e-09, "loss": 0.78712505, "num_input_tokens_seen": 174447960, "step": 8080, "time_per_iteration": 2.722682237625122 }, { "auxiliary_loss_clip": 0.01104643, "auxiliary_loss_mlp": 0.01050331, "balance_loss_clip": 1.0398885, "balance_loss_mlp": 1.0319128, "epoch": 0.9716827992544941, "flos": 24970197553920.0, "grad_norm": 2.1468747376869866, "language_loss": 0.64632016, "learning_rate": 8.37174166400878e-09, "loss": 0.66786993, "num_input_tokens_seen": 174463535, "step": 8081, "time_per_iteration": 2.6450653076171875 }, { "auxiliary_loss_clip": 0.01134388, "auxiliary_loss_mlp": 0.01043023, "balance_loss_clip": 1.0457294, "balance_loss_mlp": 1.02820575, "epoch": 0.9718030421451331, "flos": 24681476033280.0, "grad_norm": 1.9571849589369739, "language_loss": 0.85211718, "learning_rate": 8.300693612881992e-09, "loss": 0.87389135, "num_input_tokens_seen": 174483600, "step": 8082, "time_per_iteration": 2.5663464069366455 }, { "auxiliary_loss_clip": 0.01118814, "auxiliary_loss_mlp": 0.00772033, "balance_loss_clip": 1.04106021, "balance_loss_mlp": 1.00047851, "epoch": 0.9719232850357723, "flos": 22090700793600.0, "grad_norm": 2.130782715612094, "language_loss": 0.81648594, "learning_rate": 8.22994770095664e-09, "loss": 0.83539444, "num_input_tokens_seen": 174502175, "step": 8083, "time_per_iteration": 2.5774590969085693 }, { "auxiliary_loss_clip": 0.01107599, "auxiliary_loss_mlp": 0.01033557, "balance_loss_clip": 1.04370487, "balance_loss_mlp": 1.01811314, "epoch": 0.9720435279264114, "flos": 23656908493440.0, "grad_norm": 2.1341297664415753, "language_loss": 0.75179595, "learning_rate": 8.159503938964585e-09, "loss": 0.77320749, "num_input_tokens_seen": 174519495, "step": 8084, "time_per_iteration": 2.74029541015625 }, { "auxiliary_loss_clip": 0.01083234, "auxiliary_loss_mlp": 0.01043545, "balance_loss_clip": 1.03596461, "balance_loss_mlp": 1.02811956, "epoch": 0.9721637708170504, "flos": 28365910623360.0, "grad_norm": 2.3603572966184685, "language_loss": 0.70331454, "learning_rate": 8.089362337592164e-09, "loss": 0.72458231, "num_input_tokens_seen": 174543120, "step": 8085, "time_per_iteration": 2.700915575027466 }, { "auxiliary_loss_clip": 0.01106946, "auxiliary_loss_mlp": 0.01034437, "balance_loss_clip": 1.04073083, "balance_loss_mlp": 1.0198102, "epoch": 0.9722840137076896, "flos": 29130807767040.0, "grad_norm": 1.5734047445779666, "language_loss": 0.72122562, "learning_rate": 8.019522907479536e-09, "loss": 0.74263948, "num_input_tokens_seen": 174563480, "step": 8086, "time_per_iteration": 2.6627535820007324 }, { "auxiliary_loss_clip": 0.01119718, "auxiliary_loss_mlp": 0.01035695, "balance_loss_clip": 1.04189658, "balance_loss_mlp": 1.020818, "epoch": 0.9724042565983286, "flos": 19243954258560.0, "grad_norm": 2.3806717698807787, "language_loss": 0.77378124, "learning_rate": 7.949985659221558e-09, "loss": 0.79533535, "num_input_tokens_seen": 174580745, "step": 8087, "time_per_iteration": 3.591019868850708 }, { "auxiliary_loss_clip": 0.01110429, "auxiliary_loss_mlp": 0.01040311, "balance_loss_clip": 1.04149938, "balance_loss_mlp": 1.02537429, "epoch": 0.9725244994889677, "flos": 23039676161280.0, "grad_norm": 2.3491039440096144, "language_loss": 0.78629094, "learning_rate": 7.880750603366904e-09, "loss": 0.80779833, "num_input_tokens_seen": 174599615, "step": 8088, "time_per_iteration": 2.6179306507110596 }, { "auxiliary_loss_clip": 0.01105147, "auxiliary_loss_mlp": 0.01046013, "balance_loss_clip": 1.04023075, "balance_loss_mlp": 1.02915657, "epoch": 0.9726447423796069, "flos": 23367468700800.0, "grad_norm": 4.04461816344089, "language_loss": 0.79854226, "learning_rate": 7.811817750418282e-09, "loss": 0.82005388, "num_input_tokens_seen": 174618375, "step": 8089, "time_per_iteration": 3.632570743560791 }, { "auxiliary_loss_clip": 0.01094633, "auxiliary_loss_mlp": 0.01040807, "balance_loss_clip": 1.04000521, "balance_loss_mlp": 1.02399898, "epoch": 0.9727649852702459, "flos": 26541648639360.0, "grad_norm": 2.2531819786516927, "language_loss": 0.80025554, "learning_rate": 7.743187110833105e-09, "loss": 0.82160991, "num_input_tokens_seen": 174641135, "step": 8090, "time_per_iteration": 2.693132162094116 }, { "auxiliary_loss_clip": 0.01109506, "auxiliary_loss_mlp": 0.01031736, "balance_loss_clip": 1.04055309, "balance_loss_mlp": 1.0169661, "epoch": 0.972885228160885, "flos": 20522338277760.0, "grad_norm": 1.69090841742955, "language_loss": 0.80762655, "learning_rate": 7.674858695022602e-09, "loss": 0.82903898, "num_input_tokens_seen": 174659490, "step": 8091, "time_per_iteration": 2.672257900238037 }, { "auxiliary_loss_clip": 0.01135675, "auxiliary_loss_mlp": 0.01039869, "balance_loss_clip": 1.0441401, "balance_loss_mlp": 1.02400255, "epoch": 0.9730054710515241, "flos": 17566064196480.0, "grad_norm": 2.787089933900802, "language_loss": 0.75797069, "learning_rate": 7.606832513351591e-09, "loss": 0.77972615, "num_input_tokens_seen": 174677440, "step": 8092, "time_per_iteration": 2.5351369380950928 }, { "auxiliary_loss_clip": 0.01034396, "auxiliary_loss_mlp": 0.00755576, "balance_loss_clip": 1.00670314, "balance_loss_mlp": 1.00019872, "epoch": 0.9731257139421632, "flos": 68972010117120.0, "grad_norm": 0.8274379732310924, "language_loss": 0.63894248, "learning_rate": 7.539108576140264e-09, "loss": 0.65684223, "num_input_tokens_seen": 174741550, "step": 8093, "time_per_iteration": 3.2167882919311523 }, { "auxiliary_loss_clip": 0.01082744, "auxiliary_loss_mlp": 0.01032472, "balance_loss_clip": 1.03906393, "balance_loss_mlp": 1.01862574, "epoch": 0.9732459568328022, "flos": 18478841633280.0, "grad_norm": 2.2213587450448413, "language_loss": 0.70751965, "learning_rate": 7.471686893661732e-09, "loss": 0.72867179, "num_input_tokens_seen": 174759845, "step": 8094, "time_per_iteration": 2.6703920364379883 }, { "auxiliary_loss_clip": 0.01106707, "auxiliary_loss_mlp": 0.01045027, "balance_loss_clip": 1.04100156, "balance_loss_mlp": 1.02954817, "epoch": 0.9733661997234414, "flos": 20883886623360.0, "grad_norm": 2.4825087662392913, "language_loss": 0.64135575, "learning_rate": 7.4045674761442636e-09, "loss": 0.66287315, "num_input_tokens_seen": 174777175, "step": 8095, "time_per_iteration": 3.56831431388855 }, { "auxiliary_loss_clip": 0.01131588, "auxiliary_loss_mlp": 0.00771488, "balance_loss_clip": 1.0436126, "balance_loss_mlp": 1.00046265, "epoch": 0.9734864426140805, "flos": 23766795175680.0, "grad_norm": 1.9401185679347743, "language_loss": 0.74574268, "learning_rate": 7.337750333769488e-09, "loss": 0.76477343, "num_input_tokens_seen": 174796980, "step": 8096, "time_per_iteration": 3.473601818084717 }, { "auxiliary_loss_clip": 0.0111148, "auxiliary_loss_mlp": 0.01035424, "balance_loss_clip": 1.03836989, "balance_loss_mlp": 1.01943803, "epoch": 0.9736066855047195, "flos": 35042422176000.0, "grad_norm": 2.436257044372169, "language_loss": 0.73023427, "learning_rate": 7.2712354766737425e-09, "loss": 0.75170332, "num_input_tokens_seen": 174817310, "step": 8097, "time_per_iteration": 2.7087759971618652 }, { "auxiliary_loss_clip": 0.01083967, "auxiliary_loss_mlp": 0.0105342, "balance_loss_clip": 1.0385561, "balance_loss_mlp": 1.03643239, "epoch": 0.9737269283953586, "flos": 20410620001920.0, "grad_norm": 1.5834169436942005, "language_loss": 0.80859476, "learning_rate": 7.2050229149469565e-09, "loss": 0.82996863, "num_input_tokens_seen": 174837320, "step": 8098, "time_per_iteration": 2.6524546146392822 }, { "auxiliary_loss_clip": 0.01094631, "auxiliary_loss_mlp": 0.01043081, "balance_loss_clip": 1.03703165, "balance_loss_mlp": 1.02751267, "epoch": 0.9738471712859977, "flos": 28911680847360.0, "grad_norm": 1.7396351552711828, "language_loss": 0.63311923, "learning_rate": 7.139112658633984e-09, "loss": 0.65449631, "num_input_tokens_seen": 174857470, "step": 8099, "time_per_iteration": 2.7169642448425293 }, { "auxiliary_loss_clip": 0.01092167, "auxiliary_loss_mlp": 0.01036915, "balance_loss_clip": 1.03875065, "balance_loss_mlp": 1.0204407, "epoch": 0.9739674141766368, "flos": 27782326356480.0, "grad_norm": 2.885336402019, "language_loss": 0.70357895, "learning_rate": 7.073504717733048e-09, "loss": 0.72486979, "num_input_tokens_seen": 174877035, "step": 8100, "time_per_iteration": 2.6812853813171387 }, { "auxiliary_loss_clip": 0.00995395, "auxiliary_loss_mlp": 0.00999741, "balance_loss_clip": 1.00869679, "balance_loss_mlp": 0.99827445, "epoch": 0.9740876570672758, "flos": 68863057188480.0, "grad_norm": 0.7342599951613055, "language_loss": 0.57158673, "learning_rate": 7.008199102196855e-09, "loss": 0.59153807, "num_input_tokens_seen": 174938460, "step": 8101, "time_per_iteration": 3.372347593307495 }, { "auxiliary_loss_clip": 0.01010715, "auxiliary_loss_mlp": 0.01001658, "balance_loss_clip": 1.00588155, "balance_loss_mlp": 1.00007248, "epoch": 0.974207899957915, "flos": 58236622646400.0, "grad_norm": 0.7985749849836353, "language_loss": 0.58973265, "learning_rate": 6.9431958219321464e-09, "loss": 0.60985637, "num_input_tokens_seen": 174994625, "step": 8102, "time_per_iteration": 3.3510684967041016 }, { "auxiliary_loss_clip": 0.0110824, "auxiliary_loss_mlp": 0.01037592, "balance_loss_clip": 1.04117656, "balance_loss_mlp": 1.02239239, "epoch": 0.9743281428485541, "flos": 22600057605120.0, "grad_norm": 1.8439109018995687, "language_loss": 0.77822489, "learning_rate": 6.878494886800146e-09, "loss": 0.79968327, "num_input_tokens_seen": 175015400, "step": 8103, "time_per_iteration": 2.6450326442718506 }, { "auxiliary_loss_clip": 0.01108418, "auxiliary_loss_mlp": 0.01033761, "balance_loss_clip": 1.04227948, "balance_loss_mlp": 1.01913369, "epoch": 0.9744483857391931, "flos": 20008815488640.0, "grad_norm": 6.023376620065651, "language_loss": 0.76347107, "learning_rate": 6.814096306615669e-09, "loss": 0.78489286, "num_input_tokens_seen": 175033540, "step": 8104, "time_per_iteration": 2.600668430328369 }, { "auxiliary_loss_clip": 0.01112309, "auxiliary_loss_mlp": 0.01045376, "balance_loss_clip": 1.0405972, "balance_loss_mlp": 1.02959228, "epoch": 0.9745686286298323, "flos": 17675268520320.0, "grad_norm": 2.455106924159397, "language_loss": 0.6499424, "learning_rate": 6.750000091148011e-09, "loss": 0.67151922, "num_input_tokens_seen": 175050835, "step": 8105, "time_per_iteration": 2.5987284183502197 }, { "auxiliary_loss_clip": 0.01130718, "auxiliary_loss_mlp": 0.01035089, "balance_loss_clip": 1.04300523, "balance_loss_mlp": 1.01993787, "epoch": 0.9746888715204713, "flos": 29460252332160.0, "grad_norm": 1.9806539638417244, "language_loss": 0.72426814, "learning_rate": 6.686206250120729e-09, "loss": 0.7459262, "num_input_tokens_seen": 175072330, "step": 8106, "time_per_iteration": 2.6323976516723633 }, { "auxiliary_loss_clip": 0.01098715, "auxiliary_loss_mlp": 0.01040335, "balance_loss_clip": 1.03775346, "balance_loss_mlp": 1.02521908, "epoch": 0.9748091144111104, "flos": 18479308510080.0, "grad_norm": 1.91239847523658, "language_loss": 0.74335814, "learning_rate": 6.622714793210749e-09, "loss": 0.76474869, "num_input_tokens_seen": 175091250, "step": 8107, "time_per_iteration": 2.6006762981414795 }, { "auxiliary_loss_clip": 0.01130089, "auxiliary_loss_mlp": 0.01035322, "balance_loss_clip": 1.04184747, "balance_loss_mlp": 1.02050376, "epoch": 0.9749293573017496, "flos": 20665154753280.0, "grad_norm": 1.7594830347218031, "language_loss": 0.78850329, "learning_rate": 6.559525730050364e-09, "loss": 0.81015742, "num_input_tokens_seen": 175111350, "step": 8108, "time_per_iteration": 2.5493106842041016 }, { "auxiliary_loss_clip": 0.01097871, "auxiliary_loss_mlp": 0.01036152, "balance_loss_clip": 1.04180765, "balance_loss_mlp": 1.02075005, "epoch": 0.9750496001923886, "flos": 18478590238080.0, "grad_norm": 2.102605809645797, "language_loss": 0.75945139, "learning_rate": 6.496639070224574e-09, "loss": 0.78079164, "num_input_tokens_seen": 175129835, "step": 8109, "time_per_iteration": 2.650972604751587 }, { "auxiliary_loss_clip": 0.01118831, "auxiliary_loss_mlp": 0.01034718, "balance_loss_clip": 1.04094696, "balance_loss_mlp": 1.01866102, "epoch": 0.9751698430830277, "flos": 19572967860480.0, "grad_norm": 2.5561933517741244, "language_loss": 0.83595061, "learning_rate": 6.4340548232739714e-09, "loss": 0.85748601, "num_input_tokens_seen": 175146035, "step": 8110, "time_per_iteration": 2.555006980895996 }, { "auxiliary_loss_clip": 0.01098261, "auxiliary_loss_mlp": 0.01040298, "balance_loss_clip": 1.03961492, "balance_loss_mlp": 1.02480054, "epoch": 0.9752900859736668, "flos": 23550325862400.0, "grad_norm": 1.9137681669538984, "language_loss": 0.79052359, "learning_rate": 6.371772998692071e-09, "loss": 0.8119092, "num_input_tokens_seen": 175165290, "step": 8111, "time_per_iteration": 2.664405584335327 }, { "auxiliary_loss_clip": 0.01099575, "auxiliary_loss_mlp": 0.01032613, "balance_loss_clip": 1.03933311, "balance_loss_mlp": 1.01765227, "epoch": 0.9754103288643059, "flos": 20303211358080.0, "grad_norm": 2.61569962045452, "language_loss": 0.65374583, "learning_rate": 6.309793605927094e-09, "loss": 0.67506766, "num_input_tokens_seen": 175183610, "step": 8112, "time_per_iteration": 2.6312458515167236 }, { "auxiliary_loss_clip": 0.01107815, "auxiliary_loss_mlp": 0.01038683, "balance_loss_clip": 1.03888798, "balance_loss_mlp": 1.02356768, "epoch": 0.975530571754945, "flos": 19350680544000.0, "grad_norm": 2.25257987343312, "language_loss": 0.8027156, "learning_rate": 6.248116654381297e-09, "loss": 0.8241806, "num_input_tokens_seen": 175202080, "step": 8113, "time_per_iteration": 3.4887547492980957 }, { "auxiliary_loss_clip": 0.01108752, "auxiliary_loss_mlp": 0.01036525, "balance_loss_clip": 1.03888595, "balance_loss_mlp": 1.02224374, "epoch": 0.9756508146455841, "flos": 23583399310080.0, "grad_norm": 1.8231354339711006, "language_loss": 0.73026609, "learning_rate": 6.186742153410751e-09, "loss": 0.75171888, "num_input_tokens_seen": 175221575, "step": 8114, "time_per_iteration": 2.666761875152588 }, { "auxiliary_loss_clip": 0.01106713, "auxiliary_loss_mlp": 0.01045527, "balance_loss_clip": 1.04022479, "balance_loss_mlp": 1.02900434, "epoch": 0.9757710575362232, "flos": 22966921163520.0, "grad_norm": 2.13990282792518, "language_loss": 0.8753475, "learning_rate": 6.125670112326453e-09, "loss": 0.8968699, "num_input_tokens_seen": 175240835, "step": 8115, "time_per_iteration": 3.5413100719451904 }, { "auxiliary_loss_clip": 0.0112092, "auxiliary_loss_mlp": 0.01037999, "balance_loss_clip": 1.04205477, "balance_loss_mlp": 1.02179849, "epoch": 0.9758913004268622, "flos": 27966009530880.0, "grad_norm": 1.7250506134995631, "language_loss": 0.70670974, "learning_rate": 6.064900540392548e-09, "loss": 0.7282989, "num_input_tokens_seen": 175262930, "step": 8116, "time_per_iteration": 2.6406917572021484 }, { "auxiliary_loss_clip": 0.01099986, "auxiliary_loss_mlp": 0.01032901, "balance_loss_clip": 1.03987956, "balance_loss_mlp": 1.01866734, "epoch": 0.9760115433175014, "flos": 22200156512640.0, "grad_norm": 2.0174995270724025, "language_loss": 0.78815573, "learning_rate": 6.0044334468278835e-09, "loss": 0.8094846, "num_input_tokens_seen": 175282275, "step": 8117, "time_per_iteration": 2.6383090019226074 }, { "auxiliary_loss_clip": 0.01084504, "auxiliary_loss_mlp": 0.01033452, "balance_loss_clip": 1.03866363, "balance_loss_mlp": 1.0176568, "epoch": 0.9761317862081405, "flos": 26250736389120.0, "grad_norm": 1.8308676594664128, "language_loss": 0.71651763, "learning_rate": 5.944268840805345e-09, "loss": 0.73769718, "num_input_tokens_seen": 175303020, "step": 8118, "time_per_iteration": 2.706080913543701 }, { "auxiliary_loss_clip": 0.01087889, "auxiliary_loss_mlp": 0.01034365, "balance_loss_clip": 1.0385983, "balance_loss_mlp": 1.0199883, "epoch": 0.9762520290987795, "flos": 26575440359040.0, "grad_norm": 2.5056655869970075, "language_loss": 0.64302421, "learning_rate": 5.88440673145163e-09, "loss": 0.66424668, "num_input_tokens_seen": 175324070, "step": 8119, "time_per_iteration": 2.7045369148254395 }, { "auxiliary_loss_clip": 0.01119533, "auxiliary_loss_mlp": 0.01038683, "balance_loss_clip": 1.04438257, "balance_loss_mlp": 1.02293575, "epoch": 0.9763722719894187, "flos": 18005036307840.0, "grad_norm": 1.9188294951084026, "language_loss": 0.82715988, "learning_rate": 5.824847127848142e-09, "loss": 0.84874201, "num_input_tokens_seen": 175342595, "step": 8120, "time_per_iteration": 3.567316770553589 }, { "auxiliary_loss_clip": 0.01090846, "auxiliary_loss_mlp": 0.01036027, "balance_loss_clip": 1.04227912, "balance_loss_mlp": 1.02098298, "epoch": 0.9764925148800577, "flos": 22455660931200.0, "grad_norm": 1.8508512747562151, "language_loss": 0.79062653, "learning_rate": 5.765590039029433e-09, "loss": 0.81189525, "num_input_tokens_seen": 175361915, "step": 8121, "time_per_iteration": 3.5229034423828125 }, { "auxiliary_loss_clip": 0.01128874, "auxiliary_loss_mlp": 0.01036124, "balance_loss_clip": 1.04193854, "balance_loss_mlp": 1.02149117, "epoch": 0.9766127577706968, "flos": 36757084786560.0, "grad_norm": 2.9028950990811637, "language_loss": 0.71268487, "learning_rate": 5.706635473985422e-09, "loss": 0.73433483, "num_input_tokens_seen": 175385785, "step": 8122, "time_per_iteration": 2.6812524795532227 }, { "auxiliary_loss_clip": 0.01119057, "auxiliary_loss_mlp": 0.01039548, "balance_loss_clip": 1.04145873, "balance_loss_mlp": 1.02430689, "epoch": 0.976733000661336, "flos": 22309971367680.0, "grad_norm": 1.7949898399994813, "language_loss": 0.84935665, "learning_rate": 5.6479834416591764e-09, "loss": 0.87094271, "num_input_tokens_seen": 175405145, "step": 8123, "time_per_iteration": 2.6032633781433105 }, { "auxiliary_loss_clip": 0.01117832, "auxiliary_loss_mlp": 0.00772445, "balance_loss_clip": 1.04047239, "balance_loss_mlp": 1.00056195, "epoch": 0.976853243551975, "flos": 25810938264960.0, "grad_norm": 2.117551199835198, "language_loss": 0.68660325, "learning_rate": 5.589633950947803e-09, "loss": 0.70550603, "num_input_tokens_seen": 175422645, "step": 8124, "time_per_iteration": 2.620326519012451 }, { "auxiliary_loss_clip": 0.011051, "auxiliary_loss_mlp": 0.01052866, "balance_loss_clip": 1.0400629, "balance_loss_mlp": 1.03510356, "epoch": 0.9769734864426141, "flos": 21397445326080.0, "grad_norm": 3.113962207573377, "language_loss": 0.70056891, "learning_rate": 5.5315870107035535e-09, "loss": 0.7221486, "num_input_tokens_seen": 175440695, "step": 8125, "time_per_iteration": 2.634674072265625 }, { "auxiliary_loss_clip": 0.01102906, "auxiliary_loss_mlp": 0.01038889, "balance_loss_clip": 1.04120135, "balance_loss_mlp": 1.02292717, "epoch": 0.9770937293332532, "flos": 13990977584640.0, "grad_norm": 1.9880108481802796, "language_loss": 0.7894057, "learning_rate": 5.473842629731607e-09, "loss": 0.81082362, "num_input_tokens_seen": 175459195, "step": 8126, "time_per_iteration": 2.587235927581787 }, { "auxiliary_loss_clip": 0.01111075, "auxiliary_loss_mlp": 0.00772027, "balance_loss_clip": 1.03986335, "balance_loss_mlp": 1.0005281, "epoch": 0.9772139722238923, "flos": 17931994001280.0, "grad_norm": 15.585952514278015, "language_loss": 0.77836949, "learning_rate": 5.416400816792066e-09, "loss": 0.79720056, "num_input_tokens_seen": 175476710, "step": 8127, "time_per_iteration": 2.5809950828552246 }, { "auxiliary_loss_clip": 0.01127981, "auxiliary_loss_mlp": 0.01035987, "balance_loss_clip": 1.04117608, "balance_loss_mlp": 1.02122903, "epoch": 0.9773342151145313, "flos": 20446171488000.0, "grad_norm": 3.100655725001953, "language_loss": 0.78318042, "learning_rate": 5.359261580598407e-09, "loss": 0.80482012, "num_input_tokens_seen": 175492550, "step": 8128, "time_per_iteration": 2.526512622833252 }, { "auxiliary_loss_clip": 0.01125252, "auxiliary_loss_mlp": 0.01043674, "balance_loss_clip": 1.04321396, "balance_loss_mlp": 1.02705598, "epoch": 0.9774544580051704, "flos": 11837306949120.0, "grad_norm": 2.7511037757505528, "language_loss": 0.78757411, "learning_rate": 5.302424929819027e-09, "loss": 0.80926335, "num_input_tokens_seen": 175506560, "step": 8129, "time_per_iteration": 2.568173885345459 }, { "auxiliary_loss_clip": 0.01121164, "auxiliary_loss_mlp": 0.01038325, "balance_loss_clip": 1.03866148, "balance_loss_mlp": 1.02293551, "epoch": 0.9775747008958096, "flos": 13479932833920.0, "grad_norm": 5.246007275351463, "language_loss": 0.73174018, "learning_rate": 5.24589087307592e-09, "loss": 0.75333512, "num_input_tokens_seen": 175524180, "step": 8130, "time_per_iteration": 2.534785032272339 }, { "auxiliary_loss_clip": 0.01134203, "auxiliary_loss_mlp": 0.01038268, "balance_loss_clip": 1.04304159, "balance_loss_mlp": 1.02211535, "epoch": 0.9776949437864486, "flos": 59532314042880.0, "grad_norm": 1.4800727048260818, "language_loss": 0.65279317, "learning_rate": 5.189659418944891e-09, "loss": 0.67451787, "num_input_tokens_seen": 175554355, "step": 8131, "time_per_iteration": 2.948378086090088 }, { "auxiliary_loss_clip": 0.01130782, "auxiliary_loss_mlp": 0.0103485, "balance_loss_clip": 1.0432812, "balance_loss_mlp": 1.01855397, "epoch": 0.9778151866770877, "flos": 21178605715200.0, "grad_norm": 1.8667034186945624, "language_loss": 0.7853651, "learning_rate": 5.133730575956674e-09, "loss": 0.80702138, "num_input_tokens_seen": 175574025, "step": 8132, "time_per_iteration": 2.559237480163574 }, { "auxiliary_loss_clip": 0.0110741, "auxiliary_loss_mlp": 0.0103684, "balance_loss_clip": 1.03943419, "balance_loss_mlp": 1.02185583, "epoch": 0.9779354295677268, "flos": 20886795624960.0, "grad_norm": 2.2172218622640423, "language_loss": 0.71890509, "learning_rate": 5.0781043525953696e-09, "loss": 0.74034762, "num_input_tokens_seen": 175592090, "step": 8133, "time_per_iteration": 2.6471939086914062 }, { "auxiliary_loss_clip": 0.01101763, "auxiliary_loss_mlp": 0.01034892, "balance_loss_clip": 1.04027796, "balance_loss_mlp": 1.02059269, "epoch": 0.9780556724583659, "flos": 23440618748160.0, "grad_norm": 1.7107647726973698, "language_loss": 0.73925662, "learning_rate": 5.0227807572995605e-09, "loss": 0.76062316, "num_input_tokens_seen": 175614065, "step": 8134, "time_per_iteration": 2.635523557662964 }, { "auxiliary_loss_clip": 0.01107321, "auxiliary_loss_mlp": 0.01034676, "balance_loss_clip": 1.03853118, "balance_loss_mlp": 1.02010834, "epoch": 0.9781759153490049, "flos": 20923244951040.0, "grad_norm": 2.0670048243900894, "language_loss": 0.67465949, "learning_rate": 4.967759798461646e-09, "loss": 0.69607949, "num_input_tokens_seen": 175632410, "step": 8135, "time_per_iteration": 2.608903169631958 }, { "auxiliary_loss_clip": 0.01130132, "auxiliary_loss_mlp": 0.01043506, "balance_loss_clip": 1.04454207, "balance_loss_mlp": 1.0295583, "epoch": 0.9782961582396441, "flos": 28293191539200.0, "grad_norm": 2.0335676229682096, "language_loss": 0.75031984, "learning_rate": 4.913041484428282e-09, "loss": 0.77205622, "num_input_tokens_seen": 175652885, "step": 8136, "time_per_iteration": 2.5876305103302 }, { "auxiliary_loss_clip": 0.01119784, "auxiliary_loss_mlp": 0.01036732, "balance_loss_clip": 1.04283547, "balance_loss_mlp": 1.02218866, "epoch": 0.9784164011302832, "flos": 25552955808000.0, "grad_norm": 2.1675317629226485, "language_loss": 0.74080271, "learning_rate": 4.858625823500384e-09, "loss": 0.76236784, "num_input_tokens_seen": 175670585, "step": 8137, "time_per_iteration": 2.5987071990966797 }, { "auxiliary_loss_clip": 0.01122817, "auxiliary_loss_mlp": 0.01038814, "balance_loss_clip": 1.0424397, "balance_loss_mlp": 1.02344763, "epoch": 0.9785366440209222, "flos": 29965945956480.0, "grad_norm": 1.8902365972583302, "language_loss": 0.73322594, "learning_rate": 4.80451282393246e-09, "loss": 0.75484228, "num_input_tokens_seen": 175690570, "step": 8138, "time_per_iteration": 2.6457247734069824 }, { "auxiliary_loss_clip": 0.01105586, "auxiliary_loss_mlp": 0.01041519, "balance_loss_clip": 1.03857601, "balance_loss_mlp": 1.02513337, "epoch": 0.9786568869115614, "flos": 32343591847680.0, "grad_norm": 2.184686036935059, "language_loss": 0.67531234, "learning_rate": 4.750702493933722e-09, "loss": 0.69678336, "num_input_tokens_seen": 175710455, "step": 8139, "time_per_iteration": 3.6665408611297607 }, { "auxiliary_loss_clip": 0.01103141, "auxiliary_loss_mlp": 0.0077229, "balance_loss_clip": 1.04144943, "balance_loss_mlp": 1.000453, "epoch": 0.9787771298022004, "flos": 23331414424320.0, "grad_norm": 1.8475042082054918, "language_loss": 0.85394847, "learning_rate": 4.697194841666974e-09, "loss": 0.87270272, "num_input_tokens_seen": 175729380, "step": 8140, "time_per_iteration": 2.6275129318237305 }, { "auxiliary_loss_clip": 0.01125797, "auxiliary_loss_mlp": 0.01043906, "balance_loss_clip": 1.04252434, "balance_loss_mlp": 1.02714515, "epoch": 0.9788973726928395, "flos": 21468548298240.0, "grad_norm": 3.8622025433298424, "language_loss": 0.81664634, "learning_rate": 4.6439898752492764e-09, "loss": 0.83834332, "num_input_tokens_seen": 175749520, "step": 8141, "time_per_iteration": 3.5607376098632812 }, { "auxiliary_loss_clip": 0.01026789, "auxiliary_loss_mlp": 0.00755675, "balance_loss_clip": 1.00843894, "balance_loss_mlp": 1.00021386, "epoch": 0.9790176155834787, "flos": 68897459439360.0, "grad_norm": 0.7469829344353872, "language_loss": 0.63643378, "learning_rate": 4.591087602751731e-09, "loss": 0.65425843, "num_input_tokens_seen": 175811380, "step": 8142, "time_per_iteration": 3.287593364715576 }, { "auxiliary_loss_clip": 0.01116695, "auxiliary_loss_mlp": 0.01040845, "balance_loss_clip": 1.04090345, "balance_loss_mlp": 1.02642035, "epoch": 0.9791378584741177, "flos": 21430877909760.0, "grad_norm": 1.8378885964759057, "language_loss": 0.72293234, "learning_rate": 4.538488032199916e-09, "loss": 0.74450773, "num_input_tokens_seen": 175829480, "step": 8143, "time_per_iteration": 2.6081137657165527 }, { "auxiliary_loss_clip": 0.01123177, "auxiliary_loss_mlp": 0.01038654, "balance_loss_clip": 1.04039264, "balance_loss_mlp": 1.02225065, "epoch": 0.9792581013647568, "flos": 20153032594560.0, "grad_norm": 2.157548891312004, "language_loss": 0.68868715, "learning_rate": 4.486191171572784e-09, "loss": 0.71030545, "num_input_tokens_seen": 175846750, "step": 8144, "time_per_iteration": 2.592000961303711 }, { "auxiliary_loss_clip": 0.01121338, "auxiliary_loss_mlp": 0.01038321, "balance_loss_clip": 1.04318523, "balance_loss_mlp": 1.02357459, "epoch": 0.9793783442553959, "flos": 23728191033600.0, "grad_norm": 2.156825029493656, "language_loss": 0.77671272, "learning_rate": 4.434197028803766e-09, "loss": 0.79830933, "num_input_tokens_seen": 175865975, "step": 8145, "time_per_iteration": 2.6250061988830566 }, { "auxiliary_loss_clip": 0.01100094, "auxiliary_loss_mlp": 0.01048591, "balance_loss_clip": 1.04118288, "balance_loss_mlp": 1.0306145, "epoch": 0.979498587146035, "flos": 23038742407680.0, "grad_norm": 2.303832617084845, "language_loss": 0.82139897, "learning_rate": 4.3825056117805514e-09, "loss": 0.84288585, "num_input_tokens_seen": 175881860, "step": 8146, "time_per_iteration": 3.5892059803009033 }, { "auxiliary_loss_clip": 0.01130664, "auxiliary_loss_mlp": 0.01048514, "balance_loss_clip": 1.04219055, "balance_loss_mlp": 1.03294516, "epoch": 0.979618830036674, "flos": 14318841951360.0, "grad_norm": 2.844329660977671, "language_loss": 0.79485196, "learning_rate": 4.331116928344425e-09, "loss": 0.81664371, "num_input_tokens_seen": 175898175, "step": 8147, "time_per_iteration": 3.397172451019287 }, { "auxiliary_loss_clip": 0.01109487, "auxiliary_loss_mlp": 0.00771576, "balance_loss_clip": 1.03889823, "balance_loss_mlp": 1.00052929, "epoch": 0.9797390729273132, "flos": 16727514215040.0, "grad_norm": 1.9292587305618367, "language_loss": 0.6248458, "learning_rate": 4.28003098629115e-09, "loss": 0.64365649, "num_input_tokens_seen": 175914310, "step": 8148, "time_per_iteration": 2.5636520385742188 }, { "auxiliary_loss_clip": 0.01095255, "auxiliary_loss_mlp": 0.01040596, "balance_loss_clip": 1.03776264, "balance_loss_mlp": 1.02496779, "epoch": 0.9798593158179523, "flos": 24532661986560.0, "grad_norm": 2.2022742013905754, "language_loss": 0.78440505, "learning_rate": 4.229247793370305e-09, "loss": 0.80576348, "num_input_tokens_seen": 175933435, "step": 8149, "time_per_iteration": 2.6760213375091553 }, { "auxiliary_loss_clip": 0.01134641, "auxiliary_loss_mlp": 0.01041341, "balance_loss_clip": 1.04419231, "balance_loss_mlp": 1.02539062, "epoch": 0.9799795587085913, "flos": 27308808339840.0, "grad_norm": 1.9911712757600646, "language_loss": 0.7034781, "learning_rate": 4.178767357285951e-09, "loss": 0.72523791, "num_input_tokens_seen": 175955065, "step": 8150, "time_per_iteration": 2.592416286468506 }, { "auxiliary_loss_clip": 0.01121666, "auxiliary_loss_mlp": 0.00771524, "balance_loss_clip": 1.04249775, "balance_loss_mlp": 1.00053811, "epoch": 0.9800998015992305, "flos": 26286575184000.0, "grad_norm": 2.4454326673154645, "language_loss": 0.71780276, "learning_rate": 4.128589685695516e-09, "loss": 0.73673469, "num_input_tokens_seen": 175975490, "step": 8151, "time_per_iteration": 2.617238998413086 }, { "auxiliary_loss_clip": 0.01131979, "auxiliary_loss_mlp": 0.01034398, "balance_loss_clip": 1.04297662, "balance_loss_mlp": 1.01859701, "epoch": 0.9802200444898695, "flos": 16723635546240.0, "grad_norm": 1.821082980248654, "language_loss": 0.84417617, "learning_rate": 4.078714786211135e-09, "loss": 0.86583996, "num_input_tokens_seen": 175991340, "step": 8152, "time_per_iteration": 2.510338068008423 }, { "auxiliary_loss_clip": 0.01115487, "auxiliary_loss_mlp": 0.01038141, "balance_loss_clip": 1.03997469, "balance_loss_mlp": 1.02352619, "epoch": 0.9803402873805086, "flos": 24900459298560.0, "grad_norm": 1.8239430922012732, "language_loss": 0.76971972, "learning_rate": 4.029142666398977e-09, "loss": 0.79125601, "num_input_tokens_seen": 176011505, "step": 8153, "time_per_iteration": 2.6207997798919678 }, { "auxiliary_loss_clip": 0.01128995, "auxiliary_loss_mlp": 0.01050837, "balance_loss_clip": 1.04190314, "balance_loss_mlp": 1.03561378, "epoch": 0.9804605302711478, "flos": 22564937082240.0, "grad_norm": 3.8974968895012694, "language_loss": 0.80061376, "learning_rate": 3.979873333778805e-09, "loss": 0.82241207, "num_input_tokens_seen": 176029680, "step": 8154, "time_per_iteration": 2.563483953475952 }, { "auxiliary_loss_clip": 0.01113941, "auxiliary_loss_mlp": 0.01039864, "balance_loss_clip": 1.04382157, "balance_loss_mlp": 1.02551103, "epoch": 0.9805807731617868, "flos": 38905368382080.0, "grad_norm": 1.8885664267770295, "language_loss": 0.74079597, "learning_rate": 3.930906795824862e-09, "loss": 0.76233399, "num_input_tokens_seen": 176050355, "step": 8155, "time_per_iteration": 2.7642641067504883 }, { "auxiliary_loss_clip": 0.01116812, "auxiliary_loss_mlp": 0.01040678, "balance_loss_clip": 1.03998697, "balance_loss_mlp": 1.02568102, "epoch": 0.9807010160524259, "flos": 17821999578240.0, "grad_norm": 2.9138013951371917, "language_loss": 0.76627606, "learning_rate": 3.882243059965207e-09, "loss": 0.78785098, "num_input_tokens_seen": 176068070, "step": 8156, "time_per_iteration": 2.5822787284851074 }, { "auxiliary_loss_clip": 0.01111971, "auxiliary_loss_mlp": 0.01041996, "balance_loss_clip": 1.03817368, "balance_loss_mlp": 1.0250926, "epoch": 0.980821258943065, "flos": 13552975140480.0, "grad_norm": 3.1651899629700044, "language_loss": 0.65790033, "learning_rate": 3.833882133582156e-09, "loss": 0.67944008, "num_input_tokens_seen": 176083730, "step": 8157, "time_per_iteration": 2.5545830726623535 }, { "auxiliary_loss_clip": 0.01121679, "auxiliary_loss_mlp": 0.01041594, "balance_loss_clip": 1.04127049, "balance_loss_mlp": 1.02675247, "epoch": 0.9809415018337041, "flos": 21689794120320.0, "grad_norm": 1.6173287017387057, "language_loss": 0.78534573, "learning_rate": 3.785824024012285e-09, "loss": 0.80697846, "num_input_tokens_seen": 176102730, "step": 8158, "time_per_iteration": 2.604600429534912 }, { "auxiliary_loss_clip": 0.01096768, "auxiliary_loss_mlp": 0.01041498, "balance_loss_clip": 1.03913605, "balance_loss_mlp": 1.02765822, "epoch": 0.9810617447243432, "flos": 23294857357440.0, "grad_norm": 1.536513956425576, "language_loss": 0.78598017, "learning_rate": 3.738068738545541e-09, "loss": 0.80736279, "num_input_tokens_seen": 176121815, "step": 8159, "time_per_iteration": 2.6746673583984375 }, { "auxiliary_loss_clip": 0.01124624, "auxiliary_loss_mlp": 0.01038308, "balance_loss_clip": 1.04391992, "balance_loss_mlp": 1.02281094, "epoch": 0.9811819876149822, "flos": 18332038748160.0, "grad_norm": 12.378372913195967, "language_loss": 0.78473282, "learning_rate": 3.6906162844265733e-09, "loss": 0.80636215, "num_input_tokens_seen": 176138900, "step": 8160, "time_per_iteration": 2.602506399154663 }, { "auxiliary_loss_clip": 0.01103419, "auxiliary_loss_mlp": 0.0104968, "balance_loss_clip": 1.03819311, "balance_loss_mlp": 1.03184617, "epoch": 0.9813022305056214, "flos": 22601961025920.0, "grad_norm": 1.8725157859085475, "language_loss": 0.70889091, "learning_rate": 3.643466668853845e-09, "loss": 0.7304219, "num_input_tokens_seen": 176156925, "step": 8161, "time_per_iteration": 2.605794668197632 }, { "auxiliary_loss_clip": 0.0110964, "auxiliary_loss_mlp": 0.01036831, "balance_loss_clip": 1.04179537, "balance_loss_mlp": 1.02253199, "epoch": 0.9814224733962604, "flos": 25413335642880.0, "grad_norm": 1.913744167398519, "language_loss": 0.75168157, "learning_rate": 3.59661989898008e-09, "loss": 0.77314627, "num_input_tokens_seen": 176177980, "step": 8162, "time_per_iteration": 2.7163937091827393 }, { "auxiliary_loss_clip": 0.01087339, "auxiliary_loss_mlp": 0.0103356, "balance_loss_clip": 1.0379796, "balance_loss_mlp": 1.0192132, "epoch": 0.9815427162868995, "flos": 25007185584000.0, "grad_norm": 1.8150165969434624, "language_loss": 0.76685369, "learning_rate": 3.5500759819115934e-09, "loss": 0.78806263, "num_input_tokens_seen": 176198345, "step": 8163, "time_per_iteration": 2.6806321144104004 }, { "auxiliary_loss_clip": 0.01131881, "auxiliary_loss_mlp": 0.01043023, "balance_loss_clip": 1.04321814, "balance_loss_mlp": 1.02763343, "epoch": 0.9816629591775387, "flos": 20662604887680.0, "grad_norm": 2.536585149876193, "language_loss": 0.81457984, "learning_rate": 3.5038349247094034e-09, "loss": 0.83632898, "num_input_tokens_seen": 176215605, "step": 8164, "time_per_iteration": 2.53867244720459 }, { "auxiliary_loss_clip": 0.01105806, "auxiliary_loss_mlp": 0.01039539, "balance_loss_clip": 1.03842223, "balance_loss_mlp": 1.02478075, "epoch": 0.9817832020681777, "flos": 17712220636800.0, "grad_norm": 2.1769987692119015, "language_loss": 0.77669269, "learning_rate": 3.4578967343878994e-09, "loss": 0.79814613, "num_input_tokens_seen": 176231810, "step": 8165, "time_per_iteration": 3.615384578704834 }, { "auxiliary_loss_clip": 0.01110946, "auxiliary_loss_mlp": 0.0103663, "balance_loss_clip": 1.04347539, "balance_loss_mlp": 1.0220921, "epoch": 0.9819034449588168, "flos": 22530032040960.0, "grad_norm": 1.770811123180258, "language_loss": 0.81005299, "learning_rate": 3.4122614179161733e-09, "loss": 0.83152878, "num_input_tokens_seen": 176251770, "step": 8166, "time_per_iteration": 2.658763885498047 }, { "auxiliary_loss_clip": 0.01082567, "auxiliary_loss_mlp": 0.01038117, "balance_loss_clip": 1.03669155, "balance_loss_mlp": 1.02335882, "epoch": 0.9820236878494559, "flos": 20011221699840.0, "grad_norm": 1.7055270571837995, "language_loss": 0.78152442, "learning_rate": 3.36692898221691e-09, "loss": 0.80273128, "num_input_tokens_seen": 176270135, "step": 8167, "time_per_iteration": 2.60129714012146 }, { "auxiliary_loss_clip": 0.01118266, "auxiliary_loss_mlp": 0.01035544, "balance_loss_clip": 1.04080629, "balance_loss_mlp": 1.02135837, "epoch": 0.982143930740095, "flos": 18807316531200.0, "grad_norm": 16.59476542814712, "language_loss": 0.7360757, "learning_rate": 3.3218994341668305e-09, "loss": 0.75761384, "num_input_tokens_seen": 176289065, "step": 8168, "time_per_iteration": 3.459559202194214 }, { "auxiliary_loss_clip": 0.01130059, "auxiliary_loss_mlp": 0.01035922, "balance_loss_clip": 1.04344034, "balance_loss_mlp": 1.02096128, "epoch": 0.982264173630734, "flos": 26578026138240.0, "grad_norm": 1.621700502491296, "language_loss": 0.75633526, "learning_rate": 3.2771727805971373e-09, "loss": 0.77799511, "num_input_tokens_seen": 176310450, "step": 8169, "time_per_iteration": 2.575559139251709 }, { "auxiliary_loss_clip": 0.01073984, "auxiliary_loss_mlp": 0.01043439, "balance_loss_clip": 1.03486609, "balance_loss_mlp": 1.02616584, "epoch": 0.9823844165213732, "flos": 22014462176640.0, "grad_norm": 1.7768691708488793, "language_loss": 0.7691558, "learning_rate": 3.232749028292847e-09, "loss": 0.79033011, "num_input_tokens_seen": 176327415, "step": 8170, "time_per_iteration": 2.7431278228759766 }, { "auxiliary_loss_clip": 0.01131031, "auxiliary_loss_mlp": 0.01034354, "balance_loss_clip": 1.04060459, "balance_loss_mlp": 1.01941729, "epoch": 0.9825046594120123, "flos": 21908166854400.0, "grad_norm": 1.7223360449019398, "language_loss": 0.8850264, "learning_rate": 3.188628183992792e-09, "loss": 0.90668023, "num_input_tokens_seen": 176347680, "step": 8171, "time_per_iteration": 2.5726702213287354 }, { "auxiliary_loss_clip": 0.01026169, "auxiliary_loss_mlp": 0.01003069, "balance_loss_clip": 1.00720763, "balance_loss_mlp": 1.00150704, "epoch": 0.9826249023026513, "flos": 59494610718720.0, "grad_norm": 0.7396804216995493, "language_loss": 0.62531972, "learning_rate": 3.1448102543902844e-09, "loss": 0.64561212, "num_input_tokens_seen": 176411595, "step": 8172, "time_per_iteration": 4.080134868621826 }, { "auxiliary_loss_clip": 0.01102327, "auxiliary_loss_mlp": 0.01042004, "balance_loss_clip": 1.04210627, "balance_loss_mlp": 1.02604806, "epoch": 0.9827451451932905, "flos": 16071031296000.0, "grad_norm": 2.8900763270876952, "language_loss": 0.67879277, "learning_rate": 3.1012952461324515e-09, "loss": 0.70023608, "num_input_tokens_seen": 176430570, "step": 8173, "time_per_iteration": 3.521688461303711 }, { "auxiliary_loss_clip": 0.01115562, "auxiliary_loss_mlp": 0.01038929, "balance_loss_clip": 1.04197454, "balance_loss_mlp": 1.02365255, "epoch": 0.9828653880839295, "flos": 20262775622400.0, "grad_norm": 2.8305578760206127, "language_loss": 0.73686039, "learning_rate": 3.0580831658204575e-09, "loss": 0.75840533, "num_input_tokens_seen": 176448150, "step": 8174, "time_per_iteration": 2.5221993923187256 }, { "auxiliary_loss_clip": 0.01120459, "auxiliary_loss_mlp": 0.01035325, "balance_loss_clip": 1.04464006, "balance_loss_mlp": 1.0205307, "epoch": 0.9829856309745686, "flos": 21616141282560.0, "grad_norm": 1.5772949114882724, "language_loss": 0.78208101, "learning_rate": 3.015174020009281e-09, "loss": 0.80363882, "num_input_tokens_seen": 176467475, "step": 8175, "time_per_iteration": 2.5725739002227783 }, { "auxiliary_loss_clip": 0.01096982, "auxiliary_loss_mlp": 0.0103882, "balance_loss_clip": 1.04031038, "balance_loss_mlp": 1.02452052, "epoch": 0.9831058738652078, "flos": 23764209396480.0, "grad_norm": 1.9081605146616138, "language_loss": 0.75350142, "learning_rate": 2.9725678152086043e-09, "loss": 0.77485943, "num_input_tokens_seen": 176486045, "step": 8176, "time_per_iteration": 2.663745641708374 }, { "auxiliary_loss_clip": 0.0109217, "auxiliary_loss_mlp": 0.0103596, "balance_loss_clip": 1.03699338, "balance_loss_mlp": 1.02104688, "epoch": 0.9832261167558468, "flos": 11320911072000.0, "grad_norm": 5.342765677834753, "language_loss": 0.8252809, "learning_rate": 2.930264557881257e-09, "loss": 0.84656221, "num_input_tokens_seen": 176501230, "step": 8177, "time_per_iteration": 2.570624351501465 }, { "auxiliary_loss_clip": 0.01034444, "auxiliary_loss_mlp": 0.01001814, "balance_loss_clip": 1.00661576, "balance_loss_mlp": 1.00018048, "epoch": 0.9833463596464859, "flos": 60000304343040.0, "grad_norm": 0.838484757379542, "language_loss": 0.58140361, "learning_rate": 2.8882642544452163e-09, "loss": 0.60176623, "num_input_tokens_seen": 176565955, "step": 8178, "time_per_iteration": 3.184647798538208 }, { "auxiliary_loss_clip": 0.01099294, "auxiliary_loss_mlp": 0.01040073, "balance_loss_clip": 1.03842473, "balance_loss_mlp": 1.02450395, "epoch": 0.983466602537125, "flos": 13626699805440.0, "grad_norm": 2.296525418985248, "language_loss": 0.74903619, "learning_rate": 2.8465669112716083e-09, "loss": 0.77042985, "num_input_tokens_seen": 176583480, "step": 8179, "time_per_iteration": 2.580947160720825 }, { "auxiliary_loss_clip": 0.01120376, "auxiliary_loss_mlp": 0.00772969, "balance_loss_clip": 1.04081678, "balance_loss_mlp": 1.00048113, "epoch": 0.9835868454277641, "flos": 22926844563840.0, "grad_norm": 1.9649536897789863, "language_loss": 0.76862854, "learning_rate": 2.8051725346858177e-09, "loss": 0.78756201, "num_input_tokens_seen": 176603740, "step": 8180, "time_per_iteration": 2.5771801471710205 }, { "auxiliary_loss_clip": 0.01131254, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.04116774, "balance_loss_mlp": 1.01962709, "epoch": 0.9837070883184031, "flos": 27673409341440.0, "grad_norm": 2.285182528754144, "language_loss": 0.71290576, "learning_rate": 2.7640811309674883e-09, "loss": 0.73456609, "num_input_tokens_seen": 176623240, "step": 8181, "time_per_iteration": 2.6178293228149414 }, { "auxiliary_loss_clip": 0.01081908, "auxiliary_loss_mlp": 0.01048239, "balance_loss_clip": 1.03659678, "balance_loss_mlp": 1.03154957, "epoch": 0.9838273312090423, "flos": 29241951425280.0, "grad_norm": 2.0477310158687736, "language_loss": 0.808595, "learning_rate": 2.7232927063498557e-09, "loss": 0.82989645, "num_input_tokens_seen": 176643615, "step": 8182, "time_per_iteration": 2.6979284286499023 }, { "auxiliary_loss_clip": 0.01119236, "auxiliary_loss_mlp": 0.01038698, "balance_loss_clip": 1.04024196, "balance_loss_mlp": 1.02349877, "epoch": 0.9839475740996814, "flos": 40110207304320.0, "grad_norm": 2.2642003095730407, "language_loss": 0.69194919, "learning_rate": 2.682807267020859e-09, "loss": 0.71352857, "num_input_tokens_seen": 176666375, "step": 8183, "time_per_iteration": 2.7345728874206543 }, { "auxiliary_loss_clip": 0.01123433, "auxiliary_loss_mlp": 0.01035137, "balance_loss_clip": 1.04412127, "balance_loss_mlp": 1.01936555, "epoch": 0.9840678169903204, "flos": 24169389788160.0, "grad_norm": 1.8120258653935946, "language_loss": 0.62176383, "learning_rate": 2.642624819121808e-09, "loss": 0.64334953, "num_input_tokens_seen": 176686525, "step": 8184, "time_per_iteration": 2.59295654296875 }, { "auxiliary_loss_clip": 0.011053, "auxiliary_loss_mlp": 0.01037448, "balance_loss_clip": 1.04095972, "balance_loss_mlp": 1.02229047, "epoch": 0.9841880598809596, "flos": 14684484447360.0, "grad_norm": 1.8271328751759617, "language_loss": 0.61730802, "learning_rate": 2.6027453687487154e-09, "loss": 0.63873541, "num_input_tokens_seen": 176703615, "step": 8185, "time_per_iteration": 2.5991053581237793 }, { "auxiliary_loss_clip": 0.01107958, "auxiliary_loss_mlp": 0.01045437, "balance_loss_clip": 1.04046416, "balance_loss_mlp": 1.02902186, "epoch": 0.9843083027715986, "flos": 22344768668160.0, "grad_norm": 2.423996229123359, "language_loss": 0.54041731, "learning_rate": 2.5631689219509643e-09, "loss": 0.56195128, "num_input_tokens_seen": 176722295, "step": 8186, "time_per_iteration": 2.593534231185913 }, { "auxiliary_loss_clip": 0.01105416, "auxiliary_loss_mlp": 0.01036653, "balance_loss_clip": 1.04201603, "balance_loss_mlp": 1.02262235, "epoch": 0.9844285456622377, "flos": 21800111765760.0, "grad_norm": 1.73781954278551, "language_loss": 0.8352176, "learning_rate": 2.523895484732197e-09, "loss": 0.85663831, "num_input_tokens_seen": 176741750, "step": 8187, "time_per_iteration": 2.608278751373291 }, { "auxiliary_loss_clip": 0.01124629, "auxiliary_loss_mlp": 0.01036739, "balance_loss_clip": 1.04189205, "balance_loss_mlp": 1.02003753, "epoch": 0.9845487885528769, "flos": 18035380321920.0, "grad_norm": 2.1170118433139344, "language_loss": 0.74466813, "learning_rate": 2.4849250630505357e-09, "loss": 0.76628178, "num_input_tokens_seen": 176759995, "step": 8188, "time_per_iteration": 2.6069023609161377 }, { "auxiliary_loss_clip": 0.0104293, "auxiliary_loss_mlp": 0.01041308, "balance_loss_clip": 1.0315578, "balance_loss_mlp": 1.02593553, "epoch": 0.9846690314435159, "flos": 25228610974080.0, "grad_norm": 1.7601421012547538, "language_loss": 0.73586214, "learning_rate": 2.4462576628172528e-09, "loss": 0.75670457, "num_input_tokens_seen": 176778625, "step": 8189, "time_per_iteration": 2.73142409324646 }, { "auxiliary_loss_clip": 0.0111892, "auxiliary_loss_mlp": 0.01037189, "balance_loss_clip": 1.04258525, "balance_loss_mlp": 1.02191842, "epoch": 0.984789274334155, "flos": 18552171248640.0, "grad_norm": 2.7913876962924755, "language_loss": 0.74263453, "learning_rate": 2.407893289898766e-09, "loss": 0.76419568, "num_input_tokens_seen": 176797655, "step": 8190, "time_per_iteration": 2.6148884296417236 }, { "auxiliary_loss_clip": 0.01085868, "auxiliary_loss_mlp": 0.01041213, "balance_loss_clip": 1.03546095, "balance_loss_mlp": 1.02418971, "epoch": 0.984909517224794, "flos": 27345437233920.0, "grad_norm": 1.9367687812357197, "language_loss": 0.84112537, "learning_rate": 2.3698319501144202e-09, "loss": 0.86239624, "num_input_tokens_seen": 176818640, "step": 8191, "time_per_iteration": 4.376773118972778 }, { "auxiliary_loss_clip": 0.01125586, "auxiliary_loss_mlp": 0.01039001, "balance_loss_clip": 1.04180276, "balance_loss_mlp": 1.02178717, "epoch": 0.9850297601154332, "flos": 18734058743040.0, "grad_norm": 1.6276529636412118, "language_loss": 0.73338431, "learning_rate": 2.3320736492382644e-09, "loss": 0.75503016, "num_input_tokens_seen": 176837475, "step": 8192, "time_per_iteration": 2.7256999015808105 }, { "auxiliary_loss_clip": 0.01129262, "auxiliary_loss_mlp": 0.01034582, "balance_loss_clip": 1.04341006, "balance_loss_mlp": 1.01918066, "epoch": 0.9851500030060723, "flos": 22308247514880.0, "grad_norm": 1.5972746731755592, "language_loss": 0.67797995, "learning_rate": 2.29461839299816e-09, "loss": 0.69961846, "num_input_tokens_seen": 176857190, "step": 8193, "time_per_iteration": 3.7466213703155518 }, { "auxiliary_loss_clip": 0.01093347, "auxiliary_loss_mlp": 0.01036832, "balance_loss_clip": 1.03809035, "balance_loss_mlp": 1.02088213, "epoch": 0.9852702458967113, "flos": 26353691746560.0, "grad_norm": 1.6684661224822048, "language_loss": 0.79912931, "learning_rate": 2.257466187076229e-09, "loss": 0.82043111, "num_input_tokens_seen": 176876395, "step": 8194, "time_per_iteration": 2.7635931968688965 }, { "auxiliary_loss_clip": 0.01125837, "auxiliary_loss_mlp": 0.00771948, "balance_loss_clip": 1.04310024, "balance_loss_mlp": 1.00050628, "epoch": 0.9853904887873505, "flos": 20883599314560.0, "grad_norm": 2.0958055377891007, "language_loss": 0.71927702, "learning_rate": 2.2206170371081854e-09, "loss": 0.7382549, "num_input_tokens_seen": 176894980, "step": 8195, "time_per_iteration": 2.7084341049194336 }, { "auxiliary_loss_clip": 0.01108578, "auxiliary_loss_mlp": 0.01053573, "balance_loss_clip": 1.04053319, "balance_loss_mlp": 1.03619242, "epoch": 0.9855107316779895, "flos": 25263444188160.0, "grad_norm": 1.6527994308678173, "language_loss": 0.84763622, "learning_rate": 2.1840709486842247e-09, "loss": 0.86925775, "num_input_tokens_seen": 176914600, "step": 8196, "time_per_iteration": 2.839932680130005 }, { "auxiliary_loss_clip": 0.01100854, "auxiliary_loss_mlp": 0.01038194, "balance_loss_clip": 1.03810191, "balance_loss_mlp": 1.02186251, "epoch": 0.9856309745686286, "flos": 19062102677760.0, "grad_norm": 2.022527386019338, "language_loss": 0.78712839, "learning_rate": 2.1478279273481335e-09, "loss": 0.80851889, "num_input_tokens_seen": 176933085, "step": 8197, "time_per_iteration": 2.9394009113311768 }, { "auxiliary_loss_clip": 0.01120574, "auxiliary_loss_mlp": 0.01046003, "balance_loss_clip": 1.04312897, "balance_loss_mlp": 1.03100622, "epoch": 0.9857512174592677, "flos": 34130758060800.0, "grad_norm": 2.504874964598282, "language_loss": 0.80132842, "learning_rate": 2.1118879785981815e-09, "loss": 0.82299417, "num_input_tokens_seen": 176953225, "step": 8198, "time_per_iteration": 3.6938750743865967 }, { "auxiliary_loss_clip": 0.01103667, "auxiliary_loss_mlp": 0.01030347, "balance_loss_clip": 1.04068351, "balance_loss_mlp": 1.01533842, "epoch": 0.9858714603499068, "flos": 25994693266560.0, "grad_norm": 1.6371186626599559, "language_loss": 0.79373193, "learning_rate": 2.0762511078862288e-09, "loss": 0.81507206, "num_input_tokens_seen": 176973570, "step": 8199, "time_per_iteration": 3.6830475330352783 }, { "auxiliary_loss_clip": 0.01115354, "auxiliary_loss_mlp": 0.01040705, "balance_loss_clip": 1.04325151, "balance_loss_mlp": 1.02548218, "epoch": 0.9859917032405459, "flos": 23696230907520.0, "grad_norm": 2.122612086490706, "language_loss": 0.64759374, "learning_rate": 2.0409173206186183e-09, "loss": 0.66915429, "num_input_tokens_seen": 176992810, "step": 8200, "time_per_iteration": 2.636713981628418 }, { "auxiliary_loss_clip": 0.01090614, "auxiliary_loss_mlp": 0.01033371, "balance_loss_clip": 1.03880584, "balance_loss_mlp": 1.01888144, "epoch": 0.986111946131185, "flos": 19938287134080.0, "grad_norm": 2.1393574404429896, "language_loss": 0.87108004, "learning_rate": 2.0058866221550617e-09, "loss": 0.89231986, "num_input_tokens_seen": 177011050, "step": 8201, "time_per_iteration": 2.6369242668151855 }, { "auxiliary_loss_clip": 0.01132049, "auxiliary_loss_mlp": 0.01038867, "balance_loss_clip": 1.04225886, "balance_loss_mlp": 1.02470481, "epoch": 0.9862321890218241, "flos": 19828831415040.0, "grad_norm": 3.3128704707379972, "language_loss": 0.75319445, "learning_rate": 1.971159017809976e-09, "loss": 0.77490354, "num_input_tokens_seen": 177029340, "step": 8202, "time_per_iteration": 2.7369282245635986 }, { "auxiliary_loss_clip": 0.01121128, "auxiliary_loss_mlp": 0.01037623, "balance_loss_clip": 1.0420146, "balance_loss_mlp": 1.02139902, "epoch": 0.9863524319124631, "flos": 21652051904640.0, "grad_norm": 2.317330274416686, "language_loss": 0.77650487, "learning_rate": 1.93673451285159e-09, "loss": 0.79809242, "num_input_tokens_seen": 177048390, "step": 8203, "time_per_iteration": 2.659965991973877 }, { "auxiliary_loss_clip": 0.01017095, "auxiliary_loss_mlp": 0.01002565, "balance_loss_clip": 1.0071888, "balance_loss_mlp": 1.0008601, "epoch": 0.9864726748031023, "flos": 52769977920000.0, "grad_norm": 0.7470074931667272, "language_loss": 0.56481028, "learning_rate": 1.9026131125019495e-09, "loss": 0.58500689, "num_input_tokens_seen": 177105760, "step": 8204, "time_per_iteration": 3.25224232673645 }, { "auxiliary_loss_clip": 0.01111653, "auxiliary_loss_mlp": 0.01040338, "balance_loss_clip": 1.03956366, "balance_loss_mlp": 1.0257585, "epoch": 0.9865929176937414, "flos": 23364631526400.0, "grad_norm": 2.8799941033985244, "language_loss": 0.87068105, "learning_rate": 1.8687948219371363e-09, "loss": 0.89220095, "num_input_tokens_seen": 177124985, "step": 8205, "time_per_iteration": 2.6937172412872314 }, { "auxiliary_loss_clip": 0.01136204, "auxiliary_loss_mlp": 0.01044263, "balance_loss_clip": 1.04153645, "balance_loss_mlp": 1.02718019, "epoch": 0.9867131605843804, "flos": 21616679986560.0, "grad_norm": 2.557083209378011, "language_loss": 0.88681459, "learning_rate": 1.835279646287491e-09, "loss": 0.90861917, "num_input_tokens_seen": 177142995, "step": 8206, "time_per_iteration": 2.6169865131378174 }, { "auxiliary_loss_clip": 0.01126186, "auxiliary_loss_mlp": 0.01040988, "balance_loss_clip": 1.04298568, "balance_loss_mlp": 1.02419114, "epoch": 0.9868334034750196, "flos": 22271403139200.0, "grad_norm": 1.925058607153514, "language_loss": 0.76683462, "learning_rate": 1.8020675906371685e-09, "loss": 0.78850633, "num_input_tokens_seen": 177162390, "step": 8207, "time_per_iteration": 2.811502695083618 }, { "auxiliary_loss_clip": 0.01076774, "auxiliary_loss_mlp": 0.01034571, "balance_loss_clip": 1.03601527, "balance_loss_mlp": 1.02030182, "epoch": 0.9869536463656586, "flos": 25809573548160.0, "grad_norm": 1.9111896409142584, "language_loss": 0.75148374, "learning_rate": 1.7691586600243612e-09, "loss": 0.77259713, "num_input_tokens_seen": 177181290, "step": 8208, "time_per_iteration": 2.789980173110962 }, { "auxiliary_loss_clip": 0.01102938, "auxiliary_loss_mlp": 0.01036693, "balance_loss_clip": 1.04071903, "balance_loss_mlp": 1.02161264, "epoch": 0.9870738892562977, "flos": 16398500613120.0, "grad_norm": 2.7659986043739417, "language_loss": 0.87065876, "learning_rate": 1.7365528594415202e-09, "loss": 0.89205509, "num_input_tokens_seen": 177195360, "step": 8209, "time_per_iteration": 2.716320514678955 }, { "auxiliary_loss_clip": 0.01122303, "auxiliary_loss_mlp": 0.00771944, "balance_loss_clip": 1.04058313, "balance_loss_mlp": 1.00052917, "epoch": 0.9871941321469369, "flos": 35481358373760.0, "grad_norm": 1.744124778256811, "language_loss": 0.67619014, "learning_rate": 1.7042501938346888e-09, "loss": 0.69513261, "num_input_tokens_seen": 177218090, "step": 8210, "time_per_iteration": 2.7753796577453613 }, { "auxiliary_loss_clip": 0.01093547, "auxiliary_loss_mlp": 0.01034199, "balance_loss_clip": 1.03593898, "balance_loss_mlp": 1.01879716, "epoch": 0.9873143750375759, "flos": 21434217874560.0, "grad_norm": 1.8314149629540335, "language_loss": 0.76616442, "learning_rate": 1.6722506681043913e-09, "loss": 0.78744185, "num_input_tokens_seen": 177237050, "step": 8211, "time_per_iteration": 2.690859794616699 }, { "auxiliary_loss_clip": 0.01113847, "auxiliary_loss_mlp": 0.01036272, "balance_loss_clip": 1.04132485, "balance_loss_mlp": 1.02178764, "epoch": 0.987434617928215, "flos": 16326499800960.0, "grad_norm": 2.5542697132157, "language_loss": 0.69211543, "learning_rate": 1.640554287104745e-09, "loss": 0.71361661, "num_input_tokens_seen": 177255325, "step": 8212, "time_per_iteration": 2.6814565658569336 }, { "auxiliary_loss_clip": 0.01095407, "auxiliary_loss_mlp": 0.01039399, "balance_loss_clip": 1.03579628, "balance_loss_mlp": 1.02348447, "epoch": 0.9875548608188541, "flos": 17851984456320.0, "grad_norm": 2.1627981849323525, "language_loss": 0.80419707, "learning_rate": 1.609161055644348e-09, "loss": 0.82554513, "num_input_tokens_seen": 177271250, "step": 8213, "time_per_iteration": 2.944013833999634 }, { "auxiliary_loss_clip": 0.01127603, "auxiliary_loss_mlp": 0.01042726, "balance_loss_clip": 1.04200745, "balance_loss_mlp": 1.02774692, "epoch": 0.9876751037094932, "flos": 26132876887680.0, "grad_norm": 2.981767058345748, "language_loss": 0.68268269, "learning_rate": 1.5780709784849467e-09, "loss": 0.704386, "num_input_tokens_seen": 177288270, "step": 8214, "time_per_iteration": 2.758976459503174 }, { "auxiliary_loss_clip": 0.01076852, "auxiliary_loss_mlp": 0.01036136, "balance_loss_clip": 1.03962445, "balance_loss_mlp": 1.01912487, "epoch": 0.9877953466001322, "flos": 15991344973440.0, "grad_norm": 2.201460500146699, "language_loss": 0.82005179, "learning_rate": 1.5472840603436565e-09, "loss": 0.84118164, "num_input_tokens_seen": 177305500, "step": 8215, "time_per_iteration": 2.754570722579956 }, { "auxiliary_loss_clip": 0.01107696, "auxiliary_loss_mlp": 0.01042587, "balance_loss_clip": 1.0401566, "balance_loss_mlp": 1.02674377, "epoch": 0.9879155894907714, "flos": 18806777827200.0, "grad_norm": 2.0476949227625147, "language_loss": 0.78133118, "learning_rate": 1.5168003058900757e-09, "loss": 0.80283397, "num_input_tokens_seen": 177323500, "step": 8216, "time_per_iteration": 2.719987630844116 }, { "auxiliary_loss_clip": 0.01094012, "auxiliary_loss_mlp": 0.01041828, "balance_loss_clip": 1.03884184, "balance_loss_mlp": 1.0261879, "epoch": 0.9880358323814105, "flos": 22382044007040.0, "grad_norm": 2.419050119149351, "language_loss": 0.9235425, "learning_rate": 1.4866197197491715e-09, "loss": 0.94490087, "num_input_tokens_seen": 177342860, "step": 8217, "time_per_iteration": 3.82437801361084 }, { "auxiliary_loss_clip": 0.01126601, "auxiliary_loss_mlp": 0.00773637, "balance_loss_clip": 1.0425638, "balance_loss_mlp": 1.00061333, "epoch": 0.9881560752720495, "flos": 15668831733120.0, "grad_norm": 5.233866232627431, "language_loss": 0.78816348, "learning_rate": 1.4567423064988371e-09, "loss": 0.80716586, "num_input_tokens_seen": 177360210, "step": 8218, "time_per_iteration": 2.7260239124298096 }, { "auxiliary_loss_clip": 0.01131694, "auxiliary_loss_mlp": 0.01037028, "balance_loss_clip": 1.04216397, "balance_loss_mlp": 1.0217334, "epoch": 0.9882763181626887, "flos": 21500113374720.0, "grad_norm": 1.9406063038177597, "language_loss": 0.78368288, "learning_rate": 1.4271680706718913e-09, "loss": 0.80537015, "num_input_tokens_seen": 177377885, "step": 8219, "time_per_iteration": 3.488830089569092 }, { "auxiliary_loss_clip": 0.01125129, "auxiliary_loss_mlp": 0.01042322, "balance_loss_clip": 1.04541373, "balance_loss_mlp": 1.02589464, "epoch": 0.9883965610533277, "flos": 28034598551040.0, "grad_norm": 2.4517149316380764, "language_loss": 0.82639074, "learning_rate": 1.3978970167543013e-09, "loss": 0.84806526, "num_input_tokens_seen": 177398065, "step": 8220, "time_per_iteration": 2.663893938064575 }, { "auxiliary_loss_clip": 0.01099065, "auxiliary_loss_mlp": 0.0104403, "balance_loss_clip": 1.03912807, "balance_loss_mlp": 1.02810335, "epoch": 0.9885168039439668, "flos": 14098601710080.0, "grad_norm": 2.6930781337550735, "language_loss": 0.78082895, "learning_rate": 1.3689291491867372e-09, "loss": 0.80225992, "num_input_tokens_seen": 177416380, "step": 8221, "time_per_iteration": 2.674772262573242 }, { "auxiliary_loss_clip": 0.01133985, "auxiliary_loss_mlp": 0.01039496, "balance_loss_clip": 1.0432477, "balance_loss_mlp": 1.02420092, "epoch": 0.988637046834606, "flos": 26432013352320.0, "grad_norm": 1.8758896547258397, "language_loss": 0.73596781, "learning_rate": 1.3402644723636836e-09, "loss": 0.75770271, "num_input_tokens_seen": 177438410, "step": 8222, "time_per_iteration": 2.6153981685638428 }, { "auxiliary_loss_clip": 0.01103867, "auxiliary_loss_mlp": 0.01034252, "balance_loss_clip": 1.04176164, "balance_loss_mlp": 1.01915455, "epoch": 0.988757289725245, "flos": 25229113764480.0, "grad_norm": 16.92755560466385, "language_loss": 0.83533329, "learning_rate": 1.311902990633218e-09, "loss": 0.85671449, "num_input_tokens_seen": 177457375, "step": 8223, "time_per_iteration": 2.727306842803955 }, { "auxiliary_loss_clip": 0.01100338, "auxiliary_loss_mlp": 0.01043718, "balance_loss_clip": 1.03743267, "balance_loss_mlp": 1.02801847, "epoch": 0.9888775326158841, "flos": 26359042872960.0, "grad_norm": 1.9481493260578078, "language_loss": 0.71305335, "learning_rate": 1.2838447082978987e-09, "loss": 0.73449385, "num_input_tokens_seen": 177478530, "step": 8224, "time_per_iteration": 3.8104679584503174 }, { "auxiliary_loss_clip": 0.01115154, "auxiliary_loss_mlp": 0.01037817, "balance_loss_clip": 1.03976846, "balance_loss_mlp": 1.02254009, "epoch": 0.9889977755065231, "flos": 24316120846080.0, "grad_norm": 2.583901074271826, "language_loss": 0.83551228, "learning_rate": 1.2560896296143208e-09, "loss": 0.85704195, "num_input_tokens_seen": 177496995, "step": 8225, "time_per_iteration": 2.7084195613861084 }, { "auxiliary_loss_clip": 0.01133336, "auxiliary_loss_mlp": 0.01036971, "balance_loss_clip": 1.04398942, "balance_loss_mlp": 1.02201033, "epoch": 0.9891180183971623, "flos": 18951066760320.0, "grad_norm": 2.878600659095496, "language_loss": 0.83086807, "learning_rate": 1.2286377587926722e-09, "loss": 0.85257119, "num_input_tokens_seen": 177513785, "step": 8226, "time_per_iteration": 2.671627998352051 }, { "auxiliary_loss_clip": 0.01129397, "auxiliary_loss_mlp": 0.010428, "balance_loss_clip": 1.04038239, "balance_loss_mlp": 1.02648044, "epoch": 0.9892382612878013, "flos": 26176580760960.0, "grad_norm": 2.4669313169172207, "language_loss": 0.7453742, "learning_rate": 1.2014890999973992e-09, "loss": 0.76709616, "num_input_tokens_seen": 177530705, "step": 8227, "time_per_iteration": 2.581542491912842 }, { "auxiliary_loss_clip": 0.01129922, "auxiliary_loss_mlp": 0.01034949, "balance_loss_clip": 1.04256499, "balance_loss_mlp": 1.01938069, "epoch": 0.9893585041784404, "flos": 25449605400960.0, "grad_norm": 1.6437308142125522, "language_loss": 0.78302264, "learning_rate": 1.1746436573472073e-09, "loss": 0.80467141, "num_input_tokens_seen": 177552440, "step": 8228, "time_per_iteration": 2.750913381576538 }, { "auxiliary_loss_clip": 0.01116446, "auxiliary_loss_mlp": 0.010407, "balance_loss_clip": 1.0415107, "balance_loss_mlp": 1.02545357, "epoch": 0.9894787470690796, "flos": 20189302352640.0, "grad_norm": 2.0964305333765134, "language_loss": 0.6886431, "learning_rate": 1.1481014349141726e-09, "loss": 0.71021456, "num_input_tokens_seen": 177569660, "step": 8229, "time_per_iteration": 2.5763375759124756 }, { "auxiliary_loss_clip": 0.01112394, "auxiliary_loss_mlp": 0.01036868, "balance_loss_clip": 1.04243922, "balance_loss_mlp": 1.02059627, "epoch": 0.9895989899597186, "flos": 24644308435200.0, "grad_norm": 2.117425512521018, "language_loss": 0.84261191, "learning_rate": 1.121862436724852e-09, "loss": 0.86410451, "num_input_tokens_seen": 177588500, "step": 8230, "time_per_iteration": 2.7599918842315674 }, { "auxiliary_loss_clip": 0.01119522, "auxiliary_loss_mlp": 0.01041138, "balance_loss_clip": 1.04387975, "balance_loss_mlp": 1.02615309, "epoch": 0.9897192328503577, "flos": 21799034357760.0, "grad_norm": 1.9223187461002025, "language_loss": 0.70622945, "learning_rate": 1.0959266667598388e-09, "loss": 0.72783607, "num_input_tokens_seen": 177607315, "step": 8231, "time_per_iteration": 2.6299943923950195 }, { "auxiliary_loss_clip": 0.01104103, "auxiliary_loss_mlp": 0.01038883, "balance_loss_clip": 1.04219389, "balance_loss_mlp": 1.02238393, "epoch": 0.9898394757409968, "flos": 21325229032320.0, "grad_norm": 2.066549064988311, "language_loss": 0.74790114, "learning_rate": 1.0702941289533196e-09, "loss": 0.7693311, "num_input_tokens_seen": 177625990, "step": 8232, "time_per_iteration": 2.755615234375 }, { "auxiliary_loss_clip": 0.01096089, "auxiliary_loss_mlp": 0.01042865, "balance_loss_clip": 1.04146612, "balance_loss_mlp": 1.02795148, "epoch": 0.9899597186316359, "flos": 18545024442240.0, "grad_norm": 2.261372522663516, "language_loss": 0.88766241, "learning_rate": 1.0449648271939615e-09, "loss": 0.90905201, "num_input_tokens_seen": 177642335, "step": 8233, "time_per_iteration": 2.7755930423736572 }, { "auxiliary_loss_clip": 0.01084091, "auxiliary_loss_mlp": 0.007719, "balance_loss_clip": 1.04046798, "balance_loss_mlp": 1.00053525, "epoch": 0.990079961522275, "flos": 23766723348480.0, "grad_norm": 1.558662060313978, "language_loss": 0.72760189, "learning_rate": 1.0199387653240243e-09, "loss": 0.74616182, "num_input_tokens_seen": 177662025, "step": 8234, "time_per_iteration": 2.757411241531372 }, { "auxiliary_loss_clip": 0.01101141, "auxiliary_loss_mlp": 0.01039571, "balance_loss_clip": 1.04046154, "balance_loss_mlp": 1.02500355, "epoch": 0.9902002044129141, "flos": 16399182971520.0, "grad_norm": 1.8852390163080313, "language_loss": 0.70626616, "learning_rate": 9.952159471400267e-10, "loss": 0.72767329, "num_input_tokens_seen": 177679065, "step": 8235, "time_per_iteration": 2.8347434997558594 }, { "auxiliary_loss_clip": 0.01118353, "auxiliary_loss_mlp": 0.00771799, "balance_loss_clip": 1.04063916, "balance_loss_mlp": 1.00051188, "epoch": 0.9903204473035532, "flos": 22559657783040.0, "grad_norm": 1.9740675841294175, "language_loss": 0.8468048, "learning_rate": 9.707963763923022e-10, "loss": 0.86570632, "num_input_tokens_seen": 177698115, "step": 8236, "time_per_iteration": 2.7120087146759033 }, { "auxiliary_loss_clip": 0.01106589, "auxiliary_loss_mlp": 0.01037438, "balance_loss_clip": 1.04021108, "balance_loss_mlp": 1.02355611, "epoch": 0.9904406901941922, "flos": 16144001775360.0, "grad_norm": 2.1968786561070237, "language_loss": 0.79085779, "learning_rate": 9.466800567854427e-10, "loss": 0.81229806, "num_input_tokens_seen": 177716715, "step": 8237, "time_per_iteration": 2.6526260375976562 }, { "auxiliary_loss_clip": 0.01092438, "auxiliary_loss_mlp": 0.01051947, "balance_loss_clip": 1.03701496, "balance_loss_mlp": 1.03463769, "epoch": 0.9905609330848314, "flos": 26651499408000.0, "grad_norm": 1.9882593659338355, "language_loss": 0.68414652, "learning_rate": 9.228669919778553e-10, "loss": 0.70559043, "num_input_tokens_seen": 177735640, "step": 8238, "time_per_iteration": 2.716132402420044 }, { "auxiliary_loss_clip": 0.0110237, "auxiliary_loss_mlp": 0.01039325, "balance_loss_clip": 1.03960693, "balance_loss_mlp": 1.02360094, "epoch": 0.9906811759754705, "flos": 23111820627840.0, "grad_norm": 2.135939050469561, "language_loss": 0.79830635, "learning_rate": 8.993571855817617e-10, "loss": 0.81972337, "num_input_tokens_seen": 177754470, "step": 8239, "time_per_iteration": 2.7266030311584473 }, { "auxiliary_loss_clip": 0.01118646, "auxiliary_loss_mlp": 0.01036109, "balance_loss_clip": 1.04053116, "balance_loss_mlp": 1.01990867, "epoch": 0.9908014188661095, "flos": 22090593052800.0, "grad_norm": 1.846073472306073, "language_loss": 0.74899656, "learning_rate": 8.761506411638642e-10, "loss": 0.77054405, "num_input_tokens_seen": 177773935, "step": 8240, "time_per_iteration": 2.70306396484375 }, { "auxiliary_loss_clip": 0.01102972, "auxiliary_loss_mlp": 0.01039088, "balance_loss_clip": 1.04082978, "balance_loss_mlp": 1.02288747, "epoch": 0.9909216617567487, "flos": 19242948677760.0, "grad_norm": 1.8117961405507332, "language_loss": 0.74134272, "learning_rate": 8.53247362244236e-10, "loss": 0.76276332, "num_input_tokens_seen": 177792745, "step": 8241, "time_per_iteration": 2.775250196456909 }, { "auxiliary_loss_clip": 0.0110524, "auxiliary_loss_mlp": 0.01035218, "balance_loss_clip": 1.04059482, "balance_loss_mlp": 1.02029324, "epoch": 0.9910419046473877, "flos": 23621213352960.0, "grad_norm": 1.6455526212676141, "language_loss": 0.68264377, "learning_rate": 8.306473522976532e-10, "loss": 0.7040484, "num_input_tokens_seen": 177812150, "step": 8242, "time_per_iteration": 2.716156482696533 }, { "auxiliary_loss_clip": 0.01131433, "auxiliary_loss_mlp": 0.01042135, "balance_loss_clip": 1.04283905, "balance_loss_mlp": 1.02654839, "epoch": 0.9911621475380268, "flos": 22711380831360.0, "grad_norm": 1.7952522344656272, "language_loss": 0.71927184, "learning_rate": 8.083506147522623e-10, "loss": 0.74100757, "num_input_tokens_seen": 177831545, "step": 8243, "time_per_iteration": 2.6746017932891846 }, { "auxiliary_loss_clip": 0.01113138, "auxiliary_loss_mlp": 0.01039206, "balance_loss_clip": 1.03961349, "balance_loss_mlp": 1.02438831, "epoch": 0.991282390428666, "flos": 13516956777600.0, "grad_norm": 2.538746213956847, "language_loss": 0.85022473, "learning_rate": 7.863571529906909e-10, "loss": 0.87174821, "num_input_tokens_seen": 177847130, "step": 8244, "time_per_iteration": 3.5894248485565186 }, { "auxiliary_loss_clip": 0.01025371, "auxiliary_loss_mlp": 0.01002076, "balance_loss_clip": 1.00662088, "balance_loss_mlp": 1.00049055, "epoch": 0.991402633319305, "flos": 61830492071040.0, "grad_norm": 0.7201218858917926, "language_loss": 0.59655309, "learning_rate": 7.646669703489372e-10, "loss": 0.61682755, "num_input_tokens_seen": 177911440, "step": 8245, "time_per_iteration": 4.273658275604248 }, { "auxiliary_loss_clip": 0.01037147, "auxiliary_loss_mlp": 0.01040098, "balance_loss_clip": 1.03217638, "balance_loss_mlp": 1.02432692, "epoch": 0.9915228762099441, "flos": 18770148933120.0, "grad_norm": 1.9242085883953965, "language_loss": 0.57446939, "learning_rate": 7.432800701177023e-10, "loss": 0.59524184, "num_input_tokens_seen": 177929440, "step": 8246, "time_per_iteration": 3.3083689212799072 }, { "auxiliary_loss_clip": 0.01017786, "auxiliary_loss_mlp": 0.01002338, "balance_loss_clip": 1.00875461, "balance_loss_mlp": 1.00092554, "epoch": 0.9916431191005832, "flos": 65936660244480.0, "grad_norm": 0.7988272206320416, "language_loss": 0.57734787, "learning_rate": 7.221964555415017e-10, "loss": 0.59754908, "num_input_tokens_seen": 177989100, "step": 8247, "time_per_iteration": 3.55081844329834 }, { "auxiliary_loss_clip": 0.01101699, "auxiliary_loss_mlp": 0.01034557, "balance_loss_clip": 1.03875709, "balance_loss_mlp": 1.01941705, "epoch": 0.9917633619912223, "flos": 16581573256320.0, "grad_norm": 1.8270216721088903, "language_loss": 0.75095177, "learning_rate": 7.01416129818222e-10, "loss": 0.77231437, "num_input_tokens_seen": 178006720, "step": 8248, "time_per_iteration": 2.68025279045105 }, { "auxiliary_loss_clip": 0.01103427, "auxiliary_loss_mlp": 0.010372, "balance_loss_clip": 1.04181659, "balance_loss_mlp": 1.02284694, "epoch": 0.9918836048818613, "flos": 25411108999680.0, "grad_norm": 1.9019113592875605, "language_loss": 0.58940566, "learning_rate": 6.809390961006745e-10, "loss": 0.61081195, "num_input_tokens_seen": 178026850, "step": 8249, "time_per_iteration": 3.077228546142578 }, { "auxiliary_loss_clip": 0.01106131, "auxiliary_loss_mlp": 0.01038805, "balance_loss_clip": 1.04158282, "balance_loss_mlp": 1.0235579, "epoch": 0.9920038477725005, "flos": 25046867134080.0, "grad_norm": 1.8688565848239589, "language_loss": 0.68552661, "learning_rate": 6.607653574948191e-10, "loss": 0.70697594, "num_input_tokens_seen": 178047630, "step": 8250, "time_per_iteration": 4.033844947814941 }, { "auxiliary_loss_clip": 0.01111413, "auxiliary_loss_mlp": 0.01036708, "balance_loss_clip": 1.03931415, "balance_loss_mlp": 1.02233171, "epoch": 0.9921240906631396, "flos": 21829773421440.0, "grad_norm": 1.8982108440388827, "language_loss": 0.81812918, "learning_rate": 6.408949170613187e-10, "loss": 0.83961046, "num_input_tokens_seen": 178066895, "step": 8251, "time_per_iteration": 2.6794722080230713 }, { "auxiliary_loss_clip": 0.01104621, "auxiliary_loss_mlp": 0.01040723, "balance_loss_clip": 1.03986979, "balance_loss_mlp": 1.02277005, "epoch": 0.9922443335537786, "flos": 24864225454080.0, "grad_norm": 1.626787403179129, "language_loss": 0.81651038, "learning_rate": 6.213277778144288e-10, "loss": 0.83796382, "num_input_tokens_seen": 178088540, "step": 8252, "time_per_iteration": 2.8310089111328125 }, { "auxiliary_loss_clip": 0.01070213, "auxiliary_loss_mlp": 0.01039996, "balance_loss_clip": 1.0349195, "balance_loss_mlp": 1.02378356, "epoch": 0.9923645764444178, "flos": 21613088626560.0, "grad_norm": 3.1777217380199434, "language_loss": 0.6710211, "learning_rate": 6.020639427224416e-10, "loss": 0.69212323, "num_input_tokens_seen": 178106185, "step": 8253, "time_per_iteration": 2.865274667739868 }, { "auxiliary_loss_clip": 0.01106377, "auxiliary_loss_mlp": 0.01038223, "balance_loss_clip": 1.04028523, "balance_loss_mlp": 1.02231443, "epoch": 0.9924848193350568, "flos": 25001798544000.0, "grad_norm": 1.9880076276715628, "language_loss": 0.72492397, "learning_rate": 5.831034147076864e-10, "loss": 0.74636996, "num_input_tokens_seen": 178123435, "step": 8254, "time_per_iteration": 2.8267505168914795 }, { "auxiliary_loss_clip": 0.01023764, "auxiliary_loss_mlp": 0.01003452, "balance_loss_clip": 1.00585735, "balance_loss_mlp": 1.00199723, "epoch": 0.9926050622256959, "flos": 68912543151360.0, "grad_norm": 0.686605521319258, "language_loss": 0.55654669, "learning_rate": 5.644461966463065e-10, "loss": 0.57681882, "num_input_tokens_seen": 178191045, "step": 8255, "time_per_iteration": 3.355379343032837 }, { "auxiliary_loss_clip": 0.01106099, "auxiliary_loss_mlp": 0.01040264, "balance_loss_clip": 1.04119933, "balance_loss_mlp": 1.02505279, "epoch": 0.9927253051163349, "flos": 20923675914240.0, "grad_norm": 1.7781028579325278, "language_loss": 0.7601577, "learning_rate": 5.460922913687049e-10, "loss": 0.78162134, "num_input_tokens_seen": 178210135, "step": 8256, "time_per_iteration": 2.8453280925750732 }, { "auxiliary_loss_clip": 0.01081765, "auxiliary_loss_mlp": 0.00773638, "balance_loss_clip": 1.03636479, "balance_loss_mlp": 1.0005095, "epoch": 0.9928455480069741, "flos": 22308211601280.0, "grad_norm": 2.203421906298141, "language_loss": 0.75617856, "learning_rate": 5.280417016593208e-10, "loss": 0.77473259, "num_input_tokens_seen": 178229925, "step": 8257, "time_per_iteration": 2.8207640647888184 }, { "auxiliary_loss_clip": 0.01117178, "auxiliary_loss_mlp": 0.0077045, "balance_loss_clip": 1.04407549, "balance_loss_mlp": 1.00057197, "epoch": 0.9929657908976132, "flos": 17383889393280.0, "grad_norm": 1.6914806289290998, "language_loss": 0.74721491, "learning_rate": 5.102944302559642e-10, "loss": 0.76609117, "num_input_tokens_seen": 178247420, "step": 8258, "time_per_iteration": 2.6367123126983643 }, { "auxiliary_loss_clip": 0.01078614, "auxiliary_loss_mlp": 0.01040653, "balance_loss_clip": 1.03928471, "balance_loss_mlp": 1.02507222, "epoch": 0.9930860337882522, "flos": 22674680110080.0, "grad_norm": 2.053761342057738, "language_loss": 0.7996583, "learning_rate": 4.9285047985137e-10, "loss": 0.82085091, "num_input_tokens_seen": 178266840, "step": 8259, "time_per_iteration": 2.8029661178588867 }, { "auxiliary_loss_clip": 0.01123634, "auxiliary_loss_mlp": 0.01046626, "balance_loss_clip": 1.04229903, "balance_loss_mlp": 1.03180861, "epoch": 0.9932062766788914, "flos": 28147789284480.0, "grad_norm": 2.38848902044791, "language_loss": 0.74565685, "learning_rate": 4.757098530916436e-10, "loss": 0.7673595, "num_input_tokens_seen": 178287285, "step": 8260, "time_per_iteration": 2.6706268787384033 }, { "auxiliary_loss_clip": 0.01125461, "auxiliary_loss_mlp": 0.01048348, "balance_loss_clip": 1.04483795, "balance_loss_mlp": 1.03215909, "epoch": 0.9933265195695304, "flos": 20156659868160.0, "grad_norm": 5.084780077299812, "language_loss": 0.77435482, "learning_rate": 4.5887255257670563e-10, "loss": 0.79609287, "num_input_tokens_seen": 178304325, "step": 8261, "time_per_iteration": 2.6947121620178223 }, { "auxiliary_loss_clip": 0.01131522, "auxiliary_loss_mlp": 0.01039306, "balance_loss_clip": 1.04180181, "balance_loss_mlp": 1.0219965, "epoch": 0.9934467624601695, "flos": 21362037494400.0, "grad_norm": 1.9885093252552783, "language_loss": 0.76703191, "learning_rate": 4.4233858086117906e-10, "loss": 0.78874016, "num_input_tokens_seen": 178322850, "step": 8262, "time_per_iteration": 2.803929328918457 }, { "auxiliary_loss_clip": 0.01083624, "auxiliary_loss_mlp": 0.01051493, "balance_loss_clip": 1.04027379, "balance_loss_mlp": 1.0354116, "epoch": 0.9935670053508087, "flos": 19756040503680.0, "grad_norm": 2.1571250905185186, "language_loss": 0.68028694, "learning_rate": 4.261079404528356e-10, "loss": 0.7016381, "num_input_tokens_seen": 178342330, "step": 8263, "time_per_iteration": 2.7293496131896973 }, { "auxiliary_loss_clip": 0.01119103, "auxiliary_loss_mlp": 0.01039751, "balance_loss_clip": 1.04239964, "balance_loss_mlp": 1.02287114, "epoch": 0.9936872482414477, "flos": 21978838863360.0, "grad_norm": 1.7201799521247134, "language_loss": 0.68842673, "learning_rate": 4.1018063381437205e-10, "loss": 0.7100153, "num_input_tokens_seen": 178362715, "step": 8264, "time_per_iteration": 2.941157341003418 }, { "auxiliary_loss_clip": 0.01021839, "auxiliary_loss_mlp": 0.01001723, "balance_loss_clip": 1.00863838, "balance_loss_mlp": 1.00023293, "epoch": 0.9938074911320868, "flos": 69810667839360.0, "grad_norm": 0.8665716033186148, "language_loss": 0.61101604, "learning_rate": 3.9455666336141167e-10, "loss": 0.63125169, "num_input_tokens_seen": 178426495, "step": 8265, "time_per_iteration": 3.255824327468872 }, { "auxiliary_loss_clip": 0.01132175, "auxiliary_loss_mlp": 0.0104086, "balance_loss_clip": 1.04346776, "balance_loss_mlp": 1.02496302, "epoch": 0.9939277340227259, "flos": 15084170058240.0, "grad_norm": 2.6207376722658746, "language_loss": 0.83043098, "learning_rate": 3.7923603146450267e-10, "loss": 0.85216135, "num_input_tokens_seen": 178442555, "step": 8266, "time_per_iteration": 2.7233264446258545 }, { "auxiliary_loss_clip": 0.0109775, "auxiliary_loss_mlp": 0.01036582, "balance_loss_clip": 1.03902078, "balance_loss_mlp": 1.02201462, "epoch": 0.994047976913365, "flos": 17712364291200.0, "grad_norm": 1.924388235708446, "language_loss": 0.80958408, "learning_rate": 3.642187404473418e-10, "loss": 0.83092737, "num_input_tokens_seen": 178460715, "step": 8267, "time_per_iteration": 2.771538734436035 }, { "auxiliary_loss_clip": 0.01119176, "auxiliary_loss_mlp": 0.01043235, "balance_loss_clip": 1.04080701, "balance_loss_mlp": 1.02799964, "epoch": 0.994168219804004, "flos": 19171558396800.0, "grad_norm": 2.177288052866248, "language_loss": 0.86087549, "learning_rate": 3.495047925885508e-10, "loss": 0.88249958, "num_input_tokens_seen": 178479050, "step": 8268, "time_per_iteration": 2.7843635082244873 }, { "auxiliary_loss_clip": 0.0110253, "auxiliary_loss_mlp": 0.01044858, "balance_loss_clip": 1.03733063, "balance_loss_mlp": 1.02878857, "epoch": 0.9942884626946432, "flos": 17851589406720.0, "grad_norm": 2.427570947121195, "language_loss": 0.82556009, "learning_rate": 3.350941901199e-10, "loss": 0.84703398, "num_input_tokens_seen": 178495970, "step": 8269, "time_per_iteration": 2.807126045227051 }, { "auxiliary_loss_clip": 0.01107718, "auxiliary_loss_mlp": 0.01038689, "balance_loss_clip": 1.04049587, "balance_loss_mlp": 1.02244043, "epoch": 0.9944087055852823, "flos": 18796578364800.0, "grad_norm": 2.3340367792879353, "language_loss": 0.83559585, "learning_rate": 3.2098693522764066e-10, "loss": 0.85705996, "num_input_tokens_seen": 178509170, "step": 8270, "time_per_iteration": 3.842154026031494 }, { "auxiliary_loss_clip": 0.01112506, "auxiliary_loss_mlp": 0.00772206, "balance_loss_clip": 1.04211271, "balance_loss_mlp": 1.00055861, "epoch": 0.9945289484759213, "flos": 20996969616000.0, "grad_norm": 3.2908616883806743, "language_loss": 0.81325549, "learning_rate": 3.071830300516165e-10, "loss": 0.8321026, "num_input_tokens_seen": 178527000, "step": 8271, "time_per_iteration": 3.6305205821990967 }, { "auxiliary_loss_clip": 0.01126606, "auxiliary_loss_mlp": 0.01044796, "balance_loss_clip": 1.04426789, "balance_loss_mlp": 1.02821445, "epoch": 0.9946491913665605, "flos": 14756952136320.0, "grad_norm": 1.9410618388336693, "language_loss": 0.71022439, "learning_rate": 2.9368247668615234e-10, "loss": 0.73193842, "num_input_tokens_seen": 178545590, "step": 8272, "time_per_iteration": 2.6991679668426514 }, { "auxiliary_loss_clip": 0.01140513, "auxiliary_loss_mlp": 0.01043974, "balance_loss_clip": 1.04698253, "balance_loss_mlp": 1.02729642, "epoch": 0.9947694342571995, "flos": 12669931186560.0, "grad_norm": 4.276936728056863, "language_loss": 0.61317301, "learning_rate": 2.804852771789434e-10, "loss": 0.63501787, "num_input_tokens_seen": 178558890, "step": 8273, "time_per_iteration": 2.6187868118286133 }, { "auxiliary_loss_clip": 0.01127714, "auxiliary_loss_mlp": 0.01038521, "balance_loss_clip": 1.04129601, "balance_loss_mlp": 1.02366185, "epoch": 0.9948896771478386, "flos": 18843442634880.0, "grad_norm": 1.6319906148949315, "language_loss": 0.55497396, "learning_rate": 2.675914335321661e-10, "loss": 0.57663637, "num_input_tokens_seen": 178577645, "step": 8274, "time_per_iteration": 2.6452763080596924 }, { "auxiliary_loss_clip": 0.01127059, "auxiliary_loss_mlp": 0.01038963, "balance_loss_clip": 1.04247952, "balance_loss_mlp": 1.02171397, "epoch": 0.9950099200384778, "flos": 24900207903360.0, "grad_norm": 2.3921014387457733, "language_loss": 0.79682314, "learning_rate": 2.550009477018111e-10, "loss": 0.81848335, "num_input_tokens_seen": 178596415, "step": 8275, "time_per_iteration": 2.6598894596099854 }, { "auxiliary_loss_clip": 0.01107656, "auxiliary_loss_mlp": 0.00772178, "balance_loss_clip": 1.04067194, "balance_loss_mlp": 1.00054276, "epoch": 0.9951301629291168, "flos": 23733613987200.0, "grad_norm": 1.9949210991099073, "language_loss": 0.62792474, "learning_rate": 2.4271382159790634e-10, "loss": 0.64672303, "num_input_tokens_seen": 178613845, "step": 8276, "time_per_iteration": 3.7365918159484863 }, { "auxiliary_loss_clip": 0.01078207, "auxiliary_loss_mlp": 0.01041346, "balance_loss_clip": 1.0396657, "balance_loss_mlp": 1.02580154, "epoch": 0.9952504058197559, "flos": 22236893147520.0, "grad_norm": 1.7825692424106048, "language_loss": 0.85474992, "learning_rate": 2.3073005708429406e-10, "loss": 0.87594545, "num_input_tokens_seen": 178633490, "step": 8277, "time_per_iteration": 2.996864080429077 }, { "auxiliary_loss_clip": 0.01088795, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.03839493, "balance_loss_mlp": 1.02441525, "epoch": 0.995370648710395, "flos": 21211032718080.0, "grad_norm": 1.833715583360125, "language_loss": 0.72256887, "learning_rate": 2.190496559788535e-10, "loss": 0.74384439, "num_input_tokens_seen": 178651775, "step": 8278, "time_per_iteration": 2.7874255180358887 }, { "auxiliary_loss_clip": 0.01109198, "auxiliary_loss_mlp": 0.01037251, "balance_loss_clip": 1.04100323, "balance_loss_mlp": 1.02218246, "epoch": 0.9954908916010341, "flos": 14866731077760.0, "grad_norm": 2.594677900272598, "language_loss": 0.76240885, "learning_rate": 2.0767262005372265e-10, "loss": 0.78387332, "num_input_tokens_seen": 178669290, "step": 8279, "time_per_iteration": 2.754837989807129 }, { "auxiliary_loss_clip": 0.01098767, "auxiliary_loss_mlp": 0.01037041, "balance_loss_clip": 1.03873587, "balance_loss_mlp": 1.02199054, "epoch": 0.9956111344916732, "flos": 19208259118080.0, "grad_norm": 1.8114938804011718, "language_loss": 0.75567979, "learning_rate": 1.965989510346322e-10, "loss": 0.77703786, "num_input_tokens_seen": 178688410, "step": 8280, "time_per_iteration": 2.6469216346740723 }, { "auxiliary_loss_clip": 0.01077594, "auxiliary_loss_mlp": 0.0104507, "balance_loss_clip": 1.03561163, "balance_loss_mlp": 1.02690196, "epoch": 0.9957313773823123, "flos": 20047060494720.0, "grad_norm": 2.1403970536013492, "language_loss": 0.71018362, "learning_rate": 1.8582865060134955e-10, "loss": 0.73141026, "num_input_tokens_seen": 178706600, "step": 8281, "time_per_iteration": 2.7827670574188232 }, { "auxiliary_loss_clip": 0.01034836, "auxiliary_loss_mlp": 0.01003363, "balance_loss_clip": 1.0070703, "balance_loss_mlp": 1.00176609, "epoch": 0.9958516202729514, "flos": 57483253768320.0, "grad_norm": 0.7813346008812091, "language_loss": 0.55747002, "learning_rate": 1.7536172038790098e-10, "loss": 0.57785201, "num_input_tokens_seen": 178766910, "step": 8282, "time_per_iteration": 3.31636643409729 }, { "auxiliary_loss_clip": 0.01110677, "auxiliary_loss_mlp": 0.01039457, "balance_loss_clip": 1.04351509, "balance_loss_mlp": 1.0242939, "epoch": 0.9959718631635904, "flos": 27782900974080.0, "grad_norm": 2.4976052222220186, "language_loss": 0.69122952, "learning_rate": 1.651981619819054e-10, "loss": 0.71273088, "num_input_tokens_seen": 178784060, "step": 8283, "time_per_iteration": 2.8234753608703613 }, { "auxiliary_loss_clip": 0.01087724, "auxiliary_loss_mlp": 0.01030464, "balance_loss_clip": 1.03840876, "balance_loss_mlp": 1.01518095, "epoch": 0.9960921060542296, "flos": 24024095274240.0, "grad_norm": 2.3401394336178445, "language_loss": 0.70348465, "learning_rate": 1.5533797692546257e-10, "loss": 0.72466654, "num_input_tokens_seen": 178802795, "step": 8284, "time_per_iteration": 2.7945563793182373 }, { "auxiliary_loss_clip": 0.01116796, "auxiliary_loss_mlp": 0.01035551, "balance_loss_clip": 1.03935266, "balance_loss_mlp": 1.01974368, "epoch": 0.9962123489448687, "flos": 18697393935360.0, "grad_norm": 2.429199686623739, "language_loss": 0.84199995, "learning_rate": 1.4578116671404296e-10, "loss": 0.86352342, "num_input_tokens_seen": 178821075, "step": 8285, "time_per_iteration": 2.5977070331573486 }, { "auxiliary_loss_clip": 0.01115538, "auxiliary_loss_mlp": 0.01041349, "balance_loss_clip": 1.04311979, "balance_loss_mlp": 1.02589393, "epoch": 0.9963325918355077, "flos": 20010754823040.0, "grad_norm": 2.5933174464410844, "language_loss": 0.71748769, "learning_rate": 1.3652773279759777e-10, "loss": 0.73905659, "num_input_tokens_seen": 178837725, "step": 8286, "time_per_iteration": 2.6476471424102783 }, { "auxiliary_loss_clip": 0.01119272, "auxiliary_loss_mlp": 0.01043166, "balance_loss_clip": 1.0409739, "balance_loss_mlp": 1.02716804, "epoch": 0.9964528347261468, "flos": 33108488991360.0, "grad_norm": 2.8468434818885284, "language_loss": 0.63613969, "learning_rate": 1.2757767657989305e-10, "loss": 0.65776408, "num_input_tokens_seen": 178861515, "step": 8287, "time_per_iteration": 2.7183377742767334 }, { "auxiliary_loss_clip": 0.01121007, "auxiliary_loss_mlp": 0.01043856, "balance_loss_clip": 1.0422852, "balance_loss_mlp": 1.02877569, "epoch": 0.9965730776167859, "flos": 23109342589440.0, "grad_norm": 2.20291859245773, "language_loss": 0.87236786, "learning_rate": 1.1893099941850948e-10, "loss": 0.89401644, "num_input_tokens_seen": 178880410, "step": 8288, "time_per_iteration": 2.703212022781372 }, { "auxiliary_loss_clip": 0.0111315, "auxiliary_loss_mlp": 0.01031062, "balance_loss_clip": 1.04062736, "balance_loss_mlp": 1.01514721, "epoch": 0.996693320507425, "flos": 22965843755520.0, "grad_norm": 2.3844456181687015, "language_loss": 0.77677488, "learning_rate": 1.105877026252866e-10, "loss": 0.79821694, "num_input_tokens_seen": 178898740, "step": 8289, "time_per_iteration": 2.721813917160034 }, { "auxiliary_loss_clip": 0.01135777, "auxiliary_loss_mlp": 0.01038412, "balance_loss_clip": 1.0442766, "balance_loss_mlp": 1.02181804, "epoch": 0.996813563398064, "flos": 13222740476160.0, "grad_norm": 7.119116919485813, "language_loss": 0.72084463, "learning_rate": 1.0254778746565663e-10, "loss": 0.74258649, "num_input_tokens_seen": 178914015, "step": 8290, "time_per_iteration": 2.670820474624634 }, { "auxiliary_loss_clip": 0.01091722, "auxiliary_loss_mlp": 0.01037005, "balance_loss_clip": 1.03906286, "balance_loss_mlp": 1.02268767, "epoch": 0.9969338062887032, "flos": 14647855553280.0, "grad_norm": 2.22093386027202, "language_loss": 0.73138684, "learning_rate": 9.481125515953259e-11, "loss": 0.7526741, "num_input_tokens_seen": 178932075, "step": 8291, "time_per_iteration": 2.73486065864563 }, { "auxiliary_loss_clip": 0.01083425, "auxiliary_loss_mlp": 0.01043943, "balance_loss_clip": 1.03746152, "balance_loss_mlp": 1.02655041, "epoch": 0.9970540491793423, "flos": 25735741142400.0, "grad_norm": 1.8313042880362278, "language_loss": 0.7970717, "learning_rate": 8.737810688064228e-11, "loss": 0.81834543, "num_input_tokens_seen": 178951910, "step": 8292, "time_per_iteration": 2.8415744304656982 }, { "auxiliary_loss_clip": 0.01088485, "auxiliary_loss_mlp": 0.01055154, "balance_loss_clip": 1.03795195, "balance_loss_mlp": 1.0368793, "epoch": 0.9971742920699813, "flos": 21470236237440.0, "grad_norm": 1.9277268409581398, "language_loss": 0.7919234, "learning_rate": 8.024834375608414e-11, "loss": 0.81335986, "num_input_tokens_seen": 178970500, "step": 8293, "time_per_iteration": 2.747393846511841 }, { "auxiliary_loss_clip": 0.01034698, "auxiliary_loss_mlp": 0.01004573, "balance_loss_clip": 1.00692368, "balance_loss_mlp": 1.00301123, "epoch": 0.9972945349606205, "flos": 72211223629440.0, "grad_norm": 0.8214408306567237, "language_loss": 0.62761474, "learning_rate": 7.342196686788149e-11, "loss": 0.64800739, "num_input_tokens_seen": 179023665, "step": 8294, "time_per_iteration": 3.067667245864868 }, { "auxiliary_loss_clip": 0.01103798, "auxiliary_loss_mlp": 0.01049227, "balance_loss_clip": 1.04238963, "balance_loss_mlp": 1.03377759, "epoch": 0.9974147778512595, "flos": 19678293515520.0, "grad_norm": 2.0336643043510327, "language_loss": 0.68489003, "learning_rate": 6.689897725142834e-11, "loss": 0.7064203, "num_input_tokens_seen": 179043140, "step": 8295, "time_per_iteration": 2.771695375442505 }, { "auxiliary_loss_clip": 0.01108439, "auxiliary_loss_mlp": 0.0103495, "balance_loss_clip": 1.03947568, "balance_loss_mlp": 1.01868987, "epoch": 0.9975350207418986, "flos": 15960821391360.0, "grad_norm": 2.634225308014814, "language_loss": 0.88359201, "learning_rate": 6.067937589615545e-11, "loss": 0.90502596, "num_input_tokens_seen": 179061215, "step": 8296, "time_per_iteration": 4.3086607456207275 }, { "auxiliary_loss_clip": 0.01021317, "auxiliary_loss_mlp": 0.01014135, "balance_loss_clip": 1.01159334, "balance_loss_mlp": 1.01191795, "epoch": 0.9976552636325378, "flos": 59961879768960.0, "grad_norm": 0.7390820987714927, "language_loss": 0.57650685, "learning_rate": 5.476316374575241e-11, "loss": 0.59686136, "num_input_tokens_seen": 179124700, "step": 8297, "time_per_iteration": 4.279587268829346 }, { "auxiliary_loss_clip": 0.01133269, "auxiliary_loss_mlp": 0.01043357, "balance_loss_clip": 1.0418582, "balance_loss_mlp": 1.02685809, "epoch": 0.9977755065231768, "flos": 22487872452480.0, "grad_norm": 2.010605640503779, "language_loss": 0.72241962, "learning_rate": 4.9150341697723476e-11, "loss": 0.74418592, "num_input_tokens_seen": 179144590, "step": 8298, "time_per_iteration": 2.733774185180664 }, { "auxiliary_loss_clip": 0.01104834, "auxiliary_loss_mlp": 0.01044508, "balance_loss_clip": 1.0423249, "balance_loss_mlp": 1.02849841, "epoch": 0.9978957494138159, "flos": 26030280666240.0, "grad_norm": 1.8279168876265968, "language_loss": 0.66637576, "learning_rate": 4.384091060338768e-11, "loss": 0.68786925, "num_input_tokens_seen": 179165060, "step": 8299, "time_per_iteration": 2.7414283752441406 }, { "auxiliary_loss_clip": 0.01117718, "auxiliary_loss_mlp": 0.01041, "balance_loss_clip": 1.04112291, "balance_loss_mlp": 1.02517462, "epoch": 0.998015992304455, "flos": 22637835734400.0, "grad_norm": 2.051957321585496, "language_loss": 0.73897821, "learning_rate": 3.883487126810081e-11, "loss": 0.7605654, "num_input_tokens_seen": 179184320, "step": 8300, "time_per_iteration": 2.7891504764556885 }, { "auxiliary_loss_clip": 0.01110926, "auxiliary_loss_mlp": 0.01036063, "balance_loss_clip": 1.03990579, "balance_loss_mlp": 1.02093542, "epoch": 0.9981362351950941, "flos": 18223444955520.0, "grad_norm": 1.7793532715069893, "language_loss": 0.79586977, "learning_rate": 3.41322244516995e-11, "loss": 0.81733966, "num_input_tokens_seen": 179202265, "step": 8301, "time_per_iteration": 2.7515273094177246 }, { "auxiliary_loss_clip": 0.01068223, "auxiliary_loss_mlp": 0.01040602, "balance_loss_clip": 1.03466177, "balance_loss_mlp": 1.02551031, "epoch": 0.9982564780857331, "flos": 33474095573760.0, "grad_norm": 2.545700180045284, "language_loss": 0.63113487, "learning_rate": 2.9732970866946925e-11, "loss": 0.65222311, "num_input_tokens_seen": 179222145, "step": 8302, "time_per_iteration": 3.8564810752868652 }, { "auxiliary_loss_clip": 0.01084271, "auxiliary_loss_mlp": 0.01038589, "balance_loss_clip": 1.03561831, "balance_loss_mlp": 1.02024293, "epoch": 0.9983767209763723, "flos": 15523465392000.0, "grad_norm": 2.4190962955212623, "language_loss": 0.77953577, "learning_rate": 2.563711118175327e-11, "loss": 0.80076438, "num_input_tokens_seen": 179239030, "step": 8303, "time_per_iteration": 3.7585411071777344 }, { "auxiliary_loss_clip": 0.01090271, "auxiliary_loss_mlp": 0.01038253, "balance_loss_clip": 1.03984249, "balance_loss_mlp": 1.02324438, "epoch": 0.9984969638670114, "flos": 19974377324160.0, "grad_norm": 2.020568128597017, "language_loss": 0.84054804, "learning_rate": 2.184464601717728e-11, "loss": 0.86183327, "num_input_tokens_seen": 179257345, "step": 8304, "time_per_iteration": 2.8008675575256348 }, { "auxiliary_loss_clip": 0.01123286, "auxiliary_loss_mlp": 0.01047618, "balance_loss_clip": 1.04314947, "balance_loss_mlp": 1.03177547, "epoch": 0.9986172067576504, "flos": 20375750874240.0, "grad_norm": 5.296646711686621, "language_loss": 0.77849936, "learning_rate": 1.8355575948758585e-11, "loss": 0.80020845, "num_input_tokens_seen": 179275330, "step": 8305, "time_per_iteration": 2.6279261112213135 }, { "auxiliary_loss_clip": 0.01105813, "auxiliary_loss_mlp": 0.01045377, "balance_loss_clip": 1.03844965, "balance_loss_mlp": 1.02806807, "epoch": 0.9987374496482896, "flos": 23727903724800.0, "grad_norm": 2.13025936814221, "language_loss": 0.73834276, "learning_rate": 1.5169901505407424e-11, "loss": 0.75985467, "num_input_tokens_seen": 179292395, "step": 8306, "time_per_iteration": 2.7750802040100098 }, { "auxiliary_loss_clip": 0.01099676, "auxiliary_loss_mlp": 0.01032102, "balance_loss_clip": 1.03682685, "balance_loss_mlp": 1.01721263, "epoch": 0.9988576925389286, "flos": 25044029959680.0, "grad_norm": 4.934812959434955, "language_loss": 0.74137783, "learning_rate": 1.228762317073695e-11, "loss": 0.76269567, "num_input_tokens_seen": 179311225, "step": 8307, "time_per_iteration": 2.743300437927246 }, { "auxiliary_loss_clip": 0.01099959, "auxiliary_loss_mlp": 0.01036411, "balance_loss_clip": 1.03855801, "balance_loss_mlp": 1.02094376, "epoch": 0.9989779354295677, "flos": 31285627637760.0, "grad_norm": 2.071447993471098, "language_loss": 0.79049754, "learning_rate": 9.70874138195299e-12, "loss": 0.81186122, "num_input_tokens_seen": 179333135, "step": 8308, "time_per_iteration": 2.8518171310424805 }, { "auxiliary_loss_clip": 0.01133501, "auxiliary_loss_mlp": 0.01041557, "balance_loss_clip": 1.04177499, "balance_loss_mlp": 1.02645278, "epoch": 0.9990981783202069, "flos": 19573398823680.0, "grad_norm": 1.590197849448438, "language_loss": 0.74773741, "learning_rate": 7.433256530076093e-12, "loss": 0.76948798, "num_input_tokens_seen": 179353090, "step": 8309, "time_per_iteration": 2.6737756729125977 }, { "auxiliary_loss_clip": 0.01085489, "auxiliary_loss_mlp": 0.01031175, "balance_loss_clip": 1.03833485, "balance_loss_mlp": 1.01632202, "epoch": 0.9992184212108459, "flos": 17199667514880.0, "grad_norm": 2.2633490173273234, "language_loss": 0.7585327, "learning_rate": 5.46116896038562e-12, "loss": 0.77969933, "num_input_tokens_seen": 179367500, "step": 8310, "time_per_iteration": 2.9714741706848145 }, { "auxiliary_loss_clip": 0.01103823, "auxiliary_loss_mlp": 0.01038601, "balance_loss_clip": 1.04062641, "balance_loss_mlp": 1.02369928, "epoch": 0.999338664101485, "flos": 46497853681920.0, "grad_norm": 2.620774110869601, "language_loss": 0.62176478, "learning_rate": 3.792478972197699e-12, "loss": 0.64318907, "num_input_tokens_seen": 179388085, "step": 8311, "time_per_iteration": 2.983100175857544 }, { "auxiliary_loss_clip": 0.01129679, "auxiliary_loss_mlp": 0.01040387, "balance_loss_clip": 1.04073298, "balance_loss_mlp": 1.0258373, "epoch": 0.9994589069921241, "flos": 15158253859200.0, "grad_norm": 2.7151186926012922, "language_loss": 0.70363766, "learning_rate": 2.4271868181990895e-12, "loss": 0.72533834, "num_input_tokens_seen": 179405250, "step": 8312, "time_per_iteration": 2.6787209510803223 }, { "auxiliary_loss_clip": 0.01118469, "auxiliary_loss_mlp": 0.01037198, "balance_loss_clip": 1.04077387, "balance_loss_mlp": 1.0217005, "epoch": 0.9995791498827632, "flos": 12531460256640.0, "grad_norm": 2.2811708959939434, "language_loss": 0.81345248, "learning_rate": 1.3652927060014973e-12, "loss": 0.83500916, "num_input_tokens_seen": 179420845, "step": 8313, "time_per_iteration": 2.699444055557251 }, { "auxiliary_loss_clip": 0.01096247, "auxiliary_loss_mlp": 0.01036631, "balance_loss_clip": 1.04018819, "balance_loss_mlp": 1.0203228, "epoch": 0.9996993927734023, "flos": 19245175320960.0, "grad_norm": 3.5492811958134185, "language_loss": 0.64123619, "learning_rate": 6.067967965872612e-13, "loss": 0.66256493, "num_input_tokens_seen": 179440455, "step": 8314, "time_per_iteration": 2.7192535400390625 }, { "auxiliary_loss_clip": 0.01095711, "auxiliary_loss_mlp": 0.01037845, "balance_loss_clip": 1.04102159, "balance_loss_mlp": 1.02255654, "epoch": 0.9998196356640414, "flos": 62952804518400.0, "grad_norm": 1.5453742398559338, "language_loss": 0.76968336, "learning_rate": 1.5169920497548615e-13, "loss": 0.79101896, "num_input_tokens_seen": 179465075, "step": 8315, "time_per_iteration": 3.131525993347168 }, { "auxiliary_loss_clip": 0.01075838, "auxiliary_loss_mlp": 0.0101977, "balance_loss_clip": 1.02431428, "balance_loss_mlp": 1.0109303, "epoch": 0.9999398785546805, "flos": 50922375073920.0, "grad_norm": 1.4852514668828287, "language_loss": 0.55015957, "learning_rate": 0.0, "loss": 0.57111573, "num_input_tokens_seen": 179513955, "step": 8316, "time_per_iteration": 3.199253559112549 }, { "epoch": 0.9999398785546805, "num_input_tokens_seen": 179513955, "step": 8316, "total_flos": 6.996749092776837e+17, "train_loss": 0.7894011992224711, "train_runtime": 25240.8308, "train_samples_per_second": 13.179, "train_steps_per_second": 0.329 } ], "logging_steps": 1.0, "max_steps": 8316, "num_input_tokens_seen": 179513955, "num_train_epochs": 1, "save_steps": 1664, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.996749092776837e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }