{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999398785546805, "eval_steps": 500, "global_step": 8316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.04467606, "auxiliary_loss_mlp": 0.02214109, "balance_loss_clip": 2.44035244, "balance_loss_mlp": 1.7477622, "epoch": 0.00012024289063909097, "flos": 24932483919360.0, "grad_norm": 39.94680766841656, "language_loss": 2.58248568, "learning_rate": 0.0, "loss": 1.9010365, "num_input_tokens_seen": 20375, "step": 1, "time_per_iteration": 15.553298234939575 }, { "auxiliary_loss_clip": 0.02957249, "auxiliary_loss_mlp": 0.01419526, "balance_loss_clip": 1.61343002, "balance_loss_mlp": 1.13208902, "epoch": 0.00024048578127818193, "flos": 30664624377600.0, "grad_norm": 54.79875405706253, "language_loss": 1.88785017, "learning_rate": 5.021476677069823e-07, "loss": 1.93161786, "num_input_tokens_seen": 39035, "step": 2, "time_per_iteration": 2.7612693309783936 }, { "auxiliary_loss_clip": 0.02960445, "auxiliary_loss_mlp": 0.01484531, "balance_loss_clip": 1.62834692, "balance_loss_mlp": 1.18774796, "epoch": 0.0003607286719172729, "flos": 19026227969280.0, "grad_norm": 40.83861379082411, "language_loss": 1.61476827, "learning_rate": 7.958852231401551e-07, "loss": 1.65921807, "num_input_tokens_seen": 57600, "step": 3, "time_per_iteration": 2.541330575942993 }, { "auxiliary_loss_clip": 0.02935145, "auxiliary_loss_mlp": 0.01464239, "balance_loss_clip": 1.61329925, "balance_loss_mlp": 1.15295982, "epoch": 0.00048097156255636386, "flos": 19316314206720.0, "grad_norm": 37.01716318323993, "language_loss": 1.64450955, "learning_rate": 1.0042953354139647e-06, "loss": 1.6885035, "num_input_tokens_seen": 76465, "step": 4, "time_per_iteration": 2.611107349395752 }, { "auxiliary_loss_clip": 0.02953273, "auxiliary_loss_mlp": 0.01455373, "balance_loss_clip": 1.6154902, "balance_loss_mlp": 1.15248573, "epoch": 0.0006012144531954548, "flos": 13991264893440.0, "grad_norm": 55.07727517249346, "language_loss": 1.93506944, "learning_rate": 1.1659507774310057e-06, "loss": 1.97915578, "num_input_tokens_seen": 94350, "step": 5, "time_per_iteration": 2.8608996868133545 }, { "auxiliary_loss_clip": 0.02967778, "auxiliary_loss_mlp": 0.01440656, "balance_loss_clip": 1.61950994, "balance_loss_mlp": 1.14978492, "epoch": 0.0007214573438345458, "flos": 23148988225920.0, "grad_norm": 45.570967847641434, "language_loss": 1.6095438, "learning_rate": 1.2980328908471373e-06, "loss": 1.65362811, "num_input_tokens_seen": 114595, "step": 6, "time_per_iteration": 2.8553831577301025 }, { "auxiliary_loss_clip": 0.029676, "auxiliary_loss_mlp": 0.01921181, "balance_loss_clip": 1.74828148, "balance_loss_mlp": 1.68619573, "epoch": 0.0008417002344736367, "flos": 67663246170240.0, "grad_norm": 4.605888270912766, "language_loss": 0.81467319, "learning_rate": 1.4097067265369432e-06, "loss": 0.86356103, "num_input_tokens_seen": 179590, "step": 7, "time_per_iteration": 3.397385358810425 }, { "auxiliary_loss_clip": 0.0296086, "auxiliary_loss_mlp": 0.01508198, "balance_loss_clip": 1.61705494, "balance_loss_mlp": 1.19100642, "epoch": 0.0009619431251127277, "flos": 21281381504640.0, "grad_norm": 41.119438373742106, "language_loss": 1.58188546, "learning_rate": 1.506443003120947e-06, "loss": 1.62657607, "num_input_tokens_seen": 195090, "step": 8, "time_per_iteration": 2.84830641746521 }, { "auxiliary_loss_clip": 0.02948974, "auxiliary_loss_mlp": 0.01465889, "balance_loss_clip": 1.61906743, "balance_loss_mlp": 1.15956855, "epoch": 0.0010821860157518186, "flos": 23331342597120.0, "grad_norm": 18.08311959645327, "language_loss": 1.47888017, "learning_rate": 1.5917704462803102e-06, "loss": 1.52302885, "num_input_tokens_seen": 211635, "step": 9, "time_per_iteration": 2.86454701423645 }, { "auxiliary_loss_clip": 0.02937955, "auxiliary_loss_mlp": 0.01467086, "balance_loss_clip": 1.62112153, "balance_loss_mlp": 1.1544714, "epoch": 0.0012024289063909096, "flos": 17010166337280.0, "grad_norm": 13.179708289285298, "language_loss": 1.53100634, "learning_rate": 1.6680984451379884e-06, "loss": 1.57505679, "num_input_tokens_seen": 224705, "step": 10, "time_per_iteration": 2.8662192821502686 }, { "auxiliary_loss_clip": 0.02940062, "auxiliary_loss_mlp": 0.01492166, "balance_loss_clip": 1.61530626, "balance_loss_mlp": 1.18927872, "epoch": 0.0013226717970300007, "flos": 21288133261440.0, "grad_norm": 15.833724520621283, "language_loss": 1.32358086, "learning_rate": 1.7371455188905097e-06, "loss": 1.36790323, "num_input_tokens_seen": 244635, "step": 11, "time_per_iteration": 2.8281431198120117 }, { "auxiliary_loss_clip": 0.02961133, "auxiliary_loss_mlp": 0.01485069, "balance_loss_clip": 1.61891055, "balance_loss_mlp": 1.17684102, "epoch": 0.0014429146876690916, "flos": 27237884935680.0, "grad_norm": 12.324471154923419, "language_loss": 1.25385261, "learning_rate": 1.8001805585541196e-06, "loss": 1.29831457, "num_input_tokens_seen": 265765, "step": 12, "time_per_iteration": 2.96876859664917 }, { "auxiliary_loss_clip": 0.02941593, "auxiliary_loss_mlp": 0.01451554, "balance_loss_clip": 1.62169218, "balance_loss_mlp": 1.15419841, "epoch": 0.0015631575783081825, "flos": 19062174504960.0, "grad_norm": 6.6576706024290715, "language_loss": 1.2902782, "learning_rate": 1.8581671739548328e-06, "loss": 1.33420968, "num_input_tokens_seen": 283500, "step": 13, "time_per_iteration": 2.9746713638305664 }, { "auxiliary_loss_clip": 0.02915523, "auxiliary_loss_mlp": 0.01463951, "balance_loss_clip": 1.61114192, "balance_loss_mlp": 1.16259003, "epoch": 0.0016834004689472734, "flos": 48139473985920.0, "grad_norm": 8.075965978215015, "language_loss": 1.13560903, "learning_rate": 1.9118543942439254e-06, "loss": 1.17940378, "num_input_tokens_seen": 305685, "step": 14, "time_per_iteration": 4.783610582351685 }, { "auxiliary_loss_clip": 0.02905543, "auxiliary_loss_mlp": 0.01498061, "balance_loss_clip": 1.61611676, "balance_loss_mlp": 1.19345713, "epoch": 0.0018036433595863645, "flos": 34970026314240.0, "grad_norm": 5.534810172583667, "language_loss": 1.12852788, "learning_rate": 1.961836000571161e-06, "loss": 1.17256391, "num_input_tokens_seen": 327340, "step": 15, "time_per_iteration": 3.828604221343994 }, { "auxiliary_loss_clip": 0.02850675, "auxiliary_loss_mlp": 0.0173106, "balance_loss_clip": 1.72495747, "balance_loss_mlp": 1.50980794, "epoch": 0.0019238862502254555, "flos": 59768284440960.0, "grad_norm": 3.824813040755893, "language_loss": 0.64701551, "learning_rate": 2.0085906708279293e-06, "loss": 0.69283289, "num_input_tokens_seen": 382710, "step": 16, "time_per_iteration": 3.269314765930176 }, { "auxiliary_loss_clip": 0.02873206, "auxiliary_loss_mlp": 0.01431816, "balance_loss_clip": 1.61367083, "balance_loss_mlp": 1.13770235, "epoch": 0.0020441291408645466, "flos": 20814543417600.0, "grad_norm": 4.4468085220211835, "language_loss": 1.16141224, "learning_rate": 2.0525099325728135e-06, "loss": 1.20446253, "num_input_tokens_seen": 400890, "step": 17, "time_per_iteration": 3.0195155143737793 }, { "auxiliary_loss_clip": 0.02807219, "auxiliary_loss_mlp": 0.01668484, "balance_loss_clip": 1.71573555, "balance_loss_mlp": 1.45180929, "epoch": 0.0021643720315036373, "flos": 63857001582720.0, "grad_norm": 3.5637551802310266, "language_loss": 0.72239363, "learning_rate": 2.0939181139872922e-06, "loss": 0.76715064, "num_input_tokens_seen": 462605, "step": 18, "time_per_iteration": 3.2330667972564697 }, { "auxiliary_loss_clip": 0.028581, "auxiliary_loss_mlp": 0.01473629, "balance_loss_clip": 1.60797453, "balance_loss_mlp": 1.16616392, "epoch": 0.0022846149221427284, "flos": 31284981192960.0, "grad_norm": 5.052849558413146, "language_loss": 1.01752734, "learning_rate": 2.1330868934640175e-06, "loss": 1.06084466, "num_input_tokens_seen": 483280, "step": 19, "time_per_iteration": 2.939197063446045 }, { "auxiliary_loss_clip": 0.02761703, "auxiliary_loss_mlp": 0.01591906, "balance_loss_clip": 1.70997667, "balance_loss_mlp": 1.38133514, "epoch": 0.002404857812781819, "flos": 51083648161920.0, "grad_norm": 3.599382686018435, "language_loss": 0.76506174, "learning_rate": 2.170246112844971e-06, "loss": 0.8085978, "num_input_tokens_seen": 537620, "step": 20, "time_per_iteration": 3.0653860569000244 }, { "auxiliary_loss_clip": 0.02830854, "auxiliary_loss_mlp": 0.01447051, "balance_loss_clip": 1.60585761, "balance_loss_mlp": 1.14569008, "epoch": 0.0025251007034209102, "flos": 15815347309440.0, "grad_norm": 5.087308143485803, "language_loss": 1.01554823, "learning_rate": 2.2055919496770983e-06, "loss": 1.05832732, "num_input_tokens_seen": 555760, "step": 21, "time_per_iteration": 2.815840244293213 }, { "auxiliary_loss_clip": 0.02784108, "auxiliary_loss_mlp": 0.01413503, "balance_loss_clip": 1.59741044, "balance_loss_mlp": 1.12454009, "epoch": 0.0026453435940600014, "flos": 37851857458560.0, "grad_norm": 4.875791688057386, "language_loss": 0.89484751, "learning_rate": 2.2392931865974923e-06, "loss": 0.93682373, "num_input_tokens_seen": 578450, "step": 22, "time_per_iteration": 2.943260431289673 }, { "auxiliary_loss_clip": 0.02752024, "auxiliary_loss_mlp": 0.01386608, "balance_loss_clip": 1.58558714, "balance_loss_mlp": 1.09802604, "epoch": 0.002765586484699092, "flos": 21141976821120.0, "grad_norm": 4.456878153891676, "language_loss": 1.02051497, "learning_rate": 2.271496085962064e-06, "loss": 1.06190133, "num_input_tokens_seen": 596145, "step": 23, "time_per_iteration": 2.8832197189331055 }, { "auxiliary_loss_clip": 0.0273319, "auxiliary_loss_mlp": 0.01436626, "balance_loss_clip": 1.58454442, "balance_loss_mlp": 1.14499235, "epoch": 0.002885829375338183, "flos": 20667381396480.0, "grad_norm": 3.14335656242248, "language_loss": 1.02702713, "learning_rate": 2.3023282262611022e-06, "loss": 1.06872535, "num_input_tokens_seen": 614920, "step": 24, "time_per_iteration": 2.855179786682129 }, { "auxiliary_loss_clip": 0.0269432, "auxiliary_loss_mlp": 0.01383565, "balance_loss_clip": 1.57428026, "balance_loss_mlp": 1.10146832, "epoch": 0.003006072265977274, "flos": 34823869873920.0, "grad_norm": 4.03219827219884, "language_loss": 0.92548281, "learning_rate": 2.3319015548620114e-06, "loss": 0.96626163, "num_input_tokens_seen": 636060, "step": 25, "time_per_iteration": 2.9545159339904785 }, { "auxiliary_loss_clip": 0.02691766, "auxiliary_loss_mlp": 0.01389614, "balance_loss_clip": 1.5795958, "balance_loss_mlp": 1.10656321, "epoch": 0.003126315156616365, "flos": 24422021118720.0, "grad_norm": 2.409555234360913, "language_loss": 0.92906344, "learning_rate": 2.3603148416618152e-06, "loss": 0.96987718, "num_input_tokens_seen": 655575, "step": 26, "time_per_iteration": 2.8401596546173096 }, { "auxiliary_loss_clip": 0.02665196, "auxiliary_loss_mlp": 0.01402198, "balance_loss_clip": 1.57476282, "balance_loss_mlp": 1.11285329, "epoch": 0.003246558047255456, "flos": 23622326674560.0, "grad_norm": 2.2260456861779807, "language_loss": 1.00941598, "learning_rate": 2.3876556694204647e-06, "loss": 1.05008996, "num_input_tokens_seen": 675730, "step": 27, "time_per_iteration": 2.82129168510437 }, { "auxiliary_loss_clip": 0.0261853, "auxiliary_loss_mlp": 0.01387884, "balance_loss_clip": 1.5706892, "balance_loss_mlp": 1.10006499, "epoch": 0.003366800937894547, "flos": 17820275725440.0, "grad_norm": 3.964031273736502, "language_loss": 0.90706223, "learning_rate": 2.414002061950908e-06, "loss": 0.94712639, "num_input_tokens_seen": 694605, "step": 28, "time_per_iteration": 2.7319085597991943 }, { "auxiliary_loss_clip": 0.0261848, "auxiliary_loss_mlp": 0.0137853, "balance_loss_clip": 1.56292892, "balance_loss_mlp": 1.09528923, "epoch": 0.003487043828533638, "flos": 24426115269120.0, "grad_norm": 2.344530787211221, "language_loss": 0.99772131, "learning_rate": 2.4394238264681557e-06, "loss": 1.03769159, "num_input_tokens_seen": 714340, "step": 29, "time_per_iteration": 2.8428821563720703 }, { "auxiliary_loss_clip": 0.02608166, "auxiliary_loss_mlp": 0.01400786, "balance_loss_clip": 1.56078279, "balance_loss_mlp": 1.11888003, "epoch": 0.003607286719172729, "flos": 26140311002880.0, "grad_norm": 2.0627046631054524, "language_loss": 0.99542361, "learning_rate": 2.4639836682781433e-06, "loss": 1.03551304, "num_input_tokens_seen": 734470, "step": 30, "time_per_iteration": 2.824864149093628 }, { "auxiliary_loss_clip": 0.02560761, "auxiliary_loss_mlp": 0.01364637, "balance_loss_clip": 1.55760372, "balance_loss_mlp": 1.10943365, "epoch": 0.00372752960981182, "flos": 20593082113920.0, "grad_norm": 2.4527429912715255, "language_loss": 1.00216627, "learning_rate": 2.487738122623307e-06, "loss": 1.04142022, "num_input_tokens_seen": 753380, "step": 31, "time_per_iteration": 2.8619816303253174 }, { "auxiliary_loss_clip": 0.02507337, "auxiliary_loss_mlp": 0.01361767, "balance_loss_clip": 1.54143584, "balance_loss_mlp": 1.09607363, "epoch": 0.003847772500450911, "flos": 22674608282880.0, "grad_norm": 2.644670731490008, "language_loss": 0.99069548, "learning_rate": 2.510738338534912e-06, "loss": 1.02938652, "num_input_tokens_seen": 772105, "step": 32, "time_per_iteration": 3.1506917476654053 }, { "auxiliary_loss_clip": 0.02454353, "auxiliary_loss_mlp": 0.0135281, "balance_loss_clip": 1.52698088, "balance_loss_mlp": 1.08501875, "epoch": 0.003968015391090002, "flos": 17967796882560.0, "grad_norm": 2.2931215889912235, "language_loss": 1.02416885, "learning_rate": 2.5330307420306648e-06, "loss": 1.06224036, "num_input_tokens_seen": 788955, "step": 33, "time_per_iteration": 3.0651557445526123 }, { "auxiliary_loss_clip": 0.02413018, "auxiliary_loss_mlp": 0.01346079, "balance_loss_clip": 1.51708829, "balance_loss_mlp": 1.07981277, "epoch": 0.004088258281729093, "flos": 27304103658240.0, "grad_norm": 2.7479262377500273, "language_loss": 0.881145, "learning_rate": 2.554657600279796e-06, "loss": 0.91873598, "num_input_tokens_seen": 810230, "step": 34, "time_per_iteration": 3.0397989749908447 }, { "auxiliary_loss_clip": 0.02330016, "auxiliary_loss_mlp": 0.01319568, "balance_loss_clip": 1.49565673, "balance_loss_mlp": 1.06455588, "epoch": 0.004208501172368184, "flos": 23258587599360.0, "grad_norm": 2.1112514141812997, "language_loss": 1.03360033, "learning_rate": 2.5756575039679493e-06, "loss": 1.07009625, "num_input_tokens_seen": 829780, "step": 35, "time_per_iteration": 2.849703311920166 }, { "auxiliary_loss_clip": 0.02318, "auxiliary_loss_mlp": 0.01340056, "balance_loss_clip": 1.49303532, "balance_loss_mlp": 1.07779574, "epoch": 0.0043287440630072746, "flos": 17312104062720.0, "grad_norm": 2.5247454512658307, "language_loss": 0.9506346, "learning_rate": 2.5960657816942747e-06, "loss": 0.98721528, "num_input_tokens_seen": 848695, "step": 36, "time_per_iteration": 2.8934669494628906 }, { "auxiliary_loss_clip": 0.01970263, "auxiliary_loss_mlp": 0.01294235, "balance_loss_clip": 1.45352697, "balance_loss_mlp": 1.1355437, "epoch": 0.004448986953646365, "flos": 53092491160320.0, "grad_norm": 1.3827576621663105, "language_loss": 0.60941261, "learning_rate": 2.6159148575788668e-06, "loss": 0.6420576, "num_input_tokens_seen": 906730, "step": 37, "time_per_iteration": 3.2487406730651855 }, { "auxiliary_loss_clip": 0.02230809, "auxiliary_loss_mlp": 0.01314499, "balance_loss_clip": 1.47196531, "balance_loss_mlp": 1.07112169, "epoch": 0.004569229844285457, "flos": 13444165866240.0, "grad_norm": 2.549826725868638, "language_loss": 0.98657227, "learning_rate": 2.635234561171e-06, "loss": 1.02202535, "num_input_tokens_seen": 925125, "step": 38, "time_per_iteration": 2.744039297103882 }, { "auxiliary_loss_clip": 0.02217803, "auxiliary_loss_mlp": 0.01320199, "balance_loss_clip": 1.46357512, "balance_loss_mlp": 1.06614017, "epoch": 0.0046894727349245475, "flos": 16209609966720.0, "grad_norm": 2.2535202118579343, "language_loss": 0.94030154, "learning_rate": 2.6540523970949877e-06, "loss": 0.97568154, "num_input_tokens_seen": 939970, "step": 39, "time_per_iteration": 2.7685961723327637 }, { "auxiliary_loss_clip": 0.02163647, "auxiliary_loss_mlp": 0.01300189, "balance_loss_clip": 1.45423543, "balance_loss_mlp": 1.06444025, "epoch": 0.004809715625563638, "flos": 23914244505600.0, "grad_norm": 2.6015805317765346, "language_loss": 0.92455447, "learning_rate": 2.6723937805519533e-06, "loss": 0.95919275, "num_input_tokens_seen": 957470, "step": 40, "time_per_iteration": 4.7169578075408936 }, { "auxiliary_loss_clip": 0.02159091, "auxiliary_loss_mlp": 0.01312184, "balance_loss_clip": 1.44686449, "balance_loss_mlp": 1.06117702, "epoch": 0.00492995851620273, "flos": 20773030273920.0, "grad_norm": 2.1339366673313256, "language_loss": 0.92988682, "learning_rate": 2.690282243737839e-06, "loss": 0.96459955, "num_input_tokens_seen": 976405, "step": 41, "time_per_iteration": 3.651533842086792 }, { "auxiliary_loss_clip": 0.02117032, "auxiliary_loss_mlp": 0.01317385, "balance_loss_clip": 1.43368697, "balance_loss_mlp": 1.08392584, "epoch": 0.0050502014068418205, "flos": 20338655103360.0, "grad_norm": 2.5042782179467618, "language_loss": 0.99273968, "learning_rate": 2.7077396173840807e-06, "loss": 1.02708387, "num_input_tokens_seen": 994690, "step": 42, "time_per_iteration": 2.8019254207611084 }, { "auxiliary_loss_clip": 0.02099079, "auxiliary_loss_mlp": 0.01281967, "balance_loss_clip": 1.43357158, "balance_loss_mlp": 1.05804443, "epoch": 0.005170444297480911, "flos": 25994872834560.0, "grad_norm": 44.22817018780025, "language_loss": 0.92756361, "learning_rate": 2.7247861909342594e-06, "loss": 0.96137398, "num_input_tokens_seen": 1015615, "step": 43, "time_per_iteration": 2.7972021102905273 }, { "auxiliary_loss_clip": 0.02054076, "auxiliary_loss_mlp": 0.01297619, "balance_loss_clip": 1.41339183, "balance_loss_mlp": 1.07369637, "epoch": 0.005290687188120003, "flos": 20954055841920.0, "grad_norm": 2.494085457534451, "language_loss": 0.83109963, "learning_rate": 2.7414408543044743e-06, "loss": 0.86461663, "num_input_tokens_seen": 1031255, "step": 44, "time_per_iteration": 2.791658639907837 }, { "auxiliary_loss_clip": 0.02058007, "auxiliary_loss_mlp": 0.01262537, "balance_loss_clip": 1.41847444, "balance_loss_mlp": 1.05167961, "epoch": 0.005410930078759093, "flos": 15851401585920.0, "grad_norm": 4.112553475258077, "language_loss": 0.79394692, "learning_rate": 2.7577212237113157e-06, "loss": 0.82715237, "num_input_tokens_seen": 1048295, "step": 45, "time_per_iteration": 2.7529516220092773 }, { "auxiliary_loss_clip": 0.02037877, "auxiliary_loss_mlp": 0.0126015, "balance_loss_clip": 1.41055727, "balance_loss_mlp": 1.0490067, "epoch": 0.005531172969398184, "flos": 21104988791040.0, "grad_norm": 1.9688435939396838, "language_loss": 1.04321599, "learning_rate": 2.7736437536690466e-06, "loss": 1.07619619, "num_input_tokens_seen": 1067925, "step": 46, "time_per_iteration": 2.8125345706939697 }, { "auxiliary_loss_clip": 0.02004401, "auxiliary_loss_mlp": 0.01253719, "balance_loss_clip": 1.39910614, "balance_loss_mlp": 1.05268407, "epoch": 0.005651415860037276, "flos": 20844887431680.0, "grad_norm": 2.143924458469264, "language_loss": 1.07885861, "learning_rate": 2.789223836941131e-06, "loss": 1.11143994, "num_input_tokens_seen": 1088060, "step": 47, "time_per_iteration": 2.821887969970703 }, { "auxiliary_loss_clip": 0.0198874, "auxiliary_loss_mlp": 0.01278063, "balance_loss_clip": 1.39220262, "balance_loss_mlp": 1.070925, "epoch": 0.005771658750676366, "flos": 13260195383040.0, "grad_norm": 2.4954135580300947, "language_loss": 1.08755112, "learning_rate": 2.8044758939680847e-06, "loss": 1.12021911, "num_input_tokens_seen": 1104130, "step": 48, "time_per_iteration": 2.7487683296203613 }, { "auxiliary_loss_clip": 0.01944578, "auxiliary_loss_mlp": 0.0126126, "balance_loss_clip": 1.38353682, "balance_loss_mlp": 1.0590806, "epoch": 0.005891901641315457, "flos": 24425396997120.0, "grad_norm": 2.9500978686394657, "language_loss": 1.01993632, "learning_rate": 2.8194134530738863e-06, "loss": 1.05199468, "num_input_tokens_seen": 1122900, "step": 49, "time_per_iteration": 2.8465287685394287 }, { "auxiliary_loss_clip": 0.01928575, "auxiliary_loss_mlp": 0.01258553, "balance_loss_clip": 1.37752557, "balance_loss_mlp": 1.05894876, "epoch": 0.006012144531954548, "flos": 23076197314560.0, "grad_norm": 5.286645718614315, "language_loss": 0.90213507, "learning_rate": 2.834049222568994e-06, "loss": 0.93400633, "num_input_tokens_seen": 1140250, "step": 50, "time_per_iteration": 2.8800110816955566 }, { "auxiliary_loss_clip": 0.0192131, "auxiliary_loss_mlp": 0.01250717, "balance_loss_clip": 1.37422085, "balance_loss_mlp": 1.06007791, "epoch": 0.006132387422593639, "flos": 22528775064960.0, "grad_norm": 1.9853244312566956, "language_loss": 0.92470181, "learning_rate": 2.848395155712969e-06, "loss": 0.95642203, "num_input_tokens_seen": 1160470, "step": 51, "time_per_iteration": 2.7842273712158203 }, { "auxiliary_loss_clip": 0.01906633, "auxiliary_loss_mlp": 0.01257898, "balance_loss_clip": 1.36807382, "balance_loss_mlp": 1.06115532, "epoch": 0.00625263031323273, "flos": 27628340751360.0, "grad_norm": 2.2055042664574125, "language_loss": 0.97950733, "learning_rate": 2.8624625093687977e-06, "loss": 1.01115251, "num_input_tokens_seen": 1177605, "step": 52, "time_per_iteration": 2.8949031829833984 }, { "auxiliary_loss_clip": 0.01900364, "auxiliary_loss_mlp": 0.01236066, "balance_loss_clip": 1.36323476, "balance_loss_mlp": 1.05057669, "epoch": 0.006372873203871821, "flos": 23110671392640.0, "grad_norm": 3.456860771672256, "language_loss": 0.88892442, "learning_rate": 2.876261897070029e-06, "loss": 0.92028868, "num_input_tokens_seen": 1197735, "step": 53, "time_per_iteration": 2.8582842350006104 }, { "auxiliary_loss_clip": 0.01884389, "auxiliary_loss_mlp": 0.0124842, "balance_loss_clip": 1.3572824, "balance_loss_mlp": 1.0647428, "epoch": 0.006493116094510912, "flos": 22856028900480.0, "grad_norm": 2.3206723226459367, "language_loss": 0.92525929, "learning_rate": 2.889803337127447e-06, "loss": 0.95658731, "num_input_tokens_seen": 1216335, "step": 54, "time_per_iteration": 2.791677713394165 }, { "auxiliary_loss_clip": 0.01866479, "auxiliary_loss_mlp": 0.01244118, "balance_loss_clip": 1.35105157, "balance_loss_mlp": 1.0662576, "epoch": 0.006613358985150003, "flos": 23071708114560.0, "grad_norm": 2.458728816214611, "language_loss": 0.84757996, "learning_rate": 2.903096296321516e-06, "loss": 0.87868583, "num_input_tokens_seen": 1234480, "step": 55, "time_per_iteration": 2.78120756149292 }, { "auxiliary_loss_clip": 0.01842597, "auxiliary_loss_mlp": 0.01248503, "balance_loss_clip": 1.34197474, "balance_loss_mlp": 1.0684495, "epoch": 0.006733601875789094, "flos": 26537662229760.0, "grad_norm": 2.1095494148805853, "language_loss": 0.91581672, "learning_rate": 2.9161497296578907e-06, "loss": 0.94672775, "num_input_tokens_seen": 1253870, "step": 56, "time_per_iteration": 2.818150281906128 }, { "auxiliary_loss_clip": 0.01828797, "auxiliary_loss_mlp": 0.01234313, "balance_loss_clip": 1.33865118, "balance_loss_mlp": 1.06751513, "epoch": 0.006853844766428185, "flos": 15523178083200.0, "grad_norm": 2.436825366011175, "language_loss": 0.86047256, "learning_rate": 2.928972116604173e-06, "loss": 0.89110374, "num_input_tokens_seen": 1270145, "step": 57, "time_per_iteration": 2.7382733821868896 }, { "auxiliary_loss_clip": 0.01832322, "auxiliary_loss_mlp": 0.01238589, "balance_loss_clip": 1.33819222, "balance_loss_mlp": 1.06778634, "epoch": 0.006974087657067276, "flos": 24243760897920.0, "grad_norm": 3.074424301885012, "language_loss": 1.01875138, "learning_rate": 2.9415714941751377e-06, "loss": 1.04946041, "num_input_tokens_seen": 1291365, "step": 58, "time_per_iteration": 2.864509344100952 }, { "auxiliary_loss_clip": 0.01813821, "auxiliary_loss_mlp": 0.01219703, "balance_loss_clip": 1.33228135, "balance_loss_mlp": 1.05185616, "epoch": 0.007094330547706367, "flos": 25772513690880.0, "grad_norm": 1.881196730318686, "language_loss": 0.93571734, "learning_rate": 2.9539554871897396e-06, "loss": 0.96605265, "num_input_tokens_seen": 1311535, "step": 59, "time_per_iteration": 2.7973525524139404 }, { "auxiliary_loss_clip": 0.01795228, "auxiliary_loss_mlp": 0.01212906, "balance_loss_clip": 1.32497144, "balance_loss_mlp": 1.04992282, "epoch": 0.007214573438345458, "flos": 21319015979520.0, "grad_norm": 2.071720224954941, "language_loss": 0.9737581, "learning_rate": 2.9661313359851253e-06, "loss": 1.00383937, "num_input_tokens_seen": 1329420, "step": 60, "time_per_iteration": 2.8166849613189697 }, { "auxiliary_loss_clip": 0.01782965, "auxiliary_loss_mlp": 0.01214397, "balance_loss_clip": 1.31882048, "balance_loss_mlp": 1.05627811, "epoch": 0.007334816328984549, "flos": 24937088192640.0, "grad_norm": 2.144466701562681, "language_loss": 0.94086242, "learning_rate": 2.978105921839922e-06, "loss": 0.97083616, "num_input_tokens_seen": 1349965, "step": 61, "time_per_iteration": 2.8112339973449707 }, { "auxiliary_loss_clip": 0.01763091, "auxiliary_loss_mlp": 0.01214947, "balance_loss_clip": 1.31111288, "balance_loss_mlp": 1.05778122, "epoch": 0.00745505921962364, "flos": 18510586277760.0, "grad_norm": 2.3500046491558866, "language_loss": 0.72270763, "learning_rate": 2.9898857903302893e-06, "loss": 0.75248802, "num_input_tokens_seen": 1368915, "step": 62, "time_per_iteration": 2.782559394836426 }, { "auxiliary_loss_clip": 0.0175659, "auxiliary_loss_mlp": 0.01212894, "balance_loss_clip": 1.307459, "balance_loss_mlp": 1.0596385, "epoch": 0.007575302110262731, "flos": 18477656484480.0, "grad_norm": 3.4547535184072093, "language_loss": 0.87950653, "learning_rate": 3.001477172817253e-06, "loss": 0.90920138, "num_input_tokens_seen": 1386805, "step": 63, "time_per_iteration": 2.8497064113616943 }, { "auxiliary_loss_clip": 0.01739251, "auxiliary_loss_mlp": 0.01211677, "balance_loss_clip": 1.30153894, "balance_loss_mlp": 1.06643236, "epoch": 0.007695545000901822, "flos": 24973178382720.0, "grad_norm": 4.573660568404091, "language_loss": 0.96254265, "learning_rate": 3.012886006241894e-06, "loss": 0.99205196, "num_input_tokens_seen": 1406190, "step": 64, "time_per_iteration": 2.923413038253784 }, { "auxiliary_loss_clip": 0.01747061, "auxiliary_loss_mlp": 0.01204604, "balance_loss_clip": 1.30580282, "balance_loss_mlp": 1.06002736, "epoch": 0.007815787891540913, "flos": 21324223451520.0, "grad_norm": 2.5379260630849485, "language_loss": 0.88171202, "learning_rate": 3.0241179513858383e-06, "loss": 0.91122866, "num_input_tokens_seen": 1425500, "step": 65, "time_per_iteration": 2.7812423706054688 }, { "auxiliary_loss_clip": 0.01725294, "auxiliary_loss_mlp": 0.01191837, "balance_loss_clip": 1.28956234, "balance_loss_mlp": 1.04878557, "epoch": 0.007936030782180003, "flos": 21575777374080.0, "grad_norm": 2.4429313660620227, "language_loss": 0.87840319, "learning_rate": 3.035178409737647e-06, "loss": 0.90757447, "num_input_tokens_seen": 1442950, "step": 66, "time_per_iteration": 3.7623276710510254 }, { "auxiliary_loss_clip": 0.01710471, "auxiliary_loss_mlp": 0.01188176, "balance_loss_clip": 1.28688955, "balance_loss_mlp": 1.04302633, "epoch": 0.008056273672819095, "flos": 20120785159680.0, "grad_norm": 2.2425285114499616, "language_loss": 0.88801092, "learning_rate": 3.046072539090907e-06, "loss": 0.91699737, "num_input_tokens_seen": 1460915, "step": 67, "time_per_iteration": 3.666506290435791 }, { "auxiliary_loss_clip": 0.01696097, "auxiliary_loss_mlp": 0.01195827, "balance_loss_clip": 1.28116369, "balance_loss_mlp": 1.05597019, "epoch": 0.008176516563458186, "flos": 18333116156160.0, "grad_norm": 2.5295616822182247, "language_loss": 1.04809582, "learning_rate": 3.056805267986779e-06, "loss": 1.07701504, "num_input_tokens_seen": 1478385, "step": 68, "time_per_iteration": 2.73976469039917 }, { "auxiliary_loss_clip": 0.01687155, "auxiliary_loss_mlp": 0.01193415, "balance_loss_clip": 1.2736491, "balance_loss_mlp": 1.0573256, "epoch": 0.008296759454097276, "flos": 21872076664320.0, "grad_norm": 2.219357142562346, "language_loss": 0.95204049, "learning_rate": 3.0673813091022194e-06, "loss": 0.98084623, "num_input_tokens_seen": 1497605, "step": 69, "time_per_iteration": 2.7454895973205566 }, { "auxiliary_loss_clip": 0.01603786, "auxiliary_loss_mlp": 0.01089123, "balance_loss_clip": 1.28835285, "balance_loss_mlp": 1.01282871, "epoch": 0.008417002344736368, "flos": 63408228036480.0, "grad_norm": 1.2998267514403847, "language_loss": 0.62102962, "learning_rate": 3.0778051716749317e-06, "loss": 0.6479587, "num_input_tokens_seen": 1561150, "step": 70, "time_per_iteration": 3.3891353607177734 }, { "auxiliary_loss_clip": 0.01671173, "auxiliary_loss_mlp": 0.01194776, "balance_loss_clip": 1.26893854, "balance_loss_mlp": 1.05639744, "epoch": 0.008537245235375458, "flos": 22966454286720.0, "grad_norm": 3.1859319363237955, "language_loss": 0.90419269, "learning_rate": 3.0880811730470094e-06, "loss": 0.93285221, "num_input_tokens_seen": 1580605, "step": 71, "time_per_iteration": 2.7704780101776123 }, { "auxiliary_loss_clip": 0.01574643, "auxiliary_loss_mlp": 0.01084837, "balance_loss_clip": 1.27244866, "balance_loss_mlp": 1.01273942, "epoch": 0.008657488126014549, "flos": 61984046712960.0, "grad_norm": 1.1588247442291455, "language_loss": 0.58638275, "learning_rate": 3.098213449401257e-06, "loss": 0.6129775, "num_input_tokens_seen": 1647535, "step": 72, "time_per_iteration": 3.208491325378418 }, { "auxiliary_loss_clip": 0.01650787, "auxiliary_loss_mlp": 0.01170438, "balance_loss_clip": 1.25353026, "balance_loss_mlp": 1.0506562, "epoch": 0.00877773101665364, "flos": 30296791152000.0, "grad_norm": 2.9012850472271796, "language_loss": 0.98896146, "learning_rate": 3.1082059657570015e-06, "loss": 1.01717365, "num_input_tokens_seen": 1666770, "step": 73, "time_per_iteration": 2.810664653778076 }, { "auxiliary_loss_clip": 0.01632324, "auxiliary_loss_mlp": 0.011795, "balance_loss_clip": 1.25116515, "balance_loss_mlp": 1.05094445, "epoch": 0.00889797390729273, "flos": 23514056104320.0, "grad_norm": 3.3007821427273134, "language_loss": 0.96822041, "learning_rate": 3.1180625252858496e-06, "loss": 0.99633861, "num_input_tokens_seen": 1685200, "step": 74, "time_per_iteration": 2.788667917251587 }, { "auxiliary_loss_clip": 0.016286, "auxiliary_loss_mlp": 0.01184625, "balance_loss_clip": 1.248492, "balance_loss_mlp": 1.06017041, "epoch": 0.009018216797931822, "flos": 23075838178560.0, "grad_norm": 3.0346751952392395, "language_loss": 0.79887027, "learning_rate": 3.1277867780021663e-06, "loss": 0.82700247, "num_input_tokens_seen": 1701835, "step": 75, "time_per_iteration": 2.7972617149353027 }, { "auxiliary_loss_clip": 0.01611588, "auxiliary_loss_mlp": 0.01180862, "balance_loss_clip": 1.23923969, "balance_loss_mlp": 1.06003118, "epoch": 0.009138459688570914, "flos": 15918877284480.0, "grad_norm": 2.3343997057754513, "language_loss": 0.95568132, "learning_rate": 3.1373822288779824e-06, "loss": 0.98360586, "num_input_tokens_seen": 1718415, "step": 76, "time_per_iteration": 2.7947232723236084 }, { "auxiliary_loss_clip": 0.01616521, "auxiliary_loss_mlp": 0.01171047, "balance_loss_clip": 1.23916459, "balance_loss_mlp": 1.05622482, "epoch": 0.009258702579210003, "flos": 27016531372800.0, "grad_norm": 2.6381353053925216, "language_loss": 0.79500711, "learning_rate": 3.1468522454274533e-06, "loss": 0.82288283, "num_input_tokens_seen": 1738770, "step": 77, "time_per_iteration": 2.8492722511291504 }, { "auxiliary_loss_clip": 0.01608471, "auxiliary_loss_mlp": 0.01177554, "balance_loss_clip": 1.23278522, "balance_loss_mlp": 1.06625974, "epoch": 0.009378945469849095, "flos": 26903196984960.0, "grad_norm": 2.6468468105961875, "language_loss": 0.91596496, "learning_rate": 3.15620006480197e-06, "loss": 0.94382513, "num_input_tokens_seen": 1758040, "step": 78, "time_per_iteration": 2.8587656021118164 }, { "auxiliary_loss_clip": 0.01604942, "auxiliary_loss_mlp": 0.01161271, "balance_loss_clip": 1.23717332, "balance_loss_mlp": 1.04597163, "epoch": 0.009499188360488187, "flos": 35694236327040.0, "grad_norm": 4.220594998264973, "language_loss": 0.74977762, "learning_rate": 3.1654288004333087e-06, "loss": 0.77743971, "num_input_tokens_seen": 1776705, "step": 79, "time_per_iteration": 2.8365020751953125 }, { "auxiliary_loss_clip": 0.0158519, "auxiliary_loss_mlp": 0.0116236, "balance_loss_clip": 1.22640848, "balance_loss_mlp": 1.05531037, "epoch": 0.009619431251127276, "flos": 21503201944320.0, "grad_norm": 2.738978791848827, "language_loss": 0.7606588, "learning_rate": 3.1745414482589353e-06, "loss": 0.78813428, "num_input_tokens_seen": 1795915, "step": 80, "time_per_iteration": 2.8469934463500977 }, { "auxiliary_loss_clip": 0.01580241, "auxiliary_loss_mlp": 0.01153043, "balance_loss_clip": 1.22663128, "balance_loss_mlp": 1.04666078, "epoch": 0.009739674141766368, "flos": 17421056991360.0, "grad_norm": 2.5658920464748065, "language_loss": 0.86977172, "learning_rate": 3.1835408925606204e-06, "loss": 0.89710456, "num_input_tokens_seen": 1814055, "step": 81, "time_per_iteration": 2.810621738433838 }, { "auxiliary_loss_clip": 0.01564569, "auxiliary_loss_mlp": 0.0116794, "balance_loss_clip": 1.21554518, "balance_loss_mlp": 1.05340397, "epoch": 0.00985991703240546, "flos": 27527109246720.0, "grad_norm": 2.290767903041341, "language_loss": 0.8935039, "learning_rate": 3.1924299114448214e-06, "loss": 0.92082894, "num_input_tokens_seen": 1834535, "step": 82, "time_per_iteration": 2.809096097946167 }, { "auxiliary_loss_clip": 0.01567573, "auxiliary_loss_mlp": 0.0117184, "balance_loss_clip": 1.22087216, "balance_loss_mlp": 1.07075071, "epoch": 0.00998015992304455, "flos": 13808084509440.0, "grad_norm": 2.394775241067213, "language_loss": 0.83343977, "learning_rate": 3.2012111819909055e-06, "loss": 0.86083388, "num_input_tokens_seen": 1851865, "step": 83, "time_per_iteration": 2.7184343338012695 }, { "auxiliary_loss_clip": 0.01564822, "auxiliary_loss_mlp": 0.01167801, "balance_loss_clip": 1.21933949, "balance_loss_mlp": 1.06470919, "epoch": 0.010100402813683641, "flos": 20191385341440.0, "grad_norm": 4.757095289861437, "language_loss": 0.95105213, "learning_rate": 3.2098872850910627e-06, "loss": 0.97837836, "num_input_tokens_seen": 1868540, "step": 84, "time_per_iteration": 2.7360031604766846 }, { "auxiliary_loss_clip": 0.01562163, "auxiliary_loss_mlp": 0.01159864, "balance_loss_clip": 1.21468508, "balance_loss_mlp": 1.05891752, "epoch": 0.010220645704322733, "flos": 17201642762880.0, "grad_norm": 2.0004180767270396, "language_loss": 0.89428413, "learning_rate": 3.2184607100038194e-06, "loss": 0.92150438, "num_input_tokens_seen": 1887180, "step": 85, "time_per_iteration": 2.750227451324463 }, { "auxiliary_loss_clip": 0.01552791, "auxiliary_loss_mlp": 0.01161189, "balance_loss_clip": 1.21063185, "balance_loss_mlp": 1.06210256, "epoch": 0.010340888594961822, "flos": 21470415805440.0, "grad_norm": 2.3752904192283237, "language_loss": 0.93176895, "learning_rate": 3.2269338586412414e-06, "loss": 0.9589088, "num_input_tokens_seen": 1904765, "step": 86, "time_per_iteration": 2.752938985824585 }, { "auxiliary_loss_clip": 0.01546932, "auxiliary_loss_mlp": 0.0113992, "balance_loss_clip": 1.2087152, "balance_loss_mlp": 1.0426929, "epoch": 0.010461131485600914, "flos": 23002831785600.0, "grad_norm": 3.2205494392080705, "language_loss": 0.96444941, "learning_rate": 3.2353090496083106e-06, "loss": 0.99131793, "num_input_tokens_seen": 1922600, "step": 87, "time_per_iteration": 2.7402877807617188 }, { "auxiliary_loss_clip": 0.01536077, "auxiliary_loss_mlp": 0.01156892, "balance_loss_clip": 1.2071774, "balance_loss_mlp": 1.05785251, "epoch": 0.010581374376240005, "flos": 33546850571520.0, "grad_norm": 1.8413796431387275, "language_loss": 0.81381607, "learning_rate": 3.2435885220114572e-06, "loss": 0.84074575, "num_input_tokens_seen": 1943950, "step": 88, "time_per_iteration": 2.7854058742523193 }, { "auxiliary_loss_clip": 0.01535628, "auxiliary_loss_mlp": 0.01148931, "balance_loss_clip": 1.20549226, "balance_loss_mlp": 1.05976248, "epoch": 0.010701617266879095, "flos": 21763087822080.0, "grad_norm": 1.9772457907390564, "language_loss": 0.94058704, "learning_rate": 3.2517744390519113e-06, "loss": 0.96743262, "num_input_tokens_seen": 1962815, "step": 89, "time_per_iteration": 2.7509875297546387 }, { "auxiliary_loss_clip": 0.01530298, "auxiliary_loss_mlp": 0.01135542, "balance_loss_clip": 1.19939089, "balance_loss_mlp": 1.04103315, "epoch": 0.010821860157518187, "flos": 19060199256960.0, "grad_norm": 3.91896009919292, "language_loss": 0.7520929, "learning_rate": 3.259868891418298e-06, "loss": 0.77875137, "num_input_tokens_seen": 1980580, "step": 90, "time_per_iteration": 2.7212650775909424 }, { "auxiliary_loss_clip": 0.01537872, "auxiliary_loss_mlp": 0.01153044, "balance_loss_clip": 1.206424, "balance_loss_mlp": 1.05843949, "epoch": 0.010942103048157278, "flos": 25447378757760.0, "grad_norm": 2.4123536814294324, "language_loss": 0.84973824, "learning_rate": 3.2678739004917757e-06, "loss": 0.87664735, "num_input_tokens_seen": 2000315, "step": 91, "time_per_iteration": 2.817582130432129 }, { "auxiliary_loss_clip": 0.01522677, "auxiliary_loss_mlp": 0.01144505, "balance_loss_clip": 1.19868875, "balance_loss_mlp": 1.05495501, "epoch": 0.011062345938796368, "flos": 27493928058240.0, "grad_norm": 1.6812312559575984, "language_loss": 0.92266858, "learning_rate": 3.275791421376029e-06, "loss": 0.9493404, "num_input_tokens_seen": 2023760, "step": 92, "time_per_iteration": 3.922977924346924 }, { "auxiliary_loss_clip": 0.01515726, "auxiliary_loss_mlp": 0.01140649, "balance_loss_clip": 1.19271111, "balance_loss_mlp": 1.04799986, "epoch": 0.01118258882943546, "flos": 16071210864000.0, "grad_norm": 1.994407379473281, "language_loss": 0.96168458, "learning_rate": 3.2836233457634622e-06, "loss": 0.98824835, "num_input_tokens_seen": 2041895, "step": 93, "time_per_iteration": 4.59586501121521 }, { "auxiliary_loss_clip": 0.01523992, "auxiliary_loss_mlp": 0.0113666, "balance_loss_clip": 1.19697511, "balance_loss_mlp": 1.05764842, "epoch": 0.011302831720074551, "flos": 20668602458880.0, "grad_norm": 2.8427162385353704, "language_loss": 0.85247374, "learning_rate": 3.2913715046481135e-06, "loss": 0.87908024, "num_input_tokens_seen": 2061640, "step": 94, "time_per_iteration": 2.7976150512695312 }, { "auxiliary_loss_clip": 0.0151256, "auxiliary_loss_mlp": 0.01135555, "balance_loss_clip": 1.19105339, "balance_loss_mlp": 1.05368245, "epoch": 0.011423074610713641, "flos": 13072238490240.0, "grad_norm": 2.0647788360735126, "language_loss": 0.88761842, "learning_rate": 3.299037670895023e-06, "loss": 0.91409957, "num_input_tokens_seen": 2078255, "step": 95, "time_per_iteration": 2.781954765319824 }, { "auxiliary_loss_clip": 0.01514221, "auxiliary_loss_mlp": 0.01132428, "balance_loss_clip": 1.19299376, "balance_loss_mlp": 1.04821897, "epoch": 0.011543317501352733, "flos": 30335646689280.0, "grad_norm": 1.7587387437828401, "language_loss": 0.80381393, "learning_rate": 3.3066235616750667e-06, "loss": 0.83028042, "num_input_tokens_seen": 2099490, "step": 96, "time_per_iteration": 2.909999132156372 }, { "auxiliary_loss_clip": 0.01497582, "auxiliary_loss_mlp": 0.01135382, "balance_loss_clip": 1.1856389, "balance_loss_mlp": 1.05484462, "epoch": 0.011663560391991824, "flos": 15522962601600.0, "grad_norm": 2.3198670890628694, "language_loss": 0.92506838, "learning_rate": 3.3141308407736276e-06, "loss": 0.95139807, "num_input_tokens_seen": 2116125, "step": 97, "time_per_iteration": 2.7998838424682617 }, { "auxiliary_loss_clip": 0.01500404, "auxiliary_loss_mlp": 0.01129406, "balance_loss_clip": 1.18116665, "balance_loss_mlp": 1.05072796, "epoch": 0.011783803282630914, "flos": 19902125116800.0, "grad_norm": 3.0169711288427066, "language_loss": 0.86680913, "learning_rate": 3.321561120780869e-06, "loss": 0.89310718, "num_input_tokens_seen": 2134835, "step": 98, "time_per_iteration": 2.72688889503479 }, { "auxiliary_loss_clip": 0.01495115, "auxiliary_loss_mlp": 0.01125006, "balance_loss_clip": 1.18330252, "balance_loss_mlp": 1.04995203, "epoch": 0.011904046173270006, "flos": 22340674517760.0, "grad_norm": 2.17587826712167, "language_loss": 1.01564705, "learning_rate": 3.3289159651708192e-06, "loss": 1.04184818, "num_input_tokens_seen": 2152410, "step": 99, "time_per_iteration": 2.7411887645721436 }, { "auxiliary_loss_clip": 0.01493462, "auxiliary_loss_mlp": 0.01134296, "balance_loss_clip": 1.18038177, "balance_loss_mlp": 1.06038642, "epoch": 0.012024289063909096, "flos": 19100060375040.0, "grad_norm": 2.0166192151190323, "language_loss": 0.97754025, "learning_rate": 3.3361968902759768e-06, "loss": 1.0038178, "num_input_tokens_seen": 2172090, "step": 100, "time_per_iteration": 2.779336929321289 }, { "auxiliary_loss_clip": 0.01483579, "auxiliary_loss_mlp": 0.01135508, "balance_loss_clip": 1.178069, "balance_loss_mlp": 1.06469786, "epoch": 0.012144531954548187, "flos": 15012205159680.0, "grad_norm": 2.8707536561908156, "language_loss": 0.93887067, "learning_rate": 3.343405367163663e-06, "loss": 0.96506155, "num_input_tokens_seen": 2189020, "step": 101, "time_per_iteration": 2.777034282684326 }, { "auxiliary_loss_clip": 0.01484395, "auxiliary_loss_mlp": 0.0112259, "balance_loss_clip": 1.17845297, "balance_loss_mlp": 1.05197048, "epoch": 0.012264774845187279, "flos": 15122020014720.0, "grad_norm": 6.875835543669841, "language_loss": 0.81102765, "learning_rate": 3.350542823419951e-06, "loss": 0.83709753, "num_input_tokens_seen": 2205620, "step": 102, "time_per_iteration": 2.734243869781494 }, { "auxiliary_loss_clip": 0.01484316, "auxiliary_loss_mlp": 0.01117176, "balance_loss_clip": 1.17953575, "balance_loss_mlp": 1.04660368, "epoch": 0.012385017735826368, "flos": 13949248959360.0, "grad_norm": 3.389540487078456, "language_loss": 0.87674868, "learning_rate": 3.3576106448465615e-06, "loss": 0.90276366, "num_input_tokens_seen": 2219000, "step": 103, "time_per_iteration": 2.6559481620788574 }, { "auxiliary_loss_clip": 0.01479638, "auxiliary_loss_mlp": 0.0112939, "balance_loss_clip": 1.17575514, "balance_loss_mlp": 1.06015313, "epoch": 0.01250526062646546, "flos": 23623260428160.0, "grad_norm": 1.9282880503669313, "language_loss": 0.88130212, "learning_rate": 3.3646101770757797e-06, "loss": 0.90739238, "num_input_tokens_seen": 2237790, "step": 104, "time_per_iteration": 2.938173532485962 }, { "auxiliary_loss_clip": 0.01477257, "auxiliary_loss_mlp": 0.01124444, "balance_loss_clip": 1.17578983, "balance_loss_mlp": 1.05382419, "epoch": 0.012625503517104552, "flos": 34640078958720.0, "grad_norm": 1.650231066134, "language_loss": 0.8561911, "learning_rate": 3.371542727108104e-06, "loss": 0.88220811, "num_input_tokens_seen": 2259965, "step": 105, "time_per_iteration": 2.8657491207122803 }, { "auxiliary_loss_clip": 0.01477366, "auxiliary_loss_mlp": 0.01127403, "balance_loss_clip": 1.1710912, "balance_loss_mlp": 1.06298208, "epoch": 0.012745746407743641, "flos": 17821891837440.0, "grad_norm": 2.8613006690534406, "language_loss": 0.90162194, "learning_rate": 3.3784095647770114e-06, "loss": 0.92766958, "num_input_tokens_seen": 2278610, "step": 106, "time_per_iteration": 2.7867159843444824 }, { "auxiliary_loss_clip": 0.01475541, "auxiliary_loss_mlp": 0.01100802, "balance_loss_clip": 1.1693182, "balance_loss_mlp": 1.03499889, "epoch": 0.012865989298382733, "flos": 20595057361920.0, "grad_norm": 1.9691508351709524, "language_loss": 0.88480306, "learning_rate": 3.3852119241449547e-06, "loss": 0.91056645, "num_input_tokens_seen": 2297730, "step": 107, "time_per_iteration": 2.9053361415863037 }, { "auxiliary_loss_clip": 0.01476977, "auxiliary_loss_mlp": 0.01116414, "balance_loss_clip": 1.17029607, "balance_loss_mlp": 1.05576003, "epoch": 0.012986232189021825, "flos": 23948969978880.0, "grad_norm": 2.9034418336669003, "language_loss": 0.96280235, "learning_rate": 3.3919510048344295e-06, "loss": 0.98873615, "num_input_tokens_seen": 2315740, "step": 108, "time_per_iteration": 2.7983059883117676 }, { "auxiliary_loss_clip": 0.0146128, "auxiliary_loss_mlp": 0.01113405, "balance_loss_clip": 1.16531038, "balance_loss_mlp": 1.04669547, "epoch": 0.013106475079660914, "flos": 23725425686400.0, "grad_norm": 3.4401852475014953, "language_loss": 0.86709547, "learning_rate": 3.3986279732976907e-06, "loss": 0.89284235, "num_input_tokens_seen": 2334215, "step": 109, "time_per_iteration": 2.790989398956299 }, { "auxiliary_loss_clip": 0.01456177, "auxiliary_loss_mlp": 0.01114555, "balance_loss_clip": 1.16454327, "balance_loss_mlp": 1.0552845, "epoch": 0.013226717970300006, "flos": 21102438925440.0, "grad_norm": 1.9013836174535417, "language_loss": 0.95398986, "learning_rate": 3.4052439640284983e-06, "loss": 0.97969717, "num_input_tokens_seen": 2353130, "step": 110, "time_per_iteration": 2.7106270790100098 }, { "auxiliary_loss_clip": 0.01453704, "auxiliary_loss_mlp": 0.01115673, "balance_loss_clip": 1.16062534, "balance_loss_mlp": 1.05716538, "epoch": 0.013346960860939098, "flos": 24863902231680.0, "grad_norm": 1.752983957042692, "language_loss": 0.8112185, "learning_rate": 3.4118000807190217e-06, "loss": 0.83691227, "num_input_tokens_seen": 2374010, "step": 111, "time_per_iteration": 2.7654178142547607 }, { "auxiliary_loss_clip": 0.01462483, "auxiliary_loss_mlp": 0.01110887, "balance_loss_clip": 1.16443121, "balance_loss_mlp": 1.05037665, "epoch": 0.013467203751578187, "flos": 28181940140160.0, "grad_norm": 2.23689009784611, "language_loss": 0.76015806, "learning_rate": 3.4182973973648723e-06, "loss": 0.78589171, "num_input_tokens_seen": 2395220, "step": 112, "time_per_iteration": 2.7869327068328857 }, { "auxiliary_loss_clip": 0.01459778, "auxiliary_loss_mlp": 0.01110569, "balance_loss_clip": 1.16549063, "balance_loss_mlp": 1.05134535, "epoch": 0.013587446642217279, "flos": 18916233546240.0, "grad_norm": 3.3184543885205495, "language_loss": 0.95210057, "learning_rate": 3.424736959321014e-06, "loss": 0.97780406, "num_input_tokens_seen": 2413025, "step": 113, "time_per_iteration": 2.7201180458068848 }, { "auxiliary_loss_clip": 0.01458532, "auxiliary_loss_mlp": 0.01110163, "balance_loss_clip": 1.16163325, "balance_loss_mlp": 1.05268002, "epoch": 0.01370768953285637, "flos": 23988615615360.0, "grad_norm": 2.0413813015077875, "language_loss": 0.88817143, "learning_rate": 3.431119784311155e-06, "loss": 0.91385841, "num_input_tokens_seen": 2432700, "step": 114, "time_per_iteration": 2.747728109359741 }, { "auxiliary_loss_clip": 0.01456742, "auxiliary_loss_mlp": 0.01112716, "balance_loss_clip": 1.16523695, "balance_loss_mlp": 1.05897617, "epoch": 0.01382793242349546, "flos": 39202565512320.0, "grad_norm": 2.475430799183803, "language_loss": 0.77576876, "learning_rate": 3.43744686339307e-06, "loss": 0.80146337, "num_input_tokens_seen": 2455020, "step": 115, "time_per_iteration": 2.8845136165618896 }, { "auxiliary_loss_clip": 0.01450588, "auxiliary_loss_mlp": 0.01098063, "balance_loss_clip": 1.15495992, "balance_loss_mlp": 1.03745711, "epoch": 0.013948175314134552, "flos": 41353506714240.0, "grad_norm": 2.0657318616582647, "language_loss": 0.9096241, "learning_rate": 3.44371916188212e-06, "loss": 0.93511063, "num_input_tokens_seen": 2475775, "step": 116, "time_per_iteration": 2.9041669368743896 }, { "auxiliary_loss_clip": 0.01449576, "auxiliary_loss_mlp": 0.01105417, "balance_loss_clip": 1.15606666, "balance_loss_mlp": 1.05096221, "epoch": 0.014068418204773643, "flos": 22453542028800.0, "grad_norm": 2.0340544916118772, "language_loss": 0.86261851, "learning_rate": 3.449937620235143e-06, "loss": 0.88816845, "num_input_tokens_seen": 2496370, "step": 117, "time_per_iteration": 2.70540189743042 }, { "auxiliary_loss_clip": 0.01442429, "auxiliary_loss_mlp": 0.01104463, "balance_loss_clip": 1.15424657, "balance_loss_mlp": 1.0472424, "epoch": 0.014188661095412733, "flos": 23805147922560.0, "grad_norm": 1.9024540572394217, "language_loss": 0.89168489, "learning_rate": 3.456103154896722e-06, "loss": 0.91715384, "num_input_tokens_seen": 2517645, "step": 118, "time_per_iteration": 3.765204429626465 }, { "auxiliary_loss_clip": 0.01437781, "auxiliary_loss_mlp": 0.01116023, "balance_loss_clip": 1.1523478, "balance_loss_mlp": 1.06459594, "epoch": 0.014308903986051825, "flos": 23660248458240.0, "grad_norm": 1.806766071049921, "language_loss": 0.92408401, "learning_rate": 3.462216659109757e-06, "loss": 0.94962204, "num_input_tokens_seen": 2537825, "step": 119, "time_per_iteration": 4.637160301208496 }, { "auxiliary_loss_clip": 0.01448032, "auxiliary_loss_mlp": 0.01108621, "balance_loss_clip": 1.15620947, "balance_loss_mlp": 1.05011308, "epoch": 0.014429146876690916, "flos": 20667991927680.0, "grad_norm": 2.5269882520840152, "language_loss": 0.85467988, "learning_rate": 3.4682790036921077e-06, "loss": 0.8802464, "num_input_tokens_seen": 2556485, "step": 120, "time_per_iteration": 3.706763505935669 }, { "auxiliary_loss_clip": 0.01435106, "auxiliary_loss_mlp": 0.01098133, "balance_loss_clip": 1.14920521, "balance_loss_mlp": 1.04451287, "epoch": 0.014549389767330006, "flos": 20229199384320.0, "grad_norm": 2.0824040084400512, "language_loss": 0.83126497, "learning_rate": 3.4742910377810193e-06, "loss": 0.85659736, "num_input_tokens_seen": 2573945, "step": 121, "time_per_iteration": 2.8266806602478027 }, { "auxiliary_loss_clip": 0.01434528, "auxiliary_loss_mlp": 0.01100971, "balance_loss_clip": 1.14912331, "balance_loss_mlp": 1.04816127, "epoch": 0.014669632657969098, "flos": 18004174381440.0, "grad_norm": 2.1234942193359525, "language_loss": 0.88723534, "learning_rate": 3.4802535895469042e-06, "loss": 0.91259038, "num_input_tokens_seen": 2592695, "step": 122, "time_per_iteration": 2.798060655593872 }, { "auxiliary_loss_clip": 0.01436204, "auxiliary_loss_mlp": 0.01106911, "balance_loss_clip": 1.14992511, "balance_loss_mlp": 1.05691409, "epoch": 0.01478987554860819, "flos": 22741796672640.0, "grad_norm": 2.1117918111629135, "language_loss": 0.89790356, "learning_rate": 3.4861674668779934e-06, "loss": 0.92333472, "num_input_tokens_seen": 2610925, "step": 123, "time_per_iteration": 2.7501001358032227 }, { "auxiliary_loss_clip": 0.01429196, "auxiliary_loss_mlp": 0.01094674, "balance_loss_clip": 1.14724386, "balance_loss_mlp": 1.04782438, "epoch": 0.01491011843924728, "flos": 17198590106880.0, "grad_norm": 2.0388835135399406, "language_loss": 0.83994025, "learning_rate": 3.492033458037272e-06, "loss": 0.86517888, "num_input_tokens_seen": 2629495, "step": 124, "time_per_iteration": 2.787310838699341 }, { "auxiliary_loss_clip": 0.01426915, "auxiliary_loss_mlp": 0.01102147, "balance_loss_clip": 1.14474726, "balance_loss_mlp": 1.05293703, "epoch": 0.01503036132988637, "flos": 17673867889920.0, "grad_norm": 2.4358841681042187, "language_loss": 0.86811411, "learning_rate": 3.497852332293018e-06, "loss": 0.89340466, "num_input_tokens_seen": 2645070, "step": 125, "time_per_iteration": 2.6752512454986572 }, { "auxiliary_loss_clip": 0.01428633, "auxiliary_loss_mlp": 0.01102452, "balance_loss_clip": 1.14464772, "balance_loss_mlp": 1.05374336, "epoch": 0.015150604220525462, "flos": 18878239935360.0, "grad_norm": 2.1588874630789427, "language_loss": 0.968225, "learning_rate": 3.5036248405242356e-06, "loss": 0.99353576, "num_input_tokens_seen": 2663825, "step": 126, "time_per_iteration": 2.7756388187408447 }, { "auxiliary_loss_clip": 0.01427342, "auxiliary_loss_mlp": 0.01088421, "balance_loss_clip": 1.14310789, "balance_loss_mlp": 1.04166698, "epoch": 0.015270847111164552, "flos": 39420184060800.0, "grad_norm": 2.3217971322988915, "language_loss": 0.8251406, "learning_rate": 3.509351715802146e-06, "loss": 0.85029817, "num_input_tokens_seen": 2684710, "step": 127, "time_per_iteration": 2.8891191482543945 }, { "auxiliary_loss_clip": 0.01423095, "auxiliary_loss_mlp": 0.01088894, "balance_loss_clip": 1.14165354, "balance_loss_mlp": 1.04185367, "epoch": 0.015391090001803644, "flos": 43762466286720.0, "grad_norm": 2.0045216201446854, "language_loss": 0.78412199, "learning_rate": 3.5150336739488763e-06, "loss": 0.80924189, "num_input_tokens_seen": 2706995, "step": 128, "time_per_iteration": 2.9337754249572754 }, { "auxiliary_loss_clip": 0.01418247, "auxiliary_loss_mlp": 0.01096277, "balance_loss_clip": 1.14000511, "balance_loss_mlp": 1.04866505, "epoch": 0.015511332892442733, "flos": 18916341287040.0, "grad_norm": 1.8829458860688257, "language_loss": 0.8396486, "learning_rate": 3.5206714140744143e-06, "loss": 0.86479378, "num_input_tokens_seen": 2727050, "step": 129, "time_per_iteration": 2.7400522232055664 }, { "auxiliary_loss_clip": 0.01431873, "auxiliary_loss_mlp": 0.01107127, "balance_loss_clip": 1.15068018, "balance_loss_mlp": 1.05622435, "epoch": 0.015631575783081827, "flos": 24535283679360.0, "grad_norm": 2.936924842199411, "language_loss": 0.87714565, "learning_rate": 3.5262656190928208e-06, "loss": 0.90253562, "num_input_tokens_seen": 2745350, "step": 130, "time_per_iteration": 2.8518614768981934 }, { "auxiliary_loss_clip": 0.0147705, "auxiliary_loss_mlp": 0.01047416, "balance_loss_clip": 1.24932718, "balance_loss_mlp": 1.0218575, "epoch": 0.015751818673720917, "flos": 62328536098560.0, "grad_norm": 1.034354854534219, "language_loss": 0.71520704, "learning_rate": 3.5318169562186737e-06, "loss": 0.74045169, "num_input_tokens_seen": 2814195, "step": 131, "time_per_iteration": 3.354323625564575 }, { "auxiliary_loss_clip": 0.01414398, "auxiliary_loss_mlp": 0.01108343, "balance_loss_clip": 1.13749123, "balance_loss_mlp": 1.06044436, "epoch": 0.015872061564360006, "flos": 23878549365120.0, "grad_norm": 3.8316805273671926, "language_loss": 0.8226009, "learning_rate": 3.5373260774446292e-06, "loss": 0.84782833, "num_input_tokens_seen": 2834645, "step": 132, "time_per_iteration": 2.8335580825805664 }, { "auxiliary_loss_clip": 0.01410484, "auxiliary_loss_mlp": 0.01087375, "balance_loss_clip": 1.13413739, "balance_loss_mlp": 1.04596174, "epoch": 0.0159923044549991, "flos": 23367899664000.0, "grad_norm": 2.295363233538807, "language_loss": 0.90258223, "learning_rate": 3.542793620000961e-06, "loss": 0.92756081, "num_input_tokens_seen": 2854120, "step": 133, "time_per_iteration": 2.8527915477752686 }, { "auxiliary_loss_clip": 0.01413996, "auxiliary_loss_mlp": 0.01089292, "balance_loss_clip": 1.1383698, "balance_loss_mlp": 1.0492382, "epoch": 0.01611254734563819, "flos": 17858305249920.0, "grad_norm": 2.240721984769087, "language_loss": 0.86962557, "learning_rate": 3.5482202067978894e-06, "loss": 0.89465839, "num_input_tokens_seen": 2871330, "step": 134, "time_per_iteration": 2.7263360023498535 }, { "auxiliary_loss_clip": 0.01412893, "auxiliary_loss_mlp": 0.01091897, "balance_loss_clip": 1.13600302, "balance_loss_mlp": 1.04833817, "epoch": 0.01623279023627728, "flos": 20954774113920.0, "grad_norm": 2.798652089333793, "language_loss": 0.76097536, "learning_rate": 3.553606446851471e-06, "loss": 0.78602326, "num_input_tokens_seen": 2888070, "step": 135, "time_per_iteration": 2.8291869163513184 }, { "auxiliary_loss_clip": 0.01409765, "auxiliary_loss_mlp": 0.010846, "balance_loss_clip": 1.13389206, "balance_loss_mlp": 1.04287624, "epoch": 0.016353033126916373, "flos": 15742412743680.0, "grad_norm": 2.243809666548272, "language_loss": 0.83312851, "learning_rate": 3.5589529356937613e-06, "loss": 0.85807216, "num_input_tokens_seen": 2906465, "step": 136, "time_per_iteration": 2.7921762466430664 }, { "auxiliary_loss_clip": 0.01408099, "auxiliary_loss_mlp": 0.01087257, "balance_loss_clip": 1.13358688, "balance_loss_mlp": 1.04493737, "epoch": 0.016473276017555463, "flos": 18807280617600.0, "grad_norm": 1.6987356152795927, "language_loss": 0.76823217, "learning_rate": 3.5642602557679627e-06, "loss": 0.79318571, "num_input_tokens_seen": 2924915, "step": 137, "time_per_iteration": 2.7114295959472656 }, { "auxiliary_loss_clip": 0.01414616, "auxiliary_loss_mlp": 0.0111384, "balance_loss_clip": 1.14077044, "balance_loss_mlp": 1.07109118, "epoch": 0.016593518908194552, "flos": 24352641999360.0, "grad_norm": 2.425252580650448, "language_loss": 0.84153837, "learning_rate": 3.569528976809202e-06, "loss": 0.86682296, "num_input_tokens_seen": 2942130, "step": 138, "time_per_iteration": 2.8083784580230713 }, { "auxiliary_loss_clip": 0.01407977, "auxiliary_loss_mlp": 0.0108317, "balance_loss_clip": 1.13221502, "balance_loss_mlp": 1.04194784, "epoch": 0.016713761798833646, "flos": 22346133384960.0, "grad_norm": 1.738319234509562, "language_loss": 0.89907324, "learning_rate": 3.5747596562115522e-06, "loss": 0.92398471, "num_input_tokens_seen": 2962745, "step": 139, "time_per_iteration": 2.736917495727539 }, { "auxiliary_loss_clip": 0.01410843, "auxiliary_loss_mlp": 0.01090682, "balance_loss_clip": 1.13356233, "balance_loss_mlp": 1.0496738, "epoch": 0.016834004689472735, "flos": 17821820010240.0, "grad_norm": 4.329679301988367, "language_loss": 0.90983766, "learning_rate": 3.5799528393819138e-06, "loss": 0.93485302, "num_input_tokens_seen": 2981825, "step": 140, "time_per_iteration": 2.707399368286133 }, { "auxiliary_loss_clip": 0.01402501, "auxiliary_loss_mlp": 0.01085241, "balance_loss_clip": 1.13133824, "balance_loss_mlp": 1.04370821, "epoch": 0.016954247580111825, "flos": 20519501103360.0, "grad_norm": 2.167392985463678, "language_loss": 0.88083106, "learning_rate": 3.585109060081286e-06, "loss": 0.90570855, "num_input_tokens_seen": 3001625, "step": 141, "time_per_iteration": 2.7193057537078857 }, { "auxiliary_loss_clip": 0.01406484, "auxiliary_loss_mlp": 0.01100138, "balance_loss_clip": 1.13137007, "balance_loss_mlp": 1.05467129, "epoch": 0.017074490470750915, "flos": 22088869200000.0, "grad_norm": 2.1219657913225816, "language_loss": 0.78596741, "learning_rate": 3.590228840753992e-06, "loss": 0.81103367, "num_input_tokens_seen": 3022055, "step": 142, "time_per_iteration": 2.7812976837158203 }, { "auxiliary_loss_clip": 0.01397753, "auxiliary_loss_mlp": 0.01089602, "balance_loss_clip": 1.12918568, "balance_loss_mlp": 1.05002451, "epoch": 0.01719473336139001, "flos": 15997270717440.0, "grad_norm": 2.004976270626316, "language_loss": 0.87466812, "learning_rate": 3.5953126928453423e-06, "loss": 0.89954174, "num_input_tokens_seen": 3039605, "step": 143, "time_per_iteration": 2.6732726097106934 }, { "auxiliary_loss_clip": 0.01398209, "auxiliary_loss_mlp": 0.01089999, "balance_loss_clip": 1.1275568, "balance_loss_mlp": 1.0510416, "epoch": 0.017314976252029098, "flos": 22492038430080.0, "grad_norm": 1.8928630708794982, "language_loss": 0.80518627, "learning_rate": 3.600361117108239e-06, "loss": 0.83006829, "num_input_tokens_seen": 3059405, "step": 144, "time_per_iteration": 3.703705072402954 }, { "auxiliary_loss_clip": 0.01399885, "auxiliary_loss_mlp": 0.0110267, "balance_loss_clip": 1.12803328, "balance_loss_mlp": 1.06421292, "epoch": 0.017435219142668188, "flos": 22018053536640.0, "grad_norm": 1.8741605374397714, "language_loss": 0.97098404, "learning_rate": 3.6053746038991616e-06, "loss": 0.99600953, "num_input_tokens_seen": 3078490, "step": 145, "time_per_iteration": 3.612124443054199 }, { "auxiliary_loss_clip": 0.01404093, "auxiliary_loss_mlp": 0.01024416, "balance_loss_clip": 1.19178367, "balance_loss_mlp": 1.00133741, "epoch": 0.01755546203330728, "flos": 72240526149120.0, "grad_norm": 1.0507977384380534, "language_loss": 0.58418441, "learning_rate": 3.6103536334639843e-06, "loss": 0.60846949, "num_input_tokens_seen": 3131755, "step": 146, "time_per_iteration": 4.201429843902588 }, { "auxiliary_loss_clip": 0.01390371, "auxiliary_loss_mlp": 0.01088318, "balance_loss_clip": 1.12208343, "balance_loss_mlp": 1.0525316, "epoch": 0.01767570492394637, "flos": 25337061112320.0, "grad_norm": 2.508722123557612, "language_loss": 0.85574704, "learning_rate": 3.615298676214041e-06, "loss": 0.88053393, "num_input_tokens_seen": 3152035, "step": 147, "time_per_iteration": 2.7530224323272705 }, { "auxiliary_loss_clip": 0.01392544, "auxiliary_loss_mlp": 0.01076318, "balance_loss_clip": 1.12440968, "balance_loss_mlp": 1.03788507, "epoch": 0.01779594781458546, "flos": 20449188230400.0, "grad_norm": 2.5493355184899715, "language_loss": 0.88804615, "learning_rate": 3.6202101929928317e-06, "loss": 0.91273475, "num_input_tokens_seen": 3170625, "step": 148, "time_per_iteration": 2.7226905822753906 }, { "auxiliary_loss_clip": 0.01387817, "auxiliary_loss_mlp": 0.01075233, "balance_loss_clip": 1.12215137, "balance_loss_mlp": 1.03832603, "epoch": 0.017916190705224554, "flos": 16253601148800.0, "grad_norm": 2.1598483622943903, "language_loss": 0.88271588, "learning_rate": 3.6250886353337413e-06, "loss": 0.90734637, "num_input_tokens_seen": 3188155, "step": 149, "time_per_iteration": 2.702954053878784 }, { "auxiliary_loss_clip": 0.01389365, "auxiliary_loss_mlp": 0.01091172, "balance_loss_clip": 1.12154543, "balance_loss_mlp": 1.05507541, "epoch": 0.018036433595863644, "flos": 23330588411520.0, "grad_norm": 2.2077200531333347, "language_loss": 0.86413914, "learning_rate": 3.6299344457091488e-06, "loss": 0.88894451, "num_input_tokens_seen": 3209015, "step": 150, "time_per_iteration": 2.796477794647217 }, { "auxiliary_loss_clip": 0.01393626, "auxiliary_loss_mlp": 0.01083717, "balance_loss_clip": 1.12300491, "balance_loss_mlp": 1.04640436, "epoch": 0.018156676486502734, "flos": 18588010043520.0, "grad_norm": 7.05511520552158, "language_loss": 0.93628347, "learning_rate": 3.634748057771256e-06, "loss": 0.96105689, "num_input_tokens_seen": 3224955, "step": 151, "time_per_iteration": 2.7627451419830322 }, { "auxiliary_loss_clip": 0.0137941, "auxiliary_loss_mlp": 0.01087807, "balance_loss_clip": 1.11491776, "balance_loss_mlp": 1.0511626, "epoch": 0.018276919377141827, "flos": 25448707560960.0, "grad_norm": 1.772004659630072, "language_loss": 0.85623938, "learning_rate": 3.639529896584965e-06, "loss": 0.88091159, "num_input_tokens_seen": 3246330, "step": 152, "time_per_iteration": 2.8304662704467773 }, { "auxiliary_loss_clip": 0.01383751, "auxiliary_loss_mlp": 0.01093014, "balance_loss_clip": 1.11867595, "balance_loss_mlp": 1.05624986, "epoch": 0.018397162267780917, "flos": 20047311889920.0, "grad_norm": 2.934851762314538, "language_loss": 0.88875711, "learning_rate": 3.6442803788531233e-06, "loss": 0.91352475, "num_input_tokens_seen": 3264290, "step": 153, "time_per_iteration": 2.6817362308502197 }, { "auxiliary_loss_clip": 0.01388579, "auxiliary_loss_mlp": 0.01089264, "balance_loss_clip": 1.12014794, "balance_loss_mlp": 1.05216575, "epoch": 0.018517405158420007, "flos": 27565282425600.0, "grad_norm": 2.746939211522687, "language_loss": 0.9580251, "learning_rate": 3.6489999131344357e-06, "loss": 0.98280346, "num_input_tokens_seen": 3287065, "step": 154, "time_per_iteration": 2.737531900405884 }, { "auxiliary_loss_clip": 0.01384287, "auxiliary_loss_mlp": 0.01078624, "balance_loss_clip": 1.1220423, "balance_loss_mlp": 1.04419684, "epoch": 0.0186376480490591, "flos": 19354056422400.0, "grad_norm": 2.289419950971102, "language_loss": 0.90474176, "learning_rate": 3.653688900054313e-06, "loss": 0.92937088, "num_input_tokens_seen": 3305595, "step": 155, "time_per_iteration": 2.684894561767578 }, { "auxiliary_loss_clip": 0.01377568, "auxiliary_loss_mlp": 0.01075928, "balance_loss_clip": 1.11102676, "balance_loss_mlp": 1.04037929, "epoch": 0.01875789093969819, "flos": 26687840993280.0, "grad_norm": 2.5307630857015786, "language_loss": 0.76233053, "learning_rate": 3.6583477325089526e-06, "loss": 0.78686547, "num_input_tokens_seen": 3326135, "step": 156, "time_per_iteration": 2.734891891479492 }, { "auxiliary_loss_clip": 0.0137918, "auxiliary_loss_mlp": 0.01065948, "balance_loss_clip": 1.11586201, "balance_loss_mlp": 1.03206861, "epoch": 0.01887813383033728, "flos": 24353001135360.0, "grad_norm": 2.4190343604320454, "language_loss": 1.04121923, "learning_rate": 3.6629767958628916e-06, "loss": 1.06567049, "num_input_tokens_seen": 3343510, "step": 157, "time_per_iteration": 2.717433452606201 }, { "auxiliary_loss_clip": 0.01381726, "auxiliary_loss_mlp": 0.01071216, "balance_loss_clip": 1.11668026, "balance_loss_mlp": 1.03690767, "epoch": 0.018998376720976373, "flos": 14647532330880.0, "grad_norm": 2.6122358715373037, "language_loss": 0.85375851, "learning_rate": 3.667576468140291e-06, "loss": 0.87828791, "num_input_tokens_seen": 3361325, "step": 158, "time_per_iteration": 2.6094589233398438 }, { "auxiliary_loss_clip": 0.01374536, "auxiliary_loss_mlp": 0.01082752, "balance_loss_clip": 1.10974801, "balance_loss_mlp": 1.05016017, "epoch": 0.019118619611615463, "flos": 29305261146240.0, "grad_norm": 2.4375298892428208, "language_loss": 0.88701355, "learning_rate": 3.672147120210184e-06, "loss": 0.9115864, "num_input_tokens_seen": 3377925, "step": 159, "time_per_iteration": 2.7333104610443115 }, { "auxiliary_loss_clip": 0.0137595, "auxiliary_loss_mlp": 0.01077424, "balance_loss_clip": 1.11380601, "balance_loss_mlp": 1.04335451, "epoch": 0.019238862502254553, "flos": 20886723797760.0, "grad_norm": 2.063695412784422, "language_loss": 0.86271811, "learning_rate": 3.6766891159659177e-06, "loss": 0.88725185, "num_input_tokens_seen": 3396335, "step": 160, "time_per_iteration": 2.6537106037139893 }, { "auxiliary_loss_clip": 0.01377868, "auxiliary_loss_mlp": 0.0108042, "balance_loss_clip": 1.11676824, "balance_loss_mlp": 1.04801941, "epoch": 0.019359105392893646, "flos": 21360672777600.0, "grad_norm": 3.3820366762993093, "language_loss": 0.87688017, "learning_rate": 3.6812028124990075e-06, "loss": 0.90146303, "num_input_tokens_seen": 3413605, "step": 161, "time_per_iteration": 2.783328056335449 }, { "auxiliary_loss_clip": 0.01369602, "auxiliary_loss_mlp": 0.01089542, "balance_loss_clip": 1.10907066, "balance_loss_mlp": 1.05509019, "epoch": 0.019479348283532736, "flos": 16283729681280.0, "grad_norm": 5.897061300783691, "language_loss": 0.81908345, "learning_rate": 3.6856885602676016e-06, "loss": 0.8436749, "num_input_tokens_seen": 3429640, "step": 162, "time_per_iteration": 2.8100359439849854 }, { "auxiliary_loss_clip": 0.01374243, "auxiliary_loss_mlp": 0.01086312, "balance_loss_clip": 1.11087704, "balance_loss_mlp": 1.05083513, "epoch": 0.019599591174171826, "flos": 22091239497600.0, "grad_norm": 2.4752146681662883, "language_loss": 0.9422214, "learning_rate": 3.6901467032597733e-06, "loss": 0.96682692, "num_input_tokens_seen": 3448125, "step": 163, "time_per_iteration": 2.7646701335906982 }, { "auxiliary_loss_clip": 0.01374954, "auxiliary_loss_mlp": 0.01086527, "balance_loss_clip": 1.11148202, "balance_loss_mlp": 1.05317235, "epoch": 0.01971983406481092, "flos": 19609668581760.0, "grad_norm": 2.2528898344093675, "language_loss": 0.87445205, "learning_rate": 3.694577579151804e-06, "loss": 0.89906687, "num_input_tokens_seen": 3466535, "step": 164, "time_per_iteration": 2.678227663040161 }, { "auxiliary_loss_clip": 0.01372843, "auxiliary_loss_mlp": 0.01082727, "balance_loss_clip": 1.11297035, "balance_loss_mlp": 1.04891944, "epoch": 0.01984007695545001, "flos": 19099342103040.0, "grad_norm": 2.3087565474589478, "language_loss": 0.73751879, "learning_rate": 3.6989815194616703e-06, "loss": 0.76207447, "num_input_tokens_seen": 3483730, "step": 165, "time_per_iteration": 2.721855640411377 }, { "auxiliary_loss_clip": 0.01367522, "auxiliary_loss_mlp": 0.01077184, "balance_loss_clip": 1.10641289, "balance_loss_mlp": 1.04359126, "epoch": 0.0199603198460891, "flos": 20848406964480.0, "grad_norm": 2.1599769126087778, "language_loss": 0.79916817, "learning_rate": 3.703358849697888e-06, "loss": 0.82361519, "num_input_tokens_seen": 3503640, "step": 166, "time_per_iteration": 2.7804155349731445 }, { "auxiliary_loss_clip": 0.01367346, "auxiliary_loss_mlp": 0.01090152, "balance_loss_clip": 1.10961926, "balance_loss_mlp": 1.05653536, "epoch": 0.020080562736728192, "flos": 21870747861120.0, "grad_norm": 3.540123885987892, "language_loss": 0.82829487, "learning_rate": 3.7077098895038803e-06, "loss": 0.85286987, "num_input_tokens_seen": 3523010, "step": 167, "time_per_iteration": 2.676445484161377 }, { "auxiliary_loss_clip": 0.01372166, "auxiliary_loss_mlp": 0.0108413, "balance_loss_clip": 1.11094534, "balance_loss_mlp": 1.05020285, "epoch": 0.020200805627367282, "flos": 21688788539520.0, "grad_norm": 2.7341441601237406, "language_loss": 0.97065806, "learning_rate": 3.712034952798045e-06, "loss": 0.99522096, "num_input_tokens_seen": 3541125, "step": 168, "time_per_iteration": 2.6942994594573975 }, { "auxiliary_loss_clip": 0.01369286, "auxiliary_loss_mlp": 0.01071414, "balance_loss_clip": 1.10721612, "balance_loss_mlp": 1.03863096, "epoch": 0.02032104851800637, "flos": 33543043729920.0, "grad_norm": 2.1298611901045876, "language_loss": 0.84825754, "learning_rate": 3.7163343479096656e-06, "loss": 0.87266445, "num_input_tokens_seen": 3562700, "step": 169, "time_per_iteration": 2.8514304161071777 }, { "auxiliary_loss_clip": 0.01365952, "auxiliary_loss_mlp": 0.0107805, "balance_loss_clip": 1.10730815, "balance_loss_mlp": 1.04841447, "epoch": 0.020441291408645465, "flos": 31686965274240.0, "grad_norm": 2.452560502852618, "language_loss": 0.82848603, "learning_rate": 3.720608377710802e-06, "loss": 0.85292602, "num_input_tokens_seen": 3582790, "step": 170, "time_per_iteration": 2.8251020908355713 }, { "auxiliary_loss_clip": 0.01364928, "auxiliary_loss_mlp": 0.01076637, "balance_loss_clip": 1.10654247, "balance_loss_mlp": 1.04516613, "epoch": 0.020561534299284555, "flos": 20886687884160.0, "grad_norm": 6.815778605514142, "language_loss": 0.86413366, "learning_rate": 3.7248573397443277e-06, "loss": 0.88854933, "num_input_tokens_seen": 3601715, "step": 171, "time_per_iteration": 4.642110109329224 }, { "auxiliary_loss_clip": 0.01366191, "auxiliary_loss_mlp": 0.01077874, "balance_loss_clip": 1.10778582, "balance_loss_mlp": 1.04533017, "epoch": 0.020681777189923645, "flos": 20996610480000.0, "grad_norm": 2.315916625536218, "language_loss": 0.97485292, "learning_rate": 3.729081526348224e-06, "loss": 0.99929357, "num_input_tokens_seen": 3620245, "step": 172, "time_per_iteration": 3.713261604309082 }, { "auxiliary_loss_clip": 0.01363319, "auxiliary_loss_mlp": 0.01070251, "balance_loss_clip": 1.10550499, "balance_loss_mlp": 1.03780222, "epoch": 0.020802020080562738, "flos": 28257532312320.0, "grad_norm": 1.765911659289122, "language_loss": 0.85082412, "learning_rate": 3.7332812247762777e-06, "loss": 0.87515986, "num_input_tokens_seen": 3641545, "step": 173, "time_per_iteration": 2.796400308609009 }, { "auxiliary_loss_clip": 0.0136408, "auxiliary_loss_mlp": 0.01076739, "balance_loss_clip": 1.10752773, "balance_loss_mlp": 1.04512429, "epoch": 0.020922262971201828, "flos": 19681274344320.0, "grad_norm": 3.035002362356243, "language_loss": 0.9510386, "learning_rate": 3.737456717315293e-06, "loss": 0.9754467, "num_input_tokens_seen": 3660510, "step": 174, "time_per_iteration": 2.7366597652435303 }, { "auxiliary_loss_clip": 0.01361749, "auxiliary_loss_mlp": 0.01072852, "balance_loss_clip": 1.10672081, "balance_loss_mlp": 1.04095197, "epoch": 0.021042505861840918, "flos": 15666353694720.0, "grad_norm": 1.8185495640913043, "language_loss": 0.90805662, "learning_rate": 3.7416082813989552e-06, "loss": 0.93240261, "num_input_tokens_seen": 3677505, "step": 175, "time_per_iteration": 2.822354555130005 }, { "auxiliary_loss_clip": 0.01366643, "auxiliary_loss_mlp": 0.01085639, "balance_loss_clip": 1.10589266, "balance_loss_mlp": 1.05218852, "epoch": 0.02116274875248001, "flos": 21142012734720.0, "grad_norm": 2.076994759158947, "language_loss": 0.89300144, "learning_rate": 3.745736189718439e-06, "loss": 0.91752428, "num_input_tokens_seen": 3696760, "step": 176, "time_per_iteration": 2.7950334548950195 }, { "auxiliary_loss_clip": 0.01360056, "auxiliary_loss_mlp": 0.01067792, "balance_loss_clip": 1.10451031, "balance_loss_mlp": 1.0365591, "epoch": 0.0212829916431191, "flos": 24715770543360.0, "grad_norm": 2.8585790927405994, "language_loss": 0.72774327, "learning_rate": 3.749840710329894e-06, "loss": 0.75202179, "num_input_tokens_seen": 3717465, "step": 177, "time_per_iteration": 2.794891119003296 }, { "auxiliary_loss_clip": 0.01365424, "auxiliary_loss_mlp": 0.01080412, "balance_loss_clip": 1.10693729, "balance_loss_mlp": 1.04808211, "epoch": 0.02140323453375819, "flos": 16645493508480.0, "grad_norm": 2.948632077161472, "language_loss": 0.98119462, "learning_rate": 3.7539221067588938e-06, "loss": 1.00565302, "num_input_tokens_seen": 3731440, "step": 178, "time_per_iteration": 2.676165819168091 }, { "auxiliary_loss_clip": 0.01362085, "auxiliary_loss_mlp": 0.01075453, "balance_loss_clip": 1.10481203, "balance_loss_mlp": 1.043648, "epoch": 0.021523477424397284, "flos": 20299332689280.0, "grad_norm": 3.946655758873742, "language_loss": 0.93728751, "learning_rate": 3.757980638101964e-06, "loss": 0.96166289, "num_input_tokens_seen": 3744935, "step": 179, "time_per_iteration": 2.7103846073150635 }, { "auxiliary_loss_clip": 0.01363637, "auxiliary_loss_mlp": 0.01076617, "balance_loss_clip": 1.10566497, "balance_loss_mlp": 1.04497862, "epoch": 0.021643720315036374, "flos": 26104005331200.0, "grad_norm": 2.498527398970377, "language_loss": 0.89441907, "learning_rate": 3.7620165591252806e-06, "loss": 0.91882169, "num_input_tokens_seen": 3763035, "step": 180, "time_per_iteration": 2.71836256980896 }, { "auxiliary_loss_clip": 0.01362889, "auxiliary_loss_mlp": 0.01084502, "balance_loss_clip": 1.1052928, "balance_loss_mlp": 1.05474746, "epoch": 0.021763963205675464, "flos": 24787663614720.0, "grad_norm": 1.8360745307311057, "language_loss": 0.94365728, "learning_rate": 3.766030120360636e-06, "loss": 0.96813118, "num_input_tokens_seen": 3782665, "step": 181, "time_per_iteration": 2.7241501808166504 }, { "auxiliary_loss_clip": 0.01353767, "auxiliary_loss_mlp": 0.01066868, "balance_loss_clip": 1.09885478, "balance_loss_mlp": 1.03539693, "epoch": 0.021884206096314557, "flos": 25813559957760.0, "grad_norm": 2.1018421892561236, "language_loss": 0.90228653, "learning_rate": 3.7700215681987578e-06, "loss": 0.92649293, "num_input_tokens_seen": 3802435, "step": 182, "time_per_iteration": 2.737093210220337 }, { "auxiliary_loss_clip": 0.013588, "auxiliary_loss_mlp": 0.01076393, "balance_loss_clip": 1.10274744, "balance_loss_mlp": 1.04351485, "epoch": 0.022004448986953647, "flos": 20082719721600.0, "grad_norm": 1.9019508538440082, "language_loss": 0.82120854, "learning_rate": 3.7739911449800767e-06, "loss": 0.84556055, "num_input_tokens_seen": 3822490, "step": 183, "time_per_iteration": 2.780498743057251 }, { "auxiliary_loss_clip": 0.01354533, "auxiliary_loss_mlp": 0.01073144, "balance_loss_clip": 1.09957719, "balance_loss_mlp": 1.04145813, "epoch": 0.022124691877592736, "flos": 20480609652480.0, "grad_norm": 1.77338251694263, "language_loss": 0.80792928, "learning_rate": 3.7779390890830114e-06, "loss": 0.83220613, "num_input_tokens_seen": 3841140, "step": 184, "time_per_iteration": 2.894601821899414 }, { "auxiliary_loss_clip": 0.01358755, "auxiliary_loss_mlp": 0.01074548, "balance_loss_clip": 1.10013509, "balance_loss_mlp": 1.04329145, "epoch": 0.02224493476823183, "flos": 23586847015680.0, "grad_norm": 2.2848425747529633, "language_loss": 0.85840887, "learning_rate": 3.7818656350098723e-06, "loss": 0.88274193, "num_input_tokens_seen": 3862090, "step": 185, "time_per_iteration": 2.7528247833251953 }, { "auxiliary_loss_clip": 0.01352767, "auxiliary_loss_mlp": 0.01083098, "balance_loss_clip": 1.09890413, "balance_loss_mlp": 1.05117369, "epoch": 0.02236517765887092, "flos": 16909940413440.0, "grad_norm": 2.448344914460138, "language_loss": 0.77065176, "learning_rate": 3.7857710134704447e-06, "loss": 0.79501039, "num_input_tokens_seen": 3881025, "step": 186, "time_per_iteration": 2.7822039127349854 }, { "auxiliary_loss_clip": 0.01355251, "auxiliary_loss_mlp": 0.01072314, "balance_loss_clip": 1.10182607, "balance_loss_mlp": 1.0417726, "epoch": 0.02248542054951001, "flos": 43508182930560.0, "grad_norm": 2.0044047028130825, "language_loss": 0.79090834, "learning_rate": 3.7896554514633234e-06, "loss": 0.81518394, "num_input_tokens_seen": 3905310, "step": 187, "time_per_iteration": 2.971869707107544 }, { "auxiliary_loss_clip": 0.01354337, "auxiliary_loss_mlp": 0.01062054, "balance_loss_clip": 1.09999347, "balance_loss_mlp": 1.03284776, "epoch": 0.022605663440149103, "flos": 23367648268800.0, "grad_norm": 1.9822016558472235, "language_loss": 0.84392262, "learning_rate": 3.7935191723550955e-06, "loss": 0.86808658, "num_input_tokens_seen": 3924265, "step": 188, "time_per_iteration": 2.8568034172058105 }, { "auxiliary_loss_clip": 0.01349634, "auxiliary_loss_mlp": 0.01071481, "balance_loss_clip": 1.09792471, "balance_loss_mlp": 1.04206038, "epoch": 0.022725906330788193, "flos": 29019915504000.0, "grad_norm": 1.8551135978686237, "language_loss": 0.8872456, "learning_rate": 3.797362395957408e-06, "loss": 0.9114567, "num_input_tokens_seen": 3944830, "step": 189, "time_per_iteration": 2.817115306854248 }, { "auxiliary_loss_clip": 0.01358126, "auxiliary_loss_mlp": 0.01070464, "balance_loss_clip": 1.1024332, "balance_loss_mlp": 1.03784811, "epoch": 0.022846149221427282, "flos": 24496176746880.0, "grad_norm": 2.4411419543810964, "language_loss": 0.78200948, "learning_rate": 3.8011853386020055e-06, "loss": 0.80629539, "num_input_tokens_seen": 3965735, "step": 190, "time_per_iteration": 2.6819498538970947 }, { "auxiliary_loss_clip": 0.01350054, "auxiliary_loss_mlp": 0.01063786, "balance_loss_clip": 1.09790564, "balance_loss_mlp": 1.0342226, "epoch": 0.022966392112066376, "flos": 15523537219200.0, "grad_norm": 3.9576796182376737, "language_loss": 0.89456159, "learning_rate": 3.804988213213804e-06, "loss": 0.91869998, "num_input_tokens_seen": 3983975, "step": 191, "time_per_iteration": 2.6234421730041504 }, { "auxiliary_loss_clip": 0.01346265, "auxiliary_loss_mlp": 0.01019002, "balance_loss_clip": 1.15299368, "balance_loss_mlp": 0.99783033, "epoch": 0.023086635002705466, "flos": 55650408433920.0, "grad_norm": 1.019427291436444, "language_loss": 0.63212889, "learning_rate": 3.808771229382049e-06, "loss": 0.65578157, "num_input_tokens_seen": 4043440, "step": 192, "time_per_iteration": 3.218242645263672 }, { "auxiliary_loss_clip": 0.01347568, "auxiliary_loss_mlp": 0.0107117, "balance_loss_clip": 1.09610176, "balance_loss_mlp": 1.0412488, "epoch": 0.023206877893344555, "flos": 19313441118720.0, "grad_norm": 2.2535662489363113, "language_loss": 0.84537089, "learning_rate": 3.8125345934296324e-06, "loss": 0.86955827, "num_input_tokens_seen": 4061750, "step": 193, "time_per_iteration": 2.706188678741455 }, { "auxiliary_loss_clip": 0.01350683, "auxiliary_loss_mlp": 0.01077079, "balance_loss_clip": 1.09627473, "balance_loss_mlp": 1.04706228, "epoch": 0.02332712078398365, "flos": 23072965090560.0, "grad_norm": 2.624683274630268, "language_loss": 0.88149589, "learning_rate": 3.81627850848061e-06, "loss": 0.90577352, "num_input_tokens_seen": 4082345, "step": 194, "time_per_iteration": 2.7797164916992188 }, { "auxiliary_loss_clip": 0.01351577, "auxiliary_loss_mlp": 0.01085471, "balance_loss_clip": 1.0984205, "balance_loss_mlp": 1.05490601, "epoch": 0.02344736367462274, "flos": 24425971614720.0, "grad_norm": 2.1458932516656426, "language_loss": 0.86423361, "learning_rate": 3.820003174525994e-06, "loss": 0.88860404, "num_input_tokens_seen": 4101770, "step": 195, "time_per_iteration": 2.749197006225586 }, { "auxiliary_loss_clip": 0.01351168, "auxiliary_loss_mlp": 0.01080445, "balance_loss_clip": 1.09838223, "balance_loss_mlp": 1.05202508, "epoch": 0.02356760656526183, "flos": 21579799697280.0, "grad_norm": 2.3997893883473345, "language_loss": 0.82537609, "learning_rate": 3.823708788487851e-06, "loss": 0.84969223, "num_input_tokens_seen": 4118770, "step": 196, "time_per_iteration": 2.6551928520202637 }, { "auxiliary_loss_clip": 0.01349747, "auxiliary_loss_mlp": 0.01085047, "balance_loss_clip": 1.09975743, "balance_loss_mlp": 1.05557847, "epoch": 0.02368784945590092, "flos": 25193598192000.0, "grad_norm": 5.11499363813138, "language_loss": 0.84685999, "learning_rate": 3.827395544281781e-06, "loss": 0.87120795, "num_input_tokens_seen": 4141110, "step": 197, "time_per_iteration": 3.6538140773773193 }, { "auxiliary_loss_clip": 0.01350484, "auxiliary_loss_mlp": 0.01081758, "balance_loss_clip": 1.09930587, "balance_loss_mlp": 1.05076361, "epoch": 0.02380809234654001, "flos": 27562481164800.0, "grad_norm": 3.5902767270629443, "language_loss": 0.78814387, "learning_rate": 3.831063632877802e-06, "loss": 0.81246626, "num_input_tokens_seen": 4161430, "step": 198, "time_per_iteration": 3.5782968997955322 }, { "auxiliary_loss_clip": 0.0135117, "auxiliary_loss_mlp": 0.01072876, "balance_loss_clip": 1.10516047, "balance_loss_mlp": 1.04362202, "epoch": 0.0239283352371791, "flos": 18259786540800.0, "grad_norm": 2.737865105476419, "language_loss": 0.76051271, "learning_rate": 3.834713242359712e-06, "loss": 0.7847532, "num_input_tokens_seen": 4179260, "step": 199, "time_per_iteration": 4.422708988189697 }, { "auxiliary_loss_clip": 0.01348586, "auxiliary_loss_mlp": 0.01064193, "balance_loss_clip": 1.09678233, "balance_loss_mlp": 1.03462863, "epoch": 0.02404857812781819, "flos": 21395110942080.0, "grad_norm": 2.564441828849806, "language_loss": 0.87167037, "learning_rate": 3.838344557982959e-06, "loss": 0.89579821, "num_input_tokens_seen": 4200640, "step": 200, "time_per_iteration": 2.6950788497924805 }, { "auxiliary_loss_clip": 0.01343749, "auxiliary_loss_mlp": 0.01070193, "balance_loss_clip": 1.09375393, "balance_loss_mlp": 1.04250062, "epoch": 0.024168821018457284, "flos": 16654256426880.0, "grad_norm": 2.778046505885696, "language_loss": 0.84907341, "learning_rate": 3.841957762231063e-06, "loss": 0.87321281, "num_input_tokens_seen": 4218170, "step": 201, "time_per_iteration": 2.6996614933013916 }, { "auxiliary_loss_clip": 0.01343085, "auxiliary_loss_mlp": 0.01064802, "balance_loss_clip": 1.09474158, "balance_loss_mlp": 1.03623962, "epoch": 0.024289063909096374, "flos": 22820872464000.0, "grad_norm": 1.947605264421667, "language_loss": 0.87892032, "learning_rate": 3.8455530348706454e-06, "loss": 0.90299922, "num_input_tokens_seen": 4237770, "step": 202, "time_per_iteration": 2.7209503650665283 }, { "auxiliary_loss_clip": 0.01345884, "auxiliary_loss_mlp": 0.01086517, "balance_loss_clip": 1.10021853, "balance_loss_mlp": 1.05771613, "epoch": 0.024409306799735464, "flos": 17748598135680.0, "grad_norm": 1.8302845108892802, "language_loss": 0.77365136, "learning_rate": 3.849130553005099e-06, "loss": 0.7979753, "num_input_tokens_seen": 4255985, "step": 203, "time_per_iteration": 2.7838692665100098 }, { "auxiliary_loss_clip": 0.01344457, "auxiliary_loss_mlp": 0.01076582, "balance_loss_clip": 1.09541512, "balance_loss_mlp": 1.04627943, "epoch": 0.024529549690374557, "flos": 21616213109760.0, "grad_norm": 1.7449039585086255, "language_loss": 0.83758575, "learning_rate": 3.852690491126933e-06, "loss": 0.86179614, "num_input_tokens_seen": 4276035, "step": 204, "time_per_iteration": 2.7639167308807373 }, { "auxiliary_loss_clip": 0.01341731, "auxiliary_loss_mlp": 0.0106187, "balance_loss_clip": 1.09302378, "balance_loss_mlp": 1.03237748, "epoch": 0.024649792581013647, "flos": 25551662918400.0, "grad_norm": 2.3892280505199865, "language_loss": 0.91007835, "learning_rate": 3.856233021168845e-06, "loss": 0.93411434, "num_input_tokens_seen": 4295730, "step": 205, "time_per_iteration": 2.725076198577881 }, { "auxiliary_loss_clip": 0.0133654, "auxiliary_loss_mlp": 0.01067566, "balance_loss_clip": 1.09148061, "balance_loss_mlp": 1.04091108, "epoch": 0.024770035471652737, "flos": 34495574544000.0, "grad_norm": 2.004810950065198, "language_loss": 0.91282564, "learning_rate": 3.859758312553544e-06, "loss": 0.93686664, "num_input_tokens_seen": 4317950, "step": 206, "time_per_iteration": 2.7983551025390625 }, { "auxiliary_loss_clip": 0.01345279, "auxiliary_loss_mlp": 0.01074397, "balance_loss_clip": 1.09826636, "balance_loss_mlp": 1.04590607, "epoch": 0.02489027836229183, "flos": 21505428587520.0, "grad_norm": 2.2738405940201214, "language_loss": 0.9177134, "learning_rate": 3.8632665322423735e-06, "loss": 0.94191021, "num_input_tokens_seen": 4337605, "step": 207, "time_per_iteration": 2.744093179702759 }, { "auxiliary_loss_clip": 0.01339222, "auxiliary_loss_mlp": 0.0107511, "balance_loss_clip": 1.09389687, "balance_loss_mlp": 1.04737043, "epoch": 0.02501052125293092, "flos": 23219013790080.0, "grad_norm": 2.265971898717229, "language_loss": 0.8593263, "learning_rate": 3.866757844782762e-06, "loss": 0.88346964, "num_input_tokens_seen": 4358110, "step": 208, "time_per_iteration": 2.7320077419281006 }, { "auxiliary_loss_clip": 0.01344731, "auxiliary_loss_mlp": 0.01069079, "balance_loss_clip": 1.09651554, "balance_loss_mlp": 1.04051626, "epoch": 0.02513076414357001, "flos": 26388920010240.0, "grad_norm": 3.2690478286694735, "language_loss": 0.91356826, "learning_rate": 3.870232412354527e-06, "loss": 0.93770635, "num_input_tokens_seen": 4374955, "step": 209, "time_per_iteration": 2.724618911743164 }, { "auxiliary_loss_clip": 0.01340495, "auxiliary_loss_mlp": 0.01070072, "balance_loss_clip": 1.09234452, "balance_loss_mlp": 1.04267752, "epoch": 0.025251007034209103, "flos": 13590430047360.0, "grad_norm": 2.094663578833228, "language_loss": 0.92316067, "learning_rate": 3.873690394815086e-06, "loss": 0.94726634, "num_input_tokens_seen": 4391535, "step": 210, "time_per_iteration": 2.6325855255126953 }, { "auxiliary_loss_clip": 0.01337738, "auxiliary_loss_mlp": 0.01069722, "balance_loss_clip": 1.09267938, "balance_loss_mlp": 1.04049158, "epoch": 0.025371249924848193, "flos": 15049229103360.0, "grad_norm": 2.6921732077599403, "language_loss": 0.90955412, "learning_rate": 3.877131949743587e-06, "loss": 0.93362868, "num_input_tokens_seen": 4408400, "step": 211, "time_per_iteration": 2.655782699584961 }, { "auxiliary_loss_clip": 0.01339822, "auxiliary_loss_mlp": 0.01083525, "balance_loss_clip": 1.09241438, "balance_loss_mlp": 1.05520034, "epoch": 0.025491492815487283, "flos": 25553853648000.0, "grad_norm": 2.112388084054683, "language_loss": 0.78337145, "learning_rate": 3.880557232483993e-06, "loss": 0.80760491, "num_input_tokens_seen": 4427840, "step": 212, "time_per_iteration": 2.7007598876953125 }, { "auxiliary_loss_clip": 0.01337161, "auxiliary_loss_mlp": 0.01065349, "balance_loss_clip": 1.09019554, "balance_loss_mlp": 1.03690577, "epoch": 0.025611735706126376, "flos": 20630752502400.0, "grad_norm": 1.9050537163008023, "language_loss": 0.87135315, "learning_rate": 3.883966396187164e-06, "loss": 0.89537829, "num_input_tokens_seen": 4447110, "step": 213, "time_per_iteration": 2.6519830226898193 }, { "auxiliary_loss_clip": 0.01341428, "auxiliary_loss_mlp": 0.01066859, "balance_loss_clip": 1.09598637, "balance_loss_mlp": 1.03965557, "epoch": 0.025731978596765466, "flos": 19062282245760.0, "grad_norm": 1.9216226294464447, "language_loss": 0.89815706, "learning_rate": 3.887359591851937e-06, "loss": 0.9222399, "num_input_tokens_seen": 4464715, "step": 214, "time_per_iteration": 2.6289045810699463 }, { "auxiliary_loss_clip": 0.01334123, "auxiliary_loss_mlp": 0.01064258, "balance_loss_clip": 1.09235036, "balance_loss_mlp": 1.0365541, "epoch": 0.025852221487404556, "flos": 22163814927360.0, "grad_norm": 1.6241758902454755, "language_loss": 0.92227036, "learning_rate": 3.890736968365265e-06, "loss": 0.94625419, "num_input_tokens_seen": 4485030, "step": 215, "time_per_iteration": 2.6398439407348633 }, { "auxiliary_loss_clip": 0.01337411, "auxiliary_loss_mlp": 0.01065299, "balance_loss_clip": 1.09199119, "balance_loss_mlp": 1.03848851, "epoch": 0.02597246437804365, "flos": 26541971861760.0, "grad_norm": 2.3495783421615157, "language_loss": 0.85227561, "learning_rate": 3.894098672541412e-06, "loss": 0.87630272, "num_input_tokens_seen": 4505935, "step": 216, "time_per_iteration": 2.686149835586548 }, { "auxiliary_loss_clip": 0.01334611, "auxiliary_loss_mlp": 0.01068485, "balance_loss_clip": 1.09259605, "balance_loss_mlp": 1.04049468, "epoch": 0.02609270726868274, "flos": 32671671696000.0, "grad_norm": 1.773332424382651, "language_loss": 0.75175822, "learning_rate": 3.89744484916025e-06, "loss": 0.7757892, "num_input_tokens_seen": 4527045, "step": 217, "time_per_iteration": 2.7387919425964355 }, { "auxiliary_loss_clip": 0.01340724, "auxiliary_loss_mlp": 0.01069922, "balance_loss_clip": 1.09379494, "balance_loss_mlp": 1.04216969, "epoch": 0.02621295015932183, "flos": 26243553669120.0, "grad_norm": 2.3623934364442216, "language_loss": 0.87454277, "learning_rate": 3.900775641004673e-06, "loss": 0.89864922, "num_input_tokens_seen": 4546360, "step": 218, "time_per_iteration": 2.6601178646087646 }, { "auxiliary_loss_clip": 0.01345291, "auxiliary_loss_mlp": 0.01059097, "balance_loss_clip": 1.09750843, "balance_loss_mlp": 1.03096366, "epoch": 0.026333193049960922, "flos": 42921402353280.0, "grad_norm": 2.9871486806054515, "language_loss": 0.74316996, "learning_rate": 3.904091188897156e-06, "loss": 0.76721382, "num_input_tokens_seen": 4565495, "step": 219, "time_per_iteration": 2.852297067642212 }, { "auxiliary_loss_clip": 0.01335723, "auxiliary_loss_mlp": 0.01074941, "balance_loss_clip": 1.09376311, "balance_loss_mlp": 1.04757023, "epoch": 0.026453435940600012, "flos": 17963846386560.0, "grad_norm": 2.039110752351116, "language_loss": 0.8189019, "learning_rate": 3.90739163173548e-06, "loss": 0.84300852, "num_input_tokens_seen": 4583330, "step": 220, "time_per_iteration": 2.6257381439208984 }, { "auxiliary_loss_clip": 0.01334686, "auxiliary_loss_mlp": 0.01062534, "balance_loss_clip": 1.09210205, "balance_loss_mlp": 1.03525877, "epoch": 0.026573678831239102, "flos": 18984319776000.0, "grad_norm": 2.3646098469953634, "language_loss": 0.88485396, "learning_rate": 3.910677106527646e-06, "loss": 0.90882611, "num_input_tokens_seen": 4600520, "step": 221, "time_per_iteration": 2.631923198699951 }, { "auxiliary_loss_clip": 0.01335225, "auxiliary_loss_mlp": 0.01068749, "balance_loss_clip": 1.09215641, "balance_loss_mlp": 1.0421772, "epoch": 0.026693921721878195, "flos": 29241448634880.0, "grad_norm": 2.495656178816696, "language_loss": 0.84422481, "learning_rate": 3.913947748426004e-06, "loss": 0.86826456, "num_input_tokens_seen": 4617340, "step": 222, "time_per_iteration": 2.705756664276123 }, { "auxiliary_loss_clip": 0.01336869, "auxiliary_loss_mlp": 0.0107488, "balance_loss_clip": 1.09448564, "balance_loss_mlp": 1.04693711, "epoch": 0.026814164612517285, "flos": 14128083797760.0, "grad_norm": 2.682340461183856, "language_loss": 0.76596588, "learning_rate": 3.9172036907606136e-06, "loss": 0.79008335, "num_input_tokens_seen": 4630820, "step": 223, "time_per_iteration": 2.646345853805542 }, { "auxiliary_loss_clip": 0.01336172, "auxiliary_loss_mlp": 0.01066871, "balance_loss_clip": 1.09050369, "balance_loss_mlp": 1.03878498, "epoch": 0.026934407503156375, "flos": 23511973115520.0, "grad_norm": 1.7035982712761666, "language_loss": 0.94988763, "learning_rate": 3.920445065071855e-06, "loss": 0.97391808, "num_input_tokens_seen": 4651985, "step": 224, "time_per_iteration": 3.5990734100341797 }, { "auxiliary_loss_clip": 0.01334582, "auxiliary_loss_mlp": 0.01073995, "balance_loss_clip": 1.09163952, "balance_loss_mlp": 1.04745924, "epoch": 0.027054650393795468, "flos": 28950356816640.0, "grad_norm": 2.4969445658399243, "language_loss": 0.79693806, "learning_rate": 3.923672001142322e-06, "loss": 0.82102376, "num_input_tokens_seen": 4672295, "step": 225, "time_per_iteration": 4.600550651550293 }, { "auxiliary_loss_clip": 0.01332416, "auxiliary_loss_mlp": 0.01072312, "balance_loss_clip": 1.09160972, "balance_loss_mlp": 1.04575181, "epoch": 0.027174893284434558, "flos": 31431568596480.0, "grad_norm": 1.9575204741393715, "language_loss": 0.84118903, "learning_rate": 3.926884627027996e-06, "loss": 0.86523628, "num_input_tokens_seen": 4696065, "step": 226, "time_per_iteration": 2.770097017288208 }, { "auxiliary_loss_clip": 0.01332433, "auxiliary_loss_mlp": 0.01063276, "balance_loss_clip": 1.09111071, "balance_loss_mlp": 1.03675246, "epoch": 0.027295136175073648, "flos": 22054466949120.0, "grad_norm": 2.7766356357608726, "language_loss": 0.77470219, "learning_rate": 3.930083069088744e-06, "loss": 0.79865932, "num_input_tokens_seen": 4716065, "step": 227, "time_per_iteration": 2.748331308364868 }, { "auxiliary_loss_clip": 0.01318413, "auxiliary_loss_mlp": 0.01051288, "balance_loss_clip": 1.12957823, "balance_loss_mlp": 1.02916241, "epoch": 0.02741537906571274, "flos": 60800752972800.0, "grad_norm": 0.9833387857261546, "language_loss": 0.59288347, "learning_rate": 3.933267452018137e-06, "loss": 0.61658049, "num_input_tokens_seen": 4775860, "step": 228, "time_per_iteration": 3.268104314804077 }, { "auxiliary_loss_clip": 0.01336578, "auxiliary_loss_mlp": 0.01067695, "balance_loss_clip": 1.09661722, "balance_loss_mlp": 1.04108763, "epoch": 0.02753562195635183, "flos": 24606278910720.0, "grad_norm": 1.9192401743598655, "language_loss": 0.84389657, "learning_rate": 3.936437898872622e-06, "loss": 0.86793923, "num_input_tokens_seen": 4795835, "step": 229, "time_per_iteration": 2.743434190750122 }, { "auxiliary_loss_clip": 0.01333768, "auxiliary_loss_mlp": 0.01064506, "balance_loss_clip": 1.09338701, "balance_loss_mlp": 1.03754139, "epoch": 0.02765586484699092, "flos": 34094236907520.0, "grad_norm": 2.1200375194018783, "language_loss": 0.79598153, "learning_rate": 3.9395945311000525e-06, "loss": 0.81996429, "num_input_tokens_seen": 4817460, "step": 230, "time_per_iteration": 2.770836591720581 }, { "auxiliary_loss_clip": 0.01334586, "auxiliary_loss_mlp": 0.01078884, "balance_loss_clip": 1.09077573, "balance_loss_mlp": 1.05233645, "epoch": 0.027776107737630014, "flos": 14829922615680.0, "grad_norm": 2.319534973028515, "language_loss": 0.9057318, "learning_rate": 3.942737468567608e-06, "loss": 0.92986649, "num_input_tokens_seen": 4835475, "step": 231, "time_per_iteration": 2.695350170135498 }, { "auxiliary_loss_clip": 0.01333755, "auxiliary_loss_mlp": 0.01071099, "balance_loss_clip": 1.09413362, "balance_loss_mlp": 1.04499173, "epoch": 0.027896350628269104, "flos": 47920347066240.0, "grad_norm": 2.041666861544963, "language_loss": 0.85850513, "learning_rate": 3.9458668295891026e-06, "loss": 0.8825537, "num_input_tokens_seen": 4857760, "step": 232, "time_per_iteration": 2.9172284603118896 }, { "auxiliary_loss_clip": 0.01331233, "auxiliary_loss_mlp": 0.01068574, "balance_loss_clip": 1.08807802, "balance_loss_mlp": 1.04157329, "epoch": 0.028016593518908194, "flos": 21684550734720.0, "grad_norm": 4.920655946657788, "language_loss": 0.86658221, "learning_rate": 3.948982730951712e-06, "loss": 0.8905803, "num_input_tokens_seen": 4875855, "step": 233, "time_per_iteration": 2.753263473510742 }, { "auxiliary_loss_clip": 0.01332865, "auxiliary_loss_mlp": 0.01067236, "balance_loss_clip": 1.09086514, "balance_loss_mlp": 1.0399605, "epoch": 0.028136836409547287, "flos": 18439483305600.0, "grad_norm": 2.258860824789818, "language_loss": 0.82080585, "learning_rate": 3.9520852879421254e-06, "loss": 0.84480679, "num_input_tokens_seen": 4893200, "step": 234, "time_per_iteration": 2.633981466293335 }, { "auxiliary_loss_clip": 0.01331365, "auxiliary_loss_mlp": 0.01067306, "balance_loss_clip": 1.09400249, "balance_loss_mlp": 1.04045975, "epoch": 0.028257079300186377, "flos": 31576934937600.0, "grad_norm": 2.081354761983096, "language_loss": 0.81907266, "learning_rate": 3.955174614372137e-06, "loss": 0.84305936, "num_input_tokens_seen": 4912965, "step": 235, "time_per_iteration": 2.746469259262085 }, { "auxiliary_loss_clip": 0.01333942, "auxiliary_loss_mlp": 0.01069878, "balance_loss_clip": 1.09352171, "balance_loss_mlp": 1.04257917, "epoch": 0.028377322190825467, "flos": 23513337832320.0, "grad_norm": 2.0717758370152692, "language_loss": 0.84328347, "learning_rate": 3.9582508226037045e-06, "loss": 0.86732173, "num_input_tokens_seen": 4933105, "step": 236, "time_per_iteration": 2.665515422821045 }, { "auxiliary_loss_clip": 0.01333717, "auxiliary_loss_mlp": 0.01068421, "balance_loss_clip": 1.08935833, "balance_loss_mlp": 1.04176617, "epoch": 0.02849756508146456, "flos": 20479604071680.0, "grad_norm": 3.0120691442903182, "language_loss": 0.94016111, "learning_rate": 3.9613140235734636e-06, "loss": 0.9641825, "num_input_tokens_seen": 4950085, "step": 237, "time_per_iteration": 2.6792736053466797 }, { "auxiliary_loss_clip": 0.01334539, "auxiliary_loss_mlp": 0.01064122, "balance_loss_clip": 1.09280026, "balance_loss_mlp": 1.03727651, "epoch": 0.02861780797210365, "flos": 14283362292480.0, "grad_norm": 1.8379757802489076, "language_loss": 0.81080526, "learning_rate": 3.96436432681674e-06, "loss": 0.83479184, "num_input_tokens_seen": 4968075, "step": 238, "time_per_iteration": 2.626427412033081 }, { "auxiliary_loss_clip": 0.01329249, "auxiliary_loss_mlp": 0.01077297, "balance_loss_clip": 1.09112501, "balance_loss_mlp": 1.04975951, "epoch": 0.02873805086274274, "flos": 25808532053760.0, "grad_norm": 2.1774333955003575, "language_loss": 0.89272296, "learning_rate": 3.967401840491044e-06, "loss": 0.91678834, "num_input_tokens_seen": 4987355, "step": 239, "time_per_iteration": 2.7049341201782227 }, { "auxiliary_loss_clip": 0.01328911, "auxiliary_loss_mlp": 0.01069458, "balance_loss_clip": 1.09096456, "balance_loss_mlp": 1.04205179, "epoch": 0.028858293753381833, "flos": 17304238984320.0, "grad_norm": 2.998234233650083, "language_loss": 0.87629604, "learning_rate": 3.97042667139909e-06, "loss": 0.90027976, "num_input_tokens_seen": 5004680, "step": 240, "time_per_iteration": 2.6397783756256104 }, { "auxiliary_loss_clip": 0.01329721, "auxiliary_loss_mlp": 0.0106861, "balance_loss_clip": 1.09110284, "balance_loss_mlp": 1.04305148, "epoch": 0.028978536644020923, "flos": 23038347358080.0, "grad_norm": 1.9987228413248768, "language_loss": 0.874475, "learning_rate": 3.973438925011327e-06, "loss": 0.89845836, "num_input_tokens_seen": 5022965, "step": 241, "time_per_iteration": 2.687316417694092 }, { "auxiliary_loss_clip": 0.01328492, "auxiliary_loss_mlp": 0.01072294, "balance_loss_clip": 1.08799934, "balance_loss_mlp": 1.0476172, "epoch": 0.029098779534660012, "flos": 28329712692480.0, "grad_norm": 2.366750868317763, "language_loss": 0.91559958, "learning_rate": 3.976438705488002e-06, "loss": 0.9396075, "num_input_tokens_seen": 5042625, "step": 242, "time_per_iteration": 2.6888773441314697 }, { "auxiliary_loss_clip": 0.01333623, "auxiliary_loss_mlp": 0.01077333, "balance_loss_clip": 1.0954622, "balance_loss_mlp": 1.05154777, "epoch": 0.029219022425299106, "flos": 13881665520000.0, "grad_norm": 4.780878527150375, "language_loss": 0.92974204, "learning_rate": 3.9794261157007744e-06, "loss": 0.95385158, "num_input_tokens_seen": 5060380, "step": 243, "time_per_iteration": 2.6751065254211426 }, { "auxiliary_loss_clip": 0.01333766, "auxiliary_loss_mlp": 0.01070094, "balance_loss_clip": 1.09379029, "balance_loss_mlp": 1.04401064, "epoch": 0.029339265315938196, "flos": 19422501788160.0, "grad_norm": 2.6961802635285896, "language_loss": 0.8467474, "learning_rate": 3.982401257253887e-06, "loss": 0.87078595, "num_input_tokens_seen": 5078720, "step": 244, "time_per_iteration": 2.668060302734375 }, { "auxiliary_loss_clip": 0.0132449, "auxiliary_loss_mlp": 0.01066103, "balance_loss_clip": 1.08677423, "balance_loss_mlp": 1.0404253, "epoch": 0.029459508206577285, "flos": 15669550005120.0, "grad_norm": 1.9701326047134609, "language_loss": 0.89644766, "learning_rate": 3.985364230504893e-06, "loss": 0.92035365, "num_input_tokens_seen": 5096605, "step": 245, "time_per_iteration": 2.6138956546783447 }, { "auxiliary_loss_clip": 0.013266, "auxiliary_loss_mlp": 0.01071995, "balance_loss_clip": 1.08998895, "balance_loss_mlp": 1.044958, "epoch": 0.02957975109721638, "flos": 28220975245440.0, "grad_norm": 2.3461174988018283, "language_loss": 0.84366316, "learning_rate": 3.988315134584976e-06, "loss": 0.86764914, "num_input_tokens_seen": 5116285, "step": 246, "time_per_iteration": 2.699169635772705 }, { "auxiliary_loss_clip": 0.0133287, "auxiliary_loss_mlp": 0.01067408, "balance_loss_clip": 1.0915606, "balance_loss_mlp": 1.04021668, "epoch": 0.02969999398785547, "flos": 24315869450880.0, "grad_norm": 1.7245872075787525, "language_loss": 0.80360138, "learning_rate": 3.991254067418851e-06, "loss": 0.82760417, "num_input_tokens_seen": 5136825, "step": 247, "time_per_iteration": 2.682603120803833 }, { "auxiliary_loss_clip": 0.01328474, "auxiliary_loss_mlp": 0.01065203, "balance_loss_clip": 1.09132111, "balance_loss_mlp": 1.03890514, "epoch": 0.02982023687849456, "flos": 35078584193280.0, "grad_norm": 2.0934208962977254, "language_loss": 0.82747018, "learning_rate": 3.994181125744254e-06, "loss": 0.85140705, "num_input_tokens_seen": 5158630, "step": 248, "time_per_iteration": 2.7231874465942383 }, { "auxiliary_loss_clip": 0.01327485, "auxiliary_loss_mlp": 0.01066223, "balance_loss_clip": 1.08901417, "balance_loss_mlp": 1.03966284, "epoch": 0.02994047976913365, "flos": 26177155378560.0, "grad_norm": 2.267337908989615, "language_loss": 0.74237549, "learning_rate": 3.99709640513106e-06, "loss": 0.7663126, "num_input_tokens_seen": 5179510, "step": 249, "time_per_iteration": 2.6859195232391357 }, { "auxiliary_loss_clip": 0.01328932, "auxiliary_loss_mlp": 0.01073904, "balance_loss_clip": 1.08695233, "balance_loss_mlp": 1.04730844, "epoch": 0.03006072265977274, "flos": 25625028447360.0, "grad_norm": 2.5183232028917635, "language_loss": 0.856812, "learning_rate": 4e-06, "loss": 0.8808403, "num_input_tokens_seen": 5199345, "step": 250, "time_per_iteration": 3.59822416305542 }, { "auxiliary_loss_clip": 0.01327956, "auxiliary_loss_mlp": 0.01085146, "balance_loss_clip": 1.09198809, "balance_loss_mlp": 1.05917048, "epoch": 0.03018096555041183, "flos": 22127078292480.0, "grad_norm": 3.3249231532848507, "language_loss": 0.88415676, "learning_rate": 3.999999848300794e-06, "loss": 0.90828776, "num_input_tokens_seen": 5218330, "step": 251, "time_per_iteration": 3.5388479232788086 }, { "auxiliary_loss_clip": 0.01320408, "auxiliary_loss_mlp": 0.01068542, "balance_loss_clip": 1.08436441, "balance_loss_mlp": 1.04264998, "epoch": 0.030301208441050925, "flos": 30188197359360.0, "grad_norm": 1.6178770783383485, "language_loss": 0.89081287, "learning_rate": 3.999999393203203e-06, "loss": 0.91470242, "num_input_tokens_seen": 5240740, "step": 252, "time_per_iteration": 4.531958341598511 }, { "auxiliary_loss_clip": 0.01320788, "auxiliary_loss_mlp": 0.01058686, "balance_loss_clip": 1.08705235, "balance_loss_mlp": 1.03321052, "epoch": 0.030421451331690014, "flos": 23621392920960.0, "grad_norm": 2.0124070493505717, "language_loss": 0.84988511, "learning_rate": 3.999998634707293e-06, "loss": 0.87367988, "num_input_tokens_seen": 5260290, "step": 253, "time_per_iteration": 2.6529738903045654 }, { "auxiliary_loss_clip": 0.01328875, "auxiliary_loss_mlp": 0.0107228, "balance_loss_clip": 1.09229803, "balance_loss_mlp": 1.0452081, "epoch": 0.030541694222329104, "flos": 27928446883200.0, "grad_norm": 3.360548576622451, "language_loss": 0.96246898, "learning_rate": 3.999997572813182e-06, "loss": 0.98648047, "num_input_tokens_seen": 5278100, "step": 254, "time_per_iteration": 2.6530933380126953 }, { "auxiliary_loss_clip": 0.01323956, "auxiliary_loss_mlp": 0.01061945, "balance_loss_clip": 1.08559299, "balance_loss_mlp": 1.03662503, "epoch": 0.030661937112968194, "flos": 18588441006720.0, "grad_norm": 1.9224367540998473, "language_loss": 0.87627351, "learning_rate": 3.999996207521028e-06, "loss": 0.90013254, "num_input_tokens_seen": 5296810, "step": 255, "time_per_iteration": 2.6173388957977295 }, { "auxiliary_loss_clip": 0.01325613, "auxiliary_loss_mlp": 0.01059839, "balance_loss_clip": 1.08782148, "balance_loss_mlp": 1.03360105, "epoch": 0.030782180003607287, "flos": 12969139478400.0, "grad_norm": 2.8725600542972165, "language_loss": 0.82231289, "learning_rate": 3.999994538831039e-06, "loss": 0.84616745, "num_input_tokens_seen": 5313395, "step": 256, "time_per_iteration": 2.6085612773895264 }, { "auxiliary_loss_clip": 0.01321779, "auxiliary_loss_mlp": 0.01065932, "balance_loss_clip": 1.08574879, "balance_loss_mlp": 1.04011166, "epoch": 0.030902422894246377, "flos": 23335364920320.0, "grad_norm": 2.628418314360437, "language_loss": 0.85879534, "learning_rate": 3.99999256674347e-06, "loss": 0.88267243, "num_input_tokens_seen": 5333545, "step": 257, "time_per_iteration": 2.65468168258667 }, { "auxiliary_loss_clip": 0.01319293, "auxiliary_loss_mlp": 0.01053634, "balance_loss_clip": 1.13350284, "balance_loss_mlp": 1.03274906, "epoch": 0.031022665784885467, "flos": 55094151438720.0, "grad_norm": 1.0055763096718673, "language_loss": 0.53476125, "learning_rate": 3.999990291258618e-06, "loss": 0.55849057, "num_input_tokens_seen": 5392235, "step": 258, "time_per_iteration": 3.2629518508911133 }, { "auxiliary_loss_clip": 0.01325855, "auxiliary_loss_mlp": 0.01058969, "balance_loss_clip": 1.09059811, "balance_loss_mlp": 1.03317249, "epoch": 0.03114290867552456, "flos": 19317786664320.0, "grad_norm": 3.4230621901132854, "language_loss": 0.86465079, "learning_rate": 3.999987712376829e-06, "loss": 0.88849902, "num_input_tokens_seen": 5410555, "step": 259, "time_per_iteration": 2.634861946105957 }, { "auxiliary_loss_clip": 0.01321068, "auxiliary_loss_mlp": 0.01071819, "balance_loss_clip": 1.08917606, "balance_loss_mlp": 1.04647493, "epoch": 0.031263151566163654, "flos": 20959442881920.0, "grad_norm": 3.62420427810412, "language_loss": 0.8206659, "learning_rate": 3.999984830098494e-06, "loss": 0.84459484, "num_input_tokens_seen": 5430135, "step": 260, "time_per_iteration": 2.6655774116516113 }, { "auxiliary_loss_clip": 0.01326657, "auxiliary_loss_mlp": 0.01063737, "balance_loss_clip": 1.09090638, "balance_loss_mlp": 1.0376544, "epoch": 0.03138339445680274, "flos": 14793006412800.0, "grad_norm": 4.12364439235462, "language_loss": 0.97984093, "learning_rate": 3.999981644424051e-06, "loss": 1.00374496, "num_input_tokens_seen": 5444935, "step": 261, "time_per_iteration": 2.6675117015838623 }, { "auxiliary_loss_clip": 0.01324917, "auxiliary_loss_mlp": 0.01062388, "balance_loss_clip": 1.08807456, "balance_loss_mlp": 1.03578067, "epoch": 0.03150363734744183, "flos": 11655599022720.0, "grad_norm": 2.5314128276034293, "language_loss": 0.85968995, "learning_rate": 3.999978155353982e-06, "loss": 0.88356292, "num_input_tokens_seen": 5462080, "step": 262, "time_per_iteration": 2.632423162460327 }, { "auxiliary_loss_clip": 0.01323226, "auxiliary_loss_mlp": 0.0107028, "balance_loss_clip": 1.08640397, "balance_loss_mlp": 1.0439353, "epoch": 0.03162388023808092, "flos": 33727732485120.0, "grad_norm": 3.803522378064602, "language_loss": 0.80274165, "learning_rate": 3.9999743628888186e-06, "loss": 0.82667667, "num_input_tokens_seen": 5483870, "step": 263, "time_per_iteration": 2.75121808052063 }, { "auxiliary_loss_clip": 0.01321763, "auxiliary_loss_mlp": 0.01063304, "balance_loss_clip": 1.08674335, "balance_loss_mlp": 1.03714907, "epoch": 0.03174412312872001, "flos": 20810952057600.0, "grad_norm": 2.2593566658610382, "language_loss": 0.89489818, "learning_rate": 3.999970267029133e-06, "loss": 0.91874886, "num_input_tokens_seen": 5502830, "step": 264, "time_per_iteration": 2.6790049076080322 }, { "auxiliary_loss_clip": 0.01322157, "auxiliary_loss_mlp": 0.01072226, "balance_loss_clip": 1.08809042, "balance_loss_mlp": 1.04745436, "epoch": 0.0318643660193591, "flos": 23727939638400.0, "grad_norm": 2.0244784238178846, "language_loss": 0.80064666, "learning_rate": 3.999965867775548e-06, "loss": 0.82459044, "num_input_tokens_seen": 5523225, "step": 265, "time_per_iteration": 2.687027931213379 }, { "auxiliary_loss_clip": 0.0132329, "auxiliary_loss_mlp": 0.01061691, "balance_loss_clip": 1.08855629, "balance_loss_mlp": 1.03519034, "epoch": 0.0319846089099982, "flos": 13917863450880.0, "grad_norm": 2.776021398802981, "language_loss": 0.86790282, "learning_rate": 3.9999611651287315e-06, "loss": 0.8917526, "num_input_tokens_seen": 5541380, "step": 266, "time_per_iteration": 2.6232011318206787 }, { "auxiliary_loss_clip": 0.0132652, "auxiliary_loss_mlp": 0.01068223, "balance_loss_clip": 1.088148, "balance_loss_mlp": 1.04242659, "epoch": 0.03210485180063729, "flos": 14753253035520.0, "grad_norm": 3.2675831698480926, "language_loss": 0.78853977, "learning_rate": 3.999956159089396e-06, "loss": 0.81248724, "num_input_tokens_seen": 5558830, "step": 267, "time_per_iteration": 2.602123737335205 }, { "auxiliary_loss_clip": 0.01322681, "auxiliary_loss_mlp": 0.0106865, "balance_loss_clip": 1.08957887, "balance_loss_mlp": 1.04430747, "epoch": 0.03222509469127638, "flos": 28913153304960.0, "grad_norm": 2.4973045256274453, "language_loss": 0.79636657, "learning_rate": 3.999950849658302e-06, "loss": 0.82027984, "num_input_tokens_seen": 5577750, "step": 268, "time_per_iteration": 2.7178800106048584 }, { "auxiliary_loss_clip": 0.01324119, "auxiliary_loss_mlp": 0.0106684, "balance_loss_clip": 1.08816087, "balance_loss_mlp": 1.04098356, "epoch": 0.03234533758191547, "flos": 16946389739520.0, "grad_norm": 2.573500216360628, "language_loss": 0.83933139, "learning_rate": 3.999945236836254e-06, "loss": 0.86324096, "num_input_tokens_seen": 5596715, "step": 269, "time_per_iteration": 2.5950868129730225 }, { "auxiliary_loss_clip": 0.01328265, "auxiliary_loss_mlp": 0.01066551, "balance_loss_clip": 1.0930202, "balance_loss_mlp": 1.0407182, "epoch": 0.03246558047255456, "flos": 18989096284800.0, "grad_norm": 3.0219629024394172, "language_loss": 0.94734442, "learning_rate": 3.999939320624103e-06, "loss": 0.97129256, "num_input_tokens_seen": 5611865, "step": 270, "time_per_iteration": 2.6705379486083984 }, { "auxiliary_loss_clip": 0.01325169, "auxiliary_loss_mlp": 0.01078926, "balance_loss_clip": 1.09030771, "balance_loss_mlp": 1.05280685, "epoch": 0.03258582336319365, "flos": 23728334688000.0, "grad_norm": 2.2940659382051614, "language_loss": 0.898067, "learning_rate": 3.999933101022749e-06, "loss": 0.92210799, "num_input_tokens_seen": 5632270, "step": 271, "time_per_iteration": 2.7130696773529053 }, { "auxiliary_loss_clip": 0.01323248, "auxiliary_loss_mlp": 0.01072959, "balance_loss_clip": 1.08999038, "balance_loss_mlp": 1.04790163, "epoch": 0.032706066253832745, "flos": 27670823562240.0, "grad_norm": 3.5473858806194625, "language_loss": 0.86687464, "learning_rate": 3.999926578033132e-06, "loss": 0.89083672, "num_input_tokens_seen": 5652085, "step": 272, "time_per_iteration": 2.7470593452453613 }, { "auxiliary_loss_clip": 0.01323858, "auxiliary_loss_mlp": 0.01067737, "balance_loss_clip": 1.08761597, "balance_loss_mlp": 1.04155815, "epoch": 0.032826309144471835, "flos": 45624685968000.0, "grad_norm": 2.2778765551524733, "language_loss": 0.63006496, "learning_rate": 3.999919751656244e-06, "loss": 0.65398091, "num_input_tokens_seen": 5678985, "step": 273, "time_per_iteration": 2.864765167236328 }, { "auxiliary_loss_clip": 0.01322838, "auxiliary_loss_mlp": 0.01066048, "balance_loss_clip": 1.09083712, "balance_loss_mlp": 1.0402869, "epoch": 0.032946552035110925, "flos": 25812374808960.0, "grad_norm": 2.6568775006744265, "language_loss": 0.75592947, "learning_rate": 3.9999126218931195e-06, "loss": 0.7798183, "num_input_tokens_seen": 5697020, "step": 274, "time_per_iteration": 2.6612348556518555 }, { "auxiliary_loss_clip": 0.01326691, "auxiliary_loss_mlp": 0.01073133, "balance_loss_clip": 1.09275579, "balance_loss_mlp": 1.04747927, "epoch": 0.033066794925750015, "flos": 15121984101120.0, "grad_norm": 2.1686456987936054, "language_loss": 0.89623338, "learning_rate": 3.99990518874484e-06, "loss": 0.92023158, "num_input_tokens_seen": 5713460, "step": 275, "time_per_iteration": 2.6370582580566406 }, { "auxiliary_loss_clip": 0.01321563, "auxiliary_loss_mlp": 0.01076941, "balance_loss_clip": 1.08941877, "balance_loss_mlp": 1.05100095, "epoch": 0.033187037816389105, "flos": 22776593973120.0, "grad_norm": 2.9423238749033893, "language_loss": 0.92874289, "learning_rate": 3.999897452212534e-06, "loss": 0.95272791, "num_input_tokens_seen": 5730790, "step": 276, "time_per_iteration": 3.5640103816986084 }, { "auxiliary_loss_clip": 0.01322108, "auxiliary_loss_mlp": 0.0106039, "balance_loss_clip": 1.08742452, "balance_loss_mlp": 1.03529668, "epoch": 0.033307280707028195, "flos": 23331414424320.0, "grad_norm": 2.2372619885339993, "language_loss": 1.00153041, "learning_rate": 3.999889412297374e-06, "loss": 1.02535534, "num_input_tokens_seen": 5750215, "step": 277, "time_per_iteration": 3.610960006713867 }, { "auxiliary_loss_clip": 0.01321291, "auxiliary_loss_mlp": 0.01065971, "balance_loss_clip": 1.08609629, "balance_loss_mlp": 1.0411756, "epoch": 0.03342752359766729, "flos": 28840290566400.0, "grad_norm": 2.2697214977276796, "language_loss": 0.78663361, "learning_rate": 3.999881069000581e-06, "loss": 0.81050622, "num_input_tokens_seen": 5769945, "step": 278, "time_per_iteration": 4.497699975967407 }, { "auxiliary_loss_clip": 0.01321112, "auxiliary_loss_mlp": 0.01072575, "balance_loss_clip": 1.08727157, "balance_loss_mlp": 1.04617012, "epoch": 0.03354776648830638, "flos": 19384544090880.0, "grad_norm": 2.7450473194284997, "language_loss": 0.86590409, "learning_rate": 3.99987242232342e-06, "loss": 0.88984096, "num_input_tokens_seen": 5784950, "step": 279, "time_per_iteration": 2.658735513687134 }, { "auxiliary_loss_clip": 0.01325957, "auxiliary_loss_mlp": 0.01065395, "balance_loss_clip": 1.09201121, "balance_loss_mlp": 1.03862023, "epoch": 0.03366800937894547, "flos": 17858628472320.0, "grad_norm": 1.8807310759220126, "language_loss": 0.79655337, "learning_rate": 3.9998634722672026e-06, "loss": 0.820467, "num_input_tokens_seen": 5805005, "step": 280, "time_per_iteration": 2.681607246398926 }, { "auxiliary_loss_clip": 0.01323512, "auxiliary_loss_mlp": 0.01066826, "balance_loss_clip": 1.09029043, "balance_loss_mlp": 1.04120779, "epoch": 0.03378825226958456, "flos": 35951033635200.0, "grad_norm": 1.9575523994810513, "language_loss": 0.78630686, "learning_rate": 3.999854218833286e-06, "loss": 0.81021017, "num_input_tokens_seen": 5825825, "step": 281, "time_per_iteration": 2.7946791648864746 }, { "auxiliary_loss_clip": 0.01322846, "auxiliary_loss_mlp": 0.01065717, "balance_loss_clip": 1.08925176, "balance_loss_mlp": 1.04170823, "epoch": 0.03390849516022365, "flos": 25702488126720.0, "grad_norm": 2.7123377284950174, "language_loss": 0.81873286, "learning_rate": 3.999844662023075e-06, "loss": 0.84261853, "num_input_tokens_seen": 5845700, "step": 282, "time_per_iteration": 2.6687614917755127 }, { "auxiliary_loss_clip": 0.01318447, "auxiliary_loss_mlp": 0.01067055, "balance_loss_clip": 1.08707786, "balance_loss_mlp": 1.04126966, "epoch": 0.03402873805086274, "flos": 21284505987840.0, "grad_norm": 1.8106437359722554, "language_loss": 0.92072856, "learning_rate": 3.999834801838018e-06, "loss": 0.94458359, "num_input_tokens_seen": 5864680, "step": 283, "time_per_iteration": 2.6949539184570312 }, { "auxiliary_loss_clip": 0.01317917, "auxiliary_loss_mlp": 0.01068214, "balance_loss_clip": 1.08770132, "balance_loss_mlp": 1.04388344, "epoch": 0.03414898094150183, "flos": 22710913954560.0, "grad_norm": 2.283579054566723, "language_loss": 0.73864865, "learning_rate": 3.9998246382796115e-06, "loss": 0.76250994, "num_input_tokens_seen": 5884260, "step": 284, "time_per_iteration": 2.682826280593872 }, { "auxiliary_loss_clip": 0.01322845, "auxiliary_loss_mlp": 0.01063822, "balance_loss_clip": 1.08802819, "balance_loss_mlp": 1.03806138, "epoch": 0.03426922383214093, "flos": 18879927874560.0, "grad_norm": 2.092775959478182, "language_loss": 0.90731007, "learning_rate": 3.999814171349399e-06, "loss": 0.93117678, "num_input_tokens_seen": 5902120, "step": 285, "time_per_iteration": 2.6461944580078125 }, { "auxiliary_loss_clip": 0.0131787, "auxiliary_loss_mlp": 0.01062494, "balance_loss_clip": 1.08703804, "balance_loss_mlp": 1.0366019, "epoch": 0.03438946672278002, "flos": 34752012716160.0, "grad_norm": 1.69414458870202, "language_loss": 0.7377637, "learning_rate": 3.9998034010489655e-06, "loss": 0.76156735, "num_input_tokens_seen": 5925810, "step": 286, "time_per_iteration": 2.7384939193725586 }, { "auxiliary_loss_clip": 0.01320054, "auxiliary_loss_mlp": 0.01082423, "balance_loss_clip": 1.08971179, "balance_loss_mlp": 1.05779433, "epoch": 0.03450970961341911, "flos": 22164102236160.0, "grad_norm": 2.110435034955653, "language_loss": 0.75737274, "learning_rate": 3.999792327379946e-06, "loss": 0.78139752, "num_input_tokens_seen": 5945185, "step": 287, "time_per_iteration": 2.7713699340820312 }, { "auxiliary_loss_clip": 0.01323549, "auxiliary_loss_mlp": 0.01074058, "balance_loss_clip": 1.09286809, "balance_loss_mlp": 1.04941702, "epoch": 0.034629952504058197, "flos": 21725740656000.0, "grad_norm": 2.098926876668241, "language_loss": 0.96114737, "learning_rate": 3.999780950344021e-06, "loss": 0.9851234, "num_input_tokens_seen": 5963375, "step": 288, "time_per_iteration": 2.686983585357666 }, { "auxiliary_loss_clip": 0.013257, "auxiliary_loss_mlp": 0.0106841, "balance_loss_clip": 1.08967614, "balance_loss_mlp": 1.04287553, "epoch": 0.034750195394697286, "flos": 20047994248320.0, "grad_norm": 2.0014051995798665, "language_loss": 0.82634783, "learning_rate": 3.999769269942916e-06, "loss": 0.85028899, "num_input_tokens_seen": 5983415, "step": 289, "time_per_iteration": 2.8016090393066406 }, { "auxiliary_loss_clip": 0.01318493, "auxiliary_loss_mlp": 0.0106851, "balance_loss_clip": 1.08936989, "balance_loss_mlp": 1.04341626, "epoch": 0.034870438285336376, "flos": 27965865876480.0, "grad_norm": 1.836690275755057, "language_loss": 0.81235689, "learning_rate": 3.999757286178402e-06, "loss": 0.83622682, "num_input_tokens_seen": 6005850, "step": 290, "time_per_iteration": 2.745262861251831 }, { "auxiliary_loss_clip": 0.01320533, "auxiliary_loss_mlp": 0.01079291, "balance_loss_clip": 1.08920908, "balance_loss_mlp": 1.05313623, "epoch": 0.03499068117597547, "flos": 22017514832640.0, "grad_norm": 2.8624963773938683, "language_loss": 0.90525293, "learning_rate": 3.999744999052299e-06, "loss": 0.92925119, "num_input_tokens_seen": 6027240, "step": 291, "time_per_iteration": 2.698057174682617 }, { "auxiliary_loss_clip": 0.01289716, "auxiliary_loss_mlp": 0.01026554, "balance_loss_clip": 1.12116969, "balance_loss_mlp": 1.00719452, "epoch": 0.03511092406661456, "flos": 57242147725440.0, "grad_norm": 0.9876604892970776, "language_loss": 0.61166406, "learning_rate": 3.9997324085664675e-06, "loss": 0.63482678, "num_input_tokens_seen": 6087470, "step": 292, "time_per_iteration": 3.2579591274261475 }, { "auxiliary_loss_clip": 0.01317193, "auxiliary_loss_mlp": 0.01068142, "balance_loss_clip": 1.08704138, "balance_loss_mlp": 1.04261959, "epoch": 0.03523116695725365, "flos": 22928065626240.0, "grad_norm": 2.273920476369607, "language_loss": 0.91847277, "learning_rate": 3.999719514722821e-06, "loss": 0.94232613, "num_input_tokens_seen": 6107600, "step": 293, "time_per_iteration": 2.6865241527557373 }, { "auxiliary_loss_clip": 0.01317713, "auxiliary_loss_mlp": 0.01078235, "balance_loss_clip": 1.08919632, "balance_loss_mlp": 1.05336797, "epoch": 0.03535140984789274, "flos": 36903241226880.0, "grad_norm": 2.648958977278746, "language_loss": 0.74703169, "learning_rate": 3.999706317523314e-06, "loss": 0.77099109, "num_input_tokens_seen": 6126160, "step": 294, "time_per_iteration": 2.8165476322174072 }, { "auxiliary_loss_clip": 0.01315793, "auxiliary_loss_mlp": 0.01072081, "balance_loss_clip": 1.08586311, "balance_loss_mlp": 1.0475719, "epoch": 0.03547165273853183, "flos": 20449152316800.0, "grad_norm": 2.2084037499142797, "language_loss": 0.85742784, "learning_rate": 3.999692816969948e-06, "loss": 0.88130659, "num_input_tokens_seen": 6145695, "step": 295, "time_per_iteration": 2.7321693897247314 }, { "auxiliary_loss_clip": 0.01279599, "auxiliary_loss_mlp": 0.01019617, "balance_loss_clip": 1.1130271, "balance_loss_mlp": 1.00025737, "epoch": 0.03559189562917092, "flos": 69850564871040.0, "grad_norm": 1.1030142824770583, "language_loss": 0.69442737, "learning_rate": 3.999679013064772e-06, "loss": 0.71741951, "num_input_tokens_seen": 6212440, "step": 296, "time_per_iteration": 3.3360049724578857 }, { "auxiliary_loss_clip": 0.0131538, "auxiliary_loss_mlp": 0.01056987, "balance_loss_clip": 1.08562028, "balance_loss_mlp": 1.03201282, "epoch": 0.03571213851981002, "flos": 21651944163840.0, "grad_norm": 2.617394759645052, "language_loss": 0.85467654, "learning_rate": 3.99966490580988e-06, "loss": 0.87840021, "num_input_tokens_seen": 6229800, "step": 297, "time_per_iteration": 2.681971549987793 }, { "auxiliary_loss_clip": 0.01319749, "auxiliary_loss_mlp": 0.01064485, "balance_loss_clip": 1.08990335, "balance_loss_mlp": 1.03996372, "epoch": 0.03583238141044911, "flos": 43945610757120.0, "grad_norm": 2.304021239717147, "language_loss": 0.6586861, "learning_rate": 3.999650495207411e-06, "loss": 0.68252844, "num_input_tokens_seen": 6255825, "step": 298, "time_per_iteration": 2.9116694927215576 }, { "auxiliary_loss_clip": 0.01316481, "auxiliary_loss_mlp": 0.01074857, "balance_loss_clip": 1.08947873, "balance_loss_mlp": 1.04938161, "epoch": 0.0359526243010882, "flos": 18910810592640.0, "grad_norm": 5.645985397557338, "language_loss": 0.9037497, "learning_rate": 3.999635781259553e-06, "loss": 0.92766303, "num_input_tokens_seen": 6271090, "step": 299, "time_per_iteration": 2.7138283252716064 }, { "auxiliary_loss_clip": 0.01269333, "auxiliary_loss_mlp": 0.01018247, "balance_loss_clip": 1.1055131, "balance_loss_mlp": 0.99898285, "epoch": 0.03607286719172729, "flos": 61668892782720.0, "grad_norm": 0.9200019911346663, "language_loss": 0.52294785, "learning_rate": 3.999620763968535e-06, "loss": 0.54582363, "num_input_tokens_seen": 6329965, "step": 300, "time_per_iteration": 3.045332670211792 }, { "auxiliary_loss_clip": 0.01313963, "auxiliary_loss_mlp": 0.01062728, "balance_loss_clip": 1.08505249, "balance_loss_mlp": 1.03782499, "epoch": 0.03619311008236638, "flos": 27819062991360.0, "grad_norm": 1.690762597575365, "language_loss": 0.86303419, "learning_rate": 3.999605443336638e-06, "loss": 0.88680112, "num_input_tokens_seen": 6352095, "step": 301, "time_per_iteration": 2.7071921825408936 }, { "auxiliary_loss_clip": 0.01318291, "auxiliary_loss_mlp": 0.01062311, "balance_loss_clip": 1.08719039, "balance_loss_mlp": 1.03663313, "epoch": 0.03631335297300547, "flos": 13621133197440.0, "grad_norm": 3.8401469484759567, "language_loss": 0.89261353, "learning_rate": 3.999589819366185e-06, "loss": 0.91641951, "num_input_tokens_seen": 6365885, "step": 302, "time_per_iteration": 2.60490345954895 }, { "auxiliary_loss_clip": 0.01314948, "auxiliary_loss_mlp": 0.01052983, "balance_loss_clip": 1.08488846, "balance_loss_mlp": 1.02760291, "epoch": 0.036433595863644565, "flos": 27631788456960.0, "grad_norm": 1.8723594888469313, "language_loss": 0.84716046, "learning_rate": 3.999573892059547e-06, "loss": 0.87083983, "num_input_tokens_seen": 6385015, "step": 303, "time_per_iteration": 4.511361360549927 }, { "auxiliary_loss_clip": 0.01321653, "auxiliary_loss_mlp": 0.0107181, "balance_loss_clip": 1.08944273, "balance_loss_mlp": 1.04647779, "epoch": 0.036553838754283655, "flos": 24572020314240.0, "grad_norm": 2.099514017576814, "language_loss": 0.81038976, "learning_rate": 3.999557661419138e-06, "loss": 0.83432436, "num_input_tokens_seen": 6405165, "step": 304, "time_per_iteration": 3.592745542526245 }, { "auxiliary_loss_clip": 0.01320803, "auxiliary_loss_mlp": 0.01074351, "balance_loss_clip": 1.08924747, "balance_loss_mlp": 1.05012786, "epoch": 0.036674081644922744, "flos": 23404313076480.0, "grad_norm": 1.832786259192636, "language_loss": 0.81570023, "learning_rate": 3.9995411274474225e-06, "loss": 0.83965176, "num_input_tokens_seen": 6424445, "step": 305, "time_per_iteration": 4.3295698165893555 }, { "auxiliary_loss_clip": 0.01316519, "auxiliary_loss_mlp": 0.01077037, "balance_loss_clip": 1.08768296, "balance_loss_mlp": 1.05165744, "epoch": 0.036794324535561834, "flos": 27489690253440.0, "grad_norm": 1.8546594779545893, "language_loss": 0.8124631, "learning_rate": 3.999524290146908e-06, "loss": 0.83639866, "num_input_tokens_seen": 6444650, "step": 306, "time_per_iteration": 2.727748155593872 }, { "auxiliary_loss_clip": 0.0131782, "auxiliary_loss_mlp": 0.0105527, "balance_loss_clip": 1.08951974, "balance_loss_mlp": 1.03068948, "epoch": 0.036914567426200924, "flos": 19463476227840.0, "grad_norm": 3.285025056599215, "language_loss": 0.924052, "learning_rate": 3.9995071495201485e-06, "loss": 0.94778287, "num_input_tokens_seen": 6461755, "step": 307, "time_per_iteration": 2.7080063819885254 }, { "auxiliary_loss_clip": 0.01317007, "auxiliary_loss_mlp": 0.01069693, "balance_loss_clip": 1.08748102, "balance_loss_mlp": 1.04599416, "epoch": 0.037034810316840014, "flos": 22309324922880.0, "grad_norm": 3.4939997117454715, "language_loss": 0.97519827, "learning_rate": 3.999489705569744e-06, "loss": 0.99906528, "num_input_tokens_seen": 6479455, "step": 308, "time_per_iteration": 2.667457103729248 }, { "auxiliary_loss_clip": 0.01311319, "auxiliary_loss_mlp": 0.01057121, "balance_loss_clip": 1.08476472, "balance_loss_mlp": 1.03279078, "epoch": 0.03715505320747911, "flos": 18588333265920.0, "grad_norm": 2.2850686682001125, "language_loss": 0.8643949, "learning_rate": 3.999471958298341e-06, "loss": 0.88807935, "num_input_tokens_seen": 6498365, "step": 309, "time_per_iteration": 2.8387529850006104 }, { "auxiliary_loss_clip": 0.01318379, "auxiliary_loss_mlp": 0.01066744, "balance_loss_clip": 1.08935535, "balance_loss_mlp": 1.04337943, "epoch": 0.0372752960981182, "flos": 35955343267200.0, "grad_norm": 1.8100430479127776, "language_loss": 0.76217937, "learning_rate": 3.999453907708631e-06, "loss": 0.78603065, "num_input_tokens_seen": 6520770, "step": 310, "time_per_iteration": 2.8004956245422363 }, { "auxiliary_loss_clip": 0.01310883, "auxiliary_loss_mlp": 0.01068636, "balance_loss_clip": 1.08485603, "balance_loss_mlp": 1.04417467, "epoch": 0.03739553898875729, "flos": 20814040627200.0, "grad_norm": 2.10814216679222, "language_loss": 0.81444204, "learning_rate": 3.999435553803353e-06, "loss": 0.83823717, "num_input_tokens_seen": 6540170, "step": 311, "time_per_iteration": 2.7258293628692627 }, { "auxiliary_loss_clip": 0.01313362, "auxiliary_loss_mlp": 0.010591, "balance_loss_clip": 1.08579135, "balance_loss_mlp": 1.03498399, "epoch": 0.03751578187939638, "flos": 20264140339200.0, "grad_norm": 2.3749996338283808, "language_loss": 0.83500063, "learning_rate": 3.999416896585292e-06, "loss": 0.85872525, "num_input_tokens_seen": 6557200, "step": 312, "time_per_iteration": 2.763798236846924 }, { "auxiliary_loss_clip": 0.01312154, "auxiliary_loss_mlp": 0.01066258, "balance_loss_clip": 1.08331394, "balance_loss_mlp": 1.04176021, "epoch": 0.03763602477003547, "flos": 20668063754880.0, "grad_norm": 3.5120446996678774, "language_loss": 0.85992211, "learning_rate": 3.9993979360572775e-06, "loss": 0.88370627, "num_input_tokens_seen": 6577340, "step": 313, "time_per_iteration": 2.753394842147827 }, { "auxiliary_loss_clip": 0.01318802, "auxiliary_loss_mlp": 0.01076152, "balance_loss_clip": 1.08790016, "balance_loss_mlp": 1.05149984, "epoch": 0.03775626766067456, "flos": 16691352197760.0, "grad_norm": 3.573574028365135, "language_loss": 0.82751089, "learning_rate": 3.999378672222185e-06, "loss": 0.85146046, "num_input_tokens_seen": 6595125, "step": 314, "time_per_iteration": 2.7045633792877197 }, { "auxiliary_loss_clip": 0.01313333, "auxiliary_loss_mlp": 0.01057185, "balance_loss_clip": 1.08662617, "balance_loss_mlp": 1.03297329, "epoch": 0.03787651055131366, "flos": 21141797253120.0, "grad_norm": 3.0196864729065753, "language_loss": 0.83115232, "learning_rate": 3.9993591050829385e-06, "loss": 0.85485744, "num_input_tokens_seen": 6612990, "step": 315, "time_per_iteration": 2.6907410621643066 }, { "auxiliary_loss_clip": 0.01312125, "auxiliary_loss_mlp": 0.01071842, "balance_loss_clip": 1.08623862, "balance_loss_mlp": 1.04815555, "epoch": 0.037996753441952746, "flos": 22018089450240.0, "grad_norm": 1.9886595267986127, "language_loss": 0.79425728, "learning_rate": 3.999339234642506e-06, "loss": 0.81809694, "num_input_tokens_seen": 6632740, "step": 316, "time_per_iteration": 2.7331595420837402 }, { "auxiliary_loss_clip": 0.013154, "auxiliary_loss_mlp": 0.01064394, "balance_loss_clip": 1.08678091, "balance_loss_mlp": 1.04080248, "epoch": 0.038116996332591836, "flos": 27709391790720.0, "grad_norm": 1.929357726772405, "language_loss": 0.83790457, "learning_rate": 3.9993190609038994e-06, "loss": 0.86170244, "num_input_tokens_seen": 6651505, "step": 317, "time_per_iteration": 2.7512638568878174 }, { "auxiliary_loss_clip": 0.01311875, "auxiliary_loss_mlp": 0.01060508, "balance_loss_clip": 1.08382154, "balance_loss_mlp": 1.03673768, "epoch": 0.038237239223230926, "flos": 21178067011200.0, "grad_norm": 2.601853502626988, "language_loss": 0.83151728, "learning_rate": 3.999298583870182e-06, "loss": 0.85524112, "num_input_tokens_seen": 6671090, "step": 318, "time_per_iteration": 2.7298166751861572 }, { "auxiliary_loss_clip": 0.0131271, "auxiliary_loss_mlp": 0.01060011, "balance_loss_clip": 1.08576334, "balance_loss_mlp": 1.03639603, "epoch": 0.038357482113870016, "flos": 25556618995200.0, "grad_norm": 2.0867710402411666, "language_loss": 0.77480453, "learning_rate": 3.999277803544458e-06, "loss": 0.79853177, "num_input_tokens_seen": 6691245, "step": 319, "time_per_iteration": 2.6823902130126953 }, { "auxiliary_loss_clip": 0.012592, "auxiliary_loss_mlp": 0.01023644, "balance_loss_clip": 1.10264194, "balance_loss_mlp": 1.00542915, "epoch": 0.038477725004509106, "flos": 59227578034560.0, "grad_norm": 0.9528655352893038, "language_loss": 0.62399149, "learning_rate": 3.999256719929882e-06, "loss": 0.64681995, "num_input_tokens_seen": 6752520, "step": 320, "time_per_iteration": 3.255526304244995 }, { "auxiliary_loss_clip": 0.01256797, "auxiliary_loss_mlp": 0.01020968, "balance_loss_clip": 1.10106575, "balance_loss_mlp": 1.00284851, "epoch": 0.0385979678951482, "flos": 67317676398720.0, "grad_norm": 1.216440385511086, "language_loss": 0.67060792, "learning_rate": 3.999235333029651e-06, "loss": 0.6933856, "num_input_tokens_seen": 6806460, "step": 321, "time_per_iteration": 3.1571543216705322 }, { "auxiliary_loss_clip": 0.01314661, "auxiliary_loss_mlp": 0.01073088, "balance_loss_clip": 1.09028387, "balance_loss_mlp": 1.04955673, "epoch": 0.03871821078578729, "flos": 22746752749440.0, "grad_norm": 2.1617484881968108, "language_loss": 0.82003027, "learning_rate": 3.999213642847009e-06, "loss": 0.84390777, "num_input_tokens_seen": 6827045, "step": 322, "time_per_iteration": 2.7309353351593018 }, { "auxiliary_loss_clip": 0.01313408, "auxiliary_loss_mlp": 0.01063148, "balance_loss_clip": 1.08763611, "balance_loss_mlp": 1.04011703, "epoch": 0.03883845367642638, "flos": 26280613526400.0, "grad_norm": 1.597343574166061, "language_loss": 0.91040683, "learning_rate": 3.999191649385247e-06, "loss": 0.93417239, "num_input_tokens_seen": 6848220, "step": 323, "time_per_iteration": 2.7732057571411133 }, { "auxiliary_loss_clip": 0.01251088, "auxiliary_loss_mlp": 0.0102007, "balance_loss_clip": 1.09694231, "balance_loss_mlp": 1.00195062, "epoch": 0.03895869656706547, "flos": 56962835568000.0, "grad_norm": 0.9080004712426474, "language_loss": 0.59802544, "learning_rate": 3.999169352647702e-06, "loss": 0.62073708, "num_input_tokens_seen": 6909400, "step": 324, "time_per_iteration": 3.168935775756836 }, { "auxiliary_loss_clip": 0.01315532, "auxiliary_loss_mlp": 0.0106687, "balance_loss_clip": 1.08790207, "balance_loss_mlp": 1.04137087, "epoch": 0.03907893945770456, "flos": 24863363527680.0, "grad_norm": 2.2646251698061035, "language_loss": 0.83040237, "learning_rate": 3.999146752637755e-06, "loss": 0.85422641, "num_input_tokens_seen": 6930445, "step": 325, "time_per_iteration": 2.8366591930389404 }, { "auxiliary_loss_clip": 0.01308052, "auxiliary_loss_mlp": 0.01049143, "balance_loss_clip": 1.08210874, "balance_loss_mlp": 1.02615976, "epoch": 0.03919918234834365, "flos": 18368595815040.0, "grad_norm": 2.510934566558406, "language_loss": 0.89699757, "learning_rate": 3.999123849358836e-06, "loss": 0.92056954, "num_input_tokens_seen": 6948110, "step": 326, "time_per_iteration": 2.7013943195343018 }, { "auxiliary_loss_clip": 0.01306366, "auxiliary_loss_mlp": 0.01059775, "balance_loss_clip": 1.08141208, "balance_loss_mlp": 1.03635025, "epoch": 0.03931942523898275, "flos": 25225414663680.0, "grad_norm": 2.393072414944594, "language_loss": 0.74950385, "learning_rate": 3.999100642814418e-06, "loss": 0.77316523, "num_input_tokens_seen": 6968550, "step": 327, "time_per_iteration": 2.729597330093384 }, { "auxiliary_loss_clip": 0.01313726, "auxiliary_loss_mlp": 0.01080604, "balance_loss_clip": 1.08782375, "balance_loss_mlp": 1.05691767, "epoch": 0.03943966812962184, "flos": 23257905240960.0, "grad_norm": 2.3167488415121777, "language_loss": 0.8831619, "learning_rate": 3.999077133008022e-06, "loss": 0.90710521, "num_input_tokens_seen": 6987135, "step": 328, "time_per_iteration": 3.821798324584961 }, { "auxiliary_loss_clip": 0.013134, "auxiliary_loss_mlp": 0.01060082, "balance_loss_clip": 1.08665872, "balance_loss_mlp": 1.03746796, "epoch": 0.03955991102026093, "flos": 29168837291520.0, "grad_norm": 2.476713900224869, "language_loss": 0.90700901, "learning_rate": 3.9990533199432145e-06, "loss": 0.93074387, "num_input_tokens_seen": 7008630, "step": 329, "time_per_iteration": 2.8059892654418945 }, { "auxiliary_loss_clip": 0.0131155, "auxiliary_loss_mlp": 0.01063319, "balance_loss_clip": 1.08712769, "balance_loss_mlp": 1.04072928, "epoch": 0.03968015391090002, "flos": 17602441695360.0, "grad_norm": 2.283655493714867, "language_loss": 0.75729191, "learning_rate": 3.999029203623608e-06, "loss": 0.78104061, "num_input_tokens_seen": 7026350, "step": 330, "time_per_iteration": 3.635004758834839 }, { "auxiliary_loss_clip": 0.01308161, "auxiliary_loss_mlp": 0.01068551, "balance_loss_clip": 1.08619261, "balance_loss_mlp": 1.0446496, "epoch": 0.03980039680153911, "flos": 21799285752960.0, "grad_norm": 2.0577645270276674, "language_loss": 0.86685491, "learning_rate": 3.99900478405286e-06, "loss": 0.89062196, "num_input_tokens_seen": 7045660, "step": 331, "time_per_iteration": 4.98781943321228 }, { "auxiliary_loss_clip": 0.01309198, "auxiliary_loss_mlp": 0.01062785, "balance_loss_clip": 1.08673406, "balance_loss_mlp": 1.04000378, "epoch": 0.0399206396921782, "flos": 15195134148480.0, "grad_norm": 2.2891423543437615, "language_loss": 0.82645071, "learning_rate": 3.998980061234676e-06, "loss": 0.85017049, "num_input_tokens_seen": 7063575, "step": 332, "time_per_iteration": 2.743177652359009 }, { "auxiliary_loss_clip": 0.01310562, "auxiliary_loss_mlp": 0.01064225, "balance_loss_clip": 1.08475745, "balance_loss_mlp": 1.04097879, "epoch": 0.040040882582817294, "flos": 14422910630400.0, "grad_norm": 3.082906193171117, "language_loss": 0.75603271, "learning_rate": 3.9989550351728055e-06, "loss": 0.77978063, "num_input_tokens_seen": 7080505, "step": 333, "time_per_iteration": 2.7834603786468506 }, { "auxiliary_loss_clip": 0.01307984, "auxiliary_loss_mlp": 0.0106369, "balance_loss_clip": 1.08405399, "balance_loss_mlp": 1.04063535, "epoch": 0.040161125473456384, "flos": 19280906375040.0, "grad_norm": 2.425682165500368, "language_loss": 0.84480548, "learning_rate": 3.998929705871046e-06, "loss": 0.86852223, "num_input_tokens_seen": 7097860, "step": 334, "time_per_iteration": 2.7606918811798096 }, { "auxiliary_loss_clip": 0.01307143, "auxiliary_loss_mlp": 0.01066614, "balance_loss_clip": 1.08479512, "balance_loss_mlp": 1.04320109, "epoch": 0.040281368364095474, "flos": 17821101738240.0, "grad_norm": 2.7196764034101575, "language_loss": 0.89370859, "learning_rate": 3.99890407333324e-06, "loss": 0.9174462, "num_input_tokens_seen": 7116390, "step": 335, "time_per_iteration": 2.7913477420806885 }, { "auxiliary_loss_clip": 0.01306352, "auxiliary_loss_mlp": 0.01059459, "balance_loss_clip": 1.08273911, "balance_loss_mlp": 1.0369519, "epoch": 0.040401611254734564, "flos": 19573757959680.0, "grad_norm": 1.7071019171983308, "language_loss": 0.87105298, "learning_rate": 3.998878137563275e-06, "loss": 0.89471114, "num_input_tokens_seen": 7135940, "step": 336, "time_per_iteration": 2.7657151222229004 }, { "auxiliary_loss_clip": 0.01305734, "auxiliary_loss_mlp": 0.01060239, "balance_loss_clip": 1.08385599, "balance_loss_mlp": 1.03706503, "epoch": 0.040521854145373654, "flos": 22054466949120.0, "grad_norm": 4.023973498850077, "language_loss": 0.85500574, "learning_rate": 3.998851898565085e-06, "loss": 0.87866557, "num_input_tokens_seen": 7155745, "step": 337, "time_per_iteration": 2.8008317947387695 }, { "auxiliary_loss_clip": 0.013026, "auxiliary_loss_mlp": 0.01056539, "balance_loss_clip": 1.08045435, "balance_loss_mlp": 1.03396058, "epoch": 0.04064209703601274, "flos": 22674644196480.0, "grad_norm": 1.8636849512976839, "language_loss": 0.82970726, "learning_rate": 3.998825356342653e-06, "loss": 0.8532986, "num_input_tokens_seen": 7175920, "step": 338, "time_per_iteration": 2.6948704719543457 }, { "auxiliary_loss_clip": 0.01306297, "auxiliary_loss_mlp": 0.01067248, "balance_loss_clip": 1.08348393, "balance_loss_mlp": 1.04501569, "epoch": 0.04076233992665183, "flos": 38582172783360.0, "grad_norm": 3.2954249826319177, "language_loss": 0.73085696, "learning_rate": 3.998798510900003e-06, "loss": 0.75459242, "num_input_tokens_seen": 7198720, "step": 339, "time_per_iteration": 2.884826421737671 }, { "auxiliary_loss_clip": 0.01303867, "auxiliary_loss_mlp": 0.01055257, "balance_loss_clip": 1.08044529, "balance_loss_mlp": 1.03302407, "epoch": 0.04088258281729093, "flos": 25885309374720.0, "grad_norm": 2.428915147298781, "language_loss": 0.83820903, "learning_rate": 3.998771362241207e-06, "loss": 0.86180025, "num_input_tokens_seen": 7219125, "step": 340, "time_per_iteration": 2.759138822555542 }, { "auxiliary_loss_clip": 0.01305375, "auxiliary_loss_mlp": 0.01059119, "balance_loss_clip": 1.08261299, "balance_loss_mlp": 1.03608811, "epoch": 0.04100282570793002, "flos": 19789832223360.0, "grad_norm": 2.465196035488566, "language_loss": 0.87754071, "learning_rate": 3.998743910370385e-06, "loss": 0.90118563, "num_input_tokens_seen": 7237985, "step": 341, "time_per_iteration": 2.759354591369629 }, { "auxiliary_loss_clip": 0.01311315, "auxiliary_loss_mlp": 0.0106279, "balance_loss_clip": 1.08841968, "balance_loss_mlp": 1.04120147, "epoch": 0.04112306859856911, "flos": 22565152563840.0, "grad_norm": 2.696388828987813, "language_loss": 0.73411208, "learning_rate": 3.998716155291702e-06, "loss": 0.75785309, "num_input_tokens_seen": 7255825, "step": 342, "time_per_iteration": 2.6773743629455566 }, { "auxiliary_loss_clip": 0.01307663, "auxiliary_loss_mlp": 0.01062918, "balance_loss_clip": 1.08725023, "balance_loss_mlp": 1.04049456, "epoch": 0.0412433114892082, "flos": 25040654081280.0, "grad_norm": 2.6962587892967744, "language_loss": 0.90722132, "learning_rate": 3.998688097009366e-06, "loss": 0.93092716, "num_input_tokens_seen": 7276590, "step": 343, "time_per_iteration": 2.7878575325012207 }, { "auxiliary_loss_clip": 0.01304776, "auxiliary_loss_mlp": 0.01054735, "balance_loss_clip": 1.08235705, "balance_loss_mlp": 1.03226376, "epoch": 0.04136355437984729, "flos": 25191371548800.0, "grad_norm": 2.2108260745977, "language_loss": 0.79997599, "learning_rate": 3.998659735527636e-06, "loss": 0.82357109, "num_input_tokens_seen": 7295680, "step": 344, "time_per_iteration": 2.7431252002716064 }, { "auxiliary_loss_clip": 0.01304465, "auxiliary_loss_mlp": 0.01066004, "balance_loss_clip": 1.08306181, "balance_loss_mlp": 1.04360414, "epoch": 0.04148379727048638, "flos": 22966777509120.0, "grad_norm": 2.11061994895343, "language_loss": 0.77810282, "learning_rate": 3.998631070850813e-06, "loss": 0.80180752, "num_input_tokens_seen": 7316300, "step": 345, "time_per_iteration": 2.776367425918579 }, { "auxiliary_loss_clip": 0.01305195, "auxiliary_loss_mlp": 0.01063389, "balance_loss_clip": 1.08559155, "balance_loss_mlp": 1.04182386, "epoch": 0.041604040161125476, "flos": 14063481187200.0, "grad_norm": 4.444775144664832, "language_loss": 0.83528215, "learning_rate": 3.9986021029832455e-06, "loss": 0.85896802, "num_input_tokens_seen": 7333615, "step": 346, "time_per_iteration": 2.7372703552246094 }, { "auxiliary_loss_clip": 0.01306252, "auxiliary_loss_mlp": 0.01068526, "balance_loss_clip": 1.08248949, "balance_loss_mlp": 1.04555464, "epoch": 0.041724283051764566, "flos": 12091877614080.0, "grad_norm": 2.770425351241236, "language_loss": 0.91613495, "learning_rate": 3.9985728319293285e-06, "loss": 0.93988281, "num_input_tokens_seen": 7347590, "step": 347, "time_per_iteration": 2.7441864013671875 }, { "auxiliary_loss_clip": 0.0130458, "auxiliary_loss_mlp": 0.0105852, "balance_loss_clip": 1.08195376, "balance_loss_mlp": 1.03467858, "epoch": 0.041844525942403656, "flos": 12385303816320.0, "grad_norm": 2.1827232787304074, "language_loss": 0.84958255, "learning_rate": 3.998543257693501e-06, "loss": 0.87321353, "num_input_tokens_seen": 7364345, "step": 348, "time_per_iteration": 2.810096025466919 }, { "auxiliary_loss_clip": 0.01306118, "auxiliary_loss_mlp": 0.01061803, "balance_loss_clip": 1.08513725, "balance_loss_mlp": 1.03868878, "epoch": 0.041964768833042745, "flos": 23769345041280.0, "grad_norm": 1.8401850111862401, "language_loss": 0.87805331, "learning_rate": 3.998513380280251e-06, "loss": 0.90173256, "num_input_tokens_seen": 7384625, "step": 349, "time_per_iteration": 2.879423141479492 }, { "auxiliary_loss_clip": 0.01307398, "auxiliary_loss_mlp": 0.01069676, "balance_loss_clip": 1.08533716, "balance_loss_mlp": 1.04803944, "epoch": 0.042085011723681835, "flos": 11875336473600.0, "grad_norm": 2.224110517314045, "language_loss": 0.94985616, "learning_rate": 3.99848319969411e-06, "loss": 0.97362691, "num_input_tokens_seen": 7402225, "step": 350, "time_per_iteration": 2.8449652194976807 }, { "auxiliary_loss_clip": 0.01308544, "auxiliary_loss_mlp": 0.01080105, "balance_loss_clip": 1.0847466, "balance_loss_mlp": 1.05700254, "epoch": 0.042205254614320925, "flos": 16873957964160.0, "grad_norm": 2.25042103146398, "language_loss": 0.79032624, "learning_rate": 3.9984527159396564e-06, "loss": 0.81421268, "num_input_tokens_seen": 7420865, "step": 351, "time_per_iteration": 2.776538133621216 }, { "auxiliary_loss_clip": 0.01300707, "auxiliary_loss_mlp": 0.0105857, "balance_loss_clip": 1.07964826, "balance_loss_mlp": 1.03606355, "epoch": 0.04232549750496002, "flos": 25118508810240.0, "grad_norm": 2.1618391462617033, "language_loss": 0.84322441, "learning_rate": 3.9984219290215154e-06, "loss": 0.86681718, "num_input_tokens_seen": 7441040, "step": 352, "time_per_iteration": 2.887808322906494 }, { "auxiliary_loss_clip": 0.01302531, "auxiliary_loss_mlp": 0.01061996, "balance_loss_clip": 1.08265471, "balance_loss_mlp": 1.03989482, "epoch": 0.04244574039559911, "flos": 26724541714560.0, "grad_norm": 1.6080898760575693, "language_loss": 0.89167172, "learning_rate": 3.998390838944356e-06, "loss": 0.91531694, "num_input_tokens_seen": 7462545, "step": 353, "time_per_iteration": 2.9426677227020264 }, { "auxiliary_loss_clip": 0.01304319, "auxiliary_loss_mlp": 0.010628, "balance_loss_clip": 1.08155835, "balance_loss_mlp": 1.04032958, "epoch": 0.0425659832862382, "flos": 20923244951040.0, "grad_norm": 2.675552227400016, "language_loss": 0.90040159, "learning_rate": 3.998359445712895e-06, "loss": 0.92407274, "num_input_tokens_seen": 7481650, "step": 354, "time_per_iteration": 2.9474055767059326 }, { "auxiliary_loss_clip": 0.01299997, "auxiliary_loss_mlp": 0.01065532, "balance_loss_clip": 1.079427, "balance_loss_mlp": 1.04377627, "epoch": 0.04268622617687729, "flos": 23331127115520.0, "grad_norm": 2.345512648114349, "language_loss": 0.81065631, "learning_rate": 3.9983277493318955e-06, "loss": 0.8343116, "num_input_tokens_seen": 7500945, "step": 355, "time_per_iteration": 3.661593198776245 }, { "auxiliary_loss_clip": 0.01299461, "auxiliary_loss_mlp": 0.01069705, "balance_loss_clip": 1.07839632, "balance_loss_mlp": 1.04825902, "epoch": 0.04280646906751638, "flos": 25994010908160.0, "grad_norm": 1.6475529192907945, "language_loss": 0.8125273, "learning_rate": 3.998295749806165e-06, "loss": 0.83621895, "num_input_tokens_seen": 7522170, "step": 356, "time_per_iteration": 4.186137676239014 }, { "auxiliary_loss_clip": 0.01307839, "auxiliary_loss_mlp": 0.01067602, "balance_loss_clip": 1.08619905, "balance_loss_mlp": 1.04523838, "epoch": 0.04292671195815547, "flos": 26906824258560.0, "grad_norm": 1.8757236823451373, "language_loss": 0.83347529, "learning_rate": 3.998263447140558e-06, "loss": 0.85722971, "num_input_tokens_seen": 7542370, "step": 357, "time_per_iteration": 3.7313246726989746 }, { "auxiliary_loss_clip": 0.01300766, "auxiliary_loss_mlp": 0.01058312, "balance_loss_clip": 1.08021045, "balance_loss_mlp": 1.03684258, "epoch": 0.04304695484879457, "flos": 39457315745280.0, "grad_norm": 1.8881609556047723, "language_loss": 0.81823587, "learning_rate": 3.998230841339976e-06, "loss": 0.84182668, "num_input_tokens_seen": 7564380, "step": 358, "time_per_iteration": 4.581888437271118 }, { "auxiliary_loss_clip": 0.0130601, "auxiliary_loss_mlp": 0.01062046, "balance_loss_clip": 1.08613074, "balance_loss_mlp": 1.03909874, "epoch": 0.04316719773943366, "flos": 19646297475840.0, "grad_norm": 2.4071107708431447, "language_loss": 0.84765697, "learning_rate": 3.998197932409363e-06, "loss": 0.87133753, "num_input_tokens_seen": 7582390, "step": 359, "time_per_iteration": 2.8487353324890137 }, { "auxiliary_loss_clip": 0.01298683, "auxiliary_loss_mlp": 0.01065259, "balance_loss_clip": 1.08099985, "balance_loss_mlp": 1.04291952, "epoch": 0.04328744063007275, "flos": 22452320966400.0, "grad_norm": 1.9999089800113563, "language_loss": 0.86126244, "learning_rate": 3.9981647203537125e-06, "loss": 0.88490176, "num_input_tokens_seen": 7599890, "step": 360, "time_per_iteration": 2.8029441833496094 }, { "auxiliary_loss_clip": 0.01301987, "auxiliary_loss_mlp": 0.01060233, "balance_loss_clip": 1.08278143, "balance_loss_mlp": 1.03791761, "epoch": 0.04340768352071184, "flos": 21283033530240.0, "grad_norm": 2.0380236139052696, "language_loss": 0.9599191, "learning_rate": 3.998131205178063e-06, "loss": 0.98354125, "num_input_tokens_seen": 7618360, "step": 361, "time_per_iteration": 2.7862765789031982 }, { "auxiliary_loss_clip": 0.01301399, "auxiliary_loss_mlp": 0.01061211, "balance_loss_clip": 1.08097005, "balance_loss_mlp": 1.03936028, "epoch": 0.04352792641135093, "flos": 11583705951360.0, "grad_norm": 2.5153155869872963, "language_loss": 0.76497507, "learning_rate": 3.998097386887498e-06, "loss": 0.78860116, "num_input_tokens_seen": 7635435, "step": 362, "time_per_iteration": 2.739643096923828 }, { "auxiliary_loss_clip": 0.01298476, "auxiliary_loss_mlp": 0.01054598, "balance_loss_clip": 1.08033645, "balance_loss_mlp": 1.03351045, "epoch": 0.04364816930199002, "flos": 23623547736960.0, "grad_norm": 1.7054527306491407, "language_loss": 0.84745812, "learning_rate": 3.998063265487148e-06, "loss": 0.87098885, "num_input_tokens_seen": 7656485, "step": 363, "time_per_iteration": 2.786482334136963 }, { "auxiliary_loss_clip": 0.0130037, "auxiliary_loss_mlp": 0.01066211, "balance_loss_clip": 1.08118105, "balance_loss_mlp": 1.04416966, "epoch": 0.043768412192629114, "flos": 14429734214400.0, "grad_norm": 2.015176159517788, "language_loss": 0.80895352, "learning_rate": 3.99802884098219e-06, "loss": 0.83261931, "num_input_tokens_seen": 7674595, "step": 364, "time_per_iteration": 2.746842622756958 }, { "auxiliary_loss_clip": 0.01300242, "auxiliary_loss_mlp": 0.0106447, "balance_loss_clip": 1.07918453, "balance_loss_mlp": 1.04271448, "epoch": 0.043888655083268203, "flos": 26468893641600.0, "grad_norm": 5.112202410901094, "language_loss": 0.82266772, "learning_rate": 3.997994113377845e-06, "loss": 0.84631479, "num_input_tokens_seen": 7693495, "step": 365, "time_per_iteration": 2.703413486480713 }, { "auxiliary_loss_clip": 0.0130488, "auxiliary_loss_mlp": 0.01048836, "balance_loss_clip": 1.082183, "balance_loss_mlp": 1.02681768, "epoch": 0.04400889797390729, "flos": 27235263242880.0, "grad_norm": 2.031613874769078, "language_loss": 0.83080167, "learning_rate": 3.9979590826793815e-06, "loss": 0.85433888, "num_input_tokens_seen": 7714685, "step": 366, "time_per_iteration": 2.7634379863739014 }, { "auxiliary_loss_clip": 0.01305505, "auxiliary_loss_mlp": 0.01064038, "balance_loss_clip": 1.08512211, "balance_loss_mlp": 1.04184151, "epoch": 0.04412914086454638, "flos": 20119528183680.0, "grad_norm": 2.36196430208335, "language_loss": 0.81168079, "learning_rate": 3.997923748892113e-06, "loss": 0.83537614, "num_input_tokens_seen": 7734005, "step": 367, "time_per_iteration": 2.7115297317504883 }, { "auxiliary_loss_clip": 0.0129999, "auxiliary_loss_mlp": 0.01057496, "balance_loss_clip": 1.08238316, "balance_loss_mlp": 1.03479886, "epoch": 0.04424938375518547, "flos": 22604618632320.0, "grad_norm": 1.7152420413408684, "language_loss": 0.88808757, "learning_rate": 3.9978881120214015e-06, "loss": 0.91166246, "num_input_tokens_seen": 7755525, "step": 368, "time_per_iteration": 2.7616913318634033 }, { "auxiliary_loss_clip": 0.0130295, "auxiliary_loss_mlp": 0.01055326, "balance_loss_clip": 1.08200765, "balance_loss_mlp": 1.03380847, "epoch": 0.04436962664582456, "flos": 24132365844480.0, "grad_norm": 2.896197691613399, "language_loss": 0.78897095, "learning_rate": 3.997852172072652e-06, "loss": 0.8125537, "num_input_tokens_seen": 7776740, "step": 369, "time_per_iteration": 2.8414306640625 }, { "auxiliary_loss_clip": 0.0130594, "auxiliary_loss_mlp": 0.01069342, "balance_loss_clip": 1.08346272, "balance_loss_mlp": 1.0471096, "epoch": 0.04448986953646366, "flos": 18222906251520.0, "grad_norm": 2.429228854378308, "language_loss": 0.89165407, "learning_rate": 3.9978159290513155e-06, "loss": 0.91540694, "num_input_tokens_seen": 7794820, "step": 370, "time_per_iteration": 2.7003798484802246 }, { "auxiliary_loss_clip": 0.0130422, "auxiliary_loss_mlp": 0.01064042, "balance_loss_clip": 1.08245528, "balance_loss_mlp": 1.04319239, "epoch": 0.04461011242710275, "flos": 30117920400000.0, "grad_norm": 1.7572909647736008, "language_loss": 0.80081582, "learning_rate": 3.997779382962892e-06, "loss": 0.82449841, "num_input_tokens_seen": 7817705, "step": 371, "time_per_iteration": 2.729684591293335 }, { "auxiliary_loss_clip": 0.012991, "auxiliary_loss_mlp": 0.01059072, "balance_loss_clip": 1.07902133, "balance_loss_mlp": 1.03747165, "epoch": 0.04473035531774184, "flos": 29752529299200.0, "grad_norm": 2.211203970772327, "language_loss": 0.73391181, "learning_rate": 3.997742533812924e-06, "loss": 0.7574935, "num_input_tokens_seen": 7840970, "step": 372, "time_per_iteration": 2.8014276027679443 }, { "auxiliary_loss_clip": 0.01305514, "auxiliary_loss_mlp": 0.01072297, "balance_loss_clip": 1.08411372, "balance_loss_mlp": 1.04980254, "epoch": 0.04485059820838093, "flos": 13151565676800.0, "grad_norm": 2.2585814104012734, "language_loss": 0.92336059, "learning_rate": 3.997705381607001e-06, "loss": 0.94713867, "num_input_tokens_seen": 7857785, "step": 373, "time_per_iteration": 2.713834524154663 }, { "auxiliary_loss_clip": 0.01239225, "auxiliary_loss_mlp": 0.0105411, "balance_loss_clip": 1.09220409, "balance_loss_mlp": 1.03808868, "epoch": 0.04497084109902002, "flos": 68094209548800.0, "grad_norm": 0.985644250171076, "language_loss": 0.60259712, "learning_rate": 3.997667926350761e-06, "loss": 0.62553048, "num_input_tokens_seen": 7916115, "step": 374, "time_per_iteration": 3.187532663345337 }, { "auxiliary_loss_clip": 0.01240436, "auxiliary_loss_mlp": 0.01036521, "balance_loss_clip": 1.09328532, "balance_loss_mlp": 1.0204041, "epoch": 0.04509108398965911, "flos": 64342263346560.0, "grad_norm": 0.9403081416162344, "language_loss": 0.57804239, "learning_rate": 3.997630168049886e-06, "loss": 0.60081196, "num_input_tokens_seen": 7974480, "step": 375, "time_per_iteration": 3.3417928218841553 }, { "auxiliary_loss_clip": 0.01303439, "auxiliary_loss_mlp": 0.01055072, "balance_loss_clip": 1.08280194, "balance_loss_mlp": 1.03291106, "epoch": 0.045211326880298205, "flos": 22271115830400.0, "grad_norm": 1.7229937043583072, "language_loss": 0.77298784, "learning_rate": 3.997592106710101e-06, "loss": 0.79657298, "num_input_tokens_seen": 7993940, "step": 376, "time_per_iteration": 2.796447992324829 }, { "auxiliary_loss_clip": 0.01303372, "auxiliary_loss_mlp": 0.01057229, "balance_loss_clip": 1.08300948, "balance_loss_mlp": 1.03589034, "epoch": 0.045331569770937295, "flos": 32159441796480.0, "grad_norm": 2.495847909898285, "language_loss": 0.65897369, "learning_rate": 3.997553742337182e-06, "loss": 0.6825797, "num_input_tokens_seen": 8013365, "step": 377, "time_per_iteration": 2.773810386657715 }, { "auxiliary_loss_clip": 0.0129958, "auxiliary_loss_mlp": 0.01055049, "balance_loss_clip": 1.08120584, "balance_loss_mlp": 1.03446162, "epoch": 0.045451812661576385, "flos": 22163455791360.0, "grad_norm": 1.877991625011866, "language_loss": 0.91214526, "learning_rate": 3.997515074936949e-06, "loss": 0.9356916, "num_input_tokens_seen": 8034240, "step": 378, "time_per_iteration": 2.7045955657958984 }, { "auxiliary_loss_clip": 0.01298052, "auxiliary_loss_mlp": 0.01071118, "balance_loss_clip": 1.08247852, "balance_loss_mlp": 1.04999423, "epoch": 0.045572055552215475, "flos": 16581968305920.0, "grad_norm": 2.544744965089305, "language_loss": 0.86957765, "learning_rate": 3.997476104515268e-06, "loss": 0.8932693, "num_input_tokens_seen": 8052430, "step": 379, "time_per_iteration": 2.7172420024871826 }, { "auxiliary_loss_clip": 0.01295579, "auxiliary_loss_mlp": 0.0106892, "balance_loss_clip": 1.08241677, "balance_loss_mlp": 1.04824901, "epoch": 0.045692298442854565, "flos": 17603375448960.0, "grad_norm": 2.1092563019367243, "language_loss": 0.77518237, "learning_rate": 3.9974368310780485e-06, "loss": 0.79882729, "num_input_tokens_seen": 8069605, "step": 380, "time_per_iteration": 2.7907798290252686 }, { "auxiliary_loss_clip": 0.01307972, "auxiliary_loss_mlp": 0.01067523, "balance_loss_clip": 1.08557749, "balance_loss_mlp": 1.04570746, "epoch": 0.045812541333493655, "flos": 26761098781440.0, "grad_norm": 3.2425605281854764, "language_loss": 0.74529064, "learning_rate": 3.997397254631251e-06, "loss": 0.76904553, "num_input_tokens_seen": 8090225, "step": 381, "time_per_iteration": 3.7200708389282227 }, { "auxiliary_loss_clip": 0.01231443, "auxiliary_loss_mlp": 0.010238, "balance_loss_clip": 1.08650243, "balance_loss_mlp": 1.00768244, "epoch": 0.04593278422413275, "flos": 60250349894400.0, "grad_norm": 0.8247812469098866, "language_loss": 0.60053802, "learning_rate": 3.997357375180878e-06, "loss": 0.62309051, "num_input_tokens_seen": 8154505, "step": 382, "time_per_iteration": 4.339107990264893 }, { "auxiliary_loss_clip": 0.01298676, "auxiliary_loss_mlp": 0.01060494, "balance_loss_clip": 1.08090019, "balance_loss_mlp": 1.0389533, "epoch": 0.04605302711477184, "flos": 21799249839360.0, "grad_norm": 1.8168025270283124, "language_loss": 0.75171858, "learning_rate": 3.997317192732979e-06, "loss": 0.77531028, "num_input_tokens_seen": 8173285, "step": 383, "time_per_iteration": 3.6038684844970703 }, { "auxiliary_loss_clip": 0.01299038, "auxiliary_loss_mlp": 0.01077142, "balance_loss_clip": 1.081375, "balance_loss_mlp": 1.05556512, "epoch": 0.04617327000541093, "flos": 19459705299840.0, "grad_norm": 1.9412743610068113, "language_loss": 0.82465464, "learning_rate": 3.99727670729365e-06, "loss": 0.84841645, "num_input_tokens_seen": 8191845, "step": 384, "time_per_iteration": 4.02632737159729 }, { "auxiliary_loss_clip": 0.01303517, "auxiliary_loss_mlp": 0.01070097, "balance_loss_clip": 1.08547115, "balance_loss_mlp": 1.04859161, "epoch": 0.04629351289605002, "flos": 25411468135680.0, "grad_norm": 1.8357192773391977, "language_loss": 0.78038728, "learning_rate": 3.997235918869033e-06, "loss": 0.8041234, "num_input_tokens_seen": 8212880, "step": 385, "time_per_iteration": 2.765868902206421 }, { "auxiliary_loss_clip": 0.01299083, "auxiliary_loss_mlp": 0.01047429, "balance_loss_clip": 1.08254457, "balance_loss_mlp": 1.02656746, "epoch": 0.04641375578668911, "flos": 20558284813440.0, "grad_norm": 1.8318318340916997, "language_loss": 0.82801414, "learning_rate": 3.997194827465315e-06, "loss": 0.85147923, "num_input_tokens_seen": 8231475, "step": 386, "time_per_iteration": 2.7272448539733887 }, { "auxiliary_loss_clip": 0.01296671, "auxiliary_loss_mlp": 0.01065672, "balance_loss_clip": 1.07976961, "balance_loss_mlp": 1.04354692, "epoch": 0.0465339986773282, "flos": 13188661447680.0, "grad_norm": 2.3734939323255975, "language_loss": 0.9125222, "learning_rate": 3.997153433088728e-06, "loss": 0.9361456, "num_input_tokens_seen": 8248600, "step": 387, "time_per_iteration": 2.6354751586914062 }, { "auxiliary_loss_clip": 0.0129952, "auxiliary_loss_mlp": 0.01071006, "balance_loss_clip": 1.08143485, "balance_loss_mlp": 1.04926193, "epoch": 0.0466542415679673, "flos": 25556547168000.0, "grad_norm": 2.3030333426905822, "language_loss": 0.81515145, "learning_rate": 3.997111735745554e-06, "loss": 0.8388567, "num_input_tokens_seen": 8271570, "step": 388, "time_per_iteration": 2.740133047103882 }, { "auxiliary_loss_clip": 0.01301414, "auxiliary_loss_mlp": 0.01059946, "balance_loss_clip": 1.0825814, "balance_loss_mlp": 1.03746319, "epoch": 0.04677448445860639, "flos": 22236749493120.0, "grad_norm": 2.021095943779384, "language_loss": 0.82896596, "learning_rate": 3.997069735442118e-06, "loss": 0.85257959, "num_input_tokens_seen": 8291265, "step": 389, "time_per_iteration": 2.691093921661377 }, { "auxiliary_loss_clip": 0.01294315, "auxiliary_loss_mlp": 0.01066455, "balance_loss_clip": 1.08001554, "balance_loss_mlp": 1.04559326, "epoch": 0.04689472734924548, "flos": 28147825198080.0, "grad_norm": 1.476989223785034, "language_loss": 0.80427915, "learning_rate": 3.997027432184792e-06, "loss": 0.82788682, "num_input_tokens_seen": 8315925, "step": 390, "time_per_iteration": 2.857544422149658 }, { "auxiliary_loss_clip": 0.01300321, "auxiliary_loss_mlp": 0.01059756, "balance_loss_clip": 1.08176577, "balance_loss_mlp": 1.03794038, "epoch": 0.04701497023988457, "flos": 23148952312320.0, "grad_norm": 2.6784242365838717, "language_loss": 0.89645201, "learning_rate": 3.99698482597999e-06, "loss": 0.92005271, "num_input_tokens_seen": 8333605, "step": 391, "time_per_iteration": 2.6760802268981934 }, { "auxiliary_loss_clip": 0.01218102, "auxiliary_loss_mlp": 0.01029803, "balance_loss_clip": 1.07646561, "balance_loss_mlp": 1.01416254, "epoch": 0.04713521313052366, "flos": 64827668764800.0, "grad_norm": 0.8671329654157214, "language_loss": 0.6387763, "learning_rate": 3.99694191683418e-06, "loss": 0.66125536, "num_input_tokens_seen": 8394405, "step": 392, "time_per_iteration": 3.323683261871338 }, { "auxiliary_loss_clip": 0.01304822, "auxiliary_loss_mlp": 0.01059008, "balance_loss_clip": 1.08773148, "balance_loss_mlp": 1.03714466, "epoch": 0.047255456021162746, "flos": 18771585477120.0, "grad_norm": 2.469634407072093, "language_loss": 0.81678945, "learning_rate": 3.996898704753867e-06, "loss": 0.84042776, "num_input_tokens_seen": 8412355, "step": 393, "time_per_iteration": 2.7502825260162354 }, { "auxiliary_loss_clip": 0.01301762, "auxiliary_loss_mlp": 0.0105514, "balance_loss_clip": 1.08319759, "balance_loss_mlp": 1.03369403, "epoch": 0.04737569891180184, "flos": 22053820504320.0, "grad_norm": 1.9578657554630094, "language_loss": 0.87632799, "learning_rate": 3.996855189745609e-06, "loss": 0.89989698, "num_input_tokens_seen": 8431620, "step": 394, "time_per_iteration": 2.6881191730499268 }, { "auxiliary_loss_clip": 0.0129387, "auxiliary_loss_mlp": 0.01060433, "balance_loss_clip": 1.07794344, "balance_loss_mlp": 1.03877246, "epoch": 0.04749594180244093, "flos": 29057370410880.0, "grad_norm": 1.8408660893592135, "language_loss": 0.92664367, "learning_rate": 3.996811371816007e-06, "loss": 0.95018673, "num_input_tokens_seen": 8454045, "step": 395, "time_per_iteration": 2.805147886276245 }, { "auxiliary_loss_clip": 0.0130117, "auxiliary_loss_mlp": 0.01063718, "balance_loss_clip": 1.08685005, "balance_loss_mlp": 1.04210508, "epoch": 0.04761618469308002, "flos": 35112268172160.0, "grad_norm": 36.046821514440026, "language_loss": 0.7773487, "learning_rate": 3.996767250971707e-06, "loss": 0.80099761, "num_input_tokens_seen": 8476785, "step": 396, "time_per_iteration": 2.774437189102173 }, { "auxiliary_loss_clip": 0.01300496, "auxiliary_loss_mlp": 0.01055231, "balance_loss_clip": 1.08557272, "balance_loss_mlp": 1.03277218, "epoch": 0.04773642758371911, "flos": 25630702796160.0, "grad_norm": 2.2435854030148845, "language_loss": 0.86772102, "learning_rate": 3.996722827219403e-06, "loss": 0.89127827, "num_input_tokens_seen": 8498400, "step": 397, "time_per_iteration": 2.7121527194976807 }, { "auxiliary_loss_clip": 0.01300928, "auxiliary_loss_mlp": 0.01062399, "balance_loss_clip": 1.08493543, "balance_loss_mlp": 1.04061913, "epoch": 0.0478566704743582, "flos": 20631506688000.0, "grad_norm": 3.036395430501904, "language_loss": 0.82860404, "learning_rate": 3.996678100565833e-06, "loss": 0.85223728, "num_input_tokens_seen": 8517455, "step": 398, "time_per_iteration": 2.7156338691711426 }, { "auxiliary_loss_clip": 0.01294427, "auxiliary_loss_mlp": 0.01057869, "balance_loss_clip": 1.08134723, "balance_loss_mlp": 1.03598213, "epoch": 0.04797691336499729, "flos": 18835721210880.0, "grad_norm": 2.159826298269361, "language_loss": 0.8868925, "learning_rate": 3.996633071017783e-06, "loss": 0.91041541, "num_input_tokens_seen": 8534085, "step": 399, "time_per_iteration": 2.7330141067504883 }, { "auxiliary_loss_clip": 0.01296627, "auxiliary_loss_mlp": 0.01054714, "balance_loss_clip": 1.08340573, "balance_loss_mlp": 1.03367305, "epoch": 0.04809715625563638, "flos": 21099673578240.0, "grad_norm": 2.5186016810366665, "language_loss": 0.81773323, "learning_rate": 3.996587738582084e-06, "loss": 0.84124666, "num_input_tokens_seen": 8550885, "step": 400, "time_per_iteration": 2.8162965774536133 }, { "auxiliary_loss_clip": 0.01295155, "auxiliary_loss_mlp": 0.01062361, "balance_loss_clip": 1.07979548, "balance_loss_mlp": 1.04159427, "epoch": 0.04821739914627548, "flos": 23805650712960.0, "grad_norm": 3.326972525093555, "language_loss": 0.86486411, "learning_rate": 3.9965421032656115e-06, "loss": 0.88843924, "num_input_tokens_seen": 8570815, "step": 401, "time_per_iteration": 2.8045902252197266 }, { "auxiliary_loss_clip": 0.01296729, "auxiliary_loss_mlp": 0.01067682, "balance_loss_clip": 1.08001196, "balance_loss_mlp": 1.04620028, "epoch": 0.04833764203691457, "flos": 22200587475840.0, "grad_norm": 2.418068404146797, "language_loss": 0.9416489, "learning_rate": 3.99649616507529e-06, "loss": 0.96529293, "num_input_tokens_seen": 8589910, "step": 402, "time_per_iteration": 2.845588207244873 }, { "auxiliary_loss_clip": 0.0121522, "auxiliary_loss_mlp": 0.01021066, "balance_loss_clip": 1.07484472, "balance_loss_mlp": 1.00590253, "epoch": 0.04845788492755366, "flos": 65904376896000.0, "grad_norm": 0.8943154822960663, "language_loss": 0.6314038, "learning_rate": 3.996449924018088e-06, "loss": 0.65376663, "num_input_tokens_seen": 8650370, "step": 403, "time_per_iteration": 3.3108067512512207 }, { "auxiliary_loss_clip": 0.01293793, "auxiliary_loss_mlp": 0.01062806, "balance_loss_clip": 1.08061755, "balance_loss_mlp": 1.04341066, "epoch": 0.04857812781819275, "flos": 19281301424640.0, "grad_norm": 2.1374682061747716, "language_loss": 0.79250073, "learning_rate": 3.99640338010102e-06, "loss": 0.81606674, "num_input_tokens_seen": 8669475, "step": 404, "time_per_iteration": 2.8217215538024902 }, { "auxiliary_loss_clip": 0.01292562, "auxiliary_loss_mlp": 0.01065845, "balance_loss_clip": 1.07982242, "balance_loss_mlp": 1.04530525, "epoch": 0.04869837070883184, "flos": 24062376193920.0, "grad_norm": 2.5430162463719994, "language_loss": 0.78568745, "learning_rate": 3.996356533331146e-06, "loss": 0.80927151, "num_input_tokens_seen": 8691345, "step": 405, "time_per_iteration": 2.794588088989258 }, { "auxiliary_loss_clip": 0.01296698, "auxiliary_loss_mlp": 0.01061128, "balance_loss_clip": 1.07797432, "balance_loss_mlp": 1.03938472, "epoch": 0.04881861359947093, "flos": 25187169657600.0, "grad_norm": 2.1433920325949645, "language_loss": 0.61776197, "learning_rate": 3.996309383715573e-06, "loss": 0.64134026, "num_input_tokens_seen": 8710125, "step": 406, "time_per_iteration": 2.8070385456085205 }, { "auxiliary_loss_clip": 0.01295057, "auxiliary_loss_mlp": 0.01063814, "balance_loss_clip": 1.08058631, "balance_loss_mlp": 1.04306006, "epoch": 0.048938856490110025, "flos": 16362913213440.0, "grad_norm": 2.1611466198288385, "language_loss": 0.73808461, "learning_rate": 3.996261931261454e-06, "loss": 0.76167333, "num_input_tokens_seen": 8728705, "step": 407, "time_per_iteration": 2.7381701469421387 }, { "auxiliary_loss_clip": 0.01295502, "auxiliary_loss_mlp": 0.01062425, "balance_loss_clip": 1.08143938, "balance_loss_mlp": 1.04236197, "epoch": 0.049059099380749115, "flos": 29895094379520.0, "grad_norm": 1.6371908801135844, "language_loss": 0.86243308, "learning_rate": 3.996214175975987e-06, "loss": 0.88601232, "num_input_tokens_seen": 8749225, "step": 408, "time_per_iteration": 5.016650676727295 }, { "auxiliary_loss_clip": 0.01299312, "auxiliary_loss_mlp": 0.01067403, "balance_loss_clip": 1.08303428, "balance_loss_mlp": 1.04539657, "epoch": 0.049179342271388204, "flos": 35918858027520.0, "grad_norm": 3.0103757969915748, "language_loss": 0.78717828, "learning_rate": 3.996166117866417e-06, "loss": 0.81084538, "num_input_tokens_seen": 8771160, "step": 409, "time_per_iteration": 3.6577329635620117 }, { "auxiliary_loss_clip": 0.01292888, "auxiliary_loss_mlp": 0.0105238, "balance_loss_clip": 1.07714343, "balance_loss_mlp": 1.03207827, "epoch": 0.049299585162027294, "flos": 14611226659200.0, "grad_norm": 2.147704450284593, "language_loss": 0.86735439, "learning_rate": 3.996117756940035e-06, "loss": 0.89080709, "num_input_tokens_seen": 8787845, "step": 410, "time_per_iteration": 2.7041587829589844 }, { "auxiliary_loss_clip": 0.01294923, "auxiliary_loss_mlp": 0.01064632, "balance_loss_clip": 1.08086157, "balance_loss_mlp": 1.04346085, "epoch": 0.049419828052666384, "flos": 19567939956480.0, "grad_norm": 2.3163946704791774, "language_loss": 0.97729373, "learning_rate": 3.996069093204175e-06, "loss": 1.0008893, "num_input_tokens_seen": 8803805, "step": 411, "time_per_iteration": 3.9674177169799805 }, { "auxiliary_loss_clip": 0.01300716, "auxiliary_loss_mlp": 0.01059936, "balance_loss_clip": 1.08479512, "balance_loss_mlp": 1.03858531, "epoch": 0.049540070943305474, "flos": 13659916907520.0, "grad_norm": 2.9152168926512707, "language_loss": 0.87917984, "learning_rate": 3.996020126666221e-06, "loss": 0.90278637, "num_input_tokens_seen": 8820785, "step": 412, "time_per_iteration": 2.78570294380188 }, { "auxiliary_loss_clip": 0.01296021, "auxiliary_loss_mlp": 0.01055598, "balance_loss_clip": 1.08021772, "balance_loss_mlp": 1.03509426, "epoch": 0.04966031383394457, "flos": 21832035978240.0, "grad_norm": 2.2774229411581945, "language_loss": 0.82069218, "learning_rate": 3.995970857333601e-06, "loss": 0.84420836, "num_input_tokens_seen": 8841195, "step": 413, "time_per_iteration": 2.8887219429016113 }, { "auxiliary_loss_clip": 0.01296458, "auxiliary_loss_mlp": 0.01066028, "balance_loss_clip": 1.0806427, "balance_loss_mlp": 1.04477251, "epoch": 0.04978055672458366, "flos": 28618793349120.0, "grad_norm": 1.9582622893654724, "language_loss": 0.7977736, "learning_rate": 3.995921285213789e-06, "loss": 0.8213985, "num_input_tokens_seen": 8861455, "step": 414, "time_per_iteration": 2.8335537910461426 }, { "auxiliary_loss_clip": 0.01288851, "auxiliary_loss_mlp": 0.01063873, "balance_loss_clip": 1.07860112, "balance_loss_mlp": 1.04340506, "epoch": 0.04990079961522275, "flos": 19828220883840.0, "grad_norm": 2.516679362228342, "language_loss": 0.80763978, "learning_rate": 3.995871410314305e-06, "loss": 0.83116704, "num_input_tokens_seen": 8880015, "step": 415, "time_per_iteration": 2.728346824645996 }, { "auxiliary_loss_clip": 0.01213464, "auxiliary_loss_mlp": 0.01015647, "balance_loss_clip": 1.03963947, "balance_loss_mlp": 1.00067449, "epoch": 0.05002104250586184, "flos": 62735045293440.0, "grad_norm": 0.9178013081078011, "language_loss": 0.5961535, "learning_rate": 3.995821232642714e-06, "loss": 0.61844456, "num_input_tokens_seen": 8938420, "step": 416, "time_per_iteration": 3.4919590950012207 }, { "auxiliary_loss_clip": 0.01292676, "auxiliary_loss_mlp": 0.01059285, "balance_loss_clip": 1.04415894, "balance_loss_mlp": 1.03901958, "epoch": 0.05014128539650093, "flos": 27928518710400.0, "grad_norm": 2.0113300432579355, "language_loss": 0.82144356, "learning_rate": 3.995770752206629e-06, "loss": 0.84496313, "num_input_tokens_seen": 8959495, "step": 417, "time_per_iteration": 2.8074138164520264 }, { "auxiliary_loss_clip": 0.01294529, "auxiliary_loss_mlp": 0.01065327, "balance_loss_clip": 1.08147621, "balance_loss_mlp": 1.0438931, "epoch": 0.05026152828714002, "flos": 17705576620800.0, "grad_norm": 2.0955460711803515, "language_loss": 0.97296882, "learning_rate": 3.995719969013709e-06, "loss": 0.99656743, "num_input_tokens_seen": 8976675, "step": 418, "time_per_iteration": 2.712995767593384 }, { "auxiliary_loss_clip": 0.01280112, "auxiliary_loss_mlp": 0.01063151, "balance_loss_clip": 1.00061345, "balance_loss_mlp": 1.04220557, "epoch": 0.05038177117777912, "flos": 19133277477120.0, "grad_norm": 2.5380615644702016, "language_loss": 0.85832769, "learning_rate": 3.995668883071655e-06, "loss": 0.8817603, "num_input_tokens_seen": 8992900, "step": 419, "time_per_iteration": 2.8007326126098633 }, { "auxiliary_loss_clip": 0.012934, "auxiliary_loss_mlp": 0.01061208, "balance_loss_clip": 1.07975864, "balance_loss_mlp": 1.03882074, "epoch": 0.050502014068418206, "flos": 20667704618880.0, "grad_norm": 2.283471086397087, "language_loss": 0.90999115, "learning_rate": 3.995617494388219e-06, "loss": 0.93353724, "num_input_tokens_seen": 9011020, "step": 420, "time_per_iteration": 2.690749168395996 }, { "auxiliary_loss_clip": 0.01278343, "auxiliary_loss_mlp": 0.01062933, "balance_loss_clip": 0.99774724, "balance_loss_mlp": 1.04140389, "epoch": 0.050622256959057296, "flos": 21361103740800.0, "grad_norm": 2.468362324790034, "language_loss": 0.80492264, "learning_rate": 3.995565802971196e-06, "loss": 0.8283354, "num_input_tokens_seen": 9030995, "step": 421, "time_per_iteration": 2.741971492767334 }, { "auxiliary_loss_clip": 0.0128074, "auxiliary_loss_mlp": 0.01069846, "balance_loss_clip": 0.99946702, "balance_loss_mlp": 1.04910398, "epoch": 0.050742499849696386, "flos": 27673588909440.0, "grad_norm": 12.418676149382657, "language_loss": 0.6774869, "learning_rate": 3.995513808828427e-06, "loss": 0.7009927, "num_input_tokens_seen": 9053790, "step": 422, "time_per_iteration": 2.9383273124694824 }, { "auxiliary_loss_clip": 0.01287974, "auxiliary_loss_mlp": 0.01058096, "balance_loss_clip": 1.00188601, "balance_loss_mlp": 1.03715086, "epoch": 0.050862742740335476, "flos": 19865999013120.0, "grad_norm": 14.968609139129603, "language_loss": 0.7695123, "learning_rate": 3.9954615119678e-06, "loss": 0.79297298, "num_input_tokens_seen": 9072345, "step": 423, "time_per_iteration": 2.6945791244506836 }, { "auxiliary_loss_clip": 0.01278047, "auxiliary_loss_mlp": 0.01053476, "balance_loss_clip": 1.03864598, "balance_loss_mlp": 1.03238821, "epoch": 0.050982985630974566, "flos": 22085098272000.0, "grad_norm": 2.110880932477581, "language_loss": 0.80620861, "learning_rate": 3.995408912397248e-06, "loss": 0.8295238, "num_input_tokens_seen": 9090240, "step": 424, "time_per_iteration": 2.7216761112213135 }, { "auxiliary_loss_clip": 0.01286382, "auxiliary_loss_mlp": 0.0106203, "balance_loss_clip": 1.00254655, "balance_loss_mlp": 1.04087043, "epoch": 0.05110322852161366, "flos": 20740962407040.0, "grad_norm": 2.4634344458567385, "language_loss": 0.93328261, "learning_rate": 3.99535601012475e-06, "loss": 0.95676672, "num_input_tokens_seen": 9105570, "step": 425, "time_per_iteration": 2.8128373622894287 }, { "auxiliary_loss_clip": 0.01280202, "auxiliary_loss_mlp": 0.01132202, "balance_loss_clip": 0.96306694, "balance_loss_mlp": 0.0, "epoch": 0.05122347141225275, "flos": 28547295327360.0, "grad_norm": 1.5220029447969985, "language_loss": 0.75659072, "learning_rate": 3.995302805158333e-06, "loss": 0.78071475, "num_input_tokens_seen": 9128225, "step": 426, "time_per_iteration": 2.906956672668457 }, { "auxiliary_loss_clip": 0.01272085, "auxiliary_loss_mlp": 0.01057177, "balance_loss_clip": 0.99756658, "balance_loss_mlp": 1.03505182, "epoch": 0.05134371430289184, "flos": 19722679747200.0, "grad_norm": 2.368210113097231, "language_loss": 0.83597898, "learning_rate": 3.9952492975060665e-06, "loss": 0.85927165, "num_input_tokens_seen": 9148295, "step": 427, "time_per_iteration": 2.8525054454803467 }, { "auxiliary_loss_clip": 0.01285022, "auxiliary_loss_mlp": 0.01057754, "balance_loss_clip": 1.03642964, "balance_loss_mlp": 1.03814363, "epoch": 0.05146395719353093, "flos": 34458945649920.0, "grad_norm": 2.2269295103170643, "language_loss": 0.85363781, "learning_rate": 3.995195487176067e-06, "loss": 0.87706554, "num_input_tokens_seen": 9168525, "step": 428, "time_per_iteration": 2.813359498977661 }, { "auxiliary_loss_clip": 0.01294074, "auxiliary_loss_mlp": 0.01051951, "balance_loss_clip": 1.07988214, "balance_loss_mlp": 1.03153026, "epoch": 0.05158420008417002, "flos": 21760286561280.0, "grad_norm": 1.8508131706216202, "language_loss": 0.85516, "learning_rate": 3.995141374176499e-06, "loss": 0.87862021, "num_input_tokens_seen": 9186920, "step": 429, "time_per_iteration": 2.714059829711914 }, { "auxiliary_loss_clip": 0.01217069, "auxiliary_loss_mlp": 0.01127258, "balance_loss_clip": 0.96396875, "balance_loss_mlp": 0.0, "epoch": 0.05170444297480911, "flos": 72553956226560.0, "grad_norm": 0.8744775902311162, "language_loss": 0.63108802, "learning_rate": 3.995086958515572e-06, "loss": 0.6545313, "num_input_tokens_seen": 9244940, "step": 430, "time_per_iteration": 3.3359267711639404 }, { "auxiliary_loss_clip": 0.0120708, "auxiliary_loss_mlp": 0.0112716, "balance_loss_clip": 1.07158542, "balance_loss_mlp": 0.0, "epoch": 0.05182468586544821, "flos": 62416159326720.0, "grad_norm": 5.875594642712059, "language_loss": 0.59903908, "learning_rate": 3.995032240201538e-06, "loss": 0.62238145, "num_input_tokens_seen": 9307335, "step": 431, "time_per_iteration": 3.177755832672119 }, { "auxiliary_loss_clip": 0.01205343, "auxiliary_loss_mlp": 0.01018706, "balance_loss_clip": 0.9891032, "balance_loss_mlp": 1.00382853, "epoch": 0.0519449287560873, "flos": 41225989432320.0, "grad_norm": 0.936872519047258, "language_loss": 0.63136101, "learning_rate": 3.9949772192427e-06, "loss": 0.65360141, "num_input_tokens_seen": 9353960, "step": 432, "time_per_iteration": 3.0225021839141846 }, { "auxiliary_loss_clip": 0.01282019, "auxiliary_loss_mlp": 0.01058183, "balance_loss_clip": 0.99748194, "balance_loss_mlp": 1.03770304, "epoch": 0.05206517164672639, "flos": 17494530261120.0, "grad_norm": 1.8044563733163017, "language_loss": 0.79046977, "learning_rate": 3.994921895647405e-06, "loss": 0.81387174, "num_input_tokens_seen": 9372130, "step": 433, "time_per_iteration": 2.7928214073181152 }, { "auxiliary_loss_clip": 0.01202221, "auxiliary_loss_mlp": 0.01016603, "balance_loss_clip": 1.06768608, "balance_loss_mlp": 1.0018214, "epoch": 0.05218541453736548, "flos": 64002762973440.0, "grad_norm": 0.8357430563673366, "language_loss": 0.5536657, "learning_rate": 3.994866269424043e-06, "loss": 0.57585394, "num_input_tokens_seen": 9428500, "step": 434, "time_per_iteration": 4.039946556091309 }, { "auxiliary_loss_clip": 0.01264491, "auxiliary_loss_mlp": 0.01064573, "balance_loss_clip": 0.873362, "balance_loss_mlp": 1.04359198, "epoch": 0.05230565742800457, "flos": 19317319787520.0, "grad_norm": 2.222095152124562, "language_loss": 0.78103185, "learning_rate": 3.9948103405810545e-06, "loss": 0.80432254, "num_input_tokens_seen": 9447450, "step": 435, "time_per_iteration": 5.105177879333496 }, { "auxiliary_loss_clip": 0.01254354, "auxiliary_loss_mlp": 0.01052979, "balance_loss_clip": 0.95359534, "balance_loss_mlp": 1.03315449, "epoch": 0.05242590031864366, "flos": 25298636538240.0, "grad_norm": 2.026826083231853, "language_loss": 0.86012304, "learning_rate": 3.994754109126923e-06, "loss": 0.88319635, "num_input_tokens_seen": 9468945, "step": 436, "time_per_iteration": 2.9751672744750977 }, { "auxiliary_loss_clip": 0.01260865, "auxiliary_loss_mlp": 0.01058439, "balance_loss_clip": 0.88150817, "balance_loss_mlp": 1.0382452, "epoch": 0.052546143209282754, "flos": 26211629456640.0, "grad_norm": 1.825641433172261, "language_loss": 0.93589085, "learning_rate": 3.994697575070181e-06, "loss": 0.95908391, "num_input_tokens_seen": 9488405, "step": 437, "time_per_iteration": 4.249449253082275 }, { "auxiliary_loss_clip": 0.01280464, "auxiliary_loss_mlp": 0.01062485, "balance_loss_clip": 1.00226784, "balance_loss_mlp": 1.04239821, "epoch": 0.052666386099921844, "flos": 22158140578560.0, "grad_norm": 1.772532096349031, "language_loss": 0.9148041, "learning_rate": 3.994640738419402e-06, "loss": 0.93823361, "num_input_tokens_seen": 9507780, "step": 438, "time_per_iteration": 3.107560873031616 }, { "auxiliary_loss_clip": 0.01284946, "auxiliary_loss_mlp": 0.01054869, "balance_loss_clip": 1.03863776, "balance_loss_mlp": 1.03528321, "epoch": 0.052786628990560934, "flos": 23881817502720.0, "grad_norm": 1.9234829149061263, "language_loss": 0.80819523, "learning_rate": 3.9945835991832075e-06, "loss": 0.83159339, "num_input_tokens_seen": 9529665, "step": 439, "time_per_iteration": 2.854412317276001 }, { "auxiliary_loss_clip": 0.01290995, "auxiliary_loss_mlp": 0.01062309, "balance_loss_clip": 1.0820874, "balance_loss_mlp": 1.04262805, "epoch": 0.052906871881200024, "flos": 24605021934720.0, "grad_norm": 2.063037764150344, "language_loss": 0.92906946, "learning_rate": 3.994526157370268e-06, "loss": 0.95260251, "num_input_tokens_seen": 9548280, "step": 440, "time_per_iteration": 2.823514223098755 }, { "auxiliary_loss_clip": 0.01201404, "auxiliary_loss_mlp": 0.01013897, "balance_loss_clip": 0.9882552, "balance_loss_mlp": 0.99940091, "epoch": 0.053027114771839114, "flos": 56461631143680.0, "grad_norm": 0.8888762789517004, "language_loss": 0.59324265, "learning_rate": 3.994468412989296e-06, "loss": 0.61539567, "num_input_tokens_seen": 9609690, "step": 441, "time_per_iteration": 3.469059467315674 }, { "auxiliary_loss_clip": 0.01251087, "auxiliary_loss_mlp": 0.01058273, "balance_loss_clip": 0.99412256, "balance_loss_mlp": 1.03837717, "epoch": 0.053147357662478203, "flos": 17311098481920.0, "grad_norm": 2.087182224990165, "language_loss": 0.92708457, "learning_rate": 3.994410366049052e-06, "loss": 0.95017827, "num_input_tokens_seen": 9627550, "step": 442, "time_per_iteration": 2.844461441040039 }, { "auxiliary_loss_clip": 0.01283777, "auxiliary_loss_mlp": 0.01063372, "balance_loss_clip": 1.03930724, "balance_loss_mlp": 1.04311883, "epoch": 0.0532676005531173, "flos": 17164977955200.0, "grad_norm": 2.5296827178167294, "language_loss": 0.82967877, "learning_rate": 3.994352016558341e-06, "loss": 0.85315025, "num_input_tokens_seen": 9644855, "step": 443, "time_per_iteration": 2.772704601287842 }, { "auxiliary_loss_clip": 0.01279471, "auxiliary_loss_mlp": 0.01055304, "balance_loss_clip": 1.03957427, "balance_loss_mlp": 1.0354439, "epoch": 0.05338784344375639, "flos": 27819960831360.0, "grad_norm": 3.2806305612114945, "language_loss": 0.73845446, "learning_rate": 3.994293364526014e-06, "loss": 0.7618022, "num_input_tokens_seen": 9665740, "step": 444, "time_per_iteration": 2.855928659439087 }, { "auxiliary_loss_clip": 0.01273585, "auxiliary_loss_mlp": 0.01058779, "balance_loss_clip": 1.00129902, "balance_loss_mlp": 1.03849018, "epoch": 0.05350808633439548, "flos": 21507691144320.0, "grad_norm": 1.9125808172920526, "language_loss": 0.84891421, "learning_rate": 3.99423440996097e-06, "loss": 0.87223786, "num_input_tokens_seen": 9685280, "step": 445, "time_per_iteration": 2.7593653202056885 }, { "auxiliary_loss_clip": 0.01286993, "auxiliary_loss_mlp": 0.01057844, "balance_loss_clip": 1.00420988, "balance_loss_mlp": 1.03748286, "epoch": 0.05362832922503457, "flos": 20084299920000.0, "grad_norm": 11.753047436629835, "language_loss": 0.8154788, "learning_rate": 3.994175152872152e-06, "loss": 0.83892727, "num_input_tokens_seen": 9704365, "step": 446, "time_per_iteration": 2.7007040977478027 }, { "auxiliary_loss_clip": 0.01287329, "auxiliary_loss_mlp": 0.01054496, "balance_loss_clip": 1.0388577, "balance_loss_mlp": 1.03470683, "epoch": 0.05374857211567366, "flos": 26137222433280.0, "grad_norm": 1.9881664305880091, "language_loss": 0.78620076, "learning_rate": 3.994115593268548e-06, "loss": 0.80961907, "num_input_tokens_seen": 9724145, "step": 447, "time_per_iteration": 2.6770401000976562 }, { "auxiliary_loss_clip": 0.01290078, "auxiliary_loss_mlp": 0.01053975, "balance_loss_clip": 1.08019876, "balance_loss_mlp": 1.03473473, "epoch": 0.05386881500631275, "flos": 27486817165440.0, "grad_norm": 6.765922218901882, "language_loss": 0.82069802, "learning_rate": 3.994055731159195e-06, "loss": 0.8441385, "num_input_tokens_seen": 9741615, "step": 448, "time_per_iteration": 2.6923294067382812 }, { "auxiliary_loss_clip": 0.01288512, "auxiliary_loss_mlp": 0.01064763, "balance_loss_clip": 1.04421771, "balance_loss_mlp": 1.04504609, "epoch": 0.053989057896951846, "flos": 23585087249280.0, "grad_norm": 2.228260065815346, "language_loss": 0.87041271, "learning_rate": 3.993995566553172e-06, "loss": 0.89394546, "num_input_tokens_seen": 9760580, "step": 449, "time_per_iteration": 2.6741702556610107 }, { "auxiliary_loss_clip": 0.01267468, "auxiliary_loss_mlp": 0.01058627, "balance_loss_clip": 0.99466896, "balance_loss_mlp": 1.0390054, "epoch": 0.054109300787590936, "flos": 25228862369280.0, "grad_norm": 3.85073069261129, "language_loss": 0.77214986, "learning_rate": 3.993935099459607e-06, "loss": 0.79541087, "num_input_tokens_seen": 9782195, "step": 450, "time_per_iteration": 2.7355105876922607 }, { "auxiliary_loss_clip": 0.01280281, "auxiliary_loss_mlp": 0.01055098, "balance_loss_clip": 1.07530141, "balance_loss_mlp": 1.0361315, "epoch": 0.054229543678230026, "flos": 23841525421440.0, "grad_norm": 1.942298171554913, "language_loss": 0.73986846, "learning_rate": 3.993874329887673e-06, "loss": 0.76322228, "num_input_tokens_seen": 9800850, "step": 451, "time_per_iteration": 2.668098211288452 }, { "auxiliary_loss_clip": 0.01283909, "auxiliary_loss_mlp": 0.01057467, "balance_loss_clip": 1.04070878, "balance_loss_mlp": 1.03759527, "epoch": 0.054349786568869116, "flos": 16320933192960.0, "grad_norm": 3.1636325622341066, "language_loss": 0.86666262, "learning_rate": 3.993813257846589e-06, "loss": 0.89007634, "num_input_tokens_seen": 9817605, "step": 452, "time_per_iteration": 2.644425868988037 }, { "auxiliary_loss_clip": 0.01288556, "auxiliary_loss_mlp": 0.01058277, "balance_loss_clip": 1.04269886, "balance_loss_mlp": 1.03869104, "epoch": 0.054470029459508205, "flos": 18660729127680.0, "grad_norm": 2.2367666066246885, "language_loss": 0.92615795, "learning_rate": 3.993751883345619e-06, "loss": 0.94962633, "num_input_tokens_seen": 9835965, "step": 453, "time_per_iteration": 2.722057819366455 }, { "auxiliary_loss_clip": 0.01272285, "auxiliary_loss_mlp": 0.01061881, "balance_loss_clip": 0.9981575, "balance_loss_mlp": 1.04256868, "epoch": 0.054590272350147295, "flos": 17785298856960.0, "grad_norm": 2.3848495436885355, "language_loss": 0.87585652, "learning_rate": 3.993690206394073e-06, "loss": 0.89919817, "num_input_tokens_seen": 9852265, "step": 454, "time_per_iteration": 2.660248279571533 }, { "auxiliary_loss_clip": 0.01290037, "auxiliary_loss_mlp": 0.01065039, "balance_loss_clip": 1.00288153, "balance_loss_mlp": 1.04556084, "epoch": 0.054710515240786385, "flos": 17785945301760.0, "grad_norm": 3.152791782952014, "language_loss": 0.87756157, "learning_rate": 3.993628227001307e-06, "loss": 0.90111232, "num_input_tokens_seen": 9870465, "step": 455, "time_per_iteration": 2.6919097900390625 }, { "auxiliary_loss_clip": 0.01276051, "auxiliary_loss_mlp": 0.0106848, "balance_loss_clip": 0.99788499, "balance_loss_mlp": 1.04914403, "epoch": 0.05483075813142548, "flos": 48210900180480.0, "grad_norm": 1.8366203002459167, "language_loss": 0.71099269, "learning_rate": 3.993565945176726e-06, "loss": 0.73443806, "num_input_tokens_seen": 9891490, "step": 456, "time_per_iteration": 2.9438095092773438 }, { "auxiliary_loss_clip": 0.01282552, "auxiliary_loss_mlp": 0.01063799, "balance_loss_clip": 1.00404406, "balance_loss_mlp": 1.04430819, "epoch": 0.05495100102206457, "flos": 19682244011520.0, "grad_norm": 2.3754700399774937, "language_loss": 0.83885193, "learning_rate": 3.993503360929776e-06, "loss": 0.86231554, "num_input_tokens_seen": 9910375, "step": 457, "time_per_iteration": 2.7110438346862793 }, { "auxiliary_loss_clip": 0.01273765, "auxiliary_loss_mlp": 0.01060129, "balance_loss_clip": 0.84664518, "balance_loss_mlp": 1.03966093, "epoch": 0.05507124391270366, "flos": 26360048453760.0, "grad_norm": 1.5196639186893413, "language_loss": 0.81028104, "learning_rate": 3.99344047426995e-06, "loss": 0.83362007, "num_input_tokens_seen": 9931635, "step": 458, "time_per_iteration": 3.062098979949951 }, { "auxiliary_loss_clip": 0.0127652, "auxiliary_loss_mlp": 0.01055692, "balance_loss_clip": 0.92049766, "balance_loss_mlp": 1.03601098, "epoch": 0.05519148680334275, "flos": 22601314581120.0, "grad_norm": 2.907231545118109, "language_loss": 0.9348343, "learning_rate": 3.993377285206789e-06, "loss": 0.95815641, "num_input_tokens_seen": 9951420, "step": 459, "time_per_iteration": 3.39823055267334 }, { "auxiliary_loss_clip": 0.01245672, "auxiliary_loss_mlp": 0.01049709, "balance_loss_clip": 0.95487475, "balance_loss_mlp": 1.03062367, "epoch": 0.05531172969398184, "flos": 40552519380480.0, "grad_norm": 1.7550854246504093, "language_loss": 0.8644622, "learning_rate": 3.99331379374988e-06, "loss": 0.88741601, "num_input_tokens_seen": 9975025, "step": 460, "time_per_iteration": 3.7952721118927 }, { "auxiliary_loss_clip": 0.01279831, "auxiliary_loss_mlp": 0.01054422, "balance_loss_clip": 0.9951759, "balance_loss_mlp": 1.03520536, "epoch": 0.05543197258462093, "flos": 23477894087040.0, "grad_norm": 1.9178615541084696, "language_loss": 0.80070722, "learning_rate": 3.993249999908852e-06, "loss": 0.82404971, "num_input_tokens_seen": 9995175, "step": 461, "time_per_iteration": 4.788560390472412 }, { "auxiliary_loss_clip": 0.01288511, "auxiliary_loss_mlp": 0.01067799, "balance_loss_clip": 1.07844186, "balance_loss_mlp": 1.04773629, "epoch": 0.05555221547526003, "flos": 18624603024000.0, "grad_norm": 1.9618555421730064, "language_loss": 0.87173527, "learning_rate": 3.993185903693384e-06, "loss": 0.89529836, "num_input_tokens_seen": 10011975, "step": 462, "time_per_iteration": 2.6503851413726807 }, { "auxiliary_loss_clip": 0.01280146, "auxiliary_loss_mlp": 0.01060331, "balance_loss_clip": 0.99989271, "balance_loss_mlp": 1.04038763, "epoch": 0.05567245836589912, "flos": 23587098410880.0, "grad_norm": 2.072257675760028, "language_loss": 0.82486057, "learning_rate": 3.9931215051131995e-06, "loss": 0.84826535, "num_input_tokens_seen": 10032620, "step": 463, "time_per_iteration": 3.7878775596618652 }, { "auxiliary_loss_clip": 0.01285606, "auxiliary_loss_mlp": 0.01056737, "balance_loss_clip": 0.99664444, "balance_loss_mlp": 1.03749704, "epoch": 0.05579270125653821, "flos": 27746667129600.0, "grad_norm": 1.5703606910322003, "language_loss": 0.80161339, "learning_rate": 3.993056804178068e-06, "loss": 0.82503688, "num_input_tokens_seen": 10054165, "step": 464, "time_per_iteration": 2.83009934425354 }, { "auxiliary_loss_clip": 0.01268531, "auxiliary_loss_mlp": 0.01051642, "balance_loss_clip": 0.92241812, "balance_loss_mlp": 1.03202057, "epoch": 0.0559129441471773, "flos": 27014161075200.0, "grad_norm": 2.0962684569022683, "language_loss": 0.84256768, "learning_rate": 3.992991800897803e-06, "loss": 0.86576939, "num_input_tokens_seen": 10073970, "step": 465, "time_per_iteration": 2.8554723262786865 }, { "auxiliary_loss_clip": 0.01285598, "auxiliary_loss_mlp": 0.01061727, "balance_loss_clip": 1.07894707, "balance_loss_mlp": 1.04254675, "epoch": 0.05603318703781639, "flos": 15229787794560.0, "grad_norm": 3.9812703386256985, "language_loss": 0.89638519, "learning_rate": 3.9929264952822665e-06, "loss": 0.91985846, "num_input_tokens_seen": 10091505, "step": 466, "time_per_iteration": 2.724647045135498 }, { "auxiliary_loss_clip": 0.01287843, "auxiliary_loss_mlp": 0.01055938, "balance_loss_clip": 1.03895986, "balance_loss_mlp": 1.0365901, "epoch": 0.05615342992845548, "flos": 22266482976000.0, "grad_norm": 2.0726171731471625, "language_loss": 0.88289297, "learning_rate": 3.992860887341366e-06, "loss": 0.90633082, "num_input_tokens_seen": 10109675, "step": 467, "time_per_iteration": 2.7103519439697266 }, { "auxiliary_loss_clip": 0.0126466, "auxiliary_loss_mlp": 0.01057043, "balance_loss_clip": 0.95822722, "balance_loss_mlp": 1.03804123, "epoch": 0.056273672819094574, "flos": 23584979508480.0, "grad_norm": 2.412055359341681, "language_loss": 0.81268227, "learning_rate": 3.992794977085052e-06, "loss": 0.83589935, "num_input_tokens_seen": 10127675, "step": 468, "time_per_iteration": 2.8073015213012695 }, { "auxiliary_loss_clip": 0.01277355, "auxiliary_loss_mlp": 0.0105057, "balance_loss_clip": 0.96023226, "balance_loss_mlp": 1.03130615, "epoch": 0.056393915709733664, "flos": 19858708552320.0, "grad_norm": 2.3748796463471966, "language_loss": 0.84984124, "learning_rate": 3.992728764523326e-06, "loss": 0.87312055, "num_input_tokens_seen": 10146620, "step": 469, "time_per_iteration": 2.730539083480835 }, { "auxiliary_loss_clip": 0.01278356, "auxiliary_loss_mlp": 0.01047184, "balance_loss_clip": 0.99966079, "balance_loss_mlp": 1.02724051, "epoch": 0.05651415860037275, "flos": 22163779013760.0, "grad_norm": 1.6178371456149192, "language_loss": 0.80608428, "learning_rate": 3.99266224966623e-06, "loss": 0.82933974, "num_input_tokens_seen": 10167535, "step": 470, "time_per_iteration": 2.76692271232605 }, { "auxiliary_loss_clip": 0.0126284, "auxiliary_loss_mlp": 0.01058213, "balance_loss_clip": 0.99846864, "balance_loss_mlp": 1.03904438, "epoch": 0.05663440149101184, "flos": 19463548055040.0, "grad_norm": 1.9346640876581562, "language_loss": 0.88001215, "learning_rate": 3.992595432523855e-06, "loss": 0.90322274, "num_input_tokens_seen": 10184825, "step": 471, "time_per_iteration": 2.753382682800293 }, { "auxiliary_loss_clip": 0.0126569, "auxiliary_loss_mlp": 0.01051517, "balance_loss_clip": 0.95991492, "balance_loss_mlp": 1.03278899, "epoch": 0.05675464438165093, "flos": 22670226823680.0, "grad_norm": 1.8529556074196931, "language_loss": 0.85998523, "learning_rate": 3.992528313106338e-06, "loss": 0.88315725, "num_input_tokens_seen": 10203025, "step": 472, "time_per_iteration": 2.8550143241882324 }, { "auxiliary_loss_clip": 0.01286434, "auxiliary_loss_mlp": 0.01131664, "balance_loss_clip": 1.08201241, "balance_loss_mlp": 0.0, "epoch": 0.05687488727229002, "flos": 16901177495040.0, "grad_norm": 2.3824550490453613, "language_loss": 0.82565755, "learning_rate": 3.9924608914238595e-06, "loss": 0.8498385, "num_input_tokens_seen": 10218020, "step": 473, "time_per_iteration": 2.760143995285034 }, { "auxiliary_loss_clip": 0.01284354, "auxiliary_loss_mlp": 0.01050412, "balance_loss_clip": 1.04004729, "balance_loss_mlp": 1.03043211, "epoch": 0.05699513016292912, "flos": 29168980945920.0, "grad_norm": 2.7328413411741677, "language_loss": 0.84154499, "learning_rate": 3.992393167486648e-06, "loss": 0.8648926, "num_input_tokens_seen": 10237170, "step": 474, "time_per_iteration": 2.782965660095215 }, { "auxiliary_loss_clip": 0.01287499, "auxiliary_loss_mlp": 0.01050989, "balance_loss_clip": 1.07969761, "balance_loss_mlp": 1.03215361, "epoch": 0.05711537305356821, "flos": 18916197632640.0, "grad_norm": 2.334270797157165, "language_loss": 0.80579847, "learning_rate": 3.992325141304977e-06, "loss": 0.82918334, "num_input_tokens_seen": 10255125, "step": 475, "time_per_iteration": 2.6583127975463867 }, { "auxiliary_loss_clip": 0.01253818, "auxiliary_loss_mlp": 0.01056142, "balance_loss_clip": 0.95545721, "balance_loss_mlp": 1.03723586, "epoch": 0.0572356159442073, "flos": 26758979879040.0, "grad_norm": 2.2728986310284185, "language_loss": 0.86710984, "learning_rate": 3.992256812889166e-06, "loss": 0.89020944, "num_input_tokens_seen": 10271230, "step": 476, "time_per_iteration": 2.7603979110717773 }, { "auxiliary_loss_clip": 0.01292851, "auxiliary_loss_mlp": 0.01054529, "balance_loss_clip": 1.08487153, "balance_loss_mlp": 1.03588414, "epoch": 0.05735585883484639, "flos": 35116146840960.0, "grad_norm": 2.251959397518022, "language_loss": 0.76876926, "learning_rate": 3.992188182249582e-06, "loss": 0.792243, "num_input_tokens_seen": 10293125, "step": 477, "time_per_iteration": 2.7168002128601074 }, { "auxiliary_loss_clip": 0.0127897, "auxiliary_loss_mlp": 0.01061814, "balance_loss_clip": 1.000687, "balance_loss_mlp": 1.04282379, "epoch": 0.05747610172548548, "flos": 18734381965440.0, "grad_norm": 2.020271755552471, "language_loss": 0.90527058, "learning_rate": 3.992119249396633e-06, "loss": 0.92867845, "num_input_tokens_seen": 10311810, "step": 478, "time_per_iteration": 3.0527892112731934 }, { "auxiliary_loss_clip": 0.01271133, "auxiliary_loss_mlp": 0.0113177, "balance_loss_clip": 0.99688792, "balance_loss_mlp": 0.0, "epoch": 0.05759634461612457, "flos": 27964752554880.0, "grad_norm": 1.8601790938023324, "language_loss": 0.82253587, "learning_rate": 3.992050014340778e-06, "loss": 0.84656489, "num_input_tokens_seen": 10332165, "step": 479, "time_per_iteration": 2.9431068897247314 }, { "auxiliary_loss_clip": 0.01203514, "auxiliary_loss_mlp": 0.01028642, "balance_loss_clip": 1.03102541, "balance_loss_mlp": 1.01471806, "epoch": 0.057716587506763666, "flos": 69292009405440.0, "grad_norm": 0.8406451608896108, "language_loss": 0.55108654, "learning_rate": 3.99198047709252e-06, "loss": 0.57340813, "num_input_tokens_seen": 10393685, "step": 480, "time_per_iteration": 3.3525002002716064 }, { "auxiliary_loss_clip": 0.01274684, "auxiliary_loss_mlp": 0.01064862, "balance_loss_clip": 0.95858425, "balance_loss_mlp": 1.0454433, "epoch": 0.057836830397402755, "flos": 25009196745600.0, "grad_norm": 2.3330090281602796, "language_loss": 0.7872104, "learning_rate": 3.991910637662408e-06, "loss": 0.81060588, "num_input_tokens_seen": 10413975, "step": 481, "time_per_iteration": 2.86265230178833 }, { "auxiliary_loss_clip": 0.01283728, "auxiliary_loss_mlp": 0.0106999, "balance_loss_clip": 1.07770729, "balance_loss_mlp": 1.05045152, "epoch": 0.057957073288041845, "flos": 25593894334080.0, "grad_norm": 1.8638357867590605, "language_loss": 0.81012464, "learning_rate": 3.9918404960610355e-06, "loss": 0.83366185, "num_input_tokens_seen": 10433005, "step": 482, "time_per_iteration": 2.767510175704956 }, { "auxiliary_loss_clip": 0.01288419, "auxiliary_loss_mlp": 0.01057741, "balance_loss_clip": 1.03985643, "balance_loss_mlp": 1.03856039, "epoch": 0.058077316178680935, "flos": 20777411733120.0, "grad_norm": 2.1460848586991017, "language_loss": 0.77414113, "learning_rate": 3.991770052299043e-06, "loss": 0.79760265, "num_input_tokens_seen": 10451235, "step": 483, "time_per_iteration": 2.688371181488037 }, { "auxiliary_loss_clip": 0.0127669, "auxiliary_loss_mlp": 0.01065733, "balance_loss_clip": 0.99720091, "balance_loss_mlp": 1.0462184, "epoch": 0.058197559069320025, "flos": 18916484941440.0, "grad_norm": 2.331880406505502, "language_loss": 0.87626165, "learning_rate": 3.991699306387118e-06, "loss": 0.89968586, "num_input_tokens_seen": 10469705, "step": 484, "time_per_iteration": 2.727959394454956 }, { "auxiliary_loss_clip": 0.0128617, "auxiliary_loss_mlp": 0.01062219, "balance_loss_clip": 1.04011273, "balance_loss_mlp": 1.04305053, "epoch": 0.058317801959959115, "flos": 24863327614080.0, "grad_norm": 1.8752180947351422, "language_loss": 0.7799629, "learning_rate": 3.991628258335991e-06, "loss": 0.80344677, "num_input_tokens_seen": 10491910, "step": 485, "time_per_iteration": 2.80051326751709 }, { "auxiliary_loss_clip": 0.0126267, "auxiliary_loss_mlp": 0.01058793, "balance_loss_clip": 0.95749295, "balance_loss_mlp": 1.03778815, "epoch": 0.05843804485059821, "flos": 23257977068160.0, "grad_norm": 3.5188512184593175, "language_loss": 0.87939656, "learning_rate": 3.991556908156442e-06, "loss": 0.9026112, "num_input_tokens_seen": 10508435, "step": 486, "time_per_iteration": 2.7413275241851807 }, { "auxiliary_loss_clip": 0.01277761, "auxiliary_loss_mlp": 0.01050355, "balance_loss_clip": 0.99695164, "balance_loss_mlp": 1.03156793, "epoch": 0.0585582877412373, "flos": 23150532510720.0, "grad_norm": 2.7477193416088315, "language_loss": 0.87828457, "learning_rate": 3.9914852558592914e-06, "loss": 0.90156579, "num_input_tokens_seen": 10529485, "step": 487, "time_per_iteration": 4.635085821151733 }, { "auxiliary_loss_clip": 0.01281861, "auxiliary_loss_mlp": 0.01051876, "balance_loss_clip": 1.0403825, "balance_loss_mlp": 1.03302908, "epoch": 0.05867853063187639, "flos": 23506406507520.0, "grad_norm": 3.086160482744592, "language_loss": 0.81027031, "learning_rate": 3.991413301455413e-06, "loss": 0.83360767, "num_input_tokens_seen": 10545935, "step": 488, "time_per_iteration": 3.6199405193328857 }, { "auxiliary_loss_clip": 0.01257179, "auxiliary_loss_mlp": 0.01066197, "balance_loss_clip": 0.99272293, "balance_loss_mlp": 1.04742193, "epoch": 0.05879877352251548, "flos": 29495803818240.0, "grad_norm": 2.3594426297838527, "language_loss": 0.77782035, "learning_rate": 3.991341044955719e-06, "loss": 0.80105412, "num_input_tokens_seen": 10565690, "step": 489, "time_per_iteration": 3.826721429824829 }, { "auxiliary_loss_clip": 0.01276704, "auxiliary_loss_mlp": 0.01131914, "balance_loss_clip": 1.03577709, "balance_loss_mlp": 0.0, "epoch": 0.05891901641315457, "flos": 20157485880960.0, "grad_norm": 2.1295415894850236, "language_loss": 0.81785655, "learning_rate": 3.991268486371172e-06, "loss": 0.84194273, "num_input_tokens_seen": 10584245, "step": 490, "time_per_iteration": 2.7208027839660645 }, { "auxiliary_loss_clip": 0.01276847, "auxiliary_loss_mlp": 0.01051054, "balance_loss_clip": 0.9965502, "balance_loss_mlp": 1.03159904, "epoch": 0.05903925930379366, "flos": 24644200694400.0, "grad_norm": 3.272914366584611, "language_loss": 0.87908554, "learning_rate": 3.991195625712779e-06, "loss": 0.90236455, "num_input_tokens_seen": 10601210, "step": 491, "time_per_iteration": 2.748518466949463 }, { "auxiliary_loss_clip": 0.0128321, "auxiliary_loss_mlp": 0.01050515, "balance_loss_clip": 1.07818103, "balance_loss_mlp": 1.0318352, "epoch": 0.05915950219443276, "flos": 21250391045760.0, "grad_norm": 1.943115260728911, "language_loss": 0.81269288, "learning_rate": 3.991122462991592e-06, "loss": 0.83603013, "num_input_tokens_seen": 10620730, "step": 492, "time_per_iteration": 2.694455623626709 }, { "auxiliary_loss_clip": 0.01285904, "auxiliary_loss_mlp": 0.01058178, "balance_loss_clip": 1.07802582, "balance_loss_mlp": 1.03894997, "epoch": 0.05927974508507185, "flos": 9902727319680.0, "grad_norm": 3.010366875135687, "language_loss": 0.81037962, "learning_rate": 3.991048998218712e-06, "loss": 0.83382046, "num_input_tokens_seen": 10634035, "step": 493, "time_per_iteration": 2.6747994422912598 }, { "auxiliary_loss_clip": 0.01278182, "auxiliary_loss_mlp": 0.01050071, "balance_loss_clip": 1.03557014, "balance_loss_mlp": 1.02979374, "epoch": 0.05939998797571094, "flos": 18259499232000.0, "grad_norm": 2.267805367007421, "language_loss": 0.76658738, "learning_rate": 3.990975231405281e-06, "loss": 0.78986996, "num_input_tokens_seen": 10652485, "step": 494, "time_per_iteration": 2.6775646209716797 }, { "auxiliary_loss_clip": 0.01279573, "auxiliary_loss_mlp": 0.01063183, "balance_loss_clip": 1.0387013, "balance_loss_mlp": 1.0430243, "epoch": 0.05952023086635003, "flos": 28256598558720.0, "grad_norm": 2.193503603368784, "language_loss": 0.7864821, "learning_rate": 3.990901162562491e-06, "loss": 0.80990964, "num_input_tokens_seen": 10673175, "step": 495, "time_per_iteration": 2.7757959365844727 }, { "auxiliary_loss_clip": 0.01270882, "auxiliary_loss_mlp": 0.01132118, "balance_loss_clip": 0.95495307, "balance_loss_mlp": 0.0, "epoch": 0.05964047375698912, "flos": 14902498045440.0, "grad_norm": 1.9379328795870006, "language_loss": 0.90678549, "learning_rate": 3.9908267917015765e-06, "loss": 0.93081558, "num_input_tokens_seen": 10691235, "step": 496, "time_per_iteration": 2.7437403202056885 }, { "auxiliary_loss_clip": 0.01262293, "auxiliary_loss_mlp": 0.01057195, "balance_loss_clip": 1.03276896, "balance_loss_mlp": 1.03816915, "epoch": 0.059760716647628206, "flos": 23185581206400.0, "grad_norm": 2.2845317770526012, "language_loss": 0.93289876, "learning_rate": 3.990752118833821e-06, "loss": 0.95609355, "num_input_tokens_seen": 10708675, "step": 497, "time_per_iteration": 2.7001490592956543 }, { "auxiliary_loss_clip": 0.01285025, "auxiliary_loss_mlp": 0.01038578, "balance_loss_clip": 1.07892907, "balance_loss_mlp": 1.01988566, "epoch": 0.0598809595382673, "flos": 22746968231040.0, "grad_norm": 1.9167489419175017, "language_loss": 0.78112042, "learning_rate": 3.990677143970553e-06, "loss": 0.80435646, "num_input_tokens_seen": 10729485, "step": 498, "time_per_iteration": 2.723466157913208 }, { "auxiliary_loss_clip": 0.01269427, "auxiliary_loss_mlp": 0.01056917, "balance_loss_clip": 0.96327353, "balance_loss_mlp": 1.03660393, "epoch": 0.06000120242890639, "flos": 22127221946880.0, "grad_norm": 1.8894253783127366, "language_loss": 0.81156731, "learning_rate": 3.990601867123144e-06, "loss": 0.83483076, "num_input_tokens_seen": 10749210, "step": 499, "time_per_iteration": 2.7631378173828125 }, { "auxiliary_loss_clip": 0.01268069, "auxiliary_loss_mlp": 0.01047796, "balance_loss_clip": 0.91844475, "balance_loss_mlp": 1.0289731, "epoch": 0.06012144531954548, "flos": 19171773878400.0, "grad_norm": 2.3854529174528376, "language_loss": 0.84695786, "learning_rate": 3.990526288303014e-06, "loss": 0.87011653, "num_input_tokens_seen": 10768000, "step": 500, "time_per_iteration": 2.736171245574951 }, { "auxiliary_loss_clip": 0.01276268, "auxiliary_loss_mlp": 0.01131453, "balance_loss_clip": 0.99991918, "balance_loss_mlp": 0.0, "epoch": 0.06024168821018457, "flos": 22783345729920.0, "grad_norm": 1.6604436269932343, "language_loss": 0.9049753, "learning_rate": 3.9904504075216295e-06, "loss": 0.92905247, "num_input_tokens_seen": 10788760, "step": 501, "time_per_iteration": 2.7023308277130127 }, { "auxiliary_loss_clip": 0.01271712, "auxiliary_loss_mlp": 0.01050962, "balance_loss_clip": 0.95531154, "balance_loss_mlp": 1.03079212, "epoch": 0.06036193110082366, "flos": 18770687637120.0, "grad_norm": 2.470127922383296, "language_loss": 0.93880343, "learning_rate": 3.990374224790501e-06, "loss": 0.96203017, "num_input_tokens_seen": 10806965, "step": 502, "time_per_iteration": 2.7062010765075684 }, { "auxiliary_loss_clip": 0.01279723, "auxiliary_loss_mlp": 0.01063959, "balance_loss_clip": 1.00225151, "balance_loss_mlp": 1.04479027, "epoch": 0.06048217399146275, "flos": 17201570935680.0, "grad_norm": 2.03539939871987, "language_loss": 0.71017963, "learning_rate": 3.990297740121185e-06, "loss": 0.73361647, "num_input_tokens_seen": 10824900, "step": 503, "time_per_iteration": 2.648792028427124 }, { "auxiliary_loss_clip": 0.01284458, "auxiliary_loss_mlp": 0.01131683, "balance_loss_clip": 1.04154551, "balance_loss_mlp": 0.0, "epoch": 0.06060241688210185, "flos": 24024131187840.0, "grad_norm": 2.0481192586115275, "language_loss": 0.78559858, "learning_rate": 3.990220953525284e-06, "loss": 0.80975997, "num_input_tokens_seen": 10842010, "step": 504, "time_per_iteration": 2.6969640254974365 }, { "auxiliary_loss_clip": 0.01254814, "auxiliary_loss_mlp": 0.01054167, "balance_loss_clip": 0.99069679, "balance_loss_mlp": 1.03540373, "epoch": 0.06072265977274094, "flos": 14611190745600.0, "grad_norm": 2.571494418419231, "language_loss": 0.74112177, "learning_rate": 3.9901438650144465e-06, "loss": 0.76421154, "num_input_tokens_seen": 10858260, "step": 505, "time_per_iteration": 2.6279942989349365 }, { "auxiliary_loss_clip": 0.01259731, "auxiliary_loss_mlp": 0.01051801, "balance_loss_clip": 1.03105772, "balance_loss_mlp": 1.03312063, "epoch": 0.06084290266338003, "flos": 20558284813440.0, "grad_norm": 2.873482339334522, "language_loss": 0.91554391, "learning_rate": 3.990066474600367e-06, "loss": 0.93865925, "num_input_tokens_seen": 10876230, "step": 506, "time_per_iteration": 2.703270196914673 }, { "auxiliary_loss_clip": 0.01261346, "auxiliary_loss_mlp": 0.01046162, "balance_loss_clip": 1.03143907, "balance_loss_mlp": 1.02741075, "epoch": 0.06096314555401912, "flos": 22309217182080.0, "grad_norm": 1.7468184933794686, "language_loss": 0.67797267, "learning_rate": 3.989988782294786e-06, "loss": 0.70104778, "num_input_tokens_seen": 10896320, "step": 507, "time_per_iteration": 2.6488122940063477 }, { "auxiliary_loss_clip": 0.01252567, "auxiliary_loss_mlp": 0.0105611, "balance_loss_clip": 0.95663768, "balance_loss_mlp": 1.03733468, "epoch": 0.06108338844465821, "flos": 19131374056320.0, "grad_norm": 1.792277880000664, "language_loss": 0.94882923, "learning_rate": 3.989910788109489e-06, "loss": 0.97191596, "num_input_tokens_seen": 10912970, "step": 508, "time_per_iteration": 2.7236690521240234 }, { "auxiliary_loss_clip": 0.01274949, "auxiliary_loss_mlp": 0.01046753, "balance_loss_clip": 0.96100748, "balance_loss_mlp": 1.02763224, "epoch": 0.0612036313352973, "flos": 33584018169600.0, "grad_norm": 2.2923910342863283, "language_loss": 0.74738884, "learning_rate": 3.989832492056307e-06, "loss": 0.77060592, "num_input_tokens_seen": 10933995, "step": 509, "time_per_iteration": 2.8186583518981934 }, { "auxiliary_loss_clip": 0.01281792, "auxiliary_loss_mlp": 0.01057755, "balance_loss_clip": 1.04124355, "balance_loss_mlp": 1.03821659, "epoch": 0.06132387422593639, "flos": 27490552179840.0, "grad_norm": 7.9755203698380495, "language_loss": 0.80971313, "learning_rate": 3.989753894147119e-06, "loss": 0.8331086, "num_input_tokens_seen": 10954120, "step": 510, "time_per_iteration": 2.72200345993042 }, { "auxiliary_loss_clip": 0.01274951, "auxiliary_loss_mlp": 0.01059654, "balance_loss_clip": 1.04243469, "balance_loss_mlp": 1.04083109, "epoch": 0.061444117116575485, "flos": 25885057979520.0, "grad_norm": 2.112161783143218, "language_loss": 0.80097556, "learning_rate": 3.989674994393846e-06, "loss": 0.82432157, "num_input_tokens_seen": 10973595, "step": 511, "time_per_iteration": 2.7261440753936768 }, { "auxiliary_loss_clip": 0.01279843, "auxiliary_loss_mlp": 0.01058778, "balance_loss_clip": 1.04237604, "balance_loss_mlp": 1.03984809, "epoch": 0.061564360007214575, "flos": 28512031150080.0, "grad_norm": 3.0142453660137707, "language_loss": 0.93749958, "learning_rate": 3.98959579280846e-06, "loss": 0.96088576, "num_input_tokens_seen": 10991995, "step": 512, "time_per_iteration": 3.6707425117492676 }, { "auxiliary_loss_clip": 0.01256556, "auxiliary_loss_mlp": 0.0105604, "balance_loss_clip": 0.92350674, "balance_loss_mlp": 1.03693068, "epoch": 0.061684602897853665, "flos": 12094355652480.0, "grad_norm": 2.1068346404294456, "language_loss": 0.82800412, "learning_rate": 3.989516289402973e-06, "loss": 0.85113007, "num_input_tokens_seen": 11007625, "step": 513, "time_per_iteration": 4.6387481689453125 }, { "auxiliary_loss_clip": 0.01234211, "auxiliary_loss_mlp": 0.01052964, "balance_loss_clip": 0.87111652, "balance_loss_mlp": 1.03337824, "epoch": 0.061804845788492754, "flos": 19532639865600.0, "grad_norm": 2.345085022584433, "language_loss": 0.80318236, "learning_rate": 3.989436484189447e-06, "loss": 0.8260541, "num_input_tokens_seen": 11025570, "step": 514, "time_per_iteration": 2.902627944946289 }, { "auxiliary_loss_clip": 0.01281468, "auxiliary_loss_mlp": 0.01056247, "balance_loss_clip": 1.03663647, "balance_loss_mlp": 1.03654182, "epoch": 0.061925088679131844, "flos": 15341111020800.0, "grad_norm": 2.4702459244756243, "language_loss": 0.80274349, "learning_rate": 3.9893563771799885e-06, "loss": 0.82612062, "num_input_tokens_seen": 11042045, "step": 515, "time_per_iteration": 3.729170560836792 }, { "auxiliary_loss_clip": 0.01281956, "auxiliary_loss_mlp": 0.0105468, "balance_loss_clip": 1.07578242, "balance_loss_mlp": 1.03541529, "epoch": 0.062045331569770934, "flos": 25919927107200.0, "grad_norm": 2.0256878272231265, "language_loss": 0.86454415, "learning_rate": 3.989275968386749e-06, "loss": 0.8879106, "num_input_tokens_seen": 11059955, "step": 516, "time_per_iteration": 2.5740036964416504 }, { "auxiliary_loss_clip": 0.01264674, "auxiliary_loss_mlp": 0.01057771, "balance_loss_clip": 0.99592185, "balance_loss_mlp": 1.03791082, "epoch": 0.06216557446041003, "flos": 28110621686400.0, "grad_norm": 2.124715361557393, "language_loss": 0.76693928, "learning_rate": 3.989195257821926e-06, "loss": 0.79016376, "num_input_tokens_seen": 11078440, "step": 517, "time_per_iteration": 2.6528217792510986 }, { "auxiliary_loss_clip": 0.0127572, "auxiliary_loss_mlp": 0.01058851, "balance_loss_clip": 1.00316978, "balance_loss_mlp": 1.03933668, "epoch": 0.06228581735104912, "flos": 23478181395840.0, "grad_norm": 2.1098291220312273, "language_loss": 0.84551919, "learning_rate": 3.989114245497765e-06, "loss": 0.86886489, "num_input_tokens_seen": 11098240, "step": 518, "time_per_iteration": 2.6632275581359863 }, { "auxiliary_loss_clip": 0.01280283, "auxiliary_loss_mlp": 0.01057537, "balance_loss_clip": 1.03536725, "balance_loss_mlp": 1.03853452, "epoch": 0.06240606024168821, "flos": 15195205975680.0, "grad_norm": 2.176624623840006, "language_loss": 0.94645107, "learning_rate": 3.989032931426554e-06, "loss": 0.96982926, "num_input_tokens_seen": 11115395, "step": 519, "time_per_iteration": 2.6655571460723877 }, { "auxiliary_loss_clip": 0.01273829, "auxiliary_loss_mlp": 0.01056987, "balance_loss_clip": 0.99979645, "balance_loss_mlp": 1.03768671, "epoch": 0.06252630313232731, "flos": 20631829910400.0, "grad_norm": 1.909211867609446, "language_loss": 0.8675456, "learning_rate": 3.9889513156206295e-06, "loss": 0.8908537, "num_input_tokens_seen": 11134835, "step": 520, "time_per_iteration": 2.677464485168457 }, { "auxiliary_loss_clip": 0.01274485, "auxiliary_loss_mlp": 0.01049778, "balance_loss_clip": 0.96077573, "balance_loss_mlp": 1.03034651, "epoch": 0.06264654602296639, "flos": 20778058177920.0, "grad_norm": 3.230855715869339, "language_loss": 0.73550886, "learning_rate": 3.988869398092371e-06, "loss": 0.75875151, "num_input_tokens_seen": 11154745, "step": 521, "time_per_iteration": 2.8531460762023926 }, { "auxiliary_loss_clip": 0.01273767, "auxiliary_loss_mlp": 0.01057811, "balance_loss_clip": 0.99728054, "balance_loss_mlp": 1.0376054, "epoch": 0.06276678891360549, "flos": 29605798241280.0, "grad_norm": 2.3252545460340968, "language_loss": 0.78787816, "learning_rate": 3.988787178854206e-06, "loss": 0.81119394, "num_input_tokens_seen": 11174280, "step": 522, "time_per_iteration": 2.7476143836975098 }, { "auxiliary_loss_clip": 0.01281658, "auxiliary_loss_mlp": 0.01057153, "balance_loss_clip": 1.07846451, "balance_loss_mlp": 1.03798461, "epoch": 0.06288703180424457, "flos": 22126288193280.0, "grad_norm": 2.1294095901862025, "language_loss": 0.87443721, "learning_rate": 3.988704657918608e-06, "loss": 0.89782536, "num_input_tokens_seen": 11193340, "step": 523, "time_per_iteration": 2.6723520755767822 }, { "auxiliary_loss_clip": 0.01285121, "auxiliary_loss_mlp": 0.0105774, "balance_loss_clip": 1.04459357, "balance_loss_mlp": 1.0391438, "epoch": 0.06300727469488367, "flos": 14976689587200.0, "grad_norm": 2.2168488352674136, "language_loss": 0.79736221, "learning_rate": 3.988621835298094e-06, "loss": 0.82079089, "num_input_tokens_seen": 11210555, "step": 524, "time_per_iteration": 2.6518588066101074 }, { "auxiliary_loss_clip": 0.01277456, "auxiliary_loss_mlp": 0.01060154, "balance_loss_clip": 1.07794356, "balance_loss_mlp": 1.04056764, "epoch": 0.06312751758552275, "flos": 24535391420160.0, "grad_norm": 2.122504359350591, "language_loss": 0.91595417, "learning_rate": 3.988538711005229e-06, "loss": 0.93933028, "num_input_tokens_seen": 11230010, "step": 525, "time_per_iteration": 2.6924757957458496 }, { "auxiliary_loss_clip": 0.01275996, "auxiliary_loss_mlp": 0.01049369, "balance_loss_clip": 1.03977132, "balance_loss_mlp": 1.03003335, "epoch": 0.06324776047616185, "flos": 21507008785920.0, "grad_norm": 2.2265329985639513, "language_loss": 0.88030422, "learning_rate": 3.988455285052622e-06, "loss": 0.90355784, "num_input_tokens_seen": 11246190, "step": 526, "time_per_iteration": 2.713550329208374 }, { "auxiliary_loss_clip": 0.0127586, "auxiliary_loss_mlp": 0.01051265, "balance_loss_clip": 1.04067862, "balance_loss_mlp": 1.03190541, "epoch": 0.06336800336680094, "flos": 21688034353920.0, "grad_norm": 2.1905602930763566, "language_loss": 0.83840597, "learning_rate": 3.98837155745293e-06, "loss": 0.86167723, "num_input_tokens_seen": 11264230, "step": 527, "time_per_iteration": 2.700380802154541 }, { "auxiliary_loss_clip": 0.01276298, "auxiliary_loss_mlp": 0.01043576, "balance_loss_clip": 1.03983355, "balance_loss_mlp": 1.0249083, "epoch": 0.06348824625744003, "flos": 19500895221120.0, "grad_norm": 2.098450209863753, "language_loss": 0.76441294, "learning_rate": 3.988287528218854e-06, "loss": 0.78761172, "num_input_tokens_seen": 11283015, "step": 528, "time_per_iteration": 2.649432420730591 }, { "auxiliary_loss_clip": 0.01274562, "auxiliary_loss_mlp": 0.01052177, "balance_loss_clip": 1.03983831, "balance_loss_mlp": 1.03297257, "epoch": 0.06360848914807912, "flos": 15481233976320.0, "grad_norm": 2.282833176071588, "language_loss": 0.90441424, "learning_rate": 3.98820319736314e-06, "loss": 0.92768162, "num_input_tokens_seen": 11299630, "step": 529, "time_per_iteration": 2.6621100902557373 }, { "auxiliary_loss_clip": 0.0126677, "auxiliary_loss_mlp": 0.0106024, "balance_loss_clip": 0.95634341, "balance_loss_mlp": 1.04113066, "epoch": 0.0637287320387182, "flos": 20593369422720.0, "grad_norm": 1.9612274333375461, "language_loss": 0.85335904, "learning_rate": 3.988118564898582e-06, "loss": 0.87662911, "num_input_tokens_seen": 11319170, "step": 530, "time_per_iteration": 2.697397232055664 }, { "auxiliary_loss_clip": 0.01259491, "auxiliary_loss_mlp": 0.01132006, "balance_loss_clip": 0.95692277, "balance_loss_mlp": 0.0, "epoch": 0.0638489749293573, "flos": 17412222245760.0, "grad_norm": 2.308663529522769, "language_loss": 0.89439023, "learning_rate": 3.988033630838019e-06, "loss": 0.91830528, "num_input_tokens_seen": 11333210, "step": 531, "time_per_iteration": 2.680802822113037 }, { "auxiliary_loss_clip": 0.01278869, "auxiliary_loss_mlp": 0.01058405, "balance_loss_clip": 1.04000556, "balance_loss_mlp": 1.03899741, "epoch": 0.0639692178199964, "flos": 23807661874560.0, "grad_norm": 2.113047774960346, "language_loss": 0.88305807, "learning_rate": 3.987948395194334e-06, "loss": 0.90643084, "num_input_tokens_seen": 11355590, "step": 532, "time_per_iteration": 2.7365224361419678 }, { "auxiliary_loss_clip": 0.01267202, "auxiliary_loss_mlp": 0.01062358, "balance_loss_clip": 1.03632641, "balance_loss_mlp": 1.04339194, "epoch": 0.06408946071063548, "flos": 18477225521280.0, "grad_norm": 2.0530113391745237, "language_loss": 0.76766348, "learning_rate": 3.987862857980458e-06, "loss": 0.79095918, "num_input_tokens_seen": 11371535, "step": 533, "time_per_iteration": 2.660155773162842 }, { "auxiliary_loss_clip": 0.01275059, "auxiliary_loss_mlp": 0.01047262, "balance_loss_clip": 0.96018094, "balance_loss_mlp": 1.02932119, "epoch": 0.06420970360127458, "flos": 27162220936320.0, "grad_norm": 2.5332070375220113, "language_loss": 0.76568508, "learning_rate": 3.987777019209368e-06, "loss": 0.7889083, "num_input_tokens_seen": 11392050, "step": 534, "time_per_iteration": 2.783735513687134 }, { "auxiliary_loss_clip": 0.01278577, "auxiliary_loss_mlp": 0.01055191, "balance_loss_clip": 1.07803082, "balance_loss_mlp": 1.03663015, "epoch": 0.06432994649191366, "flos": 23659673840640.0, "grad_norm": 1.638979312110473, "language_loss": 0.80933321, "learning_rate": 3.987690878894084e-06, "loss": 0.83267093, "num_input_tokens_seen": 11411765, "step": 535, "time_per_iteration": 2.7125532627105713 }, { "auxiliary_loss_clip": 0.01270601, "auxiliary_loss_mlp": 0.0106186, "balance_loss_clip": 0.99616557, "balance_loss_mlp": 1.04271531, "epoch": 0.06445018938255276, "flos": 23403953940480.0, "grad_norm": 2.8913533640874576, "language_loss": 0.85381007, "learning_rate": 3.987604437047673e-06, "loss": 0.87713468, "num_input_tokens_seen": 11431565, "step": 536, "time_per_iteration": 2.703601598739624 }, { "auxiliary_loss_clip": 0.01274843, "auxiliary_loss_mlp": 0.01055097, "balance_loss_clip": 1.03924429, "balance_loss_mlp": 1.03483188, "epoch": 0.06457043227319184, "flos": 19646692525440.0, "grad_norm": 2.0963488985434537, "language_loss": 0.77770376, "learning_rate": 3.987517693683251e-06, "loss": 0.80100322, "num_input_tokens_seen": 11450140, "step": 537, "time_per_iteration": 2.647484302520752 }, { "auxiliary_loss_clip": 0.0127055, "auxiliary_loss_mlp": 0.01055255, "balance_loss_clip": 1.00103998, "balance_loss_mlp": 1.03619361, "epoch": 0.06469067516383094, "flos": 16978744915200.0, "grad_norm": 2.5631178877244554, "language_loss": 0.96346414, "learning_rate": 3.9874306488139745e-06, "loss": 0.98672211, "num_input_tokens_seen": 11465400, "step": 538, "time_per_iteration": 2.687753200531006 }, { "auxiliary_loss_clip": 0.01265161, "auxiliary_loss_mlp": 0.01052719, "balance_loss_clip": 0.9591713, "balance_loss_mlp": 1.03482568, "epoch": 0.06481091805447003, "flos": 23296401642240.0, "grad_norm": 2.122900419592608, "language_loss": 0.87995064, "learning_rate": 3.987343302453049e-06, "loss": 0.90312946, "num_input_tokens_seen": 11486675, "step": 539, "time_per_iteration": 5.4813220500946045 }, { "auxiliary_loss_clip": 0.01273618, "auxiliary_loss_mlp": 0.01054131, "balance_loss_clip": 1.00029325, "balance_loss_mlp": 1.03518856, "epoch": 0.06493116094510912, "flos": 29172356824320.0, "grad_norm": 1.690698524510862, "language_loss": 0.82531697, "learning_rate": 3.987255654613724e-06, "loss": 0.84859443, "num_input_tokens_seen": 11510440, "step": 540, "time_per_iteration": 2.7343897819519043 }, { "auxiliary_loss_clip": 0.01261564, "auxiliary_loss_mlp": 0.01066856, "balance_loss_clip": 0.95581239, "balance_loss_mlp": 1.04778242, "epoch": 0.06505140383574821, "flos": 19865065259520.0, "grad_norm": 2.668717955668562, "language_loss": 0.70487463, "learning_rate": 3.987167705309296e-06, "loss": 0.72815883, "num_input_tokens_seen": 11529715, "step": 541, "time_per_iteration": 2.7462780475616455 }, { "auxiliary_loss_clip": 0.01277892, "auxiliary_loss_mlp": 0.01131315, "balance_loss_clip": 1.03843963, "balance_loss_mlp": 0.0, "epoch": 0.0651716467263873, "flos": 17924703540480.0, "grad_norm": 2.1484646099406133, "language_loss": 0.95484716, "learning_rate": 3.987079454553108e-06, "loss": 0.97893918, "num_input_tokens_seen": 11547665, "step": 542, "time_per_iteration": 3.654343366622925 }, { "auxiliary_loss_clip": 0.01266925, "auxiliary_loss_mlp": 0.01059534, "balance_loss_clip": 0.96420902, "balance_loss_mlp": 1.04117608, "epoch": 0.0652918896170264, "flos": 20842840356480.0, "grad_norm": 1.7714274681691795, "language_loss": 0.91442716, "learning_rate": 3.986990902358546e-06, "loss": 0.93769181, "num_input_tokens_seen": 11564605, "step": 543, "time_per_iteration": 2.7294673919677734 }, { "auxiliary_loss_clip": 0.01272321, "auxiliary_loss_mlp": 0.01046567, "balance_loss_clip": 1.03380203, "balance_loss_mlp": 1.02830434, "epoch": 0.06541213250766549, "flos": 21872507627520.0, "grad_norm": 1.924220610605731, "language_loss": 0.93241823, "learning_rate": 3.986902048739045e-06, "loss": 0.95560718, "num_input_tokens_seen": 11584550, "step": 544, "time_per_iteration": 2.7034099102020264 }, { "auxiliary_loss_clip": 0.01272213, "auxiliary_loss_mlp": 0.01059491, "balance_loss_clip": 0.99630004, "balance_loss_mlp": 1.04017854, "epoch": 0.06553237539830457, "flos": 23110743219840.0, "grad_norm": 2.7792942635942803, "language_loss": 0.79875314, "learning_rate": 3.986812893708082e-06, "loss": 0.82207012, "num_input_tokens_seen": 11600740, "step": 545, "time_per_iteration": 2.6954257488250732 }, { "auxiliary_loss_clip": 0.01273445, "auxiliary_loss_mlp": 0.01054804, "balance_loss_clip": 0.99554408, "balance_loss_mlp": 1.0354805, "epoch": 0.06565261828894367, "flos": 17923769786880.0, "grad_norm": 1.9595664927496117, "language_loss": 0.81360817, "learning_rate": 3.9867234372791826e-06, "loss": 0.83689064, "num_input_tokens_seen": 11618695, "step": 546, "time_per_iteration": 2.7684597969055176 }, { "auxiliary_loss_clip": 0.01272852, "auxiliary_loss_mlp": 0.01057854, "balance_loss_clip": 1.03790534, "balance_loss_mlp": 1.03962708, "epoch": 0.06577286117958275, "flos": 22783058421120.0, "grad_norm": 1.8245929045752847, "language_loss": 0.87145901, "learning_rate": 3.986633679465918e-06, "loss": 0.89476609, "num_input_tokens_seen": 11638850, "step": 547, "time_per_iteration": 2.6678011417388916 }, { "auxiliary_loss_clip": 0.01266226, "auxiliary_loss_mlp": 0.01062598, "balance_loss_clip": 0.92113721, "balance_loss_mlp": 1.04459763, "epoch": 0.06589310407022185, "flos": 23696194993920.0, "grad_norm": 2.46642589572149, "language_loss": 0.80786037, "learning_rate": 3.986543620281904e-06, "loss": 0.83114862, "num_input_tokens_seen": 11658500, "step": 548, "time_per_iteration": 2.779505729675293 }, { "auxiliary_loss_clip": 0.01254404, "auxiliary_loss_mlp": 0.01055645, "balance_loss_clip": 0.99602407, "balance_loss_mlp": 1.03689384, "epoch": 0.06601334696086093, "flos": 26864772410880.0, "grad_norm": 1.6606405912786484, "language_loss": 0.91061419, "learning_rate": 3.986453259740802e-06, "loss": 0.93371463, "num_input_tokens_seen": 11676670, "step": 549, "time_per_iteration": 2.7766547203063965 }, { "auxiliary_loss_clip": 0.01265424, "auxiliary_loss_mlp": 0.01066172, "balance_loss_clip": 0.99678028, "balance_loss_mlp": 1.04824305, "epoch": 0.06613358985150003, "flos": 12567694101120.0, "grad_norm": 4.341202921219288, "language_loss": 0.78653961, "learning_rate": 3.986362597856319e-06, "loss": 0.80985558, "num_input_tokens_seen": 11693170, "step": 550, "time_per_iteration": 2.6609175205230713 }, { "auxiliary_loss_clip": 0.01259502, "auxiliary_loss_mlp": 0.01131596, "balance_loss_clip": 0.99305201, "balance_loss_mlp": 0.0, "epoch": 0.06625383274213913, "flos": 18332505624960.0, "grad_norm": 3.4614729289014186, "language_loss": 0.81707644, "learning_rate": 3.986271634642211e-06, "loss": 0.84098744, "num_input_tokens_seen": 11710150, "step": 551, "time_per_iteration": 2.7145276069641113 }, { "auxiliary_loss_clip": 0.01276413, "auxiliary_loss_mlp": 0.0105936, "balance_loss_clip": 1.07827437, "balance_loss_mlp": 1.0418725, "epoch": 0.06637407563277821, "flos": 15375585098880.0, "grad_norm": 2.1610577539357823, "language_loss": 0.81492537, "learning_rate": 3.986180370112274e-06, "loss": 0.83828306, "num_input_tokens_seen": 11726670, "step": 552, "time_per_iteration": 2.6236865520477295 }, { "auxiliary_loss_clip": 0.01273803, "auxiliary_loss_mlp": 0.01131982, "balance_loss_clip": 1.03561473, "balance_loss_mlp": 0.0, "epoch": 0.0664943185234173, "flos": 24025244509440.0, "grad_norm": 1.9577633708620472, "language_loss": 0.74641258, "learning_rate": 3.986088804280354e-06, "loss": 0.77047038, "num_input_tokens_seen": 11746400, "step": 553, "time_per_iteration": 2.704913377761841 }, { "auxiliary_loss_clip": 0.01271584, "auxiliary_loss_mlp": 0.01061225, "balance_loss_clip": 0.99772787, "balance_loss_mlp": 1.04323626, "epoch": 0.06661456141405639, "flos": 20957503547520.0, "grad_norm": 2.062349418769638, "language_loss": 0.93747723, "learning_rate": 3.985996937160342e-06, "loss": 0.96080524, "num_input_tokens_seen": 11765590, "step": 554, "time_per_iteration": 2.7048192024230957 }, { "auxiliary_loss_clip": 0.01268972, "auxiliary_loss_mlp": 0.01049413, "balance_loss_clip": 1.03696573, "balance_loss_mlp": 1.03190148, "epoch": 0.06673480430469549, "flos": 52223953322880.0, "grad_norm": 2.592974340006, "language_loss": 0.69179541, "learning_rate": 3.985904768766173e-06, "loss": 0.71497929, "num_input_tokens_seen": 11788365, "step": 555, "time_per_iteration": 2.972572088241577 }, { "auxiliary_loss_clip": 0.01270718, "auxiliary_loss_mlp": 0.01051646, "balance_loss_clip": 0.95898867, "balance_loss_mlp": 1.03246582, "epoch": 0.06685504719533458, "flos": 16217079995520.0, "grad_norm": 2.7008785486443454, "language_loss": 0.76017147, "learning_rate": 3.98581229911183e-06, "loss": 0.78339511, "num_input_tokens_seen": 11807285, "step": 556, "time_per_iteration": 2.7524077892303467 }, { "auxiliary_loss_clip": 0.01270371, "auxiliary_loss_mlp": 0.01044654, "balance_loss_clip": 1.03318751, "balance_loss_mlp": 1.02615285, "epoch": 0.06697529008597367, "flos": 22491535639680.0, "grad_norm": 1.8060514223799562, "language_loss": 0.92221379, "learning_rate": 3.985719528211341e-06, "loss": 0.945364, "num_input_tokens_seen": 11826655, "step": 557, "time_per_iteration": 2.7282683849334717 }, { "auxiliary_loss_clip": 0.01199405, "auxiliary_loss_mlp": 0.01016499, "balance_loss_clip": 0.99291122, "balance_loss_mlp": 1.00343406, "epoch": 0.06709553297661276, "flos": 62688216936960.0, "grad_norm": 0.8471440201801419, "language_loss": 0.62984955, "learning_rate": 3.985626456078777e-06, "loss": 0.65200859, "num_input_tokens_seen": 11891310, "step": 558, "time_per_iteration": 3.3663899898529053 }, { "auxiliary_loss_clip": 0.01271397, "auxiliary_loss_mlp": 0.0105224, "balance_loss_clip": 0.96103847, "balance_loss_mlp": 1.03432262, "epoch": 0.06721577586725185, "flos": 11216590997760.0, "grad_norm": 2.133844573517153, "language_loss": 0.86358899, "learning_rate": 3.985533082728259e-06, "loss": 0.88682532, "num_input_tokens_seen": 11906965, "step": 559, "time_per_iteration": 2.6734445095062256 }, { "auxiliary_loss_clip": 0.01276563, "auxiliary_loss_mlp": 0.01050487, "balance_loss_clip": 1.07468557, "balance_loss_mlp": 1.03195012, "epoch": 0.06733601875789094, "flos": 25922189664000.0, "grad_norm": 1.7230957008680543, "language_loss": 0.74649996, "learning_rate": 3.985439408173951e-06, "loss": 0.7697705, "num_input_tokens_seen": 11927190, "step": 560, "time_per_iteration": 2.6941797733306885 }, { "auxiliary_loss_clip": 0.01280297, "auxiliary_loss_mlp": 0.0105687, "balance_loss_clip": 1.07776129, "balance_loss_mlp": 1.03745079, "epoch": 0.06745626164853002, "flos": 20813645577600.0, "grad_norm": 2.106075874080189, "language_loss": 0.70950603, "learning_rate": 3.9853454324300634e-06, "loss": 0.73287773, "num_input_tokens_seen": 11946400, "step": 561, "time_per_iteration": 2.652513027191162 }, { "auxiliary_loss_clip": 0.01265225, "auxiliary_loss_mlp": 0.01053041, "balance_loss_clip": 0.87631953, "balance_loss_mlp": 1.03456354, "epoch": 0.06757650453916912, "flos": 19829262378240.0, "grad_norm": 1.9481619545909883, "language_loss": 0.77670801, "learning_rate": 3.985251155510852e-06, "loss": 0.7998907, "num_input_tokens_seen": 11965430, "step": 562, "time_per_iteration": 2.9320785999298096 }, { "auxiliary_loss_clip": 0.01264613, "auxiliary_loss_mlp": 0.01054448, "balance_loss_clip": 0.92177546, "balance_loss_mlp": 1.03593493, "epoch": 0.06769674742980822, "flos": 25739224761600.0, "grad_norm": 1.7578487542944237, "language_loss": 0.80322754, "learning_rate": 3.98515657743062e-06, "loss": 0.82641816, "num_input_tokens_seen": 11984895, "step": 563, "time_per_iteration": 2.9694786071777344 }, { "auxiliary_loss_clip": 0.01270941, "auxiliary_loss_mlp": 0.01057132, "balance_loss_clip": 0.99721789, "balance_loss_mlp": 1.0387615, "epoch": 0.0678169903204473, "flos": 13074788355840.0, "grad_norm": 1.936552984724245, "language_loss": 0.77663386, "learning_rate": 3.985061698203711e-06, "loss": 0.7999146, "num_input_tokens_seen": 12002010, "step": 564, "time_per_iteration": 3.5148019790649414 }, { "auxiliary_loss_clip": 0.01191273, "auxiliary_loss_mlp": 0.0101508, "balance_loss_clip": 1.06732583, "balance_loss_mlp": 1.00220537, "epoch": 0.0679372332110864, "flos": 70865830788480.0, "grad_norm": 0.8857372058128475, "language_loss": 0.63866729, "learning_rate": 3.984966517844523e-06, "loss": 0.66073084, "num_input_tokens_seen": 12057255, "step": 565, "time_per_iteration": 4.049649715423584 }, { "auxiliary_loss_clip": 0.01276948, "auxiliary_loss_mlp": 0.01057752, "balance_loss_clip": 1.07726073, "balance_loss_mlp": 1.03852367, "epoch": 0.06805747610172548, "flos": 28256418990720.0, "grad_norm": 2.250982452401852, "language_loss": 0.80431885, "learning_rate": 3.984871036367492e-06, "loss": 0.82766581, "num_input_tokens_seen": 12077280, "step": 566, "time_per_iteration": 3.6034345626831055 }, { "auxiliary_loss_clip": 0.01268162, "auxiliary_loss_mlp": 0.01131237, "balance_loss_clip": 1.03640747, "balance_loss_mlp": 0.0, "epoch": 0.06817771899236458, "flos": 20120533764480.0, "grad_norm": 1.7542902757677798, "language_loss": 0.82931501, "learning_rate": 3.984775253787102e-06, "loss": 0.85330898, "num_input_tokens_seen": 12095570, "step": 567, "time_per_iteration": 2.679945945739746 }, { "auxiliary_loss_clip": 0.01274382, "auxiliary_loss_mlp": 0.01059005, "balance_loss_clip": 1.03535175, "balance_loss_mlp": 1.04090905, "epoch": 0.06829796188300366, "flos": 17930629284480.0, "grad_norm": 8.042297312734624, "language_loss": 0.88222289, "learning_rate": 3.984679170117885e-06, "loss": 0.90555674, "num_input_tokens_seen": 12111775, "step": 568, "time_per_iteration": 3.655757188796997 }, { "auxiliary_loss_clip": 0.0127001, "auxiliary_loss_mlp": 0.01051368, "balance_loss_clip": 1.03664637, "balance_loss_mlp": 1.03317654, "epoch": 0.06841820477364276, "flos": 14501627285760.0, "grad_norm": 2.3527824218360562, "language_loss": 0.78751612, "learning_rate": 3.984582785374415e-06, "loss": 0.81072986, "num_input_tokens_seen": 12129215, "step": 569, "time_per_iteration": 2.707244396209717 }, { "auxiliary_loss_clip": 0.01268892, "auxiliary_loss_mlp": 0.01131091, "balance_loss_clip": 0.99909163, "balance_loss_mlp": 0.0, "epoch": 0.06853844766428185, "flos": 21938474954880.0, "grad_norm": 1.851809853413032, "language_loss": 0.80626488, "learning_rate": 3.9844860995713155e-06, "loss": 0.83026469, "num_input_tokens_seen": 12148755, "step": 570, "time_per_iteration": 2.7857816219329834 }, { "auxiliary_loss_clip": 0.01276136, "auxiliary_loss_mlp": 0.01048699, "balance_loss_clip": 1.04361701, "balance_loss_mlp": 1.03097224, "epoch": 0.06865869055492094, "flos": 16800628348800.0, "grad_norm": 6.72311540479711, "language_loss": 0.8265155, "learning_rate": 3.9843891127232524e-06, "loss": 0.84976387, "num_input_tokens_seen": 12166290, "step": 571, "time_per_iteration": 2.754481792449951 }, { "auxiliary_loss_clip": 0.01254191, "auxiliary_loss_mlp": 0.01059974, "balance_loss_clip": 0.91791427, "balance_loss_mlp": 1.04200912, "epoch": 0.06877893344556003, "flos": 19937281553280.0, "grad_norm": 2.152601226046298, "language_loss": 0.66953433, "learning_rate": 3.984291824844938e-06, "loss": 0.69267601, "num_input_tokens_seen": 12181385, "step": 572, "time_per_iteration": 2.80000901222229 }, { "auxiliary_loss_clip": 0.01275428, "auxiliary_loss_mlp": 0.01063487, "balance_loss_clip": 1.07543123, "balance_loss_mlp": 1.0456177, "epoch": 0.06889917633619912, "flos": 23039388852480.0, "grad_norm": 2.3219446792836154, "language_loss": 0.85218322, "learning_rate": 3.984194235951132e-06, "loss": 0.87557232, "num_input_tokens_seen": 12197530, "step": 573, "time_per_iteration": 2.7053534984588623 }, { "auxiliary_loss_clip": 0.01276416, "auxiliary_loss_mlp": 0.01051318, "balance_loss_clip": 1.07775617, "balance_loss_mlp": 1.03312612, "epoch": 0.06901941922683821, "flos": 20960556203520.0, "grad_norm": 3.929476574943472, "language_loss": 0.84374988, "learning_rate": 3.9840963460566375e-06, "loss": 0.86702716, "num_input_tokens_seen": 12216310, "step": 574, "time_per_iteration": 2.679130792617798 }, { "auxiliary_loss_clip": 0.01242021, "auxiliary_loss_mlp": 0.01053384, "balance_loss_clip": 0.87553948, "balance_loss_mlp": 1.03550243, "epoch": 0.06913966211747731, "flos": 24821850384000.0, "grad_norm": 1.9030850153597947, "language_loss": 0.8920331, "learning_rate": 3.983998155176305e-06, "loss": 0.91498709, "num_input_tokens_seen": 12236670, "step": 575, "time_per_iteration": 2.818049192428589 }, { "auxiliary_loss_clip": 0.0118524, "auxiliary_loss_mlp": 0.01014676, "balance_loss_clip": 1.06363583, "balance_loss_mlp": 1.00199246, "epoch": 0.06925990500811639, "flos": 58367446957440.0, "grad_norm": 0.8211551476710423, "language_loss": 0.5709604, "learning_rate": 3.9838996633250305e-06, "loss": 0.59295952, "num_input_tokens_seen": 12297185, "step": 576, "time_per_iteration": 3.1470839977264404 }, { "auxiliary_loss_clip": 0.01269864, "auxiliary_loss_mlp": 0.01065961, "balance_loss_clip": 1.03294492, "balance_loss_mlp": 1.0477581, "epoch": 0.06938014789875549, "flos": 12749940731520.0, "grad_norm": 2.430859364416376, "language_loss": 0.88435328, "learning_rate": 3.983800870517753e-06, "loss": 0.90771151, "num_input_tokens_seen": 12313975, "step": 577, "time_per_iteration": 2.711653470993042 }, { "auxiliary_loss_clip": 0.01270799, "auxiliary_loss_mlp": 0.01050024, "balance_loss_clip": 1.04081082, "balance_loss_mlp": 1.03217793, "epoch": 0.06950039078939457, "flos": 22820226019200.0, "grad_norm": 2.878226226781863, "language_loss": 0.77914679, "learning_rate": 3.983701776769463e-06, "loss": 0.80235505, "num_input_tokens_seen": 12331385, "step": 578, "time_per_iteration": 2.6943821907043457 }, { "auxiliary_loss_clip": 0.01255078, "auxiliary_loss_mlp": 0.01055814, "balance_loss_clip": 1.03419507, "balance_loss_mlp": 1.03758693, "epoch": 0.06962063368003367, "flos": 21941348042880.0, "grad_norm": 2.0525650412979908, "language_loss": 0.85644323, "learning_rate": 3.9836023820951885e-06, "loss": 0.87955213, "num_input_tokens_seen": 12350600, "step": 579, "time_per_iteration": 2.7420601844787598 }, { "auxiliary_loss_clip": 0.0125992, "auxiliary_loss_mlp": 0.0104795, "balance_loss_clip": 0.95574617, "balance_loss_mlp": 1.03016448, "epoch": 0.06974087657067275, "flos": 20706021452160.0, "grad_norm": 1.8810621697786643, "language_loss": 0.68381286, "learning_rate": 3.983502686510011e-06, "loss": 0.70689154, "num_input_tokens_seen": 12371430, "step": 580, "time_per_iteration": 2.7592196464538574 }, { "auxiliary_loss_clip": 0.01274107, "auxiliary_loss_mlp": 0.01131573, "balance_loss_clip": 1.03554249, "balance_loss_mlp": 0.0, "epoch": 0.06986111946131185, "flos": 22638230784000.0, "grad_norm": 1.9075521811176634, "language_loss": 0.73490655, "learning_rate": 3.9834026900290525e-06, "loss": 0.75896335, "num_input_tokens_seen": 12390825, "step": 581, "time_per_iteration": 2.6828911304473877 }, { "auxiliary_loss_clip": 0.01272734, "auxiliary_loss_mlp": 0.0104737, "balance_loss_clip": 1.0754329, "balance_loss_mlp": 1.02977443, "epoch": 0.06998136235195095, "flos": 26943453152640.0, "grad_norm": 2.137473529150511, "language_loss": 1.0027808, "learning_rate": 3.983302392667482e-06, "loss": 1.0259819, "num_input_tokens_seen": 12411670, "step": 582, "time_per_iteration": 2.8478379249572754 }, { "auxiliary_loss_clip": 0.01269041, "auxiliary_loss_mlp": 0.01059185, "balance_loss_clip": 1.03618681, "balance_loss_mlp": 1.04154229, "epoch": 0.07010160524259003, "flos": 22492505306880.0, "grad_norm": 1.7179950537886197, "language_loss": 0.93572509, "learning_rate": 3.983201794440517e-06, "loss": 0.95900738, "num_input_tokens_seen": 12431245, "step": 583, "time_per_iteration": 2.764920234680176 }, { "auxiliary_loss_clip": 0.01249705, "auxiliary_loss_mlp": 0.01058607, "balance_loss_clip": 0.99346697, "balance_loss_mlp": 1.04116702, "epoch": 0.07022184813322913, "flos": 18332541538560.0, "grad_norm": 1.7899757320092808, "language_loss": 0.67476285, "learning_rate": 3.9831008953634165e-06, "loss": 0.69784594, "num_input_tokens_seen": 12450535, "step": 584, "time_per_iteration": 2.7873117923736572 }, { "auxiliary_loss_clip": 0.01244419, "auxiliary_loss_mlp": 0.01050093, "balance_loss_clip": 0.91435623, "balance_loss_mlp": 1.03144908, "epoch": 0.07034209102386821, "flos": 24675550289280.0, "grad_norm": 8.462730363452577, "language_loss": 0.80982685, "learning_rate": 3.9829996954514864e-06, "loss": 0.8327719, "num_input_tokens_seen": 12469675, "step": 585, "time_per_iteration": 2.8430240154266357 }, { "auxiliary_loss_clip": 0.01259244, "auxiliary_loss_mlp": 0.01047043, "balance_loss_clip": 1.03434455, "balance_loss_mlp": 1.02956653, "epoch": 0.0704623339145073, "flos": 25995878415360.0, "grad_norm": 1.8187340095919184, "language_loss": 0.84360182, "learning_rate": 3.982898194720079e-06, "loss": 0.86666471, "num_input_tokens_seen": 12490405, "step": 586, "time_per_iteration": 2.842942476272583 }, { "auxiliary_loss_clip": 0.01260985, "auxiliary_loss_mlp": 0.0113156, "balance_loss_clip": 0.99720216, "balance_loss_mlp": 0.0, "epoch": 0.0705825768051464, "flos": 25338318088320.0, "grad_norm": 1.9480125110161914, "language_loss": 0.82694197, "learning_rate": 3.982796393184592e-06, "loss": 0.85086739, "num_input_tokens_seen": 12509485, "step": 587, "time_per_iteration": 2.771627187728882 }, { "auxiliary_loss_clip": 0.01178115, "auxiliary_loss_mlp": 0.01011035, "balance_loss_clip": 1.02006006, "balance_loss_mlp": 0.99873239, "epoch": 0.07070281969578548, "flos": 66047552507520.0, "grad_norm": 0.7931072759708363, "language_loss": 0.6264416, "learning_rate": 3.98269429086047e-06, "loss": 0.64833307, "num_input_tokens_seen": 12567325, "step": 588, "time_per_iteration": 3.2282121181488037 }, { "auxiliary_loss_clip": 0.01255088, "auxiliary_loss_mlp": 0.010536, "balance_loss_clip": 0.99550986, "balance_loss_mlp": 1.03587413, "epoch": 0.07082306258642458, "flos": 23653568528640.0, "grad_norm": 2.6537204720720275, "language_loss": 0.86163008, "learning_rate": 3.982591887763199e-06, "loss": 0.88471699, "num_input_tokens_seen": 12584785, "step": 589, "time_per_iteration": 2.7254421710968018 }, { "auxiliary_loss_clip": 0.0123322, "auxiliary_loss_mlp": 0.01045054, "balance_loss_clip": 0.94651741, "balance_loss_mlp": 1.02696955, "epoch": 0.07094330547706366, "flos": 13880049408000.0, "grad_norm": 2.4577131768341145, "language_loss": 0.81929857, "learning_rate": 3.982489183908316e-06, "loss": 0.84208131, "num_input_tokens_seen": 12601205, "step": 590, "time_per_iteration": 3.570585012435913 }, { "auxiliary_loss_clip": 0.01226166, "auxiliary_loss_mlp": 0.01052439, "balance_loss_clip": 0.86735159, "balance_loss_mlp": 1.03461766, "epoch": 0.07106354836770276, "flos": 24645098534400.0, "grad_norm": 1.7619933242426622, "language_loss": 0.84457922, "learning_rate": 3.982386179311399e-06, "loss": 0.86736524, "num_input_tokens_seen": 12621725, "step": 591, "time_per_iteration": 3.858222007751465 }, { "auxiliary_loss_clip": 0.01272758, "auxiliary_loss_mlp": 0.01051997, "balance_loss_clip": 1.03529644, "balance_loss_mlp": 1.03384185, "epoch": 0.07118379125834184, "flos": 16217223649920.0, "grad_norm": 2.3267299474648784, "language_loss": 0.87455869, "learning_rate": 3.982282873988075e-06, "loss": 0.89780617, "num_input_tokens_seen": 12639600, "step": 592, "time_per_iteration": 3.6939754486083984 }, { "auxiliary_loss_clip": 0.01264914, "auxiliary_loss_mlp": 0.01052825, "balance_loss_clip": 0.99761355, "balance_loss_mlp": 1.03630304, "epoch": 0.07130403414898094, "flos": 19719986227200.0, "grad_norm": 1.6868547852663371, "language_loss": 0.86933935, "learning_rate": 3.982179267954016e-06, "loss": 0.89251679, "num_input_tokens_seen": 12660030, "step": 593, "time_per_iteration": 2.7321505546569824 }, { "auxiliary_loss_clip": 0.01269888, "auxiliary_loss_mlp": 0.01056062, "balance_loss_clip": 1.07306051, "balance_loss_mlp": 1.03857374, "epoch": 0.07142427703962004, "flos": 21871933009920.0, "grad_norm": 2.142800266105234, "language_loss": 0.95862693, "learning_rate": 3.982075361224937e-06, "loss": 0.98188645, "num_input_tokens_seen": 12678395, "step": 594, "time_per_iteration": 3.6180832386016846 }, { "auxiliary_loss_clip": 0.01262971, "auxiliary_loss_mlp": 0.01131286, "balance_loss_clip": 1.03299487, "balance_loss_mlp": 0.0, "epoch": 0.07154451993025912, "flos": 18296595002880.0, "grad_norm": 1.9780981944196234, "language_loss": 0.88273835, "learning_rate": 3.981971153816602e-06, "loss": 0.906681, "num_input_tokens_seen": 12696000, "step": 595, "time_per_iteration": 2.6358699798583984 }, { "auxiliary_loss_clip": 0.01272167, "auxiliary_loss_mlp": 0.01055817, "balance_loss_clip": 1.07812285, "balance_loss_mlp": 1.03880632, "epoch": 0.07166476282089822, "flos": 22160690444160.0, "grad_norm": 1.6922984025782892, "language_loss": 0.96398252, "learning_rate": 3.981866645744819e-06, "loss": 0.98726243, "num_input_tokens_seen": 12716715, "step": 596, "time_per_iteration": 2.702413320541382 }, { "auxiliary_loss_clip": 0.01270376, "auxiliary_loss_mlp": 0.01131459, "balance_loss_clip": 1.07331204, "balance_loss_mlp": 0.0, "epoch": 0.0717850057115373, "flos": 14136343925760.0, "grad_norm": 2.4074966766514914, "language_loss": 0.81538361, "learning_rate": 3.9817618370254416e-06, "loss": 0.83940196, "num_input_tokens_seen": 12733370, "step": 597, "time_per_iteration": 2.668992280960083 }, { "auxiliary_loss_clip": 0.0126992, "auxiliary_loss_mlp": 0.01056613, "balance_loss_clip": 1.07382703, "balance_loss_mlp": 1.03957796, "epoch": 0.0719052486021764, "flos": 30917794412160.0, "grad_norm": 2.366699679469943, "language_loss": 0.8740108, "learning_rate": 3.9816567276743684e-06, "loss": 0.89727616, "num_input_tokens_seen": 12753235, "step": 598, "time_per_iteration": 2.6983084678649902 }, { "auxiliary_loss_clip": 0.01263481, "auxiliary_loss_mlp": 0.01048053, "balance_loss_clip": 0.99691492, "balance_loss_mlp": 1.030792, "epoch": 0.0720254914928155, "flos": 21287019939840.0, "grad_norm": 2.7403230835779913, "language_loss": 0.77547938, "learning_rate": 3.9815513177075466e-06, "loss": 0.79859471, "num_input_tokens_seen": 12772020, "step": 599, "time_per_iteration": 2.734607458114624 }, { "auxiliary_loss_clip": 0.01260861, "auxiliary_loss_mlp": 0.0104886, "balance_loss_clip": 1.03502643, "balance_loss_mlp": 1.03264761, "epoch": 0.07214573438345458, "flos": 27819170732160.0, "grad_norm": 1.505323681994851, "language_loss": 0.70144886, "learning_rate": 3.9814456071409646e-06, "loss": 0.72454607, "num_input_tokens_seen": 12792555, "step": 600, "time_per_iteration": 2.726377248764038 }, { "auxiliary_loss_clip": 0.01265487, "auxiliary_loss_mlp": 0.01048427, "balance_loss_clip": 0.91938657, "balance_loss_mlp": 1.02940083, "epoch": 0.07226597727409367, "flos": 25483576688640.0, "grad_norm": 2.3848443437097124, "language_loss": 0.8508817, "learning_rate": 3.981339595990659e-06, "loss": 0.87402081, "num_input_tokens_seen": 12811085, "step": 601, "time_per_iteration": 2.8457789421081543 }, { "auxiliary_loss_clip": 0.0126659, "auxiliary_loss_mlp": 0.01049358, "balance_loss_clip": 1.03388333, "balance_loss_mlp": 1.03167903, "epoch": 0.07238622016473276, "flos": 23513840622720.0, "grad_norm": 1.9316546663577134, "language_loss": 0.81191933, "learning_rate": 3.981233284272713e-06, "loss": 0.83507884, "num_input_tokens_seen": 12830830, "step": 602, "time_per_iteration": 2.697650194168091 }, { "auxiliary_loss_clip": 0.01259662, "auxiliary_loss_mlp": 0.01042218, "balance_loss_clip": 0.95341563, "balance_loss_mlp": 1.02575552, "epoch": 0.07250646305537185, "flos": 25453519983360.0, "grad_norm": 1.6594671984665834, "language_loss": 0.89952946, "learning_rate": 3.981126672003253e-06, "loss": 0.92254823, "num_input_tokens_seen": 12853505, "step": 603, "time_per_iteration": 2.905773878097534 }, { "auxiliary_loss_clip": 0.01265304, "auxiliary_loss_mlp": 0.01048648, "balance_loss_clip": 0.99145412, "balance_loss_mlp": 1.03116047, "epoch": 0.07262670594601094, "flos": 27155038216320.0, "grad_norm": 2.287109325708129, "language_loss": 0.77875942, "learning_rate": 3.981019759198451e-06, "loss": 0.80189896, "num_input_tokens_seen": 12872455, "step": 604, "time_per_iteration": 2.809784412384033 }, { "auxiliary_loss_clip": 0.01264415, "auxiliary_loss_mlp": 0.01046185, "balance_loss_clip": 0.99189031, "balance_loss_mlp": 1.02950764, "epoch": 0.07274694883665003, "flos": 26651607148800.0, "grad_norm": 2.880474285411748, "language_loss": 0.84847403, "learning_rate": 3.980912545874528e-06, "loss": 0.87158, "num_input_tokens_seen": 12892620, "step": 605, "time_per_iteration": 2.7152013778686523 }, { "auxiliary_loss_clip": 0.01257155, "auxiliary_loss_mlp": 0.01131343, "balance_loss_clip": 1.02997255, "balance_loss_mlp": 0.0, "epoch": 0.07286719172728913, "flos": 29862344154240.0, "grad_norm": 2.9221862795791163, "language_loss": 0.85620201, "learning_rate": 3.980805032047746e-06, "loss": 0.88008696, "num_input_tokens_seen": 12914090, "step": 606, "time_per_iteration": 2.7309224605560303 }, { "auxiliary_loss_clip": 0.0126447, "auxiliary_loss_mlp": 0.0106157, "balance_loss_clip": 0.99755502, "balance_loss_mlp": 1.04392719, "epoch": 0.07298743461792821, "flos": 17382057799680.0, "grad_norm": 1.8264049203441828, "language_loss": 0.80925286, "learning_rate": 3.980697217734415e-06, "loss": 0.83251321, "num_input_tokens_seen": 12931830, "step": 607, "time_per_iteration": 2.706329107284546 }, { "auxiliary_loss_clip": 0.01264126, "auxiliary_loss_mlp": 0.01131202, "balance_loss_clip": 0.91782433, "balance_loss_mlp": 0.0, "epoch": 0.07310767750856731, "flos": 19498201701120.0, "grad_norm": 1.8264642129946218, "language_loss": 0.91979933, "learning_rate": 3.980589102950891e-06, "loss": 0.94375265, "num_input_tokens_seen": 12949995, "step": 608, "time_per_iteration": 2.8734381198883057 }, { "auxiliary_loss_clip": 0.01261417, "auxiliary_loss_mlp": 0.01058415, "balance_loss_clip": 0.99570954, "balance_loss_mlp": 1.04102266, "epoch": 0.07322792039920639, "flos": 29168693637120.0, "grad_norm": 2.2798453745713045, "language_loss": 0.75866705, "learning_rate": 3.9804806877135755e-06, "loss": 0.78186536, "num_input_tokens_seen": 12968040, "step": 609, "time_per_iteration": 2.841574192047119 }, { "auxiliary_loss_clip": 0.01267079, "auxiliary_loss_mlp": 0.01131275, "balance_loss_clip": 1.03020537, "balance_loss_mlp": 0.0, "epoch": 0.07334816328984549, "flos": 23477822259840.0, "grad_norm": 2.4584654563601287, "language_loss": 0.85775745, "learning_rate": 3.980371972038915e-06, "loss": 0.88174105, "num_input_tokens_seen": 12988530, "step": 610, "time_per_iteration": 2.71313738822937 }, { "auxiliary_loss_clip": 0.01273007, "auxiliary_loss_mlp": 0.01045682, "balance_loss_clip": 1.07652199, "balance_loss_mlp": 1.02818227, "epoch": 0.07346840618048459, "flos": 22962467877120.0, "grad_norm": 1.7325489639606948, "language_loss": 0.84114468, "learning_rate": 3.980262955943399e-06, "loss": 0.8643316, "num_input_tokens_seen": 13008195, "step": 611, "time_per_iteration": 2.6765706539154053 }, { "auxiliary_loss_clip": 0.01258548, "auxiliary_loss_mlp": 0.010573, "balance_loss_clip": 0.99689686, "balance_loss_mlp": 1.04015815, "epoch": 0.07358864907112367, "flos": 17673903803520.0, "grad_norm": 2.875327970492868, "language_loss": 0.86862254, "learning_rate": 3.980153639443569e-06, "loss": 0.89178097, "num_input_tokens_seen": 13024180, "step": 612, "time_per_iteration": 2.835529327392578 }, { "auxiliary_loss_clip": 0.01272368, "auxiliary_loss_mlp": 0.01057655, "balance_loss_clip": 0.99615455, "balance_loss_mlp": 1.03964269, "epoch": 0.07370889196176277, "flos": 24097029840000.0, "grad_norm": 2.0317337344153543, "language_loss": 0.80219722, "learning_rate": 3.980044022556005e-06, "loss": 0.82549751, "num_input_tokens_seen": 13043865, "step": 613, "time_per_iteration": 2.7933669090270996 }, { "auxiliary_loss_clip": 0.01266544, "auxiliary_loss_mlp": 0.010579, "balance_loss_clip": 1.03619599, "balance_loss_mlp": 1.04017365, "epoch": 0.07382913485240185, "flos": 25885919905920.0, "grad_norm": 2.56014414923389, "language_loss": 0.72602677, "learning_rate": 3.9799341052973375e-06, "loss": 0.74927121, "num_input_tokens_seen": 13063700, "step": 614, "time_per_iteration": 2.7203080654144287 }, { "auxiliary_loss_clip": 0.01265714, "auxiliary_loss_mlp": 0.01048883, "balance_loss_clip": 1.00008893, "balance_loss_mlp": 1.03122783, "epoch": 0.07394937774304094, "flos": 16873850223360.0, "grad_norm": 3.0464334189789666, "language_loss": 0.75259328, "learning_rate": 3.979823887684241e-06, "loss": 0.77573919, "num_input_tokens_seen": 13082640, "step": 615, "time_per_iteration": 2.7615675926208496 }, { "auxiliary_loss_clip": 0.01272888, "auxiliary_loss_mlp": 0.01054405, "balance_loss_clip": 1.07698417, "balance_loss_mlp": 1.03703666, "epoch": 0.07406962063368003, "flos": 20703471586560.0, "grad_norm": 2.0228206250802683, "language_loss": 0.84434915, "learning_rate": 3.979713369733434e-06, "loss": 0.86762208, "num_input_tokens_seen": 13100505, "step": 616, "time_per_iteration": 3.5412650108337402 }, { "auxiliary_loss_clip": 0.01250386, "auxiliary_loss_mlp": 0.01047221, "balance_loss_clip": 1.02955675, "balance_loss_mlp": 1.02964973, "epoch": 0.07418986352431912, "flos": 21430985650560.0, "grad_norm": 6.458787297329697, "language_loss": 0.84732103, "learning_rate": 3.979602551461683e-06, "loss": 0.87029707, "num_input_tokens_seen": 13121285, "step": 617, "time_per_iteration": 3.6850905418395996 }, { "auxiliary_loss_clip": 0.0126237, "auxiliary_loss_mlp": 0.01060286, "balance_loss_clip": 0.9968276, "balance_loss_mlp": 1.04335845, "epoch": 0.07431010641495822, "flos": 12021133777920.0, "grad_norm": 2.407260342512204, "language_loss": 0.91675419, "learning_rate": 3.979491432885799e-06, "loss": 0.93998075, "num_input_tokens_seen": 13137550, "step": 618, "time_per_iteration": 3.541778802871704 }, { "auxiliary_loss_clip": 0.01239818, "auxiliary_loss_mlp": 0.01130799, "balance_loss_clip": 0.95325315, "balance_loss_mlp": 0.0, "epoch": 0.0744303493055973, "flos": 20957575374720.0, "grad_norm": 1.950923122978381, "language_loss": 0.83203578, "learning_rate": 3.97938001402264e-06, "loss": 0.85574198, "num_input_tokens_seen": 13156675, "step": 619, "time_per_iteration": 2.7219009399414062 }, { "auxiliary_loss_clip": 0.01254871, "auxiliary_loss_mlp": 0.01052115, "balance_loss_clip": 0.95301962, "balance_loss_mlp": 1.03436518, "epoch": 0.0745505921962364, "flos": 16253134272000.0, "grad_norm": 3.8587973956020867, "language_loss": 0.79942393, "learning_rate": 3.979268294889105e-06, "loss": 0.82249385, "num_input_tokens_seen": 13172225, "step": 620, "time_per_iteration": 2.7551558017730713 }, { "auxiliary_loss_clip": 0.01268, "auxiliary_loss_mlp": 0.01047166, "balance_loss_clip": 1.0735234, "balance_loss_mlp": 1.03027439, "epoch": 0.07467083508687548, "flos": 50944635550080.0, "grad_norm": 2.2257169214914123, "language_loss": 0.73943722, "learning_rate": 3.979156275502143e-06, "loss": 0.76258886, "num_input_tokens_seen": 13195885, "step": 621, "time_per_iteration": 3.8816018104553223 }, { "auxiliary_loss_clip": 0.01265592, "auxiliary_loss_mlp": 0.01047162, "balance_loss_clip": 0.95834744, "balance_loss_mlp": 1.02897084, "epoch": 0.07479107797751458, "flos": 17529686697600.0, "grad_norm": 2.481143250423937, "language_loss": 0.91784495, "learning_rate": 3.979043955878749e-06, "loss": 0.94097251, "num_input_tokens_seen": 13213730, "step": 622, "time_per_iteration": 2.696971893310547 }, { "auxiliary_loss_clip": 0.01264199, "auxiliary_loss_mlp": 0.01046761, "balance_loss_clip": 0.99938542, "balance_loss_mlp": 1.02930832, "epoch": 0.07491132086815366, "flos": 23473943591040.0, "grad_norm": 3.978414885401318, "language_loss": 0.832421, "learning_rate": 3.978931336035959e-06, "loss": 0.85553062, "num_input_tokens_seen": 13232540, "step": 623, "time_per_iteration": 2.7215795516967773 }, { "auxiliary_loss_clip": 0.01267353, "auxiliary_loss_mlp": 0.0105279, "balance_loss_clip": 1.03814042, "balance_loss_mlp": 1.03482485, "epoch": 0.07503156375879276, "flos": 20157557708160.0, "grad_norm": 2.3673648293914966, "language_loss": 0.82443732, "learning_rate": 3.9788184159908595e-06, "loss": 0.84763879, "num_input_tokens_seen": 13249670, "step": 624, "time_per_iteration": 2.658116340637207 }, { "auxiliary_loss_clip": 0.01253978, "auxiliary_loss_mlp": 0.01061648, "balance_loss_clip": 0.99405348, "balance_loss_mlp": 1.04525661, "epoch": 0.07515180664943186, "flos": 15115519653120.0, "grad_norm": 3.351288101714357, "language_loss": 0.83073002, "learning_rate": 3.97870519576058e-06, "loss": 0.85388625, "num_input_tokens_seen": 13266095, "step": 625, "time_per_iteration": 2.733816146850586 }, { "auxiliary_loss_clip": 0.01254657, "auxiliary_loss_mlp": 0.01130928, "balance_loss_clip": 0.95453244, "balance_loss_mlp": 0.0, "epoch": 0.07527204954007094, "flos": 21287702298240.0, "grad_norm": 2.2797794868155226, "language_loss": 0.8100028, "learning_rate": 3.978591675362295e-06, "loss": 0.83385861, "num_input_tokens_seen": 13284810, "step": 626, "time_per_iteration": 2.75335955619812 }, { "auxiliary_loss_clip": 0.01259409, "auxiliary_loss_mlp": 0.01054789, "balance_loss_clip": 0.92461336, "balance_loss_mlp": 1.03827894, "epoch": 0.07539229243071004, "flos": 21324187537920.0, "grad_norm": 1.8651833630096328, "language_loss": 0.87392437, "learning_rate": 3.978477854813226e-06, "loss": 0.89706635, "num_input_tokens_seen": 13304150, "step": 627, "time_per_iteration": 2.803443670272827 }, { "auxiliary_loss_clip": 0.01263143, "auxiliary_loss_mlp": 0.0104916, "balance_loss_clip": 1.03258801, "balance_loss_mlp": 1.03126669, "epoch": 0.07551253532134912, "flos": 13042540920960.0, "grad_norm": 1.8682135167914917, "language_loss": 0.82286954, "learning_rate": 3.97836373413064e-06, "loss": 0.84599257, "num_input_tokens_seen": 13322205, "step": 628, "time_per_iteration": 2.685316324234009 }, { "auxiliary_loss_clip": 0.01268852, "auxiliary_loss_mlp": 0.01048608, "balance_loss_clip": 1.07245779, "balance_loss_mlp": 1.03153694, "epoch": 0.07563277821198822, "flos": 19208761908480.0, "grad_norm": 2.1737142610115305, "language_loss": 0.74555671, "learning_rate": 3.978249313331848e-06, "loss": 0.76873136, "num_input_tokens_seen": 13340435, "step": 629, "time_per_iteration": 2.758843183517456 }, { "auxiliary_loss_clip": 0.01272053, "auxiliary_loss_mlp": 0.01131243, "balance_loss_clip": 1.03390992, "balance_loss_mlp": 0.0, "epoch": 0.07575302110262731, "flos": 19537200892800.0, "grad_norm": 2.8766857717520065, "language_loss": 0.62294555, "learning_rate": 3.978134592434208e-06, "loss": 0.6469785, "num_input_tokens_seen": 13358185, "step": 630, "time_per_iteration": 2.764472484588623 }, { "auxiliary_loss_clip": 0.01166556, "auxiliary_loss_mlp": 0.01009941, "balance_loss_clip": 0.90113026, "balance_loss_mlp": 0.99763834, "epoch": 0.0758732639932664, "flos": 67961808017280.0, "grad_norm": 1.0152263228347107, "language_loss": 0.59469879, "learning_rate": 3.978019571455123e-06, "loss": 0.61646378, "num_input_tokens_seen": 13410130, "step": 631, "time_per_iteration": 3.3887147903442383 }, { "auxiliary_loss_clip": 0.01267451, "auxiliary_loss_mlp": 0.01047724, "balance_loss_clip": 1.07474661, "balance_loss_mlp": 1.03058147, "epoch": 0.07599350688390549, "flos": 18989204025600.0, "grad_norm": 3.313455586100102, "language_loss": 0.84075844, "learning_rate": 3.977904250412042e-06, "loss": 0.8639102, "num_input_tokens_seen": 13429085, "step": 632, "time_per_iteration": 2.7318623065948486 }, { "auxiliary_loss_clip": 0.01267225, "auxiliary_loss_mlp": 0.01056561, "balance_loss_clip": 0.99670416, "balance_loss_mlp": 1.03904891, "epoch": 0.07611374977454458, "flos": 21069006341760.0, "grad_norm": 2.3485706081949784, "language_loss": 0.85452658, "learning_rate": 3.97778862932246e-06, "loss": 0.8777644, "num_input_tokens_seen": 13446250, "step": 633, "time_per_iteration": 2.7952635288238525 }, { "auxiliary_loss_clip": 0.01219276, "auxiliary_loss_mlp": 0.01047246, "balance_loss_clip": 0.78794193, "balance_loss_mlp": 1.02991283, "epoch": 0.07623399266518367, "flos": 18514536773760.0, "grad_norm": 2.3592535410253164, "language_loss": 0.94493878, "learning_rate": 3.9776727082039144e-06, "loss": 0.96760398, "num_input_tokens_seen": 13463220, "step": 634, "time_per_iteration": 3.3480186462402344 }, { "auxiliary_loss_clip": 0.01182301, "auxiliary_loss_mlp": 0.01014262, "balance_loss_clip": 1.06413531, "balance_loss_mlp": 1.00215006, "epoch": 0.07635423555582276, "flos": 44663036077440.0, "grad_norm": 0.8060934447526081, "language_loss": 0.55501735, "learning_rate": 3.977556487073991e-06, "loss": 0.57698298, "num_input_tokens_seen": 13517775, "step": 635, "time_per_iteration": 3.3123068809509277 }, { "auxiliary_loss_clip": 0.01248165, "auxiliary_loss_mlp": 0.0105356, "balance_loss_clip": 0.98933601, "balance_loss_mlp": 1.03645349, "epoch": 0.07647447844646185, "flos": 21761148487680.0, "grad_norm": 2.29602305102977, "language_loss": 0.81505013, "learning_rate": 3.97743996595032e-06, "loss": 0.83806741, "num_input_tokens_seen": 13537815, "step": 636, "time_per_iteration": 2.9253787994384766 }, { "auxiliary_loss_clip": 0.01268695, "auxiliary_loss_mlp": 0.01044501, "balance_loss_clip": 1.07400143, "balance_loss_mlp": 1.02745438, "epoch": 0.07659472133710095, "flos": 23806799948160.0, "grad_norm": 1.5347613830529474, "language_loss": 0.81574464, "learning_rate": 3.9773231448505804e-06, "loss": 0.83887661, "num_input_tokens_seen": 13559605, "step": 637, "time_per_iteration": 2.786910057067871 }, { "auxiliary_loss_clip": 0.01260342, "auxiliary_loss_mlp": 0.0113105, "balance_loss_clip": 0.99614602, "balance_loss_mlp": 0.0, "epoch": 0.07671496422774003, "flos": 21469984842240.0, "grad_norm": 1.9516042374436828, "language_loss": 0.78188944, "learning_rate": 3.977206023792491e-06, "loss": 0.8058033, "num_input_tokens_seen": 13579495, "step": 638, "time_per_iteration": 2.794328451156616 }, { "auxiliary_loss_clip": 0.01265518, "auxiliary_loss_mlp": 0.01055393, "balance_loss_clip": 1.03690326, "balance_loss_mlp": 1.03839421, "epoch": 0.07683520711837913, "flos": 16980971558400.0, "grad_norm": 1.998207212443615, "language_loss": 0.80953765, "learning_rate": 3.97708860279382e-06, "loss": 0.8327468, "num_input_tokens_seen": 13597605, "step": 639, "time_per_iteration": 2.7559540271759033 }, { "auxiliary_loss_clip": 0.01255998, "auxiliary_loss_mlp": 0.01056145, "balance_loss_clip": 0.95536083, "balance_loss_mlp": 1.0384779, "epoch": 0.07695545000901821, "flos": 23476744851840.0, "grad_norm": 1.78830442130868, "language_loss": 0.77980059, "learning_rate": 3.97697088187238e-06, "loss": 0.80292201, "num_input_tokens_seen": 13618120, "step": 640, "time_per_iteration": 2.8905818462371826 }, { "auxiliary_loss_clip": 0.01262543, "auxiliary_loss_mlp": 0.01050627, "balance_loss_clip": 0.99878752, "balance_loss_mlp": 1.03279293, "epoch": 0.07707569289965731, "flos": 17634258167040.0, "grad_norm": 2.267486350759504, "language_loss": 0.91798013, "learning_rate": 3.976852861046029e-06, "loss": 0.94111186, "num_input_tokens_seen": 13634735, "step": 641, "time_per_iteration": 2.686021566390991 }, { "auxiliary_loss_clip": 0.01257574, "auxiliary_loss_mlp": 0.01054085, "balance_loss_clip": 0.91667604, "balance_loss_mlp": 1.03738427, "epoch": 0.0771959357902964, "flos": 25775674087680.0, "grad_norm": 1.6574731412357806, "language_loss": 0.80296147, "learning_rate": 3.97673454033267e-06, "loss": 0.82607806, "num_input_tokens_seen": 13656835, "step": 642, "time_per_iteration": 3.808079719543457 }, { "auxiliary_loss_clip": 0.01267147, "auxiliary_loss_mlp": 0.01058279, "balance_loss_clip": 0.99483073, "balance_loss_mlp": 1.04110146, "epoch": 0.07731617868093549, "flos": 19828651847040.0, "grad_norm": 1.9869731582821268, "language_loss": 0.82506609, "learning_rate": 3.976615919750254e-06, "loss": 0.84832036, "num_input_tokens_seen": 13674535, "step": 643, "time_per_iteration": 3.8824453353881836 }, { "auxiliary_loss_clip": 0.01266897, "auxiliary_loss_mlp": 0.01062398, "balance_loss_clip": 1.03720355, "balance_loss_mlp": 1.04431367, "epoch": 0.07743642157157458, "flos": 21324654414720.0, "grad_norm": 2.0593238382908314, "language_loss": 0.86689651, "learning_rate": 3.976496999316775e-06, "loss": 0.89018947, "num_input_tokens_seen": 13693290, "step": 644, "time_per_iteration": 3.6355888843536377 }, { "auxiliary_loss_clip": 0.01262119, "auxiliary_loss_mlp": 0.01060661, "balance_loss_clip": 0.99724424, "balance_loss_mlp": 1.0431726, "epoch": 0.07755666446221367, "flos": 19969133938560.0, "grad_norm": 2.8193279930915995, "language_loss": 0.84256756, "learning_rate": 3.976377779050271e-06, "loss": 0.86579537, "num_input_tokens_seen": 13711420, "step": 645, "time_per_iteration": 2.774378776550293 }, { "auxiliary_loss_clip": 0.01251939, "auxiliary_loss_mlp": 0.01056292, "balance_loss_clip": 1.02951968, "balance_loss_mlp": 1.03907835, "epoch": 0.07767690735285276, "flos": 23623224514560.0, "grad_norm": 3.294297628584271, "language_loss": 0.84387708, "learning_rate": 3.976258258968831e-06, "loss": 0.86695933, "num_input_tokens_seen": 13729965, "step": 646, "time_per_iteration": 3.7569937705993652 }, { "auxiliary_loss_clip": 0.01262283, "auxiliary_loss_mlp": 0.01053496, "balance_loss_clip": 0.96025538, "balance_loss_mlp": 1.0357697, "epoch": 0.07779715024349185, "flos": 22236246702720.0, "grad_norm": 2.157973361043339, "language_loss": 0.7428776, "learning_rate": 3.976138439090583e-06, "loss": 0.76603544, "num_input_tokens_seen": 13748045, "step": 647, "time_per_iteration": 2.701493501663208 }, { "auxiliary_loss_clip": 0.01260719, "auxiliary_loss_mlp": 0.01061819, "balance_loss_clip": 0.9579196, "balance_loss_mlp": 1.04437923, "epoch": 0.07791739313413094, "flos": 20955097336320.0, "grad_norm": 2.499839155186761, "language_loss": 0.85546654, "learning_rate": 3.976018319433706e-06, "loss": 0.87869191, "num_input_tokens_seen": 13765590, "step": 648, "time_per_iteration": 2.749748468399048 }, { "auxiliary_loss_clip": 0.01270636, "auxiliary_loss_mlp": 0.01055728, "balance_loss_clip": 1.03784823, "balance_loss_mlp": 1.03943193, "epoch": 0.07803763602477004, "flos": 19312327797120.0, "grad_norm": 2.2510507629638683, "language_loss": 0.91128057, "learning_rate": 3.9758979000164205e-06, "loss": 0.93454427, "num_input_tokens_seen": 13782410, "step": 649, "time_per_iteration": 2.668240547180176 }, { "auxiliary_loss_clip": 0.01264907, "auxiliary_loss_mlp": 0.01052734, "balance_loss_clip": 0.95605993, "balance_loss_mlp": 1.03479266, "epoch": 0.07815787891540912, "flos": 22710806213760.0, "grad_norm": 1.8944602506948742, "language_loss": 0.72104329, "learning_rate": 3.975777180856995e-06, "loss": 0.74421966, "num_input_tokens_seen": 13801530, "step": 650, "time_per_iteration": 2.792232036590576 }, { "auxiliary_loss_clip": 0.01271447, "auxiliary_loss_mlp": 0.01052263, "balance_loss_clip": 1.07407629, "balance_loss_mlp": 1.03509688, "epoch": 0.07827812180604822, "flos": 22711129436160.0, "grad_norm": 2.278660349868121, "language_loss": 0.85939479, "learning_rate": 3.975656161973742e-06, "loss": 0.8826319, "num_input_tokens_seen": 13820615, "step": 651, "time_per_iteration": 2.6557323932647705 }, { "auxiliary_loss_clip": 0.01270414, "auxiliary_loss_mlp": 0.01050847, "balance_loss_clip": 1.07271814, "balance_loss_mlp": 1.03341866, "epoch": 0.0783983646966873, "flos": 21725597001600.0, "grad_norm": 2.3589969703452223, "language_loss": 0.8861087, "learning_rate": 3.9755348433850194e-06, "loss": 0.90932131, "num_input_tokens_seen": 13835955, "step": 652, "time_per_iteration": 2.647742986679077 }, { "auxiliary_loss_clip": 0.0116378, "auxiliary_loss_mlp": 0.01014534, "balance_loss_clip": 0.97270346, "balance_loss_mlp": 1.00285196, "epoch": 0.0785186075873264, "flos": 60640877537280.0, "grad_norm": 0.9885715547406374, "language_loss": 0.63583833, "learning_rate": 3.975413225109232e-06, "loss": 0.6576215, "num_input_tokens_seen": 13896505, "step": 653, "time_per_iteration": 3.4845471382141113 }, { "auxiliary_loss_clip": 0.0126958, "auxiliary_loss_mlp": 0.010531, "balance_loss_clip": 1.0355494, "balance_loss_mlp": 1.03441954, "epoch": 0.0786388504779655, "flos": 23877902920320.0, "grad_norm": 3.1929721463498, "language_loss": 0.93814051, "learning_rate": 3.975291307164829e-06, "loss": 0.96136725, "num_input_tokens_seen": 13915150, "step": 654, "time_per_iteration": 2.732806444168091 }, { "auxiliary_loss_clip": 0.01248737, "auxiliary_loss_mlp": 0.01055197, "balance_loss_clip": 0.95560503, "balance_loss_mlp": 1.0389483, "epoch": 0.07875909336860458, "flos": 15158684822400.0, "grad_norm": 1.9182799630556415, "language_loss": 0.84933209, "learning_rate": 3.975169089570306e-06, "loss": 0.87237138, "num_input_tokens_seen": 13933525, "step": 655, "time_per_iteration": 2.796555757522583 }, { "auxiliary_loss_clip": 0.01249253, "auxiliary_loss_mlp": 0.01052121, "balance_loss_clip": 1.03139365, "balance_loss_mlp": 1.03422737, "epoch": 0.07887933625924368, "flos": 22236857233920.0, "grad_norm": 1.922066214425766, "language_loss": 0.91645789, "learning_rate": 3.975046572344202e-06, "loss": 0.93947154, "num_input_tokens_seen": 13949985, "step": 656, "time_per_iteration": 2.677762269973755 }, { "auxiliary_loss_clip": 0.01257777, "auxiliary_loss_mlp": 0.01048214, "balance_loss_clip": 0.95489997, "balance_loss_mlp": 1.03120244, "epoch": 0.07899957914988276, "flos": 20777734955520.0, "grad_norm": 2.763420360782473, "language_loss": 0.71149242, "learning_rate": 3.974923755505103e-06, "loss": 0.73455232, "num_input_tokens_seen": 13969215, "step": 657, "time_per_iteration": 2.8534724712371826 }, { "auxiliary_loss_clip": 0.01242819, "auxiliary_loss_mlp": 0.01056681, "balance_loss_clip": 0.95320469, "balance_loss_mlp": 1.04061198, "epoch": 0.07911982204052186, "flos": 23003047267200.0, "grad_norm": 1.6731048590838131, "language_loss": 0.91068667, "learning_rate": 3.974800639071641e-06, "loss": 0.93368167, "num_input_tokens_seen": 13989935, "step": 658, "time_per_iteration": 2.802180528640747 }, { "auxiliary_loss_clip": 0.01238612, "auxiliary_loss_mlp": 0.01131582, "balance_loss_clip": 0.87499779, "balance_loss_mlp": 0.0, "epoch": 0.07924006493116094, "flos": 23111389664640.0, "grad_norm": 2.3056998122880588, "language_loss": 1.00500047, "learning_rate": 3.974677223062492e-06, "loss": 1.0287025, "num_input_tokens_seen": 14007150, "step": 659, "time_per_iteration": 2.8276190757751465 }, { "auxiliary_loss_clip": 0.01262182, "auxiliary_loss_mlp": 0.01051278, "balance_loss_clip": 0.99674249, "balance_loss_mlp": 1.03357506, "epoch": 0.07936030782180004, "flos": 16472153450880.0, "grad_norm": 3.1769302169235267, "language_loss": 0.74356806, "learning_rate": 3.974553507496378e-06, "loss": 0.76670265, "num_input_tokens_seen": 14025725, "step": 660, "time_per_iteration": 2.714745044708252 }, { "auxiliary_loss_clip": 0.01253007, "auxiliary_loss_mlp": 0.01047119, "balance_loss_clip": 0.99479383, "balance_loss_mlp": 1.02910638, "epoch": 0.07948055071243913, "flos": 23733290764800.0, "grad_norm": 2.3544839308202286, "language_loss": 0.88946086, "learning_rate": 3.974429492392068e-06, "loss": 0.91246212, "num_input_tokens_seen": 14045750, "step": 661, "time_per_iteration": 2.8031370639801025 }, { "auxiliary_loss_clip": 0.01268221, "auxiliary_loss_mlp": 0.01131315, "balance_loss_clip": 1.07443023, "balance_loss_mlp": 0.0, "epoch": 0.07960079360307822, "flos": 19573326996480.0, "grad_norm": 3.3047996118561467, "language_loss": 0.91190428, "learning_rate": 3.974305177768373e-06, "loss": 0.93589962, "num_input_tokens_seen": 14063960, "step": 662, "time_per_iteration": 2.6382367610931396 }, { "auxiliary_loss_clip": 0.01249221, "auxiliary_loss_mlp": 0.01050337, "balance_loss_clip": 0.95800471, "balance_loss_mlp": 1.03244376, "epoch": 0.07972103649371731, "flos": 23513409659520.0, "grad_norm": 2.1912159696956137, "language_loss": 0.865026, "learning_rate": 3.974180563644152e-06, "loss": 0.88802147, "num_input_tokens_seen": 14082525, "step": 663, "time_per_iteration": 2.7613940238952637 }, { "auxiliary_loss_clip": 0.01268741, "auxiliary_loss_mlp": 0.01056276, "balance_loss_clip": 0.99836993, "balance_loss_mlp": 1.038728, "epoch": 0.0798412793843564, "flos": 16726867770240.0, "grad_norm": 3.143432295850078, "language_loss": 0.89374965, "learning_rate": 3.97405565003831e-06, "loss": 0.91699982, "num_input_tokens_seen": 14098610, "step": 664, "time_per_iteration": 2.736199378967285 }, { "auxiliary_loss_clip": 0.01266442, "auxiliary_loss_mlp": 0.01052355, "balance_loss_clip": 0.95845062, "balance_loss_mlp": 1.03479505, "epoch": 0.07996152227499549, "flos": 18223337214720.0, "grad_norm": 2.1418344509267357, "language_loss": 0.78051943, "learning_rate": 3.973930436969794e-06, "loss": 0.80370736, "num_input_tokens_seen": 14117065, "step": 665, "time_per_iteration": 2.7394373416900635 }, { "auxiliary_loss_clip": 0.01250657, "auxiliary_loss_mlp": 0.01052692, "balance_loss_clip": 0.9925102, "balance_loss_mlp": 1.03465545, "epoch": 0.08008176516563459, "flos": 20594877793920.0, "grad_norm": 2.2319833498910477, "language_loss": 0.85895205, "learning_rate": 3.973804924457602e-06, "loss": 0.88198555, "num_input_tokens_seen": 14135145, "step": 666, "time_per_iteration": 2.7751450538635254 }, { "auxiliary_loss_clip": 0.01254835, "auxiliary_loss_mlp": 0.01051095, "balance_loss_clip": 0.99407816, "balance_loss_mlp": 1.03321385, "epoch": 0.08020200805627367, "flos": 31834306863360.0, "grad_norm": 2.1049414174915357, "language_loss": 0.8534956, "learning_rate": 3.973679112520771e-06, "loss": 0.87655491, "num_input_tokens_seen": 14156860, "step": 667, "time_per_iteration": 2.812270402908325 }, { "auxiliary_loss_clip": 0.01250664, "auxiliary_loss_mlp": 0.01047919, "balance_loss_clip": 0.95370591, "balance_loss_mlp": 1.03085995, "epoch": 0.08032225094691277, "flos": 17783503176960.0, "grad_norm": 2.4454532020315023, "language_loss": 0.99066585, "learning_rate": 3.973553001178389e-06, "loss": 1.01365173, "num_input_tokens_seen": 14174365, "step": 668, "time_per_iteration": 4.746300220489502 }, { "auxiliary_loss_clip": 0.01262207, "auxiliary_loss_mlp": 0.01050162, "balance_loss_clip": 0.96116072, "balance_loss_mlp": 1.03331745, "epoch": 0.08044249383755185, "flos": 24061693835520.0, "grad_norm": 2.2676032586129757, "language_loss": 0.75662392, "learning_rate": 3.973426590449585e-06, "loss": 0.77974761, "num_input_tokens_seen": 14192320, "step": 669, "time_per_iteration": 2.7866628170013428 }, { "auxiliary_loss_clip": 0.01258713, "auxiliary_loss_mlp": 0.01058535, "balance_loss_clip": 0.91909051, "balance_loss_mlp": 1.04191685, "epoch": 0.08056273672819095, "flos": 18223624523520.0, "grad_norm": 1.8351396907288444, "language_loss": 0.75082207, "learning_rate": 3.9732998803535364e-06, "loss": 0.77399457, "num_input_tokens_seen": 14210380, "step": 670, "time_per_iteration": 3.6791882514953613 }, { "auxiliary_loss_clip": 0.01270111, "auxiliary_loss_mlp": 0.01051276, "balance_loss_clip": 1.07512069, "balance_loss_mlp": 1.03404999, "epoch": 0.08068297961883003, "flos": 19676856971520.0, "grad_norm": 2.248448878223978, "language_loss": 0.85312736, "learning_rate": 3.973172870909465e-06, "loss": 0.87634122, "num_input_tokens_seen": 14225145, "step": 671, "time_per_iteration": 2.6429648399353027 }, { "auxiliary_loss_clip": 0.01270704, "auxiliary_loss_mlp": 0.01053979, "balance_loss_clip": 0.99609208, "balance_loss_mlp": 1.03618097, "epoch": 0.08080322250946913, "flos": 23148736830720.0, "grad_norm": 2.5735857567301523, "language_loss": 0.80916798, "learning_rate": 3.973045562136638e-06, "loss": 0.83241487, "num_input_tokens_seen": 14241960, "step": 672, "time_per_iteration": 3.689455032348633 }, { "auxiliary_loss_clip": 0.01272178, "auxiliary_loss_mlp": 0.01054746, "balance_loss_clip": 1.03691566, "balance_loss_mlp": 1.03637576, "epoch": 0.08092346540010822, "flos": 21763626526080.0, "grad_norm": 2.683099262052174, "language_loss": 0.91599256, "learning_rate": 3.972917954054368e-06, "loss": 0.93926185, "num_input_tokens_seen": 14260515, "step": 673, "time_per_iteration": 2.6984541416168213 }, { "auxiliary_loss_clip": 0.01253588, "auxiliary_loss_mlp": 0.01057986, "balance_loss_clip": 0.99640441, "balance_loss_mlp": 1.03961575, "epoch": 0.08104370829074731, "flos": 21032485188480.0, "grad_norm": 3.5629605022930857, "language_loss": 0.81764245, "learning_rate": 3.972790046682013e-06, "loss": 0.8407582, "num_input_tokens_seen": 14279190, "step": 674, "time_per_iteration": 2.7050678730010986 }, { "auxiliary_loss_clip": 0.01263229, "auxiliary_loss_mlp": 0.01044853, "balance_loss_clip": 0.95346463, "balance_loss_mlp": 1.0266614, "epoch": 0.0811639511813864, "flos": 20083186598400.0, "grad_norm": 1.8243532106309717, "language_loss": 0.7895937, "learning_rate": 3.972661840038977e-06, "loss": 0.81267446, "num_input_tokens_seen": 14299480, "step": 675, "time_per_iteration": 2.7759976387023926 }, { "auxiliary_loss_clip": 0.01266727, "auxiliary_loss_mlp": 0.01051087, "balance_loss_clip": 1.03695154, "balance_loss_mlp": 1.03423071, "epoch": 0.08128419407202549, "flos": 16836718538880.0, "grad_norm": 2.1428396286782174, "language_loss": 0.83414257, "learning_rate": 3.972533334144707e-06, "loss": 0.85732073, "num_input_tokens_seen": 14316405, "step": 676, "time_per_iteration": 2.670766830444336 }, { "auxiliary_loss_clip": 0.01266102, "auxiliary_loss_mlp": 0.01051595, "balance_loss_clip": 1.03374624, "balance_loss_mlp": 1.03434563, "epoch": 0.08140443696266458, "flos": 23769273214080.0, "grad_norm": 2.136359246824773, "language_loss": 0.7849648, "learning_rate": 3.972404529018699e-06, "loss": 0.80814177, "num_input_tokens_seen": 14336265, "step": 677, "time_per_iteration": 2.9524409770965576 }, { "auxiliary_loss_clip": 0.01254019, "auxiliary_loss_mlp": 0.0104636, "balance_loss_clip": 0.9903394, "balance_loss_mlp": 1.02986145, "epoch": 0.08152467985330367, "flos": 24390132819840.0, "grad_norm": 1.753008782585102, "language_loss": 0.85150945, "learning_rate": 3.972275424680493e-06, "loss": 0.87451321, "num_input_tokens_seen": 14356375, "step": 678, "time_per_iteration": 2.7924139499664307 }, { "auxiliary_loss_clip": 0.01268151, "auxiliary_loss_mlp": 0.01052666, "balance_loss_clip": 1.07358265, "balance_loss_mlp": 1.03510618, "epoch": 0.08164492274394276, "flos": 19317750750720.0, "grad_norm": 2.408130921861515, "language_loss": 0.91877818, "learning_rate": 3.972146021149673e-06, "loss": 0.94198638, "num_input_tokens_seen": 14374650, "step": 679, "time_per_iteration": 2.7372565269470215 }, { "auxiliary_loss_clip": 0.01260201, "auxiliary_loss_mlp": 0.01049109, "balance_loss_clip": 0.95662248, "balance_loss_mlp": 1.03216922, "epoch": 0.08176516563458186, "flos": 14830461319680.0, "grad_norm": 3.707020720478274, "language_loss": 0.78684008, "learning_rate": 3.972016318445868e-06, "loss": 0.80993313, "num_input_tokens_seen": 14392650, "step": 680, "time_per_iteration": 2.8038647174835205 }, { "auxiliary_loss_clip": 0.01265814, "auxiliary_loss_mlp": 0.01049129, "balance_loss_clip": 1.03348327, "balance_loss_mlp": 1.03234458, "epoch": 0.08188540852522094, "flos": 22602320161920.0, "grad_norm": 1.965130452177894, "language_loss": 0.92440784, "learning_rate": 3.971886316588757e-06, "loss": 0.94755727, "num_input_tokens_seen": 14413155, "step": 681, "time_per_iteration": 2.7989003658294678 }, { "auxiliary_loss_clip": 0.01247493, "auxiliary_loss_mlp": 0.01055646, "balance_loss_clip": 0.95575249, "balance_loss_mlp": 1.03771722, "epoch": 0.08200565141586004, "flos": 19463727623040.0, "grad_norm": 2.555068107985232, "language_loss": 0.73019779, "learning_rate": 3.9717560155980595e-06, "loss": 0.75322914, "num_input_tokens_seen": 14428805, "step": 682, "time_per_iteration": 2.76094126701355 }, { "auxiliary_loss_clip": 0.01265968, "auxiliary_loss_mlp": 0.01047182, "balance_loss_clip": 1.03558826, "balance_loss_mlp": 1.03020656, "epoch": 0.08212589430649912, "flos": 20594662312320.0, "grad_norm": 2.1232289533358633, "language_loss": 0.92148256, "learning_rate": 3.971625415493542e-06, "loss": 0.94461405, "num_input_tokens_seen": 14447125, "step": 683, "time_per_iteration": 2.74515962600708 }, { "auxiliary_loss_clip": 0.01252947, "auxiliary_loss_mlp": 0.01060107, "balance_loss_clip": 0.95388544, "balance_loss_mlp": 1.04246426, "epoch": 0.08224613719713822, "flos": 25953611086080.0, "grad_norm": 1.8785856397870275, "language_loss": 0.87581372, "learning_rate": 3.971494516295017e-06, "loss": 0.89894432, "num_input_tokens_seen": 14466575, "step": 684, "time_per_iteration": 2.8320789337158203 }, { "auxiliary_loss_clip": 0.01260601, "auxiliary_loss_mlp": 0.01052002, "balance_loss_clip": 0.95346022, "balance_loss_mlp": 1.03474033, "epoch": 0.08236638008777732, "flos": 23768734510080.0, "grad_norm": 2.6289587491493327, "language_loss": 0.85338145, "learning_rate": 3.971363318022341e-06, "loss": 0.87650746, "num_input_tokens_seen": 14487915, "step": 685, "time_per_iteration": 2.8157060146331787 }, { "auxiliary_loss_clip": 0.01258604, "auxiliary_loss_mlp": 0.01048422, "balance_loss_clip": 0.99125409, "balance_loss_mlp": 1.03130364, "epoch": 0.0824866229784164, "flos": 38799144887040.0, "grad_norm": 3.673717152773359, "language_loss": 0.68409705, "learning_rate": 3.971231820695417e-06, "loss": 0.70716733, "num_input_tokens_seen": 14511530, "step": 686, "time_per_iteration": 2.9129364490509033 }, { "auxiliary_loss_clip": 0.01265182, "auxiliary_loss_mlp": 0.0104721, "balance_loss_clip": 0.99474716, "balance_loss_mlp": 1.0300914, "epoch": 0.0826068658690555, "flos": 23107762391040.0, "grad_norm": 1.8863369586182699, "language_loss": 0.81540763, "learning_rate": 3.971100024334193e-06, "loss": 0.83853155, "num_input_tokens_seen": 14529050, "step": 687, "time_per_iteration": 2.7328097820281982 }, { "auxiliary_loss_clip": 0.01238195, "auxiliary_loss_mlp": 0.01055754, "balance_loss_clip": 0.94984907, "balance_loss_mlp": 1.03960133, "epoch": 0.08272710875969458, "flos": 21136374299520.0, "grad_norm": 2.124443629611172, "language_loss": 0.86343396, "learning_rate": 3.970967928958663e-06, "loss": 0.8863734, "num_input_tokens_seen": 14546165, "step": 688, "time_per_iteration": 2.8593130111694336 }, { "auxiliary_loss_clip": 0.01259129, "auxiliary_loss_mlp": 0.01053102, "balance_loss_clip": 0.91651177, "balance_loss_mlp": 1.0365324, "epoch": 0.08284735165033368, "flos": 19063000517760.0, "grad_norm": 1.6522815795475891, "language_loss": 0.83230329, "learning_rate": 3.970835534588865e-06, "loss": 0.8554256, "num_input_tokens_seen": 14563660, "step": 689, "time_per_iteration": 2.8147239685058594 }, { "auxiliary_loss_clip": 0.01272718, "auxiliary_loss_mlp": 0.01060648, "balance_loss_clip": 1.00341678, "balance_loss_mlp": 1.04391098, "epoch": 0.08296759454097276, "flos": 16727442387840.0, "grad_norm": 1.8188881272402397, "language_loss": 0.85603511, "learning_rate": 3.970702841244883e-06, "loss": 0.87936878, "num_input_tokens_seen": 14581980, "step": 690, "time_per_iteration": 2.819133758544922 }, { "auxiliary_loss_clip": 0.01268893, "auxiliary_loss_mlp": 0.01049604, "balance_loss_clip": 1.03769922, "balance_loss_mlp": 1.031901, "epoch": 0.08308783743161186, "flos": 18004928567040.0, "grad_norm": 1.9948913114410196, "language_loss": 0.82390183, "learning_rate": 3.970569848946847e-06, "loss": 0.84708679, "num_input_tokens_seen": 14601795, "step": 691, "time_per_iteration": 2.711665153503418 }, { "auxiliary_loss_clip": 0.01248644, "auxiliary_loss_mlp": 0.01054912, "balance_loss_clip": 1.03273928, "balance_loss_mlp": 1.03804398, "epoch": 0.08320808032225095, "flos": 15079788599040.0, "grad_norm": 2.690282196492058, "language_loss": 0.83032364, "learning_rate": 3.970436557714932e-06, "loss": 0.85335922, "num_input_tokens_seen": 14618315, "step": 692, "time_per_iteration": 2.663419008255005 }, { "auxiliary_loss_clip": 0.01258979, "auxiliary_loss_mlp": 0.0104213, "balance_loss_clip": 0.99529707, "balance_loss_mlp": 1.02464175, "epoch": 0.08332832321289003, "flos": 22383085501440.0, "grad_norm": 2.1510145335987447, "language_loss": 0.86424339, "learning_rate": 3.970302967569358e-06, "loss": 0.88725448, "num_input_tokens_seen": 14636905, "step": 693, "time_per_iteration": 2.7453248500823975 }, { "auxiliary_loss_clip": 0.01264252, "auxiliary_loss_mlp": 0.0104933, "balance_loss_clip": 1.03501213, "balance_loss_mlp": 1.03210425, "epoch": 0.08344856610352913, "flos": 24717386655360.0, "grad_norm": 2.475560403442135, "language_loss": 0.68267107, "learning_rate": 3.9701690785303896e-06, "loss": 0.70580691, "num_input_tokens_seen": 14656100, "step": 694, "time_per_iteration": 4.719552516937256 }, { "auxiliary_loss_clip": 0.01261981, "auxiliary_loss_mlp": 0.01056955, "balance_loss_clip": 1.03248668, "balance_loss_mlp": 1.03970551, "epoch": 0.08356880899416821, "flos": 25370206387200.0, "grad_norm": 2.2346730559379235, "language_loss": 0.88336307, "learning_rate": 3.970034890618339e-06, "loss": 0.90655243, "num_input_tokens_seen": 14675790, "step": 695, "time_per_iteration": 2.801924228668213 }, { "auxiliary_loss_clip": 0.01247332, "auxiliary_loss_mlp": 0.01047261, "balance_loss_clip": 1.0304811, "balance_loss_mlp": 1.03029728, "epoch": 0.08368905188480731, "flos": 24353072962560.0, "grad_norm": 1.9381002110534498, "language_loss": 0.87794816, "learning_rate": 3.969900403853562e-06, "loss": 0.90089411, "num_input_tokens_seen": 14694830, "step": 696, "time_per_iteration": 3.64528226852417 }, { "auxiliary_loss_clip": 0.0126729, "auxiliary_loss_mlp": 0.01051801, "balance_loss_clip": 1.07554817, "balance_loss_mlp": 1.03545773, "epoch": 0.08380929477544641, "flos": 18037319656320.0, "grad_norm": 1.6145922241432027, "language_loss": 0.77912319, "learning_rate": 3.96976561825646e-06, "loss": 0.80231404, "num_input_tokens_seen": 14711920, "step": 697, "time_per_iteration": 2.6639058589935303 }, { "auxiliary_loss_clip": 0.01259864, "auxiliary_loss_mlp": 0.01051577, "balance_loss_clip": 0.91905648, "balance_loss_mlp": 1.03526926, "epoch": 0.08392953766608549, "flos": 26286287875200.0, "grad_norm": 4.277408176208244, "language_loss": 0.87130606, "learning_rate": 3.969630533847479e-06, "loss": 0.8944205, "num_input_tokens_seen": 14730880, "step": 698, "time_per_iteration": 2.8860011100769043 }, { "auxiliary_loss_clip": 0.01260267, "auxiliary_loss_mlp": 0.01045625, "balance_loss_clip": 1.03145087, "balance_loss_mlp": 1.02930498, "epoch": 0.08404978055672459, "flos": 22492146170880.0, "grad_norm": 1.9149212169768148, "language_loss": 0.84247881, "learning_rate": 3.969495150647113e-06, "loss": 0.86553776, "num_input_tokens_seen": 14749050, "step": 699, "time_per_iteration": 3.7589035034179688 }, { "auxiliary_loss_clip": 0.01255922, "auxiliary_loss_mlp": 0.01041897, "balance_loss_clip": 0.95647907, "balance_loss_mlp": 1.02539861, "epoch": 0.08417002344736367, "flos": 24826878288000.0, "grad_norm": 1.6748925502165584, "language_loss": 0.76647806, "learning_rate": 3.969359468675899e-06, "loss": 0.78945625, "num_input_tokens_seen": 14769180, "step": 700, "time_per_iteration": 2.799381971359253 }, { "auxiliary_loss_clip": 0.01256747, "auxiliary_loss_mlp": 0.01044702, "balance_loss_clip": 1.03285646, "balance_loss_mlp": 1.02761936, "epoch": 0.08429026633800277, "flos": 16945922862720.0, "grad_norm": 2.1056944406443736, "language_loss": 0.89363223, "learning_rate": 3.969223487954418e-06, "loss": 0.91664672, "num_input_tokens_seen": 14786640, "step": 701, "time_per_iteration": 2.7628540992736816 }, { "auxiliary_loss_clip": 0.01250669, "auxiliary_loss_mlp": 0.0105803, "balance_loss_clip": 0.91710323, "balance_loss_mlp": 1.04119742, "epoch": 0.08441050922864185, "flos": 23841920471040.0, "grad_norm": 2.2942310213939465, "language_loss": 0.82949448, "learning_rate": 3.969087208503301e-06, "loss": 0.85258138, "num_input_tokens_seen": 14806720, "step": 702, "time_per_iteration": 2.782944917678833 }, { "auxiliary_loss_clip": 0.01240699, "auxiliary_loss_mlp": 0.01056505, "balance_loss_clip": 0.95353472, "balance_loss_mlp": 1.03913593, "epoch": 0.08453075211928095, "flos": 25520205582720.0, "grad_norm": 2.649241896050479, "language_loss": 0.84604919, "learning_rate": 3.968950630343219e-06, "loss": 0.86902118, "num_input_tokens_seen": 14823705, "step": 703, "time_per_iteration": 2.7997283935546875 }, { "auxiliary_loss_clip": 0.0124945, "auxiliary_loss_mlp": 0.01040365, "balance_loss_clip": 0.98912424, "balance_loss_mlp": 1.0238781, "epoch": 0.08465099500992004, "flos": 19532496211200.0, "grad_norm": 2.0584435928045224, "language_loss": 0.93707961, "learning_rate": 3.968813753494892e-06, "loss": 0.95997775, "num_input_tokens_seen": 14841865, "step": 704, "time_per_iteration": 2.7377164363861084 }, { "auxiliary_loss_clip": 0.0124271, "auxiliary_loss_mlp": 0.01131048, "balance_loss_clip": 0.9506129, "balance_loss_mlp": 0.0, "epoch": 0.08477123790055913, "flos": 29351299403520.0, "grad_norm": 2.2399285145587857, "language_loss": 0.75508481, "learning_rate": 3.968676577979084e-06, "loss": 0.77882236, "num_input_tokens_seen": 14861415, "step": 705, "time_per_iteration": 2.803431510925293 }, { "auxiliary_loss_clip": 0.01250569, "auxiliary_loss_mlp": 0.01054794, "balance_loss_clip": 0.91276205, "balance_loss_mlp": 1.03815281, "epoch": 0.08489148079119822, "flos": 18624495283200.0, "grad_norm": 2.864050349138455, "language_loss": 0.78167284, "learning_rate": 3.968539103816605e-06, "loss": 0.80472648, "num_input_tokens_seen": 14879215, "step": 706, "time_per_iteration": 2.7752628326416016 }, { "auxiliary_loss_clip": 0.01251931, "auxiliary_loss_mlp": 0.01130636, "balance_loss_clip": 0.99055046, "balance_loss_mlp": 0.0, "epoch": 0.0850117236818373, "flos": 23471393725440.0, "grad_norm": 2.218380056184846, "language_loss": 0.89506686, "learning_rate": 3.9684013310283085e-06, "loss": 0.9188925, "num_input_tokens_seen": 14897900, "step": 707, "time_per_iteration": 2.7660393714904785 }, { "auxiliary_loss_clip": 0.01258173, "auxiliary_loss_mlp": 0.0104915, "balance_loss_clip": 0.99863434, "balance_loss_mlp": 1.03317571, "epoch": 0.0851319665724764, "flos": 40625058896640.0, "grad_norm": 2.796467738578854, "language_loss": 0.64080739, "learning_rate": 3.9682632596350956e-06, "loss": 0.66388065, "num_input_tokens_seen": 14919065, "step": 708, "time_per_iteration": 2.8832929134368896 }, { "auxiliary_loss_clip": 0.01257035, "auxiliary_loss_mlp": 0.01054451, "balance_loss_clip": 1.03406799, "balance_loss_mlp": 1.03864431, "epoch": 0.0852522094631155, "flos": 15879554870400.0, "grad_norm": 2.17452727789309, "language_loss": 0.78246021, "learning_rate": 3.968124889657911e-06, "loss": 0.80557501, "num_input_tokens_seen": 14934165, "step": 709, "time_per_iteration": 2.678382635116577 }, { "auxiliary_loss_clip": 0.01247427, "auxiliary_loss_mlp": 0.01048185, "balance_loss_clip": 0.9116807, "balance_loss_mlp": 1.03186512, "epoch": 0.08537245235375458, "flos": 14567091822720.0, "grad_norm": 2.184518251385324, "language_loss": 0.9058609, "learning_rate": 3.967986221117746e-06, "loss": 0.92881703, "num_input_tokens_seen": 14950105, "step": 710, "time_per_iteration": 2.7694814205169678 }, { "auxiliary_loss_clip": 0.01252595, "auxiliary_loss_mlp": 0.01046281, "balance_loss_clip": 0.8369993, "balance_loss_mlp": 1.03021133, "epoch": 0.08549269524439368, "flos": 26468929555200.0, "grad_norm": 1.8804995145456673, "language_loss": 0.86317945, "learning_rate": 3.967847254035635e-06, "loss": 0.88616824, "num_input_tokens_seen": 14969490, "step": 711, "time_per_iteration": 3.0398683547973633 }, { "auxiliary_loss_clip": 0.01258128, "auxiliary_loss_mlp": 0.01047153, "balance_loss_clip": 0.95496827, "balance_loss_mlp": 1.03052354, "epoch": 0.08561293813503276, "flos": 13590214565760.0, "grad_norm": 4.7623049401052215, "language_loss": 0.86490172, "learning_rate": 3.967707988432661e-06, "loss": 0.88795453, "num_input_tokens_seen": 14987195, "step": 712, "time_per_iteration": 2.930471897125244 }, { "auxiliary_loss_clip": 0.01259209, "auxiliary_loss_mlp": 0.01052046, "balance_loss_clip": 1.06808078, "balance_loss_mlp": 1.03666842, "epoch": 0.08573318102567186, "flos": 26943524979840.0, "grad_norm": 2.2484292466500158, "language_loss": 0.87605321, "learning_rate": 3.967568424329949e-06, "loss": 0.89916581, "num_input_tokens_seen": 15007620, "step": 713, "time_per_iteration": 2.6944739818573 }, { "auxiliary_loss_clip": 0.01178596, "auxiliary_loss_mlp": 0.01028043, "balance_loss_clip": 0.98272991, "balance_loss_mlp": 1.01707578, "epoch": 0.08585342391631094, "flos": 67302739319040.0, "grad_norm": 0.8213683794440432, "language_loss": 0.555861, "learning_rate": 3.967428561748671e-06, "loss": 0.57792747, "num_input_tokens_seen": 15075590, "step": 714, "time_per_iteration": 3.4703855514526367 }, { "auxiliary_loss_clip": 0.01236651, "auxiliary_loss_mlp": 0.01053141, "balance_loss_clip": 0.90901089, "balance_loss_mlp": 1.03667831, "epoch": 0.08597366680695004, "flos": 22456594684800.0, "grad_norm": 1.9621769293042917, "language_loss": 0.87674975, "learning_rate": 3.967288400710045e-06, "loss": 0.89964765, "num_input_tokens_seen": 15095055, "step": 715, "time_per_iteration": 2.8550052642822266 }, { "auxiliary_loss_clip": 0.01251645, "auxiliary_loss_mlp": 0.01053759, "balance_loss_clip": 0.95769513, "balance_loss_mlp": 1.03734422, "epoch": 0.08609390969758914, "flos": 23550505430400.0, "grad_norm": 2.0200989983257953, "language_loss": 0.88442922, "learning_rate": 3.9671479412353335e-06, "loss": 0.90748334, "num_input_tokens_seen": 15113520, "step": 716, "time_per_iteration": 2.7616677284240723 }, { "auxiliary_loss_clip": 0.01256413, "auxiliary_loss_mlp": 0.01048691, "balance_loss_clip": 1.03245509, "balance_loss_mlp": 1.03314638, "epoch": 0.08621415258822822, "flos": 25885848078720.0, "grad_norm": 3.1328791580204265, "language_loss": 0.73852563, "learning_rate": 3.967007183345843e-06, "loss": 0.76157665, "num_input_tokens_seen": 15133375, "step": 717, "time_per_iteration": 2.7761011123657227 }, { "auxiliary_loss_clip": 0.01257698, "auxiliary_loss_mlp": 0.01051342, "balance_loss_clip": 1.03324533, "balance_loss_mlp": 1.03512931, "epoch": 0.08633439547886732, "flos": 13589568120960.0, "grad_norm": 2.3136301891923785, "language_loss": 0.89352143, "learning_rate": 3.966866127062927e-06, "loss": 0.91661185, "num_input_tokens_seen": 15150500, "step": 718, "time_per_iteration": 2.734149217605591 }, { "auxiliary_loss_clip": 0.01167893, "auxiliary_loss_mlp": 0.01013184, "balance_loss_clip": 1.01977015, "balance_loss_mlp": 1.00231206, "epoch": 0.0864546383695064, "flos": 57767342434560.0, "grad_norm": 0.8720489535245326, "language_loss": 0.62656844, "learning_rate": 3.966724772407982e-06, "loss": 0.64837921, "num_input_tokens_seen": 15208015, "step": 719, "time_per_iteration": 3.132209539413452 }, { "auxiliary_loss_clip": 0.01237055, "auxiliary_loss_mlp": 0.01055536, "balance_loss_clip": 0.95107353, "balance_loss_mlp": 1.03928804, "epoch": 0.0865748812601455, "flos": 20046952753920.0, "grad_norm": 2.193494924544307, "language_loss": 0.88720095, "learning_rate": 3.966583119402454e-06, "loss": 0.91012686, "num_input_tokens_seen": 15224780, "step": 720, "time_per_iteration": 3.7327733039855957 }, { "auxiliary_loss_clip": 0.01254604, "auxiliary_loss_mlp": 0.01130394, "balance_loss_clip": 1.03198981, "balance_loss_mlp": 0.0, "epoch": 0.08669512415078459, "flos": 35262446935680.0, "grad_norm": 2.501675932673333, "language_loss": 0.82134503, "learning_rate": 3.9664411680678305e-06, "loss": 0.84519494, "num_input_tokens_seen": 15246535, "step": 721, "time_per_iteration": 2.832578659057617 }, { "auxiliary_loss_clip": 0.01165316, "auxiliary_loss_mlp": 0.01011623, "balance_loss_clip": 0.93863928, "balance_loss_mlp": 1.00084686, "epoch": 0.08681536704142367, "flos": 65654870048640.0, "grad_norm": 0.8569311330217394, "language_loss": 0.61423385, "learning_rate": 3.966298918425644e-06, "loss": 0.63600326, "num_input_tokens_seen": 15304025, "step": 722, "time_per_iteration": 4.14725136756897 }, { "auxiliary_loss_clip": 0.01258594, "auxiliary_loss_mlp": 0.01044407, "balance_loss_clip": 1.02990532, "balance_loss_mlp": 1.02852821, "epoch": 0.08693560993206277, "flos": 34529940881280.0, "grad_norm": 1.7473004498216245, "language_loss": 0.82748711, "learning_rate": 3.966156370497476e-06, "loss": 0.85051709, "num_input_tokens_seen": 15327635, "step": 723, "time_per_iteration": 2.85221529006958 }, { "auxiliary_loss_clip": 0.01260509, "auxiliary_loss_mlp": 0.01048528, "balance_loss_clip": 1.02996671, "balance_loss_mlp": 1.03298306, "epoch": 0.08705585282270185, "flos": 23149419189120.0, "grad_norm": 1.7191263382084632, "language_loss": 0.88394725, "learning_rate": 3.96601352430495e-06, "loss": 0.90703762, "num_input_tokens_seen": 15347405, "step": 724, "time_per_iteration": 2.72139835357666 }, { "auxiliary_loss_clip": 0.01252959, "auxiliary_loss_mlp": 0.01052534, "balance_loss_clip": 0.99371457, "balance_loss_mlp": 1.03671455, "epoch": 0.08717609571334095, "flos": 29497599498240.0, "grad_norm": 9.10804814964547, "language_loss": 0.83161706, "learning_rate": 3.965870379869735e-06, "loss": 0.85467196, "num_input_tokens_seen": 15369450, "step": 725, "time_per_iteration": 3.7617712020874023 }, { "auxiliary_loss_clip": 0.01256431, "auxiliary_loss_mlp": 0.01046807, "balance_loss_clip": 1.02901828, "balance_loss_mlp": 1.03111899, "epoch": 0.08729633860398003, "flos": 20667489137280.0, "grad_norm": 2.0461197330297565, "language_loss": 0.87067342, "learning_rate": 3.965726937213547e-06, "loss": 0.89370573, "num_input_tokens_seen": 15388085, "step": 726, "time_per_iteration": 2.7341864109039307 }, { "auxiliary_loss_clip": 0.01251556, "auxiliary_loss_mlp": 0.01045376, "balance_loss_clip": 1.02782345, "balance_loss_mlp": 1.02917492, "epoch": 0.08741658149461913, "flos": 18369493655040.0, "grad_norm": 2.1862413579019337, "language_loss": 0.81204373, "learning_rate": 3.965583196358144e-06, "loss": 0.83501303, "num_input_tokens_seen": 15407120, "step": 727, "time_per_iteration": 2.660305976867676 }, { "auxiliary_loss_clip": 0.01260494, "auxiliary_loss_mlp": 0.01049084, "balance_loss_clip": 1.06890619, "balance_loss_mlp": 1.03240681, "epoch": 0.08753682438525823, "flos": 18729677283840.0, "grad_norm": 2.1988989956140546, "language_loss": 0.74538088, "learning_rate": 3.965439157325335e-06, "loss": 0.76847661, "num_input_tokens_seen": 15424485, "step": 728, "time_per_iteration": 2.6846816539764404 }, { "auxiliary_loss_clip": 0.01247039, "auxiliary_loss_mlp": 0.0104519, "balance_loss_clip": 0.98716027, "balance_loss_mlp": 1.02912092, "epoch": 0.08765706727589731, "flos": 27776113303680.0, "grad_norm": 2.105555601895299, "language_loss": 0.75646406, "learning_rate": 3.965294820136968e-06, "loss": 0.77938628, "num_input_tokens_seen": 15446285, "step": 729, "time_per_iteration": 2.821315288543701 }, { "auxiliary_loss_clip": 0.0125674, "auxiliary_loss_mlp": 0.01052356, "balance_loss_clip": 0.99370694, "balance_loss_mlp": 1.03557169, "epoch": 0.08777731016653641, "flos": 24389127239040.0, "grad_norm": 2.639716447549593, "language_loss": 0.87179446, "learning_rate": 3.965150184814938e-06, "loss": 0.89488542, "num_input_tokens_seen": 15465770, "step": 730, "time_per_iteration": 2.7339906692504883 }, { "auxiliary_loss_clip": 0.01239846, "auxiliary_loss_mlp": 0.01044448, "balance_loss_clip": 0.98944414, "balance_loss_mlp": 1.02820015, "epoch": 0.08789755305717549, "flos": 21981855605760.0, "grad_norm": 2.1428095445214828, "language_loss": 0.76822686, "learning_rate": 3.965005251381189e-06, "loss": 0.79106975, "num_input_tokens_seen": 15483705, "step": 731, "time_per_iteration": 2.7319061756134033 }, { "auxiliary_loss_clip": 0.01172089, "auxiliary_loss_mlp": 0.01009871, "balance_loss_clip": 1.01483548, "balance_loss_mlp": 0.99885559, "epoch": 0.08801779594781459, "flos": 58360120583040.0, "grad_norm": 0.9032321053118368, "language_loss": 0.6457926, "learning_rate": 3.964860019857705e-06, "loss": 0.6676122, "num_input_tokens_seen": 15548620, "step": 732, "time_per_iteration": 3.296032190322876 }, { "auxiliary_loss_clip": 0.0125605, "auxiliary_loss_mlp": 0.01055572, "balance_loss_clip": 1.07121754, "balance_loss_mlp": 1.03945458, "epoch": 0.08813803883845367, "flos": 23294785530240.0, "grad_norm": 2.2173239316595663, "language_loss": 0.83961427, "learning_rate": 3.964714490266518e-06, "loss": 0.8627305, "num_input_tokens_seen": 15569265, "step": 733, "time_per_iteration": 2.686323881149292 }, { "auxiliary_loss_clip": 0.01163583, "auxiliary_loss_mlp": 0.01009172, "balance_loss_clip": 1.01190019, "balance_loss_mlp": 0.99782318, "epoch": 0.08825828172909277, "flos": 63424924882560.0, "grad_norm": 0.8864229900990372, "language_loss": 0.64640856, "learning_rate": 3.964568662629706e-06, "loss": 0.66813612, "num_input_tokens_seen": 15630570, "step": 734, "time_per_iteration": 3.1993069648742676 }, { "auxiliary_loss_clip": 0.01253517, "auxiliary_loss_mlp": 0.01049803, "balance_loss_clip": 1.02845144, "balance_loss_mlp": 1.03416324, "epoch": 0.08837852461973186, "flos": 26720986268160.0, "grad_norm": 3.065368272820675, "language_loss": 0.84317172, "learning_rate": 3.9644225369693895e-06, "loss": 0.86620492, "num_input_tokens_seen": 15650870, "step": 735, "time_per_iteration": 2.7644100189208984 }, { "auxiliary_loss_clip": 0.01260068, "auxiliary_loss_mlp": 0.01045437, "balance_loss_clip": 1.07234359, "balance_loss_mlp": 1.02912927, "epoch": 0.08849876751037095, "flos": 27265427688960.0, "grad_norm": 2.6716861837636454, "language_loss": 0.86810148, "learning_rate": 3.964276113307735e-06, "loss": 0.89115655, "num_input_tokens_seen": 15670835, "step": 736, "time_per_iteration": 2.703094005584717 }, { "auxiliary_loss_clip": 0.0124877, "auxiliary_loss_mlp": 0.01050447, "balance_loss_clip": 0.95434439, "balance_loss_mlp": 1.03490257, "epoch": 0.08861901040101004, "flos": 19828759587840.0, "grad_norm": 1.9851352626678849, "language_loss": 0.80662352, "learning_rate": 3.9641293916669574e-06, "loss": 0.82961571, "num_input_tokens_seen": 15689795, "step": 737, "time_per_iteration": 2.7667667865753174 }, { "auxiliary_loss_clip": 0.0124718, "auxiliary_loss_mlp": 0.01048001, "balance_loss_clip": 0.95470476, "balance_loss_mlp": 1.03160977, "epoch": 0.08873925329164913, "flos": 23658704173440.0, "grad_norm": 1.7624336736134576, "language_loss": 0.82958102, "learning_rate": 3.9639823720693115e-06, "loss": 0.85253286, "num_input_tokens_seen": 15711650, "step": 738, "time_per_iteration": 2.7834839820861816 }, { "auxiliary_loss_clip": 0.01164101, "auxiliary_loss_mlp": 0.01012872, "balance_loss_clip": 0.93857431, "balance_loss_mlp": 1.00238168, "epoch": 0.08885949618228822, "flos": 71831541893760.0, "grad_norm": 0.8444990103102271, "language_loss": 0.60042846, "learning_rate": 3.963835054537102e-06, "loss": 0.62219822, "num_input_tokens_seen": 15780615, "step": 739, "time_per_iteration": 3.361875295639038 }, { "auxiliary_loss_clip": 0.01248952, "auxiliary_loss_mlp": 0.01057242, "balance_loss_clip": 0.98930657, "balance_loss_mlp": 1.04167318, "epoch": 0.08897973907292732, "flos": 22346169298560.0, "grad_norm": 2.5707090547513323, "language_loss": 0.60984063, "learning_rate": 3.963687439092676e-06, "loss": 0.63290262, "num_input_tokens_seen": 15801300, "step": 740, "time_per_iteration": 2.8956737518310547 }, { "auxiliary_loss_clip": 0.01255002, "auxiliary_loss_mlp": 0.01049589, "balance_loss_clip": 1.03094745, "balance_loss_mlp": 1.0339011, "epoch": 0.0890999819635664, "flos": 21251827589760.0, "grad_norm": 2.001387031995149, "language_loss": 0.80590063, "learning_rate": 3.963539525758427e-06, "loss": 0.82894653, "num_input_tokens_seen": 15820860, "step": 741, "time_per_iteration": 2.6863505840301514 }, { "auxiliary_loss_clip": 0.01258104, "auxiliary_loss_mlp": 0.01056538, "balance_loss_clip": 0.99461544, "balance_loss_mlp": 1.03933597, "epoch": 0.0892202248542055, "flos": 25370888745600.0, "grad_norm": 1.7377696426186904, "language_loss": 0.67874634, "learning_rate": 3.9633913145567925e-06, "loss": 0.70189285, "num_input_tokens_seen": 15841350, "step": 742, "time_per_iteration": 2.784518241882324 }, { "auxiliary_loss_clip": 0.01251706, "auxiliary_loss_mlp": 0.01053588, "balance_loss_clip": 0.99052262, "balance_loss_mlp": 1.03827, "epoch": 0.08934046774484458, "flos": 24457895827200.0, "grad_norm": 2.0793482598491795, "language_loss": 0.81600624, "learning_rate": 3.9632428055102575e-06, "loss": 0.83905923, "num_input_tokens_seen": 15861360, "step": 743, "time_per_iteration": 2.7490389347076416 }, { "auxiliary_loss_clip": 0.01261877, "auxiliary_loss_mlp": 0.01059945, "balance_loss_clip": 1.03585672, "balance_loss_mlp": 1.0436368, "epoch": 0.08946071063548368, "flos": 35772773414400.0, "grad_norm": 2.6044303433621896, "language_loss": 0.66935521, "learning_rate": 3.9630939986413495e-06, "loss": 0.69257349, "num_input_tokens_seen": 15883160, "step": 744, "time_per_iteration": 2.8413374423980713 }, { "auxiliary_loss_clip": 0.01234925, "auxiliary_loss_mlp": 0.01056127, "balance_loss_clip": 0.95288336, "balance_loss_mlp": 1.04113078, "epoch": 0.08958095352612276, "flos": 14356584167040.0, "grad_norm": 1.8143899042920868, "language_loss": 0.78017437, "learning_rate": 3.962944893972643e-06, "loss": 0.80308491, "num_input_tokens_seen": 15901610, "step": 745, "time_per_iteration": 2.7332496643066406 }, { "auxiliary_loss_clip": 0.01252576, "auxiliary_loss_mlp": 0.01046626, "balance_loss_clip": 0.99432433, "balance_loss_mlp": 1.03085434, "epoch": 0.08970119641676186, "flos": 17853277345920.0, "grad_norm": 12.403379284348157, "language_loss": 0.90733236, "learning_rate": 3.962795491526756e-06, "loss": 0.93032438, "num_input_tokens_seen": 15918770, "step": 746, "time_per_iteration": 4.617533445358276 }, { "auxiliary_loss_clip": 0.01262266, "auxiliary_loss_mlp": 0.01055326, "balance_loss_clip": 1.07209837, "balance_loss_mlp": 1.03981662, "epoch": 0.08982143930740095, "flos": 20811670329600.0, "grad_norm": 2.294954503751395, "language_loss": 0.89120007, "learning_rate": 3.962645791326354e-06, "loss": 0.91437602, "num_input_tokens_seen": 15938025, "step": 747, "time_per_iteration": 2.680729627609253 }, { "auxiliary_loss_clip": 0.01253704, "auxiliary_loss_mlp": 0.01043724, "balance_loss_clip": 1.03320074, "balance_loss_mlp": 1.0286206, "epoch": 0.08994168219804004, "flos": 24097712198400.0, "grad_norm": 2.0526466434111086, "language_loss": 0.8307057, "learning_rate": 3.962495793394146e-06, "loss": 0.85368001, "num_input_tokens_seen": 15957215, "step": 748, "time_per_iteration": 3.7206218242645264 }, { "auxiliary_loss_clip": 0.01165083, "auxiliary_loss_mlp": 0.01016363, "balance_loss_clip": 1.0519737, "balance_loss_mlp": 1.00534773, "epoch": 0.09006192508867913, "flos": 57188893812480.0, "grad_norm": 0.7424922231958306, "language_loss": 0.61246049, "learning_rate": 3.9623454977528864e-06, "loss": 0.63427484, "num_input_tokens_seen": 16015870, "step": 749, "time_per_iteration": 3.13033390045166 }, { "auxiliary_loss_clip": 0.01253737, "auxiliary_loss_mlp": 0.0105088, "balance_loss_clip": 0.9551934, "balance_loss_mlp": 1.03420234, "epoch": 0.09018216797931822, "flos": 20487505063680.0, "grad_norm": 1.934376535942244, "language_loss": 0.84994602, "learning_rate": 3.962194904425375e-06, "loss": 0.87299216, "num_input_tokens_seen": 16036500, "step": 750, "time_per_iteration": 3.7921817302703857 }, { "auxiliary_loss_clip": 0.01253123, "auxiliary_loss_mlp": 0.01048953, "balance_loss_clip": 1.031551, "balance_loss_mlp": 1.03369439, "epoch": 0.09030241086995731, "flos": 22638123043200.0, "grad_norm": 2.1252243613920427, "language_loss": 0.67949498, "learning_rate": 3.9620440134344566e-06, "loss": 0.70251578, "num_input_tokens_seen": 16054655, "step": 751, "time_per_iteration": 2.6708061695098877 }, { "auxiliary_loss_clip": 0.01252043, "auxiliary_loss_mlp": 0.01043129, "balance_loss_clip": 0.95693398, "balance_loss_mlp": 1.0268445, "epoch": 0.09042265376059641, "flos": 21871502046720.0, "grad_norm": 2.5168721785854453, "language_loss": 0.82002354, "learning_rate": 3.9618928248030215e-06, "loss": 0.84297526, "num_input_tokens_seen": 16074165, "step": 752, "time_per_iteration": 2.8737611770629883 }, { "auxiliary_loss_clip": 0.0125484, "auxiliary_loss_mlp": 0.0104989, "balance_loss_clip": 1.03231883, "balance_loss_mlp": 1.03392804, "epoch": 0.0905428966512355, "flos": 24316192673280.0, "grad_norm": 2.3435770537841085, "language_loss": 0.82610607, "learning_rate": 3.961741338554005e-06, "loss": 0.8491534, "num_input_tokens_seen": 16092505, "step": 753, "time_per_iteration": 2.7245750427246094 }, { "auxiliary_loss_clip": 0.01260145, "auxiliary_loss_mlp": 0.01049087, "balance_loss_clip": 0.99398768, "balance_loss_mlp": 1.03409064, "epoch": 0.09066313954187459, "flos": 35845061535360.0, "grad_norm": 1.947453051027106, "language_loss": 0.75796068, "learning_rate": 3.9615895547103865e-06, "loss": 0.78105295, "num_input_tokens_seen": 16116150, "step": 754, "time_per_iteration": 2.8391215801239014 }, { "auxiliary_loss_clip": 0.01250227, "auxiliary_loss_mlp": 0.01042804, "balance_loss_clip": 0.98898852, "balance_loss_mlp": 1.0271039, "epoch": 0.09078338243251367, "flos": 29168729550720.0, "grad_norm": 2.0488095614387896, "language_loss": 0.7766813, "learning_rate": 3.961437473295193e-06, "loss": 0.79961157, "num_input_tokens_seen": 16136295, "step": 755, "time_per_iteration": 2.7637486457824707 }, { "auxiliary_loss_clip": 0.01227034, "auxiliary_loss_mlp": 0.01039802, "balance_loss_clip": 0.90646458, "balance_loss_mlp": 1.02431631, "epoch": 0.09090362532315277, "flos": 21907699977600.0, "grad_norm": 2.3521157493117335, "language_loss": 0.72632384, "learning_rate": 3.961285094331495e-06, "loss": 0.7489922, "num_input_tokens_seen": 16154210, "step": 756, "time_per_iteration": 2.7039477825164795 }, { "auxiliary_loss_clip": 0.01255859, "auxiliary_loss_mlp": 0.01042572, "balance_loss_clip": 1.06900299, "balance_loss_mlp": 1.02732515, "epoch": 0.09102386821379185, "flos": 27344503480320.0, "grad_norm": 1.6832484488915613, "language_loss": 0.85717642, "learning_rate": 3.961132417842406e-06, "loss": 0.88016069, "num_input_tokens_seen": 16173995, "step": 757, "time_per_iteration": 2.670947551727295 }, { "auxiliary_loss_clip": 0.012397, "auxiliary_loss_mlp": 0.01046434, "balance_loss_clip": 1.02714813, "balance_loss_mlp": 1.03103197, "epoch": 0.09114411110443095, "flos": 20813501923200.0, "grad_norm": 2.9335736259577523, "language_loss": 0.75132698, "learning_rate": 3.960979443851089e-06, "loss": 0.77418834, "num_input_tokens_seen": 16191020, "step": 758, "time_per_iteration": 2.687638759613037 }, { "auxiliary_loss_clip": 0.01247469, "auxiliary_loss_mlp": 0.01040492, "balance_loss_clip": 0.99055314, "balance_loss_mlp": 1.02525723, "epoch": 0.09126435399507005, "flos": 26145949438080.0, "grad_norm": 1.6485575986170853, "language_loss": 0.78963619, "learning_rate": 3.96082617238075e-06, "loss": 0.8125158, "num_input_tokens_seen": 16213645, "step": 759, "time_per_iteration": 2.7657337188720703 }, { "auxiliary_loss_clip": 0.01250769, "auxiliary_loss_mlp": 0.01044837, "balance_loss_clip": 0.98990798, "balance_loss_mlp": 1.0296979, "epoch": 0.09138459688570913, "flos": 24388911757440.0, "grad_norm": 2.4487766925061036, "language_loss": 0.79794109, "learning_rate": 3.960672603454639e-06, "loss": 0.8208971, "num_input_tokens_seen": 16233625, "step": 760, "time_per_iteration": 2.7249398231506348 }, { "auxiliary_loss_clip": 0.01244645, "auxiliary_loss_mlp": 0.0104908, "balance_loss_clip": 1.02886939, "balance_loss_mlp": 1.03290367, "epoch": 0.09150483977634823, "flos": 21032664756480.0, "grad_norm": 3.772029654855619, "language_loss": 0.77075875, "learning_rate": 3.960518737096054e-06, "loss": 0.79369605, "num_input_tokens_seen": 16253255, "step": 761, "time_per_iteration": 2.6789979934692383 }, { "auxiliary_loss_clip": 0.01254565, "auxiliary_loss_mlp": 0.01054206, "balance_loss_clip": 1.03056777, "balance_loss_mlp": 1.0396384, "epoch": 0.09162508266698731, "flos": 22856998567680.0, "grad_norm": 3.9685902775088855, "language_loss": 0.72461772, "learning_rate": 3.960364573328334e-06, "loss": 0.74770546, "num_input_tokens_seen": 16272580, "step": 762, "time_per_iteration": 2.777009963989258 }, { "auxiliary_loss_clip": 0.01249049, "auxiliary_loss_mlp": 0.01039851, "balance_loss_clip": 0.95159256, "balance_loss_mlp": 1.02509344, "epoch": 0.0917453255576264, "flos": 21724411852800.0, "grad_norm": 2.37672112965199, "language_loss": 0.88568258, "learning_rate": 3.9602101121748675e-06, "loss": 0.9085716, "num_input_tokens_seen": 16293075, "step": 763, "time_per_iteration": 2.753866195678711 }, { "auxiliary_loss_clip": 0.01248406, "auxiliary_loss_mlp": 0.01044908, "balance_loss_clip": 0.99055207, "balance_loss_mlp": 1.03090131, "epoch": 0.0918655684482655, "flos": 14609215497600.0, "grad_norm": 2.05274035983202, "language_loss": 0.72653234, "learning_rate": 3.960055353659085e-06, "loss": 0.74946547, "num_input_tokens_seen": 16310185, "step": 764, "time_per_iteration": 2.6787283420562744 }, { "auxiliary_loss_clip": 0.01252049, "auxiliary_loss_mlp": 0.01043372, "balance_loss_clip": 0.95538735, "balance_loss_mlp": 1.02888787, "epoch": 0.09198581133890459, "flos": 23435016226560.0, "grad_norm": 1.859949585362936, "language_loss": 0.83401287, "learning_rate": 3.959900297804465e-06, "loss": 0.85696709, "num_input_tokens_seen": 16330355, "step": 765, "time_per_iteration": 2.775991678237915 }, { "auxiliary_loss_clip": 0.01231825, "auxiliary_loss_mlp": 0.01041456, "balance_loss_clip": 0.98556995, "balance_loss_mlp": 1.0264715, "epoch": 0.09210605422954368, "flos": 16795887753600.0, "grad_norm": 2.517213594406481, "language_loss": 0.77272713, "learning_rate": 3.9597449446345276e-06, "loss": 0.79545999, "num_input_tokens_seen": 16347600, "step": 766, "time_per_iteration": 2.683511972427368 }, { "auxiliary_loss_clip": 0.0123147, "auxiliary_loss_mlp": 0.01045572, "balance_loss_clip": 0.98806393, "balance_loss_mlp": 1.03089786, "epoch": 0.09222629712018277, "flos": 22674249146880.0, "grad_norm": 2.191626288548549, "language_loss": 0.83177352, "learning_rate": 3.95958929417284e-06, "loss": 0.85454392, "num_input_tokens_seen": 16365755, "step": 767, "time_per_iteration": 2.6702938079833984 }, { "auxiliary_loss_clip": 0.01160383, "auxiliary_loss_mlp": 0.01011188, "balance_loss_clip": 1.00759768, "balance_loss_mlp": 1.00088811, "epoch": 0.09234654001082186, "flos": 69976756327680.0, "grad_norm": 0.7386901054948797, "language_loss": 0.58821422, "learning_rate": 3.9594333464430145e-06, "loss": 0.60992992, "num_input_tokens_seen": 16435245, "step": 768, "time_per_iteration": 3.394482374191284 }, { "auxiliary_loss_clip": 0.01227758, "auxiliary_loss_mlp": 0.01041894, "balance_loss_clip": 0.83180749, "balance_loss_mlp": 1.0262773, "epoch": 0.09246678290146094, "flos": 20011437181440.0, "grad_norm": 2.02180339484846, "language_loss": 0.88294053, "learning_rate": 3.959277101468709e-06, "loss": 0.90563703, "num_input_tokens_seen": 16454795, "step": 769, "time_per_iteration": 2.800373077392578 }, { "auxiliary_loss_clip": 0.01235422, "auxiliary_loss_mlp": 0.01042443, "balance_loss_clip": 0.99048054, "balance_loss_mlp": 1.02694607, "epoch": 0.09258702579210004, "flos": 17747448900480.0, "grad_norm": 3.6949962946142563, "language_loss": 0.78748375, "learning_rate": 3.959120559273624e-06, "loss": 0.81026244, "num_input_tokens_seen": 16472580, "step": 770, "time_per_iteration": 2.6895720958709717 }, { "auxiliary_loss_clip": 0.01235753, "auxiliary_loss_mlp": 0.01047446, "balance_loss_clip": 0.99203199, "balance_loss_mlp": 1.03156757, "epoch": 0.09270726868273914, "flos": 20886544229760.0, "grad_norm": 2.2023018346334933, "language_loss": 0.83459705, "learning_rate": 3.958963719881509e-06, "loss": 0.85742909, "num_input_tokens_seen": 16490670, "step": 771, "time_per_iteration": 2.69238018989563 }, { "auxiliary_loss_clip": 0.0125007, "auxiliary_loss_mlp": 0.01051868, "balance_loss_clip": 1.03170669, "balance_loss_mlp": 1.0366925, "epoch": 0.09282751157337822, "flos": 17015697031680.0, "grad_norm": 2.2890327468756575, "language_loss": 0.93460566, "learning_rate": 3.958806583316154e-06, "loss": 0.95762503, "num_input_tokens_seen": 16508640, "step": 772, "time_per_iteration": 3.6273553371429443 }, { "auxiliary_loss_clip": 0.01256127, "auxiliary_loss_mlp": 0.01047519, "balance_loss_clip": 1.07170081, "balance_loss_mlp": 1.03240287, "epoch": 0.09294775446401732, "flos": 32523647748480.0, "grad_norm": 7.221685661537401, "language_loss": 0.78647238, "learning_rate": 3.9586491496013985e-06, "loss": 0.8095088, "num_input_tokens_seen": 16531035, "step": 773, "time_per_iteration": 2.7372355461120605 }, { "auxiliary_loss_clip": 0.01255992, "auxiliary_loss_mlp": 0.01045168, "balance_loss_clip": 1.03173816, "balance_loss_mlp": 1.03002834, "epoch": 0.0930679973546564, "flos": 18259750627200.0, "grad_norm": 2.2554481299106026, "language_loss": 0.83129728, "learning_rate": 3.958491418761124e-06, "loss": 0.85430884, "num_input_tokens_seen": 16548605, "step": 774, "time_per_iteration": 2.6144673824310303 }, { "auxiliary_loss_clip": 0.01248355, "auxiliary_loss_mlp": 0.01058806, "balance_loss_clip": 0.98790056, "balance_loss_mlp": 1.04398894, "epoch": 0.0931882402452955, "flos": 21099745405440.0, "grad_norm": 2.5113891006107525, "language_loss": 0.72697204, "learning_rate": 3.958333390819258e-06, "loss": 0.75004363, "num_input_tokens_seen": 16565535, "step": 775, "time_per_iteration": 3.568223237991333 }, { "auxiliary_loss_clip": 0.01258682, "auxiliary_loss_mlp": 0.01046584, "balance_loss_clip": 1.0719707, "balance_loss_mlp": 1.0313015, "epoch": 0.0933084831359346, "flos": 24207275658240.0, "grad_norm": 2.096441235058588, "language_loss": 0.80125839, "learning_rate": 3.9581750657997754e-06, "loss": 0.82431108, "num_input_tokens_seen": 16584900, "step": 776, "time_per_iteration": 2.7246439456939697 }, { "auxiliary_loss_clip": 0.01246036, "auxiliary_loss_mlp": 0.01045278, "balance_loss_clip": 0.99001729, "balance_loss_mlp": 1.03066325, "epoch": 0.09342872602657368, "flos": 25480272637440.0, "grad_norm": 2.0075321082273927, "language_loss": 0.89805746, "learning_rate": 3.95801644372669e-06, "loss": 0.92097062, "num_input_tokens_seen": 16604805, "step": 777, "time_per_iteration": 3.6878504753112793 }, { "auxiliary_loss_clip": 0.01257114, "auxiliary_loss_mlp": 0.01046527, "balance_loss_clip": 0.9895367, "balance_loss_mlp": 1.0315547, "epoch": 0.09354896891721277, "flos": 23149060053120.0, "grad_norm": 2.0480395606931063, "language_loss": 0.84331459, "learning_rate": 3.957857524624068e-06, "loss": 0.86635101, "num_input_tokens_seen": 16623685, "step": 778, "time_per_iteration": 2.718367338180542 }, { "auxiliary_loss_clip": 0.01250986, "auxiliary_loss_mlp": 0.0105374, "balance_loss_clip": 0.99280322, "balance_loss_mlp": 1.03848076, "epoch": 0.09366921180785186, "flos": 24279563779200.0, "grad_norm": 1.5771998707865118, "language_loss": 0.89483976, "learning_rate": 3.957698308516016e-06, "loss": 0.91788703, "num_input_tokens_seen": 16644985, "step": 779, "time_per_iteration": 2.7659237384796143 }, { "auxiliary_loss_clip": 0.01252269, "auxiliary_loss_mlp": 0.01129945, "balance_loss_clip": 1.03601539, "balance_loss_mlp": 0.0, "epoch": 0.09378945469849095, "flos": 18730036419840.0, "grad_norm": 2.780498432348566, "language_loss": 0.8223325, "learning_rate": 3.957538795426688e-06, "loss": 0.84615469, "num_input_tokens_seen": 16662410, "step": 780, "time_per_iteration": 2.6547601222991943 }, { "auxiliary_loss_clip": 0.01252482, "auxiliary_loss_mlp": 0.01055612, "balance_loss_clip": 0.99232745, "balance_loss_mlp": 1.03984046, "epoch": 0.09390969758913004, "flos": 23218834222080.0, "grad_norm": 3.398154640450128, "language_loss": 0.77567321, "learning_rate": 3.9573789853802804e-06, "loss": 0.79875416, "num_input_tokens_seen": 16680885, "step": 781, "time_per_iteration": 2.650268793106079 }, { "auxiliary_loss_clip": 0.01251243, "auxiliary_loss_mlp": 0.01129825, "balance_loss_clip": 0.99363422, "balance_loss_mlp": 0.0, "epoch": 0.09402994047976913, "flos": 19646728439040.0, "grad_norm": 2.0775835532470373, "language_loss": 0.74730796, "learning_rate": 3.957218878401037e-06, "loss": 0.77111864, "num_input_tokens_seen": 16699375, "step": 782, "time_per_iteration": 2.695493221282959 }, { "auxiliary_loss_clip": 0.01262383, "auxiliary_loss_mlp": 0.01051928, "balance_loss_clip": 1.07612252, "balance_loss_mlp": 1.03587031, "epoch": 0.09415018337040823, "flos": 29420463041280.0, "grad_norm": 1.906741283804164, "language_loss": 0.89395195, "learning_rate": 3.957058474513246e-06, "loss": 0.91709507, "num_input_tokens_seen": 16719230, "step": 783, "time_per_iteration": 2.6889994144439697 }, { "auxiliary_loss_clip": 0.0125261, "auxiliary_loss_mlp": 0.01049811, "balance_loss_clip": 1.03467965, "balance_loss_mlp": 1.03468347, "epoch": 0.09427042626104731, "flos": 24572092141440.0, "grad_norm": 1.82472587480208, "language_loss": 0.78246236, "learning_rate": 3.956897773741241e-06, "loss": 0.80548656, "num_input_tokens_seen": 16738220, "step": 784, "time_per_iteration": 2.641174554824829 }, { "auxiliary_loss_clip": 0.01232452, "auxiliary_loss_mlp": 0.01051512, "balance_loss_clip": 0.98671293, "balance_loss_mlp": 1.03602684, "epoch": 0.09439066915168641, "flos": 26359581576960.0, "grad_norm": 1.740508514746703, "language_loss": 0.71725404, "learning_rate": 3.956736776109398e-06, "loss": 0.74009365, "num_input_tokens_seen": 16759395, "step": 785, "time_per_iteration": 2.7622854709625244 }, { "auxiliary_loss_clip": 0.01239898, "auxiliary_loss_mlp": 0.01130518, "balance_loss_clip": 1.02908456, "balance_loss_mlp": 0.0, "epoch": 0.09451091204232549, "flos": 19427278296960.0, "grad_norm": 1.8678374994350269, "language_loss": 0.83726585, "learning_rate": 3.956575481642143e-06, "loss": 0.86097002, "num_input_tokens_seen": 16778285, "step": 786, "time_per_iteration": 2.646418571472168 }, { "auxiliary_loss_clip": 0.01240666, "auxiliary_loss_mlp": 0.01039849, "balance_loss_clip": 0.91013682, "balance_loss_mlp": 1.02443576, "epoch": 0.09463115493296459, "flos": 25368051571200.0, "grad_norm": 2.9927270113359765, "language_loss": 0.74886096, "learning_rate": 3.956413890363943e-06, "loss": 0.77166605, "num_input_tokens_seen": 16795265, "step": 787, "time_per_iteration": 2.8030455112457275 }, { "auxiliary_loss_clip": 0.01255534, "auxiliary_loss_mlp": 0.01045301, "balance_loss_clip": 1.03446829, "balance_loss_mlp": 1.02997112, "epoch": 0.09475139782360369, "flos": 10123254869760.0, "grad_norm": 2.5456778151423016, "language_loss": 0.81503236, "learning_rate": 3.956252002299312e-06, "loss": 0.83804077, "num_input_tokens_seen": 16811165, "step": 788, "time_per_iteration": 2.689772605895996 }, { "auxiliary_loss_clip": 0.01255567, "auxiliary_loss_mlp": 0.01041229, "balance_loss_clip": 1.0723536, "balance_loss_mlp": 1.02642274, "epoch": 0.09487164071424277, "flos": 17231088936960.0, "grad_norm": 2.8300103198181814, "language_loss": 0.9066751, "learning_rate": 3.956089817472807e-06, "loss": 0.92964303, "num_input_tokens_seen": 16828470, "step": 789, "time_per_iteration": 2.6343140602111816 }, { "auxiliary_loss_clip": 0.01251501, "auxiliary_loss_mlp": 0.01050207, "balance_loss_clip": 0.99467105, "balance_loss_mlp": 1.03511488, "epoch": 0.09499188360488187, "flos": 30849564528000.0, "grad_norm": 2.3749880169818036, "language_loss": 0.85370111, "learning_rate": 3.955927335909032e-06, "loss": 0.87671816, "num_input_tokens_seen": 16851680, "step": 790, "time_per_iteration": 2.789907455444336 }, { "auxiliary_loss_clip": 0.01241055, "auxiliary_loss_mlp": 0.01044282, "balance_loss_clip": 0.91401339, "balance_loss_mlp": 1.0289402, "epoch": 0.09511212649552095, "flos": 29351694453120.0, "grad_norm": 2.329822135965108, "language_loss": 0.75783712, "learning_rate": 3.955764557632634e-06, "loss": 0.78069043, "num_input_tokens_seen": 16871490, "step": 791, "time_per_iteration": 2.8316264152526855 }, { "auxiliary_loss_clip": 0.01243096, "auxiliary_loss_mlp": 0.01044933, "balance_loss_clip": 0.99144793, "balance_loss_mlp": 1.02924478, "epoch": 0.09523236938616005, "flos": 10378687461120.0, "grad_norm": 2.691394989173461, "language_loss": 0.94651353, "learning_rate": 3.955601482668309e-06, "loss": 0.96939379, "num_input_tokens_seen": 16889350, "step": 792, "time_per_iteration": 2.6259140968322754 }, { "auxiliary_loss_clip": 0.01240613, "auxiliary_loss_mlp": 0.01047386, "balance_loss_clip": 0.91136652, "balance_loss_mlp": 1.03014779, "epoch": 0.09535261227679913, "flos": 19061815368960.0, "grad_norm": 2.045892110114207, "language_loss": 0.88600934, "learning_rate": 3.955438111040794e-06, "loss": 0.90888935, "num_input_tokens_seen": 16907625, "step": 793, "time_per_iteration": 2.7525205612182617 }, { "auxiliary_loss_clip": 0.01239115, "auxiliary_loss_mlp": 0.01052194, "balance_loss_clip": 0.9125343, "balance_loss_mlp": 1.03623199, "epoch": 0.09547285516743823, "flos": 20922993555840.0, "grad_norm": 1.8966965843139063, "language_loss": 0.79866296, "learning_rate": 3.955274442774873e-06, "loss": 0.821576, "num_input_tokens_seen": 16926205, "step": 794, "time_per_iteration": 2.7612383365631104 }, { "auxiliary_loss_clip": 0.01255298, "auxiliary_loss_mlp": 0.0105133, "balance_loss_clip": 1.03290212, "balance_loss_mlp": 1.0357132, "epoch": 0.09559309805807732, "flos": 30154405639680.0, "grad_norm": 1.957261481508997, "language_loss": 0.70697969, "learning_rate": 3.9551104778953725e-06, "loss": 0.73004597, "num_input_tokens_seen": 16946500, "step": 795, "time_per_iteration": 2.7628393173217773 }, { "auxiliary_loss_clip": 0.012474, "auxiliary_loss_mlp": 0.01046028, "balance_loss_clip": 0.95168865, "balance_loss_mlp": 1.03121078, "epoch": 0.0957133409487164, "flos": 21066743784960.0, "grad_norm": 1.9912865877617705, "language_loss": 0.8546716, "learning_rate": 3.954946216427167e-06, "loss": 0.87760586, "num_input_tokens_seen": 16966960, "step": 796, "time_per_iteration": 2.80507493019104 }, { "auxiliary_loss_clip": 0.01144495, "auxiliary_loss_mlp": 0.01009898, "balance_loss_clip": 0.92539823, "balance_loss_mlp": 0.99974138, "epoch": 0.0958335838393555, "flos": 71297979315840.0, "grad_norm": 0.9045075368720491, "language_loss": 0.61584312, "learning_rate": 3.954781658395176e-06, "loss": 0.63738704, "num_input_tokens_seen": 17023215, "step": 797, "time_per_iteration": 3.207118034362793 }, { "auxiliary_loss_clip": 0.01256812, "auxiliary_loss_mlp": 0.01051186, "balance_loss_clip": 0.99273092, "balance_loss_mlp": 1.03598654, "epoch": 0.09595382672999458, "flos": 21872974504320.0, "grad_norm": 2.0106448918804123, "language_loss": 0.92244738, "learning_rate": 3.95461680382436e-06, "loss": 0.94552732, "num_input_tokens_seen": 17042140, "step": 798, "time_per_iteration": 3.7556865215301514 }, { "auxiliary_loss_clip": 0.01255709, "auxiliary_loss_mlp": 0.0104785, "balance_loss_clip": 1.03414214, "balance_loss_mlp": 1.03243673, "epoch": 0.09607406962063368, "flos": 18695562341760.0, "grad_norm": 3.7583852898790377, "language_loss": 0.86345971, "learning_rate": 3.9544516527397295e-06, "loss": 0.88649529, "num_input_tokens_seen": 17058490, "step": 799, "time_per_iteration": 2.649595022201538 }, { "auxiliary_loss_clip": 0.01237961, "auxiliary_loss_mlp": 0.01050651, "balance_loss_clip": 0.99161816, "balance_loss_mlp": 1.03446269, "epoch": 0.09619431251127276, "flos": 22568456615040.0, "grad_norm": 1.7217416596615387, "language_loss": 0.8051222, "learning_rate": 3.954286205166338e-06, "loss": 0.82800829, "num_input_tokens_seen": 17079655, "step": 800, "time_per_iteration": 3.658647060394287 }, { "auxiliary_loss_clip": 0.01260421, "auxiliary_loss_mlp": 0.0104277, "balance_loss_clip": 1.03781819, "balance_loss_mlp": 1.02683151, "epoch": 0.09631455540191186, "flos": 14246230608000.0, "grad_norm": 3.011177624869204, "language_loss": 0.83836293, "learning_rate": 3.954120461129282e-06, "loss": 0.86139482, "num_input_tokens_seen": 17097065, "step": 801, "time_per_iteration": 2.68165922164917 }, { "auxiliary_loss_clip": 0.01257512, "auxiliary_loss_mlp": 0.01062426, "balance_loss_clip": 1.07405508, "balance_loss_mlp": 1.04715538, "epoch": 0.09643479829255096, "flos": 20740387789440.0, "grad_norm": 3.144600849464967, "language_loss": 0.83577573, "learning_rate": 3.953954420653706e-06, "loss": 0.85897505, "num_input_tokens_seen": 17114090, "step": 802, "time_per_iteration": 3.5663681030273438 }, { "auxiliary_loss_clip": 0.01254782, "auxiliary_loss_mlp": 0.01046973, "balance_loss_clip": 1.03417552, "balance_loss_mlp": 1.03102231, "epoch": 0.09655504118319004, "flos": 24420476833920.0, "grad_norm": 2.034508138271821, "language_loss": 0.88272685, "learning_rate": 3.953788083764798e-06, "loss": 0.90574443, "num_input_tokens_seen": 17133325, "step": 803, "time_per_iteration": 2.679504632949829 }, { "auxiliary_loss_clip": 0.01247955, "auxiliary_loss_mlp": 0.01049316, "balance_loss_clip": 0.91679609, "balance_loss_mlp": 1.03392577, "epoch": 0.09667528407382914, "flos": 18441961344000.0, "grad_norm": 2.1225522995214985, "language_loss": 0.91936934, "learning_rate": 3.953621450487792e-06, "loss": 0.94234204, "num_input_tokens_seen": 17151945, "step": 804, "time_per_iteration": 2.861133575439453 }, { "auxiliary_loss_clip": 0.01153248, "auxiliary_loss_mlp": 0.0101433, "balance_loss_clip": 1.04416871, "balance_loss_mlp": 1.00441194, "epoch": 0.09679552696446822, "flos": 70816455544320.0, "grad_norm": 0.8519557461042494, "language_loss": 0.61241531, "learning_rate": 3.953454520847964e-06, "loss": 0.63409114, "num_input_tokens_seen": 17216790, "step": 805, "time_per_iteration": 3.354538917541504 }, { "auxiliary_loss_clip": 0.01245628, "auxiliary_loss_mlp": 0.01050476, "balance_loss_clip": 0.99231851, "balance_loss_mlp": 1.03453767, "epoch": 0.09691576985510732, "flos": 21945514020480.0, "grad_norm": 2.3161842312154683, "language_loss": 0.73592931, "learning_rate": 3.9532872948706395e-06, "loss": 0.75889039, "num_input_tokens_seen": 17236285, "step": 806, "time_per_iteration": 2.688232183456421 }, { "auxiliary_loss_clip": 0.01255835, "auxiliary_loss_mlp": 0.01060336, "balance_loss_clip": 0.99546152, "balance_loss_mlp": 1.04371798, "epoch": 0.09703601274574641, "flos": 17965211103360.0, "grad_norm": 3.6372491925157275, "language_loss": 0.82780719, "learning_rate": 3.9531197725811845e-06, "loss": 0.85096896, "num_input_tokens_seen": 17251670, "step": 807, "time_per_iteration": 2.6715714931488037 }, { "auxiliary_loss_clip": 0.01257711, "auxiliary_loss_mlp": 0.010542, "balance_loss_clip": 1.0752461, "balance_loss_mlp": 1.03901267, "epoch": 0.0971562556363855, "flos": 22162162901760.0, "grad_norm": 1.7238964610288765, "language_loss": 0.87852228, "learning_rate": 3.952951954005013e-06, "loss": 0.90164137, "num_input_tokens_seen": 17271355, "step": 808, "time_per_iteration": 2.6660726070404053 }, { "auxiliary_loss_clip": 0.01247986, "auxiliary_loss_mlp": 0.01047264, "balance_loss_clip": 0.98773199, "balance_loss_mlp": 1.03215981, "epoch": 0.0972764985270246, "flos": 25848716394240.0, "grad_norm": 1.7256852760841919, "language_loss": 0.84908098, "learning_rate": 3.952783839167584e-06, "loss": 0.87203348, "num_input_tokens_seen": 17291400, "step": 809, "time_per_iteration": 2.7439754009246826 }, { "auxiliary_loss_clip": 0.01252211, "auxiliary_loss_mlp": 0.01050965, "balance_loss_clip": 1.03115582, "balance_loss_mlp": 1.03487182, "epoch": 0.09739674141766368, "flos": 20339373375360.0, "grad_norm": 2.6096668956968094, "language_loss": 0.74181527, "learning_rate": 3.952615428094398e-06, "loss": 0.76484698, "num_input_tokens_seen": 17310920, "step": 810, "time_per_iteration": 2.7993547916412354 }, { "auxiliary_loss_clip": 0.01232824, "auxiliary_loss_mlp": 0.01052358, "balance_loss_clip": 0.91044074, "balance_loss_mlp": 1.03644395, "epoch": 0.09751698430830277, "flos": 15743059188480.0, "grad_norm": 2.1076136304895057, "language_loss": 0.73593485, "learning_rate": 3.952446720811004e-06, "loss": 0.75878668, "num_input_tokens_seen": 17329245, "step": 811, "time_per_iteration": 2.7895476818084717 }, { "auxiliary_loss_clip": 0.01146878, "auxiliary_loss_mlp": 0.01010303, "balance_loss_clip": 0.92641836, "balance_loss_mlp": 1.00043249, "epoch": 0.09763722719894186, "flos": 63716806800000.0, "grad_norm": 0.8435238857604066, "language_loss": 0.63683796, "learning_rate": 3.952277717342995e-06, "loss": 0.65840983, "num_input_tokens_seen": 17395680, "step": 812, "time_per_iteration": 3.373987913131714 }, { "auxiliary_loss_clip": 0.012591, "auxiliary_loss_mlp": 0.01056211, "balance_loss_clip": 0.99380374, "balance_loss_mlp": 1.03991544, "epoch": 0.09775747008958095, "flos": 22090916275200.0, "grad_norm": 1.856231341495713, "language_loss": 0.85209763, "learning_rate": 3.952108417716009e-06, "loss": 0.87525082, "num_input_tokens_seen": 17415135, "step": 813, "time_per_iteration": 2.73726749420166 }, { "auxiliary_loss_clip": 0.01255009, "auxiliary_loss_mlp": 0.01047763, "balance_loss_clip": 1.03580475, "balance_loss_mlp": 1.03158665, "epoch": 0.09787771298022005, "flos": 21286050272640.0, "grad_norm": 3.273316395327301, "language_loss": 0.84815246, "learning_rate": 3.951938821955727e-06, "loss": 0.87118018, "num_input_tokens_seen": 17434535, "step": 814, "time_per_iteration": 2.6455228328704834 }, { "auxiliary_loss_clip": 0.01247732, "auxiliary_loss_mlp": 0.01055754, "balance_loss_clip": 0.99355543, "balance_loss_mlp": 1.04034042, "epoch": 0.09799795587085913, "flos": 22054574689920.0, "grad_norm": 2.321131923394081, "language_loss": 0.76644957, "learning_rate": 3.9517689300878786e-06, "loss": 0.78948444, "num_input_tokens_seen": 17454270, "step": 815, "time_per_iteration": 2.7093756198883057 }, { "auxiliary_loss_clip": 0.01253165, "auxiliary_loss_mlp": 0.0104397, "balance_loss_clip": 1.06942701, "balance_loss_mlp": 1.02976036, "epoch": 0.09811819876149823, "flos": 22163743100160.0, "grad_norm": 1.584824046273688, "language_loss": 0.78517783, "learning_rate": 3.951598742138236e-06, "loss": 0.80814916, "num_input_tokens_seen": 17472995, "step": 816, "time_per_iteration": 2.6518614292144775 }, { "auxiliary_loss_clip": 0.01253322, "auxiliary_loss_mlp": 0.01042385, "balance_loss_clip": 0.98947412, "balance_loss_mlp": 1.02681661, "epoch": 0.09823844165213731, "flos": 22231111057920.0, "grad_norm": 2.4100125770900553, "language_loss": 0.79622412, "learning_rate": 3.951428258132615e-06, "loss": 0.8191812, "num_input_tokens_seen": 17491115, "step": 817, "time_per_iteration": 2.7635838985443115 }, { "auxiliary_loss_clip": 0.01255048, "auxiliary_loss_mlp": 0.01053106, "balance_loss_clip": 0.99701595, "balance_loss_mlp": 1.03808618, "epoch": 0.09835868454277641, "flos": 22487728798080.0, "grad_norm": 1.9010819279425812, "language_loss": 0.84478509, "learning_rate": 3.951257478096879e-06, "loss": 0.86786664, "num_input_tokens_seen": 17509480, "step": 818, "time_per_iteration": 2.7813637256622314 }, { "auxiliary_loss_clip": 0.01250199, "auxiliary_loss_mlp": 0.01130503, "balance_loss_clip": 0.99309343, "balance_loss_mlp": 0.0, "epoch": 0.0984789274334155, "flos": 16362554077440.0, "grad_norm": 2.609196622658837, "language_loss": 0.68372083, "learning_rate": 3.951086402056936e-06, "loss": 0.70752788, "num_input_tokens_seen": 17524080, "step": 819, "time_per_iteration": 2.66489315032959 }, { "auxiliary_loss_clip": 0.01240267, "auxiliary_loss_mlp": 0.01130598, "balance_loss_clip": 0.84435332, "balance_loss_mlp": 0.0, "epoch": 0.09859917032405459, "flos": 24243545416320.0, "grad_norm": 1.670310442572063, "language_loss": 0.83749413, "learning_rate": 3.950915030038735e-06, "loss": 0.86120272, "num_input_tokens_seen": 17543875, "step": 820, "time_per_iteration": 3.002549886703491 }, { "auxiliary_loss_clip": 0.01248243, "auxiliary_loss_mlp": 0.01049551, "balance_loss_clip": 1.03154588, "balance_loss_mlp": 1.0347209, "epoch": 0.09871941321469369, "flos": 17420195064960.0, "grad_norm": 2.1653838550323137, "language_loss": 0.83664227, "learning_rate": 3.9507433620682765e-06, "loss": 0.85962021, "num_input_tokens_seen": 17560810, "step": 821, "time_per_iteration": 2.931641101837158 }, { "auxiliary_loss_clip": 0.012422, "auxiliary_loss_mlp": 0.01053836, "balance_loss_clip": 0.94908774, "balance_loss_mlp": 1.0388757, "epoch": 0.09883965610533277, "flos": 28477341590400.0, "grad_norm": 1.7954678338581713, "language_loss": 0.88325846, "learning_rate": 3.9505713981716e-06, "loss": 0.90621877, "num_input_tokens_seen": 17583640, "step": 822, "time_per_iteration": 2.856818437576294 }, { "auxiliary_loss_clip": 0.0124835, "auxiliary_loss_mlp": 0.01052618, "balance_loss_clip": 0.99395835, "balance_loss_mlp": 1.03821731, "epoch": 0.09895989899597187, "flos": 23693932437120.0, "grad_norm": 2.173499740348231, "language_loss": 0.81018376, "learning_rate": 3.950399138374795e-06, "loss": 0.83319342, "num_input_tokens_seen": 17602720, "step": 823, "time_per_iteration": 2.7046563625335693 }, { "auxiliary_loss_clip": 0.01249066, "auxiliary_loss_mlp": 0.01052101, "balance_loss_clip": 1.02857304, "balance_loss_mlp": 1.03720009, "epoch": 0.09908014188661095, "flos": 24679608526080.0, "grad_norm": 1.8263374415742801, "language_loss": 0.74060404, "learning_rate": 3.95022658270399e-06, "loss": 0.76361573, "num_input_tokens_seen": 17623085, "step": 824, "time_per_iteration": 3.7567479610443115 }, { "auxiliary_loss_clip": 0.01242926, "auxiliary_loss_mlp": 0.01046971, "balance_loss_clip": 0.99063075, "balance_loss_mlp": 1.03156924, "epoch": 0.09920038477725004, "flos": 14064307200000.0, "grad_norm": 2.013603858790762, "language_loss": 0.78056836, "learning_rate": 3.9500537311853635e-06, "loss": 0.80346727, "num_input_tokens_seen": 17641040, "step": 825, "time_per_iteration": 3.609637498855591 }, { "auxiliary_loss_clip": 0.01248395, "auxiliary_loss_mlp": 0.01050102, "balance_loss_clip": 1.02669501, "balance_loss_mlp": 1.03456903, "epoch": 0.09932062766788914, "flos": 13407070095360.0, "grad_norm": 2.2458340844521087, "language_loss": 0.831779, "learning_rate": 3.949880583845136e-06, "loss": 0.85476398, "num_input_tokens_seen": 17659115, "step": 826, "time_per_iteration": 3.6277496814727783 }, { "auxiliary_loss_clip": 0.01246194, "auxiliary_loss_mlp": 0.01046353, "balance_loss_clip": 0.99152493, "balance_loss_mlp": 1.03180957, "epoch": 0.09944087055852822, "flos": 19500751566720.0, "grad_norm": 1.7613133290650316, "language_loss": 0.81293166, "learning_rate": 3.949707140709575e-06, "loss": 0.83585715, "num_input_tokens_seen": 17678845, "step": 827, "time_per_iteration": 2.6526918411254883 }, { "auxiliary_loss_clip": 0.01254672, "auxiliary_loss_mlp": 0.01049516, "balance_loss_clip": 1.03206825, "balance_loss_mlp": 1.03437662, "epoch": 0.09956111344916732, "flos": 17749100926080.0, "grad_norm": 3.6024275758816935, "language_loss": 0.83441699, "learning_rate": 3.949533401804991e-06, "loss": 0.85745883, "num_input_tokens_seen": 17695750, "step": 828, "time_per_iteration": 3.543985366821289 }, { "auxiliary_loss_clip": 0.01252987, "auxiliary_loss_mlp": 0.01130246, "balance_loss_clip": 1.03361809, "balance_loss_mlp": 0.0, "epoch": 0.0996813563398064, "flos": 17967581400960.0, "grad_norm": 1.9959035536057423, "language_loss": 0.90832114, "learning_rate": 3.949359367157739e-06, "loss": 0.93215346, "num_input_tokens_seen": 17714445, "step": 829, "time_per_iteration": 2.644397735595703 }, { "auxiliary_loss_clip": 0.01254533, "auxiliary_loss_mlp": 0.01050541, "balance_loss_clip": 1.03184271, "balance_loss_mlp": 1.03523493, "epoch": 0.0998015992304455, "flos": 17457039440640.0, "grad_norm": 2.117126362659637, "language_loss": 0.75473833, "learning_rate": 3.949185036794222e-06, "loss": 0.77778906, "num_input_tokens_seen": 17732455, "step": 830, "time_per_iteration": 2.6573104858398438 }, { "auxiliary_loss_clip": 0.01252735, "auxiliary_loss_mlp": 0.0104709, "balance_loss_clip": 1.07065976, "balance_loss_mlp": 1.03297555, "epoch": 0.0999218421210846, "flos": 25888757080320.0, "grad_norm": 1.7813715852048297, "language_loss": 0.78727925, "learning_rate": 3.949010410740884e-06, "loss": 0.81027746, "num_input_tokens_seen": 17755280, "step": 831, "time_per_iteration": 2.7306854724884033 }, { "auxiliary_loss_clip": 0.01233891, "auxiliary_loss_mlp": 0.0113034, "balance_loss_clip": 0.98819697, "balance_loss_mlp": 0.0, "epoch": 0.10004208501172368, "flos": 21215916967680.0, "grad_norm": 1.734287525277213, "language_loss": 0.86471748, "learning_rate": 3.948835489024216e-06, "loss": 0.88835979, "num_input_tokens_seen": 17775015, "step": 832, "time_per_iteration": 2.744828701019287 }, { "auxiliary_loss_clip": 0.01250994, "auxiliary_loss_mlp": 0.01045942, "balance_loss_clip": 1.02849078, "balance_loss_mlp": 1.03088617, "epoch": 0.10016232790236278, "flos": 17348409734400.0, "grad_norm": 2.3309961628826574, "language_loss": 0.9040345, "learning_rate": 3.948660271670755e-06, "loss": 0.92700386, "num_input_tokens_seen": 17792165, "step": 833, "time_per_iteration": 2.6303975582122803 }, { "auxiliary_loss_clip": 0.01246818, "auxiliary_loss_mlp": 0.01045359, "balance_loss_clip": 0.99304372, "balance_loss_mlp": 1.03073156, "epoch": 0.10028257079300186, "flos": 25666541591040.0, "grad_norm": 2.180734393546764, "language_loss": 0.83980823, "learning_rate": 3.948484758707079e-06, "loss": 0.86272997, "num_input_tokens_seen": 17811765, "step": 834, "time_per_iteration": 2.7691404819488525 }, { "auxiliary_loss_clip": 0.01235111, "auxiliary_loss_mlp": 0.01045698, "balance_loss_clip": 0.94875741, "balance_loss_mlp": 1.03065348, "epoch": 0.10040281368364096, "flos": 25156035544320.0, "grad_norm": 21.001508234440724, "language_loss": 0.83298492, "learning_rate": 3.948308950159815e-06, "loss": 0.855793, "num_input_tokens_seen": 17830445, "step": 835, "time_per_iteration": 2.7930033206939697 }, { "auxiliary_loss_clip": 0.01241074, "auxiliary_loss_mlp": 0.01052739, "balance_loss_clip": 0.95006895, "balance_loss_mlp": 1.03774214, "epoch": 0.10052305657428004, "flos": 17603303621760.0, "grad_norm": 2.9453008077780023, "language_loss": 0.75736082, "learning_rate": 3.9481328460556326e-06, "loss": 0.78029895, "num_input_tokens_seen": 17847665, "step": 836, "time_per_iteration": 2.6678123474121094 }, { "auxiliary_loss_clip": 0.01233082, "auxiliary_loss_mlp": 0.01041155, "balance_loss_clip": 0.9864102, "balance_loss_mlp": 1.02646828, "epoch": 0.10064329946491914, "flos": 18660154510080.0, "grad_norm": 2.08024967472353, "language_loss": 0.8957473, "learning_rate": 3.9479564464212455e-06, "loss": 0.91848969, "num_input_tokens_seen": 17866825, "step": 837, "time_per_iteration": 2.673401355743408 }, { "auxiliary_loss_clip": 0.01254186, "auxiliary_loss_mlp": 0.01044668, "balance_loss_clip": 1.06909072, "balance_loss_mlp": 1.02926588, "epoch": 0.10076354235555823, "flos": 17199056983680.0, "grad_norm": 2.4344127336738213, "language_loss": 0.76393461, "learning_rate": 3.947779751283414e-06, "loss": 0.78692317, "num_input_tokens_seen": 17883995, "step": 838, "time_per_iteration": 2.621530771255493 }, { "auxiliary_loss_clip": 0.01255404, "auxiliary_loss_mlp": 0.01130318, "balance_loss_clip": 1.03612804, "balance_loss_mlp": 0.0, "epoch": 0.10088378524619732, "flos": 22962252395520.0, "grad_norm": 1.6916319850765733, "language_loss": 0.75828207, "learning_rate": 3.947602760668944e-06, "loss": 0.78213924, "num_input_tokens_seen": 17903785, "step": 839, "time_per_iteration": 2.7008626461029053 }, { "auxiliary_loss_clip": 0.01252819, "auxiliary_loss_mlp": 0.01044596, "balance_loss_clip": 1.0325675, "balance_loss_mlp": 1.02995718, "epoch": 0.10100402813683641, "flos": 37885828746240.0, "grad_norm": 1.8883916128476534, "language_loss": 0.71590757, "learning_rate": 3.947425474604684e-06, "loss": 0.73888171, "num_input_tokens_seen": 17927720, "step": 840, "time_per_iteration": 2.784925937652588 }, { "auxiliary_loss_clip": 0.01243153, "auxiliary_loss_mlp": 0.01043611, "balance_loss_clip": 0.9886601, "balance_loss_mlp": 1.02882886, "epoch": 0.1011242710274755, "flos": 21543458112000.0, "grad_norm": 1.9030119181031004, "language_loss": 0.92401731, "learning_rate": 3.947247893117528e-06, "loss": 0.94688493, "num_input_tokens_seen": 17946225, "step": 841, "time_per_iteration": 2.704291582107544 }, { "auxiliary_loss_clip": 0.01240178, "auxiliary_loss_mlp": 0.01052403, "balance_loss_clip": 1.02693129, "balance_loss_mlp": 1.03683472, "epoch": 0.10124451391811459, "flos": 13621456419840.0, "grad_norm": 4.0559671133046695, "language_loss": 0.69303733, "learning_rate": 3.947070016234413e-06, "loss": 0.71596313, "num_input_tokens_seen": 17962015, "step": 842, "time_per_iteration": 2.662444591522217 }, { "auxiliary_loss_clip": 0.01257523, "auxiliary_loss_mlp": 0.01051864, "balance_loss_clip": 0.99325514, "balance_loss_mlp": 1.0364027, "epoch": 0.10136475680875369, "flos": 16649228522880.0, "grad_norm": 2.4358204833838917, "language_loss": 0.74860466, "learning_rate": 3.946891843982326e-06, "loss": 0.77169847, "num_input_tokens_seen": 17979680, "step": 843, "time_per_iteration": 2.700622797012329 }, { "auxiliary_loss_clip": 0.01252635, "auxiliary_loss_mlp": 0.01046123, "balance_loss_clip": 1.03237474, "balance_loss_mlp": 1.03032792, "epoch": 0.10148499969939277, "flos": 19461034103040.0, "grad_norm": 2.2356279364518388, "language_loss": 0.74356705, "learning_rate": 3.9467133763882935e-06, "loss": 0.76655465, "num_input_tokens_seen": 17998145, "step": 844, "time_per_iteration": 2.633592367172241 }, { "auxiliary_loss_clip": 0.01238387, "auxiliary_loss_mlp": 0.01038993, "balance_loss_clip": 1.02963376, "balance_loss_mlp": 1.0241518, "epoch": 0.10160524259003187, "flos": 21104988791040.0, "grad_norm": 2.412995429975761, "language_loss": 0.86209536, "learning_rate": 3.9465346134793905e-06, "loss": 0.88486916, "num_input_tokens_seen": 18017955, "step": 845, "time_per_iteration": 2.751154661178589 }, { "auxiliary_loss_clip": 0.0124624, "auxiliary_loss_mlp": 0.01043824, "balance_loss_clip": 0.95446193, "balance_loss_mlp": 1.02899468, "epoch": 0.10172548548067095, "flos": 17712687513600.0, "grad_norm": 2.537680092678929, "language_loss": 0.80142975, "learning_rate": 3.9463555552827335e-06, "loss": 0.82433039, "num_input_tokens_seen": 18035125, "step": 846, "time_per_iteration": 2.7100913524627686 }, { "auxiliary_loss_clip": 0.01236532, "auxiliary_loss_mlp": 0.01045021, "balance_loss_clip": 1.02631402, "balance_loss_mlp": 1.02972674, "epoch": 0.10184572837131005, "flos": 21104845136640.0, "grad_norm": 3.5916605675453113, "language_loss": 0.85755789, "learning_rate": 3.946176201825487e-06, "loss": 0.88037336, "num_input_tokens_seen": 18053160, "step": 847, "time_per_iteration": 2.6777496337890625 }, { "auxiliary_loss_clip": 0.01246193, "auxiliary_loss_mlp": 0.01049918, "balance_loss_clip": 0.99264359, "balance_loss_mlp": 1.0342772, "epoch": 0.10196597126194913, "flos": 26067591918720.0, "grad_norm": 2.087863814237613, "language_loss": 0.8353864, "learning_rate": 3.9459965531348575e-06, "loss": 0.85834742, "num_input_tokens_seen": 18072815, "step": 848, "time_per_iteration": 2.8323965072631836 }, { "auxiliary_loss_clip": 0.01245519, "auxiliary_loss_mlp": 0.01130734, "balance_loss_clip": 0.99178088, "balance_loss_mlp": 0.0, "epoch": 0.10208621415258823, "flos": 29314634595840.0, "grad_norm": 2.2695144847182744, "language_loss": 0.85732293, "learning_rate": 3.945816609238098e-06, "loss": 0.8810854, "num_input_tokens_seen": 18092225, "step": 849, "time_per_iteration": 2.7144289016723633 }, { "auxiliary_loss_clip": 0.01236074, "auxiliary_loss_mlp": 0.01057176, "balance_loss_clip": 0.9145624, "balance_loss_mlp": 1.04167879, "epoch": 0.10220645704322733, "flos": 23805794367360.0, "grad_norm": 4.068568147357138, "language_loss": 0.85351712, "learning_rate": 3.945636370162507e-06, "loss": 0.87644964, "num_input_tokens_seen": 18112335, "step": 850, "time_per_iteration": 2.8065648078918457 }, { "auxiliary_loss_clip": 0.01248846, "auxiliary_loss_mlp": 0.01045188, "balance_loss_clip": 1.03237784, "balance_loss_mlp": 1.03087139, "epoch": 0.10232669993386641, "flos": 23218546913280.0, "grad_norm": 1.7867626302017703, "language_loss": 0.79042244, "learning_rate": 3.945455835935425e-06, "loss": 0.81336284, "num_input_tokens_seen": 18131520, "step": 851, "time_per_iteration": 4.587406158447266 }, { "auxiliary_loss_clip": 0.01248033, "auxiliary_loss_mlp": 0.01048086, "balance_loss_clip": 0.99184728, "balance_loss_mlp": 1.03298259, "epoch": 0.1024469428245055, "flos": 22922929981440.0, "grad_norm": 2.3161337914848015, "language_loss": 0.75297612, "learning_rate": 3.94527500658424e-06, "loss": 0.77593732, "num_input_tokens_seen": 18149185, "step": 852, "time_per_iteration": 3.633558511734009 }, { "auxiliary_loss_clip": 0.0123935, "auxiliary_loss_mlp": 0.01041437, "balance_loss_clip": 0.91454589, "balance_loss_mlp": 1.02731109, "epoch": 0.10256718571514459, "flos": 31359495957120.0, "grad_norm": 2.3998670557862365, "language_loss": 0.81047142, "learning_rate": 3.945093882136382e-06, "loss": 0.83327937, "num_input_tokens_seen": 18172960, "step": 853, "time_per_iteration": 2.8365273475646973 }, { "auxiliary_loss_clip": 0.01251197, "auxiliary_loss_mlp": 0.01129912, "balance_loss_clip": 0.99578011, "balance_loss_mlp": 0.0, "epoch": 0.10268742860578368, "flos": 23474877344640.0, "grad_norm": 2.060213636269515, "language_loss": 0.84864712, "learning_rate": 3.944912462619329e-06, "loss": 0.87245822, "num_input_tokens_seen": 18191925, "step": 854, "time_per_iteration": 3.726989507675171 }, { "auxiliary_loss_clip": 0.01247891, "auxiliary_loss_mlp": 0.01047522, "balance_loss_clip": 0.9902187, "balance_loss_mlp": 1.03196478, "epoch": 0.10280767149642277, "flos": 25520313323520.0, "grad_norm": 2.049842151656829, "language_loss": 0.80917114, "learning_rate": 3.9447307480606025e-06, "loss": 0.83212525, "num_input_tokens_seen": 18212010, "step": 855, "time_per_iteration": 2.679940700531006 }, { "auxiliary_loss_clip": 0.01237388, "auxiliary_loss_mlp": 0.01040936, "balance_loss_clip": 0.99124902, "balance_loss_mlp": 1.02627325, "epoch": 0.10292791438706186, "flos": 17347691462400.0, "grad_norm": 2.2371465133863153, "language_loss": 0.9023838, "learning_rate": 3.944548738487767e-06, "loss": 0.92516708, "num_input_tokens_seen": 18229525, "step": 856, "time_per_iteration": 2.6802895069122314 }, { "auxiliary_loss_clip": 0.01257881, "auxiliary_loss_mlp": 0.01052777, "balance_loss_clip": 1.07269835, "balance_loss_mlp": 1.03816211, "epoch": 0.10304815727770096, "flos": 27052693390080.0, "grad_norm": 1.984426427307951, "language_loss": 0.91019911, "learning_rate": 3.944366433928434e-06, "loss": 0.93330562, "num_input_tokens_seen": 18249505, "step": 857, "time_per_iteration": 2.7023134231567383 }, { "auxiliary_loss_clip": 0.01238588, "auxiliary_loss_mlp": 0.01046265, "balance_loss_clip": 0.98821729, "balance_loss_mlp": 1.03230619, "epoch": 0.10316840016834004, "flos": 22782591544320.0, "grad_norm": 1.6452481127848406, "language_loss": 0.83564621, "learning_rate": 3.9441838344102594e-06, "loss": 0.85849476, "num_input_tokens_seen": 18269230, "step": 858, "time_per_iteration": 2.7573189735412598 }, { "auxiliary_loss_clip": 0.01253867, "auxiliary_loss_mlp": 0.01059521, "balance_loss_clip": 0.99529308, "balance_loss_mlp": 1.04469121, "epoch": 0.10328864305897914, "flos": 20704584908160.0, "grad_norm": 2.8146380477697317, "language_loss": 0.66956866, "learning_rate": 3.944000939960943e-06, "loss": 0.69270253, "num_input_tokens_seen": 18287955, "step": 859, "time_per_iteration": 2.729935646057129 }, { "auxiliary_loss_clip": 0.01251095, "auxiliary_loss_mlp": 0.01042877, "balance_loss_clip": 1.03095424, "balance_loss_mlp": 1.02881002, "epoch": 0.10340888594961822, "flos": 28478814048000.0, "grad_norm": 1.541101962487206, "language_loss": 0.79845917, "learning_rate": 3.943817750608229e-06, "loss": 0.82139885, "num_input_tokens_seen": 18310505, "step": 860, "time_per_iteration": 2.7694082260131836 }, { "auxiliary_loss_clip": 0.01251744, "auxiliary_loss_mlp": 0.01039332, "balance_loss_clip": 1.03310251, "balance_loss_mlp": 1.02531302, "epoch": 0.10352912884025732, "flos": 13370333460480.0, "grad_norm": 2.515220709318716, "language_loss": 0.81818485, "learning_rate": 3.943634266379908e-06, "loss": 0.84109557, "num_input_tokens_seen": 18327400, "step": 861, "time_per_iteration": 2.661341428756714 }, { "auxiliary_loss_clip": 0.01252698, "auxiliary_loss_mlp": 0.01048114, "balance_loss_clip": 1.03028226, "balance_loss_mlp": 1.03316474, "epoch": 0.10364937173089642, "flos": 25558558329600.0, "grad_norm": 1.8659039298054234, "language_loss": 0.84923643, "learning_rate": 3.943450487303815e-06, "loss": 0.87224454, "num_input_tokens_seen": 18347895, "step": 862, "time_per_iteration": 2.743417978286743 }, { "auxiliary_loss_clip": 0.01245287, "auxiliary_loss_mlp": 0.010409, "balance_loss_clip": 1.02984643, "balance_loss_mlp": 1.02642846, "epoch": 0.1037696146215355, "flos": 21215486004480.0, "grad_norm": 1.8523473113564572, "language_loss": 0.85359859, "learning_rate": 3.943266413407827e-06, "loss": 0.87646043, "num_input_tokens_seen": 18367170, "step": 863, "time_per_iteration": 2.7132906913757324 }, { "auxiliary_loss_clip": 0.01253651, "auxiliary_loss_mlp": 0.0105475, "balance_loss_clip": 1.03243613, "balance_loss_mlp": 1.0410533, "epoch": 0.1038898575121746, "flos": 25807382818560.0, "grad_norm": 1.9654895863581325, "language_loss": 0.84961027, "learning_rate": 3.94308204471987e-06, "loss": 0.87269431, "num_input_tokens_seen": 18386185, "step": 864, "time_per_iteration": 2.7591214179992676 }, { "auxiliary_loss_clip": 0.01239973, "auxiliary_loss_mlp": 0.01048111, "balance_loss_clip": 0.95026577, "balance_loss_mlp": 1.03440166, "epoch": 0.10401010040281368, "flos": 19062425900160.0, "grad_norm": 2.718417706687038, "language_loss": 0.74556088, "learning_rate": 3.942897381267912e-06, "loss": 0.76844168, "num_input_tokens_seen": 18402550, "step": 865, "time_per_iteration": 2.8216373920440674 }, { "auxiliary_loss_clip": 0.01251945, "auxiliary_loss_mlp": 0.01041681, "balance_loss_clip": 1.03123379, "balance_loss_mlp": 1.02683914, "epoch": 0.10413034329345278, "flos": 16355119962240.0, "grad_norm": 2.751308918354223, "language_loss": 0.66412938, "learning_rate": 3.942712423079965e-06, "loss": 0.68706566, "num_input_tokens_seen": 18418940, "step": 866, "time_per_iteration": 2.7109463214874268 }, { "auxiliary_loss_clip": 0.01224213, "auxiliary_loss_mlp": 0.01042771, "balance_loss_clip": 0.94329393, "balance_loss_mlp": 1.02882314, "epoch": 0.10425058618409186, "flos": 17236511890560.0, "grad_norm": 2.4497064364982, "language_loss": 0.89800489, "learning_rate": 3.942527170184088e-06, "loss": 0.92067468, "num_input_tokens_seen": 18435560, "step": 867, "time_per_iteration": 2.800931692123413 }, { "auxiliary_loss_clip": 0.0125357, "auxiliary_loss_mlp": 0.01047321, "balance_loss_clip": 1.07077384, "balance_loss_mlp": 1.03323686, "epoch": 0.10437082907473096, "flos": 17967365919360.0, "grad_norm": 2.239894129787103, "language_loss": 0.7734617, "learning_rate": 3.942341622608385e-06, "loss": 0.79647064, "num_input_tokens_seen": 18452590, "step": 868, "time_per_iteration": 2.6357569694519043 }, { "auxiliary_loss_clip": 0.01245071, "auxiliary_loss_mlp": 0.01046589, "balance_loss_clip": 0.99397135, "balance_loss_mlp": 1.03242695, "epoch": 0.10449107196537005, "flos": 36283315374720.0, "grad_norm": 2.4756924013568495, "language_loss": 0.77848256, "learning_rate": 3.942155780381001e-06, "loss": 0.80139923, "num_input_tokens_seen": 18476325, "step": 869, "time_per_iteration": 2.824266195297241 }, { "auxiliary_loss_clip": 0.01246348, "auxiliary_loss_mlp": 0.01041699, "balance_loss_clip": 0.98950446, "balance_loss_mlp": 1.02732253, "epoch": 0.10461131485600914, "flos": 23802095266560.0, "grad_norm": 1.863625092418395, "language_loss": 0.75953114, "learning_rate": 3.94196964353013e-06, "loss": 0.78241163, "num_input_tokens_seen": 18495775, "step": 870, "time_per_iteration": 2.7658772468566895 }, { "auxiliary_loss_clip": 0.01241403, "auxiliary_loss_mlp": 0.01129479, "balance_loss_clip": 0.98687899, "balance_loss_mlp": 0.0, "epoch": 0.10473155774664823, "flos": 18405476104320.0, "grad_norm": 1.8115832499577809, "language_loss": 0.80954152, "learning_rate": 3.941783212084008e-06, "loss": 0.8332504, "num_input_tokens_seen": 18513530, "step": 871, "time_per_iteration": 2.7647695541381836 }, { "auxiliary_loss_clip": 0.01231196, "auxiliary_loss_mlp": 0.01048989, "balance_loss_clip": 0.99125206, "balance_loss_mlp": 1.03443336, "epoch": 0.10485180063728732, "flos": 25592637358080.0, "grad_norm": 3.4975047562422548, "language_loss": 0.78755462, "learning_rate": 3.941596486070916e-06, "loss": 0.81035638, "num_input_tokens_seen": 18531575, "step": 872, "time_per_iteration": 2.708238124847412 }, { "auxiliary_loss_clip": 0.01237741, "auxiliary_loss_mlp": 0.01044368, "balance_loss_clip": 0.91518092, "balance_loss_mlp": 1.03028965, "epoch": 0.10497204352792641, "flos": 27088747666560.0, "grad_norm": 2.183684652389814, "language_loss": 0.58399874, "learning_rate": 3.941409465519182e-06, "loss": 0.60681981, "num_input_tokens_seen": 18552100, "step": 873, "time_per_iteration": 2.7953338623046875 }, { "auxiliary_loss_clip": 0.01235958, "auxiliary_loss_mlp": 0.01044667, "balance_loss_clip": 1.02611113, "balance_loss_mlp": 1.02965856, "epoch": 0.10509228641856551, "flos": 32858479353600.0, "grad_norm": 1.710601677682532, "language_loss": 0.85167271, "learning_rate": 3.941222150457176e-06, "loss": 0.87447894, "num_input_tokens_seen": 18575355, "step": 874, "time_per_iteration": 2.8089675903320312 }, { "auxiliary_loss_clip": 0.01248527, "auxiliary_loss_mlp": 0.0103789, "balance_loss_clip": 1.02804017, "balance_loss_mlp": 1.02401948, "epoch": 0.10521252930920459, "flos": 14319165173760.0, "grad_norm": 2.499571492883242, "language_loss": 0.71357405, "learning_rate": 3.941034540913311e-06, "loss": 0.73643827, "num_input_tokens_seen": 18592885, "step": 875, "time_per_iteration": 2.6019771099090576 }, { "auxiliary_loss_clip": 0.01249576, "auxiliary_loss_mlp": 0.01130047, "balance_loss_clip": 1.03075862, "balance_loss_mlp": 0.0, "epoch": 0.10533277219984369, "flos": 21687028773120.0, "grad_norm": 1.5833303335005169, "language_loss": 0.82512015, "learning_rate": 3.940846636916051e-06, "loss": 0.84891641, "num_input_tokens_seen": 18612920, "step": 876, "time_per_iteration": 2.7547402381896973 }, { "auxiliary_loss_clip": 0.01244354, "auxiliary_loss_mlp": 0.01045025, "balance_loss_clip": 0.99603009, "balance_loss_mlp": 1.03179884, "epoch": 0.10545301509048277, "flos": 22269787027200.0, "grad_norm": 2.0453250324250627, "language_loss": 0.86590427, "learning_rate": 3.940658438493899e-06, "loss": 0.88879812, "num_input_tokens_seen": 18630765, "step": 877, "time_per_iteration": 3.6292178630828857 }, { "auxiliary_loss_clip": 0.01250252, "auxiliary_loss_mlp": 0.01044743, "balance_loss_clip": 1.06624484, "balance_loss_mlp": 1.03039062, "epoch": 0.10557325798112187, "flos": 22199725549440.0, "grad_norm": 2.3532175220348233, "language_loss": 0.75960958, "learning_rate": 3.940469945675405e-06, "loss": 0.78255951, "num_input_tokens_seen": 18649150, "step": 878, "time_per_iteration": 3.704881429672241 }, { "auxiliary_loss_clip": 0.01214699, "auxiliary_loss_mlp": 0.01038991, "balance_loss_clip": 0.90744913, "balance_loss_mlp": 1.02521014, "epoch": 0.10569350087176095, "flos": 25775889569280.0, "grad_norm": 1.97037677740796, "language_loss": 0.91455787, "learning_rate": 3.940281158489163e-06, "loss": 0.93709481, "num_input_tokens_seen": 18668380, "step": 879, "time_per_iteration": 3.6932387351989746 }, { "auxiliary_loss_clip": 0.01228522, "auxiliary_loss_mlp": 0.01041902, "balance_loss_clip": 0.86690605, "balance_loss_mlp": 1.02707839, "epoch": 0.10581374376240005, "flos": 17311385790720.0, "grad_norm": 1.7709309045179566, "language_loss": 0.82928073, "learning_rate": 3.940092076963812e-06, "loss": 0.85198498, "num_input_tokens_seen": 18685875, "step": 880, "time_per_iteration": 3.8824379444122314 }, { "auxiliary_loss_clip": 0.0124058, "auxiliary_loss_mlp": 0.01044003, "balance_loss_clip": 0.98758632, "balance_loss_mlp": 1.02946508, "epoch": 0.10593398665303914, "flos": 34349454017280.0, "grad_norm": 2.2165373762791347, "language_loss": 0.7861312, "learning_rate": 3.9399027011280355e-06, "loss": 0.80897707, "num_input_tokens_seen": 18707970, "step": 881, "time_per_iteration": 3.0365660190582275 }, { "auxiliary_loss_clip": 0.01245575, "auxiliary_loss_mlp": 0.01044906, "balance_loss_clip": 0.99256122, "balance_loss_mlp": 1.03014755, "epoch": 0.10605422954367823, "flos": 23257977068160.0, "grad_norm": 2.0338482780269307, "language_loss": 0.77025455, "learning_rate": 3.939713031010561e-06, "loss": 0.79315937, "num_input_tokens_seen": 18726335, "step": 882, "time_per_iteration": 2.7360289096832275 }, { "auxiliary_loss_clip": 0.01239632, "auxiliary_loss_mlp": 0.01042915, "balance_loss_clip": 0.95197916, "balance_loss_mlp": 1.02940273, "epoch": 0.10617447243431732, "flos": 22820118278400.0, "grad_norm": 2.7061830362800667, "language_loss": 0.77484298, "learning_rate": 3.939523066640163e-06, "loss": 0.7976684, "num_input_tokens_seen": 18745230, "step": 883, "time_per_iteration": 2.6785061359405518 }, { "auxiliary_loss_clip": 0.0124732, "auxiliary_loss_mlp": 0.01052482, "balance_loss_clip": 1.02875209, "balance_loss_mlp": 1.03828454, "epoch": 0.10629471532495641, "flos": 24386577373440.0, "grad_norm": 1.7730560531804918, "language_loss": 0.81175625, "learning_rate": 3.939332808045657e-06, "loss": 0.83475423, "num_input_tokens_seen": 18764880, "step": 884, "time_per_iteration": 2.6967084407806396 }, { "auxiliary_loss_clip": 0.01243888, "auxiliary_loss_mlp": 0.01042549, "balance_loss_clip": 0.95259035, "balance_loss_mlp": 1.0282979, "epoch": 0.1064149582155955, "flos": 21105491581440.0, "grad_norm": 1.6928442202763552, "language_loss": 0.84572732, "learning_rate": 3.939142255255906e-06, "loss": 0.86859167, "num_input_tokens_seen": 18785765, "step": 885, "time_per_iteration": 2.713872194290161 }, { "auxiliary_loss_clip": 0.01242613, "auxiliary_loss_mlp": 0.01046705, "balance_loss_clip": 1.02792895, "balance_loss_mlp": 1.03217888, "epoch": 0.1065352011062346, "flos": 20702035042560.0, "grad_norm": 1.824754562763651, "language_loss": 0.86743021, "learning_rate": 3.938951408299817e-06, "loss": 0.8903234, "num_input_tokens_seen": 18804605, "step": 886, "time_per_iteration": 2.672173261642456 }, { "auxiliary_loss_clip": 0.01145391, "auxiliary_loss_mlp": 0.01011729, "balance_loss_clip": 0.89462614, "balance_loss_mlp": 1.00295568, "epoch": 0.10665544399687368, "flos": 62659632689280.0, "grad_norm": 0.7958578808109096, "language_loss": 0.54436463, "learning_rate": 3.938760267206342e-06, "loss": 0.56593585, "num_input_tokens_seen": 18866425, "step": 887, "time_per_iteration": 3.271183967590332 }, { "auxiliary_loss_clip": 0.01246068, "auxiliary_loss_mlp": 0.01045978, "balance_loss_clip": 1.06653666, "balance_loss_mlp": 1.03200638, "epoch": 0.10677568688751278, "flos": 26140382830080.0, "grad_norm": 2.1076672699133505, "language_loss": 0.79001701, "learning_rate": 3.938568832004475e-06, "loss": 0.8129375, "num_input_tokens_seen": 18885130, "step": 888, "time_per_iteration": 2.641880512237549 }, { "auxiliary_loss_clip": 0.01226641, "auxiliary_loss_mlp": 0.01036885, "balance_loss_clip": 0.98499364, "balance_loss_mlp": 1.02191246, "epoch": 0.10689592977815186, "flos": 12786533712000.0, "grad_norm": 2.0440893650682943, "language_loss": 0.75504887, "learning_rate": 3.938377102723257e-06, "loss": 0.77768415, "num_input_tokens_seen": 18902265, "step": 889, "time_per_iteration": 2.708059549331665 }, { "auxiliary_loss_clip": 0.01219869, "auxiliary_loss_mlp": 0.01047248, "balance_loss_clip": 0.90839827, "balance_loss_mlp": 1.03272796, "epoch": 0.10701617266879096, "flos": 22126683242880.0, "grad_norm": 2.064570531699201, "language_loss": 0.83378637, "learning_rate": 3.938185079391774e-06, "loss": 0.85645753, "num_input_tokens_seen": 18919310, "step": 890, "time_per_iteration": 2.750368118286133 }, { "auxiliary_loss_clip": 0.01247961, "auxiliary_loss_mlp": 0.01049307, "balance_loss_clip": 1.06767559, "balance_loss_mlp": 1.03494823, "epoch": 0.10713641555943004, "flos": 19745625559680.0, "grad_norm": 3.5021988314511585, "language_loss": 1.05998826, "learning_rate": 3.937992762039157e-06, "loss": 1.08296084, "num_input_tokens_seen": 18932635, "step": 891, "time_per_iteration": 2.618579149246216 }, { "auxiliary_loss_clip": 0.01243522, "auxiliary_loss_mlp": 0.01042314, "balance_loss_clip": 1.03094172, "balance_loss_mlp": 1.02752566, "epoch": 0.10725665845006914, "flos": 23952992302080.0, "grad_norm": 1.8070869462722337, "language_loss": 0.80440164, "learning_rate": 3.937800150694577e-06, "loss": 0.82725996, "num_input_tokens_seen": 18953810, "step": 892, "time_per_iteration": 2.704911708831787 }, { "auxiliary_loss_clip": 0.01239786, "auxiliary_loss_mlp": 0.01050135, "balance_loss_clip": 0.91370583, "balance_loss_mlp": 1.03548455, "epoch": 0.10737690134070824, "flos": 18551704371840.0, "grad_norm": 2.163744326538849, "language_loss": 0.75973421, "learning_rate": 3.937607245387255e-06, "loss": 0.78263336, "num_input_tokens_seen": 18973175, "step": 893, "time_per_iteration": 2.7964820861816406 }, { "auxiliary_loss_clip": 0.0125015, "auxiliary_loss_mlp": 0.0105093, "balance_loss_clip": 0.98913932, "balance_loss_mlp": 1.03775132, "epoch": 0.10749714423134732, "flos": 22707609903360.0, "grad_norm": 2.302871240270919, "language_loss": 0.72189856, "learning_rate": 3.937414046146455e-06, "loss": 0.74490935, "num_input_tokens_seen": 18991130, "step": 894, "time_per_iteration": 2.683856964111328 }, { "auxiliary_loss_clip": 0.01248367, "auxiliary_loss_mlp": 0.01045445, "balance_loss_clip": 1.0696882, "balance_loss_mlp": 1.03114557, "epoch": 0.10761738712198642, "flos": 21106066199040.0, "grad_norm": 2.074475007140264, "language_loss": 0.75528991, "learning_rate": 3.9372205530014845e-06, "loss": 0.77822804, "num_input_tokens_seen": 19009610, "step": 895, "time_per_iteration": 2.684900999069214 }, { "auxiliary_loss_clip": 0.01247351, "auxiliary_loss_mlp": 0.01046976, "balance_loss_clip": 1.06509805, "balance_loss_mlp": 1.03346944, "epoch": 0.1077376300126255, "flos": 23766723348480.0, "grad_norm": 2.252076363588934, "language_loss": 0.71640122, "learning_rate": 3.937026765981696e-06, "loss": 0.73934448, "num_input_tokens_seen": 19029680, "step": 896, "time_per_iteration": 2.6805202960968018 }, { "auxiliary_loss_clip": 0.01244501, "auxiliary_loss_mlp": 0.01056659, "balance_loss_clip": 0.95565182, "balance_loss_mlp": 1.04228199, "epoch": 0.1078578729032646, "flos": 20919581763840.0, "grad_norm": 1.767337246012119, "language_loss": 0.79468977, "learning_rate": 3.936832685116488e-06, "loss": 0.81770134, "num_input_tokens_seen": 19047775, "step": 897, "time_per_iteration": 2.706228494644165 }, { "auxiliary_loss_clip": 0.01245654, "auxiliary_loss_mlp": 0.01041548, "balance_loss_clip": 1.06749821, "balance_loss_mlp": 1.02650356, "epoch": 0.10797811579390369, "flos": 14829886702080.0, "grad_norm": 2.4058051980413686, "language_loss": 0.90425599, "learning_rate": 3.936638310435301e-06, "loss": 0.92712802, "num_input_tokens_seen": 19065640, "step": 898, "time_per_iteration": 2.736837863922119 }, { "auxiliary_loss_clip": 0.01252411, "auxiliary_loss_mlp": 0.01050873, "balance_loss_clip": 1.03063619, "balance_loss_mlp": 1.03672314, "epoch": 0.10809835868454278, "flos": 19536985411200.0, "grad_norm": 2.227565454039306, "language_loss": 0.81531298, "learning_rate": 3.936443641967623e-06, "loss": 0.83834589, "num_input_tokens_seen": 19084470, "step": 899, "time_per_iteration": 2.6587626934051514 }, { "auxiliary_loss_clip": 0.0124824, "auxiliary_loss_mlp": 0.01051183, "balance_loss_clip": 0.99074817, "balance_loss_mlp": 1.03711605, "epoch": 0.10821860157518187, "flos": 18442320480000.0, "grad_norm": 2.1108848749879137, "language_loss": 0.82833123, "learning_rate": 3.936248679742983e-06, "loss": 0.85132545, "num_input_tokens_seen": 19102965, "step": 900, "time_per_iteration": 2.69197416305542 }, { "auxiliary_loss_clip": 0.01146966, "auxiliary_loss_mlp": 0.01027563, "balance_loss_clip": 0.92807651, "balance_loss_mlp": 1.01855087, "epoch": 0.10833884446582095, "flos": 49359468447360.0, "grad_norm": 1.05704244510929, "language_loss": 0.70184135, "learning_rate": 3.936053423790959e-06, "loss": 0.72358668, "num_input_tokens_seen": 19151285, "step": 901, "time_per_iteration": 3.0979115962982178 }, { "auxiliary_loss_clip": 0.01248623, "auxiliary_loss_mlp": 0.01049343, "balance_loss_clip": 1.06797862, "balance_loss_mlp": 1.03567028, "epoch": 0.10845908735646005, "flos": 20411912891520.0, "grad_norm": 1.7945043937549885, "language_loss": 0.77109873, "learning_rate": 3.935857874141168e-06, "loss": 0.79407841, "num_input_tokens_seen": 19170120, "step": 902, "time_per_iteration": 2.704047441482544 }, { "auxiliary_loss_clip": 0.01240585, "auxiliary_loss_mlp": 0.01047815, "balance_loss_clip": 0.99142361, "balance_loss_mlp": 1.03330684, "epoch": 0.10857933024709913, "flos": 14027750133120.0, "grad_norm": 2.0760090243398577, "language_loss": 0.83619976, "learning_rate": 3.935662030823279e-06, "loss": 0.85908377, "num_input_tokens_seen": 19186305, "step": 903, "time_per_iteration": 3.8053674697875977 }, { "auxiliary_loss_clip": 0.01248124, "auxiliary_loss_mlp": 0.01040829, "balance_loss_clip": 1.02810907, "balance_loss_mlp": 1.02605939, "epoch": 0.10869957313773823, "flos": 13369004657280.0, "grad_norm": 2.6699388176487027, "language_loss": 0.72378445, "learning_rate": 3.935465893866998e-06, "loss": 0.746674, "num_input_tokens_seen": 19204530, "step": 904, "time_per_iteration": 3.7215278148651123 }, { "auxiliary_loss_clip": 0.01242135, "auxiliary_loss_mlp": 0.01040388, "balance_loss_clip": 0.9908917, "balance_loss_mlp": 1.02627909, "epoch": 0.10881981602837733, "flos": 25807095509760.0, "grad_norm": 1.8007452388397582, "language_loss": 0.80109119, "learning_rate": 3.935269463302079e-06, "loss": 0.82391649, "num_input_tokens_seen": 19222735, "step": 905, "time_per_iteration": 3.754164695739746 }, { "auxiliary_loss_clip": 0.01253245, "auxiliary_loss_mlp": 0.01048694, "balance_loss_clip": 1.03085935, "balance_loss_mlp": 1.03409052, "epoch": 0.10894005891901641, "flos": 20777555387520.0, "grad_norm": 1.7627481356336043, "language_loss": 0.76709759, "learning_rate": 3.935072739158322e-06, "loss": 0.79011697, "num_input_tokens_seen": 19242445, "step": 906, "time_per_iteration": 3.7267701625823975 }, { "auxiliary_loss_clip": 0.01245361, "auxiliary_loss_mlp": 0.01048793, "balance_loss_clip": 0.99158794, "balance_loss_mlp": 1.03391528, "epoch": 0.10906030180965551, "flos": 26649883296000.0, "grad_norm": 1.577981168155191, "language_loss": 0.79899681, "learning_rate": 3.934875721465569e-06, "loss": 0.82193834, "num_input_tokens_seen": 19262865, "step": 907, "time_per_iteration": 2.757697343826294 }, { "auxiliary_loss_clip": 0.0124426, "auxiliary_loss_mlp": 0.01042474, "balance_loss_clip": 0.98906207, "balance_loss_mlp": 1.02753663, "epoch": 0.10918054470029459, "flos": 36534402420480.0, "grad_norm": 3.2035282256218163, "language_loss": 0.71758127, "learning_rate": 3.9346784102537076e-06, "loss": 0.74044859, "num_input_tokens_seen": 19285000, "step": 908, "time_per_iteration": 2.8257834911346436 }, { "auxiliary_loss_clip": 0.01246484, "auxiliary_loss_mlp": 0.01047398, "balance_loss_clip": 1.06678128, "balance_loss_mlp": 1.03318787, "epoch": 0.10930078759093369, "flos": 21762549118080.0, "grad_norm": 1.8962285418291118, "language_loss": 0.77944732, "learning_rate": 3.934480805552669e-06, "loss": 0.80238616, "num_input_tokens_seen": 19306010, "step": 909, "time_per_iteration": 2.7548158168792725 }, { "auxiliary_loss_clip": 0.01247784, "auxiliary_loss_mlp": 0.0113003, "balance_loss_clip": 1.06844544, "balance_loss_mlp": 0.0, "epoch": 0.10942103048157277, "flos": 22601781457920.0, "grad_norm": 2.2380353491370943, "language_loss": 0.88073039, "learning_rate": 3.93428290739243e-06, "loss": 0.90450847, "num_input_tokens_seen": 19325380, "step": 910, "time_per_iteration": 2.664882183074951 }, { "auxiliary_loss_clip": 0.01245976, "auxiliary_loss_mlp": 0.0104575, "balance_loss_clip": 0.99067402, "balance_loss_mlp": 1.03210068, "epoch": 0.10954127337221187, "flos": 15045781397760.0, "grad_norm": 2.4192516456156734, "language_loss": 0.80025995, "learning_rate": 3.9340847158030125e-06, "loss": 0.82317722, "num_input_tokens_seen": 19338960, "step": 911, "time_per_iteration": 2.6762990951538086 }, { "auxiliary_loss_clip": 0.01253427, "auxiliary_loss_mlp": 0.01041839, "balance_loss_clip": 1.03075361, "balance_loss_mlp": 1.0276413, "epoch": 0.10966151626285096, "flos": 21650974496640.0, "grad_norm": 2.3183304877775766, "language_loss": 0.75335842, "learning_rate": 3.9338862308144814e-06, "loss": 0.77631116, "num_input_tokens_seen": 19357780, "step": 912, "time_per_iteration": 2.675182819366455 }, { "auxiliary_loss_clip": 0.01250247, "auxiliary_loss_mlp": 0.01039529, "balance_loss_clip": 1.07132149, "balance_loss_mlp": 1.02519965, "epoch": 0.10978175915349005, "flos": 20121359777280.0, "grad_norm": 1.60373254058056, "language_loss": 0.84564114, "learning_rate": 3.933687452456946e-06, "loss": 0.86853886, "num_input_tokens_seen": 19377680, "step": 913, "time_per_iteration": 2.65260910987854 }, { "auxiliary_loss_clip": 0.01242162, "auxiliary_loss_mlp": 0.01044228, "balance_loss_clip": 0.94820046, "balance_loss_mlp": 1.02862394, "epoch": 0.10990200204412914, "flos": 20412667077120.0, "grad_norm": 2.329103420404068, "language_loss": 0.86265731, "learning_rate": 3.933488380760562e-06, "loss": 0.88552123, "num_input_tokens_seen": 19397040, "step": 914, "time_per_iteration": 2.7189724445343018 }, { "auxiliary_loss_clip": 0.01250594, "auxiliary_loss_mlp": 0.01130212, "balance_loss_clip": 1.06900382, "balance_loss_mlp": 0.0, "epoch": 0.11002224493476823, "flos": 17530117660800.0, "grad_norm": 2.3493010361189133, "language_loss": 0.87316108, "learning_rate": 3.9332890157555286e-06, "loss": 0.8969692, "num_input_tokens_seen": 19413975, "step": 915, "time_per_iteration": 2.66332745552063 }, { "auxiliary_loss_clip": 0.01251172, "auxiliary_loss_mlp": 0.01049691, "balance_loss_clip": 0.9936353, "balance_loss_mlp": 1.03585112, "epoch": 0.11014248782540732, "flos": 12203093099520.0, "grad_norm": 1.8391963753352272, "language_loss": 0.76435769, "learning_rate": 3.933089357472088e-06, "loss": 0.78736627, "num_input_tokens_seen": 19432005, "step": 916, "time_per_iteration": 2.7626030445098877 }, { "auxiliary_loss_clip": 0.01253967, "auxiliary_loss_mlp": 0.01057633, "balance_loss_clip": 1.07292569, "balance_loss_mlp": 1.04380488, "epoch": 0.11026273071604642, "flos": 22382977760640.0, "grad_norm": 1.9204877627916774, "language_loss": 0.85791409, "learning_rate": 3.932889405940529e-06, "loss": 0.88103008, "num_input_tokens_seen": 19450100, "step": 917, "time_per_iteration": 2.6892714500427246 }, { "auxiliary_loss_clip": 0.01248754, "auxiliary_loss_mlp": 0.01045666, "balance_loss_clip": 0.99498451, "balance_loss_mlp": 1.03146863, "epoch": 0.1103829736066855, "flos": 19829046896640.0, "grad_norm": 2.288443770192401, "language_loss": 0.79878461, "learning_rate": 3.932689161191184e-06, "loss": 0.82172883, "num_input_tokens_seen": 19467805, "step": 918, "time_per_iteration": 2.7007129192352295 }, { "auxiliary_loss_clip": 0.01244205, "auxiliary_loss_mlp": 0.01046507, "balance_loss_clip": 1.0285573, "balance_loss_mlp": 1.03117657, "epoch": 0.1105032164973246, "flos": 22669616292480.0, "grad_norm": 2.6637636564174243, "language_loss": 0.87865698, "learning_rate": 3.93248862325443e-06, "loss": 0.90156412, "num_input_tokens_seen": 19486710, "step": 919, "time_per_iteration": 2.7927794456481934 }, { "auxiliary_loss_clip": 0.011546, "auxiliary_loss_mlp": 0.01008784, "balance_loss_clip": 1.00657403, "balance_loss_mlp": 0.99958116, "epoch": 0.11062345938796368, "flos": 66483507876480.0, "grad_norm": 0.9575458666651306, "language_loss": 0.64492768, "learning_rate": 3.932287792160688e-06, "loss": 0.66656154, "num_input_tokens_seen": 19545170, "step": 920, "time_per_iteration": 3.224428653717041 }, { "auxiliary_loss_clip": 0.01248345, "auxiliary_loss_mlp": 0.01041919, "balance_loss_clip": 1.02728796, "balance_loss_mlp": 1.02737498, "epoch": 0.11074370227860278, "flos": 21907771804800.0, "grad_norm": 2.766035128272467, "language_loss": 0.80528909, "learning_rate": 3.932086667940424e-06, "loss": 0.82819176, "num_input_tokens_seen": 19561875, "step": 921, "time_per_iteration": 2.645012378692627 }, { "auxiliary_loss_clip": 0.0124668, "auxiliary_loss_mlp": 0.01129725, "balance_loss_clip": 1.03222227, "balance_loss_mlp": 0.0, "epoch": 0.11086394516924186, "flos": 28658115763200.0, "grad_norm": 4.357399354975338, "language_loss": 0.81718123, "learning_rate": 3.93188525062415e-06, "loss": 0.84094536, "num_input_tokens_seen": 19582340, "step": 922, "time_per_iteration": 2.739753007888794 }, { "auxiliary_loss_clip": 0.01248194, "auxiliary_loss_mlp": 0.01038326, "balance_loss_clip": 1.03144598, "balance_loss_mlp": 1.02385378, "epoch": 0.11098418805988096, "flos": 24535247765760.0, "grad_norm": 2.639026384317663, "language_loss": 0.86152947, "learning_rate": 3.931683540242418e-06, "loss": 0.88439465, "num_input_tokens_seen": 19603405, "step": 923, "time_per_iteration": 2.7865912914276123 }, { "auxiliary_loss_clip": 0.01240358, "auxiliary_loss_mlp": 0.01047031, "balance_loss_clip": 1.02735758, "balance_loss_mlp": 1.03265405, "epoch": 0.11110443095052006, "flos": 22960384888320.0, "grad_norm": 2.5036435572136866, "language_loss": 0.90996075, "learning_rate": 3.9314815368258295e-06, "loss": 0.93283463, "num_input_tokens_seen": 19619885, "step": 924, "time_per_iteration": 2.7202258110046387 }, { "auxiliary_loss_clip": 0.01257217, "auxiliary_loss_mlp": 0.01043984, "balance_loss_clip": 1.03765464, "balance_loss_mlp": 1.02977419, "epoch": 0.11122467384115914, "flos": 18950025265920.0, "grad_norm": 2.86522439429699, "language_loss": 0.7889694, "learning_rate": 3.9312792404050275e-06, "loss": 0.81198138, "num_input_tokens_seen": 19637940, "step": 925, "time_per_iteration": 2.632133722305298 }, { "auxiliary_loss_clip": 0.01248864, "auxiliary_loss_mlp": 0.01041414, "balance_loss_clip": 1.07116389, "balance_loss_mlp": 1.02771688, "epoch": 0.11134491673179824, "flos": 25082957324160.0, "grad_norm": 1.9378852865396665, "language_loss": 0.77259821, "learning_rate": 3.9310766510107e-06, "loss": 0.79550099, "num_input_tokens_seen": 19657115, "step": 926, "time_per_iteration": 2.683227300643921 }, { "auxiliary_loss_clip": 0.01247164, "auxiliary_loss_mlp": 0.01052133, "balance_loss_clip": 0.95212662, "balance_loss_mlp": 1.03730321, "epoch": 0.11146515962243732, "flos": 24499121662080.0, "grad_norm": 1.8335074037753851, "language_loss": 0.92157614, "learning_rate": 3.9308737686735806e-06, "loss": 0.94456911, "num_input_tokens_seen": 19677075, "step": 927, "time_per_iteration": 2.738724946975708 }, { "auxiliary_loss_clip": 0.01250753, "auxiliary_loss_mlp": 0.01045961, "balance_loss_clip": 1.06998551, "balance_loss_mlp": 1.03163195, "epoch": 0.11158540251307641, "flos": 22343763087360.0, "grad_norm": 2.238294958058628, "language_loss": 0.827075, "learning_rate": 3.9306705934244455e-06, "loss": 0.85004216, "num_input_tokens_seen": 19697155, "step": 928, "time_per_iteration": 2.6504878997802734 }, { "auxiliary_loss_clip": 0.01225771, "auxiliary_loss_mlp": 0.01043126, "balance_loss_clip": 0.98731256, "balance_loss_mlp": 1.02870131, "epoch": 0.11170564540371551, "flos": 19902304684800.0, "grad_norm": 1.6986875426986112, "language_loss": 0.88102007, "learning_rate": 3.930467125294116e-06, "loss": 0.90370905, "num_input_tokens_seen": 19716705, "step": 929, "time_per_iteration": 2.7229909896850586 }, { "auxiliary_loss_clip": 0.0114079, "auxiliary_loss_mlp": 0.01015473, "balance_loss_clip": 0.84814823, "balance_loss_mlp": 1.00660419, "epoch": 0.1118258882943546, "flos": 64586239499520.0, "grad_norm": 0.9291605996327463, "language_loss": 0.60503703, "learning_rate": 3.930263364313458e-06, "loss": 0.62659967, "num_input_tokens_seen": 19767275, "step": 930, "time_per_iteration": 5.120313405990601 }, { "auxiliary_loss_clip": 0.01237677, "auxiliary_loss_mlp": 0.01041736, "balance_loss_clip": 0.94968426, "balance_loss_mlp": 1.02767491, "epoch": 0.11194613118499369, "flos": 17201965985280.0, "grad_norm": 1.9539797640185805, "language_loss": 0.82879347, "learning_rate": 3.930059310513384e-06, "loss": 0.85158759, "num_input_tokens_seen": 19786315, "step": 931, "time_per_iteration": 3.0023744106292725 }, { "auxiliary_loss_clip": 0.01225066, "auxiliary_loss_mlp": 0.01129922, "balance_loss_clip": 0.94663274, "balance_loss_mlp": 0.0, "epoch": 0.11206637407563277, "flos": 31863465728640.0, "grad_norm": 1.9608649697238376, "language_loss": 0.84028757, "learning_rate": 3.929854963924846e-06, "loss": 0.86383748, "num_input_tokens_seen": 19806580, "step": 932, "time_per_iteration": 3.7418384552001953 }, { "auxiliary_loss_clip": 0.01237672, "auxiliary_loss_mlp": 0.01043831, "balance_loss_clip": 0.94630694, "balance_loss_mlp": 1.03020525, "epoch": 0.11218661696627187, "flos": 21945621761280.0, "grad_norm": 1.8756015515114843, "language_loss": 0.77506286, "learning_rate": 3.929650324578845e-06, "loss": 0.79787791, "num_input_tokens_seen": 19826045, "step": 933, "time_per_iteration": 2.70637583732605 }, { "auxiliary_loss_clip": 0.0125007, "auxiliary_loss_mlp": 0.01043836, "balance_loss_clip": 0.99365222, "balance_loss_mlp": 1.02882802, "epoch": 0.11230685985691095, "flos": 25878198481920.0, "grad_norm": 3.5440928732593147, "language_loss": 0.82024819, "learning_rate": 3.929445392506423e-06, "loss": 0.84318721, "num_input_tokens_seen": 19843985, "step": 934, "time_per_iteration": 2.7320775985717773 }, { "auxiliary_loss_clip": 0.01244572, "auxiliary_loss_mlp": 0.01047396, "balance_loss_clip": 1.03211987, "balance_loss_mlp": 1.03332877, "epoch": 0.11242710274755005, "flos": 22231506107520.0, "grad_norm": 1.8907299375438693, "language_loss": 0.75800443, "learning_rate": 3.92924016773867e-06, "loss": 0.7809242, "num_input_tokens_seen": 19860480, "step": 935, "time_per_iteration": 2.6751673221588135 }, { "auxiliary_loss_clip": 0.01241317, "auxiliary_loss_mlp": 0.01129617, "balance_loss_clip": 0.98844409, "balance_loss_mlp": 0.0, "epoch": 0.11254734563818915, "flos": 17712184723200.0, "grad_norm": 2.777668073874363, "language_loss": 0.73216581, "learning_rate": 3.9290346503067175e-06, "loss": 0.75587517, "num_input_tokens_seen": 19877145, "step": 936, "time_per_iteration": 2.6854026317596436 }, { "auxiliary_loss_clip": 0.01250849, "auxiliary_loss_mlp": 0.01051302, "balance_loss_clip": 1.02937567, "balance_loss_mlp": 1.03718209, "epoch": 0.11266758852882823, "flos": 54930397334400.0, "grad_norm": 1.9444778930758746, "language_loss": 0.78541052, "learning_rate": 3.9288288402417415e-06, "loss": 0.80843204, "num_input_tokens_seen": 19903405, "step": 937, "time_per_iteration": 2.981329917907715 }, { "auxiliary_loss_clip": 0.0125089, "auxiliary_loss_mlp": 0.01057375, "balance_loss_clip": 1.03203702, "balance_loss_mlp": 1.04262841, "epoch": 0.11278783141946733, "flos": 18878132194560.0, "grad_norm": 2.3983907343223327, "language_loss": 0.70730364, "learning_rate": 3.928622737574964e-06, "loss": 0.73038638, "num_input_tokens_seen": 19918740, "step": 938, "time_per_iteration": 2.6387927532196045 }, { "auxiliary_loss_clip": 0.0124082, "auxiliary_loss_mlp": 0.01044695, "balance_loss_clip": 0.98717588, "balance_loss_mlp": 1.03052115, "epoch": 0.11290807431010641, "flos": 26469252777600.0, "grad_norm": 2.9366330969729497, "language_loss": 0.91139603, "learning_rate": 3.928416342337652e-06, "loss": 0.93425119, "num_input_tokens_seen": 19938475, "step": 939, "time_per_iteration": 2.769258737564087 }, { "auxiliary_loss_clip": 0.01241352, "auxiliary_loss_mlp": 0.01039042, "balance_loss_clip": 0.98899722, "balance_loss_mlp": 1.02596438, "epoch": 0.1130283172007455, "flos": 22710590732160.0, "grad_norm": 1.9782341043727143, "language_loss": 0.82616234, "learning_rate": 3.928209654561113e-06, "loss": 0.84896636, "num_input_tokens_seen": 19959310, "step": 940, "time_per_iteration": 2.815558433532715 }, { "auxiliary_loss_clip": 0.01237558, "auxiliary_loss_mlp": 0.01044233, "balance_loss_clip": 0.99010247, "balance_loss_mlp": 1.03020215, "epoch": 0.1131485600913846, "flos": 23219911630080.0, "grad_norm": 2.000239439836931, "language_loss": 0.81485438, "learning_rate": 3.928002674276703e-06, "loss": 0.83767229, "num_input_tokens_seen": 19978700, "step": 941, "time_per_iteration": 2.6846182346343994 }, { "auxiliary_loss_clip": 0.01212697, "auxiliary_loss_mlp": 0.01054279, "balance_loss_clip": 0.90457475, "balance_loss_mlp": 1.03999805, "epoch": 0.11326880298202369, "flos": 14064271286400.0, "grad_norm": 2.6213929827555003, "language_loss": 0.75214213, "learning_rate": 3.92779540151582e-06, "loss": 0.77481186, "num_input_tokens_seen": 19995785, "step": 942, "time_per_iteration": 2.7253170013427734 }, { "auxiliary_loss_clip": 0.01243196, "auxiliary_loss_mlp": 0.01038055, "balance_loss_clip": 0.98965234, "balance_loss_mlp": 1.02410758, "epoch": 0.11338904587266278, "flos": 16325386479360.0, "grad_norm": 2.1217392741248435, "language_loss": 0.85601634, "learning_rate": 3.927587836309907e-06, "loss": 0.87882882, "num_input_tokens_seen": 20013615, "step": 943, "time_per_iteration": 2.6773602962493896 }, { "auxiliary_loss_clip": 0.01234694, "auxiliary_loss_mlp": 0.01041743, "balance_loss_clip": 0.98578233, "balance_loss_mlp": 1.02740812, "epoch": 0.11350928876330187, "flos": 24426258923520.0, "grad_norm": 1.9077555843440834, "language_loss": 0.78328282, "learning_rate": 3.927379978690452e-06, "loss": 0.80604714, "num_input_tokens_seen": 20032880, "step": 944, "time_per_iteration": 2.7381234169006348 }, { "auxiliary_loss_clip": 0.01221434, "auxiliary_loss_mlp": 0.01043241, "balance_loss_clip": 0.94293523, "balance_loss_mlp": 1.02947187, "epoch": 0.11362953165394096, "flos": 24497074586880.0, "grad_norm": 2.1247780619628407, "language_loss": 0.87162375, "learning_rate": 3.927171828688987e-06, "loss": 0.89427054, "num_input_tokens_seen": 20052405, "step": 945, "time_per_iteration": 2.7737443447113037 }, { "auxiliary_loss_clip": 0.01246162, "auxiliary_loss_mlp": 0.01041628, "balance_loss_clip": 1.06739461, "balance_loss_mlp": 1.028157, "epoch": 0.11374977454458005, "flos": 24060831909120.0, "grad_norm": 2.3504871591058922, "language_loss": 0.82682574, "learning_rate": 3.926963386337088e-06, "loss": 0.84970361, "num_input_tokens_seen": 20070635, "step": 946, "time_per_iteration": 2.626084566116333 }, { "auxiliary_loss_clip": 0.01250487, "auxiliary_loss_mlp": 0.01043718, "balance_loss_clip": 1.06787455, "balance_loss_mlp": 1.02868605, "epoch": 0.11387001743521914, "flos": 39457638967680.0, "grad_norm": 3.2904301032425094, "language_loss": 0.70149368, "learning_rate": 3.926754651666375e-06, "loss": 0.72443569, "num_input_tokens_seen": 20091195, "step": 947, "time_per_iteration": 2.756784439086914 }, { "auxiliary_loss_clip": 0.01242509, "auxiliary_loss_mlp": 0.01048696, "balance_loss_clip": 0.95185184, "balance_loss_mlp": 1.03449857, "epoch": 0.11399026032585824, "flos": 25082454533760.0, "grad_norm": 2.8491159083453885, "language_loss": 0.77997899, "learning_rate": 3.926545624708513e-06, "loss": 0.80289102, "num_input_tokens_seen": 20110435, "step": 948, "time_per_iteration": 2.810208320617676 }, { "auxiliary_loss_clip": 0.01233475, "auxiliary_loss_mlp": 0.01043482, "balance_loss_clip": 0.9481464, "balance_loss_mlp": 1.03013015, "epoch": 0.11411050321649732, "flos": 17961835224960.0, "grad_norm": 2.2118548806690796, "language_loss": 0.85432279, "learning_rate": 3.926336305495213e-06, "loss": 0.87709236, "num_input_tokens_seen": 20128995, "step": 949, "time_per_iteration": 2.738759994506836 }, { "auxiliary_loss_clip": 0.01226704, "auxiliary_loss_mlp": 0.01056313, "balance_loss_clip": 0.94854486, "balance_loss_mlp": 1.04256177, "epoch": 0.11423074610713642, "flos": 22455409536000.0, "grad_norm": 2.160805242451491, "language_loss": 0.88721597, "learning_rate": 3.926126694058226e-06, "loss": 0.91004616, "num_input_tokens_seen": 20148145, "step": 950, "time_per_iteration": 2.7151246070861816 }, { "auxiliary_loss_clip": 0.01240344, "auxiliary_loss_mlp": 0.01055373, "balance_loss_clip": 0.91628516, "balance_loss_mlp": 1.04162836, "epoch": 0.1143509889977755, "flos": 19717687756800.0, "grad_norm": 1.6854023017980129, "language_loss": 0.82009244, "learning_rate": 3.92591679042935e-06, "loss": 0.84304953, "num_input_tokens_seen": 20168035, "step": 951, "time_per_iteration": 2.7533152103424072 }, { "auxiliary_loss_clip": 0.01247431, "auxiliary_loss_mlp": 0.01045125, "balance_loss_clip": 1.03224087, "balance_loss_mlp": 1.03051019, "epoch": 0.1144712318884146, "flos": 19822869757440.0, "grad_norm": 1.6851621217444748, "language_loss": 0.82304496, "learning_rate": 3.92570659464043e-06, "loss": 0.84597051, "num_input_tokens_seen": 20186095, "step": 952, "time_per_iteration": 2.715031385421753 }, { "auxiliary_loss_clip": 0.01240771, "auxiliary_loss_mlp": 0.01129836, "balance_loss_clip": 1.02993608, "balance_loss_mlp": 0.0, "epoch": 0.1145914747790537, "flos": 14939198766720.0, "grad_norm": 2.374782846492086, "language_loss": 0.79490888, "learning_rate": 3.925496106723349e-06, "loss": 0.81861496, "num_input_tokens_seen": 20203535, "step": 953, "time_per_iteration": 2.6155781745910645 }, { "auxiliary_loss_clip": 0.01242902, "auxiliary_loss_mlp": 0.01049927, "balance_loss_clip": 1.02922046, "balance_loss_mlp": 1.03680205, "epoch": 0.11471171766969278, "flos": 19865029345920.0, "grad_norm": 1.8925914285384728, "language_loss": 0.84014332, "learning_rate": 3.9252853267100405e-06, "loss": 0.86307162, "num_input_tokens_seen": 20222780, "step": 954, "time_per_iteration": 2.718285322189331 }, { "auxiliary_loss_clip": 0.01229594, "auxiliary_loss_mlp": 0.01041719, "balance_loss_clip": 0.94848877, "balance_loss_mlp": 1.02790272, "epoch": 0.11483196056033187, "flos": 22526476594560.0, "grad_norm": 1.8519240401712342, "language_loss": 0.839113, "learning_rate": 3.9250742546324786e-06, "loss": 0.86182618, "num_input_tokens_seen": 20243015, "step": 955, "time_per_iteration": 3.6652302742004395 }, { "auxiliary_loss_clip": 0.01237879, "auxiliary_loss_mlp": 0.01044963, "balance_loss_clip": 0.98529232, "balance_loss_mlp": 1.03154016, "epoch": 0.11495220345097096, "flos": 28220292887040.0, "grad_norm": 1.691738749827971, "language_loss": 0.86573702, "learning_rate": 3.924862890522683e-06, "loss": 0.88856548, "num_input_tokens_seen": 20263025, "step": 956, "time_per_iteration": 3.788710355758667 }, { "auxiliary_loss_clip": 0.01244225, "auxiliary_loss_mlp": 0.01037991, "balance_loss_clip": 1.02582383, "balance_loss_mlp": 1.02430558, "epoch": 0.11507244634161005, "flos": 17492267704320.0, "grad_norm": 2.44266695959853, "language_loss": 0.86191416, "learning_rate": 3.9246512344127174e-06, "loss": 0.8847363, "num_input_tokens_seen": 20280685, "step": 957, "time_per_iteration": 4.541559934616089 }, { "auxiliary_loss_clip": 0.01211501, "auxiliary_loss_mlp": 0.01057276, "balance_loss_clip": 0.866413, "balance_loss_mlp": 1.04401994, "epoch": 0.11519268923224914, "flos": 22564937082240.0, "grad_norm": 2.271263374993002, "language_loss": 0.81655383, "learning_rate": 3.9244392863346895e-06, "loss": 0.83924156, "num_input_tokens_seen": 20300090, "step": 958, "time_per_iteration": 2.805371046066284 }, { "auxiliary_loss_clip": 0.01242831, "auxiliary_loss_mlp": 0.01049771, "balance_loss_clip": 0.99196184, "balance_loss_mlp": 1.03515601, "epoch": 0.11531293212288823, "flos": 16982839065600.0, "grad_norm": 2.7204862716706963, "language_loss": 0.92216384, "learning_rate": 3.9242270463207524e-06, "loss": 0.94508982, "num_input_tokens_seen": 20318480, "step": 959, "time_per_iteration": 2.6584877967834473 }, { "auxiliary_loss_clip": 0.01224876, "auxiliary_loss_mlp": 0.0104061, "balance_loss_clip": 0.90915847, "balance_loss_mlp": 1.02660298, "epoch": 0.11543317501352733, "flos": 12422004537600.0, "grad_norm": 2.570112077755836, "language_loss": 0.85026211, "learning_rate": 3.924014514403102e-06, "loss": 0.87291694, "num_input_tokens_seen": 20334635, "step": 960, "time_per_iteration": 2.773401975631714 }, { "auxiliary_loss_clip": 0.01227614, "auxiliary_loss_mlp": 0.01051026, "balance_loss_clip": 0.91007406, "balance_loss_mlp": 1.03654242, "epoch": 0.11555341790416641, "flos": 19821648695040.0, "grad_norm": 2.088544843392585, "language_loss": 0.90683651, "learning_rate": 3.92380169061398e-06, "loss": 0.92962289, "num_input_tokens_seen": 20352415, "step": 961, "time_per_iteration": 2.8227875232696533 }, { "auxiliary_loss_clip": 0.01229709, "auxiliary_loss_mlp": 0.0112944, "balance_loss_clip": 0.94608235, "balance_loss_mlp": 0.0, "epoch": 0.11567366079480551, "flos": 25738865625600.0, "grad_norm": 1.9332878625074885, "language_loss": 0.83615005, "learning_rate": 3.9235885749856705e-06, "loss": 0.85974157, "num_input_tokens_seen": 20371095, "step": 962, "time_per_iteration": 2.8287193775177 }, { "auxiliary_loss_clip": 0.01238218, "auxiliary_loss_mlp": 0.01044802, "balance_loss_clip": 0.99029839, "balance_loss_mlp": 1.0307591, "epoch": 0.1157939036854446, "flos": 18223301301120.0, "grad_norm": 2.0377148997506134, "language_loss": 0.8244521, "learning_rate": 3.9233751675505035e-06, "loss": 0.84728229, "num_input_tokens_seen": 20389805, "step": 963, "time_per_iteration": 2.724259853363037 }, { "auxiliary_loss_clip": 0.01237824, "auxiliary_loss_mlp": 0.01043371, "balance_loss_clip": 0.99247575, "balance_loss_mlp": 1.02882791, "epoch": 0.11591414657608369, "flos": 23073755189760.0, "grad_norm": 2.029578220638217, "language_loss": 0.84961998, "learning_rate": 3.923161468340853e-06, "loss": 0.87243187, "num_input_tokens_seen": 20409640, "step": 964, "time_per_iteration": 2.7520484924316406 }, { "auxiliary_loss_clip": 0.01219238, "auxiliary_loss_mlp": 0.01049408, "balance_loss_clip": 0.90538323, "balance_loss_mlp": 1.03543615, "epoch": 0.11603438946672277, "flos": 19461716461440.0, "grad_norm": 1.7708553902404338, "language_loss": 0.81496716, "learning_rate": 3.9229474773891374e-06, "loss": 0.83765358, "num_input_tokens_seen": 20428180, "step": 965, "time_per_iteration": 2.788696765899658 }, { "auxiliary_loss_clip": 0.0124459, "auxiliary_loss_mlp": 0.01037581, "balance_loss_clip": 0.94704461, "balance_loss_mlp": 1.0236392, "epoch": 0.11615463235736187, "flos": 26831986272000.0, "grad_norm": 1.8530300903047137, "language_loss": 0.836721, "learning_rate": 3.922733194727818e-06, "loss": 0.85954267, "num_input_tokens_seen": 20447975, "step": 966, "time_per_iteration": 2.802902936935425 }, { "auxiliary_loss_clip": 0.01251266, "auxiliary_loss_mlp": 0.01047276, "balance_loss_clip": 1.03188622, "balance_loss_mlp": 1.03344798, "epoch": 0.11627487524800097, "flos": 18580324533120.0, "grad_norm": 1.902106147249991, "language_loss": 0.87738836, "learning_rate": 3.922518620389402e-06, "loss": 0.90037382, "num_input_tokens_seen": 20464840, "step": 967, "time_per_iteration": 2.663503408432007 }, { "auxiliary_loss_clip": 0.01202308, "auxiliary_loss_mlp": 0.01044771, "balance_loss_clip": 0.83003187, "balance_loss_mlp": 1.03064454, "epoch": 0.11639511813864005, "flos": 18150474476160.0, "grad_norm": 3.4166814125052816, "language_loss": 0.89414144, "learning_rate": 3.922303754406439e-06, "loss": 0.91661227, "num_input_tokens_seen": 20482680, "step": 968, "time_per_iteration": 2.7544078826904297 }, { "auxiliary_loss_clip": 0.01215374, "auxiliary_loss_mlp": 0.01044524, "balance_loss_clip": 0.944148, "balance_loss_mlp": 1.03064752, "epoch": 0.11651536102927915, "flos": 20922023888640.0, "grad_norm": 1.8725230553798085, "language_loss": 0.79077452, "learning_rate": 3.922088596811526e-06, "loss": 0.81337351, "num_input_tokens_seen": 20501810, "step": 969, "time_per_iteration": 2.736635446548462 }, { "auxiliary_loss_clip": 0.01233518, "auxiliary_loss_mlp": 0.01040427, "balance_loss_clip": 1.02865934, "balance_loss_mlp": 1.02630031, "epoch": 0.11663560391991823, "flos": 16508602776960.0, "grad_norm": 2.247879383790113, "language_loss": 0.86611164, "learning_rate": 3.9218731476373e-06, "loss": 0.88885105, "num_input_tokens_seen": 20517995, "step": 970, "time_per_iteration": 2.608745574951172 }, { "auxiliary_loss_clip": 0.01245501, "auxiliary_loss_mlp": 0.01052002, "balance_loss_clip": 1.0302074, "balance_loss_mlp": 1.03721964, "epoch": 0.11675584681055733, "flos": 19865029345920.0, "grad_norm": 1.9339143128424945, "language_loss": 0.84849036, "learning_rate": 3.9216574069164455e-06, "loss": 0.87146533, "num_input_tokens_seen": 20536970, "step": 971, "time_per_iteration": 2.65287184715271 }, { "auxiliary_loss_clip": 0.0124304, "auxiliary_loss_mlp": 0.0103997, "balance_loss_clip": 1.0674715, "balance_loss_mlp": 1.02646363, "epoch": 0.11687608970119642, "flos": 21944364785280.0, "grad_norm": 1.5466473630200472, "language_loss": 0.80166042, "learning_rate": 3.921441374681691e-06, "loss": 0.82449055, "num_input_tokens_seen": 20557030, "step": 972, "time_per_iteration": 2.637359619140625 }, { "auxiliary_loss_clip": 0.01234561, "auxiliary_loss_mlp": 0.0104526, "balance_loss_clip": 0.98764241, "balance_loss_mlp": 1.0320636, "epoch": 0.1169963325918355, "flos": 24061155131520.0, "grad_norm": 1.7887107271731475, "language_loss": 0.64987791, "learning_rate": 3.921225050965808e-06, "loss": 0.67267615, "num_input_tokens_seen": 20576915, "step": 973, "time_per_iteration": 2.7027862071990967 }, { "auxiliary_loss_clip": 0.01240905, "auxiliary_loss_mlp": 0.01044642, "balance_loss_clip": 0.95302457, "balance_loss_mlp": 1.03142726, "epoch": 0.1171165754824746, "flos": 23368151059200.0, "grad_norm": 2.1921487575020486, "language_loss": 0.75163805, "learning_rate": 3.921008435801612e-06, "loss": 0.77449358, "num_input_tokens_seen": 20596000, "step": 974, "time_per_iteration": 2.7194364070892334 }, { "auxiliary_loss_clip": 0.0123119, "auxiliary_loss_mlp": 0.01042871, "balance_loss_clip": 1.02787733, "balance_loss_mlp": 1.02919722, "epoch": 0.11723681837311369, "flos": 18552243075840.0, "grad_norm": 3.216062937479027, "language_loss": 0.75539923, "learning_rate": 3.920791529221963e-06, "loss": 0.77813983, "num_input_tokens_seen": 20614675, "step": 975, "time_per_iteration": 2.6604483127593994 }, { "auxiliary_loss_clip": 0.01241755, "auxiliary_loss_mlp": 0.01129496, "balance_loss_clip": 0.98907208, "balance_loss_mlp": 0.0, "epoch": 0.11735706126375278, "flos": 23550541344000.0, "grad_norm": 1.7601170045093366, "language_loss": 0.7633338, "learning_rate": 3.920574331259768e-06, "loss": 0.78704631, "num_input_tokens_seen": 20635875, "step": 976, "time_per_iteration": 2.7480719089508057 }, { "auxiliary_loss_clip": 0.01230012, "auxiliary_loss_mlp": 0.01046779, "balance_loss_clip": 0.99063766, "balance_loss_mlp": 1.03409481, "epoch": 0.11747730415439187, "flos": 22381541216640.0, "grad_norm": 2.084250258788493, "language_loss": 0.7961337, "learning_rate": 3.9203568419479716e-06, "loss": 0.81890154, "num_input_tokens_seen": 20656430, "step": 977, "time_per_iteration": 2.777332067489624 }, { "auxiliary_loss_clip": 0.01236719, "auxiliary_loss_mlp": 0.01041878, "balance_loss_clip": 0.98723638, "balance_loss_mlp": 1.0279429, "epoch": 0.11759754704503096, "flos": 22200731130240.0, "grad_norm": 1.8873674281680453, "language_loss": 0.75140733, "learning_rate": 3.92013906131957e-06, "loss": 0.77419335, "num_input_tokens_seen": 20675360, "step": 978, "time_per_iteration": 2.786381483078003 }, { "auxiliary_loss_clip": 0.01235064, "auxiliary_loss_mlp": 0.0104912, "balance_loss_clip": 0.94953668, "balance_loss_mlp": 1.03509498, "epoch": 0.11771778993567006, "flos": 22309755886080.0, "grad_norm": 1.4958910424832517, "language_loss": 0.82604885, "learning_rate": 3.9199209894076e-06, "loss": 0.84889072, "num_input_tokens_seen": 20695675, "step": 979, "time_per_iteration": 2.724550485610962 }, { "auxiliary_loss_clip": 0.01243306, "auxiliary_loss_mlp": 0.01043865, "balance_loss_clip": 1.06416512, "balance_loss_mlp": 1.02994776, "epoch": 0.11783803282630914, "flos": 21288169175040.0, "grad_norm": 2.5690663601867496, "language_loss": 0.8978014, "learning_rate": 3.919702626245142e-06, "loss": 0.92067313, "num_input_tokens_seen": 20715330, "step": 980, "time_per_iteration": 2.6463065147399902 }, { "auxiliary_loss_clip": 0.01229095, "auxiliary_loss_mlp": 0.01043754, "balance_loss_clip": 0.9886511, "balance_loss_mlp": 1.02993727, "epoch": 0.11795827571694824, "flos": 25371535190400.0, "grad_norm": 2.025630547716018, "language_loss": 0.66223001, "learning_rate": 3.919483971865322e-06, "loss": 0.68495846, "num_input_tokens_seen": 20735325, "step": 981, "time_per_iteration": 2.8132784366607666 }, { "auxiliary_loss_clip": 0.01235531, "auxiliary_loss_mlp": 0.01039951, "balance_loss_clip": 0.98888469, "balance_loss_mlp": 1.02602744, "epoch": 0.11807851860758732, "flos": 23622218933760.0, "grad_norm": 2.253686784848359, "language_loss": 0.87913752, "learning_rate": 3.91926502630131e-06, "loss": 0.9018923, "num_input_tokens_seen": 20755940, "step": 982, "time_per_iteration": 4.6052470207214355 }, { "auxiliary_loss_clip": 0.01246403, "auxiliary_loss_mlp": 0.01041865, "balance_loss_clip": 1.03389049, "balance_loss_mlp": 1.02885938, "epoch": 0.11819876149822642, "flos": 24972496024320.0, "grad_norm": 1.915408123824362, "language_loss": 0.72009313, "learning_rate": 3.91904578958632e-06, "loss": 0.74297583, "num_input_tokens_seen": 20775355, "step": 983, "time_per_iteration": 2.676649332046509 }, { "auxiliary_loss_clip": 0.01245741, "auxiliary_loss_mlp": 0.01040978, "balance_loss_clip": 1.067541, "balance_loss_mlp": 1.02775753, "epoch": 0.11831900438886551, "flos": 23003226835200.0, "grad_norm": 2.079789048810317, "language_loss": 0.84096104, "learning_rate": 3.918826261753608e-06, "loss": 0.86382824, "num_input_tokens_seen": 20794935, "step": 984, "time_per_iteration": 3.746967315673828 }, { "auxiliary_loss_clip": 0.01240819, "auxiliary_loss_mlp": 0.01045134, "balance_loss_clip": 0.99225193, "balance_loss_mlp": 1.03184175, "epoch": 0.1184392472795046, "flos": 27965147604480.0, "grad_norm": 3.071676116865211, "language_loss": 0.70702952, "learning_rate": 3.918606442836478e-06, "loss": 0.72988904, "num_input_tokens_seen": 20817155, "step": 985, "time_per_iteration": 2.763836145401001 }, { "auxiliary_loss_clip": 0.01241649, "auxiliary_loss_mlp": 0.01042686, "balance_loss_clip": 1.03216302, "balance_loss_mlp": 1.0293467, "epoch": 0.1185594901701437, "flos": 19898497843200.0, "grad_norm": 1.8193195213981121, "language_loss": 0.77525008, "learning_rate": 3.918386332868277e-06, "loss": 0.79809344, "num_input_tokens_seen": 20835125, "step": 986, "time_per_iteration": 2.64924693107605 }, { "auxiliary_loss_clip": 0.01231208, "auxiliary_loss_mlp": 0.01045364, "balance_loss_clip": 1.0281527, "balance_loss_mlp": 1.03197694, "epoch": 0.11867973306078278, "flos": 18912354877440.0, "grad_norm": 1.7438408205493736, "language_loss": 0.94639611, "learning_rate": 3.918165931882394e-06, "loss": 0.96916187, "num_input_tokens_seen": 20853525, "step": 987, "time_per_iteration": 2.722853422164917 }, { "auxiliary_loss_clip": 0.01220766, "auxiliary_loss_mlp": 0.01041498, "balance_loss_clip": 0.87003589, "balance_loss_mlp": 1.02799153, "epoch": 0.11879997595142187, "flos": 16982803152000.0, "grad_norm": 2.904188305733763, "language_loss": 0.75211048, "learning_rate": 3.917945239912264e-06, "loss": 0.77473307, "num_input_tokens_seen": 20871000, "step": 988, "time_per_iteration": 2.7853171825408936 }, { "auxiliary_loss_clip": 0.01236812, "auxiliary_loss_mlp": 0.01044097, "balance_loss_clip": 0.91391224, "balance_loss_mlp": 1.03154421, "epoch": 0.11892021884206096, "flos": 17530369056000.0, "grad_norm": 4.02191044160906, "language_loss": 0.75877774, "learning_rate": 3.917724256991367e-06, "loss": 0.78158677, "num_input_tokens_seen": 20889745, "step": 989, "time_per_iteration": 2.8100104331970215 }, { "auxiliary_loss_clip": 0.01227043, "auxiliary_loss_mlp": 0.01037458, "balance_loss_clip": 0.98934877, "balance_loss_mlp": 1.0246129, "epoch": 0.11904046173270005, "flos": 30955895763840.0, "grad_norm": 2.334814048202528, "language_loss": 0.81683636, "learning_rate": 3.9175029831532245e-06, "loss": 0.83948135, "num_input_tokens_seen": 20909260, "step": 990, "time_per_iteration": 2.8697144985198975 }, { "auxiliary_loss_clip": 0.01239448, "auxiliary_loss_mlp": 0.0105478, "balance_loss_clip": 0.95510817, "balance_loss_mlp": 1.04115438, "epoch": 0.11916070462333915, "flos": 20157234485760.0, "grad_norm": 2.0192174150327817, "language_loss": 0.88572019, "learning_rate": 3.917281418431404e-06, "loss": 0.90866244, "num_input_tokens_seen": 20928305, "step": 991, "time_per_iteration": 2.863839864730835 }, { "auxiliary_loss_clip": 0.01241113, "auxiliary_loss_mlp": 0.01042619, "balance_loss_clip": 0.99280906, "balance_loss_mlp": 1.02977967, "epoch": 0.11928094751397823, "flos": 23551115961600.0, "grad_norm": 2.5419708850930025, "language_loss": 0.77032578, "learning_rate": 3.917059562859516e-06, "loss": 0.79316306, "num_input_tokens_seen": 20947630, "step": 992, "time_per_iteration": 2.718507766723633 }, { "auxiliary_loss_clip": 0.01227482, "auxiliary_loss_mlp": 0.01043664, "balance_loss_clip": 0.98964536, "balance_loss_mlp": 1.03108132, "epoch": 0.11940119040461733, "flos": 23908426502400.0, "grad_norm": 2.007562757860166, "language_loss": 0.88645804, "learning_rate": 3.916837416471218e-06, "loss": 0.90916944, "num_input_tokens_seen": 20964250, "step": 993, "time_per_iteration": 2.7485082149505615 }, { "auxiliary_loss_clip": 0.01234625, "auxiliary_loss_mlp": 0.01041794, "balance_loss_clip": 1.02630401, "balance_loss_mlp": 1.02834678, "epoch": 0.11952143329525641, "flos": 13844533835520.0, "grad_norm": 2.3312135068126767, "language_loss": 0.7255367, "learning_rate": 3.916614979300207e-06, "loss": 0.74830091, "num_input_tokens_seen": 20979095, "step": 994, "time_per_iteration": 2.63142991065979 }, { "auxiliary_loss_clip": 0.01225391, "auxiliary_loss_mlp": 0.01038711, "balance_loss_clip": 0.91158473, "balance_loss_mlp": 1.02588379, "epoch": 0.11964167618589551, "flos": 27015525792000.0, "grad_norm": 1.5928490332850995, "language_loss": 0.78455806, "learning_rate": 3.9163922513802274e-06, "loss": 0.807199, "num_input_tokens_seen": 21001430, "step": 995, "time_per_iteration": 2.8058924674987793 }, { "auxiliary_loss_clip": 0.01244592, "auxiliary_loss_mlp": 0.01044661, "balance_loss_clip": 1.06516409, "balance_loss_mlp": 1.03130937, "epoch": 0.1197619190765346, "flos": 12567622273920.0, "grad_norm": 2.9384378069757866, "language_loss": 0.83059579, "learning_rate": 3.916169232745067e-06, "loss": 0.85348827, "num_input_tokens_seen": 21019105, "step": 996, "time_per_iteration": 2.7506258487701416 }, { "auxiliary_loss_clip": 0.01225817, "auxiliary_loss_mlp": 0.01050558, "balance_loss_clip": 0.98607779, "balance_loss_mlp": 1.03689635, "epoch": 0.11988216196717369, "flos": 16909437623040.0, "grad_norm": 4.371925969018296, "language_loss": 0.91772717, "learning_rate": 3.915945923428559e-06, "loss": 0.94049096, "num_input_tokens_seen": 21035630, "step": 997, "time_per_iteration": 2.814249038696289 }, { "auxiliary_loss_clip": 0.01237219, "auxiliary_loss_mlp": 0.01044539, "balance_loss_clip": 1.02640724, "balance_loss_mlp": 1.03085935, "epoch": 0.12000240485781279, "flos": 16216577205120.0, "grad_norm": 1.9769544734927338, "language_loss": 0.82852656, "learning_rate": 3.915722323464577e-06, "loss": 0.85134411, "num_input_tokens_seen": 21054235, "step": 998, "time_per_iteration": 2.686048984527588 }, { "auxiliary_loss_clip": 0.01241092, "auxiliary_loss_mlp": 0.01044018, "balance_loss_clip": 1.02865791, "balance_loss_mlp": 1.03075027, "epoch": 0.12012264774845187, "flos": 49344887525760.0, "grad_norm": 2.3123538187543673, "language_loss": 0.70102823, "learning_rate": 3.91549843288704e-06, "loss": 0.72387934, "num_input_tokens_seen": 21077915, "step": 999, "time_per_iteration": 2.9076857566833496 }, { "auxiliary_loss_clip": 0.01234588, "auxiliary_loss_mlp": 0.01129067, "balance_loss_clip": 0.94952846, "balance_loss_mlp": 0.0, "epoch": 0.12024289063909097, "flos": 26979435601920.0, "grad_norm": 1.9455597893065562, "language_loss": 0.78974825, "learning_rate": 3.915274251729916e-06, "loss": 0.81338483, "num_input_tokens_seen": 21099205, "step": 1000, "time_per_iteration": 2.7463879585266113 }, { "auxiliary_loss_clip": 0.01239917, "auxiliary_loss_mlp": 0.01048232, "balance_loss_clip": 0.95276392, "balance_loss_mlp": 1.0353992, "epoch": 0.12036313352973005, "flos": 19537308633600.0, "grad_norm": 2.1036485359164625, "language_loss": 0.90168232, "learning_rate": 3.91504978002721e-06, "loss": 0.92456383, "num_input_tokens_seen": 21118260, "step": 1001, "time_per_iteration": 2.717036485671997 }, { "auxiliary_loss_clip": 0.01241004, "auxiliary_loss_mlp": 0.01129089, "balance_loss_clip": 0.98797518, "balance_loss_mlp": 0.0, "epoch": 0.12048337642036915, "flos": 17268256535040.0, "grad_norm": 2.1702323056336916, "language_loss": 0.76002872, "learning_rate": 3.914825017812974e-06, "loss": 0.78372967, "num_input_tokens_seen": 21134910, "step": 1002, "time_per_iteration": 2.6608242988586426 }, { "auxiliary_loss_clip": 0.01239644, "auxiliary_loss_mlp": 0.01040855, "balance_loss_clip": 0.99227762, "balance_loss_mlp": 1.02748013, "epoch": 0.12060361931100824, "flos": 22856962654080.0, "grad_norm": 2.1410884628788533, "language_loss": 0.72734976, "learning_rate": 3.9145999651213065e-06, "loss": 0.75015479, "num_input_tokens_seen": 21154150, "step": 1003, "time_per_iteration": 2.6695621013641357 }, { "auxiliary_loss_clip": 0.01242741, "auxiliary_loss_mlp": 0.01046852, "balance_loss_clip": 1.02973866, "balance_loss_mlp": 1.03339326, "epoch": 0.12072386220164733, "flos": 16726795943040.0, "grad_norm": 3.242848530281079, "language_loss": 0.88468969, "learning_rate": 3.9143746219863465e-06, "loss": 0.90758562, "num_input_tokens_seen": 21171255, "step": 1004, "time_per_iteration": 2.613420248031616 }, { "auxiliary_loss_clip": 0.01162985, "auxiliary_loss_mlp": 0.01008095, "balance_loss_clip": 0.97622454, "balance_loss_mlp": 1.00041795, "epoch": 0.12084410509228642, "flos": 55144176105600.0, "grad_norm": 0.9842836020406001, "language_loss": 0.64765674, "learning_rate": 3.914148988442278e-06, "loss": 0.66936761, "num_input_tokens_seen": 21227045, "step": 1005, "time_per_iteration": 3.208634614944458 }, { "auxiliary_loss_clip": 0.01226163, "auxiliary_loss_mlp": 0.01039663, "balance_loss_clip": 0.98901898, "balance_loss_mlp": 1.02618647, "epoch": 0.1209643479829255, "flos": 26760236855040.0, "grad_norm": 3.4432989437903063, "language_loss": 0.95217228, "learning_rate": 3.91392306452333e-06, "loss": 0.97483051, "num_input_tokens_seen": 21244120, "step": 1006, "time_per_iteration": 2.7247962951660156 }, { "auxiliary_loss_clip": 0.01245029, "auxiliary_loss_mlp": 0.01039835, "balance_loss_clip": 1.06721497, "balance_loss_mlp": 1.02608991, "epoch": 0.1210845908735646, "flos": 11035026725760.0, "grad_norm": 3.076983470956357, "language_loss": 0.66192901, "learning_rate": 3.913696850263774e-06, "loss": 0.68477768, "num_input_tokens_seen": 21258485, "step": 1007, "time_per_iteration": 2.6388988494873047 }, { "auxiliary_loss_clip": 0.01236809, "auxiliary_loss_mlp": 0.01038263, "balance_loss_clip": 1.02608323, "balance_loss_mlp": 1.02528059, "epoch": 0.1212048337642037, "flos": 20484631975680.0, "grad_norm": 2.4408800973429177, "language_loss": 0.79623902, "learning_rate": 3.913470345697929e-06, "loss": 0.81898969, "num_input_tokens_seen": 21277115, "step": 1008, "time_per_iteration": 4.580037593841553 }, { "auxiliary_loss_clip": 0.01239612, "auxiliary_loss_mlp": 0.01040745, "balance_loss_clip": 0.91237068, "balance_loss_mlp": 1.02794766, "epoch": 0.12132507665484278, "flos": 22346061557760.0, "grad_norm": 2.080717016702495, "language_loss": 0.85817128, "learning_rate": 3.913243550860153e-06, "loss": 0.88097489, "num_input_tokens_seen": 21294880, "step": 1009, "time_per_iteration": 3.7340235710144043 }, { "auxiliary_loss_clip": 0.01243495, "auxiliary_loss_mlp": 0.0104416, "balance_loss_clip": 1.03201079, "balance_loss_mlp": 1.03139877, "epoch": 0.12144531954548188, "flos": 29314957818240.0, "grad_norm": 2.1350181289953136, "language_loss": 0.76199573, "learning_rate": 3.913016465784852e-06, "loss": 0.78487229, "num_input_tokens_seen": 21315555, "step": 1010, "time_per_iteration": 3.822282552719116 }, { "auxiliary_loss_clip": 0.01236682, "auxiliary_loss_mlp": 0.01038503, "balance_loss_clip": 0.91099209, "balance_loss_mlp": 1.02535367, "epoch": 0.12156556243612096, "flos": 20485242506880.0, "grad_norm": 2.3403730560868232, "language_loss": 0.72507846, "learning_rate": 3.912789090506474e-06, "loss": 0.74783033, "num_input_tokens_seen": 21334815, "step": 1011, "time_per_iteration": 2.729619026184082 }, { "auxiliary_loss_clip": 0.01238964, "auxiliary_loss_mlp": 0.01040613, "balance_loss_clip": 0.94640625, "balance_loss_mlp": 1.02782238, "epoch": 0.12168580532676006, "flos": 16472009796480.0, "grad_norm": 6.30443242250921, "language_loss": 0.72008646, "learning_rate": 3.9125614250595114e-06, "loss": 0.74288225, "num_input_tokens_seen": 21351025, "step": 1012, "time_per_iteration": 2.738476514816284 }, { "auxiliary_loss_clip": 0.01240379, "auxiliary_loss_mlp": 0.01047993, "balance_loss_clip": 1.02811038, "balance_loss_mlp": 1.03546369, "epoch": 0.12180604821739914, "flos": 15341290588800.0, "grad_norm": 2.5479312992582996, "language_loss": 0.88785428, "learning_rate": 3.912333469478502e-06, "loss": 0.91073799, "num_input_tokens_seen": 21368990, "step": 1013, "time_per_iteration": 2.683885335922241 }, { "auxiliary_loss_clip": 0.01235008, "auxiliary_loss_mlp": 0.01041122, "balance_loss_clip": 0.98713797, "balance_loss_mlp": 1.02868879, "epoch": 0.12192629110803824, "flos": 19318038059520.0, "grad_norm": 2.831666372265466, "language_loss": 0.77972019, "learning_rate": 3.912105223798025e-06, "loss": 0.80248153, "num_input_tokens_seen": 21388410, "step": 1014, "time_per_iteration": 2.728947639465332 }, { "auxiliary_loss_clip": 0.01149722, "auxiliary_loss_mlp": 0.0100671, "balance_loss_clip": 0.92453015, "balance_loss_mlp": 0.99893761, "epoch": 0.12204653399867733, "flos": 47725354085760.0, "grad_norm": 0.9963456413941629, "language_loss": 0.67687374, "learning_rate": 3.9118766880527065e-06, "loss": 0.69843811, "num_input_tokens_seen": 21442845, "step": 1015, "time_per_iteration": 3.240788698196411 }, { "auxiliary_loss_clip": 0.01216931, "auxiliary_loss_mlp": 0.01036176, "balance_loss_clip": 0.90633833, "balance_loss_mlp": 1.02367687, "epoch": 0.12216677688931642, "flos": 18221936584320.0, "grad_norm": 1.659343603569369, "language_loss": 0.73622257, "learning_rate": 3.9116478622772145e-06, "loss": 0.75875366, "num_input_tokens_seen": 21461420, "step": 1016, "time_per_iteration": 2.7614383697509766 }, { "auxiliary_loss_clip": 0.01234263, "auxiliary_loss_mlp": 0.01036835, "balance_loss_clip": 1.0252316, "balance_loss_mlp": 1.02367473, "epoch": 0.12228701977995551, "flos": 27525636789120.0, "grad_norm": 1.730122048826382, "language_loss": 0.88039732, "learning_rate": 3.911418746506261e-06, "loss": 0.90310836, "num_input_tokens_seen": 21481550, "step": 1017, "time_per_iteration": 2.7158286571502686 }, { "auxiliary_loss_clip": 0.01242848, "auxiliary_loss_mlp": 0.01046562, "balance_loss_clip": 1.03129005, "balance_loss_mlp": 1.03290057, "epoch": 0.1224072626705946, "flos": 21798136517760.0, "grad_norm": 1.6631673650026868, "language_loss": 0.78256077, "learning_rate": 3.911189340774604e-06, "loss": 0.80545491, "num_input_tokens_seen": 21501680, "step": 1018, "time_per_iteration": 2.9484362602233887 }, { "auxiliary_loss_clip": 0.01243823, "auxiliary_loss_mlp": 0.01044903, "balance_loss_clip": 0.98773026, "balance_loss_mlp": 1.0325408, "epoch": 0.1225275055612337, "flos": 20703758895360.0, "grad_norm": 1.7718868416404956, "language_loss": 0.79153848, "learning_rate": 3.910959645117043e-06, "loss": 0.81442571, "num_input_tokens_seen": 21521015, "step": 1019, "time_per_iteration": 2.721900701522827 }, { "auxiliary_loss_clip": 0.0114564, "auxiliary_loss_mlp": 0.01124674, "balance_loss_clip": 0.96637285, "balance_loss_mlp": 0.0, "epoch": 0.12264774845187278, "flos": 57745294462080.0, "grad_norm": 0.8186341488115472, "language_loss": 0.56737781, "learning_rate": 3.910729659568423e-06, "loss": 0.59008092, "num_input_tokens_seen": 21578200, "step": 1020, "time_per_iteration": 3.271026611328125 }, { "auxiliary_loss_clip": 0.01240335, "auxiliary_loss_mlp": 0.01044955, "balance_loss_clip": 0.99288934, "balance_loss_mlp": 1.03309393, "epoch": 0.12276799134251187, "flos": 26396282298240.0, "grad_norm": 1.7631214802983586, "language_loss": 0.82124233, "learning_rate": 3.9104993841636344e-06, "loss": 0.84409529, "num_input_tokens_seen": 21598770, "step": 1021, "time_per_iteration": 2.7575602531433105 }, { "auxiliary_loss_clip": 0.01235763, "auxiliary_loss_mlp": 0.01128959, "balance_loss_clip": 0.99434054, "balance_loss_mlp": 0.0, "epoch": 0.12288823423315097, "flos": 21064193919360.0, "grad_norm": 1.7509929041651418, "language_loss": 0.8084172, "learning_rate": 3.910268818937608e-06, "loss": 0.83206439, "num_input_tokens_seen": 21616925, "step": 1022, "time_per_iteration": 2.778214931488037 }, { "auxiliary_loss_clip": 0.01231659, "auxiliary_loss_mlp": 0.01043119, "balance_loss_clip": 0.91350579, "balance_loss_mlp": 1.02960062, "epoch": 0.12300847712379005, "flos": 12312441077760.0, "grad_norm": 3.2815265481953597, "language_loss": 0.87454808, "learning_rate": 3.9100379639253196e-06, "loss": 0.89729583, "num_input_tokens_seen": 21633645, "step": 1023, "time_per_iteration": 2.8459107875823975 }, { "auxiliary_loss_clip": 0.0123721, "auxiliary_loss_mlp": 0.01041676, "balance_loss_clip": 0.98755252, "balance_loss_mlp": 1.02847958, "epoch": 0.12312872001442915, "flos": 16762239688320.0, "grad_norm": 2.524808117789222, "language_loss": 0.86596131, "learning_rate": 3.909806819161791e-06, "loss": 0.8887502, "num_input_tokens_seen": 21649120, "step": 1024, "time_per_iteration": 2.67317795753479 }, { "auxiliary_loss_clip": 0.01239843, "auxiliary_loss_mlp": 0.0104624, "balance_loss_clip": 0.94875801, "balance_loss_mlp": 1.03320456, "epoch": 0.12324896290506823, "flos": 18404937400320.0, "grad_norm": 2.252665352894258, "language_loss": 0.86157465, "learning_rate": 3.909575384682086e-06, "loss": 0.88443547, "num_input_tokens_seen": 21668000, "step": 1025, "time_per_iteration": 2.6492562294006348 }, { "auxiliary_loss_clip": 0.01242706, "auxiliary_loss_mlp": 0.01043539, "balance_loss_clip": 1.02767873, "balance_loss_mlp": 1.03084922, "epoch": 0.12336920579570733, "flos": 18915407533440.0, "grad_norm": 1.8743385221079862, "language_loss": 0.69456816, "learning_rate": 3.9093436605213144e-06, "loss": 0.71743059, "num_input_tokens_seen": 21688500, "step": 1026, "time_per_iteration": 2.68941593170166 }, { "auxiliary_loss_clip": 0.01236584, "auxiliary_loss_mlp": 0.01039097, "balance_loss_clip": 0.98846954, "balance_loss_mlp": 1.02612734, "epoch": 0.12348944868634643, "flos": 23878369797120.0, "grad_norm": 2.059265473419083, "language_loss": 0.79155707, "learning_rate": 3.909111646714627e-06, "loss": 0.81431401, "num_input_tokens_seen": 21709345, "step": 1027, "time_per_iteration": 2.677626132965088 }, { "auxiliary_loss_clip": 0.01241012, "auxiliary_loss_mlp": 0.01039827, "balance_loss_clip": 1.06715572, "balance_loss_mlp": 1.02693403, "epoch": 0.12360969157698551, "flos": 19026084314880.0, "grad_norm": 2.0407573997979065, "language_loss": 0.72384018, "learning_rate": 3.9088793432972206e-06, "loss": 0.74664855, "num_input_tokens_seen": 21728165, "step": 1028, "time_per_iteration": 2.611727237701416 }, { "auxiliary_loss_clip": 0.01233845, "auxiliary_loss_mlp": 0.01040876, "balance_loss_clip": 0.91168582, "balance_loss_mlp": 1.0287044, "epoch": 0.1237299344676246, "flos": 13224607983360.0, "grad_norm": 1.9892294485895519, "language_loss": 0.82213616, "learning_rate": 3.908646750304336e-06, "loss": 0.84488332, "num_input_tokens_seen": 21745850, "step": 1029, "time_per_iteration": 2.7580325603485107 }, { "auxiliary_loss_clip": 0.01243089, "auxiliary_loss_mlp": 0.0104205, "balance_loss_clip": 0.99329424, "balance_loss_mlp": 1.02928257, "epoch": 0.12385017735826369, "flos": 20485673470080.0, "grad_norm": 1.7883153852368865, "language_loss": 0.8733958, "learning_rate": 3.908413867771257e-06, "loss": 0.89624715, "num_input_tokens_seen": 21764760, "step": 1030, "time_per_iteration": 2.6688504219055176 }, { "auxiliary_loss_clip": 0.01238242, "auxiliary_loss_mlp": 0.01042098, "balance_loss_clip": 1.02743697, "balance_loss_mlp": 1.02895474, "epoch": 0.12397042024890279, "flos": 17347835116800.0, "grad_norm": 1.7853712805205773, "language_loss": 0.80647761, "learning_rate": 3.908180695733311e-06, "loss": 0.82928103, "num_input_tokens_seen": 21784250, "step": 1031, "time_per_iteration": 2.5846118927001953 }, { "auxiliary_loss_clip": 0.01202624, "auxiliary_loss_mlp": 0.0103644, "balance_loss_clip": 0.9037472, "balance_loss_mlp": 1.02310085, "epoch": 0.12409066313954187, "flos": 20412343854720.0, "grad_norm": 1.7747958560397403, "language_loss": 0.825661, "learning_rate": 3.907947234225871e-06, "loss": 0.84805167, "num_input_tokens_seen": 21803260, "step": 1032, "time_per_iteration": 2.766573429107666 }, { "auxiliary_loss_clip": 0.01223397, "auxiliary_loss_mlp": 0.01045045, "balance_loss_clip": 0.87154096, "balance_loss_mlp": 1.0323019, "epoch": 0.12421090603018096, "flos": 20736688688640.0, "grad_norm": 1.9235830519520554, "language_loss": 0.87323415, "learning_rate": 3.907713483284352e-06, "loss": 0.89591861, "num_input_tokens_seen": 21822735, "step": 1033, "time_per_iteration": 3.843247890472412 }, { "auxiliary_loss_clip": 0.01208849, "auxiliary_loss_mlp": 0.01055798, "balance_loss_clip": 0.82487309, "balance_loss_mlp": 1.04293585, "epoch": 0.12433114892082006, "flos": 24498834353280.0, "grad_norm": 2.227744904467663, "language_loss": 0.9762125, "learning_rate": 3.907479442944216e-06, "loss": 0.99885899, "num_input_tokens_seen": 21841140, "step": 1034, "time_per_iteration": 3.8314459323883057 }, { "auxiliary_loss_clip": 0.01237219, "auxiliary_loss_mlp": 0.01042559, "balance_loss_clip": 1.02896595, "balance_loss_mlp": 1.02988744, "epoch": 0.12445139181145914, "flos": 19682315838720.0, "grad_norm": 2.410747087983905, "language_loss": 0.92307496, "learning_rate": 3.907245113240963e-06, "loss": 0.94587272, "num_input_tokens_seen": 21859260, "step": 1035, "time_per_iteration": 3.9229350090026855 }, { "auxiliary_loss_clip": 0.0123245, "auxiliary_loss_mlp": 0.01041952, "balance_loss_clip": 0.94588637, "balance_loss_mlp": 1.02889252, "epoch": 0.12457163470209824, "flos": 46423087522560.0, "grad_norm": 1.8982372668274068, "language_loss": 0.736395, "learning_rate": 3.907010494210144e-06, "loss": 0.75913894, "num_input_tokens_seen": 21881920, "step": 1036, "time_per_iteration": 3.8685195446014404 }, { "auxiliary_loss_clip": 0.01241484, "auxiliary_loss_mlp": 0.0103681, "balance_loss_clip": 1.02855051, "balance_loss_mlp": 1.02343488, "epoch": 0.12469187759273732, "flos": 20376289578240.0, "grad_norm": 1.9456567789252956, "language_loss": 0.91794837, "learning_rate": 3.9067755858873495e-06, "loss": 0.94073129, "num_input_tokens_seen": 21898720, "step": 1037, "time_per_iteration": 2.6029372215270996 }, { "auxiliary_loss_clip": 0.0114248, "auxiliary_loss_mlp": 0.01015408, "balance_loss_clip": 0.9238404, "balance_loss_mlp": 1.00782609, "epoch": 0.12481212048337642, "flos": 69224641447680.0, "grad_norm": 0.8624428647677352, "language_loss": 0.62808549, "learning_rate": 3.906540388308214e-06, "loss": 0.6496644, "num_input_tokens_seen": 21958305, "step": 1038, "time_per_iteration": 3.3202598094940186 }, { "auxiliary_loss_clip": 0.01217335, "auxiliary_loss_mlp": 0.01052041, "balance_loss_clip": 0.9115721, "balance_loss_mlp": 1.03972065, "epoch": 0.12493236337401552, "flos": 18223696350720.0, "grad_norm": 3.568449878222541, "language_loss": 0.81776762, "learning_rate": 3.906304901508417e-06, "loss": 0.84046131, "num_input_tokens_seen": 21977205, "step": 1039, "time_per_iteration": 2.759296417236328 }, { "auxiliary_loss_clip": 0.01243161, "auxiliary_loss_mlp": 0.01049942, "balance_loss_clip": 1.03217995, "balance_loss_mlp": 1.03742492, "epoch": 0.12505260626465461, "flos": 30044375303040.0, "grad_norm": 2.2040699140987052, "language_loss": 0.75904268, "learning_rate": 3.9060691255236835e-06, "loss": 0.78197372, "num_input_tokens_seen": 21997770, "step": 1040, "time_per_iteration": 2.709343671798706 }, { "auxiliary_loss_clip": 0.01237367, "auxiliary_loss_mlp": 0.01046477, "balance_loss_clip": 1.02765512, "balance_loss_mlp": 1.03272676, "epoch": 0.1251728491552937, "flos": 24433980347520.0, "grad_norm": 2.1714816148022327, "language_loss": 0.80733591, "learning_rate": 3.905833060389778e-06, "loss": 0.83017433, "num_input_tokens_seen": 22021890, "step": 1041, "time_per_iteration": 2.7563669681549072 }, { "auxiliary_loss_clip": 0.01244476, "auxiliary_loss_mlp": 0.01129053, "balance_loss_clip": 1.06830657, "balance_loss_mlp": 0.0, "epoch": 0.12529309204593278, "flos": 27119809952640.0, "grad_norm": 3.0656549591242235, "language_loss": 0.7849344, "learning_rate": 3.905596706142513e-06, "loss": 0.80866969, "num_input_tokens_seen": 22043300, "step": 1042, "time_per_iteration": 2.6335866451263428 }, { "auxiliary_loss_clip": 0.01223218, "auxiliary_loss_mlp": 0.01044218, "balance_loss_clip": 0.94514167, "balance_loss_mlp": 1.03152847, "epoch": 0.12541333493657186, "flos": 30774151923840.0, "grad_norm": 2.920447038164132, "language_loss": 0.86103302, "learning_rate": 3.9053600628177435e-06, "loss": 0.8837074, "num_input_tokens_seen": 22062910, "step": 1043, "time_per_iteration": 2.8093018531799316 }, { "auxiliary_loss_clip": 0.01239881, "auxiliary_loss_mlp": 0.01039102, "balance_loss_clip": 1.06523538, "balance_loss_mlp": 1.02706194, "epoch": 0.12553357782721097, "flos": 23659566099840.0, "grad_norm": 2.306795425705535, "language_loss": 0.84547007, "learning_rate": 3.905123130451367e-06, "loss": 0.86825991, "num_input_tokens_seen": 22084010, "step": 1044, "time_per_iteration": 2.6313507556915283 }, { "auxiliary_loss_clip": 0.01239853, "auxiliary_loss_mlp": 0.01048145, "balance_loss_clip": 1.06706131, "balance_loss_mlp": 1.03533626, "epoch": 0.12565382071785006, "flos": 24863758577280.0, "grad_norm": 1.8581045650611763, "language_loss": 0.79311049, "learning_rate": 3.904885909079326e-06, "loss": 0.81599039, "num_input_tokens_seen": 22102795, "step": 1045, "time_per_iteration": 2.6323068141937256 }, { "auxiliary_loss_clip": 0.01238903, "auxiliary_loss_mlp": 0.01040328, "balance_loss_clip": 1.02699625, "balance_loss_mlp": 1.02729225, "epoch": 0.12577406360848914, "flos": 21360780518400.0, "grad_norm": 2.4852539452868125, "language_loss": 0.78367275, "learning_rate": 3.904648398737607e-06, "loss": 0.80646503, "num_input_tokens_seen": 22121360, "step": 1046, "time_per_iteration": 2.640881299972534 }, { "auxiliary_loss_clip": 0.01235974, "auxiliary_loss_mlp": 0.01050967, "balance_loss_clip": 1.06257129, "balance_loss_mlp": 1.03840792, "epoch": 0.12589430649912825, "flos": 36138056774400.0, "grad_norm": 1.9165484921538805, "language_loss": 0.78064269, "learning_rate": 3.9044105994622406e-06, "loss": 0.8035121, "num_input_tokens_seen": 22142505, "step": 1047, "time_per_iteration": 2.7487714290618896 }, { "auxiliary_loss_clip": 0.01238326, "auxiliary_loss_mlp": 0.01129428, "balance_loss_clip": 0.98809415, "balance_loss_mlp": 0.0, "epoch": 0.12601454938976733, "flos": 25337671643520.0, "grad_norm": 1.8720631823244405, "language_loss": 0.81624597, "learning_rate": 3.9041725112893005e-06, "loss": 0.8399235, "num_input_tokens_seen": 22163730, "step": 1048, "time_per_iteration": 2.7274484634399414 }, { "auxiliary_loss_clip": 0.01230815, "auxiliary_loss_mlp": 0.01049219, "balance_loss_clip": 0.95254683, "balance_loss_mlp": 1.03707778, "epoch": 0.12613479228040642, "flos": 15560094286080.0, "grad_norm": 1.810198069234511, "language_loss": 0.75148392, "learning_rate": 3.903934134254904e-06, "loss": 0.77428424, "num_input_tokens_seen": 22181520, "step": 1049, "time_per_iteration": 2.728729724884033 }, { "auxiliary_loss_clip": 0.01245313, "auxiliary_loss_mlp": 0.0104231, "balance_loss_clip": 1.02667236, "balance_loss_mlp": 1.02950668, "epoch": 0.1262550351710455, "flos": 21470595373440.0, "grad_norm": 2.3998734893851785, "language_loss": 0.84802568, "learning_rate": 3.903695468395213e-06, "loss": 0.87090194, "num_input_tokens_seen": 22199390, "step": 1050, "time_per_iteration": 2.698988437652588 }, { "auxiliary_loss_clip": 0.01238176, "auxiliary_loss_mlp": 0.01042044, "balance_loss_clip": 0.98550326, "balance_loss_mlp": 1.02965808, "epoch": 0.1263752780616846, "flos": 31576719456000.0, "grad_norm": 2.460169788009642, "language_loss": 0.55713809, "learning_rate": 3.903456513746434e-06, "loss": 0.57994032, "num_input_tokens_seen": 22220365, "step": 1051, "time_per_iteration": 2.747149705886841 }, { "auxiliary_loss_clip": 0.01237635, "auxiliary_loss_mlp": 0.01043142, "balance_loss_clip": 1.06578732, "balance_loss_mlp": 1.02988577, "epoch": 0.1264955209523237, "flos": 28768217927040.0, "grad_norm": 1.7067586089133564, "language_loss": 0.87465632, "learning_rate": 3.903217270344815e-06, "loss": 0.8974641, "num_input_tokens_seen": 22240615, "step": 1052, "time_per_iteration": 2.773541212081909 }, { "auxiliary_loss_clip": 0.01226578, "auxiliary_loss_mlp": 0.01038378, "balance_loss_clip": 0.9467274, "balance_loss_mlp": 1.02591515, "epoch": 0.12661576384296278, "flos": 29241125412480.0, "grad_norm": 1.7679171845384511, "language_loss": 0.82475525, "learning_rate": 3.902977738226648e-06, "loss": 0.84740478, "num_input_tokens_seen": 22261350, "step": 1053, "time_per_iteration": 2.8017654418945312 }, { "auxiliary_loss_clip": 0.01241085, "auxiliary_loss_mlp": 0.01040314, "balance_loss_clip": 1.02933466, "balance_loss_mlp": 1.02700424, "epoch": 0.12673600673360189, "flos": 20850346298880.0, "grad_norm": 1.8099660569893408, "language_loss": 0.91274482, "learning_rate": 3.902737917428273e-06, "loss": 0.9355588, "num_input_tokens_seen": 22279515, "step": 1054, "time_per_iteration": 2.6250412464141846 }, { "auxiliary_loss_clip": 0.01238716, "auxiliary_loss_mlp": 0.0103666, "balance_loss_clip": 1.06418657, "balance_loss_mlp": 1.02365422, "epoch": 0.12685624962424097, "flos": 25263695583360.0, "grad_norm": 1.7295663528219876, "language_loss": 0.84009612, "learning_rate": 3.902497807986068e-06, "loss": 0.86284983, "num_input_tokens_seen": 22299535, "step": 1055, "time_per_iteration": 2.614899158477783 }, { "auxiliary_loss_clip": 0.012349, "auxiliary_loss_mlp": 0.0103979, "balance_loss_clip": 0.9472391, "balance_loss_mlp": 1.02666545, "epoch": 0.12697649251488005, "flos": 27527109246720.0, "grad_norm": 1.937529965841755, "language_loss": 0.84087968, "learning_rate": 3.902257409936458e-06, "loss": 0.8636266, "num_input_tokens_seen": 22320300, "step": 1056, "time_per_iteration": 2.8100788593292236 }, { "auxiliary_loss_clip": 0.01237558, "auxiliary_loss_mlp": 0.01036945, "balance_loss_clip": 0.98998106, "balance_loss_mlp": 1.02466655, "epoch": 0.12709673540551916, "flos": 21251863503360.0, "grad_norm": 1.9788970179480048, "language_loss": 0.84148365, "learning_rate": 3.902016723315912e-06, "loss": 0.86422873, "num_input_tokens_seen": 22338240, "step": 1057, "time_per_iteration": 2.6426479816436768 }, { "auxiliary_loss_clip": 0.01230724, "auxiliary_loss_mlp": 0.01044463, "balance_loss_clip": 1.02348804, "balance_loss_mlp": 1.03209496, "epoch": 0.12721697829615825, "flos": 25337707557120.0, "grad_norm": 2.6522484504328365, "language_loss": 0.69331169, "learning_rate": 3.901775748160941e-06, "loss": 0.7160635, "num_input_tokens_seen": 22357420, "step": 1058, "time_per_iteration": 2.721519947052002 }, { "auxiliary_loss_clip": 0.01141782, "auxiliary_loss_mlp": 0.0101384, "balance_loss_clip": 0.97022963, "balance_loss_mlp": 1.00640142, "epoch": 0.12733722118679733, "flos": 61943287754880.0, "grad_norm": 0.8256668241946233, "language_loss": 0.60886747, "learning_rate": 3.901534484508101e-06, "loss": 0.63042367, "num_input_tokens_seen": 22420095, "step": 1059, "time_per_iteration": 4.188354730606079 }, { "auxiliary_loss_clip": 0.01218434, "auxiliary_loss_mlp": 0.01040716, "balance_loss_clip": 0.98336804, "balance_loss_mlp": 1.02815771, "epoch": 0.1274574640774364, "flos": 26976742081920.0, "grad_norm": 2.41305271897889, "language_loss": 0.74728537, "learning_rate": 3.901292932393991e-06, "loss": 0.76987684, "num_input_tokens_seen": 22438975, "step": 1060, "time_per_iteration": 3.7952871322631836 }, { "auxiliary_loss_clip": 0.01242884, "auxiliary_loss_mlp": 0.01047278, "balance_loss_clip": 1.0683161, "balance_loss_mlp": 1.0348264, "epoch": 0.12757770696807552, "flos": 22236318529920.0, "grad_norm": 3.8403859658660995, "language_loss": 0.85424626, "learning_rate": 3.9010510918552555e-06, "loss": 0.87714791, "num_input_tokens_seen": 22458050, "step": 1061, "time_per_iteration": 3.5809082984924316 }, { "auxiliary_loss_clip": 0.01230534, "auxiliary_loss_mlp": 0.0104399, "balance_loss_clip": 0.98564446, "balance_loss_mlp": 1.03110385, "epoch": 0.1276979498587146, "flos": 28547905858560.0, "grad_norm": 2.7740046681443595, "language_loss": 0.74260819, "learning_rate": 3.900808962928581e-06, "loss": 0.76535344, "num_input_tokens_seen": 22475665, "step": 1062, "time_per_iteration": 3.658972978591919 }, { "auxiliary_loss_clip": 0.01235665, "auxiliary_loss_mlp": 0.01047223, "balance_loss_clip": 1.06704926, "balance_loss_mlp": 1.03485465, "epoch": 0.1278181927493537, "flos": 17420338719360.0, "grad_norm": 2.260634504839598, "language_loss": 0.89699614, "learning_rate": 3.900566545650698e-06, "loss": 0.91982502, "num_input_tokens_seen": 22493335, "step": 1063, "time_per_iteration": 2.5883326530456543 }, { "auxiliary_loss_clip": 0.01237515, "auxiliary_loss_mlp": 0.01043835, "balance_loss_clip": 1.02930856, "balance_loss_mlp": 1.0308888, "epoch": 0.1279384356399928, "flos": 21138636856320.0, "grad_norm": 3.9090456434322047, "language_loss": 0.81727129, "learning_rate": 3.900323840058381e-06, "loss": 0.84008479, "num_input_tokens_seen": 22511045, "step": 1064, "time_per_iteration": 2.6358118057250977 }, { "auxiliary_loss_clip": 0.01236875, "auxiliary_loss_mlp": 0.01034948, "balance_loss_clip": 1.02590179, "balance_loss_mlp": 1.02275276, "epoch": 0.12805867853063188, "flos": 26576733248640.0, "grad_norm": 2.1266797678398364, "language_loss": 0.81629324, "learning_rate": 3.900080846188449e-06, "loss": 0.83901143, "num_input_tokens_seen": 22529635, "step": 1065, "time_per_iteration": 2.695277214050293 }, { "auxiliary_loss_clip": 0.01238358, "auxiliary_loss_mlp": 0.01045781, "balance_loss_clip": 1.0643301, "balance_loss_mlp": 1.03324616, "epoch": 0.12817892142127096, "flos": 16436206915200.0, "grad_norm": 1.902755654114165, "language_loss": 0.81277192, "learning_rate": 3.8998375640777625e-06, "loss": 0.83561325, "num_input_tokens_seen": 22547505, "step": 1066, "time_per_iteration": 2.603999137878418 }, { "auxiliary_loss_clip": 0.01142145, "auxiliary_loss_mlp": 0.01008905, "balance_loss_clip": 0.96879363, "balance_loss_mlp": 1.00175273, "epoch": 0.12829916431191005, "flos": 60757049099520.0, "grad_norm": 0.706943319684227, "language_loss": 0.52634895, "learning_rate": 3.899593993763229e-06, "loss": 0.54785943, "num_input_tokens_seen": 22608465, "step": 1067, "time_per_iteration": 3.2305378913879395 }, { "auxiliary_loss_clip": 0.01222932, "auxiliary_loss_mlp": 0.01043402, "balance_loss_clip": 0.94748783, "balance_loss_mlp": 1.03081954, "epoch": 0.12841940720254916, "flos": 29786895636480.0, "grad_norm": 2.1243423476693977, "language_loss": 0.8122133, "learning_rate": 3.899350135281796e-06, "loss": 0.8348766, "num_input_tokens_seen": 22629465, "step": 1068, "time_per_iteration": 2.7537877559661865 }, { "auxiliary_loss_clip": 0.0123327, "auxiliary_loss_mlp": 0.01035245, "balance_loss_clip": 0.94961643, "balance_loss_mlp": 1.02296615, "epoch": 0.12853965009318824, "flos": 25951851319680.0, "grad_norm": 2.189155303241346, "language_loss": 0.79712045, "learning_rate": 3.8991059886704585e-06, "loss": 0.81980556, "num_input_tokens_seen": 22648970, "step": 1069, "time_per_iteration": 2.7639639377593994 }, { "auxiliary_loss_clip": 0.01222137, "auxiliary_loss_mlp": 0.0104406, "balance_loss_clip": 0.94745016, "balance_loss_mlp": 1.03151321, "epoch": 0.12865989298382732, "flos": 30846871008000.0, "grad_norm": 2.0204685773964526, "language_loss": 0.83195031, "learning_rate": 3.898861553966252e-06, "loss": 0.85461229, "num_input_tokens_seen": 22668620, "step": 1070, "time_per_iteration": 2.8116071224212646 }, { "auxiliary_loss_clip": 0.012177, "auxiliary_loss_mlp": 0.01038611, "balance_loss_clip": 0.82592839, "balance_loss_mlp": 1.02602255, "epoch": 0.12878013587446643, "flos": 25885776251520.0, "grad_norm": 1.6475321308741258, "language_loss": 0.88062739, "learning_rate": 3.898616831206257e-06, "loss": 0.90319049, "num_input_tokens_seen": 22689045, "step": 1071, "time_per_iteration": 2.9015517234802246 }, { "auxiliary_loss_clip": 0.01224796, "auxiliary_loss_mlp": 0.0105219, "balance_loss_clip": 0.9448545, "balance_loss_mlp": 1.03906476, "epoch": 0.12890037876510552, "flos": 23333138277120.0, "grad_norm": 2.1572424709768523, "language_loss": 0.76381862, "learning_rate": 3.8983718204276e-06, "loss": 0.78658855, "num_input_tokens_seen": 22711265, "step": 1072, "time_per_iteration": 2.8142619132995605 }, { "auxiliary_loss_clip": 0.0123264, "auxiliary_loss_mlp": 0.01041634, "balance_loss_clip": 0.98689437, "balance_loss_mlp": 1.0294745, "epoch": 0.1290206216557446, "flos": 23587242065280.0, "grad_norm": 1.8629005635263078, "language_loss": 0.82806885, "learning_rate": 3.898126521667446e-06, "loss": 0.8508116, "num_input_tokens_seen": 22731420, "step": 1073, "time_per_iteration": 2.760676860809326 }, { "auxiliary_loss_clip": 0.01234608, "auxiliary_loss_mlp": 0.010429, "balance_loss_clip": 1.02545178, "balance_loss_mlp": 1.03104496, "epoch": 0.12914086454638368, "flos": 24170610850560.0, "grad_norm": 1.7144472279186287, "language_loss": 0.83139539, "learning_rate": 3.897880934963007e-06, "loss": 0.85417044, "num_input_tokens_seen": 22750970, "step": 1074, "time_per_iteration": 2.753506898880005 }, { "auxiliary_loss_clip": 0.01227559, "auxiliary_loss_mlp": 0.01044918, "balance_loss_clip": 0.98457909, "balance_loss_mlp": 1.03294325, "epoch": 0.1292611074370228, "flos": 20267157081600.0, "grad_norm": 2.4130986825221017, "language_loss": 0.78197235, "learning_rate": 3.89763506035154e-06, "loss": 0.80469716, "num_input_tokens_seen": 22768820, "step": 1075, "time_per_iteration": 2.7325541973114014 }, { "auxiliary_loss_clip": 0.01220985, "auxiliary_loss_mlp": 0.01043093, "balance_loss_clip": 1.02422035, "balance_loss_mlp": 1.03093362, "epoch": 0.12938135032766188, "flos": 27377684668800.0, "grad_norm": 1.6829051855992012, "language_loss": 0.81054676, "learning_rate": 3.897388897870343e-06, "loss": 0.83318758, "num_input_tokens_seen": 22789460, "step": 1076, "time_per_iteration": 2.7737293243408203 }, { "auxiliary_loss_clip": 0.0124358, "auxiliary_loss_mlp": 0.01047837, "balance_loss_clip": 0.9867382, "balance_loss_mlp": 1.03502202, "epoch": 0.12950159321830096, "flos": 29277107861760.0, "grad_norm": 1.776435340740662, "language_loss": 0.74897069, "learning_rate": 3.89714244755676e-06, "loss": 0.77188492, "num_input_tokens_seen": 22810820, "step": 1077, "time_per_iteration": 2.764070987701416 }, { "auxiliary_loss_clip": 0.0120982, "auxiliary_loss_mlp": 0.01039951, "balance_loss_clip": 0.90445697, "balance_loss_mlp": 1.02740407, "epoch": 0.12962183610894007, "flos": 24534888629760.0, "grad_norm": 2.416846899509836, "language_loss": 0.86337662, "learning_rate": 3.896895709448175e-06, "loss": 0.88587439, "num_input_tokens_seen": 22830570, "step": 1078, "time_per_iteration": 2.8652729988098145 }, { "auxiliary_loss_clip": 0.01215202, "auxiliary_loss_mlp": 0.01040358, "balance_loss_clip": 0.86562669, "balance_loss_mlp": 1.02757835, "epoch": 0.12974207899957915, "flos": 11215944552960.0, "grad_norm": 5.351164917160013, "language_loss": 0.77325135, "learning_rate": 3.896648683582019e-06, "loss": 0.79580694, "num_input_tokens_seen": 22845905, "step": 1079, "time_per_iteration": 2.809500217437744 }, { "auxiliary_loss_clip": 0.01227239, "auxiliary_loss_mlp": 0.01047822, "balance_loss_clip": 0.90941364, "balance_loss_mlp": 1.03628826, "epoch": 0.12986232189021824, "flos": 24717889445760.0, "grad_norm": 2.2776327135883587, "language_loss": 0.80834156, "learning_rate": 3.896401369995766e-06, "loss": 0.83109224, "num_input_tokens_seen": 22865710, "step": 1080, "time_per_iteration": 2.8260974884033203 }, { "auxiliary_loss_clip": 0.01239222, "auxiliary_loss_mlp": 0.01040398, "balance_loss_clip": 1.06682563, "balance_loss_mlp": 1.02883494, "epoch": 0.12998256478085732, "flos": 23915357827200.0, "grad_norm": 1.797488217091338, "language_loss": 0.7936188, "learning_rate": 3.896153768726932e-06, "loss": 0.81641495, "num_input_tokens_seen": 22886020, "step": 1081, "time_per_iteration": 2.699359655380249 }, { "auxiliary_loss_clip": 0.01232852, "auxiliary_loss_mlp": 0.01040266, "balance_loss_clip": 1.02788007, "balance_loss_mlp": 1.02743864, "epoch": 0.13010280767149643, "flos": 18624207974400.0, "grad_norm": 2.212966424389325, "language_loss": 0.88144636, "learning_rate": 3.8959058798130806e-06, "loss": 0.90417755, "num_input_tokens_seen": 22903995, "step": 1082, "time_per_iteration": 2.679471015930176 }, { "auxiliary_loss_clip": 0.0123168, "auxiliary_loss_mlp": 0.01129366, "balance_loss_clip": 0.98698401, "balance_loss_mlp": 0.0, "epoch": 0.1302230505621355, "flos": 22783992174720.0, "grad_norm": 3.5926277065491505, "language_loss": 0.7527225, "learning_rate": 3.895657703291814e-06, "loss": 0.77633297, "num_input_tokens_seen": 22924100, "step": 1083, "time_per_iteration": 2.837441921234131 }, { "auxiliary_loss_clip": 0.01238589, "auxiliary_loss_mlp": 0.01035083, "balance_loss_clip": 0.98434806, "balance_loss_mlp": 1.02272153, "epoch": 0.1303432934527746, "flos": 21323612920320.0, "grad_norm": 7.428883518641305, "language_loss": 0.7947427, "learning_rate": 3.895409239200781e-06, "loss": 0.81747949, "num_input_tokens_seen": 22939985, "step": 1084, "time_per_iteration": 2.7675540447235107 }, { "auxiliary_loss_clip": 0.01227652, "auxiliary_loss_mlp": 0.01043135, "balance_loss_clip": 1.02465391, "balance_loss_mlp": 1.03088641, "epoch": 0.1304635363434137, "flos": 20922490765440.0, "grad_norm": 2.994479245077126, "language_loss": 0.9144789, "learning_rate": 3.895160487577673e-06, "loss": 0.93718678, "num_input_tokens_seen": 22957555, "step": 1085, "time_per_iteration": 4.3083906173706055 }, { "auxiliary_loss_clip": 0.01135597, "auxiliary_loss_mlp": 0.01040555, "balance_loss_clip": 1.0019424, "balance_loss_mlp": 1.03402185, "epoch": 0.1305837792340528, "flos": 63245659080960.0, "grad_norm": 0.7865893224661836, "language_loss": 0.60915816, "learning_rate": 3.894911448460226e-06, "loss": 0.63091969, "num_input_tokens_seen": 23016870, "step": 1086, "time_per_iteration": 4.148955821990967 }, { "auxiliary_loss_clip": 0.01198202, "auxiliary_loss_mlp": 0.01036842, "balance_loss_clip": 0.82551152, "balance_loss_mlp": 1.02516508, "epoch": 0.13070402212469187, "flos": 26428852955520.0, "grad_norm": 2.3427094795877013, "language_loss": 0.72594452, "learning_rate": 3.8946621218862195e-06, "loss": 0.74829495, "num_input_tokens_seen": 23037870, "step": 1087, "time_per_iteration": 1.1518328189849854 }, { "auxiliary_loss_clip": 0.01228451, "auxiliary_loss_mlp": 0.01041286, "balance_loss_clip": 0.94789469, "balance_loss_mlp": 1.02866197, "epoch": 0.13082426501533098, "flos": 27673409341440.0, "grad_norm": 1.8323694264977026, "language_loss": 0.8879016, "learning_rate": 3.894412507893475e-06, "loss": 0.91059899, "num_input_tokens_seen": 23058150, "step": 1088, "time_per_iteration": 3.729907751083374 }, { "auxiliary_loss_clip": 0.01235403, "auxiliary_loss_mlp": 0.01040717, "balance_loss_clip": 0.90953076, "balance_loss_mlp": 1.02747893, "epoch": 0.13094450790597006, "flos": 24826770547200.0, "grad_norm": 2.016285417194981, "language_loss": 0.71923339, "learning_rate": 3.894162606519859e-06, "loss": 0.74199456, "num_input_tokens_seen": 23077100, "step": 1089, "time_per_iteration": 2.836951971054077 }, { "auxiliary_loss_clip": 0.01226124, "auxiliary_loss_mlp": 0.01052432, "balance_loss_clip": 0.9100132, "balance_loss_mlp": 1.04005194, "epoch": 0.13106475079660915, "flos": 19062605468160.0, "grad_norm": 2.1073758907978144, "language_loss": 0.77019691, "learning_rate": 3.893912417803282e-06, "loss": 0.79298252, "num_input_tokens_seen": 23096815, "step": 1090, "time_per_iteration": 2.8140432834625244 }, { "auxiliary_loss_clip": 0.01223301, "auxiliary_loss_mlp": 0.01036372, "balance_loss_clip": 0.90284288, "balance_loss_mlp": 1.02353358, "epoch": 0.13118499368724823, "flos": 28913189218560.0, "grad_norm": 2.2042873621197168, "language_loss": 0.7746321, "learning_rate": 3.8936619417816975e-06, "loss": 0.79722881, "num_input_tokens_seen": 23117145, "step": 1091, "time_per_iteration": 2.8294126987457275 }, { "auxiliary_loss_clip": 0.01231028, "auxiliary_loss_mlp": 0.01038047, "balance_loss_clip": 0.95134342, "balance_loss_mlp": 1.02508903, "epoch": 0.13130523657788734, "flos": 14283398206080.0, "grad_norm": 2.2063677222951688, "language_loss": 0.71144462, "learning_rate": 3.8934111784931015e-06, "loss": 0.73413539, "num_input_tokens_seen": 23134595, "step": 1092, "time_per_iteration": 2.8399102687835693 }, { "auxiliary_loss_clip": 0.01138758, "auxiliary_loss_mlp": 0.0100962, "balance_loss_clip": 0.96159542, "balance_loss_mlp": 1.00294387, "epoch": 0.13142547946852642, "flos": 70174155519360.0, "grad_norm": 0.9150952583582795, "language_loss": 0.59048074, "learning_rate": 3.893160127975535e-06, "loss": 0.61196452, "num_input_tokens_seen": 23195285, "step": 1093, "time_per_iteration": 3.4045591354370117 }, { "auxiliary_loss_clip": 0.01230942, "auxiliary_loss_mlp": 0.01040526, "balance_loss_clip": 0.9071849, "balance_loss_mlp": 1.02878404, "epoch": 0.1315457223591655, "flos": 45805998844800.0, "grad_norm": 2.279997757516046, "language_loss": 0.81360036, "learning_rate": 3.8929087902670826e-06, "loss": 0.83631504, "num_input_tokens_seen": 23216915, "step": 1094, "time_per_iteration": 3.028249502182007 }, { "auxiliary_loss_clip": 0.01139562, "auxiliary_loss_mlp": 0.010063, "balance_loss_clip": 0.99987721, "balance_loss_mlp": 0.99952859, "epoch": 0.13166596524980462, "flos": 62881165820160.0, "grad_norm": 0.9477252399004898, "language_loss": 0.60736108, "learning_rate": 3.8926571654058715e-06, "loss": 0.6288197, "num_input_tokens_seen": 23273560, "step": 1095, "time_per_iteration": 3.2353291511535645 }, { "auxiliary_loss_clip": 0.01222023, "auxiliary_loss_mlp": 0.01041781, "balance_loss_clip": 0.94669878, "balance_loss_mlp": 1.02946639, "epoch": 0.1317862081404437, "flos": 23586523793280.0, "grad_norm": 2.386611523400928, "language_loss": 0.76951766, "learning_rate": 3.892405253430074e-06, "loss": 0.79215574, "num_input_tokens_seen": 23291080, "step": 1096, "time_per_iteration": 2.990349769592285 }, { "auxiliary_loss_clip": 0.01233539, "auxiliary_loss_mlp": 0.01128894, "balance_loss_clip": 0.98811316, "balance_loss_mlp": 0.0, "epoch": 0.13190645103108278, "flos": 20260764460800.0, "grad_norm": 2.0179991062469433, "language_loss": 0.81974423, "learning_rate": 3.892153054377904e-06, "loss": 0.84336853, "num_input_tokens_seen": 23308485, "step": 1097, "time_per_iteration": 2.752824306488037 }, { "auxiliary_loss_clip": 0.01140057, "auxiliary_loss_mlp": 0.01009496, "balance_loss_clip": 0.85503751, "balance_loss_mlp": 1.00248623, "epoch": 0.13202669392172187, "flos": 53455440136320.0, "grad_norm": 0.9677166298691136, "language_loss": 0.59396881, "learning_rate": 3.891900568287619e-06, "loss": 0.61546433, "num_input_tokens_seen": 23360870, "step": 1098, "time_per_iteration": 3.1946420669555664 }, { "auxiliary_loss_clip": 0.01233782, "auxiliary_loss_mlp": 0.01035651, "balance_loss_clip": 0.94748431, "balance_loss_mlp": 1.0234741, "epoch": 0.13214693681236098, "flos": 15851293845120.0, "grad_norm": 2.582294362076957, "language_loss": 0.72324443, "learning_rate": 3.891647795197523e-06, "loss": 0.74593872, "num_input_tokens_seen": 23376910, "step": 1099, "time_per_iteration": 2.8221275806427 }, { "auxiliary_loss_clip": 0.0123381, "auxiliary_loss_mlp": 0.0104095, "balance_loss_clip": 0.94467944, "balance_loss_mlp": 1.0279026, "epoch": 0.13226717970300006, "flos": 19353840940800.0, "grad_norm": 2.176648106329632, "language_loss": 0.68860376, "learning_rate": 3.8913947351459605e-06, "loss": 0.71135139, "num_input_tokens_seen": 23394450, "step": 1100, "time_per_iteration": 2.8683626651763916 }, { "auxiliary_loss_clip": 0.01233113, "auxiliary_loss_mlp": 0.01040405, "balance_loss_clip": 1.06380224, "balance_loss_mlp": 1.02861524, "epoch": 0.13238742259363914, "flos": 20698084546560.0, "grad_norm": 1.8752042512698457, "language_loss": 0.67836231, "learning_rate": 3.89114138817132e-06, "loss": 0.70109749, "num_input_tokens_seen": 23411115, "step": 1101, "time_per_iteration": 2.785754680633545 }, { "auxiliary_loss_clip": 0.01228828, "auxiliary_loss_mlp": 0.01037717, "balance_loss_clip": 1.02611041, "balance_loss_mlp": 1.02547979, "epoch": 0.13250766548427825, "flos": 21032449274880.0, "grad_norm": 1.792672890638599, "language_loss": 0.84553164, "learning_rate": 3.890887754312035e-06, "loss": 0.86819708, "num_input_tokens_seen": 23429360, "step": 1102, "time_per_iteration": 2.786247968673706 }, { "auxiliary_loss_clip": 0.01222991, "auxiliary_loss_mlp": 0.01045409, "balance_loss_clip": 0.98334205, "balance_loss_mlp": 1.0333097, "epoch": 0.13262790837491734, "flos": 22637871648000.0, "grad_norm": 1.907150597922227, "language_loss": 0.87793082, "learning_rate": 3.890633833606581e-06, "loss": 0.90061486, "num_input_tokens_seen": 23449050, "step": 1103, "time_per_iteration": 2.9589502811431885 }, { "auxiliary_loss_clip": 0.01230991, "auxiliary_loss_mlp": 0.01045166, "balance_loss_clip": 1.02811956, "balance_loss_mlp": 1.03310752, "epoch": 0.13274815126555642, "flos": 19683141851520.0, "grad_norm": 3.593861964783029, "language_loss": 0.69697952, "learning_rate": 3.890379626093477e-06, "loss": 0.71974111, "num_input_tokens_seen": 23468800, "step": 1104, "time_per_iteration": 2.7944653034210205 }, { "auxiliary_loss_clip": 0.01206805, "auxiliary_loss_mlp": 0.01039652, "balance_loss_clip": 0.90309972, "balance_loss_mlp": 1.02662849, "epoch": 0.1328683941561955, "flos": 21317687176320.0, "grad_norm": 2.4258539449994907, "language_loss": 0.92503041, "learning_rate": 3.890125131811287e-06, "loss": 0.94749498, "num_input_tokens_seen": 23486850, "step": 1105, "time_per_iteration": 2.7123348712921143 }, { "auxiliary_loss_clip": 0.01213971, "auxiliary_loss_mlp": 0.01032383, "balance_loss_clip": 0.97962803, "balance_loss_mlp": 1.02074802, "epoch": 0.1329886370468346, "flos": 13699131580800.0, "grad_norm": 2.0380301002046015, "language_loss": 0.75481927, "learning_rate": 3.889870350798618e-06, "loss": 0.77728283, "num_input_tokens_seen": 23504195, "step": 1106, "time_per_iteration": 2.727754831314087 }, { "auxiliary_loss_clip": 0.01233487, "auxiliary_loss_mlp": 0.01037349, "balance_loss_clip": 1.06293082, "balance_loss_mlp": 1.02536798, "epoch": 0.1331088799374737, "flos": 21032413361280.0, "grad_norm": 1.6216238509433838, "language_loss": 0.78597879, "learning_rate": 3.889615283094119e-06, "loss": 0.80868709, "num_input_tokens_seen": 23523385, "step": 1107, "time_per_iteration": 2.6100823879241943 }, { "auxiliary_loss_clip": 0.01239191, "auxiliary_loss_mlp": 0.01041825, "balance_loss_clip": 1.06265187, "balance_loss_mlp": 1.02924824, "epoch": 0.13322912282811278, "flos": 18260432985600.0, "grad_norm": 2.1437321889612435, "language_loss": 0.84705377, "learning_rate": 3.889359928736485e-06, "loss": 0.86986399, "num_input_tokens_seen": 23541330, "step": 1108, "time_per_iteration": 2.7019705772399902 }, { "auxiliary_loss_clip": 0.01224313, "auxiliary_loss_mlp": 0.01128928, "balance_loss_clip": 0.98514199, "balance_loss_mlp": 0.0, "epoch": 0.1333493657187519, "flos": 24460876656000.0, "grad_norm": 2.3505019453258504, "language_loss": 0.90951514, "learning_rate": 3.889104287764451e-06, "loss": 0.93304753, "num_input_tokens_seen": 23561705, "step": 1109, "time_per_iteration": 2.7104086875915527 }, { "auxiliary_loss_clip": 0.0122968, "auxiliary_loss_mlp": 0.01040898, "balance_loss_clip": 0.98960221, "balance_loss_mlp": 1.0287919, "epoch": 0.13346960860939097, "flos": 22158930677760.0, "grad_norm": 2.099979396121369, "language_loss": 0.90407121, "learning_rate": 3.888848360216798e-06, "loss": 0.92677701, "num_input_tokens_seen": 23579350, "step": 1110, "time_per_iteration": 2.6922497749328613 }, { "auxiliary_loss_clip": 0.01141145, "auxiliary_loss_mlp": 0.01012363, "balance_loss_clip": 0.96166813, "balance_loss_mlp": 1.00563931, "epoch": 0.13358985150003005, "flos": 67931212608000.0, "grad_norm": 1.0819822501750942, "language_loss": 0.56593758, "learning_rate": 3.888592146132351e-06, "loss": 0.58747268, "num_input_tokens_seen": 23640620, "step": 1111, "time_per_iteration": 3.3504269123077393 }, { "auxiliary_loss_clip": 0.01232501, "auxiliary_loss_mlp": 0.01045692, "balance_loss_clip": 1.02824759, "balance_loss_mlp": 1.03356254, "epoch": 0.13371009439066917, "flos": 26834284742400.0, "grad_norm": 1.7313738176673839, "language_loss": 0.78415167, "learning_rate": 3.888335645549978e-06, "loss": 0.80693364, "num_input_tokens_seen": 23661040, "step": 1112, "time_per_iteration": 5.316360712051392 }, { "auxiliary_loss_clip": 0.0123668, "auxiliary_loss_mlp": 0.01033521, "balance_loss_clip": 1.06662083, "balance_loss_mlp": 1.02160573, "epoch": 0.13383033728130825, "flos": 26322844942080.0, "grad_norm": 9.066211207573609, "language_loss": 0.81769645, "learning_rate": 3.888078858508588e-06, "loss": 0.84039843, "num_input_tokens_seen": 23680900, "step": 1113, "time_per_iteration": 3.5524415969848633 }, { "auxiliary_loss_clip": 0.01227594, "auxiliary_loss_mlp": 0.01043008, "balance_loss_clip": 0.98818803, "balance_loss_mlp": 1.03102112, "epoch": 0.13395058017194733, "flos": 22563931501440.0, "grad_norm": 1.8438133863351176, "language_loss": 0.84480363, "learning_rate": 3.8878217850471365e-06, "loss": 0.8675096, "num_input_tokens_seen": 23700815, "step": 1114, "time_per_iteration": 3.6761999130249023 }, { "auxiliary_loss_clip": 0.0123942, "auxiliary_loss_mlp": 0.01041992, "balance_loss_clip": 1.0679009, "balance_loss_mlp": 1.02945113, "epoch": 0.13407082306258641, "flos": 25810938264960.0, "grad_norm": 2.629205780929822, "language_loss": 0.73921424, "learning_rate": 3.887564425204621e-06, "loss": 0.76202834, "num_input_tokens_seen": 23722500, "step": 1115, "time_per_iteration": 2.705805540084839 }, { "auxiliary_loss_clip": 0.01135301, "auxiliary_loss_mlp": 0.01008355, "balance_loss_clip": 0.92454171, "balance_loss_mlp": 1.0017271, "epoch": 0.13419106595322552, "flos": 68338365269760.0, "grad_norm": 0.8372801753606768, "language_loss": 0.54648054, "learning_rate": 3.887306779020083e-06, "loss": 0.56791711, "num_input_tokens_seen": 23777155, "step": 1116, "time_per_iteration": 3.2912580966949463 }, { "auxiliary_loss_clip": 0.01237262, "auxiliary_loss_mlp": 0.01040572, "balance_loss_clip": 1.02855134, "balance_loss_mlp": 1.02870512, "epoch": 0.1343113088438646, "flos": 20449080489600.0, "grad_norm": 2.1857032776183654, "language_loss": 0.70267874, "learning_rate": 3.887048846532608e-06, "loss": 0.72545707, "num_input_tokens_seen": 23794130, "step": 1117, "time_per_iteration": 2.6662750244140625 }, { "auxiliary_loss_clip": 0.01135951, "auxiliary_loss_mlp": 0.01010573, "balance_loss_clip": 0.92216057, "balance_loss_mlp": 1.00370622, "epoch": 0.1344315517345037, "flos": 67389784951680.0, "grad_norm": 0.7626443364478569, "language_loss": 0.58124143, "learning_rate": 3.8867906277813224e-06, "loss": 0.60270667, "num_input_tokens_seen": 23852285, "step": 1118, "time_per_iteration": 3.232752561569214 }, { "auxiliary_loss_clip": 0.01240204, "auxiliary_loss_mlp": 0.01128705, "balance_loss_clip": 1.02898049, "balance_loss_mlp": 0.0, "epoch": 0.1345517946251428, "flos": 40734442788480.0, "grad_norm": 2.1641047479406934, "language_loss": 0.73774004, "learning_rate": 3.886532122805399e-06, "loss": 0.76142913, "num_input_tokens_seen": 23874765, "step": 1119, "time_per_iteration": 2.9111435413360596 }, { "auxiliary_loss_clip": 0.01212618, "auxiliary_loss_mlp": 0.01042419, "balance_loss_clip": 0.86919874, "balance_loss_mlp": 1.02986681, "epoch": 0.13467203751578188, "flos": 22816850140800.0, "grad_norm": 1.6987609650665336, "language_loss": 0.8920275, "learning_rate": 3.886273331644053e-06, "loss": 0.9145779, "num_input_tokens_seen": 23893635, "step": 1120, "time_per_iteration": 2.8786089420318604 }, { "auxiliary_loss_clip": 0.01221392, "auxiliary_loss_mlp": 0.0104176, "balance_loss_clip": 0.91059953, "balance_loss_mlp": 1.02977383, "epoch": 0.13479228040642097, "flos": 17091576512640.0, "grad_norm": 1.9792098742619173, "language_loss": 0.82329893, "learning_rate": 3.886014254336542e-06, "loss": 0.84593046, "num_input_tokens_seen": 23910110, "step": 1121, "time_per_iteration": 2.7635750770568848 }, { "auxiliary_loss_clip": 0.01231596, "auxiliary_loss_mlp": 0.01037304, "balance_loss_clip": 1.02670693, "balance_loss_mlp": 1.02599716, "epoch": 0.13491252329706005, "flos": 23730525417600.0, "grad_norm": 1.6725308313887117, "language_loss": 0.92645931, "learning_rate": 3.885754890922168e-06, "loss": 0.94914836, "num_input_tokens_seen": 23930440, "step": 1122, "time_per_iteration": 2.747213840484619 }, { "auxiliary_loss_clip": 0.01212115, "auxiliary_loss_mlp": 0.01046171, "balance_loss_clip": 0.8327145, "balance_loss_mlp": 1.0337733, "epoch": 0.13503276618769916, "flos": 34127058960000.0, "grad_norm": 2.0502466159368664, "language_loss": 0.78521764, "learning_rate": 3.885495241440277e-06, "loss": 0.80780053, "num_input_tokens_seen": 23954535, "step": 1123, "time_per_iteration": 3.086962938308716 }, { "auxiliary_loss_clip": 0.01234059, "auxiliary_loss_mlp": 0.01038828, "balance_loss_clip": 1.06365037, "balance_loss_mlp": 1.02663875, "epoch": 0.13515300907833824, "flos": 17712328377600.0, "grad_norm": 1.7591936524882847, "language_loss": 0.74504739, "learning_rate": 3.885235305930257e-06, "loss": 0.76777625, "num_input_tokens_seen": 23972735, "step": 1124, "time_per_iteration": 2.872965097427368 }, { "auxiliary_loss_clip": 0.01222835, "auxiliary_loss_mlp": 0.01043037, "balance_loss_clip": 0.94996512, "balance_loss_mlp": 1.03088951, "epoch": 0.13527325196897733, "flos": 20260872201600.0, "grad_norm": 1.9074088066836445, "language_loss": 0.85209441, "learning_rate": 3.884975084431539e-06, "loss": 0.87475312, "num_input_tokens_seen": 23987685, "step": 1125, "time_per_iteration": 2.7535109519958496 }, { "auxiliary_loss_clip": 0.01222613, "auxiliary_loss_mlp": 0.01128845, "balance_loss_clip": 1.02603614, "balance_loss_mlp": 0.0, "epoch": 0.13539349485961644, "flos": 18186492839040.0, "grad_norm": 2.406015427565103, "language_loss": 0.91383803, "learning_rate": 3.8847145769836e-06, "loss": 0.9373526, "num_input_tokens_seen": 24004105, "step": 1126, "time_per_iteration": 2.8265395164489746 }, { "auxiliary_loss_clip": 0.01238995, "auxiliary_loss_mlp": 0.01038963, "balance_loss_clip": 1.06697464, "balance_loss_mlp": 1.02629709, "epoch": 0.13551373775025552, "flos": 19317463441920.0, "grad_norm": 2.8355830628493353, "language_loss": 0.66730797, "learning_rate": 3.884453783625959e-06, "loss": 0.69008756, "num_input_tokens_seen": 24021715, "step": 1127, "time_per_iteration": 2.658266067504883 }, { "auxiliary_loss_clip": 0.01228766, "auxiliary_loss_mlp": 0.0103958, "balance_loss_clip": 0.9902212, "balance_loss_mlp": 1.0271883, "epoch": 0.1356339806408946, "flos": 20850813175680.0, "grad_norm": 2.2351881287801874, "language_loss": 0.85039473, "learning_rate": 3.884192704398176e-06, "loss": 0.87307811, "num_input_tokens_seen": 24038915, "step": 1128, "time_per_iteration": 2.7343695163726807 }, { "auxiliary_loss_clip": 0.01234885, "auxiliary_loss_mlp": 0.01037683, "balance_loss_clip": 1.02615857, "balance_loss_mlp": 1.02638817, "epoch": 0.13575422353153369, "flos": 50476037696640.0, "grad_norm": 1.807517175633649, "language_loss": 0.74473387, "learning_rate": 3.883931339339858e-06, "loss": 0.76745951, "num_input_tokens_seen": 24063300, "step": 1129, "time_per_iteration": 2.9391303062438965 }, { "auxiliary_loss_clip": 0.0123878, "auxiliary_loss_mlp": 0.01042037, "balance_loss_clip": 1.02792478, "balance_loss_mlp": 1.02971041, "epoch": 0.1358744664221728, "flos": 18150797698560.0, "grad_norm": 2.9702016498601425, "language_loss": 0.79028523, "learning_rate": 3.883669688490654e-06, "loss": 0.81309342, "num_input_tokens_seen": 24081070, "step": 1130, "time_per_iteration": 2.721463203430176 }, { "auxiliary_loss_clip": 0.01214932, "auxiliary_loss_mlp": 0.01128594, "balance_loss_clip": 0.98404956, "balance_loss_mlp": 0.0, "epoch": 0.13599470931281188, "flos": 18442966924800.0, "grad_norm": 4.932944080369545, "language_loss": 0.85744429, "learning_rate": 3.883407751890256e-06, "loss": 0.88087964, "num_input_tokens_seen": 24099675, "step": 1131, "time_per_iteration": 2.8485989570617676 }, { "auxiliary_loss_clip": 0.01223962, "auxiliary_loss_mlp": 0.01046842, "balance_loss_clip": 0.94622898, "balance_loss_mlp": 1.03435493, "epoch": 0.13611495220345096, "flos": 26680766014080.0, "grad_norm": 3.2899439410363227, "language_loss": 0.85825199, "learning_rate": 3.8831455295783994e-06, "loss": 0.88095999, "num_input_tokens_seen": 24118925, "step": 1132, "time_per_iteration": 2.7564826011657715 }, { "auxiliary_loss_clip": 0.0121971, "auxiliary_loss_mlp": 0.01039215, "balance_loss_clip": 0.98505473, "balance_loss_mlp": 1.02702558, "epoch": 0.13623519509409007, "flos": 21686238673920.0, "grad_norm": 1.6156221221877785, "language_loss": 0.73796386, "learning_rate": 3.882883021594864e-06, "loss": 0.76055306, "num_input_tokens_seen": 24137065, "step": 1133, "time_per_iteration": 2.7751646041870117 }, { "auxiliary_loss_clip": 0.01221942, "auxiliary_loss_mlp": 0.01040921, "balance_loss_clip": 0.95110178, "balance_loss_mlp": 1.02826154, "epoch": 0.13635543798472916, "flos": 14830389492480.0, "grad_norm": 2.0832857281675423, "language_loss": 0.86755133, "learning_rate": 3.8826202279794705e-06, "loss": 0.89017993, "num_input_tokens_seen": 24154125, "step": 1134, "time_per_iteration": 2.746945381164551 }, { "auxiliary_loss_clip": 0.01240315, "auxiliary_loss_mlp": 0.01043261, "balance_loss_clip": 1.06866193, "balance_loss_mlp": 1.03144109, "epoch": 0.13647568087536824, "flos": 22890323410560.0, "grad_norm": 1.8939059257850888, "language_loss": 0.70128417, "learning_rate": 3.882357148772085e-06, "loss": 0.7241199, "num_input_tokens_seen": 24171550, "step": 1135, "time_per_iteration": 2.765996217727661 }, { "auxiliary_loss_clip": 0.0121467, "auxiliary_loss_mlp": 0.01035, "balance_loss_clip": 0.94882143, "balance_loss_mlp": 1.02286434, "epoch": 0.13659592376600732, "flos": 19937927998080.0, "grad_norm": 2.311881755721559, "language_loss": 0.84485173, "learning_rate": 3.882093784012617e-06, "loss": 0.86734843, "num_input_tokens_seen": 24190190, "step": 1136, "time_per_iteration": 2.827521800994873 }, { "auxiliary_loss_clip": 0.01226881, "auxiliary_loss_mlp": 0.01047538, "balance_loss_clip": 0.98539865, "balance_loss_mlp": 1.0347352, "epoch": 0.13671616665664643, "flos": 21428579439360.0, "grad_norm": 1.6811740354003715, "language_loss": 0.84324551, "learning_rate": 3.881830133741019e-06, "loss": 0.86598969, "num_input_tokens_seen": 24209055, "step": 1137, "time_per_iteration": 2.7346200942993164 }, { "auxiliary_loss_clip": 0.01231754, "auxiliary_loss_mlp": 0.0104432, "balance_loss_clip": 0.95199001, "balance_loss_mlp": 1.03259003, "epoch": 0.13683640954728551, "flos": 22778138257920.0, "grad_norm": 2.933112257532739, "language_loss": 0.76062942, "learning_rate": 3.881566197997285e-06, "loss": 0.78339016, "num_input_tokens_seen": 24225490, "step": 1138, "time_per_iteration": 4.210619688034058 }, { "auxiliary_loss_clip": 0.01224522, "auxiliary_loss_mlp": 0.01036336, "balance_loss_clip": 0.98995185, "balance_loss_mlp": 1.02480793, "epoch": 0.1369566524379246, "flos": 21725884310400.0, "grad_norm": 1.5945946944379061, "language_loss": 0.75177598, "learning_rate": 3.881301976821456e-06, "loss": 0.7743845, "num_input_tokens_seen": 24245520, "step": 1139, "time_per_iteration": 4.6529927253723145 }, { "auxiliary_loss_clip": 0.01227095, "auxiliary_loss_mlp": 0.01040667, "balance_loss_clip": 1.02759409, "balance_loss_mlp": 1.02891934, "epoch": 0.1370768953285637, "flos": 18624459369600.0, "grad_norm": 1.8010721793572988, "language_loss": 0.90759695, "learning_rate": 3.881037470253612e-06, "loss": 0.93027461, "num_input_tokens_seen": 24265035, "step": 1140, "time_per_iteration": 3.6907455921173096 }, { "auxiliary_loss_clip": 0.01225909, "auxiliary_loss_mlp": 0.01035829, "balance_loss_clip": 0.91374052, "balance_loss_mlp": 1.02429008, "epoch": 0.1371971382192028, "flos": 14939521989120.0, "grad_norm": 2.5829918972225467, "language_loss": 0.79250598, "learning_rate": 3.88077267833388e-06, "loss": 0.81512332, "num_input_tokens_seen": 24281550, "step": 1141, "time_per_iteration": 2.7145824432373047 }, { "auxiliary_loss_clip": 0.0121635, "auxiliary_loss_mlp": 0.01041087, "balance_loss_clip": 0.9072448, "balance_loss_mlp": 1.02961874, "epoch": 0.13731738110984187, "flos": 19023785844480.0, "grad_norm": 2.082071215423518, "language_loss": 0.84159958, "learning_rate": 3.880507601102427e-06, "loss": 0.86417395, "num_input_tokens_seen": 24299485, "step": 1142, "time_per_iteration": 2.837521553039551 }, { "auxiliary_loss_clip": 0.01238828, "auxiliary_loss_mlp": 0.01035183, "balance_loss_clip": 1.07068753, "balance_loss_mlp": 1.02346504, "epoch": 0.13743762400048098, "flos": 18187462506240.0, "grad_norm": 2.0003361935031454, "language_loss": 0.82137144, "learning_rate": 3.880242238599467e-06, "loss": 0.84411156, "num_input_tokens_seen": 24316010, "step": 1143, "time_per_iteration": 2.5987327098846436 }, { "auxiliary_loss_clip": 0.01235052, "auxiliary_loss_mlp": 0.01043091, "balance_loss_clip": 1.0667038, "balance_loss_mlp": 1.03066325, "epoch": 0.13755786689112007, "flos": 21031982398080.0, "grad_norm": 1.8627974949670316, "language_loss": 0.83480388, "learning_rate": 3.879976590865254e-06, "loss": 0.85758531, "num_input_tokens_seen": 24335465, "step": 1144, "time_per_iteration": 2.668107032775879 }, { "auxiliary_loss_clip": 0.01232208, "auxiliary_loss_mlp": 0.01043434, "balance_loss_clip": 0.99076164, "balance_loss_mlp": 1.03131032, "epoch": 0.13767810978175915, "flos": 21360636864000.0, "grad_norm": 3.168099619445142, "language_loss": 0.87245548, "learning_rate": 3.879710657940087e-06, "loss": 0.89521182, "num_input_tokens_seen": 24354415, "step": 1145, "time_per_iteration": 2.782545566558838 }, { "auxiliary_loss_clip": 0.01233386, "auxiliary_loss_mlp": 0.0104458, "balance_loss_clip": 1.02753139, "balance_loss_mlp": 1.03233743, "epoch": 0.13779835267239823, "flos": 30592084861440.0, "grad_norm": 1.8983468749217998, "language_loss": 0.69871825, "learning_rate": 3.879444439864308e-06, "loss": 0.72149789, "num_input_tokens_seen": 24373990, "step": 1146, "time_per_iteration": 2.8548550605773926 }, { "auxiliary_loss_clip": 0.01232128, "auxiliary_loss_mlp": 0.01128901, "balance_loss_clip": 1.02651024, "balance_loss_mlp": 0.0, "epoch": 0.13791859556303734, "flos": 22669867687680.0, "grad_norm": 1.7529941432230547, "language_loss": 0.86095941, "learning_rate": 3.879177936678301e-06, "loss": 0.88456959, "num_input_tokens_seen": 24392995, "step": 1147, "time_per_iteration": 2.7659494876861572 }, { "auxiliary_loss_clip": 0.01232796, "auxiliary_loss_mlp": 0.01048101, "balance_loss_clip": 0.98719925, "balance_loss_mlp": 1.03579843, "epoch": 0.13803883845367643, "flos": 35224166016000.0, "grad_norm": 1.9471018818295216, "language_loss": 0.77537262, "learning_rate": 3.878911148422496e-06, "loss": 0.79818165, "num_input_tokens_seen": 24414470, "step": 1148, "time_per_iteration": 2.7901663780212402 }, { "auxiliary_loss_clip": 0.01234342, "auxiliary_loss_mlp": 0.01045783, "balance_loss_clip": 1.02642131, "balance_loss_mlp": 1.03305149, "epoch": 0.1381590813443155, "flos": 32014542332160.0, "grad_norm": 2.8411161584535125, "language_loss": 0.70637488, "learning_rate": 3.878644075137364e-06, "loss": 0.72917616, "num_input_tokens_seen": 24435120, "step": 1149, "time_per_iteration": 2.7428128719329834 }, { "auxiliary_loss_clip": 0.01199616, "auxiliary_loss_mlp": 0.01045881, "balance_loss_clip": 0.94199896, "balance_loss_mlp": 1.03351867, "epoch": 0.13827932423495462, "flos": 17821855923840.0, "grad_norm": 1.9083550493110222, "language_loss": 0.79177034, "learning_rate": 3.878376716863418e-06, "loss": 0.81422526, "num_input_tokens_seen": 24451420, "step": 1150, "time_per_iteration": 2.681762933731079 }, { "auxiliary_loss_clip": 0.01227082, "auxiliary_loss_mlp": 0.01043801, "balance_loss_clip": 0.98481101, "balance_loss_mlp": 1.03216636, "epoch": 0.1383995671255937, "flos": 19427098728960.0, "grad_norm": 1.9293839154283492, "language_loss": 0.71752781, "learning_rate": 3.878109073641219e-06, "loss": 0.74023664, "num_input_tokens_seen": 24470450, "step": 1151, "time_per_iteration": 2.7709639072418213 }, { "auxiliary_loss_clip": 0.01220997, "auxiliary_loss_mlp": 0.01039126, "balance_loss_clip": 0.90959537, "balance_loss_mlp": 1.02783084, "epoch": 0.13851981001623279, "flos": 28296603331200.0, "grad_norm": 1.7674736667931774, "language_loss": 0.81226337, "learning_rate": 3.877841145511366e-06, "loss": 0.83486462, "num_input_tokens_seen": 24493190, "step": 1152, "time_per_iteration": 2.9140021800994873 }, { "auxiliary_loss_clip": 0.01237489, "auxiliary_loss_mlp": 0.01051125, "balance_loss_clip": 1.02844596, "balance_loss_mlp": 1.03811884, "epoch": 0.13864005290687187, "flos": 21213079793280.0, "grad_norm": 1.6780898199763552, "language_loss": 0.83128405, "learning_rate": 3.8775729325145035e-06, "loss": 0.8541702, "num_input_tokens_seen": 24512425, "step": 1153, "time_per_iteration": 2.918734073638916 }, { "auxiliary_loss_clip": 0.0112882, "auxiliary_loss_mlp": 0.01006281, "balance_loss_clip": 0.92935562, "balance_loss_mlp": 0.99965346, "epoch": 0.13876029579751098, "flos": 71653389413760.0, "grad_norm": 0.8530619383006204, "language_loss": 0.64774299, "learning_rate": 3.877304434691321e-06, "loss": 0.66909397, "num_input_tokens_seen": 24579275, "step": 1154, "time_per_iteration": 3.4279837608337402 }, { "auxiliary_loss_clip": 0.01227739, "auxiliary_loss_mlp": 0.01044888, "balance_loss_clip": 0.94888252, "balance_loss_mlp": 1.03398669, "epoch": 0.13888053868815006, "flos": 21941348042880.0, "grad_norm": 1.7833204429331702, "language_loss": 0.79831445, "learning_rate": 3.877035652082548e-06, "loss": 0.82104069, "num_input_tokens_seen": 24598720, "step": 1155, "time_per_iteration": 2.736335039138794 }, { "auxiliary_loss_clip": 0.01224793, "auxiliary_loss_mlp": 0.01041883, "balance_loss_clip": 0.99055153, "balance_loss_mlp": 1.02922916, "epoch": 0.13900078157878915, "flos": 19608627087360.0, "grad_norm": 1.6632860183262241, "language_loss": 0.85205984, "learning_rate": 3.87676658472896e-06, "loss": 0.87472653, "num_input_tokens_seen": 24617530, "step": 1156, "time_per_iteration": 2.9242546558380127 }, { "auxiliary_loss_clip": 0.01231811, "auxiliary_loss_mlp": 0.01045354, "balance_loss_clip": 1.02461755, "balance_loss_mlp": 1.03212762, "epoch": 0.13912102446942826, "flos": 22638051216000.0, "grad_norm": 1.7682526914397703, "language_loss": 0.85192329, "learning_rate": 3.876497232671372e-06, "loss": 0.87469494, "num_input_tokens_seen": 24637485, "step": 1157, "time_per_iteration": 2.6849324703216553 }, { "auxiliary_loss_clip": 0.01227406, "auxiliary_loss_mlp": 0.0104223, "balance_loss_clip": 0.91015702, "balance_loss_mlp": 1.03021955, "epoch": 0.13924126736006734, "flos": 29643324975360.0, "grad_norm": 2.0789535501356196, "language_loss": 0.83676517, "learning_rate": 3.876227595950647e-06, "loss": 0.85946155, "num_input_tokens_seen": 24656915, "step": 1158, "time_per_iteration": 3.0056419372558594 }, { "auxiliary_loss_clip": 0.01234204, "auxiliary_loss_mlp": 0.010387, "balance_loss_clip": 1.06396103, "balance_loss_mlp": 1.02677917, "epoch": 0.13936151025070642, "flos": 27417653527680.0, "grad_norm": 1.9850027306982112, "language_loss": 0.78704464, "learning_rate": 3.875957674607686e-06, "loss": 0.80977368, "num_input_tokens_seen": 24679190, "step": 1159, "time_per_iteration": 2.711754560470581 }, { "auxiliary_loss_clip": 0.01222026, "auxiliary_loss_mlp": 0.0112972, "balance_loss_clip": 1.02154672, "balance_loss_mlp": 0.0, "epoch": 0.1394817531413455, "flos": 16399326625920.0, "grad_norm": 2.083513749451306, "language_loss": 0.88063622, "learning_rate": 3.8756874686834386e-06, "loss": 0.9041537, "num_input_tokens_seen": 24697405, "step": 1160, "time_per_iteration": 2.6906468868255615 }, { "auxiliary_loss_clip": 0.01239353, "auxiliary_loss_mlp": 0.01129338, "balance_loss_clip": 1.02754235, "balance_loss_mlp": 0.0, "epoch": 0.13960199603198462, "flos": 30922319525760.0, "grad_norm": 1.601970722078597, "language_loss": 0.80351412, "learning_rate": 3.875416978218893e-06, "loss": 0.82720101, "num_input_tokens_seen": 24720600, "step": 1161, "time_per_iteration": 2.8243651390075684 }, { "auxiliary_loss_clip": 0.0123627, "auxiliary_loss_mlp": 0.01046701, "balance_loss_clip": 0.94588006, "balance_loss_mlp": 1.03390408, "epoch": 0.1397222389226237, "flos": 18113773754880.0, "grad_norm": 2.1155828470949363, "language_loss": 0.83105707, "learning_rate": 3.8751462032550835e-06, "loss": 0.85388678, "num_input_tokens_seen": 24737605, "step": 1162, "time_per_iteration": 2.7223563194274902 }, { "auxiliary_loss_clip": 0.01230302, "auxiliary_loss_mlp": 0.0104282, "balance_loss_clip": 0.99252617, "balance_loss_mlp": 1.03128672, "epoch": 0.13984248181326278, "flos": 16872772815360.0, "grad_norm": 3.1799031612485518, "language_loss": 0.82584751, "learning_rate": 3.874875143833085e-06, "loss": 0.84857869, "num_input_tokens_seen": 24755845, "step": 1163, "time_per_iteration": 2.7354133129119873 }, { "auxiliary_loss_clip": 0.01236457, "auxiliary_loss_mlp": 0.01044647, "balance_loss_clip": 1.02719879, "balance_loss_mlp": 1.03191507, "epoch": 0.1399627247039019, "flos": 54121401267840.0, "grad_norm": 2.6709572220624787, "language_loss": 0.68820655, "learning_rate": 3.874603799994019e-06, "loss": 0.71101755, "num_input_tokens_seen": 24779380, "step": 1164, "time_per_iteration": 4.857868909835815 }, { "auxiliary_loss_clip": 0.01212285, "auxiliary_loss_mlp": 0.01048389, "balance_loss_clip": 0.94265711, "balance_loss_mlp": 1.03584194, "epoch": 0.14008296759454097, "flos": 11765521618560.0, "grad_norm": 1.880410760292643, "language_loss": 0.86905593, "learning_rate": 3.874332171779046e-06, "loss": 0.89166272, "num_input_tokens_seen": 24794260, "step": 1165, "time_per_iteration": 3.6649887561798096 }, { "auxiliary_loss_clip": 0.01224096, "auxiliary_loss_mlp": 0.01040315, "balance_loss_clip": 0.94626725, "balance_loss_mlp": 1.02773869, "epoch": 0.14020321048518006, "flos": 22017514832640.0, "grad_norm": 1.7865717118075302, "language_loss": 0.75735497, "learning_rate": 3.874060259229373e-06, "loss": 0.77999908, "num_input_tokens_seen": 24815835, "step": 1166, "time_per_iteration": 3.762666940689087 }, { "auxiliary_loss_clip": 0.01239143, "auxiliary_loss_mlp": 0.01040221, "balance_loss_clip": 1.0291748, "balance_loss_mlp": 1.02720308, "epoch": 0.14032345337581917, "flos": 23404313076480.0, "grad_norm": 2.5112205189004726, "language_loss": 0.93775666, "learning_rate": 3.873788062386249e-06, "loss": 0.96055037, "num_input_tokens_seen": 24834095, "step": 1167, "time_per_iteration": 2.748765468597412 }, { "auxiliary_loss_clip": 0.0123292, "auxiliary_loss_mlp": 0.01052046, "balance_loss_clip": 0.95149064, "balance_loss_mlp": 1.03896832, "epoch": 0.14044369626645825, "flos": 29645767100160.0, "grad_norm": 2.3024234663252106, "language_loss": 0.81976414, "learning_rate": 3.873515581290965e-06, "loss": 0.84261382, "num_input_tokens_seen": 24858900, "step": 1168, "time_per_iteration": 2.841552972793579 }, { "auxiliary_loss_clip": 0.0123209, "auxiliary_loss_mlp": 0.01047999, "balance_loss_clip": 0.95314968, "balance_loss_mlp": 1.03511822, "epoch": 0.14056393915709733, "flos": 18332972501760.0, "grad_norm": 2.148075943429343, "language_loss": 0.76296067, "learning_rate": 3.8732428159848575e-06, "loss": 0.78576159, "num_input_tokens_seen": 24877875, "step": 1169, "time_per_iteration": 2.7504730224609375 }, { "auxiliary_loss_clip": 0.01233114, "auxiliary_loss_mlp": 0.010522, "balance_loss_clip": 1.02763677, "balance_loss_mlp": 1.03869367, "epoch": 0.14068418204773642, "flos": 26687517770880.0, "grad_norm": 1.7109608932952798, "language_loss": 0.78138393, "learning_rate": 3.872969766509304e-06, "loss": 0.80423701, "num_input_tokens_seen": 24898430, "step": 1170, "time_per_iteration": 2.710857629776001 }, { "auxiliary_loss_clip": 0.01121227, "auxiliary_loss_mlp": 0.01011628, "balance_loss_clip": 0.92059386, "balance_loss_mlp": 1.00538123, "epoch": 0.14080442493837553, "flos": 65259314501760.0, "grad_norm": 0.7671432061988768, "language_loss": 0.55671811, "learning_rate": 3.872696432905726e-06, "loss": 0.57804668, "num_input_tokens_seen": 24959250, "step": 1171, "time_per_iteration": 3.2900147438049316 }, { "auxiliary_loss_clip": 0.01237119, "auxiliary_loss_mlp": 0.01038355, "balance_loss_clip": 1.02412748, "balance_loss_mlp": 1.02549815, "epoch": 0.1409246678290146, "flos": 25776715582080.0, "grad_norm": 2.171342587209219, "language_loss": 0.7133835, "learning_rate": 3.872422815215589e-06, "loss": 0.73613822, "num_input_tokens_seen": 24978330, "step": 1172, "time_per_iteration": 2.8074417114257812 }, { "auxiliary_loss_clip": 0.01221075, "auxiliary_loss_mlp": 0.01045533, "balance_loss_clip": 1.01961005, "balance_loss_mlp": 1.03301585, "epoch": 0.1410449107196537, "flos": 21868521217920.0, "grad_norm": 1.9153391343439314, "language_loss": 0.7427392, "learning_rate": 3.8721489134803994e-06, "loss": 0.7654053, "num_input_tokens_seen": 24997120, "step": 1173, "time_per_iteration": 2.7628114223480225 }, { "auxiliary_loss_clip": 0.01228902, "auxiliary_loss_mlp": 0.01047142, "balance_loss_clip": 1.02458525, "balance_loss_mlp": 1.03460121, "epoch": 0.1411651536102928, "flos": 16684133564160.0, "grad_norm": 2.309797097935536, "language_loss": 0.72327018, "learning_rate": 3.871874727741707e-06, "loss": 0.74603063, "num_input_tokens_seen": 25014350, "step": 1174, "time_per_iteration": 2.7211568355560303 }, { "auxiliary_loss_clip": 0.0122806, "auxiliary_loss_mlp": 0.01039337, "balance_loss_clip": 1.02630663, "balance_loss_mlp": 1.02662385, "epoch": 0.1412853965009319, "flos": 20992264934400.0, "grad_norm": 1.758258228518568, "language_loss": 0.9668386, "learning_rate": 3.871600258041108e-06, "loss": 0.98951256, "num_input_tokens_seen": 25033875, "step": 1175, "time_per_iteration": 2.7194104194641113 }, { "auxiliary_loss_clip": 0.01218394, "auxiliary_loss_mlp": 0.01043698, "balance_loss_clip": 0.98250741, "balance_loss_mlp": 1.0317775, "epoch": 0.14140563939157097, "flos": 20335279224960.0, "grad_norm": 3.7799219353928115, "language_loss": 0.85625136, "learning_rate": 3.871325504420238e-06, "loss": 0.87887233, "num_input_tokens_seen": 25052865, "step": 1176, "time_per_iteration": 2.716390371322632 }, { "auxiliary_loss_clip": 0.01234278, "auxiliary_loss_mlp": 0.01047257, "balance_loss_clip": 1.06496596, "balance_loss_mlp": 1.03496051, "epoch": 0.14152588228221005, "flos": 21068826773760.0, "grad_norm": 1.9085672999010814, "language_loss": 0.8135708, "learning_rate": 3.871050466920776e-06, "loss": 0.83638608, "num_input_tokens_seen": 25072770, "step": 1177, "time_per_iteration": 2.694099187850952 }, { "auxiliary_loss_clip": 0.01219761, "auxiliary_loss_mlp": 0.01041862, "balance_loss_clip": 0.94771725, "balance_loss_mlp": 1.02950573, "epoch": 0.14164612517284916, "flos": 18223157646720.0, "grad_norm": 1.763546391387733, "language_loss": 0.79916131, "learning_rate": 3.870775145584447e-06, "loss": 0.82177758, "num_input_tokens_seen": 25090550, "step": 1178, "time_per_iteration": 2.8142940998077393 }, { "auxiliary_loss_clip": 0.0123615, "auxiliary_loss_mlp": 0.01042132, "balance_loss_clip": 0.98531479, "balance_loss_mlp": 1.0288105, "epoch": 0.14176636806348825, "flos": 22744454279040.0, "grad_norm": 2.6172135906587606, "language_loss": 0.64679909, "learning_rate": 3.8704995404530145e-06, "loss": 0.66958195, "num_input_tokens_seen": 25106175, "step": 1179, "time_per_iteration": 2.7478079795837402 }, { "auxiliary_loss_clip": 0.01230833, "auxiliary_loss_mlp": 0.01044476, "balance_loss_clip": 1.06592572, "balance_loss_mlp": 1.03303838, "epoch": 0.14188661095412733, "flos": 22091095843200.0, "grad_norm": 1.6466897164928096, "language_loss": 0.85094035, "learning_rate": 3.87022365156829e-06, "loss": 0.87369347, "num_input_tokens_seen": 25126890, "step": 1180, "time_per_iteration": 2.7426059246063232 }, { "auxiliary_loss_clip": 0.01220643, "auxiliary_loss_mlp": 0.01043509, "balance_loss_clip": 0.83200169, "balance_loss_mlp": 1.03131986, "epoch": 0.14200685384476644, "flos": 24352390604160.0, "grad_norm": 2.2658592817786687, "language_loss": 0.81105995, "learning_rate": 3.869947478972123e-06, "loss": 0.83370149, "num_input_tokens_seen": 25147915, "step": 1181, "time_per_iteration": 3.0602848529815674 }, { "auxiliary_loss_clip": 0.0121993, "auxiliary_loss_mlp": 0.0104488, "balance_loss_clip": 1.0217396, "balance_loss_mlp": 1.03189206, "epoch": 0.14212709673540552, "flos": 24022048199040.0, "grad_norm": 1.8953585622275928, "language_loss": 0.8206417, "learning_rate": 3.869671022706412e-06, "loss": 0.84328979, "num_input_tokens_seen": 25166645, "step": 1182, "time_per_iteration": 2.9852817058563232 }, { "auxiliary_loss_clip": 0.01205645, "auxiliary_loss_mlp": 0.01041268, "balance_loss_clip": 0.90482986, "balance_loss_mlp": 1.02840519, "epoch": 0.1422473396260446, "flos": 26431797870720.0, "grad_norm": 1.8824117680012327, "language_loss": 0.65039021, "learning_rate": 3.869394282813092e-06, "loss": 0.67285937, "num_input_tokens_seen": 25185845, "step": 1183, "time_per_iteration": 2.8784945011138916 }, { "auxiliary_loss_clip": 0.01233315, "auxiliary_loss_mlp": 0.01041885, "balance_loss_clip": 0.94633901, "balance_loss_mlp": 1.02938557, "epoch": 0.1423675825166837, "flos": 17055306754560.0, "grad_norm": 2.3233128328616037, "language_loss": 0.89540172, "learning_rate": 3.869117259334147e-06, "loss": 0.91815376, "num_input_tokens_seen": 25203770, "step": 1184, "time_per_iteration": 2.854956865310669 }, { "auxiliary_loss_clip": 0.01225897, "auxiliary_loss_mlp": 0.01037218, "balance_loss_clip": 1.02572107, "balance_loss_mlp": 1.02440274, "epoch": 0.1424878254073228, "flos": 17929480049280.0, "grad_norm": 1.7934926377506148, "language_loss": 0.8219499, "learning_rate": 3.868839952311599e-06, "loss": 0.84458101, "num_input_tokens_seen": 25221725, "step": 1185, "time_per_iteration": 2.869370937347412 }, { "auxiliary_loss_clip": 0.01225722, "auxiliary_loss_mlp": 0.01045402, "balance_loss_clip": 0.98793972, "balance_loss_mlp": 1.03261709, "epoch": 0.14260806829796188, "flos": 20303606407680.0, "grad_norm": 2.8931316025083595, "language_loss": 0.80484235, "learning_rate": 3.868562361787516e-06, "loss": 0.82755363, "num_input_tokens_seen": 25240855, "step": 1186, "time_per_iteration": 2.774563789367676 }, { "auxiliary_loss_clip": 0.01213628, "auxiliary_loss_mlp": 0.01036461, "balance_loss_clip": 0.82751083, "balance_loss_mlp": 1.0244031, "epoch": 0.14272831118860096, "flos": 23185724860800.0, "grad_norm": 1.9406424695908273, "language_loss": 0.69160384, "learning_rate": 3.868284487804009e-06, "loss": 0.71410477, "num_input_tokens_seen": 25260085, "step": 1187, "time_per_iteration": 2.9536516666412354 }, { "auxiliary_loss_clip": 0.01231829, "auxiliary_loss_mlp": 0.0103817, "balance_loss_clip": 0.98635006, "balance_loss_mlp": 1.0258559, "epoch": 0.14284855407924008, "flos": 27232210586880.0, "grad_norm": 1.6678856268146347, "language_loss": 0.77938604, "learning_rate": 3.86800633040323e-06, "loss": 0.80208606, "num_input_tokens_seen": 25280675, "step": 1188, "time_per_iteration": 3.2001824378967285 }, { "auxiliary_loss_clip": 0.01229589, "auxiliary_loss_mlp": 0.01128967, "balance_loss_clip": 0.99180132, "balance_loss_mlp": 0.0, "epoch": 0.14296879696987916, "flos": 28184202696960.0, "grad_norm": 8.837652653515654, "language_loss": 0.78506064, "learning_rate": 3.867727889627376e-06, "loss": 0.8086462, "num_input_tokens_seen": 25300290, "step": 1189, "time_per_iteration": 2.853957414627075 }, { "auxiliary_loss_clip": 0.01212236, "auxiliary_loss_mlp": 0.01035318, "balance_loss_clip": 0.94677943, "balance_loss_mlp": 1.02297401, "epoch": 0.14308903986051824, "flos": 19390290266880.0, "grad_norm": 2.289679515863155, "language_loss": 0.78395879, "learning_rate": 3.867449165518687e-06, "loss": 0.80643427, "num_input_tokens_seen": 25316760, "step": 1190, "time_per_iteration": 3.942830801010132 }, { "auxiliary_loss_clip": 0.01233982, "auxiliary_loss_mlp": 0.01129234, "balance_loss_clip": 1.06310439, "balance_loss_mlp": 0.0, "epoch": 0.14320928275115732, "flos": 17457506317440.0, "grad_norm": 1.8339305502273286, "language_loss": 0.71477395, "learning_rate": 3.867170158119444e-06, "loss": 0.73840612, "num_input_tokens_seen": 25335760, "step": 1191, "time_per_iteration": 3.6408591270446777 }, { "auxiliary_loss_clip": 0.0123567, "auxiliary_loss_mlp": 0.01043266, "balance_loss_clip": 1.06557274, "balance_loss_mlp": 1.0312264, "epoch": 0.14332952564179643, "flos": 21466070259840.0, "grad_norm": 1.983929953568458, "language_loss": 0.7529037, "learning_rate": 3.866890867471972e-06, "loss": 0.77569306, "num_input_tokens_seen": 25354230, "step": 1192, "time_per_iteration": 3.652254581451416 }, { "auxiliary_loss_clip": 0.01216464, "auxiliary_loss_mlp": 0.0105073, "balance_loss_clip": 0.97976363, "balance_loss_mlp": 1.03870177, "epoch": 0.14344976853243552, "flos": 16396992241920.0, "grad_norm": 2.4230533222140234, "language_loss": 0.89756083, "learning_rate": 3.86661129361864e-06, "loss": 0.92023277, "num_input_tokens_seen": 25368720, "step": 1193, "time_per_iteration": 2.8154423236846924 }, { "auxiliary_loss_clip": 0.01224658, "auxiliary_loss_mlp": 0.01041273, "balance_loss_clip": 0.98580194, "balance_loss_mlp": 1.02881551, "epoch": 0.1435700114230746, "flos": 18916736336640.0, "grad_norm": 3.0224923476707533, "language_loss": 0.85999954, "learning_rate": 3.866331436601859e-06, "loss": 0.88265884, "num_input_tokens_seen": 25386715, "step": 1194, "time_per_iteration": 2.8087003231048584 }, { "auxiliary_loss_clip": 0.01236821, "auxiliary_loss_mlp": 0.01044043, "balance_loss_clip": 1.06781828, "balance_loss_mlp": 1.0321579, "epoch": 0.1436902543137137, "flos": 19755394058880.0, "grad_norm": 2.1649216905026045, "language_loss": 0.73580289, "learning_rate": 3.866051296464083e-06, "loss": 0.7586115, "num_input_tokens_seen": 25405550, "step": 1195, "time_per_iteration": 2.7245981693267822 }, { "auxiliary_loss_clip": 0.01235149, "auxiliary_loss_mlp": 0.0112891, "balance_loss_clip": 1.06429625, "balance_loss_mlp": 0.0, "epoch": 0.1438104972043528, "flos": 14684807669760.0, "grad_norm": 5.757984740221322, "language_loss": 0.85460615, "learning_rate": 3.86577087324781e-06, "loss": 0.87824678, "num_input_tokens_seen": 25422040, "step": 1196, "time_per_iteration": 2.7068254947662354 }, { "auxiliary_loss_clip": 0.01229552, "auxiliary_loss_mlp": 0.01040866, "balance_loss_clip": 1.02970576, "balance_loss_mlp": 1.02923083, "epoch": 0.14393074009499188, "flos": 17092330698240.0, "grad_norm": 3.061823811143228, "language_loss": 0.77205956, "learning_rate": 3.865490166995578e-06, "loss": 0.79476374, "num_input_tokens_seen": 25440270, "step": 1197, "time_per_iteration": 2.674532651901245 }, { "auxiliary_loss_clip": 0.0122931, "auxiliary_loss_mlp": 0.01046056, "balance_loss_clip": 1.02655315, "balance_loss_mlp": 1.0340755, "epoch": 0.144050982985631, "flos": 30476200608000.0, "grad_norm": 2.2038799514953564, "language_loss": 0.84365523, "learning_rate": 3.86520917774997e-06, "loss": 0.86640888, "num_input_tokens_seen": 25459705, "step": 1198, "time_per_iteration": 2.764723062515259 }, { "auxiliary_loss_clip": 0.01228417, "auxiliary_loss_mlp": 0.01042838, "balance_loss_clip": 1.02750301, "balance_loss_mlp": 1.03120351, "epoch": 0.14417122587627007, "flos": 17858484817920.0, "grad_norm": 2.1122990409795133, "language_loss": 0.7508167, "learning_rate": 3.864927905553614e-06, "loss": 0.77352929, "num_input_tokens_seen": 25477615, "step": 1199, "time_per_iteration": 2.742495536804199 }, { "auxiliary_loss_clip": 0.0122201, "auxiliary_loss_mlp": 0.01033416, "balance_loss_clip": 0.9483723, "balance_loss_mlp": 1.02236557, "epoch": 0.14429146876690915, "flos": 21613914639360.0, "grad_norm": 1.7149227423759787, "language_loss": 0.88839769, "learning_rate": 3.8646463504491765e-06, "loss": 0.91095197, "num_input_tokens_seen": 25497750, "step": 1200, "time_per_iteration": 2.8267555236816406 }, { "auxiliary_loss_clip": 0.01235088, "auxiliary_loss_mlp": 0.01051498, "balance_loss_clip": 1.02888596, "balance_loss_mlp": 1.03920734, "epoch": 0.14441171165754824, "flos": 23258120722560.0, "grad_norm": 3.9496833556694173, "language_loss": 0.83667713, "learning_rate": 3.8643645124793705e-06, "loss": 0.85954297, "num_input_tokens_seen": 25516650, "step": 1201, "time_per_iteration": 2.737396240234375 }, { "auxiliary_loss_clip": 0.01228297, "auxiliary_loss_mlp": 0.01038308, "balance_loss_clip": 1.02552533, "balance_loss_mlp": 1.02666688, "epoch": 0.14453195454818735, "flos": 42854213963520.0, "grad_norm": 1.728345288882742, "language_loss": 0.74786675, "learning_rate": 3.8640823916869515e-06, "loss": 0.77053273, "num_input_tokens_seen": 25540960, "step": 1202, "time_per_iteration": 2.887930393218994 }, { "auxiliary_loss_clip": 0.0123362, "auxiliary_loss_mlp": 0.01039017, "balance_loss_clip": 1.06557298, "balance_loss_mlp": 1.02770424, "epoch": 0.14465219743882643, "flos": 27235873774080.0, "grad_norm": 1.517728823874865, "language_loss": 0.78431201, "learning_rate": 3.863799988114714e-06, "loss": 0.80703843, "num_input_tokens_seen": 25562990, "step": 1203, "time_per_iteration": 2.6822500228881836 }, { "auxiliary_loss_clip": 0.01233668, "auxiliary_loss_mlp": 0.01048255, "balance_loss_clip": 1.06409097, "balance_loss_mlp": 1.03587556, "epoch": 0.1447724403294655, "flos": 16690705752960.0, "grad_norm": 2.168974992994287, "language_loss": 0.70439959, "learning_rate": 3.863517301805502e-06, "loss": 0.72721887, "num_input_tokens_seen": 25581380, "step": 1204, "time_per_iteration": 2.5870983600616455 }, { "auxiliary_loss_clip": 0.01228572, "auxiliary_loss_mlp": 0.01043198, "balance_loss_clip": 0.94999838, "balance_loss_mlp": 1.03042507, "epoch": 0.14489268322010462, "flos": 20073741321600.0, "grad_norm": 2.9454129117768217, "language_loss": 0.97092819, "learning_rate": 3.863234332802196e-06, "loss": 0.99364597, "num_input_tokens_seen": 25593585, "step": 1205, "time_per_iteration": 2.682025194168091 }, { "auxiliary_loss_clip": 0.01220847, "auxiliary_loss_mlp": 0.01042242, "balance_loss_clip": 0.98541236, "balance_loss_mlp": 1.03062487, "epoch": 0.1450129261107437, "flos": 27125627955840.0, "grad_norm": 2.242179404391003, "language_loss": 0.73716825, "learning_rate": 3.862951081147723e-06, "loss": 0.75979912, "num_input_tokens_seen": 25613750, "step": 1206, "time_per_iteration": 2.642958879470825 }, { "auxiliary_loss_clip": 0.01234468, "auxiliary_loss_mlp": 0.01034511, "balance_loss_clip": 1.02860308, "balance_loss_mlp": 1.0231626, "epoch": 0.1451331690013828, "flos": 25702344472320.0, "grad_norm": 3.0559080999358943, "language_loss": 0.7801221, "learning_rate": 3.862667546885053e-06, "loss": 0.80281186, "num_input_tokens_seen": 25632300, "step": 1207, "time_per_iteration": 2.5828306674957275 }, { "auxiliary_loss_clip": 0.01230786, "auxiliary_loss_mlp": 0.01034195, "balance_loss_clip": 0.98569381, "balance_loss_mlp": 1.02190471, "epoch": 0.14525341189202187, "flos": 25737393168000.0, "grad_norm": 1.8770510841364272, "language_loss": 0.7386142, "learning_rate": 3.8623837300571965e-06, "loss": 0.76126397, "num_input_tokens_seen": 25651285, "step": 1208, "time_per_iteration": 2.6467573642730713 }, { "auxiliary_loss_clip": 0.01235069, "auxiliary_loss_mlp": 0.01047225, "balance_loss_clip": 1.06623495, "balance_loss_mlp": 1.03564441, "epoch": 0.14537365478266098, "flos": 23073898844160.0, "grad_norm": 1.7853944121593608, "language_loss": 0.84207112, "learning_rate": 3.8620996307072085e-06, "loss": 0.86489403, "num_input_tokens_seen": 25671990, "step": 1209, "time_per_iteration": 2.613157033920288 }, { "auxiliary_loss_clip": 0.012285, "auxiliary_loss_mlp": 0.01038176, "balance_loss_clip": 0.94664395, "balance_loss_mlp": 1.02630305, "epoch": 0.14549389767330007, "flos": 20595021448320.0, "grad_norm": 2.0886725112859303, "language_loss": 0.64578456, "learning_rate": 3.861815248878188e-06, "loss": 0.66845131, "num_input_tokens_seen": 25689475, "step": 1210, "time_per_iteration": 2.723633289337158 }, { "auxiliary_loss_clip": 0.01224018, "auxiliary_loss_mlp": 0.01035788, "balance_loss_clip": 0.9891625, "balance_loss_mlp": 1.02488089, "epoch": 0.14561414056393915, "flos": 15121804533120.0, "grad_norm": 2.2658890327167924, "language_loss": 0.79625148, "learning_rate": 3.861530584613274e-06, "loss": 0.81884956, "num_input_tokens_seen": 25707475, "step": 1211, "time_per_iteration": 2.6491146087646484 }, { "auxiliary_loss_clip": 0.01233557, "auxiliary_loss_mlp": 0.01128699, "balance_loss_clip": 1.02795815, "balance_loss_mlp": 0.0, "epoch": 0.14573438345457826, "flos": 19427493778560.0, "grad_norm": 2.1008383311213406, "language_loss": 0.82173777, "learning_rate": 3.86124563795565e-06, "loss": 0.84536028, "num_input_tokens_seen": 25726290, "step": 1212, "time_per_iteration": 2.627596616744995 }, { "auxiliary_loss_clip": 0.01233174, "auxiliary_loss_mlp": 0.01036935, "balance_loss_clip": 1.06664348, "balance_loss_mlp": 1.02484739, "epoch": 0.14585462634521734, "flos": 24828422572800.0, "grad_norm": 2.6130040472406897, "language_loss": 0.7033968, "learning_rate": 3.860960408948543e-06, "loss": 0.72609788, "num_input_tokens_seen": 25748040, "step": 1213, "time_per_iteration": 2.755383253097534 }, { "auxiliary_loss_clip": 0.01223051, "auxiliary_loss_mlp": 0.01038917, "balance_loss_clip": 1.02611125, "balance_loss_mlp": 1.02755046, "epoch": 0.14597486923585642, "flos": 15448627405440.0, "grad_norm": 2.4196431347032585, "language_loss": 0.89976943, "learning_rate": 3.860674897635222e-06, "loss": 0.92238903, "num_input_tokens_seen": 25764525, "step": 1214, "time_per_iteration": 2.617854118347168 }, { "auxiliary_loss_clip": 0.01233858, "auxiliary_loss_mlp": 0.01040756, "balance_loss_clip": 1.02981055, "balance_loss_mlp": 1.02838755, "epoch": 0.1460951121264955, "flos": 16655154266880.0, "grad_norm": 2.11431975963493, "language_loss": 0.83568048, "learning_rate": 3.860389104058998e-06, "loss": 0.85842657, "num_input_tokens_seen": 25782755, "step": 1215, "time_per_iteration": 2.6128957271575928 }, { "auxiliary_loss_clip": 0.01224135, "auxiliary_loss_mlp": 0.01034492, "balance_loss_clip": 0.9846819, "balance_loss_mlp": 1.02297056, "epoch": 0.14621535501713462, "flos": 24863291700480.0, "grad_norm": 2.070111295788625, "language_loss": 0.72765291, "learning_rate": 3.860103028263227e-06, "loss": 0.75023913, "num_input_tokens_seen": 25805860, "step": 1216, "time_per_iteration": 4.590798377990723 }, { "auxiliary_loss_clip": 0.01216886, "auxiliary_loss_mlp": 0.0103708, "balance_loss_clip": 0.90453827, "balance_loss_mlp": 1.02540922, "epoch": 0.1463355979077737, "flos": 25228000442880.0, "grad_norm": 2.8432337311121443, "language_loss": 0.70153594, "learning_rate": 3.859816670291304e-06, "loss": 0.72407562, "num_input_tokens_seen": 25824955, "step": 1217, "time_per_iteration": 3.5294976234436035 }, { "auxiliary_loss_clip": 0.01213317, "auxiliary_loss_mlp": 0.01048006, "balance_loss_clip": 0.86826265, "balance_loss_mlp": 1.03634143, "epoch": 0.14645584079841278, "flos": 22054143726720.0, "grad_norm": 3.5671324649027465, "language_loss": 0.89938664, "learning_rate": 3.859530030186672e-06, "loss": 0.92199993, "num_input_tokens_seen": 25841965, "step": 1218, "time_per_iteration": 3.7408287525177 }, { "auxiliary_loss_clip": 0.01232232, "auxiliary_loss_mlp": 0.0104268, "balance_loss_clip": 0.98898506, "balance_loss_mlp": 1.03057468, "epoch": 0.1465760836890519, "flos": 23623870959360.0, "grad_norm": 2.4194191791030537, "language_loss": 0.82190394, "learning_rate": 3.859243107992813e-06, "loss": 0.84465313, "num_input_tokens_seen": 25860770, "step": 1219, "time_per_iteration": 2.6756644248962402 }, { "auxiliary_loss_clip": 0.01225706, "auxiliary_loss_mlp": 0.01041218, "balance_loss_clip": 0.9422909, "balance_loss_mlp": 1.02923751, "epoch": 0.14669632657969098, "flos": 37407893356800.0, "grad_norm": 7.381072012183392, "language_loss": 0.78219306, "learning_rate": 3.858955903753252e-06, "loss": 0.80486226, "num_input_tokens_seen": 25879410, "step": 1220, "time_per_iteration": 2.880070924758911 }, { "auxiliary_loss_clip": 0.01232322, "auxiliary_loss_mlp": 0.01043362, "balance_loss_clip": 1.02671731, "balance_loss_mlp": 1.03203726, "epoch": 0.14681656947033006, "flos": 28365910623360.0, "grad_norm": 1.4949828831273027, "language_loss": 0.83492076, "learning_rate": 3.858668417511559e-06, "loss": 0.85767758, "num_input_tokens_seen": 25902160, "step": 1221, "time_per_iteration": 2.77994704246521 }, { "auxiliary_loss_clip": 0.01229412, "auxiliary_loss_mlp": 0.01036665, "balance_loss_clip": 0.98805285, "balance_loss_mlp": 1.02595401, "epoch": 0.14693681236096917, "flos": 18479488078080.0, "grad_norm": 2.0258291403406266, "language_loss": 0.76237994, "learning_rate": 3.8583806493113445e-06, "loss": 0.78504068, "num_input_tokens_seen": 25920505, "step": 1222, "time_per_iteration": 2.6642942428588867 }, { "auxiliary_loss_clip": 0.01228519, "auxiliary_loss_mlp": 0.01033426, "balance_loss_clip": 1.02469063, "balance_loss_mlp": 1.02161837, "epoch": 0.14705705525160825, "flos": 20777806782720.0, "grad_norm": 2.1604243589624463, "language_loss": 0.82430255, "learning_rate": 3.858092599196263e-06, "loss": 0.84692204, "num_input_tokens_seen": 25938460, "step": 1223, "time_per_iteration": 2.6723389625549316 }, { "auxiliary_loss_clip": 0.01226276, "auxiliary_loss_mlp": 0.01035107, "balance_loss_clip": 1.02504897, "balance_loss_mlp": 1.02375209, "epoch": 0.14717729814224734, "flos": 29932944336000.0, "grad_norm": 2.248452549651491, "language_loss": 0.82230866, "learning_rate": 3.857804267210012e-06, "loss": 0.84492254, "num_input_tokens_seen": 25957760, "step": 1224, "time_per_iteration": 2.7076826095581055 }, { "auxiliary_loss_clip": 0.01212332, "auxiliary_loss_mlp": 0.01035615, "balance_loss_clip": 0.94428015, "balance_loss_mlp": 1.02485013, "epoch": 0.14729754103288642, "flos": 20047491457920.0, "grad_norm": 2.1471548554948874, "language_loss": 0.88015819, "learning_rate": 3.857515653396331e-06, "loss": 0.9026376, "num_input_tokens_seen": 25974970, "step": 1225, "time_per_iteration": 2.682464361190796 }, { "auxiliary_loss_clip": 0.01225325, "auxiliary_loss_mlp": 0.01033729, "balance_loss_clip": 0.90851253, "balance_loss_mlp": 1.02205825, "epoch": 0.14741778392352553, "flos": 19281516906240.0, "grad_norm": 2.4096376149682404, "language_loss": 0.87028837, "learning_rate": 3.857226757799002e-06, "loss": 0.89287889, "num_input_tokens_seen": 25992525, "step": 1226, "time_per_iteration": 2.752952814102173 }, { "auxiliary_loss_clip": 0.01226591, "auxiliary_loss_mlp": 0.01049818, "balance_loss_clip": 0.98437667, "balance_loss_mlp": 1.03875518, "epoch": 0.1475380268141646, "flos": 25411108999680.0, "grad_norm": 2.76426439244477, "language_loss": 0.74600625, "learning_rate": 3.85693758046185e-06, "loss": 0.76877034, "num_input_tokens_seen": 26010815, "step": 1227, "time_per_iteration": 2.68622088432312 }, { "auxiliary_loss_clip": 0.01234935, "auxiliary_loss_mlp": 0.01050071, "balance_loss_clip": 1.06854463, "balance_loss_mlp": 1.03806674, "epoch": 0.1476582697048037, "flos": 20847652778880.0, "grad_norm": 1.849062817960655, "language_loss": 0.82637578, "learning_rate": 3.8566481214287435e-06, "loss": 0.84922588, "num_input_tokens_seen": 26028935, "step": 1228, "time_per_iteration": 2.6700174808502197 }, { "auxiliary_loss_clip": 0.0121307, "auxiliary_loss_mlp": 0.01041421, "balance_loss_clip": 0.94237214, "balance_loss_mlp": 1.02850497, "epoch": 0.1477785125954428, "flos": 14028109269120.0, "grad_norm": 2.2766283184056926, "language_loss": 0.90532613, "learning_rate": 3.8563583807435935e-06, "loss": 0.92787105, "num_input_tokens_seen": 26045080, "step": 1229, "time_per_iteration": 2.733867883682251 }, { "auxiliary_loss_clip": 0.01231905, "auxiliary_loss_mlp": 0.01128948, "balance_loss_clip": 1.02485704, "balance_loss_mlp": 0.0, "epoch": 0.1478987554860819, "flos": 20516699842560.0, "grad_norm": 1.9313372982048622, "language_loss": 0.7772783, "learning_rate": 3.856068358450353e-06, "loss": 0.80088687, "num_input_tokens_seen": 26065030, "step": 1230, "time_per_iteration": 2.736259698867798 }, { "auxiliary_loss_clip": 0.01224474, "auxiliary_loss_mlp": 0.01040938, "balance_loss_clip": 0.98979008, "balance_loss_mlp": 1.0296489, "epoch": 0.14801899837672097, "flos": 17857012360320.0, "grad_norm": 1.846768637217415, "language_loss": 0.8579855, "learning_rate": 3.8557780545930186e-06, "loss": 0.88063961, "num_input_tokens_seen": 26083445, "step": 1231, "time_per_iteration": 2.6625282764434814 }, { "auxiliary_loss_clip": 0.01224553, "auxiliary_loss_mlp": 0.01037764, "balance_loss_clip": 0.98844635, "balance_loss_mlp": 1.02586102, "epoch": 0.14813924126736006, "flos": 20881408584960.0, "grad_norm": 2.0508242991571706, "language_loss": 0.79421937, "learning_rate": 3.855487469215628e-06, "loss": 0.81684256, "num_input_tokens_seen": 26102375, "step": 1232, "time_per_iteration": 2.7617204189300537 }, { "auxiliary_loss_clip": 0.0122681, "auxiliary_loss_mlp": 0.01035389, "balance_loss_clip": 0.95150286, "balance_loss_mlp": 1.02323532, "epoch": 0.14825948415799917, "flos": 37414070496000.0, "grad_norm": 2.394953443484635, "language_loss": 0.72260803, "learning_rate": 3.855196602362264e-06, "loss": 0.74523002, "num_input_tokens_seen": 26125295, "step": 1233, "time_per_iteration": 2.9862589836120605 }, { "auxiliary_loss_clip": 0.01232056, "auxiliary_loss_mlp": 0.01037627, "balance_loss_clip": 1.02637339, "balance_loss_mlp": 1.02623713, "epoch": 0.14837972704863825, "flos": 22014641744640.0, "grad_norm": 1.9741995416251255, "language_loss": 0.94095552, "learning_rate": 3.854905454077051e-06, "loss": 0.96365237, "num_input_tokens_seen": 26142905, "step": 1234, "time_per_iteration": 2.66083025932312 }, { "auxiliary_loss_clip": 0.01210205, "auxiliary_loss_mlp": 0.01042659, "balance_loss_clip": 0.8269667, "balance_loss_mlp": 1.03084588, "epoch": 0.14849996993927733, "flos": 20996323171200.0, "grad_norm": 1.773315054978524, "language_loss": 0.88485491, "learning_rate": 3.854614024404155e-06, "loss": 0.90738356, "num_input_tokens_seen": 26161215, "step": 1235, "time_per_iteration": 2.9048283100128174 }, { "auxiliary_loss_clip": 0.01210796, "auxiliary_loss_mlp": 0.01039214, "balance_loss_clip": 0.98251009, "balance_loss_mlp": 1.02793646, "epoch": 0.14862021282991644, "flos": 20047994248320.0, "grad_norm": 2.0118344211743797, "language_loss": 0.89131862, "learning_rate": 3.8543223133877865e-06, "loss": 0.91381872, "num_input_tokens_seen": 26179810, "step": 1236, "time_per_iteration": 2.8416314125061035 }, { "auxiliary_loss_clip": 0.01205727, "auxiliary_loss_mlp": 0.01034508, "balance_loss_clip": 0.98139668, "balance_loss_mlp": 1.02208614, "epoch": 0.14874045572055553, "flos": 22712027276160.0, "grad_norm": 1.7816771621312106, "language_loss": 0.88499963, "learning_rate": 3.854030321072198e-06, "loss": 0.90740198, "num_input_tokens_seen": 26199715, "step": 1237, "time_per_iteration": 2.732807159423828 }, { "auxiliary_loss_clip": 0.01229017, "auxiliary_loss_mlp": 0.01043282, "balance_loss_clip": 0.94801986, "balance_loss_mlp": 1.03203452, "epoch": 0.1488606986111946, "flos": 25411288567680.0, "grad_norm": 1.967113094709023, "language_loss": 0.73256457, "learning_rate": 3.853738047501682e-06, "loss": 0.75528753, "num_input_tokens_seen": 26220275, "step": 1238, "time_per_iteration": 2.77341890335083 }, { "auxiliary_loss_clip": 0.01234834, "auxiliary_loss_mlp": 0.01045069, "balance_loss_clip": 1.02972674, "balance_loss_mlp": 1.03301752, "epoch": 0.1489809415018337, "flos": 17018749687680.0, "grad_norm": 1.8611343135871306, "language_loss": 0.77682149, "learning_rate": 3.85344549272058e-06, "loss": 0.79962051, "num_input_tokens_seen": 26238255, "step": 1239, "time_per_iteration": 2.701951742172241 }, { "auxiliary_loss_clip": 0.01221785, "auxiliary_loss_mlp": 0.01036257, "balance_loss_clip": 1.02302837, "balance_loss_mlp": 1.02509332, "epoch": 0.1491011843924728, "flos": 33659394860160.0, "grad_norm": 1.9149205295622154, "language_loss": 0.82557416, "learning_rate": 3.853152656773269e-06, "loss": 0.84815466, "num_input_tokens_seen": 26259690, "step": 1240, "time_per_iteration": 2.8422696590423584 }, { "auxiliary_loss_clip": 0.01225388, "auxiliary_loss_mlp": 0.01037768, "balance_loss_clip": 0.98726863, "balance_loss_mlp": 1.02647924, "epoch": 0.14922142728311188, "flos": 21179000764800.0, "grad_norm": 1.8934625899656476, "language_loss": 0.84825993, "learning_rate": 3.852859539704174e-06, "loss": 0.87089157, "num_input_tokens_seen": 26278990, "step": 1241, "time_per_iteration": 2.7545578479766846 }, { "auxiliary_loss_clip": 0.01224183, "auxiliary_loss_mlp": 0.01039389, "balance_loss_clip": 0.90951014, "balance_loss_mlp": 1.02793932, "epoch": 0.14934167017375097, "flos": 29860548474240.0, "grad_norm": 2.179173887091971, "language_loss": 0.76404941, "learning_rate": 3.85256614155776e-06, "loss": 0.78668517, "num_input_tokens_seen": 26299120, "step": 1242, "time_per_iteration": 5.005924224853516 }, { "auxiliary_loss_clip": 0.01227395, "auxiliary_loss_mlp": 0.01036862, "balance_loss_clip": 1.02330709, "balance_loss_mlp": 1.02526283, "epoch": 0.14946191306439008, "flos": 17019216564480.0, "grad_norm": 10.380757658990534, "language_loss": 0.74258733, "learning_rate": 3.852272462378535e-06, "loss": 0.76522988, "num_input_tokens_seen": 26316995, "step": 1243, "time_per_iteration": 3.631434679031372 }, { "auxiliary_loss_clip": 0.01228097, "auxiliary_loss_mlp": 0.01036697, "balance_loss_clip": 0.988478, "balance_loss_mlp": 1.02497315, "epoch": 0.14958215595502916, "flos": 15669047214720.0, "grad_norm": 1.9457665714382548, "language_loss": 0.77623892, "learning_rate": 3.85197850221105e-06, "loss": 0.7988869, "num_input_tokens_seen": 26333295, "step": 1244, "time_per_iteration": 3.5984115600585938 }, { "auxiliary_loss_clip": 0.01226995, "auxiliary_loss_mlp": 0.01037333, "balance_loss_clip": 1.02721763, "balance_loss_mlp": 1.02608609, "epoch": 0.14970239884566824, "flos": 33108560818560.0, "grad_norm": 1.6541652061195207, "language_loss": 0.75930393, "learning_rate": 3.851684261099899e-06, "loss": 0.7819472, "num_input_tokens_seen": 26355035, "step": 1245, "time_per_iteration": 2.748771905899048 }, { "auxiliary_loss_clip": 0.01223937, "auxiliary_loss_mlp": 0.01040728, "balance_loss_clip": 0.98688316, "balance_loss_mlp": 1.02921844, "epoch": 0.14982264173630733, "flos": 17821245392640.0, "grad_norm": 1.9188733986793403, "language_loss": 0.86372954, "learning_rate": 3.851389739089718e-06, "loss": 0.88637626, "num_input_tokens_seen": 26371655, "step": 1246, "time_per_iteration": 2.6719393730163574 }, { "auxiliary_loss_clip": 0.01235108, "auxiliary_loss_mlp": 0.01043203, "balance_loss_clip": 1.0324651, "balance_loss_mlp": 1.03122854, "epoch": 0.14994288462694644, "flos": 32409559175040.0, "grad_norm": 1.9349032104798722, "language_loss": 0.80322105, "learning_rate": 3.851094936225186e-06, "loss": 0.82600421, "num_input_tokens_seen": 26392540, "step": 1247, "time_per_iteration": 2.723912000656128 }, { "auxiliary_loss_clip": 0.01218309, "auxiliary_loss_mlp": 0.01047734, "balance_loss_clip": 0.98466122, "balance_loss_mlp": 1.03594458, "epoch": 0.15006312751758552, "flos": 31794661226880.0, "grad_norm": 1.4438862020939858, "language_loss": 0.76688373, "learning_rate": 3.850799852551024e-06, "loss": 0.78954411, "num_input_tokens_seen": 26414960, "step": 1248, "time_per_iteration": 2.795224905014038 }, { "auxiliary_loss_clip": 0.01217968, "auxiliary_loss_mlp": 0.01037956, "balance_loss_clip": 1.02297032, "balance_loss_mlp": 1.02631497, "epoch": 0.1501833704082246, "flos": 16618022582400.0, "grad_norm": 2.257720894285293, "language_loss": 0.8604728, "learning_rate": 3.850504488111995e-06, "loss": 0.88303208, "num_input_tokens_seen": 26431635, "step": 1249, "time_per_iteration": 2.5969855785369873 }, { "auxiliary_loss_clip": 0.01220485, "auxiliary_loss_mlp": 0.01037635, "balance_loss_clip": 0.98606718, "balance_loss_mlp": 1.02599478, "epoch": 0.15030361329886371, "flos": 23471178243840.0, "grad_norm": 1.8813768281420014, "language_loss": 0.82383269, "learning_rate": 3.850208842952907e-06, "loss": 0.84641391, "num_input_tokens_seen": 26450440, "step": 1250, "time_per_iteration": 2.737710475921631 }, { "auxiliary_loss_clip": 0.0123031, "auxiliary_loss_mlp": 0.01038114, "balance_loss_clip": 0.90758467, "balance_loss_mlp": 1.02649689, "epoch": 0.1504238561895028, "flos": 25629409906560.0, "grad_norm": 1.762893373672034, "language_loss": 0.79635376, "learning_rate": 3.849912917118608e-06, "loss": 0.81903803, "num_input_tokens_seen": 26471480, "step": 1251, "time_per_iteration": 2.760601758956909 }, { "auxiliary_loss_clip": 0.01143787, "auxiliary_loss_mlp": 0.01012285, "balance_loss_clip": 1.00979996, "balance_loss_mlp": 1.00599122, "epoch": 0.15054409908014188, "flos": 52095146129280.0, "grad_norm": 0.8868631259676234, "language_loss": 0.59230995, "learning_rate": 3.849616710653992e-06, "loss": 0.61387068, "num_input_tokens_seen": 26532950, "step": 1252, "time_per_iteration": 3.2639870643615723 }, { "auxiliary_loss_clip": 0.01228282, "auxiliary_loss_mlp": 0.01043531, "balance_loss_clip": 1.02609825, "balance_loss_mlp": 1.03212881, "epoch": 0.150664341970781, "flos": 18880251096960.0, "grad_norm": 1.7626170465810485, "language_loss": 0.74838138, "learning_rate": 3.84932022360399e-06, "loss": 0.77109957, "num_input_tokens_seen": 26551615, "step": 1253, "time_per_iteration": 2.7913713455200195 }, { "auxiliary_loss_clip": 0.01223749, "auxiliary_loss_mlp": 0.0103781, "balance_loss_clip": 0.99046838, "balance_loss_mlp": 1.02629471, "epoch": 0.15078458486142007, "flos": 22163240309760.0, "grad_norm": 2.560326297936765, "language_loss": 0.84168375, "learning_rate": 3.849023456013581e-06, "loss": 0.8642993, "num_input_tokens_seen": 26569175, "step": 1254, "time_per_iteration": 2.7548065185546875 }, { "auxiliary_loss_clip": 0.01232699, "auxiliary_loss_mlp": 0.01041541, "balance_loss_clip": 1.02446342, "balance_loss_mlp": 1.02994752, "epoch": 0.15090482775205916, "flos": 26651894457600.0, "grad_norm": 3.541023487698327, "language_loss": 0.62166244, "learning_rate": 3.848726407927784e-06, "loss": 0.64440489, "num_input_tokens_seen": 26589560, "step": 1255, "time_per_iteration": 2.722886800765991 }, { "auxiliary_loss_clip": 0.01231502, "auxiliary_loss_mlp": 0.01037995, "balance_loss_clip": 0.99021524, "balance_loss_mlp": 1.02578795, "epoch": 0.15102507064269824, "flos": 21798998444160.0, "grad_norm": 2.4203257992358083, "language_loss": 0.86158156, "learning_rate": 3.84842907939166e-06, "loss": 0.88427651, "num_input_tokens_seen": 26608785, "step": 1256, "time_per_iteration": 2.7167162895202637 }, { "auxiliary_loss_clip": 0.01219442, "auxiliary_loss_mlp": 0.01042471, "balance_loss_clip": 0.94876343, "balance_loss_mlp": 1.03151631, "epoch": 0.15114531353333735, "flos": 22820908377600.0, "grad_norm": 4.067680280230286, "language_loss": 0.70781565, "learning_rate": 3.8481314704503146e-06, "loss": 0.73043478, "num_input_tokens_seen": 26628615, "step": 1257, "time_per_iteration": 2.7123422622680664 }, { "auxiliary_loss_clip": 0.0123107, "auxiliary_loss_mlp": 0.01036743, "balance_loss_clip": 1.03033209, "balance_loss_mlp": 1.0251441, "epoch": 0.15126555642397643, "flos": 19682674974720.0, "grad_norm": 2.543598424540406, "language_loss": 0.87627077, "learning_rate": 3.847833581148895e-06, "loss": 0.89894891, "num_input_tokens_seen": 26647525, "step": 1258, "time_per_iteration": 2.6773979663848877 }, { "auxiliary_loss_clip": 0.01228331, "auxiliary_loss_mlp": 0.01034852, "balance_loss_clip": 1.06165028, "balance_loss_mlp": 1.02385449, "epoch": 0.15138579931461552, "flos": 28726022424960.0, "grad_norm": 9.32155719460005, "language_loss": 0.80993736, "learning_rate": 3.84753541153259e-06, "loss": 0.83256912, "num_input_tokens_seen": 26667095, "step": 1259, "time_per_iteration": 2.80942964553833 }, { "auxiliary_loss_clip": 0.0122804, "auxiliary_loss_mlp": 0.0103624, "balance_loss_clip": 1.02629924, "balance_loss_mlp": 1.02499211, "epoch": 0.15150604220525463, "flos": 22127006465280.0, "grad_norm": 1.5051693959453383, "language_loss": 0.83437455, "learning_rate": 3.847236961646633e-06, "loss": 0.8570174, "num_input_tokens_seen": 26686075, "step": 1260, "time_per_iteration": 2.667391061782837 }, { "auxiliary_loss_clip": 0.01220826, "auxiliary_loss_mlp": 0.01035506, "balance_loss_clip": 0.98671401, "balance_loss_mlp": 1.02399611, "epoch": 0.1516262850958937, "flos": 12968708515200.0, "grad_norm": 2.861607890651191, "language_loss": 0.77954662, "learning_rate": 3.846938231536296e-06, "loss": 0.80210996, "num_input_tokens_seen": 26701695, "step": 1261, "time_per_iteration": 2.69889497756958 }, { "auxiliary_loss_clip": 0.01233136, "auxiliary_loss_mlp": 0.01043412, "balance_loss_clip": 1.02864265, "balance_loss_mlp": 1.03118753, "epoch": 0.1517465279865328, "flos": 21797130936960.0, "grad_norm": 1.7265116724985607, "language_loss": 0.80947912, "learning_rate": 3.8466392212468995e-06, "loss": 0.83224452, "num_input_tokens_seen": 26721885, "step": 1262, "time_per_iteration": 2.671769380569458 }, { "auxiliary_loss_clip": 0.01136759, "auxiliary_loss_mlp": 0.01004004, "balance_loss_clip": 0.92264867, "balance_loss_mlp": 0.99775785, "epoch": 0.15186677087717187, "flos": 58174569901440.0, "grad_norm": 0.8245697295824245, "language_loss": 0.61961877, "learning_rate": 3.8463399308238e-06, "loss": 0.64102638, "num_input_tokens_seen": 26780990, "step": 1263, "time_per_iteration": 3.2462751865386963 }, { "auxiliary_loss_clip": 0.01230292, "auxiliary_loss_mlp": 0.01035399, "balance_loss_clip": 1.02896619, "balance_loss_mlp": 1.02437234, "epoch": 0.15198701376781099, "flos": 32669696448000.0, "grad_norm": 1.6645594369680958, "language_loss": 0.63924646, "learning_rate": 3.846040360312402e-06, "loss": 0.66190338, "num_input_tokens_seen": 26804250, "step": 1264, "time_per_iteration": 2.746842622756958 }, { "auxiliary_loss_clip": 0.01230163, "auxiliary_loss_mlp": 0.0103986, "balance_loss_clip": 1.06424081, "balance_loss_mlp": 1.02869642, "epoch": 0.15210725665845007, "flos": 28402575431040.0, "grad_norm": 2.764081904499292, "language_loss": 0.80991924, "learning_rate": 3.8457405097581485e-06, "loss": 0.83261943, "num_input_tokens_seen": 26823240, "step": 1265, "time_per_iteration": 2.7216782569885254 }, { "auxiliary_loss_clip": 0.01226196, "auxiliary_loss_mlp": 0.01041579, "balance_loss_clip": 0.9067378, "balance_loss_mlp": 1.03000963, "epoch": 0.15222749954908915, "flos": 19938179393280.0, "grad_norm": 1.9635921029558314, "language_loss": 0.78114653, "learning_rate": 3.8454403792065275e-06, "loss": 0.80382431, "num_input_tokens_seen": 26842060, "step": 1266, "time_per_iteration": 2.777832269668579 }, { "auxiliary_loss_clip": 0.01212658, "auxiliary_loss_mlp": 0.01039339, "balance_loss_clip": 0.94743794, "balance_loss_mlp": 1.02792454, "epoch": 0.15234774243972826, "flos": 21324223451520.0, "grad_norm": 2.0623650898143815, "language_loss": 0.85619497, "learning_rate": 3.845139968703068e-06, "loss": 0.87871492, "num_input_tokens_seen": 26859580, "step": 1267, "time_per_iteration": 2.709420919418335 }, { "auxiliary_loss_clip": 0.01215993, "auxiliary_loss_mlp": 0.01045229, "balance_loss_clip": 0.90472209, "balance_loss_mlp": 1.03384435, "epoch": 0.15246798533036734, "flos": 25957812977280.0, "grad_norm": 2.7230034777646797, "language_loss": 0.82981384, "learning_rate": 3.844839278293342e-06, "loss": 0.85242605, "num_input_tokens_seen": 26880430, "step": 1268, "time_per_iteration": 4.940703392028809 }, { "auxiliary_loss_clip": 0.01234126, "auxiliary_loss_mlp": 0.01036902, "balance_loss_clip": 1.06716871, "balance_loss_mlp": 1.02479076, "epoch": 0.15258822822100643, "flos": 25811907932160.0, "grad_norm": 2.4278601317627437, "language_loss": 0.77180219, "learning_rate": 3.8445383080229654e-06, "loss": 0.79451251, "num_input_tokens_seen": 26896445, "step": 1269, "time_per_iteration": 3.6280465126037598 }, { "auxiliary_loss_clip": 0.01211434, "auxiliary_loss_mlp": 0.01035232, "balance_loss_clip": 0.9806459, "balance_loss_mlp": 1.02364445, "epoch": 0.1527084711116455, "flos": 25265455349760.0, "grad_norm": 2.6480893325687536, "language_loss": 0.73900223, "learning_rate": 3.844237057937593e-06, "loss": 0.76146889, "num_input_tokens_seen": 26915450, "step": 1270, "time_per_iteration": 3.6154019832611084 }, { "auxiliary_loss_clip": 0.01232014, "auxiliary_loss_mlp": 0.01043236, "balance_loss_clip": 1.02474046, "balance_loss_mlp": 1.03186941, "epoch": 0.15282871400228462, "flos": 29240227572480.0, "grad_norm": 2.331320403433069, "language_loss": 0.77776033, "learning_rate": 3.843935528082926e-06, "loss": 0.80051279, "num_input_tokens_seen": 26936475, "step": 1271, "time_per_iteration": 2.7363672256469727 }, { "auxiliary_loss_clip": 0.01228452, "auxiliary_loss_mlp": 0.01051785, "balance_loss_clip": 1.02437329, "balance_loss_mlp": 1.03954184, "epoch": 0.1529489568929237, "flos": 20882952869760.0, "grad_norm": 2.046086523341349, "language_loss": 0.84964007, "learning_rate": 3.843633718504704e-06, "loss": 0.87244236, "num_input_tokens_seen": 26954920, "step": 1272, "time_per_iteration": 2.7312135696411133 }, { "auxiliary_loss_clip": 0.012286, "auxiliary_loss_mlp": 0.01035124, "balance_loss_clip": 0.94967675, "balance_loss_mlp": 1.02366805, "epoch": 0.1530691997835628, "flos": 20083833043200.0, "grad_norm": 2.52196826343789, "language_loss": 0.89930749, "learning_rate": 3.843331629248715e-06, "loss": 0.92194468, "num_input_tokens_seen": 26972520, "step": 1273, "time_per_iteration": 2.7508604526519775 }, { "auxiliary_loss_clip": 0.01230841, "auxiliary_loss_mlp": 0.01042762, "balance_loss_clip": 1.0661993, "balance_loss_mlp": 1.03119898, "epoch": 0.1531894426742019, "flos": 28759814144640.0, "grad_norm": 2.0784488097212788, "language_loss": 0.76569819, "learning_rate": 3.843029260360782e-06, "loss": 0.78843421, "num_input_tokens_seen": 26990890, "step": 1274, "time_per_iteration": 2.783740997314453 }, { "auxiliary_loss_clip": 0.01226953, "auxiliary_loss_mlp": 0.01048947, "balance_loss_clip": 1.02659631, "balance_loss_mlp": 1.03761601, "epoch": 0.15330968556484098, "flos": 22236282616320.0, "grad_norm": 1.931985383851977, "language_loss": 0.79058337, "learning_rate": 3.8427266118867755e-06, "loss": 0.81334239, "num_input_tokens_seen": 27010640, "step": 1275, "time_per_iteration": 2.6750051975250244 }, { "auxiliary_loss_clip": 0.01224323, "auxiliary_loss_mlp": 0.01048175, "balance_loss_clip": 0.98691022, "balance_loss_mlp": 1.03608727, "epoch": 0.15342992845548006, "flos": 27527504296320.0, "grad_norm": 2.0034668027657347, "language_loss": 0.825086, "learning_rate": 3.842423683872608e-06, "loss": 0.84781098, "num_input_tokens_seen": 27031215, "step": 1276, "time_per_iteration": 2.7704145908355713 }, { "auxiliary_loss_clip": 0.0122769, "auxiliary_loss_mlp": 0.01040198, "balance_loss_clip": 1.02385712, "balance_loss_mlp": 1.02923727, "epoch": 0.15355017134611917, "flos": 19609596754560.0, "grad_norm": 2.3951314533643036, "language_loss": 0.77923679, "learning_rate": 3.842120476364232e-06, "loss": 0.80191565, "num_input_tokens_seen": 27049665, "step": 1277, "time_per_iteration": 2.6268343925476074 }, { "auxiliary_loss_clip": 0.01233229, "auxiliary_loss_mlp": 0.0103506, "balance_loss_clip": 1.02419484, "balance_loss_mlp": 1.0234257, "epoch": 0.15367041423675826, "flos": 18478590238080.0, "grad_norm": 2.0588487168097176, "language_loss": 0.84217793, "learning_rate": 3.841816989407644e-06, "loss": 0.86486089, "num_input_tokens_seen": 27065155, "step": 1278, "time_per_iteration": 2.6564981937408447 }, { "auxiliary_loss_clip": 0.01218089, "auxiliary_loss_mlp": 0.01043271, "balance_loss_clip": 0.94534445, "balance_loss_mlp": 1.03137994, "epoch": 0.15379065712739734, "flos": 41427662342400.0, "grad_norm": 2.054387093183573, "language_loss": 0.76728874, "learning_rate": 3.841513223048884e-06, "loss": 0.78990233, "num_input_tokens_seen": 27085840, "step": 1279, "time_per_iteration": 2.867799997329712 }, { "auxiliary_loss_clip": 0.01220467, "auxiliary_loss_mlp": 0.0103309, "balance_loss_clip": 0.94658178, "balance_loss_mlp": 1.02183664, "epoch": 0.15391090001803642, "flos": 22054215553920.0, "grad_norm": 2.6897684937449755, "language_loss": 0.7856701, "learning_rate": 3.841209177334031e-06, "loss": 0.80820572, "num_input_tokens_seen": 27104200, "step": 1280, "time_per_iteration": 2.7460241317749023 }, { "auxiliary_loss_clip": 0.01221938, "auxiliary_loss_mlp": 0.0104234, "balance_loss_clip": 1.02410865, "balance_loss_mlp": 1.03116405, "epoch": 0.15403114290867553, "flos": 15450351258240.0, "grad_norm": 1.8877897901560219, "language_loss": 0.74945271, "learning_rate": 3.84090485230921e-06, "loss": 0.77209544, "num_input_tokens_seen": 27122440, "step": 1281, "time_per_iteration": 2.8171281814575195 }, { "auxiliary_loss_clip": 0.01228527, "auxiliary_loss_mlp": 0.01040748, "balance_loss_clip": 1.06352544, "balance_loss_mlp": 1.02925634, "epoch": 0.15415138579931462, "flos": 17929156826880.0, "grad_norm": 2.715918773294313, "language_loss": 0.7601164, "learning_rate": 3.840600248020588e-06, "loss": 0.78280914, "num_input_tokens_seen": 27139380, "step": 1282, "time_per_iteration": 2.65242862701416 }, { "auxiliary_loss_clip": 0.01228478, "auxiliary_loss_mlp": 0.01042206, "balance_loss_clip": 0.98235989, "balance_loss_mlp": 1.03035641, "epoch": 0.1542716286899537, "flos": 11429325296640.0, "grad_norm": 2.3772532049114643, "language_loss": 0.80177915, "learning_rate": 3.840295364514371e-06, "loss": 0.82448602, "num_input_tokens_seen": 27156760, "step": 1283, "time_per_iteration": 2.7141082286834717 }, { "auxiliary_loss_clip": 0.01225261, "auxiliary_loss_mlp": 0.01041203, "balance_loss_clip": 0.98730743, "balance_loss_mlp": 1.02994394, "epoch": 0.1543918715805928, "flos": 17420338719360.0, "grad_norm": 2.121794306112556, "language_loss": 0.7841965, "learning_rate": 3.83999020183681e-06, "loss": 0.80686116, "num_input_tokens_seen": 27175455, "step": 1284, "time_per_iteration": 2.7373268604278564 }, { "auxiliary_loss_clip": 0.01212889, "auxiliary_loss_mlp": 0.01037274, "balance_loss_clip": 0.86932743, "balance_loss_mlp": 1.02546, "epoch": 0.1545121144712319, "flos": 17786376264960.0, "grad_norm": 3.4994272276576, "language_loss": 0.78650463, "learning_rate": 3.839684760034199e-06, "loss": 0.80900621, "num_input_tokens_seen": 27193660, "step": 1285, "time_per_iteration": 2.8279058933258057 }, { "auxiliary_loss_clip": 0.01218055, "auxiliary_loss_mlp": 0.01038789, "balance_loss_clip": 0.94848144, "balance_loss_mlp": 1.02722609, "epoch": 0.15463235736187098, "flos": 28220185146240.0, "grad_norm": 2.752681843639125, "language_loss": 0.65765005, "learning_rate": 3.8393790391528716e-06, "loss": 0.68021852, "num_input_tokens_seen": 27214355, "step": 1286, "time_per_iteration": 3.074650287628174 }, { "auxiliary_loss_clip": 0.01219863, "auxiliary_loss_mlp": 0.01041332, "balance_loss_clip": 0.98221755, "balance_loss_mlp": 1.02955365, "epoch": 0.15475260025251006, "flos": 22856890826880.0, "grad_norm": 1.817237260196046, "language_loss": 0.89066529, "learning_rate": 3.8390730392392075e-06, "loss": 0.91327727, "num_input_tokens_seen": 27234335, "step": 1287, "time_per_iteration": 2.7645111083984375 }, { "auxiliary_loss_clip": 0.01231228, "auxiliary_loss_mlp": 0.01047467, "balance_loss_clip": 1.06487274, "balance_loss_mlp": 1.03633273, "epoch": 0.15487284314314917, "flos": 17602872658560.0, "grad_norm": 2.27031568017244, "language_loss": 0.79199505, "learning_rate": 3.838766760339626e-06, "loss": 0.81478196, "num_input_tokens_seen": 27252860, "step": 1288, "time_per_iteration": 2.6137137413024902 }, { "auxiliary_loss_clip": 0.01201437, "auxiliary_loss_mlp": 0.01035253, "balance_loss_clip": 0.94237381, "balance_loss_mlp": 1.02354622, "epoch": 0.15499308603378825, "flos": 20082037363200.0, "grad_norm": 2.506627266568125, "language_loss": 0.79505426, "learning_rate": 3.838460202500587e-06, "loss": 0.8174212, "num_input_tokens_seen": 27268650, "step": 1289, "time_per_iteration": 2.730428457260132 }, { "auxiliary_loss_clip": 0.01216738, "auxiliary_loss_mlp": 0.01038987, "balance_loss_clip": 0.94786948, "balance_loss_mlp": 1.02683353, "epoch": 0.15511332892442733, "flos": 15918051271680.0, "grad_norm": 2.0378578552966524, "language_loss": 0.7416808, "learning_rate": 3.838153365768599e-06, "loss": 0.764238, "num_input_tokens_seen": 27285160, "step": 1290, "time_per_iteration": 2.70137095451355 }, { "auxiliary_loss_clip": 0.01227255, "auxiliary_loss_mlp": 0.01040964, "balance_loss_clip": 0.9553231, "balance_loss_mlp": 1.02898312, "epoch": 0.15523357181506645, "flos": 41282475569280.0, "grad_norm": 2.1848576675942284, "language_loss": 0.75638741, "learning_rate": 3.837846250190206e-06, "loss": 0.7790696, "num_input_tokens_seen": 27308025, "step": 1291, "time_per_iteration": 2.8804614543914795 }, { "auxiliary_loss_clip": 0.01218364, "auxiliary_loss_mlp": 0.01128898, "balance_loss_clip": 0.90889949, "balance_loss_mlp": 0.0, "epoch": 0.15535381470570553, "flos": 18478769806080.0, "grad_norm": 2.1041792855554995, "language_loss": 0.76847965, "learning_rate": 3.837538855811998e-06, "loss": 0.79195225, "num_input_tokens_seen": 27326200, "step": 1292, "time_per_iteration": 2.7544639110565186 }, { "auxiliary_loss_clip": 0.01231076, "auxiliary_loss_mlp": 0.01037181, "balance_loss_clip": 0.98805714, "balance_loss_mlp": 1.02483726, "epoch": 0.1554740575963446, "flos": 13918150759680.0, "grad_norm": 2.1791819915448682, "language_loss": 0.70759606, "learning_rate": 3.837231182680606e-06, "loss": 0.73027861, "num_input_tokens_seen": 27344165, "step": 1293, "time_per_iteration": 3.5352225303649902 }, { "auxiliary_loss_clip": 0.01233319, "auxiliary_loss_mlp": 0.01040338, "balance_loss_clip": 1.02695441, "balance_loss_mlp": 1.02924585, "epoch": 0.1555943004869837, "flos": 20847078161280.0, "grad_norm": 1.53583119998272, "language_loss": 0.75946188, "learning_rate": 3.836923230842706e-06, "loss": 0.78219843, "num_input_tokens_seen": 27363280, "step": 1294, "time_per_iteration": 2.689864158630371 }, { "auxiliary_loss_clip": 0.01220989, "auxiliary_loss_mlp": 0.01036434, "balance_loss_clip": 0.90561795, "balance_loss_mlp": 1.02469766, "epoch": 0.1557145433776228, "flos": 22085888371200.0, "grad_norm": 2.118422462325321, "language_loss": 0.80689096, "learning_rate": 3.836615000345011e-06, "loss": 0.82946515, "num_input_tokens_seen": 27381460, "step": 1295, "time_per_iteration": 3.720414161682129 }, { "auxiliary_loss_clip": 0.01229933, "auxiliary_loss_mlp": 0.01035684, "balance_loss_clip": 1.06411982, "balance_loss_mlp": 1.02465665, "epoch": 0.1558347862682619, "flos": 19791987039360.0, "grad_norm": 2.045077953350158, "language_loss": 0.78326654, "learning_rate": 3.836306491234282e-06, "loss": 0.80592275, "num_input_tokens_seen": 27399310, "step": 1296, "time_per_iteration": 4.557539939880371 }, { "auxiliary_loss_clip": 0.01219529, "auxiliary_loss_mlp": 0.01043765, "balance_loss_clip": 0.98709285, "balance_loss_mlp": 1.03095555, "epoch": 0.15595502915890097, "flos": 17237086508160.0, "grad_norm": 2.231543428808244, "language_loss": 0.75176919, "learning_rate": 3.835997703557317e-06, "loss": 0.77440214, "num_input_tokens_seen": 27416050, "step": 1297, "time_per_iteration": 2.6326398849487305 }, { "auxiliary_loss_clip": 0.01217546, "auxiliary_loss_mlp": 0.01038386, "balance_loss_clip": 0.90303314, "balance_loss_mlp": 1.02702558, "epoch": 0.15607527204954008, "flos": 19719519350400.0, "grad_norm": 1.7316942592492808, "language_loss": 0.80295044, "learning_rate": 3.83568863736096e-06, "loss": 0.82550979, "num_input_tokens_seen": 27434920, "step": 1298, "time_per_iteration": 2.7854371070861816 }, { "auxiliary_loss_clip": 0.01220358, "auxiliary_loss_mlp": 0.0103994, "balance_loss_clip": 0.94329995, "balance_loss_mlp": 1.02821624, "epoch": 0.15619551494017916, "flos": 18515650095360.0, "grad_norm": 2.299699744556694, "language_loss": 0.89570725, "learning_rate": 3.8353792926920975e-06, "loss": 0.91831023, "num_input_tokens_seen": 27453570, "step": 1299, "time_per_iteration": 2.707085132598877 }, { "auxiliary_loss_clip": 0.01235816, "auxiliary_loss_mlp": 0.0104011, "balance_loss_clip": 1.0282402, "balance_loss_mlp": 1.02762294, "epoch": 0.15631575783081825, "flos": 19902125116800.0, "grad_norm": 2.416539396668676, "language_loss": 0.81670183, "learning_rate": 3.835069669597655e-06, "loss": 0.83946109, "num_input_tokens_seen": 27471960, "step": 1300, "time_per_iteration": 2.6493425369262695 }, { "auxiliary_loss_clip": 0.01233554, "auxiliary_loss_mlp": 0.01129281, "balance_loss_clip": 1.02543259, "balance_loss_mlp": 0.0, "epoch": 0.15643600072145733, "flos": 20777663128320.0, "grad_norm": 1.8682429357480648, "language_loss": 0.79823935, "learning_rate": 3.834759768124603e-06, "loss": 0.8218677, "num_input_tokens_seen": 27490835, "step": 1301, "time_per_iteration": 2.6955666542053223 }, { "auxiliary_loss_clip": 0.0123019, "auxiliary_loss_mlp": 0.01037932, "balance_loss_clip": 0.9533813, "balance_loss_mlp": 1.0264461, "epoch": 0.15655624361209644, "flos": 18546389159040.0, "grad_norm": 2.357822612477531, "language_loss": 0.76466846, "learning_rate": 3.834449588319953e-06, "loss": 0.7873497, "num_input_tokens_seen": 27508870, "step": 1302, "time_per_iteration": 2.7371978759765625 }, { "auxiliary_loss_clip": 0.01225403, "auxiliary_loss_mlp": 0.01037823, "balance_loss_clip": 1.02769852, "balance_loss_mlp": 1.02649248, "epoch": 0.15667648650273552, "flos": 25229544727680.0, "grad_norm": 6.286210375468758, "language_loss": 0.85118306, "learning_rate": 3.834139130230758e-06, "loss": 0.8738153, "num_input_tokens_seen": 27528175, "step": 1303, "time_per_iteration": 2.6852285861968994 }, { "auxiliary_loss_clip": 0.01230425, "auxiliary_loss_mlp": 0.01040418, "balance_loss_clip": 0.98619878, "balance_loss_mlp": 1.0292778, "epoch": 0.1567967293933746, "flos": 24827093769600.0, "grad_norm": 1.704646550315088, "language_loss": 0.81262863, "learning_rate": 3.833828393904117e-06, "loss": 0.8353371, "num_input_tokens_seen": 27548455, "step": 1304, "time_per_iteration": 2.7537972927093506 }, { "auxiliary_loss_clip": 0.01214536, "auxiliary_loss_mlp": 0.01039445, "balance_loss_clip": 0.90582955, "balance_loss_mlp": 1.02777433, "epoch": 0.15691697228401372, "flos": 19164555244800.0, "grad_norm": 2.3443059754551436, "language_loss": 0.77451235, "learning_rate": 3.833517379387165e-06, "loss": 0.79705215, "num_input_tokens_seen": 27564910, "step": 1305, "time_per_iteration": 2.739086389541626 }, { "auxiliary_loss_clip": 0.01232368, "auxiliary_loss_mlp": 0.01040799, "balance_loss_clip": 1.02807379, "balance_loss_mlp": 1.02912211, "epoch": 0.1570372151746528, "flos": 24790931752320.0, "grad_norm": 2.1010668937760175, "language_loss": 0.88977802, "learning_rate": 3.833206086727085e-06, "loss": 0.91250974, "num_input_tokens_seen": 27584260, "step": 1306, "time_per_iteration": 2.6706650257110596 }, { "auxiliary_loss_clip": 0.01225575, "auxiliary_loss_mlp": 0.0103464, "balance_loss_clip": 0.94442844, "balance_loss_mlp": 1.02349973, "epoch": 0.15715745806529188, "flos": 24863650836480.0, "grad_norm": 2.035054298116174, "language_loss": 0.70640129, "learning_rate": 3.8328945159710994e-06, "loss": 0.72900343, "num_input_tokens_seen": 27604440, "step": 1307, "time_per_iteration": 2.756871223449707 }, { "auxiliary_loss_clip": 0.01237704, "auxiliary_loss_mlp": 0.0112873, "balance_loss_clip": 1.03105152, "balance_loss_mlp": 0.0, "epoch": 0.157277700955931, "flos": 21872148491520.0, "grad_norm": 2.2862021856351404, "language_loss": 0.88784933, "learning_rate": 3.832582667166473e-06, "loss": 0.91151369, "num_input_tokens_seen": 27624250, "step": 1308, "time_per_iteration": 2.725101947784424 }, { "auxiliary_loss_clip": 0.01221741, "auxiliary_loss_mlp": 0.01037553, "balance_loss_clip": 0.98455167, "balance_loss_mlp": 1.0260911, "epoch": 0.15739794384657008, "flos": 24533344344960.0, "grad_norm": 1.6613671361545137, "language_loss": 0.81607139, "learning_rate": 3.8322705403605125e-06, "loss": 0.83866429, "num_input_tokens_seen": 27644595, "step": 1309, "time_per_iteration": 2.7512145042419434 }, { "auxiliary_loss_clip": 0.01217484, "auxiliary_loss_mlp": 0.01034515, "balance_loss_clip": 0.98568785, "balance_loss_mlp": 1.02423882, "epoch": 0.15751818673720916, "flos": 17745329998080.0, "grad_norm": 2.5285595580777667, "language_loss": 0.81493086, "learning_rate": 3.831958135600568e-06, "loss": 0.83745086, "num_input_tokens_seen": 27662145, "step": 1310, "time_per_iteration": 2.652204990386963 }, { "auxiliary_loss_clip": 0.01231562, "auxiliary_loss_mlp": 0.01036923, "balance_loss_clip": 1.02722001, "balance_loss_mlp": 1.02567017, "epoch": 0.15763842962784824, "flos": 17858520731520.0, "grad_norm": 1.8855615330011266, "language_loss": 0.79680026, "learning_rate": 3.831645452934032e-06, "loss": 0.81948507, "num_input_tokens_seen": 27680575, "step": 1311, "time_per_iteration": 2.6663315296173096 }, { "auxiliary_loss_clip": 0.01233958, "auxiliary_loss_mlp": 0.01039043, "balance_loss_clip": 1.06808662, "balance_loss_mlp": 1.0275569, "epoch": 0.15775867251848735, "flos": 26980908059520.0, "grad_norm": 1.8581088762393456, "language_loss": 0.79977584, "learning_rate": 3.831332492408336e-06, "loss": 0.82250589, "num_input_tokens_seen": 27701985, "step": 1312, "time_per_iteration": 2.630394697189331 }, { "auxiliary_loss_clip": 0.01218795, "auxiliary_loss_mlp": 0.01036171, "balance_loss_clip": 0.98422718, "balance_loss_mlp": 1.02538872, "epoch": 0.15787891540912644, "flos": 19240398812160.0, "grad_norm": 1.8751365930115202, "language_loss": 0.69198823, "learning_rate": 3.831019254070957e-06, "loss": 0.71453792, "num_input_tokens_seen": 27719770, "step": 1313, "time_per_iteration": 2.704052209854126 }, { "auxiliary_loss_clip": 0.01221559, "auxiliary_loss_mlp": 0.01040875, "balance_loss_clip": 0.90822381, "balance_loss_mlp": 1.02968764, "epoch": 0.15799915829976552, "flos": 27271102037760.0, "grad_norm": 2.7280718486740123, "language_loss": 0.95641643, "learning_rate": 3.8307057379694135e-06, "loss": 0.97904086, "num_input_tokens_seen": 27739105, "step": 1314, "time_per_iteration": 2.8083596229553223 }, { "auxiliary_loss_clip": 0.01230577, "auxiliary_loss_mlp": 0.01042817, "balance_loss_clip": 1.06375027, "balance_loss_mlp": 1.03124762, "epoch": 0.15811940119040463, "flos": 20405520270720.0, "grad_norm": 2.2393904601539067, "language_loss": 0.82540971, "learning_rate": 3.830391944151264e-06, "loss": 0.84814364, "num_input_tokens_seen": 27754985, "step": 1315, "time_per_iteration": 2.6178042888641357 }, { "auxiliary_loss_clip": 0.01226296, "auxiliary_loss_mlp": 0.01037095, "balance_loss_clip": 0.984519, "balance_loss_mlp": 1.02567494, "epoch": 0.1582396440810437, "flos": 32599347661440.0, "grad_norm": 1.7950551123579988, "language_loss": 0.67263007, "learning_rate": 3.830077872664114e-06, "loss": 0.69526398, "num_input_tokens_seen": 27776110, "step": 1316, "time_per_iteration": 2.8036296367645264 }, { "auxiliary_loss_clip": 0.01220681, "auxiliary_loss_mlp": 0.01029506, "balance_loss_clip": 0.86583364, "balance_loss_mlp": 1.01790142, "epoch": 0.1583598869716828, "flos": 33800559310080.0, "grad_norm": 1.7921115843550595, "language_loss": 0.72834235, "learning_rate": 3.829763523555604e-06, "loss": 0.75084424, "num_input_tokens_seen": 27796510, "step": 1317, "time_per_iteration": 2.8475751876831055 }, { "auxiliary_loss_clip": 0.01221494, "auxiliary_loss_mlp": 0.01040535, "balance_loss_clip": 1.02714372, "balance_loss_mlp": 1.02951407, "epoch": 0.15848012986232188, "flos": 24681332378880.0, "grad_norm": 2.210854891588461, "language_loss": 0.78183746, "learning_rate": 3.829448896873423e-06, "loss": 0.80445778, "num_input_tokens_seen": 27815610, "step": 1318, "time_per_iteration": 2.6793034076690674 }, { "auxiliary_loss_clip": 0.01207846, "auxiliary_loss_mlp": 0.01128647, "balance_loss_clip": 0.90884948, "balance_loss_mlp": 0.0, "epoch": 0.158600372752961, "flos": 22602068766720.0, "grad_norm": 2.8884746556517, "language_loss": 0.79527694, "learning_rate": 3.829133992665299e-06, "loss": 0.8186419, "num_input_tokens_seen": 27834735, "step": 1319, "time_per_iteration": 2.8449747562408447 }, { "auxiliary_loss_clip": 0.01216088, "auxiliary_loss_mlp": 0.01037361, "balance_loss_clip": 1.02567339, "balance_loss_mlp": 1.02688837, "epoch": 0.15872061564360007, "flos": 27927944092800.0, "grad_norm": 2.0461708203069042, "language_loss": 0.89006048, "learning_rate": 3.828818810979002e-06, "loss": 0.91259497, "num_input_tokens_seen": 27853065, "step": 1320, "time_per_iteration": 4.195743799209595 }, { "auxiliary_loss_clip": 0.01228395, "auxiliary_loss_mlp": 0.01038105, "balance_loss_clip": 1.06539297, "balance_loss_mlp": 1.02679801, "epoch": 0.15884085853423915, "flos": 23696805525120.0, "grad_norm": 2.1474464743697976, "language_loss": 0.8061462, "learning_rate": 3.8285033518623454e-06, "loss": 0.82881117, "num_input_tokens_seen": 27873315, "step": 1321, "time_per_iteration": 4.6268956661224365 }, { "auxiliary_loss_clip": 0.01235574, "auxiliary_loss_mlp": 0.01037956, "balance_loss_clip": 1.02927566, "balance_loss_mlp": 1.02698278, "epoch": 0.15896110142487826, "flos": 23112359331840.0, "grad_norm": 2.39461893404025, "language_loss": 0.81743968, "learning_rate": 3.8281876153631845e-06, "loss": 0.84017497, "num_input_tokens_seen": 27890070, "step": 1322, "time_per_iteration": 3.637411594390869 }, { "auxiliary_loss_clip": 0.01215259, "auxiliary_loss_mlp": 0.0103659, "balance_loss_clip": 0.90583152, "balance_loss_mlp": 1.02535498, "epoch": 0.15908134431551735, "flos": 14685238632960.0, "grad_norm": 1.9351663216069293, "language_loss": 0.64440048, "learning_rate": 3.827871601529416e-06, "loss": 0.66691893, "num_input_tokens_seen": 27908590, "step": 1323, "time_per_iteration": 2.7205147743225098 }, { "auxiliary_loss_clip": 0.01219739, "auxiliary_loss_mlp": 0.01039543, "balance_loss_clip": 0.94770646, "balance_loss_mlp": 1.02827144, "epoch": 0.15920158720615643, "flos": 20193611984640.0, "grad_norm": 1.74034097790216, "language_loss": 0.80793869, "learning_rate": 3.827555310408979e-06, "loss": 0.83053154, "num_input_tokens_seen": 27927985, "step": 1324, "time_per_iteration": 2.732818841934204 }, { "auxiliary_loss_clip": 0.01220994, "auxiliary_loss_mlp": 0.01038149, "balance_loss_clip": 0.94702733, "balance_loss_mlp": 1.0260551, "epoch": 0.1593218300967955, "flos": 24826626892800.0, "grad_norm": 1.6179295639716103, "language_loss": 0.82928193, "learning_rate": 3.827238742049854e-06, "loss": 0.85187334, "num_input_tokens_seen": 27948280, "step": 1325, "time_per_iteration": 2.7678544521331787 }, { "auxiliary_loss_clip": 0.01228151, "auxiliary_loss_mlp": 0.01040781, "balance_loss_clip": 1.06399822, "balance_loss_mlp": 1.02948546, "epoch": 0.15944207298743462, "flos": 28328707111680.0, "grad_norm": 1.7867260904675704, "language_loss": 0.51800686, "learning_rate": 3.826921896500066e-06, "loss": 0.5406962, "num_input_tokens_seen": 27969565, "step": 1326, "time_per_iteration": 2.7041444778442383 }, { "auxiliary_loss_clip": 0.01229415, "auxiliary_loss_mlp": 0.01039909, "balance_loss_clip": 0.9485153, "balance_loss_mlp": 1.02812481, "epoch": 0.1595623158780737, "flos": 22964838174720.0, "grad_norm": 2.677278656284657, "language_loss": 0.77878976, "learning_rate": 3.826604773807678e-06, "loss": 0.80148298, "num_input_tokens_seen": 27987540, "step": 1327, "time_per_iteration": 2.7585999965667725 }, { "auxiliary_loss_clip": 0.01221439, "auxiliary_loss_mlp": 0.01038779, "balance_loss_clip": 0.98092818, "balance_loss_mlp": 1.02648854, "epoch": 0.1596825587687128, "flos": 19710540950400.0, "grad_norm": 2.9208686812257683, "language_loss": 0.72977352, "learning_rate": 3.826287374020798e-06, "loss": 0.7523756, "num_input_tokens_seen": 28002345, "step": 1328, "time_per_iteration": 2.7162647247314453 }, { "auxiliary_loss_clip": 0.01232301, "auxiliary_loss_mlp": 0.01037984, "balance_loss_clip": 1.06619275, "balance_loss_mlp": 1.0267489, "epoch": 0.1598028016593519, "flos": 22637727993600.0, "grad_norm": 1.9895682217711863, "language_loss": 0.82317996, "learning_rate": 3.825969697187575e-06, "loss": 0.84588277, "num_input_tokens_seen": 28021675, "step": 1329, "time_per_iteration": 2.615781545639038 }, { "auxiliary_loss_clip": 0.01218342, "auxiliary_loss_mlp": 0.01035588, "balance_loss_clip": 0.94447136, "balance_loss_mlp": 1.02481711, "epoch": 0.15992304454999098, "flos": 20482908122880.0, "grad_norm": 5.971599822808925, "language_loss": 0.69939768, "learning_rate": 3.8256517433562015e-06, "loss": 0.72193694, "num_input_tokens_seen": 28039615, "step": 1330, "time_per_iteration": 2.7719292640686035 }, { "auxiliary_loss_clip": 0.01230337, "auxiliary_loss_mlp": 0.01038643, "balance_loss_clip": 1.06608236, "balance_loss_mlp": 1.0271275, "epoch": 0.16004328744063007, "flos": 17676094533120.0, "grad_norm": 2.3447716375762737, "language_loss": 0.91316921, "learning_rate": 3.82533351257491e-06, "loss": 0.93585902, "num_input_tokens_seen": 28057565, "step": 1331, "time_per_iteration": 2.6199679374694824 }, { "auxiliary_loss_clip": 0.01225716, "auxiliary_loss_mlp": 0.01043887, "balance_loss_clip": 1.02753067, "balance_loss_mlp": 1.03246665, "epoch": 0.16016353033126918, "flos": 24098717779200.0, "grad_norm": 1.7534524976328176, "language_loss": 0.88832396, "learning_rate": 3.825015004891975e-06, "loss": 0.91102004, "num_input_tokens_seen": 28076305, "step": 1332, "time_per_iteration": 2.706936836242676 }, { "auxiliary_loss_clip": 0.01221024, "auxiliary_loss_mlp": 0.01034088, "balance_loss_clip": 1.02365208, "balance_loss_mlp": 1.02377677, "epoch": 0.16028377322190826, "flos": 27634841112960.0, "grad_norm": 2.1689588020541546, "language_loss": 0.75971115, "learning_rate": 3.824696220355716e-06, "loss": 0.78226221, "num_input_tokens_seen": 28097895, "step": 1333, "time_per_iteration": 2.7112550735473633 }, { "auxiliary_loss_clip": 0.01223766, "auxiliary_loss_mlp": 0.01034203, "balance_loss_clip": 0.98689634, "balance_loss_mlp": 1.0229075, "epoch": 0.16040401611254734, "flos": 20961202648320.0, "grad_norm": 1.6220800416191106, "language_loss": 0.78795344, "learning_rate": 3.824377159014491e-06, "loss": 0.81053317, "num_input_tokens_seen": 28118790, "step": 1334, "time_per_iteration": 2.7114012241363525 }, { "auxiliary_loss_clip": 0.01222387, "auxiliary_loss_mlp": 0.01037937, "balance_loss_clip": 1.02530015, "balance_loss_mlp": 1.02672553, "epoch": 0.16052425900318643, "flos": 21247051080960.0, "grad_norm": 2.7882434915283847, "language_loss": 0.85264969, "learning_rate": 3.824057820916702e-06, "loss": 0.8752529, "num_input_tokens_seen": 28135995, "step": 1335, "time_per_iteration": 2.6697916984558105 }, { "auxiliary_loss_clip": 0.01226517, "auxiliary_loss_mlp": 0.01036007, "balance_loss_clip": 0.98608047, "balance_loss_mlp": 1.02422905, "epoch": 0.16064450189382554, "flos": 15524004096000.0, "grad_norm": 2.1534884293726875, "language_loss": 0.71322078, "learning_rate": 3.8237382061107904e-06, "loss": 0.73584604, "num_input_tokens_seen": 28152715, "step": 1336, "time_per_iteration": 2.7032151222229004 }, { "auxiliary_loss_clip": 0.01203168, "auxiliary_loss_mlp": 0.01034221, "balance_loss_clip": 0.82481003, "balance_loss_mlp": 1.0230931, "epoch": 0.16076474478446462, "flos": 21178497974400.0, "grad_norm": 1.852431311650141, "language_loss": 0.78618306, "learning_rate": 3.823418314645243e-06, "loss": 0.80855691, "num_input_tokens_seen": 28171590, "step": 1337, "time_per_iteration": 2.8204007148742676 }, { "auxiliary_loss_clip": 0.01202357, "auxiliary_loss_mlp": 0.01038597, "balance_loss_clip": 0.90733433, "balance_loss_mlp": 1.02678895, "epoch": 0.1608849876751037, "flos": 18366476912640.0, "grad_norm": 2.4255040437227815, "language_loss": 0.75279474, "learning_rate": 3.823098146568588e-06, "loss": 0.7752043, "num_input_tokens_seen": 28191295, "step": 1338, "time_per_iteration": 2.792429208755493 }, { "auxiliary_loss_clip": 0.01224751, "auxiliary_loss_mlp": 0.0103233, "balance_loss_clip": 1.02484465, "balance_loss_mlp": 1.0210228, "epoch": 0.1610052305657428, "flos": 29497024880640.0, "grad_norm": 1.7946128636897758, "language_loss": 0.71796429, "learning_rate": 3.822777701929394e-06, "loss": 0.74053514, "num_input_tokens_seen": 28213120, "step": 1339, "time_per_iteration": 2.779815673828125 }, { "auxiliary_loss_clip": 0.01213743, "auxiliary_loss_mlp": 0.01034859, "balance_loss_clip": 1.02129972, "balance_loss_mlp": 1.02395713, "epoch": 0.1611254734563819, "flos": 26797871329920.0, "grad_norm": 1.784453724171736, "language_loss": 0.73768032, "learning_rate": 3.8224569807762714e-06, "loss": 0.76016635, "num_input_tokens_seen": 28232440, "step": 1340, "time_per_iteration": 2.672757387161255 }, { "auxiliary_loss_clip": 0.01194723, "auxiliary_loss_mlp": 0.01034697, "balance_loss_clip": 0.90174246, "balance_loss_mlp": 1.02347922, "epoch": 0.16124571634702098, "flos": 22419570741120.0, "grad_norm": 1.8437020244459885, "language_loss": 0.7655412, "learning_rate": 3.822135983157873e-06, "loss": 0.78783542, "num_input_tokens_seen": 28251715, "step": 1341, "time_per_iteration": 2.792264223098755 }, { "auxiliary_loss_clip": 0.01225784, "auxiliary_loss_mlp": 0.01127877, "balance_loss_clip": 1.06371236, "balance_loss_mlp": 0.0, "epoch": 0.16136595923766006, "flos": 10999116103680.0, "grad_norm": 2.2098838441357285, "language_loss": 0.84051883, "learning_rate": 3.821814709122896e-06, "loss": 0.86405545, "num_input_tokens_seen": 28269765, "step": 1342, "time_per_iteration": 2.6348891258239746 }, { "auxiliary_loss_clip": 0.01222839, "auxiliary_loss_mlp": 0.01036874, "balance_loss_clip": 0.98817956, "balance_loss_mlp": 1.02561474, "epoch": 0.16148620212829917, "flos": 21214983214080.0, "grad_norm": 2.7520972076350425, "language_loss": 0.85290021, "learning_rate": 3.821493158720076e-06, "loss": 0.87549734, "num_input_tokens_seen": 28288870, "step": 1343, "time_per_iteration": 2.7001538276672363 }, { "auxiliary_loss_clip": 0.01219467, "auxiliary_loss_mlp": 0.01032409, "balance_loss_clip": 0.94331264, "balance_loss_mlp": 1.02095258, "epoch": 0.16160644501893826, "flos": 16758468760320.0, "grad_norm": 3.0585664831176227, "language_loss": 0.72979134, "learning_rate": 3.821171331998191e-06, "loss": 0.75231004, "num_input_tokens_seen": 28305400, "step": 1344, "time_per_iteration": 2.756074905395508 }, { "auxiliary_loss_clip": 0.01138779, "auxiliary_loss_mlp": 0.01011749, "balance_loss_clip": 0.92872602, "balance_loss_mlp": 1.0059073, "epoch": 0.16172668790957734, "flos": 64444967308800.0, "grad_norm": 0.7202535512669153, "language_loss": 0.54538047, "learning_rate": 3.820849229006064e-06, "loss": 0.56688571, "num_input_tokens_seen": 28373150, "step": 1345, "time_per_iteration": 4.4260172843933105 }, { "auxiliary_loss_clip": 0.0122542, "auxiliary_loss_mlp": 0.01036369, "balance_loss_clip": 1.06191266, "balance_loss_mlp": 1.02550316, "epoch": 0.16184693080021645, "flos": 23257689759360.0, "grad_norm": 1.9707257937779468, "language_loss": 0.70767105, "learning_rate": 3.8205268497925564e-06, "loss": 0.73028898, "num_input_tokens_seen": 28393620, "step": 1346, "time_per_iteration": 2.6711957454681396 }, { "auxiliary_loss_clip": 0.01229447, "auxiliary_loss_mlp": 0.01043321, "balance_loss_clip": 1.06566226, "balance_loss_mlp": 1.03226995, "epoch": 0.16196717369085553, "flos": 17451113696640.0, "grad_norm": 2.3585248684011306, "language_loss": 0.78587008, "learning_rate": 3.8202041944065725e-06, "loss": 0.8085978, "num_input_tokens_seen": 28409440, "step": 1347, "time_per_iteration": 3.6911520957946777 }, { "auxiliary_loss_clip": 0.01229984, "auxiliary_loss_mlp": 0.01039708, "balance_loss_clip": 1.06638813, "balance_loss_mlp": 1.02866316, "epoch": 0.16208741658149461, "flos": 23873377806720.0, "grad_norm": 2.2568898911176136, "language_loss": 0.73646909, "learning_rate": 3.819881262897061e-06, "loss": 0.759166, "num_input_tokens_seen": 28427575, "step": 1348, "time_per_iteration": 2.673736333847046 }, { "auxiliary_loss_clip": 0.01228886, "auxiliary_loss_mlp": 0.01037512, "balance_loss_clip": 0.95379275, "balance_loss_mlp": 1.02618074, "epoch": 0.1622076594721337, "flos": 25884806584320.0, "grad_norm": 1.8405770304588958, "language_loss": 0.73302931, "learning_rate": 3.819558055313008e-06, "loss": 0.75569332, "num_input_tokens_seen": 28448260, "step": 1349, "time_per_iteration": 3.742353916168213 }, { "auxiliary_loss_clip": 0.01231335, "auxiliary_loss_mlp": 0.01045106, "balance_loss_clip": 1.02560568, "balance_loss_mlp": 1.03417492, "epoch": 0.1623279023627728, "flos": 21539759011200.0, "grad_norm": 1.7654080656957252, "language_loss": 0.77314508, "learning_rate": 3.819234571703444e-06, "loss": 0.79590952, "num_input_tokens_seen": 28467085, "step": 1350, "time_per_iteration": 2.702362060546875 }, { "auxiliary_loss_clip": 0.01215729, "auxiliary_loss_mlp": 0.01038508, "balance_loss_clip": 1.0228467, "balance_loss_mlp": 1.02660477, "epoch": 0.1624481452534119, "flos": 22085421494400.0, "grad_norm": 1.6970553362510066, "language_loss": 0.85473812, "learning_rate": 3.8189108121174435e-06, "loss": 0.87728047, "num_input_tokens_seen": 28486850, "step": 1351, "time_per_iteration": 2.7397353649139404 }, { "auxiliary_loss_clip": 0.01213186, "auxiliary_loss_mlp": 0.01040805, "balance_loss_clip": 0.94730401, "balance_loss_mlp": 1.02975464, "epoch": 0.16256838814405097, "flos": 27087490690560.0, "grad_norm": 1.9512075051883726, "language_loss": 0.83742231, "learning_rate": 3.818586776604118e-06, "loss": 0.85996222, "num_input_tokens_seen": 28507490, "step": 1352, "time_per_iteration": 2.826267719268799 }, { "auxiliary_loss_clip": 0.0121753, "auxiliary_loss_mlp": 0.01048258, "balance_loss_clip": 0.98619103, "balance_loss_mlp": 1.03711796, "epoch": 0.16268863103469008, "flos": 20120354196480.0, "grad_norm": 2.2818606550294853, "language_loss": 0.614048, "learning_rate": 3.818262465212625e-06, "loss": 0.63670594, "num_input_tokens_seen": 28527615, "step": 1353, "time_per_iteration": 2.7371585369110107 }, { "auxiliary_loss_clip": 0.01218193, "auxiliary_loss_mlp": 0.01045704, "balance_loss_clip": 1.02613521, "balance_loss_mlp": 1.03424788, "epoch": 0.16280887392532917, "flos": 18332792933760.0, "grad_norm": 1.9952382278905942, "language_loss": 0.76936698, "learning_rate": 3.817937877992161e-06, "loss": 0.79200602, "num_input_tokens_seen": 28544910, "step": 1354, "time_per_iteration": 2.668504238128662 }, { "auxiliary_loss_clip": 0.01215814, "auxiliary_loss_mlp": 0.01128324, "balance_loss_clip": 0.94396973, "balance_loss_mlp": 0.0, "epoch": 0.16292911681596825, "flos": 11874330892800.0, "grad_norm": 3.0434062501603134, "language_loss": 0.85365808, "learning_rate": 3.817613014991967e-06, "loss": 0.87709939, "num_input_tokens_seen": 28561050, "step": 1355, "time_per_iteration": 2.7284553050994873 }, { "auxiliary_loss_clip": 0.01207317, "auxiliary_loss_mlp": 0.01037258, "balance_loss_clip": 0.94384253, "balance_loss_mlp": 1.02631426, "epoch": 0.16304935970660733, "flos": 26103466627200.0, "grad_norm": 2.111284899635569, "language_loss": 0.76424515, "learning_rate": 3.817287876261323e-06, "loss": 0.78669089, "num_input_tokens_seen": 28581385, "step": 1356, "time_per_iteration": 2.791071653366089 }, { "auxiliary_loss_clip": 0.01222575, "auxiliary_loss_mlp": 0.0103741, "balance_loss_clip": 0.98951578, "balance_loss_mlp": 1.02645445, "epoch": 0.16316960259724644, "flos": 29351945848320.0, "grad_norm": 2.9078094058423845, "language_loss": 0.80209875, "learning_rate": 3.816962461849553e-06, "loss": 0.82469857, "num_input_tokens_seen": 28603255, "step": 1357, "time_per_iteration": 2.9271836280822754 }, { "auxiliary_loss_clip": 0.01212423, "auxiliary_loss_mlp": 0.01042184, "balance_loss_clip": 0.9842965, "balance_loss_mlp": 1.03149724, "epoch": 0.16328984548788553, "flos": 20886759711360.0, "grad_norm": 3.2663523598154587, "language_loss": 0.84243882, "learning_rate": 3.8166367718060235e-06, "loss": 0.86498493, "num_input_tokens_seen": 28623145, "step": 1358, "time_per_iteration": 2.749490261077881 }, { "auxiliary_loss_clip": 0.01217111, "auxiliary_loss_mlp": 0.01038357, "balance_loss_clip": 1.02084637, "balance_loss_mlp": 1.02716923, "epoch": 0.1634100883785246, "flos": 18041090584320.0, "grad_norm": 2.3483035269664394, "language_loss": 0.76644397, "learning_rate": 3.816310806180139e-06, "loss": 0.78899866, "num_input_tokens_seen": 28641555, "step": 1359, "time_per_iteration": 2.6836793422698975 }, { "auxiliary_loss_clip": 0.01221919, "auxiliary_loss_mlp": 0.01038813, "balance_loss_clip": 0.98787922, "balance_loss_mlp": 1.02766097, "epoch": 0.16353033126916372, "flos": 24572128055040.0, "grad_norm": 2.098118962784957, "language_loss": 0.81090659, "learning_rate": 3.81598456502135e-06, "loss": 0.83351386, "num_input_tokens_seen": 28661575, "step": 1360, "time_per_iteration": 2.754650354385376 }, { "auxiliary_loss_clip": 0.0121902, "auxiliary_loss_mlp": 0.01036234, "balance_loss_clip": 0.98884904, "balance_loss_mlp": 1.02507651, "epoch": 0.1636505741598028, "flos": 19892895321600.0, "grad_norm": 2.710432932973262, "language_loss": 0.8703537, "learning_rate": 3.8156580483791455e-06, "loss": 0.89290619, "num_input_tokens_seen": 28676765, "step": 1361, "time_per_iteration": 2.7317891120910645 }, { "auxiliary_loss_clip": 0.01227644, "auxiliary_loss_mlp": 0.01042028, "balance_loss_clip": 1.06531656, "balance_loss_mlp": 1.03154397, "epoch": 0.16377081705044189, "flos": 28402611344640.0, "grad_norm": 2.4854971920872915, "language_loss": 0.76320547, "learning_rate": 3.815331256303059e-06, "loss": 0.7859022, "num_input_tokens_seen": 28696795, "step": 1362, "time_per_iteration": 2.690920829772949 }, { "auxiliary_loss_clip": 0.01217917, "auxiliary_loss_mlp": 0.01038837, "balance_loss_clip": 0.95075369, "balance_loss_mlp": 1.02823901, "epoch": 0.163891059941081, "flos": 21908059113600.0, "grad_norm": 2.240480053567059, "language_loss": 0.77040064, "learning_rate": 3.815004188842665e-06, "loss": 0.79296815, "num_input_tokens_seen": 28714835, "step": 1363, "time_per_iteration": 2.740558385848999 }, { "auxiliary_loss_clip": 0.01212891, "auxiliary_loss_mlp": 0.01035604, "balance_loss_clip": 0.98117495, "balance_loss_mlp": 1.02523851, "epoch": 0.16401130283172008, "flos": 26797619934720.0, "grad_norm": 1.5710503899246953, "language_loss": 0.79526561, "learning_rate": 3.814676846047578e-06, "loss": 0.81775057, "num_input_tokens_seen": 28735710, "step": 1364, "time_per_iteration": 2.7487094402313232 }, { "auxiliary_loss_clip": 0.01221755, "auxiliary_loss_mlp": 0.01043751, "balance_loss_clip": 1.02650285, "balance_loss_mlp": 1.03295064, "epoch": 0.16413154572235916, "flos": 32997417160320.0, "grad_norm": 1.938107753585279, "language_loss": 0.69767058, "learning_rate": 3.8143492279674565e-06, "loss": 0.72032559, "num_input_tokens_seen": 28758405, "step": 1365, "time_per_iteration": 2.774569272994995 }, { "auxiliary_loss_clip": 0.01140516, "auxiliary_loss_mlp": 0.01008263, "balance_loss_clip": 0.936023, "balance_loss_mlp": 1.00256526, "epoch": 0.16425178861299825, "flos": 40113622074240.0, "grad_norm": 0.9112968012753897, "language_loss": 0.58411658, "learning_rate": 3.8140213346519997e-06, "loss": 0.60560441, "num_input_tokens_seen": 28809000, "step": 1366, "time_per_iteration": 3.1135013103485107 }, { "auxiliary_loss_clip": 0.01215571, "auxiliary_loss_mlp": 0.01041252, "balance_loss_clip": 0.95039093, "balance_loss_mlp": 1.0295099, "epoch": 0.16437203150363736, "flos": 25447486498560.0, "grad_norm": 1.8367034896524665, "language_loss": 0.76970875, "learning_rate": 3.813693166150948e-06, "loss": 0.79227698, "num_input_tokens_seen": 28829210, "step": 1367, "time_per_iteration": 2.7697064876556396 }, { "auxiliary_loss_clip": 0.01211899, "auxiliary_loss_mlp": 0.01035895, "balance_loss_clip": 0.94728214, "balance_loss_mlp": 1.02496386, "epoch": 0.16449227439427644, "flos": 23476888506240.0, "grad_norm": 1.957510614776648, "language_loss": 0.85374248, "learning_rate": 3.813364722514086e-06, "loss": 0.87622046, "num_input_tokens_seen": 28847545, "step": 1368, "time_per_iteration": 2.7621452808380127 }, { "auxiliary_loss_clip": 0.01219931, "auxiliary_loss_mlp": 0.01037348, "balance_loss_clip": 1.02416348, "balance_loss_mlp": 1.02697134, "epoch": 0.16461251728491552, "flos": 13545217802880.0, "grad_norm": 2.921797041479831, "language_loss": 0.80338567, "learning_rate": 3.8130360037912368e-06, "loss": 0.82595849, "num_input_tokens_seen": 28863990, "step": 1369, "time_per_iteration": 2.6928422451019287 }, { "auxiliary_loss_clip": 0.01224113, "auxiliary_loss_mlp": 0.01027755, "balance_loss_clip": 1.02499008, "balance_loss_mlp": 1.0173775, "epoch": 0.16473276017555463, "flos": 23003298662400.0, "grad_norm": 2.3322649282989647, "language_loss": 0.82242924, "learning_rate": 3.812707010032268e-06, "loss": 0.84494787, "num_input_tokens_seen": 28883045, "step": 1370, "time_per_iteration": 2.711017608642578 }, { "auxiliary_loss_clip": 0.01231593, "auxiliary_loss_mlp": 0.01041761, "balance_loss_clip": 1.03171897, "balance_loss_mlp": 1.03081167, "epoch": 0.16485300306619372, "flos": 24790680357120.0, "grad_norm": 3.2537020774599155, "language_loss": 0.79359031, "learning_rate": 3.8123777412870863e-06, "loss": 0.81632388, "num_input_tokens_seen": 28902545, "step": 1371, "time_per_iteration": 3.5212948322296143 }, { "auxiliary_loss_clip": 0.01230405, "auxiliary_loss_mlp": 0.01037894, "balance_loss_clip": 0.98818511, "balance_loss_mlp": 1.02696228, "epoch": 0.1649732459568328, "flos": 21106497162240.0, "grad_norm": 2.2542766083424928, "language_loss": 0.78092664, "learning_rate": 3.812048197605643e-06, "loss": 0.80360967, "num_input_tokens_seen": 28921440, "step": 1372, "time_per_iteration": 2.7557320594787598 }, { "auxiliary_loss_clip": 0.01223895, "auxiliary_loss_mlp": 0.01028839, "balance_loss_clip": 1.02618957, "balance_loss_mlp": 1.01862288, "epoch": 0.16509348884747188, "flos": 20266726118400.0, "grad_norm": 2.0264399985253956, "language_loss": 0.81203938, "learning_rate": 3.8117183790379277e-06, "loss": 0.83456677, "num_input_tokens_seen": 28939890, "step": 1373, "time_per_iteration": 3.8448901176452637 }, { "auxiliary_loss_clip": 0.01226555, "auxiliary_loss_mlp": 0.0103804, "balance_loss_clip": 1.0639751, "balance_loss_mlp": 1.02741826, "epoch": 0.165213731738111, "flos": 11035493602560.0, "grad_norm": 2.9839739766009292, "language_loss": 0.94336784, "learning_rate": 3.811388285633976e-06, "loss": 0.96601379, "num_input_tokens_seen": 28955875, "step": 1374, "time_per_iteration": 2.658301591873169 }, { "auxiliary_loss_clip": 0.01227606, "auxiliary_loss_mlp": 0.01040145, "balance_loss_clip": 0.91294783, "balance_loss_mlp": 1.02900505, "epoch": 0.16533397462875007, "flos": 29972051268480.0, "grad_norm": 2.4176268444816555, "language_loss": 0.62211949, "learning_rate": 3.811057917443861e-06, "loss": 0.64479697, "num_input_tokens_seen": 28975140, "step": 1375, "time_per_iteration": 3.780526876449585 }, { "auxiliary_loss_clip": 0.01135696, "auxiliary_loss_mlp": 0.01002673, "balance_loss_clip": 0.96589589, "balance_loss_mlp": 0.99687976, "epoch": 0.16545421751938916, "flos": 65556763027200.0, "grad_norm": 0.8520675418860193, "language_loss": 0.68288875, "learning_rate": 3.8107272745177e-06, "loss": 0.70427251, "num_input_tokens_seen": 29047470, "step": 1376, "time_per_iteration": 3.4521124362945557 }, { "auxiliary_loss_clip": 0.01223424, "auxiliary_loss_mlp": 0.01037711, "balance_loss_clip": 0.9496398, "balance_loss_mlp": 1.02689242, "epoch": 0.16557446041002827, "flos": 22492361652480.0, "grad_norm": 1.8839360497766553, "language_loss": 0.78635848, "learning_rate": 3.8103963569056513e-06, "loss": 0.80896986, "num_input_tokens_seen": 29066605, "step": 1377, "time_per_iteration": 2.756791353225708 }, { "auxiliary_loss_clip": 0.01213616, "auxiliary_loss_mlp": 0.01041464, "balance_loss_clip": 0.98340386, "balance_loss_mlp": 1.03052616, "epoch": 0.16569470330066735, "flos": 24602723464320.0, "grad_norm": 1.5813885389699118, "language_loss": 0.8812288, "learning_rate": 3.8100651646579146e-06, "loss": 0.90377963, "num_input_tokens_seen": 29085815, "step": 1378, "time_per_iteration": 2.762028932571411 }, { "auxiliary_loss_clip": 0.01212532, "auxiliary_loss_mlp": 0.01037863, "balance_loss_clip": 0.98189807, "balance_loss_mlp": 1.02752185, "epoch": 0.16581494619130643, "flos": 15006207588480.0, "grad_norm": 2.37454077160248, "language_loss": 0.92360032, "learning_rate": 3.8097336978247317e-06, "loss": 0.94610423, "num_input_tokens_seen": 29102520, "step": 1379, "time_per_iteration": 2.6714839935302734 }, { "auxiliary_loss_clip": 0.01208962, "auxiliary_loss_mlp": 0.01041358, "balance_loss_clip": 0.98458761, "balance_loss_mlp": 1.03051019, "epoch": 0.16593518908194552, "flos": 17420338719360.0, "grad_norm": 2.0958931360487303, "language_loss": 0.89229119, "learning_rate": 3.8094019564563854e-06, "loss": 0.91479439, "num_input_tokens_seen": 29119450, "step": 1380, "time_per_iteration": 2.7315618991851807 }, { "auxiliary_loss_clip": 0.01226795, "auxiliary_loss_mlp": 0.01127634, "balance_loss_clip": 1.06443048, "balance_loss_mlp": 0.0, "epoch": 0.16605543197258463, "flos": 20412631163520.0, "grad_norm": 2.322322046116175, "language_loss": 0.75328624, "learning_rate": 3.809069940603201e-06, "loss": 0.77683049, "num_input_tokens_seen": 29137405, "step": 1381, "time_per_iteration": 2.675062656402588 }, { "auxiliary_loss_clip": 0.01208265, "auxiliary_loss_mlp": 0.01047683, "balance_loss_clip": 0.98432487, "balance_loss_mlp": 1.03644156, "epoch": 0.1661756748632237, "flos": 14209745368320.0, "grad_norm": 2.550497468353177, "language_loss": 0.77671158, "learning_rate": 3.8087376503155452e-06, "loss": 0.79927105, "num_input_tokens_seen": 29154890, "step": 1382, "time_per_iteration": 2.697268486022949 }, { "auxiliary_loss_clip": 0.01122957, "auxiliary_loss_mlp": 0.01005045, "balance_loss_clip": 0.95703793, "balance_loss_mlp": 0.99937099, "epoch": 0.1662959177538628, "flos": 66080877350400.0, "grad_norm": 0.8992457634718918, "language_loss": 0.56302243, "learning_rate": 3.808405085643826e-06, "loss": 0.58430248, "num_input_tokens_seen": 29219770, "step": 1383, "time_per_iteration": 3.3395376205444336 }, { "auxiliary_loss_clip": 0.01227445, "auxiliary_loss_mlp": 0.01127435, "balance_loss_clip": 1.06630993, "balance_loss_mlp": 0.0, "epoch": 0.1664161606445019, "flos": 20740567357440.0, "grad_norm": 2.0435643369311527, "language_loss": 0.88899821, "learning_rate": 3.8080722466384925e-06, "loss": 0.91254699, "num_input_tokens_seen": 29237620, "step": 1384, "time_per_iteration": 2.6390233039855957 }, { "auxiliary_loss_clip": 0.01228525, "auxiliary_loss_mlp": 0.01036062, "balance_loss_clip": 1.06410921, "balance_loss_mlp": 1.02556539, "epoch": 0.166536403535141, "flos": 25260930236160.0, "grad_norm": 2.4569812001780384, "language_loss": 0.71038187, "learning_rate": 3.8077391333500376e-06, "loss": 0.73302776, "num_input_tokens_seen": 29256760, "step": 1385, "time_per_iteration": 2.8038716316223145 }, { "auxiliary_loss_clip": 0.01224152, "auxiliary_loss_mlp": 0.01039711, "balance_loss_clip": 0.99114114, "balance_loss_mlp": 1.02941775, "epoch": 0.16665664642578007, "flos": 25447450584960.0, "grad_norm": 1.7557905236740188, "language_loss": 0.76695698, "learning_rate": 3.8074057458289934e-06, "loss": 0.7895956, "num_input_tokens_seen": 29277450, "step": 1386, "time_per_iteration": 2.740044355392456 }, { "auxiliary_loss_clip": 0.01219334, "auxiliary_loss_mlp": 0.01043097, "balance_loss_clip": 0.98405993, "balance_loss_mlp": 1.03223753, "epoch": 0.16677688931641918, "flos": 22200767043840.0, "grad_norm": 4.1016823539503795, "language_loss": 0.82265735, "learning_rate": 3.807072084125934e-06, "loss": 0.84528172, "num_input_tokens_seen": 29299300, "step": 1387, "time_per_iteration": 2.716252565383911 }, { "auxiliary_loss_clip": 0.01219009, "auxiliary_loss_mlp": 0.01033813, "balance_loss_clip": 0.98645955, "balance_loss_mlp": 1.02305412, "epoch": 0.16689713220705826, "flos": 16945958776320.0, "grad_norm": 2.8175195925449668, "language_loss": 0.80542082, "learning_rate": 3.806738148291477e-06, "loss": 0.82794905, "num_input_tokens_seen": 29316125, "step": 1388, "time_per_iteration": 2.6236255168914795 }, { "auxiliary_loss_clip": 0.0121982, "auxiliary_loss_mlp": 0.01046938, "balance_loss_clip": 0.87028003, "balance_loss_mlp": 1.03521991, "epoch": 0.16701737509769735, "flos": 36244423923840.0, "grad_norm": 1.798502638426506, "language_loss": 0.70991135, "learning_rate": 3.8064039383762793e-06, "loss": 0.73257887, "num_input_tokens_seen": 29338490, "step": 1389, "time_per_iteration": 2.906285285949707 }, { "auxiliary_loss_clip": 0.01224745, "auxiliary_loss_mlp": 0.01039208, "balance_loss_clip": 1.02675486, "balance_loss_mlp": 1.0287118, "epoch": 0.16713761798833643, "flos": 23258659426560.0, "grad_norm": 2.0975260257895356, "language_loss": 0.77091873, "learning_rate": 3.8060694544310396e-06, "loss": 0.79355824, "num_input_tokens_seen": 29357000, "step": 1390, "time_per_iteration": 2.6673362255096436 }, { "auxiliary_loss_clip": 0.01227673, "auxiliary_loss_mlp": 0.01033045, "balance_loss_clip": 1.06493688, "balance_loss_mlp": 1.02207136, "epoch": 0.16725786087897554, "flos": 25302515207040.0, "grad_norm": 1.776664923183026, "language_loss": 0.78621006, "learning_rate": 3.8057346965065006e-06, "loss": 0.80881727, "num_input_tokens_seen": 29378230, "step": 1391, "time_per_iteration": 2.6764965057373047 }, { "auxiliary_loss_clip": 0.01224585, "auxiliary_loss_mlp": 0.0104221, "balance_loss_clip": 0.99047172, "balance_loss_mlp": 1.03187442, "epoch": 0.16737810376961462, "flos": 31831541516160.0, "grad_norm": 1.6837019310751036, "language_loss": 0.84370041, "learning_rate": 3.805399664653443e-06, "loss": 0.86636841, "num_input_tokens_seen": 29400370, "step": 1392, "time_per_iteration": 2.8374216556549072 }, { "auxiliary_loss_clip": 0.01228146, "auxiliary_loss_mlp": 0.01037796, "balance_loss_clip": 1.06533611, "balance_loss_mlp": 1.02704978, "epoch": 0.1674983466602537, "flos": 27961843553280.0, "grad_norm": 2.5018179017248956, "language_loss": 0.74412251, "learning_rate": 3.805064358922692e-06, "loss": 0.76678187, "num_input_tokens_seen": 29418660, "step": 1393, "time_per_iteration": 2.6492671966552734 }, { "auxiliary_loss_clip": 0.01229225, "auxiliary_loss_mlp": 0.01034395, "balance_loss_clip": 1.02778816, "balance_loss_mlp": 1.02325475, "epoch": 0.16761858955089282, "flos": 21762656858880.0, "grad_norm": 1.9797017044431426, "language_loss": 0.81408584, "learning_rate": 3.8047287793651136e-06, "loss": 0.83672208, "num_input_tokens_seen": 29440105, "step": 1394, "time_per_iteration": 2.693225860595703 }, { "auxiliary_loss_clip": 0.01225123, "auxiliary_loss_mlp": 0.01037295, "balance_loss_clip": 0.94912809, "balance_loss_mlp": 1.0264585, "epoch": 0.1677388324415319, "flos": 23805507058560.0, "grad_norm": 1.7734286529062135, "language_loss": 0.88750601, "learning_rate": 3.8043929260316137e-06, "loss": 0.91013026, "num_input_tokens_seen": 29458260, "step": 1395, "time_per_iteration": 2.736147880554199 }, { "auxiliary_loss_clip": 0.01229539, "auxiliary_loss_mlp": 0.01041691, "balance_loss_clip": 0.99400502, "balance_loss_mlp": 1.03093255, "epoch": 0.16785907533217098, "flos": 20558859431040.0, "grad_norm": 7.766669431293974, "language_loss": 0.83721411, "learning_rate": 3.8040567989731417e-06, "loss": 0.8599264, "num_input_tokens_seen": 29476205, "step": 1396, "time_per_iteration": 2.742990732192993 }, { "auxiliary_loss_clip": 0.01223729, "auxiliary_loss_mlp": 0.01037629, "balance_loss_clip": 1.02866483, "balance_loss_mlp": 1.02767539, "epoch": 0.16797931822281006, "flos": 15669657745920.0, "grad_norm": 2.1553775885827746, "language_loss": 0.79986185, "learning_rate": 3.8037203982406876e-06, "loss": 0.82247543, "num_input_tokens_seen": 29494370, "step": 1397, "time_per_iteration": 3.617685556411743 }, { "auxiliary_loss_clip": 0.01229307, "auxiliary_loss_mlp": 0.01039414, "balance_loss_clip": 1.06893492, "balance_loss_mlp": 1.02853632, "epoch": 0.16809956111344918, "flos": 16541101607040.0, "grad_norm": 2.0895104372728617, "language_loss": 0.73370445, "learning_rate": 3.8033837238852835e-06, "loss": 0.75639164, "num_input_tokens_seen": 29511070, "step": 1398, "time_per_iteration": 2.6557905673980713 }, { "auxiliary_loss_clip": 0.01207045, "auxiliary_loss_mlp": 0.01035199, "balance_loss_clip": 0.98216736, "balance_loss_mlp": 1.02466071, "epoch": 0.16821980400408826, "flos": 23258084808960.0, "grad_norm": 1.8377164569775868, "language_loss": 0.69574606, "learning_rate": 3.8030467759580017e-06, "loss": 0.7181685, "num_input_tokens_seen": 29531990, "step": 1399, "time_per_iteration": 3.658759355545044 }, { "auxiliary_loss_clip": 0.01225701, "auxiliary_loss_mlp": 0.01040243, "balance_loss_clip": 1.02540946, "balance_loss_mlp": 1.02945495, "epoch": 0.16834004689472734, "flos": 20774754126720.0, "grad_norm": 2.8639585916017336, "language_loss": 0.86749262, "learning_rate": 3.802709554509958e-06, "loss": 0.8901521, "num_input_tokens_seen": 29549790, "step": 1400, "time_per_iteration": 3.5319089889526367 }, { "auxiliary_loss_clip": 0.01221594, "auxiliary_loss_mlp": 0.01035502, "balance_loss_clip": 0.98632061, "balance_loss_mlp": 1.02514303, "epoch": 0.16846028978536645, "flos": 26687302289280.0, "grad_norm": 1.681372126761196, "language_loss": 0.79231596, "learning_rate": 3.8023720595923083e-06, "loss": 0.81488687, "num_input_tokens_seen": 29569045, "step": 1401, "time_per_iteration": 3.6385819911956787 }, { "auxiliary_loss_clip": 0.01213839, "auxiliary_loss_mlp": 0.01037992, "balance_loss_clip": 0.90654558, "balance_loss_mlp": 1.02702427, "epoch": 0.16858053267600553, "flos": 18843298980480.0, "grad_norm": 2.0121434601356953, "language_loss": 0.87355089, "learning_rate": 3.80203429125625e-06, "loss": 0.89606923, "num_input_tokens_seen": 29587220, "step": 1402, "time_per_iteration": 2.7745559215545654 }, { "auxiliary_loss_clip": 0.01205478, "auxiliary_loss_mlp": 0.01043594, "balance_loss_clip": 0.86716104, "balance_loss_mlp": 1.03319907, "epoch": 0.16870077556664462, "flos": 27744548227200.0, "grad_norm": 1.9455614026348038, "language_loss": 0.69944549, "learning_rate": 3.8016962495530225e-06, "loss": 0.72193623, "num_input_tokens_seen": 29606410, "step": 1403, "time_per_iteration": 2.8470160961151123 }, { "auxiliary_loss_clip": 0.01228875, "auxiliary_loss_mlp": 0.01041765, "balance_loss_clip": 1.06506097, "balance_loss_mlp": 1.03136945, "epoch": 0.1688210184572837, "flos": 13730768484480.0, "grad_norm": 2.2205599918155774, "language_loss": 0.76634657, "learning_rate": 3.8013579345339063e-06, "loss": 0.78905296, "num_input_tokens_seen": 29621275, "step": 1404, "time_per_iteration": 2.67893385887146 }, { "auxiliary_loss_clip": 0.0122253, "auxiliary_loss_mlp": 0.01039543, "balance_loss_clip": 0.94775534, "balance_loss_mlp": 1.02843308, "epoch": 0.1689412613479228, "flos": 26468785900800.0, "grad_norm": 1.8387313942729169, "language_loss": 0.69350815, "learning_rate": 3.801019346250224e-06, "loss": 0.71612883, "num_input_tokens_seen": 29641420, "step": 1405, "time_per_iteration": 2.748471260070801 }, { "auxiliary_loss_clip": 0.01219863, "auxiliary_loss_mlp": 0.01034348, "balance_loss_clip": 1.02232909, "balance_loss_mlp": 1.02412021, "epoch": 0.1690615042385619, "flos": 21138852337920.0, "grad_norm": 2.884316499328902, "language_loss": 0.83709502, "learning_rate": 3.8006804847533395e-06, "loss": 0.85963714, "num_input_tokens_seen": 29660935, "step": 1406, "time_per_iteration": 2.692992925643921 }, { "auxiliary_loss_clip": 0.01224016, "auxiliary_loss_mlp": 0.01029904, "balance_loss_clip": 1.06363988, "balance_loss_mlp": 1.01888287, "epoch": 0.16918174712920098, "flos": 20849340718080.0, "grad_norm": 2.0953811280879386, "language_loss": 0.85594684, "learning_rate": 3.8003413500946556e-06, "loss": 0.87848604, "num_input_tokens_seen": 29681045, "step": 1407, "time_per_iteration": 2.7124314308166504 }, { "auxiliary_loss_clip": 0.01223207, "auxiliary_loss_mlp": 0.01035138, "balance_loss_clip": 0.98820651, "balance_loss_mlp": 1.02492142, "epoch": 0.1693019900198401, "flos": 16983270028800.0, "grad_norm": 2.5864997352545025, "language_loss": 0.82441032, "learning_rate": 3.8000019423256216e-06, "loss": 0.84699374, "num_input_tokens_seen": 29698810, "step": 1408, "time_per_iteration": 2.71551775932312 }, { "auxiliary_loss_clip": 0.01210308, "auxiliary_loss_mlp": 0.01036398, "balance_loss_clip": 0.98534179, "balance_loss_mlp": 1.02635503, "epoch": 0.16942223291047917, "flos": 26796901662720.0, "grad_norm": 1.681194681543702, "language_loss": 0.8802464, "learning_rate": 3.7996622614977234e-06, "loss": 0.90271354, "num_input_tokens_seen": 29720000, "step": 1409, "time_per_iteration": 2.841222047805786 }, { "auxiliary_loss_clip": 0.01221746, "auxiliary_loss_mlp": 0.01038799, "balance_loss_clip": 0.98788512, "balance_loss_mlp": 1.02886307, "epoch": 0.16954247580111825, "flos": 18583700411520.0, "grad_norm": 1.837032885615014, "language_loss": 0.78943896, "learning_rate": 3.799322307662492e-06, "loss": 0.81204438, "num_input_tokens_seen": 29737820, "step": 1410, "time_per_iteration": 2.8021624088287354 }, { "auxiliary_loss_clip": 0.01219408, "auxiliary_loss_mlp": 0.01038313, "balance_loss_clip": 0.90867913, "balance_loss_mlp": 1.0276494, "epoch": 0.16966271869175734, "flos": 13983651210240.0, "grad_norm": 2.150768201457444, "language_loss": 0.83812207, "learning_rate": 3.798982080871496e-06, "loss": 0.86069924, "num_input_tokens_seen": 29752960, "step": 1411, "time_per_iteration": 2.859306573867798 }, { "auxiliary_loss_clip": 0.01226523, "auxiliary_loss_mlp": 0.01035513, "balance_loss_clip": 1.06400299, "balance_loss_mlp": 1.02437842, "epoch": 0.16978296158239645, "flos": 37487328284160.0, "grad_norm": 2.087771127961732, "language_loss": 0.6790216, "learning_rate": 3.798641581176349e-06, "loss": 0.70164198, "num_input_tokens_seen": 29775240, "step": 1412, "time_per_iteration": 2.7873575687408447 }, { "auxiliary_loss_clip": 0.01219608, "auxiliary_loss_mlp": 0.01036776, "balance_loss_clip": 0.98448402, "balance_loss_mlp": 1.02704298, "epoch": 0.16990320447303553, "flos": 28328958506880.0, "grad_norm": 2.0981778768712376, "language_loss": 0.74330831, "learning_rate": 3.7983008086287044e-06, "loss": 0.76587212, "num_input_tokens_seen": 29796560, "step": 1413, "time_per_iteration": 2.756704092025757 }, { "auxiliary_loss_clip": 0.01221089, "auxiliary_loss_mlp": 0.01038015, "balance_loss_clip": 0.98519796, "balance_loss_mlp": 1.02701807, "epoch": 0.1700234473636746, "flos": 20188189031040.0, "grad_norm": 2.3718744279033497, "language_loss": 0.79132533, "learning_rate": 3.797959763280257e-06, "loss": 0.81391633, "num_input_tokens_seen": 29815245, "step": 1414, "time_per_iteration": 2.673025131225586 }, { "auxiliary_loss_clip": 0.01222162, "auxiliary_loss_mlp": 0.01037837, "balance_loss_clip": 1.02343059, "balance_loss_mlp": 1.02697742, "epoch": 0.17014369025431372, "flos": 24858658846080.0, "grad_norm": 2.3027528130106094, "language_loss": 0.79067552, "learning_rate": 3.797618445182743e-06, "loss": 0.81327552, "num_input_tokens_seen": 29836640, "step": 1415, "time_per_iteration": 2.7187867164611816 }, { "auxiliary_loss_clip": 0.01210713, "auxiliary_loss_mlp": 0.01035264, "balance_loss_clip": 0.90602911, "balance_loss_mlp": 1.0247916, "epoch": 0.1702639331449528, "flos": 16467233287680.0, "grad_norm": 2.053386804575927, "language_loss": 0.84633803, "learning_rate": 3.79727685438794e-06, "loss": 0.86879784, "num_input_tokens_seen": 29850830, "step": 1416, "time_per_iteration": 2.715710163116455 }, { "auxiliary_loss_clip": 0.0114089, "auxiliary_loss_mlp": 0.01011475, "balance_loss_clip": 1.01007271, "balance_loss_mlp": 1.00613427, "epoch": 0.1703841760355919, "flos": 52508870979840.0, "grad_norm": 0.8416153502834665, "language_loss": 0.61684084, "learning_rate": 3.796934990947667e-06, "loss": 0.63836449, "num_input_tokens_seen": 29912515, "step": 1417, "time_per_iteration": 3.229262351989746 }, { "auxiliary_loss_clip": 0.01138664, "auxiliary_loss_mlp": 0.01011608, "balance_loss_clip": 1.00827503, "balance_loss_mlp": 1.00605321, "epoch": 0.170504418926231, "flos": 49370637576960.0, "grad_norm": 0.8918029131732983, "language_loss": 0.6243782, "learning_rate": 3.7965928549137854e-06, "loss": 0.64588094, "num_input_tokens_seen": 29969330, "step": 1418, "time_per_iteration": 3.2210707664489746 }, { "auxiliary_loss_clip": 0.01222835, "auxiliary_loss_mlp": 0.01035134, "balance_loss_clip": 0.94273901, "balance_loss_mlp": 1.0238626, "epoch": 0.17062466181687008, "flos": 25849219184640.0, "grad_norm": 2.0643910118475084, "language_loss": 0.77508569, "learning_rate": 3.7962504463381953e-06, "loss": 0.79766536, "num_input_tokens_seen": 29990820, "step": 1419, "time_per_iteration": 2.7932238578796387 }, { "auxiliary_loss_clip": 0.01212443, "auxiliary_loss_mlp": 0.01127552, "balance_loss_clip": 0.98477125, "balance_loss_mlp": 0.0, "epoch": 0.17074490470750917, "flos": 20960412549120.0, "grad_norm": 1.944521141302535, "language_loss": 0.7881012, "learning_rate": 3.7959077652728412e-06, "loss": 0.81150115, "num_input_tokens_seen": 30009275, "step": 1420, "time_per_iteration": 2.748434066772461 }, { "auxiliary_loss_clip": 0.01219857, "auxiliary_loss_mlp": 0.0103196, "balance_loss_clip": 0.98420703, "balance_loss_mlp": 1.02137387, "epoch": 0.17086514759814825, "flos": 20959766104320.0, "grad_norm": 2.092164516328352, "language_loss": 0.77020848, "learning_rate": 3.795564811769707e-06, "loss": 0.79272658, "num_input_tokens_seen": 30027630, "step": 1421, "time_per_iteration": 2.7824485301971436 }, { "auxiliary_loss_clip": 0.01222839, "auxiliary_loss_mlp": 0.01036535, "balance_loss_clip": 0.99023968, "balance_loss_mlp": 1.02585435, "epoch": 0.17098539048878736, "flos": 28474073452800.0, "grad_norm": 2.067056227069238, "language_loss": 0.77856642, "learning_rate": 3.795221585880818e-06, "loss": 0.8011601, "num_input_tokens_seen": 30048310, "step": 1422, "time_per_iteration": 2.7680647373199463 }, { "auxiliary_loss_clip": 0.0122498, "auxiliary_loss_mlp": 0.01038115, "balance_loss_clip": 0.95340276, "balance_loss_mlp": 1.02818537, "epoch": 0.17110563337942644, "flos": 16290014561280.0, "grad_norm": 1.9066896491091432, "language_loss": 0.91176915, "learning_rate": 3.794878087658242e-06, "loss": 0.93440014, "num_input_tokens_seen": 30066080, "step": 1423, "time_per_iteration": 3.6416242122650146 }, { "auxiliary_loss_clip": 0.01223088, "auxiliary_loss_mlp": 0.01035425, "balance_loss_clip": 1.02316535, "balance_loss_mlp": 1.02516735, "epoch": 0.17122587627006552, "flos": 29674207693440.0, "grad_norm": 2.1432245970277415, "language_loss": 0.78536534, "learning_rate": 3.7945343171540873e-06, "loss": 0.8079505, "num_input_tokens_seen": 30086955, "step": 1424, "time_per_iteration": 2.7379777431488037 }, { "auxiliary_loss_clip": 0.01225245, "auxiliary_loss_mlp": 0.01034919, "balance_loss_clip": 1.06324077, "balance_loss_mlp": 1.0246675, "epoch": 0.17134611916070464, "flos": 25338389915520.0, "grad_norm": 2.2705769708417174, "language_loss": 0.78939581, "learning_rate": 3.7941902744205033e-06, "loss": 0.81199747, "num_input_tokens_seen": 30107990, "step": 1425, "time_per_iteration": 3.598726272583008 }, { "auxiliary_loss_clip": 0.01225769, "auxiliary_loss_mlp": 0.01030031, "balance_loss_clip": 0.98571759, "balance_loss_mlp": 1.01930773, "epoch": 0.17146636205134372, "flos": 13953845900160.0, "grad_norm": 1.8564345679787464, "language_loss": 0.8358475, "learning_rate": 3.7938459595096817e-06, "loss": 0.85840547, "num_input_tokens_seen": 30126535, "step": 1426, "time_per_iteration": 3.6797492504119873 }, { "auxiliary_loss_clip": 0.01236223, "auxiliary_loss_mlp": 0.01039046, "balance_loss_clip": 1.02860975, "balance_loss_mlp": 1.02751231, "epoch": 0.1715866049419828, "flos": 23915214172800.0, "grad_norm": 1.7971457604788694, "language_loss": 0.861, "learning_rate": 3.7935013724738545e-06, "loss": 0.8837527, "num_input_tokens_seen": 30147035, "step": 1427, "time_per_iteration": 3.5231099128723145 }, { "auxiliary_loss_clip": 0.01220904, "auxiliary_loss_mlp": 0.01034387, "balance_loss_clip": 1.0261941, "balance_loss_mlp": 1.02445138, "epoch": 0.17170684783262188, "flos": 22709369669760.0, "grad_norm": 2.1390322947868254, "language_loss": 0.7785362, "learning_rate": 3.7931565133652945e-06, "loss": 0.80108905, "num_input_tokens_seen": 30167110, "step": 1428, "time_per_iteration": 2.6752405166625977 }, { "auxiliary_loss_clip": 0.01224038, "auxiliary_loss_mlp": 0.01047865, "balance_loss_clip": 1.0625844, "balance_loss_mlp": 1.03702331, "epoch": 0.171827090723261, "flos": 26613290315520.0, "grad_norm": 2.535277736022648, "language_loss": 0.68395144, "learning_rate": 3.792811382236317e-06, "loss": 0.70667052, "num_input_tokens_seen": 30185620, "step": 1429, "time_per_iteration": 2.6358935832977295 }, { "auxiliary_loss_clip": 0.01225524, "auxiliary_loss_mlp": 0.0104099, "balance_loss_clip": 1.02370477, "balance_loss_mlp": 1.03040457, "epoch": 0.17194733361390008, "flos": 28148507556480.0, "grad_norm": 1.8815318779950085, "language_loss": 0.77985179, "learning_rate": 3.792465979139279e-06, "loss": 0.80251694, "num_input_tokens_seen": 30208225, "step": 1430, "time_per_iteration": 2.7385568618774414 }, { "auxiliary_loss_clip": 0.01132525, "auxiliary_loss_mlp": 0.01007194, "balance_loss_clip": 0.9278121, "balance_loss_mlp": 1.00204372, "epoch": 0.17206757650453916, "flos": 65530689753600.0, "grad_norm": 0.9276122320385036, "language_loss": 0.65700316, "learning_rate": 3.792120304126576e-06, "loss": 0.6784004, "num_input_tokens_seen": 30271600, "step": 1431, "time_per_iteration": 3.374321460723877 }, { "auxiliary_loss_clip": 0.01214825, "auxiliary_loss_mlp": 0.01032564, "balance_loss_clip": 0.83143914, "balance_loss_mlp": 1.0219965, "epoch": 0.17218781939517827, "flos": 22273486128000.0, "grad_norm": 1.9740856744441453, "language_loss": 0.83406961, "learning_rate": 3.791774357250649e-06, "loss": 0.85654342, "num_input_tokens_seen": 30290430, "step": 1432, "time_per_iteration": 2.8212292194366455 }, { "auxiliary_loss_clip": 0.0121913, "auxiliary_loss_mlp": 0.01032208, "balance_loss_clip": 0.98576272, "balance_loss_mlp": 1.0216397, "epoch": 0.17230806228581735, "flos": 14137313592960.0, "grad_norm": 2.4647538929290636, "language_loss": 0.79289162, "learning_rate": 3.7914281385639757e-06, "loss": 0.81540495, "num_input_tokens_seen": 30308305, "step": 1433, "time_per_iteration": 2.708639144897461 }, { "auxiliary_loss_clip": 0.01223893, "auxiliary_loss_mlp": 0.01037109, "balance_loss_clip": 1.02357006, "balance_loss_mlp": 1.02670217, "epoch": 0.17242830517645644, "flos": 20704836303360.0, "grad_norm": 2.1064421227557695, "language_loss": 0.79479039, "learning_rate": 3.7910816481190784e-06, "loss": 0.8174004, "num_input_tokens_seen": 30328120, "step": 1434, "time_per_iteration": 2.677584648132324 }, { "auxiliary_loss_clip": 0.01211946, "auxiliary_loss_mlp": 0.01033979, "balance_loss_clip": 0.98457658, "balance_loss_mlp": 1.02300584, "epoch": 0.17254854806709552, "flos": 30774582887040.0, "grad_norm": 2.0285180806378733, "language_loss": 0.74936575, "learning_rate": 3.7907348859685193e-06, "loss": 0.77182502, "num_input_tokens_seen": 30349825, "step": 1435, "time_per_iteration": 2.797846555709839 }, { "auxiliary_loss_clip": 0.0121676, "auxiliary_loss_mlp": 0.01035603, "balance_loss_clip": 1.02391648, "balance_loss_mlp": 1.02586973, "epoch": 0.17266879095773463, "flos": 26614726859520.0, "grad_norm": 2.455526695203486, "language_loss": 0.8048197, "learning_rate": 3.790387852164902e-06, "loss": 0.82734334, "num_input_tokens_seen": 30370555, "step": 1436, "time_per_iteration": 2.7418124675750732 }, { "auxiliary_loss_clip": 0.0122557, "auxiliary_loss_mlp": 0.01032638, "balance_loss_clip": 1.02744341, "balance_loss_mlp": 1.02227235, "epoch": 0.1727890338483737, "flos": 20266295155200.0, "grad_norm": 1.7730763965617695, "language_loss": 0.76601046, "learning_rate": 3.7900405467608707e-06, "loss": 0.78859252, "num_input_tokens_seen": 30390100, "step": 1437, "time_per_iteration": 2.719839096069336 }, { "auxiliary_loss_clip": 0.01198895, "auxiliary_loss_mlp": 0.01042012, "balance_loss_clip": 0.90250993, "balance_loss_mlp": 1.03097951, "epoch": 0.1729092767390128, "flos": 18179812909440.0, "grad_norm": 3.361009358540471, "language_loss": 0.79573703, "learning_rate": 3.7896929698091114e-06, "loss": 0.81814611, "num_input_tokens_seen": 30402915, "step": 1438, "time_per_iteration": 2.752800226211548 }, { "auxiliary_loss_clip": 0.0123247, "auxiliary_loss_mlp": 0.01045398, "balance_loss_clip": 1.06925666, "balance_loss_mlp": 1.0346688, "epoch": 0.1730295196296519, "flos": 26759518583040.0, "grad_norm": 5.903064739348749, "language_loss": 0.67813778, "learning_rate": 3.7893451213623518e-06, "loss": 0.70091641, "num_input_tokens_seen": 30420145, "step": 1439, "time_per_iteration": 2.670522451400757 }, { "auxiliary_loss_clip": 0.01223805, "auxiliary_loss_mlp": 0.01127861, "balance_loss_clip": 1.02540171, "balance_loss_mlp": 0.0, "epoch": 0.173149762520291, "flos": 23842531002240.0, "grad_norm": 7.879289033474152, "language_loss": 0.82549846, "learning_rate": 3.7889970014733606e-06, "loss": 0.84901512, "num_input_tokens_seen": 30439250, "step": 1440, "time_per_iteration": 2.6995689868927 }, { "auxiliary_loss_clip": 0.01198215, "auxiliary_loss_mlp": 0.01030073, "balance_loss_clip": 0.90463096, "balance_loss_mlp": 1.01962459, "epoch": 0.17327000541093007, "flos": 23368186972800.0, "grad_norm": 1.8203823364589864, "language_loss": 0.78035343, "learning_rate": 3.7886486101949463e-06, "loss": 0.80263633, "num_input_tokens_seen": 30460430, "step": 1441, "time_per_iteration": 2.9254112243652344 }, { "auxiliary_loss_clip": 0.01204596, "auxiliary_loss_mlp": 0.01034348, "balance_loss_clip": 0.90547287, "balance_loss_mlp": 1.02394676, "epoch": 0.17339024830156918, "flos": 18221290139520.0, "grad_norm": 2.1100716055866187, "language_loss": 0.87833846, "learning_rate": 3.7882999475799594e-06, "loss": 0.90072787, "num_input_tokens_seen": 30478465, "step": 1442, "time_per_iteration": 2.840306282043457 }, { "auxiliary_loss_clip": 0.01193508, "auxiliary_loss_mlp": 0.01036431, "balance_loss_clip": 0.90589321, "balance_loss_mlp": 1.02577996, "epoch": 0.17351049119220827, "flos": 23332024955520.0, "grad_norm": 1.7474624998805541, "language_loss": 0.81544966, "learning_rate": 3.787951013681293e-06, "loss": 0.837749, "num_input_tokens_seen": 30496510, "step": 1443, "time_per_iteration": 2.7381224632263184 }, { "auxiliary_loss_clip": 0.01219422, "auxiliary_loss_mlp": 0.01038456, "balance_loss_clip": 1.02237535, "balance_loss_mlp": 1.02707791, "epoch": 0.17363073408284735, "flos": 23803495896960.0, "grad_norm": 2.1350964011453626, "language_loss": 0.77222621, "learning_rate": 3.787601808551879e-06, "loss": 0.79480499, "num_input_tokens_seen": 30516325, "step": 1444, "time_per_iteration": 2.701591968536377 }, { "auxiliary_loss_clip": 0.01222039, "auxiliary_loss_mlp": 0.01037585, "balance_loss_clip": 0.94652188, "balance_loss_mlp": 1.02719581, "epoch": 0.17375097697348643, "flos": 18515290959360.0, "grad_norm": 4.78103050179962, "language_loss": 0.84084523, "learning_rate": 3.7872523322446926e-06, "loss": 0.86344147, "num_input_tokens_seen": 30535210, "step": 1445, "time_per_iteration": 2.695833683013916 }, { "auxiliary_loss_clip": 0.0121929, "auxiliary_loss_mlp": 0.01034887, "balance_loss_clip": 0.90387845, "balance_loss_mlp": 1.02377701, "epoch": 0.17387121986412554, "flos": 38877897456000.0, "grad_norm": 2.8207490491861744, "language_loss": 0.60272598, "learning_rate": 3.7869025848127478e-06, "loss": 0.62526774, "num_input_tokens_seen": 30559405, "step": 1446, "time_per_iteration": 2.9511208534240723 }, { "auxiliary_loss_clip": 0.01224152, "auxiliary_loss_mlp": 0.01032407, "balance_loss_clip": 1.02349448, "balance_loss_mlp": 1.02213728, "epoch": 0.17399146275476463, "flos": 20375714960640.0, "grad_norm": 3.090764806019778, "language_loss": 0.80618596, "learning_rate": 3.786552566309102e-06, "loss": 0.82875156, "num_input_tokens_seen": 30577615, "step": 1447, "time_per_iteration": 2.629279136657715 }, { "auxiliary_loss_clip": 0.01220906, "auxiliary_loss_mlp": 0.01127829, "balance_loss_clip": 0.98961568, "balance_loss_mlp": 0.0, "epoch": 0.1741117056454037, "flos": 19164339763200.0, "grad_norm": 2.4281657965126966, "language_loss": 0.86403108, "learning_rate": 3.7862022767868517e-06, "loss": 0.88751841, "num_input_tokens_seen": 30595205, "step": 1448, "time_per_iteration": 2.746969223022461 }, { "auxiliary_loss_clip": 0.01213799, "auxiliary_loss_mlp": 0.01034059, "balance_loss_clip": 0.94824809, "balance_loss_mlp": 1.02363968, "epoch": 0.17423194853604282, "flos": 25374300537600.0, "grad_norm": 2.374435087705813, "language_loss": 0.84289992, "learning_rate": 3.7858517162991367e-06, "loss": 0.86537856, "num_input_tokens_seen": 30615280, "step": 1449, "time_per_iteration": 3.6277964115142822 }, { "auxiliary_loss_clip": 0.01217878, "auxiliary_loss_mlp": 0.01028829, "balance_loss_clip": 0.9450388, "balance_loss_mlp": 1.01895273, "epoch": 0.1743521914266819, "flos": 25191874339200.0, "grad_norm": 2.8446955026426077, "language_loss": 0.60981584, "learning_rate": 3.7855008848991363e-06, "loss": 0.63228291, "num_input_tokens_seen": 30633485, "step": 1450, "time_per_iteration": 2.8369107246398926 }, { "auxiliary_loss_clip": 0.01217447, "auxiliary_loss_mlp": 0.01036939, "balance_loss_clip": 0.98774302, "balance_loss_mlp": 1.02685964, "epoch": 0.17447243431732098, "flos": 25666577504640.0, "grad_norm": 1.9727491274171631, "language_loss": 0.77308643, "learning_rate": 3.7851497826400714e-06, "loss": 0.79563034, "num_input_tokens_seen": 30653625, "step": 1451, "time_per_iteration": 3.6749515533447266 }, { "auxiliary_loss_clip": 0.01225283, "auxiliary_loss_mlp": 0.01038605, "balance_loss_clip": 1.06507099, "balance_loss_mlp": 1.02823961, "epoch": 0.17459267720796007, "flos": 36281950657920.0, "grad_norm": 1.9517310046485834, "language_loss": 0.75805938, "learning_rate": 3.7847984095752034e-06, "loss": 0.78069824, "num_input_tokens_seen": 30677080, "step": 1452, "time_per_iteration": 3.740875244140625 }, { "auxiliary_loss_clip": 0.01222039, "auxiliary_loss_mlp": 0.01032745, "balance_loss_clip": 1.06206155, "balance_loss_mlp": 1.02236211, "epoch": 0.17471292009859918, "flos": 20011113959040.0, "grad_norm": 1.8175477254344568, "language_loss": 0.80034226, "learning_rate": 3.784446765757836e-06, "loss": 0.82289016, "num_input_tokens_seen": 30695725, "step": 1453, "time_per_iteration": 3.491454601287842 }, { "auxiliary_loss_clip": 0.01202106, "auxiliary_loss_mlp": 0.01035981, "balance_loss_clip": 0.9456566, "balance_loss_mlp": 1.024912, "epoch": 0.17483316298923826, "flos": 27819242559360.0, "grad_norm": 2.015124481921483, "language_loss": 0.77568877, "learning_rate": 3.7840948512413133e-06, "loss": 0.79806966, "num_input_tokens_seen": 30713310, "step": 1454, "time_per_iteration": 2.781083106994629 }, { "auxiliary_loss_clip": 0.01213533, "auxiliary_loss_mlp": 0.01037368, "balance_loss_clip": 0.94746405, "balance_loss_mlp": 1.02711558, "epoch": 0.17495340587987734, "flos": 44017934791680.0, "grad_norm": 2.056856600559266, "language_loss": 0.78717828, "learning_rate": 3.7837426660790196e-06, "loss": 0.80968726, "num_input_tokens_seen": 30734725, "step": 1455, "time_per_iteration": 2.8924458026885986 }, { "auxiliary_loss_clip": 0.01222146, "auxiliary_loss_mlp": 0.01037282, "balance_loss_clip": 1.06398797, "balance_loss_mlp": 1.0274477, "epoch": 0.17507364877051645, "flos": 20885825957760.0, "grad_norm": 2.7636591031027273, "language_loss": 0.81827056, "learning_rate": 3.783390210324382e-06, "loss": 0.8408649, "num_input_tokens_seen": 30754450, "step": 1456, "time_per_iteration": 2.6560745239257812 }, { "auxiliary_loss_clip": 0.01216129, "auxiliary_loss_mlp": 0.01031452, "balance_loss_clip": 0.94886518, "balance_loss_mlp": 1.0216949, "epoch": 0.17519389166115554, "flos": 24717602136960.0, "grad_norm": 4.6161754787188265, "language_loss": 0.72690737, "learning_rate": 3.7830374840308676e-06, "loss": 0.74938321, "num_input_tokens_seen": 30774605, "step": 1457, "time_per_iteration": 2.749913454055786 }, { "auxiliary_loss_clip": 0.01223792, "auxiliary_loss_mlp": 0.01034744, "balance_loss_clip": 1.02537692, "balance_loss_mlp": 1.02348423, "epoch": 0.17531413455179462, "flos": 23798144770560.0, "grad_norm": 3.6989506202724725, "language_loss": 0.82227379, "learning_rate": 3.7826844872519842e-06, "loss": 0.84485912, "num_input_tokens_seen": 30792460, "step": 1458, "time_per_iteration": 2.651806116104126 }, { "auxiliary_loss_clip": 0.01217142, "auxiliary_loss_mlp": 0.01035558, "balance_loss_clip": 0.98851281, "balance_loss_mlp": 1.02538371, "epoch": 0.1754343774424337, "flos": 24572379450240.0, "grad_norm": 2.0506986021142066, "language_loss": 0.72846377, "learning_rate": 3.782331220041282e-06, "loss": 0.75099075, "num_input_tokens_seen": 30812525, "step": 1459, "time_per_iteration": 2.7674171924591064 }, { "auxiliary_loss_clip": 0.01224455, "auxiliary_loss_mlp": 0.01033015, "balance_loss_clip": 0.9448123, "balance_loss_mlp": 1.02281654, "epoch": 0.17555462033307281, "flos": 18114599767680.0, "grad_norm": 13.431746479044929, "language_loss": 0.82695889, "learning_rate": 3.7819776824523504e-06, "loss": 0.84953356, "num_input_tokens_seen": 30830390, "step": 1460, "time_per_iteration": 2.710502862930298 }, { "auxiliary_loss_clip": 0.01229523, "auxiliary_loss_mlp": 0.01038792, "balance_loss_clip": 0.98669916, "balance_loss_mlp": 1.02870655, "epoch": 0.1756748632237119, "flos": 28366018364160.0, "grad_norm": 2.1506485039769787, "language_loss": 0.83716607, "learning_rate": 3.7816238745388213e-06, "loss": 0.85984921, "num_input_tokens_seen": 30849935, "step": 1461, "time_per_iteration": 2.7023630142211914 }, { "auxiliary_loss_clip": 0.01222846, "auxiliary_loss_mlp": 0.01035832, "balance_loss_clip": 0.98319006, "balance_loss_mlp": 1.02544272, "epoch": 0.17579510611435098, "flos": 25732939881600.0, "grad_norm": 2.137331603303107, "language_loss": 0.87098789, "learning_rate": 3.781269796354367e-06, "loss": 0.89357471, "num_input_tokens_seen": 30869555, "step": 1462, "time_per_iteration": 2.9235737323760986 }, { "auxiliary_loss_clip": 0.01222182, "auxiliary_loss_mlp": 0.01040025, "balance_loss_clip": 0.98666543, "balance_loss_mlp": 1.0301125, "epoch": 0.1759153490049901, "flos": 18588081870720.0, "grad_norm": 1.6193729685143672, "language_loss": 0.85918391, "learning_rate": 3.7809154479527006e-06, "loss": 0.88180602, "num_input_tokens_seen": 30888760, "step": 1463, "time_per_iteration": 2.769636869430542 }, { "auxiliary_loss_clip": 0.01211185, "auxiliary_loss_mlp": 0.01030936, "balance_loss_clip": 0.94743443, "balance_loss_mlp": 1.02172756, "epoch": 0.17603559189562917, "flos": 18619323724800.0, "grad_norm": 2.70153445152724, "language_loss": 0.84565115, "learning_rate": 3.780560829387577e-06, "loss": 0.86807233, "num_input_tokens_seen": 30907260, "step": 1464, "time_per_iteration": 2.708341121673584 }, { "auxiliary_loss_clip": 0.01134925, "auxiliary_loss_mlp": 0.01011428, "balance_loss_clip": 1.0064249, "balance_loss_mlp": 1.00603998, "epoch": 0.17615583478626826, "flos": 60530775373440.0, "grad_norm": 0.8587121476491945, "language_loss": 0.57965505, "learning_rate": 3.7802059407127915e-06, "loss": 0.60111862, "num_input_tokens_seen": 30965810, "step": 1465, "time_per_iteration": 3.2143850326538086 }, { "auxiliary_loss_clip": 0.01207599, "auxiliary_loss_mlp": 0.01035522, "balance_loss_clip": 0.98059571, "balance_loss_mlp": 1.02549028, "epoch": 0.17627607767690734, "flos": 23616221362560.0, "grad_norm": 2.0968955516078895, "language_loss": 0.85881841, "learning_rate": 3.7798507819821797e-06, "loss": 0.88124961, "num_input_tokens_seen": 30982935, "step": 1466, "time_per_iteration": 2.7912309169769287 }, { "auxiliary_loss_clip": 0.01205146, "auxiliary_loss_mlp": 0.01043489, "balance_loss_clip": 0.94369686, "balance_loss_mlp": 1.03249764, "epoch": 0.17639632056754645, "flos": 17639070589440.0, "grad_norm": 2.4180660605041977, "language_loss": 0.78692746, "learning_rate": 3.7794953532496197e-06, "loss": 0.80941379, "num_input_tokens_seen": 30998840, "step": 1467, "time_per_iteration": 2.7149970531463623 }, { "auxiliary_loss_clip": 0.01123537, "auxiliary_loss_mlp": 0.0112276, "balance_loss_clip": 0.88474488, "balance_loss_mlp": 0.0, "epoch": 0.17651656345818553, "flos": 57932604910080.0, "grad_norm": 0.8550478581421991, "language_loss": 0.57954013, "learning_rate": 3.7791396545690295e-06, "loss": 0.6020031, "num_input_tokens_seen": 31060075, "step": 1468, "time_per_iteration": 3.2714014053344727 }, { "auxiliary_loss_clip": 0.01222783, "auxiliary_loss_mlp": 0.01039336, "balance_loss_clip": 1.0270437, "balance_loss_mlp": 1.02873266, "epoch": 0.17663680634882462, "flos": 22929502170240.0, "grad_norm": 2.4211283437834448, "language_loss": 0.80510849, "learning_rate": 3.7787836859943685e-06, "loss": 0.8277297, "num_input_tokens_seen": 31078800, "step": 1469, "time_per_iteration": 2.695596218109131 }, { "auxiliary_loss_clip": 0.01221849, "auxiliary_loss_mlp": 0.01032785, "balance_loss_clip": 1.02477026, "balance_loss_mlp": 1.02268159, "epoch": 0.17675704923946373, "flos": 22637979388800.0, "grad_norm": 2.775505922865807, "language_loss": 0.79015082, "learning_rate": 3.7784274475796363e-06, "loss": 0.81269717, "num_input_tokens_seen": 31097430, "step": 1470, "time_per_iteration": 2.6744048595428467 }, { "auxiliary_loss_clip": 0.01220712, "auxiliary_loss_mlp": 0.01033391, "balance_loss_clip": 0.9456771, "balance_loss_mlp": 1.02260852, "epoch": 0.1768772921301028, "flos": 27126525795840.0, "grad_norm": 2.3107660383670887, "language_loss": 0.76163167, "learning_rate": 3.7780709393788745e-06, "loss": 0.78417265, "num_input_tokens_seen": 31117905, "step": 1471, "time_per_iteration": 2.744274616241455 }, { "auxiliary_loss_clip": 0.0121873, "auxiliary_loss_mlp": 0.01035337, "balance_loss_clip": 1.05958641, "balance_loss_mlp": 1.02509737, "epoch": 0.1769975350207419, "flos": 19172133014400.0, "grad_norm": 2.057695970231504, "language_loss": 0.75304246, "learning_rate": 3.777714161446165e-06, "loss": 0.77558309, "num_input_tokens_seen": 31137610, "step": 1472, "time_per_iteration": 2.6612963676452637 }, { "auxiliary_loss_clip": 0.01221915, "auxiliary_loss_mlp": 0.01037543, "balance_loss_clip": 1.02382648, "balance_loss_mlp": 1.02725577, "epoch": 0.177117777911381, "flos": 36134932291200.0, "grad_norm": 2.171213910231569, "language_loss": 0.69185913, "learning_rate": 3.7773571138356304e-06, "loss": 0.71445376, "num_input_tokens_seen": 31157780, "step": 1473, "time_per_iteration": 2.8052754402160645 }, { "auxiliary_loss_clip": 0.01201665, "auxiliary_loss_mlp": 0.01035918, "balance_loss_clip": 0.90739989, "balance_loss_mlp": 1.02561867, "epoch": 0.17723802080202009, "flos": 22090593052800.0, "grad_norm": 2.4109842958315877, "language_loss": 0.89006567, "learning_rate": 3.776999796601435e-06, "loss": 0.91244149, "num_input_tokens_seen": 31176540, "step": 1474, "time_per_iteration": 2.750242233276367 }, { "auxiliary_loss_clip": 0.01225818, "auxiliary_loss_mlp": 0.0104128, "balance_loss_clip": 1.02313733, "balance_loss_mlp": 1.03058076, "epoch": 0.17735826369265917, "flos": 30222671437440.0, "grad_norm": 1.8543625112383113, "language_loss": 0.72908229, "learning_rate": 3.776642209797783e-06, "loss": 0.75175327, "num_input_tokens_seen": 31198370, "step": 1475, "time_per_iteration": 3.7008273601531982 }, { "auxiliary_loss_clip": 0.01210373, "auxiliary_loss_mlp": 0.01032158, "balance_loss_clip": 1.01915002, "balance_loss_mlp": 1.02169156, "epoch": 0.17747850658329825, "flos": 21397588980480.0, "grad_norm": 2.028217403949056, "language_loss": 0.78130054, "learning_rate": 3.7762843534789205e-06, "loss": 0.80372584, "num_input_tokens_seen": 31217120, "step": 1476, "time_per_iteration": 2.6800475120544434 }, { "auxiliary_loss_clip": 0.01225872, "auxiliary_loss_mlp": 0.01039986, "balance_loss_clip": 0.98548174, "balance_loss_mlp": 1.02899539, "epoch": 0.17759874947393736, "flos": 16983341856000.0, "grad_norm": 2.2519739709402056, "language_loss": 0.88363367, "learning_rate": 3.7759262276991343e-06, "loss": 0.90629232, "num_input_tokens_seen": 31234730, "step": 1477, "time_per_iteration": 3.6415069103240967 }, { "auxiliary_loss_clip": 0.01224132, "auxiliary_loss_mlp": 0.01037045, "balance_loss_clip": 0.98665273, "balance_loss_mlp": 1.02608418, "epoch": 0.17771899236457644, "flos": 11546107390080.0, "grad_norm": 2.050908469044794, "language_loss": 0.79641557, "learning_rate": 3.7755678325127506e-06, "loss": 0.8190273, "num_input_tokens_seen": 31252410, "step": 1478, "time_per_iteration": 3.678212881088257 }, { "auxiliary_loss_clip": 0.01210788, "auxiliary_loss_mlp": 0.01035807, "balance_loss_clip": 0.90773666, "balance_loss_mlp": 1.02556729, "epoch": 0.17783923525521553, "flos": 18807747494400.0, "grad_norm": 1.9421131197856327, "language_loss": 0.75572681, "learning_rate": 3.7752091679741393e-06, "loss": 0.7781927, "num_input_tokens_seen": 31270200, "step": 1479, "time_per_iteration": 3.739596128463745 }, { "auxiliary_loss_clip": 0.0121657, "auxiliary_loss_mlp": 0.0104007, "balance_loss_clip": 1.02227378, "balance_loss_mlp": 1.02948451, "epoch": 0.17795947814585464, "flos": 30408365773440.0, "grad_norm": 2.608871118558892, "language_loss": 0.76903439, "learning_rate": 3.774850234137708e-06, "loss": 0.79160082, "num_input_tokens_seen": 31287495, "step": 1480, "time_per_iteration": 2.7029147148132324 }, { "auxiliary_loss_clip": 0.01217324, "auxiliary_loss_mlp": 0.01041756, "balance_loss_clip": 1.02233553, "balance_loss_mlp": 1.03104472, "epoch": 0.17807972103649372, "flos": 24389055411840.0, "grad_norm": 2.307415233468777, "language_loss": 0.82724649, "learning_rate": 3.7744910310579076e-06, "loss": 0.8498373, "num_input_tokens_seen": 31306420, "step": 1481, "time_per_iteration": 2.7295899391174316 }, { "auxiliary_loss_clip": 0.01221522, "auxiliary_loss_mlp": 0.01040395, "balance_loss_clip": 1.06383181, "balance_loss_mlp": 1.0296303, "epoch": 0.1781999639271328, "flos": 20301559332480.0, "grad_norm": 2.291934710473129, "language_loss": 0.85357302, "learning_rate": 3.774131558789229e-06, "loss": 0.87619215, "num_input_tokens_seen": 31325750, "step": 1482, "time_per_iteration": 2.5948657989501953 }, { "auxiliary_loss_clip": 0.0122651, "auxiliary_loss_mlp": 0.01127598, "balance_loss_clip": 1.06600046, "balance_loss_mlp": 0.0, "epoch": 0.1783202068177719, "flos": 15924479806080.0, "grad_norm": 2.7742974612690796, "language_loss": 0.69078183, "learning_rate": 3.773771817386203e-06, "loss": 0.71432287, "num_input_tokens_seen": 31343080, "step": 1483, "time_per_iteration": 2.694758653640747 }, { "auxiliary_loss_clip": 0.01219038, "auxiliary_loss_mlp": 0.01040976, "balance_loss_clip": 0.98398608, "balance_loss_mlp": 1.0308969, "epoch": 0.178440449708411, "flos": 20631758083200.0, "grad_norm": 1.8380943301349253, "language_loss": 0.79336715, "learning_rate": 3.773411806903403e-06, "loss": 0.81596726, "num_input_tokens_seen": 31362160, "step": 1484, "time_per_iteration": 2.6771438121795654 }, { "auxiliary_loss_clip": 0.01218725, "auxiliary_loss_mlp": 0.01037472, "balance_loss_clip": 0.8697536, "balance_loss_mlp": 1.02664208, "epoch": 0.17856069259905008, "flos": 21686059105920.0, "grad_norm": 1.7278366285822488, "language_loss": 0.94333994, "learning_rate": 3.7730515273954415e-06, "loss": 0.96590191, "num_input_tokens_seen": 31380770, "step": 1485, "time_per_iteration": 2.801913022994995 }, { "auxiliary_loss_clip": 0.01224128, "auxiliary_loss_mlp": 0.01034823, "balance_loss_clip": 1.06434524, "balance_loss_mlp": 1.02402842, "epoch": 0.17868093548968916, "flos": 26572962320640.0, "grad_norm": 2.0034450701289854, "language_loss": 0.85261464, "learning_rate": 3.772690978916973e-06, "loss": 0.87520421, "num_input_tokens_seen": 31400525, "step": 1486, "time_per_iteration": 2.667943239212036 }, { "auxiliary_loss_clip": 0.01219563, "auxiliary_loss_mlp": 0.01039678, "balance_loss_clip": 1.02371109, "balance_loss_mlp": 1.02874637, "epoch": 0.17880117838032827, "flos": 18581006891520.0, "grad_norm": 4.297521062962569, "language_loss": 0.86763382, "learning_rate": 3.772330161522693e-06, "loss": 0.89022619, "num_input_tokens_seen": 31418435, "step": 1487, "time_per_iteration": 2.6398961544036865 }, { "auxiliary_loss_clip": 0.01220657, "auxiliary_loss_mlp": 0.01040905, "balance_loss_clip": 0.98891854, "balance_loss_mlp": 1.0303843, "epoch": 0.17892142127096736, "flos": 26541217676160.0, "grad_norm": 1.9256641133426353, "language_loss": 0.7995162, "learning_rate": 3.7719690752673365e-06, "loss": 0.82213175, "num_input_tokens_seen": 31439230, "step": 1488, "time_per_iteration": 2.7059648036956787 }, { "auxiliary_loss_clip": 0.01216411, "auxiliary_loss_mlp": 0.01041391, "balance_loss_clip": 0.9475283, "balance_loss_mlp": 1.03053069, "epoch": 0.17904166416160644, "flos": 23872623621120.0, "grad_norm": 4.0703422674244365, "language_loss": 0.78164411, "learning_rate": 3.7716077202056796e-06, "loss": 0.80422211, "num_input_tokens_seen": 31457705, "step": 1489, "time_per_iteration": 2.7452056407928467 }, { "auxiliary_loss_clip": 0.01204411, "auxiliary_loss_mlp": 0.01041766, "balance_loss_clip": 0.9838323, "balance_loss_mlp": 1.03107858, "epoch": 0.17916190705224552, "flos": 19134426712320.0, "grad_norm": 2.209952782336745, "language_loss": 0.93517995, "learning_rate": 3.7712460963925404e-06, "loss": 0.95764172, "num_input_tokens_seen": 31473645, "step": 1490, "time_per_iteration": 2.636601448059082 }, { "auxiliary_loss_clip": 0.01209309, "auxiliary_loss_mlp": 0.01037692, "balance_loss_clip": 0.98242211, "balance_loss_mlp": 1.0273385, "epoch": 0.17928214994288463, "flos": 25152120961920.0, "grad_norm": 1.9231585889822476, "language_loss": 0.75474709, "learning_rate": 3.7708842038827775e-06, "loss": 0.77721709, "num_input_tokens_seen": 31492605, "step": 1491, "time_per_iteration": 2.7341666221618652 }, { "auxiliary_loss_clip": 0.0121857, "auxiliary_loss_mlp": 0.01038006, "balance_loss_clip": 1.02215552, "balance_loss_mlp": 1.02740204, "epoch": 0.17940239283352372, "flos": 22384629786240.0, "grad_norm": 3.9084240995028092, "language_loss": 0.85848421, "learning_rate": 3.770522042731288e-06, "loss": 0.88104999, "num_input_tokens_seen": 31514500, "step": 1492, "time_per_iteration": 2.7682597637176514 }, { "auxiliary_loss_clip": 0.01206546, "auxiliary_loss_mlp": 0.01042536, "balance_loss_clip": 0.90833443, "balance_loss_mlp": 1.03198612, "epoch": 0.1795226357241628, "flos": 23178685795200.0, "grad_norm": 2.1444808899361485, "language_loss": 0.87641776, "learning_rate": 3.7701596129930122e-06, "loss": 0.89890862, "num_input_tokens_seen": 31533225, "step": 1493, "time_per_iteration": 2.7732303142547607 }, { "auxiliary_loss_clip": 0.0122874, "auxiliary_loss_mlp": 0.0103167, "balance_loss_clip": 0.94862258, "balance_loss_mlp": 1.02081013, "epoch": 0.1796428786148019, "flos": 22090413484800.0, "grad_norm": 1.7992891085679048, "language_loss": 0.73527771, "learning_rate": 3.7697969147229315e-06, "loss": 0.75788182, "num_input_tokens_seen": 31551385, "step": 1494, "time_per_iteration": 2.8225412368774414 }, { "auxiliary_loss_clip": 0.01218634, "auxiliary_loss_mlp": 0.01036464, "balance_loss_clip": 1.02361465, "balance_loss_mlp": 1.026021, "epoch": 0.179763121505441, "flos": 21324618501120.0, "grad_norm": 2.4310200499743164, "language_loss": 0.85421932, "learning_rate": 3.7694339479760647e-06, "loss": 0.87677026, "num_input_tokens_seen": 31570415, "step": 1495, "time_per_iteration": 2.6583383083343506 }, { "auxiliary_loss_clip": 0.01126331, "auxiliary_loss_mlp": 0.01015513, "balance_loss_clip": 0.96285832, "balance_loss_mlp": 1.0103631, "epoch": 0.17988336439608008, "flos": 68161864815360.0, "grad_norm": 0.9909461366139642, "language_loss": 0.57303309, "learning_rate": 3.769070712807476e-06, "loss": 0.59445155, "num_input_tokens_seen": 31632445, "step": 1496, "time_per_iteration": 3.344831943511963 }, { "auxiliary_loss_clip": 0.01198686, "auxiliary_loss_mlp": 0.01040554, "balance_loss_clip": 0.86771929, "balance_loss_mlp": 1.02977753, "epoch": 0.18000360728671919, "flos": 21945047143680.0, "grad_norm": 1.8473192654441326, "language_loss": 0.786807, "learning_rate": 3.768707209272266e-06, "loss": 0.80919945, "num_input_tokens_seen": 31652575, "step": 1497, "time_per_iteration": 2.803335189819336 }, { "auxiliary_loss_clip": 0.01212033, "auxiliary_loss_mlp": 0.01037814, "balance_loss_clip": 0.98308414, "balance_loss_mlp": 1.02718043, "epoch": 0.18012385017735827, "flos": 18986330937600.0, "grad_norm": 2.3670935228672816, "language_loss": 0.76448905, "learning_rate": 3.768343437425579e-06, "loss": 0.78698754, "num_input_tokens_seen": 31671145, "step": 1498, "time_per_iteration": 2.6632847785949707 }, { "auxiliary_loss_clip": 0.01201676, "auxiliary_loss_mlp": 0.0103927, "balance_loss_clip": 0.82738161, "balance_loss_mlp": 1.02826679, "epoch": 0.18024409306799735, "flos": 19748103598080.0, "grad_norm": 2.3320267741001253, "language_loss": 0.86128259, "learning_rate": 3.7679793973225987e-06, "loss": 0.88369203, "num_input_tokens_seen": 31686955, "step": 1499, "time_per_iteration": 2.7862725257873535 }, { "auxiliary_loss_clip": 0.01121973, "auxiliary_loss_mlp": 0.01007187, "balance_loss_clip": 0.88499671, "balance_loss_mlp": 1.00210881, "epoch": 0.18036433595863643, "flos": 67227183060480.0, "grad_norm": 0.8545573675477448, "language_loss": 0.61614871, "learning_rate": 3.767615089018549e-06, "loss": 0.63744032, "num_input_tokens_seen": 31749300, "step": 1500, "time_per_iteration": 3.2718756198883057 }, { "auxiliary_loss_clip": 0.01206402, "auxiliary_loss_mlp": 0.0103268, "balance_loss_clip": 0.9825958, "balance_loss_mlp": 1.02158153, "epoch": 0.18048457884927555, "flos": 18181464935040.0, "grad_norm": 2.3642428112350604, "language_loss": 0.86031842, "learning_rate": 3.7672505125686966e-06, "loss": 0.88270926, "num_input_tokens_seen": 31765665, "step": 1501, "time_per_iteration": 3.713339328765869 }, { "auxiliary_loss_clip": 0.01209987, "auxiliary_loss_mlp": 0.01040425, "balance_loss_clip": 0.90419471, "balance_loss_mlp": 1.03016675, "epoch": 0.18060482173991463, "flos": 15813767111040.0, "grad_norm": 3.2108658449268774, "language_loss": 0.84074688, "learning_rate": 3.7668856680283455e-06, "loss": 0.86325103, "num_input_tokens_seen": 31782690, "step": 1502, "time_per_iteration": 2.7153382301330566 }, { "auxiliary_loss_clip": 0.01224677, "auxiliary_loss_mlp": 0.01031511, "balance_loss_clip": 0.98648793, "balance_loss_mlp": 1.02050841, "epoch": 0.1807250646305537, "flos": 18587399512320.0, "grad_norm": 2.0442332694466323, "language_loss": 0.82488304, "learning_rate": 3.7665205554528437e-06, "loss": 0.84744489, "num_input_tokens_seen": 31802045, "step": 1503, "time_per_iteration": 3.6648521423339844 }, { "auxiliary_loss_clip": 0.01216677, "auxiliary_loss_mlp": 0.01049779, "balance_loss_clip": 0.98561156, "balance_loss_mlp": 1.03961647, "epoch": 0.18084530752119282, "flos": 23149131880320.0, "grad_norm": 1.7884769459921845, "language_loss": 0.74111378, "learning_rate": 3.7661551748975782e-06, "loss": 0.76377833, "num_input_tokens_seen": 31820220, "step": 1504, "time_per_iteration": 3.6453049182891846 }, { "auxiliary_loss_clip": 0.01128056, "auxiliary_loss_mlp": 0.01008482, "balance_loss_clip": 0.96126199, "balance_loss_mlp": 1.0034039, "epoch": 0.1809655504118319, "flos": 59803153568640.0, "grad_norm": 0.972777796428356, "language_loss": 0.60459203, "learning_rate": 3.7657895264179772e-06, "loss": 0.62595737, "num_input_tokens_seen": 31876195, "step": 1505, "time_per_iteration": 4.173713207244873 }, { "auxiliary_loss_clip": 0.01208726, "auxiliary_loss_mlp": 0.01032877, "balance_loss_clip": 0.98211402, "balance_loss_mlp": 1.02236891, "epoch": 0.181085793302471, "flos": 44201941188480.0, "grad_norm": 2.0356474811312006, "language_loss": 0.74591553, "learning_rate": 3.765423610069509e-06, "loss": 0.76833153, "num_input_tokens_seen": 31901585, "step": 1506, "time_per_iteration": 2.856381416320801 }, { "auxiliary_loss_clip": 0.01219816, "auxiliary_loss_mlp": 0.01033047, "balance_loss_clip": 0.98818445, "balance_loss_mlp": 1.02220547, "epoch": 0.18120603619311007, "flos": 34898384638080.0, "grad_norm": 3.401740728883686, "language_loss": 0.72863472, "learning_rate": 3.765057425907683e-06, "loss": 0.75116336, "num_input_tokens_seen": 31923045, "step": 1507, "time_per_iteration": 2.848522424697876 }, { "auxiliary_loss_clip": 0.01225656, "auxiliary_loss_mlp": 0.01032505, "balance_loss_clip": 1.02341437, "balance_loss_mlp": 1.02171087, "epoch": 0.18132627908374918, "flos": 21506757390720.0, "grad_norm": 2.581439471851583, "language_loss": 0.78414327, "learning_rate": 3.764690973988048e-06, "loss": 0.80672491, "num_input_tokens_seen": 31943385, "step": 1508, "time_per_iteration": 2.707705497741699 }, { "auxiliary_loss_clip": 0.0122445, "auxiliary_loss_mlp": 0.01033457, "balance_loss_clip": 0.94973886, "balance_loss_mlp": 1.02343106, "epoch": 0.18144652197438826, "flos": 29057693633280.0, "grad_norm": 2.005931757532964, "language_loss": 0.74332821, "learning_rate": 3.7643242543661967e-06, "loss": 0.76590723, "num_input_tokens_seen": 31966045, "step": 1509, "time_per_iteration": 2.799921989440918 }, { "auxiliary_loss_clip": 0.01116279, "auxiliary_loss_mlp": 0.01003929, "balance_loss_clip": 0.95831752, "balance_loss_mlp": 0.99875563, "epoch": 0.18156676486502735, "flos": 68675064382080.0, "grad_norm": 0.8115755485826464, "language_loss": 0.60506344, "learning_rate": 3.7639572670977573e-06, "loss": 0.62626553, "num_input_tokens_seen": 32021540, "step": 1510, "time_per_iteration": 3.1865227222442627 }, { "auxiliary_loss_clip": 0.01222275, "auxiliary_loss_mlp": 0.01036889, "balance_loss_clip": 0.94818664, "balance_loss_mlp": 1.02627945, "epoch": 0.18168700775566646, "flos": 26471515334400.0, "grad_norm": 1.8030127759505026, "language_loss": 0.76635647, "learning_rate": 3.7635900122384042e-06, "loss": 0.78894812, "num_input_tokens_seen": 32044535, "step": 1511, "time_per_iteration": 2.767364501953125 }, { "auxiliary_loss_clip": 0.01220793, "auxiliary_loss_mlp": 0.01035461, "balance_loss_clip": 0.98429388, "balance_loss_mlp": 1.02452397, "epoch": 0.18180725064630554, "flos": 15005668884480.0, "grad_norm": 2.5388170802986076, "language_loss": 0.87024307, "learning_rate": 3.7632224898438477e-06, "loss": 0.8928057, "num_input_tokens_seen": 32061010, "step": 1512, "time_per_iteration": 2.733278512954712 }, { "auxiliary_loss_clip": 0.01221246, "auxiliary_loss_mlp": 0.01033883, "balance_loss_clip": 0.94555867, "balance_loss_mlp": 1.02380967, "epoch": 0.18192749353694462, "flos": 19682387665920.0, "grad_norm": 1.8278595318912754, "language_loss": 0.79109573, "learning_rate": 3.762854699969842e-06, "loss": 0.81364703, "num_input_tokens_seen": 32081520, "step": 1513, "time_per_iteration": 2.7558624744415283 }, { "auxiliary_loss_clip": 0.01219186, "auxiliary_loss_mlp": 0.0103617, "balance_loss_clip": 1.02540112, "balance_loss_mlp": 1.02591181, "epoch": 0.1820477364275837, "flos": 20702717400960.0, "grad_norm": 1.8981047002262192, "language_loss": 0.73306584, "learning_rate": 3.762486642672179e-06, "loss": 0.75561941, "num_input_tokens_seen": 32098460, "step": 1514, "time_per_iteration": 2.657149314880371 }, { "auxiliary_loss_clip": 0.01224269, "auxiliary_loss_mlp": 0.0103899, "balance_loss_clip": 0.98836672, "balance_loss_mlp": 1.0287199, "epoch": 0.18216797931822282, "flos": 17128708197120.0, "grad_norm": 2.3847042247888672, "language_loss": 0.8701182, "learning_rate": 3.7621183180066946e-06, "loss": 0.89275074, "num_input_tokens_seen": 32116420, "step": 1515, "time_per_iteration": 2.7091410160064697 }, { "auxiliary_loss_clip": 0.01216166, "auxiliary_loss_mlp": 0.01038213, "balance_loss_clip": 0.98177618, "balance_loss_mlp": 1.02765703, "epoch": 0.1822882222088619, "flos": 29242561956480.0, "grad_norm": 1.518745457907666, "language_loss": 0.73875356, "learning_rate": 3.7617497260292625e-06, "loss": 0.76129735, "num_input_tokens_seen": 32138475, "step": 1516, "time_per_iteration": 2.7711570262908936 }, { "auxiliary_loss_clip": 0.01211077, "auxiliary_loss_mlp": 0.01038265, "balance_loss_clip": 0.98229444, "balance_loss_mlp": 1.02775085, "epoch": 0.18240846509950098, "flos": 17702739446400.0, "grad_norm": 2.83453674515599, "language_loss": 0.78923047, "learning_rate": 3.7613808667957967e-06, "loss": 0.81172383, "num_input_tokens_seen": 32151165, "step": 1517, "time_per_iteration": 2.589765787124634 }, { "auxiliary_loss_clip": 0.01221109, "auxiliary_loss_mlp": 0.01040252, "balance_loss_clip": 0.9857372, "balance_loss_mlp": 1.02961898, "epoch": 0.1825287079901401, "flos": 14790025584000.0, "grad_norm": 1.9717751618982362, "language_loss": 0.90918422, "learning_rate": 3.7610117403622547e-06, "loss": 0.9317978, "num_input_tokens_seen": 32167725, "step": 1518, "time_per_iteration": 2.7266182899475098 }, { "auxiliary_loss_clip": 0.01206428, "auxiliary_loss_mlp": 0.01037532, "balance_loss_clip": 0.94267261, "balance_loss_mlp": 1.02637959, "epoch": 0.18264895088077918, "flos": 21946232292480.0, "grad_norm": 1.6389829768711541, "language_loss": 0.90275908, "learning_rate": 3.7606423467846313e-06, "loss": 0.92519867, "num_input_tokens_seen": 32187330, "step": 1519, "time_per_iteration": 2.7746312618255615 }, { "auxiliary_loss_clip": 0.01224498, "auxiliary_loss_mlp": 0.01038902, "balance_loss_clip": 0.94926268, "balance_loss_mlp": 1.02848864, "epoch": 0.18276919377141826, "flos": 20886759711360.0, "grad_norm": 6.089273027398127, "language_loss": 0.79282737, "learning_rate": 3.760272686118964e-06, "loss": 0.81546134, "num_input_tokens_seen": 32205550, "step": 1520, "time_per_iteration": 2.6982057094573975 }, { "auxiliary_loss_clip": 0.01220841, "auxiliary_loss_mlp": 0.01038835, "balance_loss_clip": 0.98520827, "balance_loss_mlp": 1.0281961, "epoch": 0.18288943666205737, "flos": 21469877101440.0, "grad_norm": 2.2673084996019184, "language_loss": 0.92722619, "learning_rate": 3.7599027584213297e-06, "loss": 0.94982302, "num_input_tokens_seen": 32224430, "step": 1521, "time_per_iteration": 2.762747287750244 }, { "auxiliary_loss_clip": 0.01222286, "auxiliary_loss_mlp": 0.01045073, "balance_loss_clip": 1.02228272, "balance_loss_mlp": 1.03364682, "epoch": 0.18300967955269645, "flos": 21539363961600.0, "grad_norm": 1.9123574614788597, "language_loss": 0.77886248, "learning_rate": 3.7595325637478465e-06, "loss": 0.80153608, "num_input_tokens_seen": 32242455, "step": 1522, "time_per_iteration": 2.6424601078033447 }, { "auxiliary_loss_clip": 0.0121318, "auxiliary_loss_mlp": 0.01041312, "balance_loss_clip": 0.98850238, "balance_loss_mlp": 1.03026724, "epoch": 0.18312992244333554, "flos": 28876237102080.0, "grad_norm": 1.8696439255181956, "language_loss": 0.81485593, "learning_rate": 3.7591621021546723e-06, "loss": 0.83740085, "num_input_tokens_seen": 32264450, "step": 1523, "time_per_iteration": 2.766500234603882 }, { "auxiliary_loss_clip": 0.01216699, "auxiliary_loss_mlp": 0.01034317, "balance_loss_clip": 1.02381074, "balance_loss_mlp": 1.02295637, "epoch": 0.18325016533397462, "flos": 20120102801280.0, "grad_norm": 1.656926125087907, "language_loss": 0.81359828, "learning_rate": 3.7587913736980062e-06, "loss": 0.83610839, "num_input_tokens_seen": 32284090, "step": 1524, "time_per_iteration": 2.666642665863037 }, { "auxiliary_loss_clip": 0.01199875, "auxiliary_loss_mlp": 0.01038719, "balance_loss_clip": 0.86607128, "balance_loss_mlp": 1.02749515, "epoch": 0.18337040822461373, "flos": 23329187781120.0, "grad_norm": 1.6724694058429925, "language_loss": 0.84748423, "learning_rate": 3.7584203784340865e-06, "loss": 0.86987013, "num_input_tokens_seen": 32303260, "step": 1525, "time_per_iteration": 2.811844825744629 }, { "auxiliary_loss_clip": 0.01215289, "auxiliary_loss_mlp": 0.01036485, "balance_loss_clip": 0.98350042, "balance_loss_mlp": 1.02577972, "epoch": 0.1834906511152528, "flos": 25009555881600.0, "grad_norm": 2.4390505660027353, "language_loss": 0.85320073, "learning_rate": 3.7580491164191938e-06, "loss": 0.87571847, "num_input_tokens_seen": 32321570, "step": 1526, "time_per_iteration": 2.690946340560913 }, { "auxiliary_loss_clip": 0.01125418, "auxiliary_loss_mlp": 0.0100935, "balance_loss_clip": 0.99741006, "balance_loss_mlp": 1.00417614, "epoch": 0.1836108940058919, "flos": 67251493589760.0, "grad_norm": 0.7493874806177653, "language_loss": 0.61273354, "learning_rate": 3.757677587709648e-06, "loss": 0.63408124, "num_input_tokens_seen": 32384835, "step": 1527, "time_per_iteration": 4.168447732925415 }, { "auxiliary_loss_clip": 0.01215832, "auxiliary_loss_mlp": 0.01036317, "balance_loss_clip": 0.95065504, "balance_loss_mlp": 1.02506328, "epoch": 0.183731136896531, "flos": 25738721971200.0, "grad_norm": 2.163776052671387, "language_loss": 0.75904703, "learning_rate": 3.7573057923618095e-06, "loss": 0.78156859, "num_input_tokens_seen": 32404930, "step": 1528, "time_per_iteration": 3.6047170162200928 }, { "auxiliary_loss_clip": 0.01207537, "auxiliary_loss_mlp": 0.01036789, "balance_loss_clip": 0.9045766, "balance_loss_mlp": 1.02551222, "epoch": 0.1838513797871701, "flos": 20449403712000.0, "grad_norm": 2.435210745239451, "language_loss": 0.74043196, "learning_rate": 3.7569337304320793e-06, "loss": 0.7628752, "num_input_tokens_seen": 32424515, "step": 1529, "time_per_iteration": 2.764995813369751 }, { "auxiliary_loss_clip": 0.01124573, "auxiliary_loss_mlp": 0.010091, "balance_loss_clip": 0.95897496, "balance_loss_mlp": 1.00390291, "epoch": 0.18397162267780917, "flos": 68565141786240.0, "grad_norm": 0.8435068442142483, "language_loss": 0.64454317, "learning_rate": 3.756561401976899e-06, "loss": 0.66587991, "num_input_tokens_seen": 32484220, "step": 1530, "time_per_iteration": 4.024532079696655 }, { "auxiliary_loss_clip": 0.01228284, "auxiliary_loss_mlp": 0.01035005, "balance_loss_clip": 1.064852, "balance_loss_mlp": 1.02388871, "epoch": 0.18409186556844825, "flos": 31941104976000.0, "grad_norm": 1.9483786665911607, "language_loss": 0.82760113, "learning_rate": 3.7561888070527514e-06, "loss": 0.85023403, "num_input_tokens_seen": 32506260, "step": 1531, "time_per_iteration": 2.8423593044281006 }, { "auxiliary_loss_clip": 0.01205074, "auxiliary_loss_mlp": 0.01128094, "balance_loss_clip": 0.94813645, "balance_loss_mlp": 0.0, "epoch": 0.18421210845908736, "flos": 20120533764480.0, "grad_norm": 2.684988393343302, "language_loss": 0.79707336, "learning_rate": 3.7558159457161577e-06, "loss": 0.82040513, "num_input_tokens_seen": 32524225, "step": 1532, "time_per_iteration": 3.6455302238464355 }, { "auxiliary_loss_clip": 0.01221701, "auxiliary_loss_mlp": 0.01127727, "balance_loss_clip": 0.98772115, "balance_loss_mlp": 0.0, "epoch": 0.18433235134972645, "flos": 23110491824640.0, "grad_norm": 6.063451271854564, "language_loss": 0.78298867, "learning_rate": 3.755442818023681e-06, "loss": 0.80648297, "num_input_tokens_seen": 32543850, "step": 1533, "time_per_iteration": 2.7101340293884277 }, { "auxiliary_loss_clip": 0.01218017, "auxiliary_loss_mlp": 0.01039252, "balance_loss_clip": 0.94781244, "balance_loss_mlp": 1.028476, "epoch": 0.18445259424036553, "flos": 18291351617280.0, "grad_norm": 2.1794306027642647, "language_loss": 0.75756538, "learning_rate": 3.7550694240319246e-06, "loss": 0.78013802, "num_input_tokens_seen": 32561725, "step": 1534, "time_per_iteration": 2.741417169570923 }, { "auxiliary_loss_clip": 0.01223747, "auxiliary_loss_mlp": 0.01029279, "balance_loss_clip": 1.02219892, "balance_loss_mlp": 1.01904488, "epoch": 0.18457283713100464, "flos": 21324079797120.0, "grad_norm": 2.641789613389145, "language_loss": 0.76260567, "learning_rate": 3.7546957637975326e-06, "loss": 0.78513598, "num_input_tokens_seen": 32579135, "step": 1535, "time_per_iteration": 2.657919406890869 }, { "auxiliary_loss_clip": 0.01199879, "auxiliary_loss_mlp": 0.01044038, "balance_loss_clip": 0.86269486, "balance_loss_mlp": 1.03376842, "epoch": 0.18469308002164372, "flos": 20375679047040.0, "grad_norm": 1.5675321287082964, "language_loss": 0.74023485, "learning_rate": 3.7543218373771873e-06, "loss": 0.76267397, "num_input_tokens_seen": 32598460, "step": 1536, "time_per_iteration": 2.8819005489349365 }, { "auxiliary_loss_clip": 0.0120291, "auxiliary_loss_mlp": 0.01128041, "balance_loss_clip": 0.86661184, "balance_loss_mlp": 0.0, "epoch": 0.1848133229122828, "flos": 26435892021120.0, "grad_norm": 1.5471911324559835, "language_loss": 0.78336662, "learning_rate": 3.753947644827615e-06, "loss": 0.80667615, "num_input_tokens_seen": 32621920, "step": 1537, "time_per_iteration": 2.8810486793518066 }, { "auxiliary_loss_clip": 0.01128573, "auxiliary_loss_mlp": 0.01006444, "balance_loss_clip": 0.96050245, "balance_loss_mlp": 1.00134218, "epoch": 0.1849335658029219, "flos": 70547447612160.0, "grad_norm": 0.922196076926753, "language_loss": 0.5722599, "learning_rate": 3.753573186205579e-06, "loss": 0.59361005, "num_input_tokens_seen": 32690040, "step": 1538, "time_per_iteration": 3.393996000289917 }, { "auxiliary_loss_clip": 0.01204927, "auxiliary_loss_mlp": 0.01127956, "balance_loss_clip": 0.9809674, "balance_loss_mlp": 0.0, "epoch": 0.185053808693561, "flos": 17384140788480.0, "grad_norm": 2.5187579497116754, "language_loss": 0.78257018, "learning_rate": 3.753198461567885e-06, "loss": 0.80589902, "num_input_tokens_seen": 32707285, "step": 1539, "time_per_iteration": 2.7070086002349854 }, { "auxiliary_loss_clip": 0.01211748, "auxiliary_loss_mlp": 0.01034535, "balance_loss_clip": 0.94994211, "balance_loss_mlp": 1.0246172, "epoch": 0.18517405158420008, "flos": 28986159697920.0, "grad_norm": 1.8154132201027322, "language_loss": 0.9193961, "learning_rate": 3.7528234709713783e-06, "loss": 0.94185901, "num_input_tokens_seen": 32730030, "step": 1540, "time_per_iteration": 2.7835798263549805 }, { "auxiliary_loss_clip": 0.01224719, "auxiliary_loss_mlp": 0.01034025, "balance_loss_clip": 1.0264982, "balance_loss_mlp": 1.02373171, "epoch": 0.18529429447483917, "flos": 26794962328320.0, "grad_norm": 1.9280385886488802, "language_loss": 0.84277248, "learning_rate": 3.7524482144729447e-06, "loss": 0.86535996, "num_input_tokens_seen": 32749485, "step": 1541, "time_per_iteration": 2.7083382606506348 }, { "auxiliary_loss_clip": 0.01208339, "auxiliary_loss_mlp": 0.01037236, "balance_loss_clip": 0.94322294, "balance_loss_mlp": 1.02678776, "epoch": 0.18541453736547828, "flos": 13581595301760.0, "grad_norm": 1.9611849746159749, "language_loss": 0.83724368, "learning_rate": 3.7520726921295106e-06, "loss": 0.85969937, "num_input_tokens_seen": 32766205, "step": 1542, "time_per_iteration": 2.6506288051605225 }, { "auxiliary_loss_clip": 0.01212829, "auxiliary_loss_mlp": 0.01037325, "balance_loss_clip": 1.02031112, "balance_loss_mlp": 1.02681088, "epoch": 0.18553478025611736, "flos": 24025424077440.0, "grad_norm": 2.0210219295104497, "language_loss": 0.72385311, "learning_rate": 3.751696903998042e-06, "loss": 0.74635464, "num_input_tokens_seen": 32784840, "step": 1543, "time_per_iteration": 2.737553358078003 }, { "auxiliary_loss_clip": 0.01218829, "auxiliary_loss_mlp": 0.01037998, "balance_loss_clip": 1.02464628, "balance_loss_mlp": 1.02705407, "epoch": 0.18565502314675644, "flos": 25885165720320.0, "grad_norm": 1.7850101548185335, "language_loss": 0.70118099, "learning_rate": 3.7513208501355456e-06, "loss": 0.72374928, "num_input_tokens_seen": 32805945, "step": 1544, "time_per_iteration": 2.6618640422821045 }, { "auxiliary_loss_clip": 0.0121175, "auxiliary_loss_mlp": 0.01042404, "balance_loss_clip": 0.98329961, "balance_loss_mlp": 1.03209805, "epoch": 0.18577526603739553, "flos": 19610063631360.0, "grad_norm": 2.025734596074511, "language_loss": 0.8353548, "learning_rate": 3.750944530599069e-06, "loss": 0.85789633, "num_input_tokens_seen": 32825515, "step": 1545, "time_per_iteration": 2.744520902633667 }, { "auxiliary_loss_clip": 0.01228105, "auxiliary_loss_mlp": 0.0103396, "balance_loss_clip": 1.02637529, "balance_loss_mlp": 1.02282, "epoch": 0.18589550892803464, "flos": 18474891137280.0, "grad_norm": 2.1629476297471286, "language_loss": 0.80601454, "learning_rate": 3.7505679454456992e-06, "loss": 0.82863516, "num_input_tokens_seen": 32842125, "step": 1546, "time_per_iteration": 2.6197149753570557 }, { "auxiliary_loss_clip": 0.01202777, "auxiliary_loss_mlp": 0.01036796, "balance_loss_clip": 0.82720315, "balance_loss_mlp": 1.02596545, "epoch": 0.18601575181867372, "flos": 23549966726400.0, "grad_norm": 6.234507529824744, "language_loss": 0.70433998, "learning_rate": 3.750191094732564e-06, "loss": 0.72673571, "num_input_tokens_seen": 32862990, "step": 1547, "time_per_iteration": 2.8268232345581055 }, { "auxiliary_loss_clip": 0.0120133, "auxiliary_loss_mlp": 0.01127914, "balance_loss_clip": 0.8269062, "balance_loss_mlp": 0.0, "epoch": 0.1861359947093128, "flos": 26360192108160.0, "grad_norm": 3.182505202271995, "language_loss": 0.75189531, "learning_rate": 3.7498139785168313e-06, "loss": 0.77518773, "num_input_tokens_seen": 32883595, "step": 1548, "time_per_iteration": 2.8313329219818115 }, { "auxiliary_loss_clip": 0.01220504, "auxiliary_loss_mlp": 0.01038089, "balance_loss_clip": 1.02747416, "balance_loss_mlp": 1.02814746, "epoch": 0.1862562375999519, "flos": 23331198942720.0, "grad_norm": 1.9386641082320737, "language_loss": 0.77021253, "learning_rate": 3.749436596855709e-06, "loss": 0.7927984, "num_input_tokens_seen": 32902895, "step": 1549, "time_per_iteration": 2.747950315475464 }, { "auxiliary_loss_clip": 0.01212256, "auxiliary_loss_mlp": 0.01035521, "balance_loss_clip": 1.02182364, "balance_loss_mlp": 1.02490592, "epoch": 0.186376480490591, "flos": 16648222942080.0, "grad_norm": 2.1409167951963566, "language_loss": 0.90850306, "learning_rate": 3.749058949806446e-06, "loss": 0.9309808, "num_input_tokens_seen": 32919620, "step": 1550, "time_per_iteration": 2.6950807571411133 }, { "auxiliary_loss_clip": 0.01217749, "auxiliary_loss_mlp": 0.01036771, "balance_loss_clip": 1.0220325, "balance_loss_mlp": 1.02636981, "epoch": 0.18649672338123008, "flos": 21468656039040.0, "grad_norm": 1.6570600015458514, "language_loss": 0.84252107, "learning_rate": 3.748681037426331e-06, "loss": 0.86506635, "num_input_tokens_seen": 32938830, "step": 1551, "time_per_iteration": 2.622922420501709 }, { "auxiliary_loss_clip": 0.01222016, "auxiliary_loss_mlp": 0.01036507, "balance_loss_clip": 1.06387162, "balance_loss_mlp": 1.02603459, "epoch": 0.1866169662718692, "flos": 12312728386560.0, "grad_norm": 2.2396822885999836, "language_loss": 0.91599375, "learning_rate": 3.7483028597726936e-06, "loss": 0.93857896, "num_input_tokens_seen": 32955600, "step": 1552, "time_per_iteration": 2.608158588409424 }, { "auxiliary_loss_clip": 0.01220546, "auxiliary_loss_mlp": 0.01039102, "balance_loss_clip": 0.95073748, "balance_loss_mlp": 1.02765799, "epoch": 0.18673720916250827, "flos": 23581280407680.0, "grad_norm": 1.82187756218373, "language_loss": 0.62381637, "learning_rate": 3.7479244169029017e-06, "loss": 0.64641285, "num_input_tokens_seen": 32975390, "step": 1553, "time_per_iteration": 3.624063491821289 }, { "auxiliary_loss_clip": 0.01220924, "auxiliary_loss_mlp": 0.01027851, "balance_loss_clip": 1.02149785, "balance_loss_mlp": 1.01711655, "epoch": 0.18685745205314735, "flos": 19718370115200.0, "grad_norm": 2.4183351152026242, "language_loss": 0.73403728, "learning_rate": 3.7475457088743658e-06, "loss": 0.75652504, "num_input_tokens_seen": 32992640, "step": 1554, "time_per_iteration": 3.6638669967651367 }, { "auxiliary_loss_clip": 0.01210691, "auxiliary_loss_mlp": 0.01032091, "balance_loss_clip": 0.98687822, "balance_loss_mlp": 1.02164781, "epoch": 0.18697769494378644, "flos": 34204123589760.0, "grad_norm": 2.141393541718933, "language_loss": 0.74623388, "learning_rate": 3.7471667357445348e-06, "loss": 0.76866174, "num_input_tokens_seen": 33012470, "step": 1555, "time_per_iteration": 3.692715644836426 }, { "auxiliary_loss_clip": 0.01212488, "auxiliary_loss_mlp": 0.01029321, "balance_loss_clip": 0.86854529, "balance_loss_mlp": 1.01872301, "epoch": 0.18709793783442555, "flos": 34241327101440.0, "grad_norm": 2.5444820943914244, "language_loss": 0.72271729, "learning_rate": 3.7467874975709e-06, "loss": 0.74513531, "num_input_tokens_seen": 33033275, "step": 1556, "time_per_iteration": 2.88051176071167 }, { "auxiliary_loss_clip": 0.012275, "auxiliary_loss_mlp": 0.01037487, "balance_loss_clip": 1.0286243, "balance_loss_mlp": 1.02688301, "epoch": 0.18721818072506463, "flos": 40734550529280.0, "grad_norm": 2.1597985574944127, "language_loss": 0.78601623, "learning_rate": 3.7464079944109904e-06, "loss": 0.80866611, "num_input_tokens_seen": 33055135, "step": 1557, "time_per_iteration": 2.8607914447784424 }, { "auxiliary_loss_clip": 0.01214782, "auxiliary_loss_mlp": 0.01026688, "balance_loss_clip": 0.94346344, "balance_loss_mlp": 1.0172168, "epoch": 0.18733842361570371, "flos": 22157386392960.0, "grad_norm": 2.4997896100288504, "language_loss": 0.77412415, "learning_rate": 3.746028226322376e-06, "loss": 0.79653883, "num_input_tokens_seen": 33071015, "step": 1558, "time_per_iteration": 3.7889819145202637 }, { "auxiliary_loss_clip": 0.01215159, "auxiliary_loss_mlp": 0.01031215, "balance_loss_clip": 0.98487568, "balance_loss_mlp": 1.02103484, "epoch": 0.18745866650634282, "flos": 18914940656640.0, "grad_norm": 1.7647392842013072, "language_loss": 0.75782752, "learning_rate": 3.745648193362669e-06, "loss": 0.7802912, "num_input_tokens_seen": 33090370, "step": 1559, "time_per_iteration": 2.7788212299346924 }, { "auxiliary_loss_clip": 0.01218684, "auxiliary_loss_mlp": 0.01041245, "balance_loss_clip": 0.98603475, "balance_loss_mlp": 1.03115964, "epoch": 0.1875789093969819, "flos": 19314626267520.0, "grad_norm": 2.042591603370094, "language_loss": 0.71839797, "learning_rate": 3.745267895589518e-06, "loss": 0.74099731, "num_input_tokens_seen": 33108910, "step": 1560, "time_per_iteration": 2.662247896194458 }, { "auxiliary_loss_clip": 0.01218759, "auxiliary_loss_mlp": 0.01030562, "balance_loss_clip": 0.98664665, "balance_loss_mlp": 1.02084661, "epoch": 0.187699152287621, "flos": 17018965169280.0, "grad_norm": 2.3043381712355733, "language_loss": 0.82368886, "learning_rate": 3.7448873330606154e-06, "loss": 0.84618211, "num_input_tokens_seen": 33126680, "step": 1561, "time_per_iteration": 2.656966209411621 }, { "auxiliary_loss_clip": 0.01207399, "auxiliary_loss_mlp": 0.01042295, "balance_loss_clip": 0.94732285, "balance_loss_mlp": 1.03030825, "epoch": 0.18781939517826007, "flos": 22346384780160.0, "grad_norm": 2.2821468253103663, "language_loss": 0.87081897, "learning_rate": 3.7445065058336914e-06, "loss": 0.89331597, "num_input_tokens_seen": 33145550, "step": 1562, "time_per_iteration": 2.7156050205230713 }, { "auxiliary_loss_clip": 0.01196221, "auxiliary_loss_mlp": 0.01037346, "balance_loss_clip": 0.90134996, "balance_loss_mlp": 1.02640259, "epoch": 0.18793963806889918, "flos": 14611478054400.0, "grad_norm": 1.7184017421522266, "language_loss": 0.86494112, "learning_rate": 3.7441254139665176e-06, "loss": 0.88727677, "num_input_tokens_seen": 33161735, "step": 1563, "time_per_iteration": 2.8349523544311523 }, { "auxiliary_loss_clip": 0.01225349, "auxiliary_loss_mlp": 0.01033421, "balance_loss_clip": 1.06711149, "balance_loss_mlp": 1.02315676, "epoch": 0.18805988095953827, "flos": 17457075354240.0, "grad_norm": 1.7744974478514453, "language_loss": 0.82631934, "learning_rate": 3.743744057516905e-06, "loss": 0.84890711, "num_input_tokens_seen": 33179795, "step": 1564, "time_per_iteration": 2.6978836059570312 }, { "auxiliary_loss_clip": 0.01213352, "auxiliary_loss_mlp": 0.0103778, "balance_loss_clip": 0.90873563, "balance_loss_mlp": 1.02669978, "epoch": 0.18818012385017735, "flos": 15043877976960.0, "grad_norm": 2.9902941923067017, "language_loss": 0.87556595, "learning_rate": 3.743362436542706e-06, "loss": 0.89807725, "num_input_tokens_seen": 33194485, "step": 1565, "time_per_iteration": 2.824460983276367 }, { "auxiliary_loss_clip": 0.01221635, "auxiliary_loss_mlp": 0.01030774, "balance_loss_clip": 1.06203961, "balance_loss_mlp": 1.02033091, "epoch": 0.18830036674081646, "flos": 47551975136640.0, "grad_norm": 2.0220304787263768, "language_loss": 0.76763725, "learning_rate": 3.7429805511018115e-06, "loss": 0.79016137, "num_input_tokens_seen": 33216145, "step": 1566, "time_per_iteration": 2.8290510177612305 }, { "auxiliary_loss_clip": 0.01214131, "auxiliary_loss_mlp": 0.01128465, "balance_loss_clip": 0.94805223, "balance_loss_mlp": 0.0, "epoch": 0.18842060963145554, "flos": 30044626698240.0, "grad_norm": 1.6617412079026828, "language_loss": 0.78082025, "learning_rate": 3.7425984012521524e-06, "loss": 0.80424619, "num_input_tokens_seen": 33236345, "step": 1567, "time_per_iteration": 2.7791688442230225 }, { "auxiliary_loss_clip": 0.01118826, "auxiliary_loss_mlp": 0.01122396, "balance_loss_clip": 0.91908801, "balance_loss_mlp": 0.0, "epoch": 0.18854085252209463, "flos": 70318372625280.0, "grad_norm": 0.7576224564787651, "language_loss": 0.60440403, "learning_rate": 3.7422159870517025e-06, "loss": 0.62681627, "num_input_tokens_seen": 33301600, "step": 1568, "time_per_iteration": 3.2971949577331543 }, { "auxiliary_loss_clip": 0.01213734, "auxiliary_loss_mlp": 0.01034407, "balance_loss_clip": 0.98425055, "balance_loss_mlp": 1.02424431, "epoch": 0.1886610954127337, "flos": 21289318410240.0, "grad_norm": 1.5322107024289464, "language_loss": 0.78712511, "learning_rate": 3.7418333085584717e-06, "loss": 0.80960655, "num_input_tokens_seen": 33322785, "step": 1569, "time_per_iteration": 2.749469041824341 }, { "auxiliary_loss_clip": 0.01217668, "auxiliary_loss_mlp": 0.01038685, "balance_loss_clip": 0.94852853, "balance_loss_mlp": 1.02882028, "epoch": 0.18878133830337282, "flos": 17266819991040.0, "grad_norm": 3.124407925658383, "language_loss": 0.90783584, "learning_rate": 3.7414503658305128e-06, "loss": 0.93039936, "num_input_tokens_seen": 33340020, "step": 1570, "time_per_iteration": 2.774329662322998 }, { "auxiliary_loss_clip": 0.01219686, "auxiliary_loss_mlp": 0.01031329, "balance_loss_clip": 0.90548348, "balance_loss_mlp": 1.02084446, "epoch": 0.1889015811940119, "flos": 25775207210880.0, "grad_norm": 2.5254341465295687, "language_loss": 0.77939105, "learning_rate": 3.7410671589259185e-06, "loss": 0.80190122, "num_input_tokens_seen": 33358620, "step": 1571, "time_per_iteration": 2.8492259979248047 }, { "auxiliary_loss_clip": 0.01222898, "auxiliary_loss_mlp": 0.01043745, "balance_loss_clip": 1.0644803, "balance_loss_mlp": 1.03309333, "epoch": 0.18902182408465099, "flos": 21032197879680.0, "grad_norm": 2.9678536912915305, "language_loss": 0.796404, "learning_rate": 3.7406836879028205e-06, "loss": 0.81907046, "num_input_tokens_seen": 33378845, "step": 1572, "time_per_iteration": 2.6566336154937744 }, { "auxiliary_loss_clip": 0.01219114, "auxiliary_loss_mlp": 0.01039705, "balance_loss_clip": 1.0266279, "balance_loss_mlp": 1.02929842, "epoch": 0.1891420669752901, "flos": 22272121411200.0, "grad_norm": 2.142538751314335, "language_loss": 0.77039331, "learning_rate": 3.7402999528193907e-06, "loss": 0.79298151, "num_input_tokens_seen": 33398345, "step": 1573, "time_per_iteration": 2.7761759757995605 }, { "auxiliary_loss_clip": 0.01201482, "auxiliary_loss_mlp": 0.01128023, "balance_loss_clip": 0.94589424, "balance_loss_mlp": 0.0, "epoch": 0.18926230986592918, "flos": 22017802141440.0, "grad_norm": 3.8473184576503874, "language_loss": 0.85560882, "learning_rate": 3.739915953733842e-06, "loss": 0.87890387, "num_input_tokens_seen": 33416390, "step": 1574, "time_per_iteration": 2.778399705886841 }, { "auxiliary_loss_clip": 0.01221891, "auxiliary_loss_mlp": 0.01034386, "balance_loss_clip": 1.06220233, "balance_loss_mlp": 1.02377677, "epoch": 0.18938255275656826, "flos": 24462672336000.0, "grad_norm": 2.5789876422672173, "language_loss": 0.82006335, "learning_rate": 3.7395316907044264e-06, "loss": 0.84262609, "num_input_tokens_seen": 33437175, "step": 1575, "time_per_iteration": 2.6679880619049072 }, { "auxiliary_loss_clip": 0.01219494, "auxiliary_loss_mlp": 0.01034099, "balance_loss_clip": 1.02414238, "balance_loss_mlp": 1.02392483, "epoch": 0.18950279564720737, "flos": 24427049022720.0, "grad_norm": 1.5244337459984934, "language_loss": 0.79772162, "learning_rate": 3.7391471637894364e-06, "loss": 0.8202576, "num_input_tokens_seen": 33459440, "step": 1576, "time_per_iteration": 2.7203779220581055 }, { "auxiliary_loss_clip": 0.0121625, "auxiliary_loss_mlp": 0.01038764, "balance_loss_clip": 0.94502991, "balance_loss_mlp": 1.02851832, "epoch": 0.18962303853784646, "flos": 19756291898880.0, "grad_norm": 2.2436480895847395, "language_loss": 0.84869564, "learning_rate": 3.738762373047205e-06, "loss": 0.8712458, "num_input_tokens_seen": 33479360, "step": 1577, "time_per_iteration": 2.73563551902771 }, { "auxiliary_loss_clip": 0.01218996, "auxiliary_loss_mlp": 0.01028446, "balance_loss_clip": 0.94770455, "balance_loss_mlp": 1.01851022, "epoch": 0.18974328142848554, "flos": 21032054225280.0, "grad_norm": 1.6531666213439131, "language_loss": 0.83411413, "learning_rate": 3.738377318536103e-06, "loss": 0.8565886, "num_input_tokens_seen": 33499245, "step": 1578, "time_per_iteration": 2.752204656600952 }, { "auxiliary_loss_clip": 0.01219724, "auxiliary_loss_mlp": 0.01037418, "balance_loss_clip": 1.06512094, "balance_loss_mlp": 1.02775049, "epoch": 0.18986352431912462, "flos": 12966122736000.0, "grad_norm": 2.4146126949776843, "language_loss": 0.713732, "learning_rate": 3.7379920003145447e-06, "loss": 0.73630339, "num_input_tokens_seen": 33513520, "step": 1579, "time_per_iteration": 3.5319066047668457 }, { "auxiliary_loss_clip": 0.01212623, "auxiliary_loss_mlp": 0.01038996, "balance_loss_clip": 0.98743439, "balance_loss_mlp": 1.02776051, "epoch": 0.18998376720976373, "flos": 23767908497280.0, "grad_norm": 2.0287572124960245, "language_loss": 0.83587241, "learning_rate": 3.7376064184409817e-06, "loss": 0.85838854, "num_input_tokens_seen": 33533100, "step": 1580, "time_per_iteration": 3.602477550506592 }, { "auxiliary_loss_clip": 0.01216062, "auxiliary_loss_mlp": 0.01036782, "balance_loss_clip": 0.98779124, "balance_loss_mlp": 1.02594018, "epoch": 0.19010401010040281, "flos": 22966023323520.0, "grad_norm": 2.1389975533140793, "language_loss": 0.87069046, "learning_rate": 3.7372205729739063e-06, "loss": 0.89321887, "num_input_tokens_seen": 33554915, "step": 1581, "time_per_iteration": 3.759874105453491 }, { "auxiliary_loss_clip": 0.01222847, "auxiliary_loss_mlp": 0.01036711, "balance_loss_clip": 1.02471817, "balance_loss_mlp": 1.02570772, "epoch": 0.1902242529910419, "flos": 19135647774720.0, "grad_norm": 2.2645807687792767, "language_loss": 0.71959507, "learning_rate": 3.7368344639718514e-06, "loss": 0.74219066, "num_input_tokens_seen": 33572850, "step": 1582, "time_per_iteration": 2.6539809703826904 }, { "auxiliary_loss_clip": 0.01219323, "auxiliary_loss_mlp": 0.01033451, "balance_loss_clip": 1.02327919, "balance_loss_mlp": 1.02344942, "epoch": 0.190344495881681, "flos": 25483935824640.0, "grad_norm": 1.54481417034873, "language_loss": 0.80520922, "learning_rate": 3.7364480914933895e-06, "loss": 0.82773703, "num_input_tokens_seen": 33593090, "step": 1583, "time_per_iteration": 2.7198872566223145 }, { "auxiliary_loss_clip": 0.01211671, "auxiliary_loss_mlp": 0.01127898, "balance_loss_clip": 0.90974414, "balance_loss_mlp": 0.0, "epoch": 0.1904647387723201, "flos": 26792843425920.0, "grad_norm": 1.889082049877589, "language_loss": 0.80910873, "learning_rate": 3.7360614555971325e-06, "loss": 0.83250439, "num_input_tokens_seen": 33612745, "step": 1584, "time_per_iteration": 3.6977150440216064 }, { "auxiliary_loss_clip": 0.01218203, "auxiliary_loss_mlp": 0.01127607, "balance_loss_clip": 1.02392447, "balance_loss_mlp": 0.0, "epoch": 0.19058498166295917, "flos": 23987753688960.0, "grad_norm": 2.1760486949284017, "language_loss": 0.84702498, "learning_rate": 3.735674556341733e-06, "loss": 0.8704831, "num_input_tokens_seen": 33632360, "step": 1585, "time_per_iteration": 2.673722267150879 }, { "auxiliary_loss_clip": 0.01214988, "auxiliary_loss_mlp": 0.01034541, "balance_loss_clip": 0.98750347, "balance_loss_mlp": 1.02448535, "epoch": 0.19070522455359826, "flos": 28293299280000.0, "grad_norm": 1.927295314030911, "language_loss": 0.82589686, "learning_rate": 3.7352873937858835e-06, "loss": 0.84839213, "num_input_tokens_seen": 33653895, "step": 1586, "time_per_iteration": 2.794018030166626 }, { "auxiliary_loss_clip": 0.01208323, "auxiliary_loss_mlp": 0.01127666, "balance_loss_clip": 0.94430906, "balance_loss_mlp": 0.0, "epoch": 0.19082546744423737, "flos": 25660220797440.0, "grad_norm": 1.8783223797083337, "language_loss": 0.72202337, "learning_rate": 3.734899967988316e-06, "loss": 0.74538326, "num_input_tokens_seen": 33672075, "step": 1587, "time_per_iteration": 2.784284830093384 }, { "auxiliary_loss_clip": 0.01204253, "auxiliary_loss_mlp": 0.01032975, "balance_loss_clip": 0.94344079, "balance_loss_mlp": 1.02289593, "epoch": 0.19094571033487645, "flos": 19719483436800.0, "grad_norm": 1.6983273067380809, "language_loss": 0.83785236, "learning_rate": 3.7345122790078026e-06, "loss": 0.86022472, "num_input_tokens_seen": 33689640, "step": 1588, "time_per_iteration": 2.7300777435302734 }, { "auxiliary_loss_clip": 0.01216819, "auxiliary_loss_mlp": 0.01034929, "balance_loss_clip": 1.02358222, "balance_loss_mlp": 1.02424824, "epoch": 0.19106595322551553, "flos": 21616320850560.0, "grad_norm": 2.8350386461869705, "language_loss": 0.92944175, "learning_rate": 3.7341243269031556e-06, "loss": 0.95195919, "num_input_tokens_seen": 33708630, "step": 1589, "time_per_iteration": 2.738039016723633 }, { "auxiliary_loss_clip": 0.01208721, "auxiliary_loss_mlp": 0.01035221, "balance_loss_clip": 0.98557377, "balance_loss_mlp": 1.0254395, "epoch": 0.19118619611615464, "flos": 29896890059520.0, "grad_norm": 1.6353581764857104, "language_loss": 0.7741363, "learning_rate": 3.7337361117332275e-06, "loss": 0.79657578, "num_input_tokens_seen": 33730370, "step": 1590, "time_per_iteration": 2.8342690467834473 }, { "auxiliary_loss_clip": 0.01214963, "auxiliary_loss_mlp": 0.01031, "balance_loss_clip": 0.94436777, "balance_loss_mlp": 1.02080727, "epoch": 0.19130643900679373, "flos": 17273428093440.0, "grad_norm": 3.0974490953350196, "language_loss": 0.77104449, "learning_rate": 3.7333476335569087e-06, "loss": 0.79350412, "num_input_tokens_seen": 33748370, "step": 1591, "time_per_iteration": 2.7460079193115234 }, { "auxiliary_loss_clip": 0.01214333, "auxiliary_loss_mlp": 0.01028466, "balance_loss_clip": 0.9849562, "balance_loss_mlp": 1.01823759, "epoch": 0.1914266818974328, "flos": 24826339584000.0, "grad_norm": 2.7979840775124964, "language_loss": 0.67291516, "learning_rate": 3.7329588924331325e-06, "loss": 0.69534314, "num_input_tokens_seen": 33769575, "step": 1592, "time_per_iteration": 2.779834032058716 }, { "auxiliary_loss_clip": 0.01205764, "auxiliary_loss_mlp": 0.01040584, "balance_loss_clip": 0.94331551, "balance_loss_mlp": 1.03061771, "epoch": 0.1915469247880719, "flos": 18952467390720.0, "grad_norm": 1.7246207827664024, "language_loss": 0.82393903, "learning_rate": 3.732569888420871e-06, "loss": 0.84640247, "num_input_tokens_seen": 33789110, "step": 1593, "time_per_iteration": 2.7457573413848877 }, { "auxiliary_loss_clip": 0.01220637, "auxiliary_loss_mlp": 0.01040353, "balance_loss_clip": 1.06049097, "balance_loss_mlp": 1.03038144, "epoch": 0.191667167678711, "flos": 21032952065280.0, "grad_norm": 2.3841829045591387, "language_loss": 0.82554406, "learning_rate": 3.732180621579134e-06, "loss": 0.84815395, "num_input_tokens_seen": 33808325, "step": 1594, "time_per_iteration": 2.6956474781036377 }, { "auxiliary_loss_clip": 0.01226342, "auxiliary_loss_mlp": 0.01040351, "balance_loss_clip": 0.94926965, "balance_loss_mlp": 1.02974176, "epoch": 0.1917874105693501, "flos": 34237663914240.0, "grad_norm": 1.9261394783916204, "language_loss": 0.8140378, "learning_rate": 3.7317910919669745e-06, "loss": 0.83670485, "num_input_tokens_seen": 33829520, "step": 1595, "time_per_iteration": 2.876417398452759 }, { "auxiliary_loss_clip": 0.0122067, "auxiliary_loss_mlp": 0.01033781, "balance_loss_clip": 1.02538466, "balance_loss_mlp": 1.02274776, "epoch": 0.19190765345998917, "flos": 23550613171200.0, "grad_norm": 2.4294604105987996, "language_loss": 0.76358151, "learning_rate": 3.7314012996434826e-06, "loss": 0.78612602, "num_input_tokens_seen": 33848250, "step": 1596, "time_per_iteration": 2.6571226119995117 }, { "auxiliary_loss_clip": 0.01219485, "auxiliary_loss_mlp": 0.0103618, "balance_loss_clip": 0.98772341, "balance_loss_mlp": 1.0257436, "epoch": 0.19202789635062828, "flos": 19861330245120.0, "grad_norm": 3.1790011323167326, "language_loss": 0.80884731, "learning_rate": 3.7310112446677907e-06, "loss": 0.83140397, "num_input_tokens_seen": 33866160, "step": 1597, "time_per_iteration": 2.6682708263397217 }, { "auxiliary_loss_clip": 0.01222229, "auxiliary_loss_mlp": 0.01036669, "balance_loss_clip": 1.06389761, "balance_loss_mlp": 1.02573144, "epoch": 0.19214813924126736, "flos": 20922957642240.0, "grad_norm": 2.152014543988575, "language_loss": 0.69401163, "learning_rate": 3.7306209270990695e-06, "loss": 0.7166006, "num_input_tokens_seen": 33884165, "step": 1598, "time_per_iteration": 2.6273739337921143 }, { "auxiliary_loss_clip": 0.01218407, "auxiliary_loss_mlp": 0.01037762, "balance_loss_clip": 0.98667765, "balance_loss_mlp": 1.02771831, "epoch": 0.19226838213190645, "flos": 26359725231360.0, "grad_norm": 1.9165059547380705, "language_loss": 0.86712599, "learning_rate": 3.7302303469965292e-06, "loss": 0.88968766, "num_input_tokens_seen": 33903705, "step": 1599, "time_per_iteration": 2.710939407348633 }, { "auxiliary_loss_clip": 0.01216884, "auxiliary_loss_mlp": 0.01043273, "balance_loss_clip": 1.024647, "balance_loss_mlp": 1.03247857, "epoch": 0.19238862502254553, "flos": 20850525866880.0, "grad_norm": 2.0886475283760486, "language_loss": 0.70736492, "learning_rate": 3.7298395044194206e-06, "loss": 0.72996652, "num_input_tokens_seen": 33922515, "step": 1600, "time_per_iteration": 2.6785035133361816 }, { "auxiliary_loss_clip": 0.01223022, "auxiliary_loss_mlp": 0.01032844, "balance_loss_clip": 1.06514359, "balance_loss_mlp": 1.02209115, "epoch": 0.19250886791318464, "flos": 21726063878400.0, "grad_norm": 2.7201077938740874, "language_loss": 0.94225693, "learning_rate": 3.7294483994270356e-06, "loss": 0.96481556, "num_input_tokens_seen": 33940840, "step": 1601, "time_per_iteration": 2.6700634956359863 }, { "auxiliary_loss_clip": 0.01200206, "auxiliary_loss_mlp": 0.01036927, "balance_loss_clip": 0.90493542, "balance_loss_mlp": 1.02676427, "epoch": 0.19262911080382372, "flos": 23367827836800.0, "grad_norm": 2.3559992544666715, "language_loss": 0.7768079, "learning_rate": 3.7290570320787033e-06, "loss": 0.7991792, "num_input_tokens_seen": 33960420, "step": 1602, "time_per_iteration": 2.810736894607544 }, { "auxiliary_loss_clip": 0.01219269, "auxiliary_loss_mlp": 0.01033429, "balance_loss_clip": 1.0250597, "balance_loss_mlp": 1.02265847, "epoch": 0.1927493536944628, "flos": 21943502858880.0, "grad_norm": 2.1009893451506994, "language_loss": 0.71282613, "learning_rate": 3.728665402433793e-06, "loss": 0.73535311, "num_input_tokens_seen": 33978990, "step": 1603, "time_per_iteration": 2.7478225231170654 }, { "auxiliary_loss_clip": 0.01213219, "auxiliary_loss_mlp": 0.0103003, "balance_loss_clip": 0.98444307, "balance_loss_mlp": 1.02005839, "epoch": 0.19286959658510192, "flos": 16545590807040.0, "grad_norm": 2.305382979878725, "language_loss": 0.86218929, "learning_rate": 3.7282735105517164e-06, "loss": 0.88462174, "num_input_tokens_seen": 33997115, "step": 1604, "time_per_iteration": 2.6777350902557373 }, { "auxiliary_loss_clip": 0.01220403, "auxiliary_loss_mlp": 0.01032667, "balance_loss_clip": 0.90664172, "balance_loss_mlp": 1.02128839, "epoch": 0.192989839475741, "flos": 21616967295360.0, "grad_norm": 1.860230027927688, "language_loss": 0.67009455, "learning_rate": 3.727881356491922e-06, "loss": 0.69262528, "num_input_tokens_seen": 34015525, "step": 1605, "time_per_iteration": 3.5879034996032715 }, { "auxiliary_loss_clip": 0.01220538, "auxiliary_loss_mlp": 0.01039269, "balance_loss_clip": 1.06533253, "balance_loss_mlp": 1.02905321, "epoch": 0.19311008236638008, "flos": 19281516906240.0, "grad_norm": 2.0752527369371, "language_loss": 0.75970608, "learning_rate": 3.7274889403139002e-06, "loss": 0.78230417, "num_input_tokens_seen": 34033150, "step": 1606, "time_per_iteration": 3.6786701679229736 }, { "auxiliary_loss_clip": 0.01206265, "auxiliary_loss_mlp": 0.01033953, "balance_loss_clip": 0.90685749, "balance_loss_mlp": 1.02445769, "epoch": 0.1932303252570192, "flos": 28652369587200.0, "grad_norm": 2.1832674732785247, "language_loss": 0.78656918, "learning_rate": 3.727096262077179e-06, "loss": 0.80897135, "num_input_tokens_seen": 34052145, "step": 1607, "time_per_iteration": 3.731642961502075 }, { "auxiliary_loss_clip": 0.01216253, "auxiliary_loss_mlp": 0.0102956, "balance_loss_clip": 1.02263856, "balance_loss_mlp": 1.0197376, "epoch": 0.19335056814765827, "flos": 18368990864640.0, "grad_norm": 1.8241171638739615, "language_loss": 0.85300893, "learning_rate": 3.7267033218413285e-06, "loss": 0.87546706, "num_input_tokens_seen": 34069940, "step": 1608, "time_per_iteration": 2.6484055519104004 }, { "auxiliary_loss_clip": 0.01207859, "auxiliary_loss_mlp": 0.01047231, "balance_loss_clip": 0.86468494, "balance_loss_mlp": 1.03659141, "epoch": 0.19347081103829736, "flos": 13260877741440.0, "grad_norm": 2.1618356578835622, "language_loss": 0.8122046, "learning_rate": 3.726310119665957e-06, "loss": 0.83475554, "num_input_tokens_seen": 34086275, "step": 1609, "time_per_iteration": 3.7205262184143066 }, { "auxiliary_loss_clip": 0.01219942, "auxiliary_loss_mlp": 0.01037792, "balance_loss_clip": 1.02601445, "balance_loss_mlp": 1.02753353, "epoch": 0.19359105392893644, "flos": 20300122788480.0, "grad_norm": 1.7920971432023933, "language_loss": 0.85436749, "learning_rate": 3.725916655610713e-06, "loss": 0.8769449, "num_input_tokens_seen": 34105605, "step": 1610, "time_per_iteration": 2.7121846675872803 }, { "auxiliary_loss_clip": 0.01205169, "auxiliary_loss_mlp": 0.01037976, "balance_loss_clip": 0.98132461, "balance_loss_mlp": 1.02727652, "epoch": 0.19371129681957555, "flos": 20484596062080.0, "grad_norm": 5.398857913740429, "language_loss": 0.75745517, "learning_rate": 3.725522929735284e-06, "loss": 0.7798866, "num_input_tokens_seen": 34122540, "step": 1611, "time_per_iteration": 2.6762776374816895 }, { "auxiliary_loss_clip": 0.01218987, "auxiliary_loss_mlp": 0.01038238, "balance_loss_clip": 0.98176062, "balance_loss_mlp": 1.02823627, "epoch": 0.19383153971021463, "flos": 30445497457920.0, "grad_norm": 2.1382072857575682, "language_loss": 0.73757529, "learning_rate": 3.725128942099399e-06, "loss": 0.76014757, "num_input_tokens_seen": 34142940, "step": 1612, "time_per_iteration": 2.75068736076355 }, { "auxiliary_loss_clip": 0.01205302, "auxiliary_loss_mlp": 0.01034327, "balance_loss_clip": 0.981408, "balance_loss_mlp": 1.02345514, "epoch": 0.19395178260085372, "flos": 24569937325440.0, "grad_norm": 1.890971823500587, "language_loss": 0.79951864, "learning_rate": 3.7247346927628245e-06, "loss": 0.82191497, "num_input_tokens_seen": 34162875, "step": 1613, "time_per_iteration": 2.7090158462524414 }, { "auxiliary_loss_clip": 0.01218585, "auxiliary_loss_mlp": 0.01127902, "balance_loss_clip": 0.98732406, "balance_loss_mlp": 0.0, "epoch": 0.19407202549149283, "flos": 28950608211840.0, "grad_norm": 1.7737442739080256, "language_loss": 0.79385036, "learning_rate": 3.7243401817853694e-06, "loss": 0.81731522, "num_input_tokens_seen": 34183565, "step": 1614, "time_per_iteration": 2.783646821975708 }, { "auxiliary_loss_clip": 0.012083, "auxiliary_loss_mlp": 0.01034335, "balance_loss_clip": 1.02297735, "balance_loss_mlp": 1.02413654, "epoch": 0.1941922683821319, "flos": 18004497603840.0, "grad_norm": 2.1702068909519636, "language_loss": 0.71844858, "learning_rate": 3.723945409226879e-06, "loss": 0.74087489, "num_input_tokens_seen": 34202055, "step": 1615, "time_per_iteration": 2.655085325241089 }, { "auxiliary_loss_clip": 0.01214449, "auxiliary_loss_mlp": 0.01033976, "balance_loss_clip": 1.02107513, "balance_loss_mlp": 1.02382505, "epoch": 0.194312511272771, "flos": 9720337034880.0, "grad_norm": 2.329209161753681, "language_loss": 0.79653001, "learning_rate": 3.723550375147241e-06, "loss": 0.81901425, "num_input_tokens_seen": 34216830, "step": 1616, "time_per_iteration": 2.6776275634765625 }, { "auxiliary_loss_clip": 0.01198061, "auxiliary_loss_mlp": 0.01035125, "balance_loss_clip": 0.94158709, "balance_loss_mlp": 1.02417541, "epoch": 0.19443275416341008, "flos": 27016208150400.0, "grad_norm": 1.6617418448933052, "language_loss": 0.79945695, "learning_rate": 3.7231550796063816e-06, "loss": 0.82178885, "num_input_tokens_seen": 34236840, "step": 1617, "time_per_iteration": 2.800529718399048 }, { "auxiliary_loss_clip": 0.01221279, "auxiliary_loss_mlp": 0.0103978, "balance_loss_clip": 0.98680902, "balance_loss_mlp": 1.02883101, "epoch": 0.1945529970540492, "flos": 15846625077120.0, "grad_norm": 2.4624537014460643, "language_loss": 0.6498, "learning_rate": 3.722759522664266e-06, "loss": 0.67241061, "num_input_tokens_seen": 34254140, "step": 1618, "time_per_iteration": 2.648414134979248 }, { "auxiliary_loss_clip": 0.01210374, "auxiliary_loss_mlp": 0.01036892, "balance_loss_clip": 0.90437907, "balance_loss_mlp": 1.02679515, "epoch": 0.19467323994468827, "flos": 19314985403520.0, "grad_norm": 1.9424359339025765, "language_loss": 0.81954527, "learning_rate": 3.7223637043809016e-06, "loss": 0.84201789, "num_input_tokens_seen": 34273120, "step": 1619, "time_per_iteration": 2.789442300796509 }, { "auxiliary_loss_clip": 0.01221007, "auxiliary_loss_mlp": 0.01035895, "balance_loss_clip": 0.95061028, "balance_loss_mlp": 1.02606583, "epoch": 0.19479348283532735, "flos": 24133227770880.0, "grad_norm": 2.0323252086964545, "language_loss": 0.86424118, "learning_rate": 3.7219676248163322e-06, "loss": 0.88681018, "num_input_tokens_seen": 34290285, "step": 1620, "time_per_iteration": 2.7655465602874756 }, { "auxiliary_loss_clip": 0.01221582, "auxiliary_loss_mlp": 0.01042986, "balance_loss_clip": 1.02458549, "balance_loss_mlp": 1.03272223, "epoch": 0.19491372572596646, "flos": 25775638174080.0, "grad_norm": 2.039672552694647, "language_loss": 0.93300432, "learning_rate": 3.721571284030643e-06, "loss": 0.95564997, "num_input_tokens_seen": 34310095, "step": 1621, "time_per_iteration": 2.710444688796997 }, { "auxiliary_loss_clip": 0.01220343, "auxiliary_loss_mlp": 0.01034924, "balance_loss_clip": 1.02469039, "balance_loss_mlp": 1.02494597, "epoch": 0.19503396861660555, "flos": 19645220067840.0, "grad_norm": 2.212183269761317, "language_loss": 0.79305112, "learning_rate": 3.7211746820839587e-06, "loss": 0.81560373, "num_input_tokens_seen": 34327190, "step": 1622, "time_per_iteration": 2.699882745742798 }, { "auxiliary_loss_clip": 0.01186107, "auxiliary_loss_mlp": 0.01036995, "balance_loss_clip": 0.82384115, "balance_loss_mlp": 1.02654004, "epoch": 0.19515421150724463, "flos": 21033023892480.0, "grad_norm": 1.8382504700151436, "language_loss": 0.80821085, "learning_rate": 3.7207778190364437e-06, "loss": 0.83044183, "num_input_tokens_seen": 34345615, "step": 1623, "time_per_iteration": 2.832578420639038 }, { "auxiliary_loss_clip": 0.01188628, "auxiliary_loss_mlp": 0.01033523, "balance_loss_clip": 0.86363, "balance_loss_mlp": 1.02273488, "epoch": 0.1952744543978837, "flos": 32961255143040.0, "grad_norm": 1.6611097471659373, "language_loss": 0.73879969, "learning_rate": 3.720380694948302e-06, "loss": 0.76102126, "num_input_tokens_seen": 34368500, "step": 1624, "time_per_iteration": 2.8873798847198486 }, { "auxiliary_loss_clip": 0.01144969, "auxiliary_loss_mlp": 0.01010176, "balance_loss_clip": 0.89586425, "balance_loss_mlp": 1.00524068, "epoch": 0.19539469728852282, "flos": 64044312030720.0, "grad_norm": 1.0644198594144398, "language_loss": 0.7126888, "learning_rate": 3.719983309879777e-06, "loss": 0.73424017, "num_input_tokens_seen": 34428280, "step": 1625, "time_per_iteration": 3.409972667694092 }, { "auxiliary_loss_clip": 0.01200612, "auxiliary_loss_mlp": 0.01038467, "balance_loss_clip": 0.94371009, "balance_loss_mlp": 1.02842975, "epoch": 0.1955149401791619, "flos": 13370908078080.0, "grad_norm": 2.108328730888849, "language_loss": 0.77604115, "learning_rate": 3.719585663891151e-06, "loss": 0.79843193, "num_input_tokens_seen": 34445815, "step": 1626, "time_per_iteration": 2.83164644241333 }, { "auxiliary_loss_clip": 0.01201629, "auxiliary_loss_mlp": 0.01040725, "balance_loss_clip": 0.90715218, "balance_loss_mlp": 1.03053296, "epoch": 0.195635183069801, "flos": 18728887184640.0, "grad_norm": 1.963803208136876, "language_loss": 0.78831077, "learning_rate": 3.719187757042747e-06, "loss": 0.81073427, "num_input_tokens_seen": 34463635, "step": 1627, "time_per_iteration": 2.727722644805908 }, { "auxiliary_loss_clip": 0.01133865, "auxiliary_loss_mlp": 0.01010142, "balance_loss_clip": 0.96823525, "balance_loss_mlp": 1.00520635, "epoch": 0.1957554259604401, "flos": 69313952615040.0, "grad_norm": 0.7359323698069045, "language_loss": 0.5499022, "learning_rate": 3.7187895893949275e-06, "loss": 0.57134229, "num_input_tokens_seen": 34530105, "step": 1628, "time_per_iteration": 3.374173402786255 }, { "auxiliary_loss_clip": 0.01196946, "auxiliary_loss_mlp": 0.01029568, "balance_loss_clip": 0.90351343, "balance_loss_mlp": 1.01955462, "epoch": 0.19587566885107918, "flos": 21069257736960.0, "grad_norm": 2.545931521578847, "language_loss": 0.76006627, "learning_rate": 3.7183911610080937e-06, "loss": 0.78233135, "num_input_tokens_seen": 34546970, "step": 1629, "time_per_iteration": 2.75081205368042 }, { "auxiliary_loss_clip": 0.01211533, "auxiliary_loss_mlp": 0.01034243, "balance_loss_clip": 0.94344103, "balance_loss_mlp": 1.02372837, "epoch": 0.19599591174171827, "flos": 22194661731840.0, "grad_norm": 2.3072657879667866, "language_loss": 0.75265837, "learning_rate": 3.7179924719426872e-06, "loss": 0.77511609, "num_input_tokens_seen": 34564865, "step": 1630, "time_per_iteration": 2.759579658508301 }, { "auxiliary_loss_clip": 0.01219433, "auxiliary_loss_mlp": 0.01046732, "balance_loss_clip": 1.02442145, "balance_loss_mlp": 1.03571117, "epoch": 0.19611615463235738, "flos": 23768375374080.0, "grad_norm": 2.6708809838632823, "language_loss": 0.75873488, "learning_rate": 3.7175935222591885e-06, "loss": 0.78139651, "num_input_tokens_seen": 34584165, "step": 1631, "time_per_iteration": 3.636859178543091 }, { "auxiliary_loss_clip": 0.01222416, "auxiliary_loss_mlp": 0.01040924, "balance_loss_clip": 0.99082327, "balance_loss_mlp": 1.0307498, "epoch": 0.19623639752299646, "flos": 28618218731520.0, "grad_norm": 2.0559620711163986, "language_loss": 0.74467558, "learning_rate": 3.717194312018118e-06, "loss": 0.76730901, "num_input_tokens_seen": 34603150, "step": 1632, "time_per_iteration": 3.7242753505706787 }, { "auxiliary_loss_clip": 0.01220972, "auxiliary_loss_mlp": 0.01041157, "balance_loss_clip": 1.02482367, "balance_loss_mlp": 1.03035057, "epoch": 0.19635664041363554, "flos": 21032700670080.0, "grad_norm": 2.0198504942688995, "language_loss": 0.76083392, "learning_rate": 3.716794841280036e-06, "loss": 0.78345525, "num_input_tokens_seen": 34621855, "step": 1633, "time_per_iteration": 2.6876745223999023 }, { "auxiliary_loss_clip": 0.01221501, "auxiliary_loss_mlp": 0.01038272, "balance_loss_clip": 1.02338314, "balance_loss_mlp": 1.02738857, "epoch": 0.19647688330427462, "flos": 18879748306560.0, "grad_norm": 2.025263544288532, "language_loss": 0.77424574, "learning_rate": 3.7163951101055407e-06, "loss": 0.79684353, "num_input_tokens_seen": 34639915, "step": 1634, "time_per_iteration": 3.5800082683563232 }, { "auxiliary_loss_clip": 0.01211644, "auxiliary_loss_mlp": 0.01036076, "balance_loss_clip": 0.98660672, "balance_loss_mlp": 1.02541256, "epoch": 0.19659712619491373, "flos": 24242503921920.0, "grad_norm": 1.8088065475212394, "language_loss": 0.79099119, "learning_rate": 3.715995118555273e-06, "loss": 0.8134684, "num_input_tokens_seen": 34659890, "step": 1635, "time_per_iteration": 2.8329617977142334 }, { "auxiliary_loss_clip": 0.01208848, "auxiliary_loss_mlp": 0.01037337, "balance_loss_clip": 0.90694928, "balance_loss_mlp": 1.0262742, "epoch": 0.19671736908555282, "flos": 24717422568960.0, "grad_norm": 1.9891266259640483, "language_loss": 0.8594169, "learning_rate": 3.71559486668991e-06, "loss": 0.88187873, "num_input_tokens_seen": 34678750, "step": 1636, "time_per_iteration": 3.6327967643737793 }, { "auxiliary_loss_clip": 0.01224514, "auxiliary_loss_mlp": 0.01127285, "balance_loss_clip": 1.02552783, "balance_loss_mlp": 0.0, "epoch": 0.1968376119761919, "flos": 23842279607040.0, "grad_norm": 1.5479800146136715, "language_loss": 0.77341223, "learning_rate": 3.715194354570169e-06, "loss": 0.79693025, "num_input_tokens_seen": 34698755, "step": 1637, "time_per_iteration": 2.7140421867370605 }, { "auxiliary_loss_clip": 0.01216368, "auxiliary_loss_mlp": 0.01037116, "balance_loss_clip": 1.02533913, "balance_loss_mlp": 1.02717972, "epoch": 0.196957854866831, "flos": 18113917409280.0, "grad_norm": 1.9250166597814586, "language_loss": 0.82927108, "learning_rate": 3.714793582256809e-06, "loss": 0.85180593, "num_input_tokens_seen": 34715820, "step": 1638, "time_per_iteration": 2.626667022705078 }, { "auxiliary_loss_clip": 0.01217281, "auxiliary_loss_mlp": 0.01036714, "balance_loss_clip": 1.06222486, "balance_loss_mlp": 1.02664053, "epoch": 0.1970780977574701, "flos": 21653129312640.0, "grad_norm": 2.10987416270139, "language_loss": 0.85145688, "learning_rate": 3.7143925498106253e-06, "loss": 0.87399685, "num_input_tokens_seen": 34734360, "step": 1639, "time_per_iteration": 2.6734514236450195 }, { "auxiliary_loss_clip": 0.01211166, "auxiliary_loss_mlp": 0.01042154, "balance_loss_clip": 0.98024577, "balance_loss_mlp": 1.03158629, "epoch": 0.19719834064810918, "flos": 20811813984000.0, "grad_norm": 1.8868428135734994, "language_loss": 0.79123682, "learning_rate": 3.7139912572924558e-06, "loss": 0.81377006, "num_input_tokens_seen": 34753390, "step": 1640, "time_per_iteration": 2.7062230110168457 }, { "auxiliary_loss_clip": 0.01211242, "auxiliary_loss_mlp": 0.01031196, "balance_loss_clip": 1.01952219, "balance_loss_mlp": 1.02072942, "epoch": 0.19731858353874826, "flos": 23434800744960.0, "grad_norm": 2.8640932664370657, "language_loss": 0.80304825, "learning_rate": 3.7135897047631744e-06, "loss": 0.82547265, "num_input_tokens_seen": 34771275, "step": 1641, "time_per_iteration": 2.703068256378174 }, { "auxiliary_loss_clip": 0.01218225, "auxiliary_loss_mlp": 0.01034375, "balance_loss_clip": 0.98532879, "balance_loss_mlp": 1.02423596, "epoch": 0.19743882642938737, "flos": 23988184652160.0, "grad_norm": 2.6951938847303505, "language_loss": 0.76308966, "learning_rate": 3.713187892283698e-06, "loss": 0.78561568, "num_input_tokens_seen": 34790885, "step": 1642, "time_per_iteration": 2.736959934234619 }, { "auxiliary_loss_clip": 0.01210297, "auxiliary_loss_mlp": 0.01029899, "balance_loss_clip": 0.90563422, "balance_loss_mlp": 1.01911032, "epoch": 0.19755906932002645, "flos": 15004340081280.0, "grad_norm": 2.135454942346143, "language_loss": 0.87292749, "learning_rate": 3.71278581991498e-06, "loss": 0.89532942, "num_input_tokens_seen": 34806745, "step": 1643, "time_per_iteration": 2.7780888080596924 }, { "auxiliary_loss_clip": 0.01217414, "auxiliary_loss_mlp": 0.01127508, "balance_loss_clip": 0.94966936, "balance_loss_mlp": 0.0, "epoch": 0.19767931221066554, "flos": 19494466686720.0, "grad_norm": 1.8472645453796077, "language_loss": 0.79353678, "learning_rate": 3.712383487718015e-06, "loss": 0.81698602, "num_input_tokens_seen": 34824985, "step": 1644, "time_per_iteration": 2.7123570442199707 }, { "auxiliary_loss_clip": 0.01202523, "auxiliary_loss_mlp": 0.0103444, "balance_loss_clip": 0.86860573, "balance_loss_mlp": 1.02370548, "epoch": 0.19779955510130465, "flos": 25737895958400.0, "grad_norm": 1.7593067800007083, "language_loss": 0.86422324, "learning_rate": 3.7119808957538365e-06, "loss": 0.88659286, "num_input_tokens_seen": 34843980, "step": 1645, "time_per_iteration": 2.819626569747925 }, { "auxiliary_loss_clip": 0.01204047, "auxiliary_loss_mlp": 0.01039567, "balance_loss_clip": 0.98026466, "balance_loss_mlp": 1.02868342, "epoch": 0.19791979799194373, "flos": 20777699041920.0, "grad_norm": 3.173487031507285, "language_loss": 0.79810786, "learning_rate": 3.711578044083517e-06, "loss": 0.820544, "num_input_tokens_seen": 34860780, "step": 1646, "time_per_iteration": 2.8106539249420166 }, { "auxiliary_loss_clip": 0.01214049, "auxiliary_loss_mlp": 0.01037663, "balance_loss_clip": 0.9831059, "balance_loss_mlp": 1.02765524, "epoch": 0.1980400408825828, "flos": 25589010084480.0, "grad_norm": 1.7328059287382163, "language_loss": 0.74375832, "learning_rate": 3.7111749327681698e-06, "loss": 0.76627553, "num_input_tokens_seen": 34880815, "step": 1647, "time_per_iteration": 2.783585786819458 }, { "auxiliary_loss_clip": 0.01221809, "auxiliary_loss_mlp": 0.01035086, "balance_loss_clip": 1.02595878, "balance_loss_mlp": 1.02493525, "epoch": 0.1981602837732219, "flos": 23513840622720.0, "grad_norm": 2.231002535193413, "language_loss": 0.86227626, "learning_rate": 3.7107715618689455e-06, "loss": 0.88484514, "num_input_tokens_seen": 34899790, "step": 1648, "time_per_iteration": 2.7130489349365234 }, { "auxiliary_loss_clip": 0.01214752, "auxiliary_loss_mlp": 0.01032151, "balance_loss_clip": 1.02414751, "balance_loss_mlp": 1.022012, "epoch": 0.198280526663861, "flos": 23185365724800.0, "grad_norm": 6.369482799092971, "language_loss": 0.83400285, "learning_rate": 3.710367931447035e-06, "loss": 0.8564719, "num_input_tokens_seen": 34921570, "step": 1649, "time_per_iteration": 2.6708791255950928 }, { "auxiliary_loss_clip": 0.01223138, "auxiliary_loss_mlp": 0.01035161, "balance_loss_clip": 1.02375436, "balance_loss_mlp": 1.02468801, "epoch": 0.1984007695545001, "flos": 21689470897920.0, "grad_norm": 2.0866084457316796, "language_loss": 0.86403465, "learning_rate": 3.70996404156367e-06, "loss": 0.88661766, "num_input_tokens_seen": 34941205, "step": 1650, "time_per_iteration": 2.7325820922851562 }, { "auxiliary_loss_clip": 0.01202264, "auxiliary_loss_mlp": 0.01035151, "balance_loss_clip": 0.90626729, "balance_loss_mlp": 1.02492261, "epoch": 0.19852101244513917, "flos": 36064008887040.0, "grad_norm": 1.8597286526253525, "language_loss": 0.72945476, "learning_rate": 3.7095598922801187e-06, "loss": 0.75182897, "num_input_tokens_seen": 34963280, "step": 1651, "time_per_iteration": 2.8614847660064697 }, { "auxiliary_loss_clip": 0.01220587, "auxiliary_loss_mlp": 0.01037063, "balance_loss_clip": 1.0629003, "balance_loss_mlp": 1.02638781, "epoch": 0.19864125533577828, "flos": 23105894883840.0, "grad_norm": 2.018248002313611, "language_loss": 0.75681788, "learning_rate": 3.7091554836576914e-06, "loss": 0.77939439, "num_input_tokens_seen": 34979955, "step": 1652, "time_per_iteration": 2.67767596244812 }, { "auxiliary_loss_clip": 0.01214552, "auxiliary_loss_mlp": 0.01127572, "balance_loss_clip": 1.02336967, "balance_loss_mlp": 0.0, "epoch": 0.19876149822641737, "flos": 24608505553920.0, "grad_norm": 1.6064314307238143, "language_loss": 0.82879329, "learning_rate": 3.708750815757736e-06, "loss": 0.85221446, "num_input_tokens_seen": 35000725, "step": 1653, "time_per_iteration": 2.6493659019470215 }, { "auxiliary_loss_clip": 0.01224406, "auxiliary_loss_mlp": 0.01035986, "balance_loss_clip": 1.02609956, "balance_loss_mlp": 1.02543569, "epoch": 0.19888174111705645, "flos": 32196645308160.0, "grad_norm": 2.18833951014958, "language_loss": 0.73592836, "learning_rate": 3.7083458886416407e-06, "loss": 0.75853229, "num_input_tokens_seen": 35019920, "step": 1654, "time_per_iteration": 2.7876737117767334 }, { "auxiliary_loss_clip": 0.01215327, "auxiliary_loss_mlp": 0.01036897, "balance_loss_clip": 0.87253356, "balance_loss_mlp": 1.02640641, "epoch": 0.19900198400769553, "flos": 24608469640320.0, "grad_norm": 3.1850555238718683, "language_loss": 0.87829947, "learning_rate": 3.707940702370832e-06, "loss": 0.90082169, "num_input_tokens_seen": 35040765, "step": 1655, "time_per_iteration": 2.7842180728912354 }, { "auxiliary_loss_clip": 0.0111442, "auxiliary_loss_mlp": 0.01008719, "balance_loss_clip": 0.99283266, "balance_loss_mlp": 1.00387907, "epoch": 0.19912222689833464, "flos": 67915805673600.0, "grad_norm": 0.7830619035928907, "language_loss": 0.5833143, "learning_rate": 3.707535257006777e-06, "loss": 0.60454571, "num_input_tokens_seen": 35106390, "step": 1656, "time_per_iteration": 3.302994728088379 }, { "auxiliary_loss_clip": 0.01217942, "auxiliary_loss_mlp": 0.0104248, "balance_loss_clip": 0.98491818, "balance_loss_mlp": 1.03227556, "epoch": 0.19924246978897373, "flos": 15742340916480.0, "grad_norm": 2.046833695604353, "language_loss": 0.89071602, "learning_rate": 3.707129552610981e-06, "loss": 0.91332018, "num_input_tokens_seen": 35125040, "step": 1657, "time_per_iteration": 4.699731349945068 }, { "auxiliary_loss_clip": 0.01208642, "auxiliary_loss_mlp": 0.01036212, "balance_loss_clip": 0.98686051, "balance_loss_mlp": 1.02614462, "epoch": 0.1993627126796128, "flos": 17566566986880.0, "grad_norm": 2.3788115302750055, "language_loss": 0.7388972, "learning_rate": 3.70672358924499e-06, "loss": 0.76134574, "num_input_tokens_seen": 35144280, "step": 1658, "time_per_iteration": 2.6862566471099854 }, { "auxiliary_loss_clip": 0.01212863, "auxiliary_loss_mlp": 0.01040126, "balance_loss_clip": 0.94899619, "balance_loss_mlp": 1.02973127, "epoch": 0.19948295557025192, "flos": 40843826680320.0, "grad_norm": 1.8425025549378267, "language_loss": 0.78515363, "learning_rate": 3.706317366970386e-06, "loss": 0.80768347, "num_input_tokens_seen": 35165280, "step": 1659, "time_per_iteration": 3.7526538372039795 }, { "auxiliary_loss_clip": 0.01222635, "auxiliary_loss_mlp": 0.01127831, "balance_loss_clip": 1.06212163, "balance_loss_mlp": 0.0, "epoch": 0.199603198460891, "flos": 25082418620160.0, "grad_norm": 1.7951252497130255, "language_loss": 0.83624107, "learning_rate": 3.705910885848795e-06, "loss": 0.85974574, "num_input_tokens_seen": 35183655, "step": 1660, "time_per_iteration": 2.8295798301696777 }, { "auxiliary_loss_clip": 0.01215466, "auxiliary_loss_mlp": 0.01031622, "balance_loss_clip": 1.02373791, "balance_loss_mlp": 1.02153134, "epoch": 0.19972344135153008, "flos": 20084120352000.0, "grad_norm": 1.9913846890883344, "language_loss": 0.84891421, "learning_rate": 3.705504145941879e-06, "loss": 0.8713851, "num_input_tokens_seen": 35201825, "step": 1661, "time_per_iteration": 3.665513753890991 }, { "auxiliary_loss_clip": 0.01220328, "auxiliary_loss_mlp": 0.01033883, "balance_loss_clip": 1.06335282, "balance_loss_mlp": 1.02361298, "epoch": 0.1998436842421692, "flos": 23727472761600.0, "grad_norm": 1.9691157060210922, "language_loss": 0.79103643, "learning_rate": 3.7050971473113403e-06, "loss": 0.81357849, "num_input_tokens_seen": 35221600, "step": 1662, "time_per_iteration": 2.661255121231079 }, { "auxiliary_loss_clip": 0.0121341, "auxiliary_loss_mlp": 0.01126885, "balance_loss_clip": 1.02237988, "balance_loss_mlp": 0.0, "epoch": 0.19996392713280828, "flos": 36102361633920.0, "grad_norm": 1.7004663363114525, "language_loss": 0.79939914, "learning_rate": 3.7046898900189196e-06, "loss": 0.82280219, "num_input_tokens_seen": 35245935, "step": 1663, "time_per_iteration": 2.8234400749206543 }, { "auxiliary_loss_clip": 0.01221604, "auxiliary_loss_mlp": 0.0103764, "balance_loss_clip": 0.95188689, "balance_loss_mlp": 1.02702498, "epoch": 0.20008417002344736, "flos": 23657662679040.0, "grad_norm": 1.8979668514158048, "language_loss": 0.82868385, "learning_rate": 3.704282374126398e-06, "loss": 0.85127628, "num_input_tokens_seen": 35265615, "step": 1664, "time_per_iteration": 2.72611665725708 }, { "auxiliary_loss_clip": 0.01211609, "auxiliary_loss_mlp": 0.01036372, "balance_loss_clip": 0.94688326, "balance_loss_mlp": 1.02515435, "epoch": 0.20020441291408644, "flos": 21872076664320.0, "grad_norm": 1.8413413721715621, "language_loss": 0.87447125, "learning_rate": 3.7038745996955954e-06, "loss": 0.89695108, "num_input_tokens_seen": 35284960, "step": 1665, "time_per_iteration": 309.98071932792664 }, { "auxiliary_loss_clip": 0.01221425, "auxiliary_loss_mlp": 0.01037797, "balance_loss_clip": 0.94793344, "balance_loss_mlp": 1.02699089, "epoch": 0.20032465580472555, "flos": 23179691376000.0, "grad_norm": 3.0458244013967795, "language_loss": 0.71696347, "learning_rate": 3.703466566788371e-06, "loss": 0.73955572, "num_input_tokens_seen": 35304090, "step": 1666, "time_per_iteration": 2.802441120147705 }, { "auxiliary_loss_clip": 0.01206469, "auxiliary_loss_mlp": 0.01040798, "balance_loss_clip": 0.98398095, "balance_loss_mlp": 1.03000927, "epoch": 0.20044489869536464, "flos": 23873521461120.0, "grad_norm": 1.8977712242231253, "language_loss": 0.74865568, "learning_rate": 3.703058275466622e-06, "loss": 0.77112842, "num_input_tokens_seen": 35323325, "step": 1667, "time_per_iteration": 2.722126007080078 }, { "auxiliary_loss_clip": 0.0121763, "auxiliary_loss_mlp": 0.01038791, "balance_loss_clip": 0.98687661, "balance_loss_mlp": 1.02793717, "epoch": 0.20056514158600372, "flos": 21945226711680.0, "grad_norm": 1.7401498831800766, "language_loss": 0.77691972, "learning_rate": 3.7026497257922877e-06, "loss": 0.7994839, "num_input_tokens_seen": 35343635, "step": 1668, "time_per_iteration": 2.714282274246216 }, { "auxiliary_loss_clip": 0.01206685, "auxiliary_loss_mlp": 0.01038794, "balance_loss_clip": 0.9049859, "balance_loss_mlp": 1.02791643, "epoch": 0.20068538447664283, "flos": 23879159896320.0, "grad_norm": 1.8420686130386008, "language_loss": 0.8534168, "learning_rate": 3.7022409178273436e-06, "loss": 0.87587154, "num_input_tokens_seen": 35364615, "step": 1669, "time_per_iteration": 2.826768159866333 }, { "auxiliary_loss_clip": 0.01214471, "auxiliary_loss_mlp": 0.01036191, "balance_loss_clip": 1.02236032, "balance_loss_mlp": 1.02588511, "epoch": 0.2008056273672819, "flos": 18442823270400.0, "grad_norm": 1.9474920273378389, "language_loss": 0.78768837, "learning_rate": 3.7018318516338054e-06, "loss": 0.81019497, "num_input_tokens_seen": 35383775, "step": 1670, "time_per_iteration": 2.666628837585449 }, { "auxiliary_loss_clip": 0.0122449, "auxiliary_loss_mlp": 0.01038796, "balance_loss_clip": 1.02585685, "balance_loss_mlp": 1.02845442, "epoch": 0.200925870257921, "flos": 23659530186240.0, "grad_norm": 4.012510801447694, "language_loss": 0.81984222, "learning_rate": 3.7014225272737284e-06, "loss": 0.84247506, "num_input_tokens_seen": 35403000, "step": 1671, "time_per_iteration": 2.729698896408081 }, { "auxiliary_loss_clip": 0.01208341, "auxiliary_loss_mlp": 0.01034527, "balance_loss_clip": 1.02395403, "balance_loss_mlp": 1.02360725, "epoch": 0.20104611314856008, "flos": 16217115909120.0, "grad_norm": 4.335556678248814, "language_loss": 0.73437572, "learning_rate": 3.701012944809207e-06, "loss": 0.75680441, "num_input_tokens_seen": 35420115, "step": 1672, "time_per_iteration": 2.6326162815093994 }, { "auxiliary_loss_clip": 0.01218112, "auxiliary_loss_mlp": 0.01127314, "balance_loss_clip": 0.98984492, "balance_loss_mlp": 0.0, "epoch": 0.2011663560391992, "flos": 21397373498880.0, "grad_norm": 2.0850362366161144, "language_loss": 0.79278982, "learning_rate": 3.700603104302374e-06, "loss": 0.81624413, "num_input_tokens_seen": 35439925, "step": 1673, "time_per_iteration": 2.752744197845459 }, { "auxiliary_loss_clip": 0.01116907, "auxiliary_loss_mlp": 0.01011872, "balance_loss_clip": 0.92631328, "balance_loss_mlp": 1.00693691, "epoch": 0.20128659892983827, "flos": 62229459409920.0, "grad_norm": 0.8995925716891555, "language_loss": 0.55949306, "learning_rate": 3.7001930058154027e-06, "loss": 0.58078086, "num_input_tokens_seen": 35504885, "step": 1674, "time_per_iteration": 3.3057944774627686 }, { "auxiliary_loss_clip": 0.01215678, "auxiliary_loss_mlp": 0.01043311, "balance_loss_clip": 0.94711226, "balance_loss_mlp": 1.03286862, "epoch": 0.20140684182047736, "flos": 28438737448320.0, "grad_norm": 2.5290662503589973, "language_loss": 0.79244113, "learning_rate": 3.6997826494105037e-06, "loss": 0.81503105, "num_input_tokens_seen": 35525330, "step": 1675, "time_per_iteration": 2.7952358722686768 }, { "auxiliary_loss_clip": 0.01214345, "auxiliary_loss_mlp": 0.01043365, "balance_loss_clip": 0.98538464, "balance_loss_mlp": 1.03312516, "epoch": 0.20152708471111647, "flos": 28074064619520.0, "grad_norm": 2.014915913377188, "language_loss": 0.69776225, "learning_rate": 3.6993720351499286e-06, "loss": 0.72033936, "num_input_tokens_seen": 35546455, "step": 1676, "time_per_iteration": 2.7816250324249268 }, { "auxiliary_loss_clip": 0.01212528, "auxiliary_loss_mlp": 0.01040452, "balance_loss_clip": 0.98819268, "balance_loss_mlp": 1.02976537, "epoch": 0.20164732760175555, "flos": 23549751244800.0, "grad_norm": 1.8452610604239599, "language_loss": 0.76958162, "learning_rate": 3.6989611630959666e-06, "loss": 0.7921114, "num_input_tokens_seen": 35565010, "step": 1677, "time_per_iteration": 2.749936103820801 }, { "auxiliary_loss_clip": 0.0112117, "auxiliary_loss_mlp": 0.0101472, "balance_loss_clip": 0.99499631, "balance_loss_mlp": 1.0098561, "epoch": 0.20176757049239463, "flos": 71100616037760.0, "grad_norm": 0.6814419407802931, "language_loss": 0.58330625, "learning_rate": 3.6985500333109474e-06, "loss": 0.60466516, "num_input_tokens_seen": 35633340, "step": 1678, "time_per_iteration": 3.341141939163208 }, { "auxiliary_loss_clip": 0.01201658, "auxiliary_loss_mlp": 0.01037886, "balance_loss_clip": 0.94382095, "balance_loss_mlp": 1.0280509, "epoch": 0.20188781338303372, "flos": 21430159637760.0, "grad_norm": 3.043779199998314, "language_loss": 0.76641291, "learning_rate": 3.6981386458572385e-06, "loss": 0.78880835, "num_input_tokens_seen": 35651315, "step": 1679, "time_per_iteration": 2.709470748901367 }, { "auxiliary_loss_clip": 0.01205106, "auxiliary_loss_mlp": 0.01033932, "balance_loss_clip": 0.9442817, "balance_loss_mlp": 1.02330995, "epoch": 0.20200805627367283, "flos": 11546215130880.0, "grad_norm": 4.774287942594052, "language_loss": 0.75944042, "learning_rate": 3.6977270007972468e-06, "loss": 0.78183079, "num_input_tokens_seen": 35668850, "step": 1680, "time_per_iteration": 2.809690475463867 }, { "auxiliary_loss_clip": 0.0122261, "auxiliary_loss_mlp": 0.01040926, "balance_loss_clip": 0.988886, "balance_loss_mlp": 1.03078127, "epoch": 0.2021282991643119, "flos": 28545391906560.0, "grad_norm": 2.6882572284726995, "language_loss": 0.72948468, "learning_rate": 3.6973150981934196e-06, "loss": 0.75212002, "num_input_tokens_seen": 35690080, "step": 1681, "time_per_iteration": 2.7733473777770996 }, { "auxiliary_loss_clip": 0.01227552, "auxiliary_loss_mlp": 0.01033866, "balance_loss_clip": 1.0660975, "balance_loss_mlp": 1.02283347, "epoch": 0.202248542054951, "flos": 17923446564480.0, "grad_norm": 2.3561532520401913, "language_loss": 0.83847165, "learning_rate": 3.6969029381082415e-06, "loss": 0.86108589, "num_input_tokens_seen": 35706075, "step": 1682, "time_per_iteration": 4.645304203033447 }, { "auxiliary_loss_clip": 0.01215974, "auxiliary_loss_mlp": 0.01034819, "balance_loss_clip": 0.98643607, "balance_loss_mlp": 1.02493644, "epoch": 0.2023687849455901, "flos": 19864634296320.0, "grad_norm": 1.7081314441715596, "language_loss": 0.79501951, "learning_rate": 3.696490520604237e-06, "loss": 0.81752741, "num_input_tokens_seen": 35724765, "step": 1683, "time_per_iteration": 2.6545932292938232 }, { "auxiliary_loss_clip": 0.01215148, "auxiliary_loss_mlp": 0.01044, "balance_loss_clip": 1.02573645, "balance_loss_mlp": 1.03415358, "epoch": 0.20248902783622919, "flos": 22564721600640.0, "grad_norm": 1.6472559873232713, "language_loss": 0.80737376, "learning_rate": 3.696077845743968e-06, "loss": 0.82996523, "num_input_tokens_seen": 35744355, "step": 1684, "time_per_iteration": 2.6448566913604736 }, { "auxiliary_loss_clip": 0.01222002, "auxiliary_loss_mlp": 0.0103892, "balance_loss_clip": 1.06264448, "balance_loss_mlp": 1.02845323, "epoch": 0.20260927072686827, "flos": 22709728805760.0, "grad_norm": 2.5896255248305464, "language_loss": 0.72964507, "learning_rate": 3.69566491359004e-06, "loss": 0.75225431, "num_input_tokens_seen": 35761000, "step": 1685, "time_per_iteration": 3.6076552867889404 }, { "auxiliary_loss_clip": 0.01212371, "auxiliary_loss_mlp": 0.01049791, "balance_loss_clip": 0.98400652, "balance_loss_mlp": 1.03918767, "epoch": 0.20272951361750738, "flos": 51023998650240.0, "grad_norm": 1.6665974359256748, "language_loss": 0.6939612, "learning_rate": 3.695251724205092e-06, "loss": 0.71658278, "num_input_tokens_seen": 35785360, "step": 1686, "time_per_iteration": 3.0103702545166016 }, { "auxiliary_loss_clip": 0.01221047, "auxiliary_loss_mlp": 0.01033163, "balance_loss_clip": 1.06412268, "balance_loss_mlp": 1.02271485, "epoch": 0.20284975650814646, "flos": 26578133879040.0, "grad_norm": 1.7135935841850911, "language_loss": 0.86528367, "learning_rate": 3.6948382776518054e-06, "loss": 0.88782573, "num_input_tokens_seen": 35806065, "step": 1687, "time_per_iteration": 3.5564956665039062 }, { "auxiliary_loss_clip": 0.01225783, "auxiliary_loss_mlp": 0.01043083, "balance_loss_clip": 0.94855815, "balance_loss_mlp": 1.03250933, "epoch": 0.20296999939878554, "flos": 16034222833920.0, "grad_norm": 2.0952779691212644, "language_loss": 0.79508781, "learning_rate": 3.6944245739929e-06, "loss": 0.81777644, "num_input_tokens_seen": 35822225, "step": 1688, "time_per_iteration": 2.7345800399780273 }, { "auxiliary_loss_clip": 0.01219511, "auxiliary_loss_mlp": 0.01047305, "balance_loss_clip": 1.02531481, "balance_loss_mlp": 1.03640354, "epoch": 0.20309024228942463, "flos": 19203374868480.0, "grad_norm": 2.4194108712790694, "language_loss": 0.72085834, "learning_rate": 3.6940106132911332e-06, "loss": 0.74352658, "num_input_tokens_seen": 35839410, "step": 1689, "time_per_iteration": 2.606532096862793 }, { "auxiliary_loss_clip": 0.01219376, "auxiliary_loss_mlp": 0.01034868, "balance_loss_clip": 1.02406991, "balance_loss_mlp": 1.02458024, "epoch": 0.20321048518006374, "flos": 22821087945600.0, "grad_norm": 1.8620699105483434, "language_loss": 0.88773251, "learning_rate": 3.6935963956093037e-06, "loss": 0.91027492, "num_input_tokens_seen": 35859495, "step": 1690, "time_per_iteration": 2.77760648727417 }, { "auxiliary_loss_clip": 0.01206593, "auxiliary_loss_mlp": 0.01039236, "balance_loss_clip": 1.02289224, "balance_loss_mlp": 1.0293355, "epoch": 0.20333072807070282, "flos": 19096397187840.0, "grad_norm": 1.7696875759674184, "language_loss": 0.68835735, "learning_rate": 3.6931819210102474e-06, "loss": 0.71081567, "num_input_tokens_seen": 35878890, "step": 1691, "time_per_iteration": 2.766986846923828 }, { "auxiliary_loss_clip": 0.01220858, "auxiliary_loss_mlp": 0.01039226, "balance_loss_clip": 1.06234944, "balance_loss_mlp": 1.02871144, "epoch": 0.2034509709613419, "flos": 18180962144640.0, "grad_norm": 1.7694764400575156, "language_loss": 0.844028, "learning_rate": 3.6927671895568402e-06, "loss": 0.86662889, "num_input_tokens_seen": 35897950, "step": 1692, "time_per_iteration": 2.7129931449890137 }, { "auxiliary_loss_clip": 0.01220285, "auxiliary_loss_mlp": 0.01044499, "balance_loss_clip": 1.06292474, "balance_loss_mlp": 1.03336525, "epoch": 0.20357121385198101, "flos": 22923899648640.0, "grad_norm": 2.18533659067518, "language_loss": 0.86953902, "learning_rate": 3.692352201311996e-06, "loss": 0.89218682, "num_input_tokens_seen": 35916800, "step": 1693, "time_per_iteration": 2.727339744567871 }, { "auxiliary_loss_clip": 0.01208302, "auxiliary_loss_mlp": 0.01033959, "balance_loss_clip": 0.94503069, "balance_loss_mlp": 1.02304542, "epoch": 0.2036914567426201, "flos": 20922131629440.0, "grad_norm": 1.9766120247276013, "language_loss": 0.76845288, "learning_rate": 3.6919369563386687e-06, "loss": 0.79087555, "num_input_tokens_seen": 35936600, "step": 1694, "time_per_iteration": 2.727386713027954 }, { "auxiliary_loss_clip": 0.01213527, "auxiliary_loss_mlp": 0.01039945, "balance_loss_clip": 0.98587888, "balance_loss_mlp": 1.02962184, "epoch": 0.20381169963325918, "flos": 15519155760000.0, "grad_norm": 2.143055090464037, "language_loss": 0.7924794, "learning_rate": 3.69152145469985e-06, "loss": 0.81501412, "num_input_tokens_seen": 35953645, "step": 1695, "time_per_iteration": 2.728583335876465 }, { "auxiliary_loss_clip": 0.0121274, "auxiliary_loss_mlp": 0.01038265, "balance_loss_clip": 0.90611517, "balance_loss_mlp": 1.02705359, "epoch": 0.20393194252389826, "flos": 28833143760000.0, "grad_norm": 1.7669067606044666, "language_loss": 0.82105911, "learning_rate": 3.691105696458572e-06, "loss": 0.84356916, "num_input_tokens_seen": 35970940, "step": 1696, "time_per_iteration": 2.804555892944336 }, { "auxiliary_loss_clip": 0.01220909, "auxiliary_loss_mlp": 0.01036949, "balance_loss_clip": 1.06479394, "balance_loss_mlp": 1.02707279, "epoch": 0.20405218541453737, "flos": 22488554810880.0, "grad_norm": 2.548621644786423, "language_loss": 0.66591787, "learning_rate": 3.690689681677904e-06, "loss": 0.68849647, "num_input_tokens_seen": 35989410, "step": 1697, "time_per_iteration": 2.6778564453125 }, { "auxiliary_loss_clip": 0.01217045, "auxiliary_loss_mlp": 0.01035325, "balance_loss_clip": 0.98287201, "balance_loss_mlp": 1.02499592, "epoch": 0.20417242830517646, "flos": 25374408278400.0, "grad_norm": 1.7190600317801201, "language_loss": 0.88623738, "learning_rate": 3.690273410420956e-06, "loss": 0.90876102, "num_input_tokens_seen": 36009175, "step": 1698, "time_per_iteration": 2.777348518371582 }, { "auxiliary_loss_clip": 0.01214484, "auxiliary_loss_mlp": 0.01040276, "balance_loss_clip": 1.02247047, "balance_loss_mlp": 1.02987504, "epoch": 0.20429267119581554, "flos": 14793078240000.0, "grad_norm": 3.2108477138013973, "language_loss": 0.76424813, "learning_rate": 3.689856882750875e-06, "loss": 0.78679574, "num_input_tokens_seen": 36024375, "step": 1699, "time_per_iteration": 2.688477039337158 }, { "auxiliary_loss_clip": 0.01214329, "auxiliary_loss_mlp": 0.01033799, "balance_loss_clip": 1.02435577, "balance_loss_mlp": 1.02402985, "epoch": 0.20441291408645465, "flos": 17781851151360.0, "grad_norm": 1.7264351764163937, "language_loss": 0.78613263, "learning_rate": 3.6894400987308486e-06, "loss": 0.8086139, "num_input_tokens_seen": 36041895, "step": 1700, "time_per_iteration": 2.630478620529175 }, { "auxiliary_loss_clip": 0.01220353, "auxiliary_loss_mlp": 0.01034628, "balance_loss_clip": 1.02252841, "balance_loss_mlp": 1.02434576, "epoch": 0.20453315697709373, "flos": 16435668211200.0, "grad_norm": 2.413602337929441, "language_loss": 0.85314715, "learning_rate": 3.6890230584241024e-06, "loss": 0.87569696, "num_input_tokens_seen": 36058825, "step": 1701, "time_per_iteration": 2.6611053943634033 }, { "auxiliary_loss_clip": 0.01129147, "auxiliary_loss_mlp": 0.01013729, "balance_loss_clip": 1.03781343, "balance_loss_mlp": 1.00919867, "epoch": 0.20465339986773282, "flos": 66713085653760.0, "grad_norm": 1.0613556209440447, "language_loss": 0.66328657, "learning_rate": 3.6886057618939016e-06, "loss": 0.68471527, "num_input_tokens_seen": 36121645, "step": 1702, "time_per_iteration": 3.3066306114196777 }, { "auxiliary_loss_clip": 0.01206693, "auxiliary_loss_mlp": 0.01038599, "balance_loss_clip": 0.94331729, "balance_loss_mlp": 1.0284667, "epoch": 0.2047736427583719, "flos": 41974114924800.0, "grad_norm": 2.073832098726894, "language_loss": 0.69242918, "learning_rate": 3.6881882092035492e-06, "loss": 0.71488214, "num_input_tokens_seen": 36143030, "step": 1703, "time_per_iteration": 2.8850784301757812 }, { "auxiliary_loss_clip": 0.01136433, "auxiliary_loss_mlp": 0.01122573, "balance_loss_clip": 0.92414224, "balance_loss_mlp": 0.0, "epoch": 0.204893885649011, "flos": 69940878641280.0, "grad_norm": 0.9331244989692222, "language_loss": 0.61150873, "learning_rate": 3.6877704004163873e-06, "loss": 0.63409877, "num_input_tokens_seen": 36203435, "step": 1704, "time_per_iteration": 3.41064453125 }, { "auxiliary_loss_clip": 0.01221945, "auxiliary_loss_mlp": 0.0103574, "balance_loss_clip": 1.06247199, "balance_loss_mlp": 1.02502298, "epoch": 0.2050141285396501, "flos": 22200012858240.0, "grad_norm": 1.788983858598234, "language_loss": 0.77498543, "learning_rate": 3.6873523355957984e-06, "loss": 0.79756224, "num_input_tokens_seen": 36222435, "step": 1705, "time_per_iteration": 2.619774103164673 }, { "auxiliary_loss_clip": 0.01126147, "auxiliary_loss_mlp": 0.01013565, "balance_loss_clip": 1.03541517, "balance_loss_mlp": 1.00901115, "epoch": 0.20513437143028918, "flos": 46283721730560.0, "grad_norm": 0.9919307198988149, "language_loss": 0.64039624, "learning_rate": 3.686934014805201e-06, "loss": 0.66179335, "num_input_tokens_seen": 36273065, "step": 1706, "time_per_iteration": 3.0790059566497803 }, { "auxiliary_loss_clip": 0.01218351, "auxiliary_loss_mlp": 0.01035059, "balance_loss_clip": 1.02443731, "balance_loss_mlp": 1.02495027, "epoch": 0.20525461432092829, "flos": 21904324099200.0, "grad_norm": 1.828463176911667, "language_loss": 0.80800867, "learning_rate": 3.6865154381080552e-06, "loss": 0.8305428, "num_input_tokens_seen": 36293750, "step": 1707, "time_per_iteration": 3.653841733932495 }, { "auxiliary_loss_clip": 0.01208212, "auxiliary_loss_mlp": 0.01034811, "balance_loss_clip": 0.82945806, "balance_loss_mlp": 1.02453518, "epoch": 0.20537485721156737, "flos": 21214264942080.0, "grad_norm": 1.9184740283818693, "language_loss": 0.82784164, "learning_rate": 3.6860966055678585e-06, "loss": 0.85027182, "num_input_tokens_seen": 36310105, "step": 1708, "time_per_iteration": 2.8750734329223633 }, { "auxiliary_loss_clip": 0.01218356, "auxiliary_loss_mlp": 0.01036229, "balance_loss_clip": 1.02469122, "balance_loss_mlp": 1.02511287, "epoch": 0.20549510010220645, "flos": 20191205773440.0, "grad_norm": 1.7389932629839662, "language_loss": 0.86297047, "learning_rate": 3.685677517248147e-06, "loss": 0.88551629, "num_input_tokens_seen": 36328995, "step": 1709, "time_per_iteration": 3.6260204315185547 }, { "auxiliary_loss_clip": 0.01216427, "auxiliary_loss_mlp": 0.01127046, "balance_loss_clip": 0.98997247, "balance_loss_mlp": 0.0, "epoch": 0.20561534299284553, "flos": 17016702612480.0, "grad_norm": 2.00633778748332, "language_loss": 0.80467808, "learning_rate": 3.6852581732124967e-06, "loss": 0.82811284, "num_input_tokens_seen": 36346340, "step": 1710, "time_per_iteration": 2.7922816276550293 }, { "auxiliary_loss_clip": 0.01221654, "auxiliary_loss_mlp": 0.01039555, "balance_loss_clip": 1.02736425, "balance_loss_mlp": 1.0291121, "epoch": 0.20573558588348465, "flos": 22890467064960.0, "grad_norm": 2.737143737754973, "language_loss": 0.76073492, "learning_rate": 3.6848385735245213e-06, "loss": 0.78334701, "num_input_tokens_seen": 36365430, "step": 1711, "time_per_iteration": 3.6910741329193115 }, { "auxiliary_loss_clip": 0.01203809, "auxiliary_loss_mlp": 0.01040213, "balance_loss_clip": 1.0204258, "balance_loss_mlp": 1.03026557, "epoch": 0.20585582877412373, "flos": 24643123286400.0, "grad_norm": 1.777075650989988, "language_loss": 0.86120892, "learning_rate": 3.6844187182478734e-06, "loss": 0.88364917, "num_input_tokens_seen": 36386285, "step": 1712, "time_per_iteration": 2.740769624710083 }, { "auxiliary_loss_clip": 0.01202833, "auxiliary_loss_mlp": 0.01034142, "balance_loss_clip": 0.98109704, "balance_loss_mlp": 1.02348411, "epoch": 0.2059760716647628, "flos": 24206952435840.0, "grad_norm": 1.9347119139976137, "language_loss": 0.74787927, "learning_rate": 3.683998607446246e-06, "loss": 0.77024901, "num_input_tokens_seen": 36404935, "step": 1713, "time_per_iteration": 2.690810203552246 }, { "auxiliary_loss_clip": 0.01216901, "auxiliary_loss_mlp": 0.01035109, "balance_loss_clip": 1.02285051, "balance_loss_mlp": 1.02498257, "epoch": 0.20609631455540192, "flos": 20229522606720.0, "grad_norm": 2.3008225615212083, "language_loss": 0.74935734, "learning_rate": 3.6835782411833686e-06, "loss": 0.77187747, "num_input_tokens_seen": 36424455, "step": 1714, "time_per_iteration": 3.578345537185669 }, { "auxiliary_loss_clip": 0.01199011, "auxiliary_loss_mlp": 0.01039894, "balance_loss_clip": 0.9420296, "balance_loss_mlp": 1.03007734, "epoch": 0.206216557446041, "flos": 19864957518720.0, "grad_norm": 1.8199982986428729, "language_loss": 0.73955703, "learning_rate": 3.68315761952301e-06, "loss": 0.76194608, "num_input_tokens_seen": 36441685, "step": 1715, "time_per_iteration": 2.681640625 }, { "auxiliary_loss_clip": 0.01222163, "auxiliary_loss_mlp": 0.01037802, "balance_loss_clip": 1.06303692, "balance_loss_mlp": 1.02712131, "epoch": 0.2063368003366801, "flos": 24096311568000.0, "grad_norm": 1.8576930706329995, "language_loss": 0.82838738, "learning_rate": 3.6827367425289797e-06, "loss": 0.85098702, "num_input_tokens_seen": 36461460, "step": 1716, "time_per_iteration": 2.6935675144195557 }, { "auxiliary_loss_clip": 0.01217768, "auxiliary_loss_mlp": 0.01037356, "balance_loss_clip": 0.98581839, "balance_loss_mlp": 1.02666938, "epoch": 0.2064570432273192, "flos": 20340163474560.0, "grad_norm": 2.181474115386808, "language_loss": 0.72399688, "learning_rate": 3.6823156102651225e-06, "loss": 0.74654812, "num_input_tokens_seen": 36479615, "step": 1717, "time_per_iteration": 2.7067513465881348 }, { "auxiliary_loss_clip": 0.01201151, "auxiliary_loss_mlp": 0.01033637, "balance_loss_clip": 0.87131017, "balance_loss_mlp": 1.02333713, "epoch": 0.20657728611795828, "flos": 20520363029760.0, "grad_norm": 2.037162001274475, "language_loss": 0.71188414, "learning_rate": 3.6818942227953257e-06, "loss": 0.73423207, "num_input_tokens_seen": 36500160, "step": 1718, "time_per_iteration": 2.806422710418701 }, { "auxiliary_loss_clip": 0.01214175, "auxiliary_loss_mlp": 0.01046706, "balance_loss_clip": 0.94555652, "balance_loss_mlp": 1.03622794, "epoch": 0.20669752900859736, "flos": 21799285752960.0, "grad_norm": 1.9224446301699007, "language_loss": 0.69252884, "learning_rate": 3.681472580183512e-06, "loss": 0.7151376, "num_input_tokens_seen": 36518810, "step": 1719, "time_per_iteration": 2.816861629486084 }, { "auxiliary_loss_clip": 0.01215236, "auxiliary_loss_mlp": 0.01039091, "balance_loss_clip": 1.02629328, "balance_loss_mlp": 1.02969146, "epoch": 0.20681777189923645, "flos": 15122020014720.0, "grad_norm": 1.892119057763836, "language_loss": 0.86131167, "learning_rate": 3.6810506824936455e-06, "loss": 0.88385493, "num_input_tokens_seen": 36536890, "step": 1720, "time_per_iteration": 2.729114294052124 }, { "auxiliary_loss_clip": 0.01126898, "auxiliary_loss_mlp": 0.01010709, "balance_loss_clip": 0.95904505, "balance_loss_mlp": 1.00574958, "epoch": 0.20693801478987556, "flos": 56481021509760.0, "grad_norm": 1.0832176281484136, "language_loss": 0.62481499, "learning_rate": 3.680628529789726e-06, "loss": 0.646191, "num_input_tokens_seen": 36589300, "step": 1721, "time_per_iteration": 3.099411725997925 }, { "auxiliary_loss_clip": 0.0122996, "auxiliary_loss_mlp": 0.01045528, "balance_loss_clip": 1.06631672, "balance_loss_mlp": 1.0349896, "epoch": 0.20705825768051464, "flos": 21614201948160.0, "grad_norm": 1.807849379628114, "language_loss": 0.86547744, "learning_rate": 3.680206122135796e-06, "loss": 0.88823235, "num_input_tokens_seen": 36609905, "step": 1722, "time_per_iteration": 2.6191093921661377 }, { "auxiliary_loss_clip": 0.01219456, "auxiliary_loss_mlp": 0.01043355, "balance_loss_clip": 0.91100466, "balance_loss_mlp": 1.03296614, "epoch": 0.20717850057115372, "flos": 25848895962240.0, "grad_norm": 1.9036678104202436, "language_loss": 0.78493291, "learning_rate": 3.6797834595959323e-06, "loss": 0.80756104, "num_input_tokens_seen": 36629805, "step": 1723, "time_per_iteration": 2.800062417984009 }, { "auxiliary_loss_clip": 0.01198933, "auxiliary_loss_mlp": 0.01039599, "balance_loss_clip": 0.90331352, "balance_loss_mlp": 1.02858949, "epoch": 0.20729874346179283, "flos": 29130807767040.0, "grad_norm": 2.8117500092300634, "language_loss": 0.77863514, "learning_rate": 3.679360542234254e-06, "loss": 0.8010205, "num_input_tokens_seen": 36649150, "step": 1724, "time_per_iteration": 2.792860984802246 }, { "auxiliary_loss_clip": 0.01202835, "auxiliary_loss_mlp": 0.01127521, "balance_loss_clip": 0.97947526, "balance_loss_mlp": 0.0, "epoch": 0.20741898635243192, "flos": 29023363209600.0, "grad_norm": 1.8712227158997772, "language_loss": 0.72440159, "learning_rate": 3.678937370114916e-06, "loss": 0.74770522, "num_input_tokens_seen": 36668955, "step": 1725, "time_per_iteration": 2.8233699798583984 }, { "auxiliary_loss_clip": 0.01211916, "auxiliary_loss_mlp": 0.01031169, "balance_loss_clip": 0.9857108, "balance_loss_mlp": 1.02193081, "epoch": 0.207539229243071, "flos": 15559447841280.0, "grad_norm": 2.3544175638740925, "language_loss": 0.79448432, "learning_rate": 3.678513943302114e-06, "loss": 0.81691515, "num_input_tokens_seen": 36685730, "step": 1726, "time_per_iteration": 2.6765520572662354 }, { "auxiliary_loss_clip": 0.01219683, "auxiliary_loss_mlp": 0.0103278, "balance_loss_clip": 1.06380033, "balance_loss_mlp": 1.02259982, "epoch": 0.20765947213371008, "flos": 20521081301760.0, "grad_norm": 2.0086634945268513, "language_loss": 0.85290617, "learning_rate": 3.678090261860082e-06, "loss": 0.87543082, "num_input_tokens_seen": 36705460, "step": 1727, "time_per_iteration": 2.600090742111206 }, { "auxiliary_loss_clip": 0.01214087, "auxiliary_loss_mlp": 0.01034859, "balance_loss_clip": 0.94228625, "balance_loss_mlp": 1.02516675, "epoch": 0.2077797150243492, "flos": 19354415558400.0, "grad_norm": 4.643442684322135, "language_loss": 0.77411139, "learning_rate": 3.6776663258530906e-06, "loss": 0.79660082, "num_input_tokens_seen": 36724110, "step": 1728, "time_per_iteration": 2.7566096782684326 }, { "auxiliary_loss_clip": 0.01218274, "auxiliary_loss_mlp": 0.01035103, "balance_loss_clip": 1.02175999, "balance_loss_mlp": 1.02541709, "epoch": 0.20789995791498828, "flos": 21829952989440.0, "grad_norm": 1.855378997165975, "language_loss": 0.71258891, "learning_rate": 3.6772421353454516e-06, "loss": 0.73512262, "num_input_tokens_seen": 36742705, "step": 1729, "time_per_iteration": 2.631056070327759 }, { "auxiliary_loss_clip": 0.01217871, "auxiliary_loss_mlp": 0.01033227, "balance_loss_clip": 1.02530456, "balance_loss_mlp": 1.02332056, "epoch": 0.20802020080562736, "flos": 23148844571520.0, "grad_norm": 1.7361791050543465, "language_loss": 0.88275105, "learning_rate": 3.6768176904015153e-06, "loss": 0.90526199, "num_input_tokens_seen": 36762510, "step": 1730, "time_per_iteration": 2.689627170562744 }, { "auxiliary_loss_clip": 0.01219532, "auxiliary_loss_mlp": 0.01028612, "balance_loss_clip": 1.02292275, "balance_loss_mlp": 1.01812184, "epoch": 0.20814044369626647, "flos": 23072677781760.0, "grad_norm": 2.684338937220168, "language_loss": 0.59672153, "learning_rate": 3.6763929910856674e-06, "loss": 0.61920297, "num_input_tokens_seen": 36780960, "step": 1731, "time_per_iteration": 2.6710426807403564 }, { "auxiliary_loss_clip": 0.01215656, "auxiliary_loss_mlp": 0.01043875, "balance_loss_clip": 1.02311456, "balance_loss_mlp": 1.03344452, "epoch": 0.20826068658690555, "flos": 19608016556160.0, "grad_norm": 2.437216230986112, "language_loss": 0.77754164, "learning_rate": 3.6759680374623365e-06, "loss": 0.80013704, "num_input_tokens_seen": 36798875, "step": 1732, "time_per_iteration": 2.6788017749786377 }, { "auxiliary_loss_clip": 0.01220412, "auxiliary_loss_mlp": 0.01037547, "balance_loss_clip": 1.06382048, "balance_loss_mlp": 1.02745581, "epoch": 0.20838092947754464, "flos": 25374049142400.0, "grad_norm": 3.5360146572609614, "language_loss": 0.75700438, "learning_rate": 3.675542829595986e-06, "loss": 0.77958405, "num_input_tokens_seen": 36818540, "step": 1733, "time_per_iteration": 3.6155343055725098 }, { "auxiliary_loss_clip": 0.01215719, "auxiliary_loss_mlp": 0.01032671, "balance_loss_clip": 0.98447847, "balance_loss_mlp": 1.02226996, "epoch": 0.20850117236818372, "flos": 24061729749120.0, "grad_norm": 1.506065536552, "language_loss": 0.7942425, "learning_rate": 3.6751173675511213e-06, "loss": 0.81672645, "num_input_tokens_seen": 36840585, "step": 1734, "time_per_iteration": 2.726616621017456 }, { "auxiliary_loss_clip": 0.01209788, "auxiliary_loss_mlp": 0.01032288, "balance_loss_clip": 0.9802599, "balance_loss_mlp": 1.02201831, "epoch": 0.20862141525882283, "flos": 20077799558400.0, "grad_norm": 2.6150202429330225, "language_loss": 0.87398827, "learning_rate": 3.674691651392283e-06, "loss": 0.89640903, "num_input_tokens_seen": 36858255, "step": 1735, "time_per_iteration": 3.7211172580718994 }, { "auxiliary_loss_clip": 0.01222052, "auxiliary_loss_mlp": 0.01043157, "balance_loss_clip": 0.9875716, "balance_loss_mlp": 1.03218377, "epoch": 0.2087416581494619, "flos": 39015183237120.0, "grad_norm": 2.580875157370285, "language_loss": 0.75646049, "learning_rate": 3.674265681184053e-06, "loss": 0.77911258, "num_input_tokens_seen": 36881515, "step": 1736, "time_per_iteration": 2.916771650314331 }, { "auxiliary_loss_clip": 0.01212709, "auxiliary_loss_mlp": 0.0104289, "balance_loss_clip": 0.98203754, "balance_loss_mlp": 1.03263164, "epoch": 0.208861901040101, "flos": 26101994169600.0, "grad_norm": 1.8932952114300161, "language_loss": 0.86682427, "learning_rate": 3.6738394569910504e-06, "loss": 0.88938022, "num_input_tokens_seen": 36902055, "step": 1737, "time_per_iteration": 3.708341121673584 }, { "auxiliary_loss_clip": 0.01216393, "auxiliary_loss_mlp": 0.01032581, "balance_loss_clip": 1.02369428, "balance_loss_mlp": 1.02157784, "epoch": 0.2089821439307401, "flos": 28398732675840.0, "grad_norm": 2.283839580319867, "language_loss": 0.83095157, "learning_rate": 3.6734129788779333e-06, "loss": 0.85344136, "num_input_tokens_seen": 36921230, "step": 1738, "time_per_iteration": 2.6902270317077637 }, { "auxiliary_loss_clip": 0.0121321, "auxiliary_loss_mlp": 0.01033629, "balance_loss_clip": 0.94885749, "balance_loss_mlp": 1.02368701, "epoch": 0.2091023868213792, "flos": 21069616872960.0, "grad_norm": 1.7959316484216374, "language_loss": 0.90299731, "learning_rate": 3.6729862469093976e-06, "loss": 0.9254657, "num_input_tokens_seen": 36940325, "step": 1739, "time_per_iteration": 3.5905277729034424 }, { "auxiliary_loss_clip": 0.01202757, "auxiliary_loss_mlp": 0.01039222, "balance_loss_clip": 0.98337841, "balance_loss_mlp": 1.02883899, "epoch": 0.20922262971201827, "flos": 22455481363200.0, "grad_norm": 2.0836258677796122, "language_loss": 0.83192849, "learning_rate": 3.6725592611501782e-06, "loss": 0.8543483, "num_input_tokens_seen": 36959000, "step": 1740, "time_per_iteration": 2.6656267642974854 }, { "auxiliary_loss_clip": 0.01215283, "auxiliary_loss_mlp": 0.01036885, "balance_loss_clip": 1.02175331, "balance_loss_mlp": 1.0263828, "epoch": 0.20934287260265738, "flos": 27852244179840.0, "grad_norm": 1.8304972457346251, "language_loss": 0.76477015, "learning_rate": 3.6721320216650496e-06, "loss": 0.78729188, "num_input_tokens_seen": 36979615, "step": 1741, "time_per_iteration": 2.76265811920166 }, { "auxiliary_loss_clip": 0.01216917, "auxiliary_loss_mlp": 0.01043758, "balance_loss_clip": 0.98664343, "balance_loss_mlp": 1.03384018, "epoch": 0.20946311549329646, "flos": 16435309075200.0, "grad_norm": 1.7971765752449493, "language_loss": 0.83772981, "learning_rate": 3.6717045285188215e-06, "loss": 0.86033654, "num_input_tokens_seen": 36997310, "step": 1742, "time_per_iteration": 2.645681619644165 }, { "auxiliary_loss_clip": 0.01195672, "auxiliary_loss_mlp": 0.01036914, "balance_loss_clip": 0.90197575, "balance_loss_mlp": 1.02651286, "epoch": 0.20958335838393555, "flos": 22492720788480.0, "grad_norm": 2.1378982338519, "language_loss": 0.86988252, "learning_rate": 3.671276781776346e-06, "loss": 0.89220834, "num_input_tokens_seen": 37015965, "step": 1743, "time_per_iteration": 2.762312412261963 }, { "auxiliary_loss_clip": 0.01223584, "auxiliary_loss_mlp": 0.01032961, "balance_loss_clip": 0.94595569, "balance_loss_mlp": 1.02296567, "epoch": 0.20970360127457463, "flos": 25224768218880.0, "grad_norm": 2.076357527963944, "language_loss": 0.67137867, "learning_rate": 3.6708487815025128e-06, "loss": 0.6939441, "num_input_tokens_seen": 37036545, "step": 1744, "time_per_iteration": 2.7071821689605713 }, { "auxiliary_loss_clip": 0.0121605, "auxiliary_loss_mlp": 0.01038611, "balance_loss_clip": 0.94706374, "balance_loss_mlp": 1.02877057, "epoch": 0.20982384416521374, "flos": 18479164855680.0, "grad_norm": 2.241674494770584, "language_loss": 0.74211627, "learning_rate": 3.6704205277622463e-06, "loss": 0.76466286, "num_input_tokens_seen": 37054985, "step": 1745, "time_per_iteration": 2.7310993671417236 }, { "auxiliary_loss_clip": 0.01219069, "auxiliary_loss_mlp": 0.01034131, "balance_loss_clip": 0.98447931, "balance_loss_mlp": 1.02415895, "epoch": 0.20994408705585282, "flos": 25373546352000.0, "grad_norm": 1.7697825136269534, "language_loss": 0.80276322, "learning_rate": 3.6699920206205146e-06, "loss": 0.82529521, "num_input_tokens_seen": 37075725, "step": 1746, "time_per_iteration": 2.7204065322875977 }, { "auxiliary_loss_clip": 0.01217041, "auxiliary_loss_mlp": 0.01034801, "balance_loss_clip": 1.02159679, "balance_loss_mlp": 1.02488303, "epoch": 0.2100643299464919, "flos": 21320955313920.0, "grad_norm": 1.8766686171102689, "language_loss": 0.81801414, "learning_rate": 3.669563260142321e-06, "loss": 0.84053254, "num_input_tokens_seen": 37094615, "step": 1747, "time_per_iteration": 2.792598009109497 }, { "auxiliary_loss_clip": 0.01211028, "auxiliary_loss_mlp": 0.01040087, "balance_loss_clip": 0.98497558, "balance_loss_mlp": 1.02928114, "epoch": 0.21018457283713102, "flos": 19354379644800.0, "grad_norm": 1.9915692572713246, "language_loss": 0.83856875, "learning_rate": 3.6691342463927083e-06, "loss": 0.86107993, "num_input_tokens_seen": 37113610, "step": 1748, "time_per_iteration": 2.857550621032715 }, { "auxiliary_loss_clip": 0.01217052, "auxiliary_loss_mlp": 0.01034782, "balance_loss_clip": 0.9448489, "balance_loss_mlp": 1.02425003, "epoch": 0.2103048157277701, "flos": 28330035914880.0, "grad_norm": 1.616015568077329, "language_loss": 0.81923985, "learning_rate": 3.668704979436758e-06, "loss": 0.84175819, "num_input_tokens_seen": 37133705, "step": 1749, "time_per_iteration": 2.8295865058898926 }, { "auxiliary_loss_clip": 0.01205012, "auxiliary_loss_mlp": 0.01033282, "balance_loss_clip": 0.98114806, "balance_loss_mlp": 1.02260661, "epoch": 0.21042505861840918, "flos": 17457290835840.0, "grad_norm": 2.12770788048909, "language_loss": 0.78659773, "learning_rate": 3.668275459339588e-06, "loss": 0.80898064, "num_input_tokens_seen": 37152185, "step": 1750, "time_per_iteration": 2.703627824783325 }, { "auxiliary_loss_clip": 0.01220916, "auxiliary_loss_mlp": 0.01032425, "balance_loss_clip": 1.06383514, "balance_loss_mlp": 1.02205384, "epoch": 0.21054530150904827, "flos": 14209817195520.0, "grad_norm": 1.7876633419954093, "language_loss": 0.80022156, "learning_rate": 3.667845686166358e-06, "loss": 0.82275498, "num_input_tokens_seen": 37169110, "step": 1751, "time_per_iteration": 2.554079532623291 }, { "auxiliary_loss_clip": 0.01194129, "auxiliary_loss_mlp": 0.01033358, "balance_loss_clip": 0.94388276, "balance_loss_mlp": 1.02248597, "epoch": 0.21066554439968738, "flos": 18618210403200.0, "grad_norm": 1.7312606997796445, "language_loss": 0.85772324, "learning_rate": 3.6674156599822634e-06, "loss": 0.87999803, "num_input_tokens_seen": 37184905, "step": 1752, "time_per_iteration": 2.6896159648895264 }, { "auxiliary_loss_clip": 0.01214462, "auxiliary_loss_mlp": 0.01039395, "balance_loss_clip": 0.90491831, "balance_loss_mlp": 1.02913094, "epoch": 0.21078578729032646, "flos": 23658883741440.0, "grad_norm": 1.7566830787994114, "language_loss": 0.81691325, "learning_rate": 3.666985380852539e-06, "loss": 0.83945179, "num_input_tokens_seen": 37203910, "step": 1753, "time_per_iteration": 2.7446889877319336 }, { "auxiliary_loss_clip": 0.01215951, "auxiliary_loss_mlp": 0.01044761, "balance_loss_clip": 0.9875682, "balance_loss_mlp": 1.03467035, "epoch": 0.21090603018096554, "flos": 29346379240320.0, "grad_norm": 2.9144388325848793, "language_loss": 0.74446511, "learning_rate": 3.6665548488424576e-06, "loss": 0.7670722, "num_input_tokens_seen": 37222670, "step": 1754, "time_per_iteration": 2.767942190170288 }, { "auxiliary_loss_clip": 0.01220877, "auxiliary_loss_mlp": 0.01035748, "balance_loss_clip": 1.06330729, "balance_loss_mlp": 1.02492356, "epoch": 0.21102627307160465, "flos": 23261245205760.0, "grad_norm": 2.740887472164384, "language_loss": 0.8798281, "learning_rate": 3.6661240640173307e-06, "loss": 0.90239429, "num_input_tokens_seen": 37244140, "step": 1755, "time_per_iteration": 2.6476240158081055 }, { "auxiliary_loss_clip": 0.01126732, "auxiliary_loss_mlp": 0.01003642, "balance_loss_clip": 0.92315161, "balance_loss_mlp": 0.99944633, "epoch": 0.21114651596224374, "flos": 54633454577280.0, "grad_norm": 0.8575056288435428, "language_loss": 0.57883912, "learning_rate": 3.6656930264425085e-06, "loss": 0.60014284, "num_input_tokens_seen": 37308185, "step": 1756, "time_per_iteration": 3.325950860977173 }, { "auxiliary_loss_clip": 0.01223939, "auxiliary_loss_mlp": 0.01039524, "balance_loss_clip": 1.06476557, "balance_loss_mlp": 1.02896178, "epoch": 0.21126675885288282, "flos": 21543314457600.0, "grad_norm": 2.0614151082483287, "language_loss": 0.75566101, "learning_rate": 3.665261736183378e-06, "loss": 0.77829564, "num_input_tokens_seen": 37328220, "step": 1757, "time_per_iteration": 2.583489179611206 }, { "auxiliary_loss_clip": 0.01215928, "auxiliary_loss_mlp": 0.01033474, "balance_loss_clip": 0.94800276, "balance_loss_mlp": 1.02336478, "epoch": 0.2113870017435219, "flos": 10961876678400.0, "grad_norm": 2.3349925565186824, "language_loss": 0.88716733, "learning_rate": 3.664830193305366e-06, "loss": 0.90966129, "num_input_tokens_seen": 37345995, "step": 1758, "time_per_iteration": 2.6703298091888428 }, { "auxiliary_loss_clip": 0.01203402, "auxiliary_loss_mlp": 0.01035879, "balance_loss_clip": 0.9414742, "balance_loss_mlp": 1.02512622, "epoch": 0.211507244634161, "flos": 16653825463680.0, "grad_norm": 2.2696062419383636, "language_loss": 0.77017003, "learning_rate": 3.6643983978739373e-06, "loss": 0.79256278, "num_input_tokens_seen": 37362610, "step": 1759, "time_per_iteration": 3.685487747192383 }, { "auxiliary_loss_clip": 0.01209064, "auxiliary_loss_mlp": 0.01043429, "balance_loss_clip": 0.9871707, "balance_loss_mlp": 1.03187776, "epoch": 0.2116274875248001, "flos": 20954091755520.0, "grad_norm": 1.920757860743488, "language_loss": 0.82308829, "learning_rate": 3.663966349954596e-06, "loss": 0.84561324, "num_input_tokens_seen": 37382790, "step": 1760, "time_per_iteration": 2.715961217880249 }, { "auxiliary_loss_clip": 0.0112224, "auxiliary_loss_mlp": 0.01004225, "balance_loss_clip": 0.99651366, "balance_loss_mlp": 0.99988598, "epoch": 0.21174773041543918, "flos": 68196949424640.0, "grad_norm": 0.7841941559635668, "language_loss": 0.59707737, "learning_rate": 3.6635340496128816e-06, "loss": 0.61834204, "num_input_tokens_seen": 37439720, "step": 1761, "time_per_iteration": 4.138270378112793 }, { "auxiliary_loss_clip": 0.01208965, "auxiliary_loss_mlp": 0.01045429, "balance_loss_clip": 0.90775371, "balance_loss_mlp": 1.03504014, "epoch": 0.2118679733060783, "flos": 20668315150080.0, "grad_norm": 1.7359689869178898, "language_loss": 0.92991984, "learning_rate": 3.6631014969143747e-06, "loss": 0.95246375, "num_input_tokens_seen": 37459410, "step": 1762, "time_per_iteration": 2.793802261352539 }, { "auxiliary_loss_clip": 0.01218838, "auxiliary_loss_mlp": 0.01038201, "balance_loss_clip": 1.02640319, "balance_loss_mlp": 1.02717996, "epoch": 0.21198821619671737, "flos": 23223431162880.0, "grad_norm": 1.8365952653509392, "language_loss": 0.89084393, "learning_rate": 3.662668691924693e-06, "loss": 0.91341436, "num_input_tokens_seen": 37480460, "step": 1763, "time_per_iteration": 3.6719398498535156 }, { "auxiliary_loss_clip": 0.01209979, "auxiliary_loss_mlp": 0.01034886, "balance_loss_clip": 0.94385624, "balance_loss_mlp": 1.02404428, "epoch": 0.21210845908735645, "flos": 24498547044480.0, "grad_norm": 2.3578449850996814, "language_loss": 0.71683061, "learning_rate": 3.6622356347094927e-06, "loss": 0.73927927, "num_input_tokens_seen": 37502025, "step": 1764, "time_per_iteration": 2.7378592491149902 }, { "auxiliary_loss_clip": 0.01210249, "auxiliary_loss_mlp": 0.01040901, "balance_loss_clip": 0.94257319, "balance_loss_mlp": 1.02962422, "epoch": 0.21222870197799554, "flos": 27089789160960.0, "grad_norm": 1.8695074151172653, "language_loss": 0.78600031, "learning_rate": 3.6618023253344684e-06, "loss": 0.80851185, "num_input_tokens_seen": 37520885, "step": 1765, "time_per_iteration": 3.6265389919281006 }, { "auxiliary_loss_clip": 0.01214828, "auxiliary_loss_mlp": 0.01039563, "balance_loss_clip": 1.02212262, "balance_loss_mlp": 1.02876914, "epoch": 0.21234894486863465, "flos": 16873850223360.0, "grad_norm": 1.7152736133656348, "language_loss": 0.8331641, "learning_rate": 3.6613687638653527e-06, "loss": 0.855708, "num_input_tokens_seen": 37539055, "step": 1766, "time_per_iteration": 2.5819902420043945 }, { "auxiliary_loss_clip": 0.01215798, "auxiliary_loss_mlp": 0.01040303, "balance_loss_clip": 0.98839819, "balance_loss_mlp": 1.029199, "epoch": 0.21246918775927373, "flos": 23474949171840.0, "grad_norm": 1.867230006567535, "language_loss": 0.78039032, "learning_rate": 3.660934950367916e-06, "loss": 0.80295134, "num_input_tokens_seen": 37558300, "step": 1767, "time_per_iteration": 2.726759433746338 }, { "auxiliary_loss_clip": 0.01216782, "auxiliary_loss_mlp": 0.01037157, "balance_loss_clip": 1.02138937, "balance_loss_mlp": 1.02629685, "epoch": 0.21258943064991281, "flos": 22382295402240.0, "grad_norm": 1.9393776182648665, "language_loss": 0.83456373, "learning_rate": 3.660500884907968e-06, "loss": 0.85710311, "num_input_tokens_seen": 37579040, "step": 1768, "time_per_iteration": 2.670869827270508 }, { "auxiliary_loss_clip": 0.01108749, "auxiliary_loss_mlp": 0.01003646, "balance_loss_clip": 0.91597414, "balance_loss_mlp": 0.99890196, "epoch": 0.21270967354055192, "flos": 59440168679040.0, "grad_norm": 0.8198388572399478, "language_loss": 0.6000222, "learning_rate": 3.660066567551356e-06, "loss": 0.62114614, "num_input_tokens_seen": 37639185, "step": 1769, "time_per_iteration": 3.252830743789673 }, { "auxiliary_loss_clip": 0.01215862, "auxiliary_loss_mlp": 0.01127528, "balance_loss_clip": 1.02207398, "balance_loss_mlp": 0.0, "epoch": 0.212829916431191, "flos": 21544032729600.0, "grad_norm": 2.3953391188216404, "language_loss": 0.84292364, "learning_rate": 3.6596319983639657e-06, "loss": 0.86635756, "num_input_tokens_seen": 37657765, "step": 1770, "time_per_iteration": 2.664425849914551 }, { "auxiliary_loss_clip": 0.01214684, "auxiliary_loss_mlp": 0.01127888, "balance_loss_clip": 0.94610995, "balance_loss_mlp": 0.0, "epoch": 0.2129501593218301, "flos": 28987739896320.0, "grad_norm": 1.6183482129088755, "language_loss": 0.86178291, "learning_rate": 3.6591971774117214e-06, "loss": 0.88520867, "num_input_tokens_seen": 37680740, "step": 1771, "time_per_iteration": 2.7731430530548096 }, { "auxiliary_loss_clip": 0.01221616, "auxiliary_loss_mlp": 0.01042272, "balance_loss_clip": 1.02385759, "balance_loss_mlp": 1.0318768, "epoch": 0.2130704022124692, "flos": 18806993308800.0, "grad_norm": 1.9402109928189102, "language_loss": 0.8035779, "learning_rate": 3.6587621047605833e-06, "loss": 0.82621682, "num_input_tokens_seen": 37697910, "step": 1772, "time_per_iteration": 2.6803019046783447 }, { "auxiliary_loss_clip": 0.01215776, "auxiliary_loss_mlp": 0.01045, "balance_loss_clip": 1.02361679, "balance_loss_mlp": 1.03427148, "epoch": 0.21319064510310828, "flos": 13918150759680.0, "grad_norm": 1.9837090592368354, "language_loss": 0.86774218, "learning_rate": 3.6583267804765542e-06, "loss": 0.89034998, "num_input_tokens_seen": 37712245, "step": 1773, "time_per_iteration": 2.6157443523406982 }, { "auxiliary_loss_clip": 0.01217486, "auxiliary_loss_mlp": 0.01038874, "balance_loss_clip": 1.02397907, "balance_loss_mlp": 1.02830005, "epoch": 0.21331088799374737, "flos": 20959694277120.0, "grad_norm": 2.014366255666369, "language_loss": 0.85761118, "learning_rate": 3.6578912046256702e-06, "loss": 0.88017476, "num_input_tokens_seen": 37730765, "step": 1774, "time_per_iteration": 2.6619598865509033 }, { "auxiliary_loss_clip": 0.01210833, "auxiliary_loss_mlp": 0.01033168, "balance_loss_clip": 0.94450504, "balance_loss_mlp": 1.0214678, "epoch": 0.21343113088438645, "flos": 18624638937600.0, "grad_norm": 2.20417440627985, "language_loss": 0.7606976, "learning_rate": 3.6574553772740083e-06, "loss": 0.78313762, "num_input_tokens_seen": 37748695, "step": 1775, "time_per_iteration": 2.8959524631500244 }, { "auxiliary_loss_clip": 0.01114214, "auxiliary_loss_mlp": 0.01008182, "balance_loss_clip": 0.99537158, "balance_loss_mlp": 1.00348496, "epoch": 0.21355137377502556, "flos": 67413128791680.0, "grad_norm": 0.8645638114033127, "language_loss": 0.61892998, "learning_rate": 3.657019298487684e-06, "loss": 0.64015394, "num_input_tokens_seen": 37813705, "step": 1776, "time_per_iteration": 3.3428053855895996 }, { "auxiliary_loss_clip": 0.0121996, "auxiliary_loss_mlp": 0.0112796, "balance_loss_clip": 1.01959276, "balance_loss_mlp": 0.0, "epoch": 0.21367161666566464, "flos": 34532095697280.0, "grad_norm": 1.6753318693751087, "language_loss": 0.84099972, "learning_rate": 3.6565829683328495e-06, "loss": 0.86447895, "num_input_tokens_seen": 37836330, "step": 1777, "time_per_iteration": 2.7322332859039307 }, { "auxiliary_loss_clip": 0.01212135, "auxiliary_loss_mlp": 0.01042561, "balance_loss_clip": 1.02271199, "balance_loss_mlp": 1.03181458, "epoch": 0.21379185955630373, "flos": 18989347680000.0, "grad_norm": 1.8245824850705508, "language_loss": 0.85945368, "learning_rate": 3.6561463868756965e-06, "loss": 0.88200068, "num_input_tokens_seen": 37855030, "step": 1778, "time_per_iteration": 2.659755229949951 }, { "auxiliary_loss_clip": 0.01214667, "auxiliary_loss_mlp": 0.01044362, "balance_loss_clip": 1.02252054, "balance_loss_mlp": 1.03396702, "epoch": 0.21391210244694284, "flos": 28218497207040.0, "grad_norm": 5.716883094099848, "language_loss": 0.78324616, "learning_rate": 3.655709554182452e-06, "loss": 0.80583644, "num_input_tokens_seen": 37875370, "step": 1779, "time_per_iteration": 2.690824508666992 }, { "auxiliary_loss_clip": 0.01218381, "auxiliary_loss_mlp": 0.01038671, "balance_loss_clip": 1.02201414, "balance_loss_mlp": 1.02732253, "epoch": 0.21403234533758192, "flos": 17455064192640.0, "grad_norm": 1.8349322736684182, "language_loss": 0.84871757, "learning_rate": 3.6552724703193855e-06, "loss": 0.87128806, "num_input_tokens_seen": 37892560, "step": 1780, "time_per_iteration": 2.668069839477539 }, { "auxiliary_loss_clip": 0.01114123, "auxiliary_loss_mlp": 0.01002356, "balance_loss_clip": 0.87480783, "balance_loss_mlp": 0.99761105, "epoch": 0.214152588228221, "flos": 51637606686720.0, "grad_norm": 0.7897502394467226, "language_loss": 0.55971444, "learning_rate": 3.654835135352801e-06, "loss": 0.58087921, "num_input_tokens_seen": 37947370, "step": 1781, "time_per_iteration": 3.1620352268218994 }, { "auxiliary_loss_clip": 0.01206677, "auxiliary_loss_mlp": 0.01034283, "balance_loss_clip": 0.90052986, "balance_loss_mlp": 1.02332759, "epoch": 0.21427283111886009, "flos": 19496154625920.0, "grad_norm": 1.9501898930352235, "language_loss": 0.87422323, "learning_rate": 3.654397549349043e-06, "loss": 0.89663285, "num_input_tokens_seen": 37964745, "step": 1782, "time_per_iteration": 2.7941298484802246 }, { "auxiliary_loss_clip": 0.01215748, "auxiliary_loss_mlp": 0.01040564, "balance_loss_clip": 0.98665047, "balance_loss_mlp": 1.0300442, "epoch": 0.2143930740094992, "flos": 20084802710400.0, "grad_norm": 2.006444244233659, "language_loss": 0.75136018, "learning_rate": 3.653959712374491e-06, "loss": 0.77392328, "num_input_tokens_seen": 37982850, "step": 1783, "time_per_iteration": 2.7061219215393066 }, { "auxiliary_loss_clip": 0.01208785, "auxiliary_loss_mlp": 0.01037528, "balance_loss_clip": 0.94811642, "balance_loss_mlp": 1.02670991, "epoch": 0.21451331690013828, "flos": 21798603394560.0, "grad_norm": 1.613065271549423, "language_loss": 0.8313309, "learning_rate": 3.6535216244955663e-06, "loss": 0.85379404, "num_input_tokens_seen": 38002745, "step": 1784, "time_per_iteration": 2.6821341514587402 }, { "auxiliary_loss_clip": 0.01211966, "auxiliary_loss_mlp": 0.01038694, "balance_loss_clip": 0.98195159, "balance_loss_mlp": 1.02763772, "epoch": 0.21463355979077736, "flos": 32853882412800.0, "grad_norm": 1.5931348297361272, "language_loss": 0.70947999, "learning_rate": 3.653083285778726e-06, "loss": 0.73198664, "num_input_tokens_seen": 38024115, "step": 1785, "time_per_iteration": 3.6574411392211914 }, { "auxiliary_loss_clip": 0.01220871, "auxiliary_loss_mlp": 0.01038825, "balance_loss_clip": 1.02284098, "balance_loss_mlp": 1.02835286, "epoch": 0.21475380268141647, "flos": 21543817248000.0, "grad_norm": 3.7071957677873932, "language_loss": 0.81143105, "learning_rate": 3.6526446962904653e-06, "loss": 0.83402801, "num_input_tokens_seen": 38042830, "step": 1786, "time_per_iteration": 2.5596776008605957 }, { "auxiliary_loss_clip": 0.01212035, "auxiliary_loss_mlp": 0.01039094, "balance_loss_clip": 1.02282631, "balance_loss_mlp": 1.02924156, "epoch": 0.21487404557205556, "flos": 32159082660480.0, "grad_norm": 1.4465547960145804, "language_loss": 0.74602175, "learning_rate": 3.652205856097318e-06, "loss": 0.76853299, "num_input_tokens_seen": 38066015, "step": 1787, "time_per_iteration": 3.4823360443115234 }, { "auxiliary_loss_clip": 0.01221345, "auxiliary_loss_mlp": 0.01127788, "balance_loss_clip": 0.94429624, "balance_loss_mlp": 0.0, "epoch": 0.21499428846269464, "flos": 12673091583360.0, "grad_norm": 2.180111665533421, "language_loss": 0.79047871, "learning_rate": 3.651766765265856e-06, "loss": 0.81397009, "num_input_tokens_seen": 38083025, "step": 1788, "time_per_iteration": 3.584535837173462 }, { "auxiliary_loss_clip": 0.01206338, "auxiliary_loss_mlp": 0.010419, "balance_loss_clip": 0.98033381, "balance_loss_mlp": 1.03098631, "epoch": 0.21511453135333372, "flos": 23471573293440.0, "grad_norm": 4.983087098107026, "language_loss": 0.81446946, "learning_rate": 3.65132742386269e-06, "loss": 0.83695185, "num_input_tokens_seen": 38098245, "step": 1789, "time_per_iteration": 2.672992467880249 }, { "auxiliary_loss_clip": 0.01218615, "auxiliary_loss_mlp": 0.01038026, "balance_loss_clip": 1.06135416, "balance_loss_mlp": 1.02695751, "epoch": 0.21523477424397283, "flos": 26943560893440.0, "grad_norm": 1.674919746154901, "language_loss": 0.84715176, "learning_rate": 3.6508878319544656e-06, "loss": 0.86971825, "num_input_tokens_seen": 38118460, "step": 1790, "time_per_iteration": 3.5059814453125 }, { "auxiliary_loss_clip": 0.01204806, "auxiliary_loss_mlp": 0.01050748, "balance_loss_clip": 0.98420906, "balance_loss_mlp": 1.04025769, "epoch": 0.21535501713461191, "flos": 18916161719040.0, "grad_norm": 2.591009708153044, "language_loss": 0.81432605, "learning_rate": 3.65044798960787e-06, "loss": 0.83688164, "num_input_tokens_seen": 38136800, "step": 1791, "time_per_iteration": 2.679051399230957 }, { "auxiliary_loss_clip": 0.01202, "auxiliary_loss_mlp": 0.01030718, "balance_loss_clip": 0.94038451, "balance_loss_mlp": 1.02093053, "epoch": 0.215475260025251, "flos": 17895113712000.0, "grad_norm": 2.5825672287215418, "language_loss": 0.78259039, "learning_rate": 3.650007896889627e-06, "loss": 0.80491757, "num_input_tokens_seen": 38155380, "step": 1792, "time_per_iteration": 2.644334554672241 }, { "auxiliary_loss_clip": 0.01215775, "auxiliary_loss_mlp": 0.01042407, "balance_loss_clip": 1.06252038, "balance_loss_mlp": 1.03150511, "epoch": 0.2155955029158901, "flos": 16654292340480.0, "grad_norm": 1.7522583562120535, "language_loss": 0.80677491, "learning_rate": 3.6495675538664974e-06, "loss": 0.82935673, "num_input_tokens_seen": 38174395, "step": 1793, "time_per_iteration": 2.5741357803344727 }, { "auxiliary_loss_clip": 0.0121112, "auxiliary_loss_mlp": 0.01037994, "balance_loss_clip": 0.97982466, "balance_loss_mlp": 1.02693689, "epoch": 0.2157157458065292, "flos": 23621213352960.0, "grad_norm": 1.6179592151941975, "language_loss": 0.82445461, "learning_rate": 3.649126960605282e-06, "loss": 0.84694576, "num_input_tokens_seen": 38195380, "step": 1794, "time_per_iteration": 2.6497552394866943 }, { "auxiliary_loss_clip": 0.01207307, "auxiliary_loss_mlp": 0.01036953, "balance_loss_clip": 0.98216021, "balance_loss_mlp": 1.02538383, "epoch": 0.21583598869716827, "flos": 22127078292480.0, "grad_norm": 2.4148338355289427, "language_loss": 0.83470529, "learning_rate": 3.6486861171728174e-06, "loss": 0.85714793, "num_input_tokens_seen": 38213775, "step": 1795, "time_per_iteration": 2.6372573375701904 }, { "auxiliary_loss_clip": 0.01210363, "auxiliary_loss_mlp": 0.01039088, "balance_loss_clip": 0.94100314, "balance_loss_mlp": 1.02834094, "epoch": 0.21595623158780738, "flos": 23441229279360.0, "grad_norm": 1.7611941574730323, "language_loss": 0.78441983, "learning_rate": 3.6482450236359803e-06, "loss": 0.80691439, "num_input_tokens_seen": 38235630, "step": 1796, "time_per_iteration": 2.6950745582580566 }, { "auxiliary_loss_clip": 0.01212467, "auxiliary_loss_mlp": 0.01035417, "balance_loss_clip": 1.02353001, "balance_loss_mlp": 1.02474821, "epoch": 0.21607647447844647, "flos": 26906501036160.0, "grad_norm": 2.599358833587349, "language_loss": 0.77452672, "learning_rate": 3.647803680061683e-06, "loss": 0.79700553, "num_input_tokens_seen": 38256045, "step": 1797, "time_per_iteration": 2.6922199726104736 }, { "auxiliary_loss_clip": 0.01213307, "auxiliary_loss_mlp": 0.01037036, "balance_loss_clip": 0.98173881, "balance_loss_mlp": 1.02644467, "epoch": 0.21619671736908555, "flos": 14495378319360.0, "grad_norm": 3.10302986803653, "language_loss": 0.74692345, "learning_rate": 3.6473620865168776e-06, "loss": 0.76942688, "num_input_tokens_seen": 38272915, "step": 1798, "time_per_iteration": 2.581761360168457 }, { "auxiliary_loss_clip": 0.01213009, "auxiliary_loss_mlp": 0.01042076, "balance_loss_clip": 0.98605847, "balance_loss_mlp": 1.03168702, "epoch": 0.21631696025972463, "flos": 17931096161280.0, "grad_norm": 1.8368940205450086, "language_loss": 0.81753242, "learning_rate": 3.646920243068554e-06, "loss": 0.8400833, "num_input_tokens_seen": 38290810, "step": 1799, "time_per_iteration": 2.6355533599853516 }, { "auxiliary_loss_clip": 0.01200517, "auxiliary_loss_mlp": 0.01034609, "balance_loss_clip": 0.98282987, "balance_loss_mlp": 1.02361202, "epoch": 0.21643720315036374, "flos": 24462385027200.0, "grad_norm": 1.9609975214893327, "language_loss": 0.74790013, "learning_rate": 3.6464781497837384e-06, "loss": 0.77025133, "num_input_tokens_seen": 38312785, "step": 1800, "time_per_iteration": 2.6948978900909424 }, { "auxiliary_loss_clip": 0.01214483, "auxiliary_loss_mlp": 0.01040449, "balance_loss_clip": 0.98159605, "balance_loss_mlp": 1.02989888, "epoch": 0.21655744604100283, "flos": 28474432588800.0, "grad_norm": 1.6060523499498933, "language_loss": 0.72778201, "learning_rate": 3.6460358067294965e-06, "loss": 0.75033134, "num_input_tokens_seen": 38334015, "step": 1801, "time_per_iteration": 2.776090621948242 }, { "auxiliary_loss_clip": 0.01218867, "auxiliary_loss_mlp": 0.01032716, "balance_loss_clip": 1.06010938, "balance_loss_mlp": 1.02212453, "epoch": 0.2166776889316419, "flos": 20152960767360.0, "grad_norm": 2.025058212413782, "language_loss": 0.77429909, "learning_rate": 3.645593213972932e-06, "loss": 0.79681492, "num_input_tokens_seen": 38352920, "step": 1802, "time_per_iteration": 2.596721887588501 }, { "auxiliary_loss_clip": 0.01206037, "auxiliary_loss_mlp": 0.01035294, "balance_loss_clip": 1.01989412, "balance_loss_mlp": 1.02504253, "epoch": 0.21679793182228102, "flos": 15193482122880.0, "grad_norm": 1.9771595742940276, "language_loss": 0.79421771, "learning_rate": 3.6451503715811852e-06, "loss": 0.81663108, "num_input_tokens_seen": 38371230, "step": 1803, "time_per_iteration": 2.7684051990509033 }, { "auxiliary_loss_clip": 0.0121257, "auxiliary_loss_mlp": 0.01037963, "balance_loss_clip": 0.98770714, "balance_loss_mlp": 1.02805114, "epoch": 0.2169181747129201, "flos": 17384464010880.0, "grad_norm": 1.850970348721938, "language_loss": 0.80491155, "learning_rate": 3.6447072796214345e-06, "loss": 0.8274169, "num_input_tokens_seen": 38389795, "step": 1804, "time_per_iteration": 2.8265130519866943 }, { "auxiliary_loss_clip": 0.01110425, "auxiliary_loss_mlp": 0.01014361, "balance_loss_clip": 0.87321192, "balance_loss_mlp": 1.00995016, "epoch": 0.21703841760355919, "flos": 58760955429120.0, "grad_norm": 0.9897887201401904, "language_loss": 0.63170695, "learning_rate": 3.644263938160898e-06, "loss": 0.65295482, "num_input_tokens_seen": 38445760, "step": 1805, "time_per_iteration": 3.2333977222442627 }, { "auxiliary_loss_clip": 0.01208284, "auxiliary_loss_mlp": 0.0104108, "balance_loss_clip": 0.94535172, "balance_loss_mlp": 1.02970755, "epoch": 0.21715866049419827, "flos": 22418457419520.0, "grad_norm": 1.7692185627993409, "language_loss": 0.71810102, "learning_rate": 3.6438203472668293e-06, "loss": 0.74059469, "num_input_tokens_seen": 38465405, "step": 1806, "time_per_iteration": 2.7263059616088867 }, { "auxiliary_loss_clip": 0.01217449, "auxiliary_loss_mlp": 0.01035898, "balance_loss_clip": 0.98527884, "balance_loss_mlp": 1.02637947, "epoch": 0.21727890338483738, "flos": 17237732952960.0, "grad_norm": 2.004563455283112, "language_loss": 0.8199116, "learning_rate": 3.6433765070065206e-06, "loss": 0.84244502, "num_input_tokens_seen": 38483195, "step": 1807, "time_per_iteration": 2.645073890686035 }, { "auxiliary_loss_clip": 0.01219491, "auxiliary_loss_mlp": 0.01037412, "balance_loss_clip": 1.06270444, "balance_loss_mlp": 1.02624798, "epoch": 0.21739914627547646, "flos": 13434792416640.0, "grad_norm": 2.403614806104032, "language_loss": 0.87382042, "learning_rate": 3.6429324174473025e-06, "loss": 0.89638942, "num_input_tokens_seen": 38496735, "step": 1808, "time_per_iteration": 2.579712390899658 }, { "auxiliary_loss_clip": 0.01217188, "auxiliary_loss_mlp": 0.01031389, "balance_loss_clip": 1.02064395, "balance_loss_mlp": 1.02152407, "epoch": 0.21751938916611555, "flos": 20959514709120.0, "grad_norm": 1.9319551735243758, "language_loss": 0.84892267, "learning_rate": 3.6424880786565425e-06, "loss": 0.8714084, "num_input_tokens_seen": 38512880, "step": 1809, "time_per_iteration": 2.62536358833313 }, { "auxiliary_loss_clip": 0.01210296, "auxiliary_loss_mlp": 0.01039204, "balance_loss_clip": 0.90914059, "balance_loss_mlp": 1.02755737, "epoch": 0.21763963205675466, "flos": 27599936071680.0, "grad_norm": 2.5298054254895437, "language_loss": 0.79875779, "learning_rate": 3.6420434907016482e-06, "loss": 0.82125282, "num_input_tokens_seen": 38532570, "step": 1810, "time_per_iteration": 2.7563912868499756 }, { "auxiliary_loss_clip": 0.012197, "auxiliary_loss_mlp": 0.01038725, "balance_loss_clip": 1.02627766, "balance_loss_mlp": 1.02883101, "epoch": 0.21775987494739374, "flos": 21430411032960.0, "grad_norm": 1.5785901465818328, "language_loss": 0.81208766, "learning_rate": 3.6415986536500606e-06, "loss": 0.83467191, "num_input_tokens_seen": 38550900, "step": 1811, "time_per_iteration": 3.6082568168640137 }, { "auxiliary_loss_clip": 0.01204539, "auxiliary_loss_mlp": 0.01039316, "balance_loss_clip": 0.91175848, "balance_loss_mlp": 1.02886093, "epoch": 0.21788011783803282, "flos": 18332972501760.0, "grad_norm": 2.29240647678699, "language_loss": 0.80743724, "learning_rate": 3.641153567569263e-06, "loss": 0.82987583, "num_input_tokens_seen": 38569215, "step": 1812, "time_per_iteration": 2.7337892055511475 }, { "auxiliary_loss_clip": 0.01208479, "auxiliary_loss_mlp": 0.01034756, "balance_loss_clip": 1.02065063, "balance_loss_mlp": 1.02496922, "epoch": 0.2180003607286719, "flos": 30262748037120.0, "grad_norm": 1.9456363745833425, "language_loss": 0.95454681, "learning_rate": 3.640708232526774e-06, "loss": 0.97697914, "num_input_tokens_seen": 38587870, "step": 1813, "time_per_iteration": 3.7466399669647217 }, { "auxiliary_loss_clip": 0.01202218, "auxiliary_loss_mlp": 0.01034521, "balance_loss_clip": 0.86112958, "balance_loss_mlp": 1.02434683, "epoch": 0.21812060361931102, "flos": 25480272637440.0, "grad_norm": 1.666990630028351, "language_loss": 0.78462601, "learning_rate": 3.6402626485901504e-06, "loss": 0.80699342, "num_input_tokens_seen": 38606965, "step": 1814, "time_per_iteration": 3.725151300430298 }, { "auxiliary_loss_clip": 0.01212907, "auxiliary_loss_mlp": 0.01045176, "balance_loss_clip": 1.02432024, "balance_loss_mlp": 1.03476918, "epoch": 0.2182408465099501, "flos": 21908166854400.0, "grad_norm": 2.0664132858801727, "language_loss": 0.78559315, "learning_rate": 3.639816815826988e-06, "loss": 0.80817389, "num_input_tokens_seen": 38626290, "step": 1815, "time_per_iteration": 2.6283891201019287 }, { "auxiliary_loss_clip": 0.01209405, "auxiliary_loss_mlp": 0.01040728, "balance_loss_clip": 0.98356628, "balance_loss_mlp": 1.03048754, "epoch": 0.21836108940058918, "flos": 23657339456640.0, "grad_norm": 2.2406909099919567, "language_loss": 0.77656394, "learning_rate": 3.6393707343049176e-06, "loss": 0.79906535, "num_input_tokens_seen": 38646620, "step": 1816, "time_per_iteration": 2.76967453956604 }, { "auxiliary_loss_clip": 0.01220122, "auxiliary_loss_mlp": 0.01033045, "balance_loss_clip": 1.02392411, "balance_loss_mlp": 1.02310252, "epoch": 0.2184813322912283, "flos": 24681009156480.0, "grad_norm": 2.3093858654493937, "language_loss": 0.73201311, "learning_rate": 3.6389244040916104e-06, "loss": 0.75454473, "num_input_tokens_seen": 38665695, "step": 1817, "time_per_iteration": 3.5535247325897217 }, { "auxiliary_loss_clip": 0.01201038, "auxiliary_loss_mlp": 0.0112764, "balance_loss_clip": 0.98125499, "balance_loss_mlp": 0.0, "epoch": 0.21860157518186737, "flos": 26574650259840.0, "grad_norm": 1.9475104203341809, "language_loss": 0.7955637, "learning_rate": 3.6384778252547747e-06, "loss": 0.81885046, "num_input_tokens_seen": 38681575, "step": 1818, "time_per_iteration": 2.7309932708740234 }, { "auxiliary_loss_clip": 0.01210706, "auxiliary_loss_mlp": 0.01127462, "balance_loss_clip": 0.98557258, "balance_loss_mlp": 0.0, "epoch": 0.21872181807250646, "flos": 20886292834560.0, "grad_norm": 2.3194530394864548, "language_loss": 0.77736521, "learning_rate": 3.638030997862155e-06, "loss": 0.80074686, "num_input_tokens_seen": 38700510, "step": 1819, "time_per_iteration": 2.6737451553344727 }, { "auxiliary_loss_clip": 0.01101061, "auxiliary_loss_mlp": 0.0100639, "balance_loss_clip": 0.94738829, "balance_loss_mlp": 1.00212193, "epoch": 0.21884206096314554, "flos": 61209452897280.0, "grad_norm": 0.7763851295987985, "language_loss": 0.59451348, "learning_rate": 3.6375839219815356e-06, "loss": 0.61558795, "num_input_tokens_seen": 38758310, "step": 1820, "time_per_iteration": 3.2345902919769287 }, { "auxiliary_loss_clip": 0.01217226, "auxiliary_loss_mlp": 0.0103752, "balance_loss_clip": 1.0610801, "balance_loss_mlp": 1.02741694, "epoch": 0.21896230385378465, "flos": 23473835850240.0, "grad_norm": 1.9281862496026132, "language_loss": 0.82737035, "learning_rate": 3.6371365976807375e-06, "loss": 0.84991789, "num_input_tokens_seen": 38778705, "step": 1821, "time_per_iteration": 2.6380598545074463 }, { "auxiliary_loss_clip": 0.01198476, "auxiliary_loss_mlp": 0.01036414, "balance_loss_clip": 0.90559697, "balance_loss_mlp": 1.02615643, "epoch": 0.21908254674442373, "flos": 25081915829760.0, "grad_norm": 1.8549241907633132, "language_loss": 0.8342635, "learning_rate": 3.6366890250276185e-06, "loss": 0.85661244, "num_input_tokens_seen": 38799660, "step": 1822, "time_per_iteration": 2.7462158203125 }, { "auxiliary_loss_clip": 0.01217386, "auxiliary_loss_mlp": 0.01038123, "balance_loss_clip": 1.06217384, "balance_loss_mlp": 1.02763867, "epoch": 0.21920278963506282, "flos": 23513768795520.0, "grad_norm": 2.059400334836705, "language_loss": 0.89818323, "learning_rate": 3.6362412040900764e-06, "loss": 0.92073834, "num_input_tokens_seen": 38819450, "step": 1823, "time_per_iteration": 2.664259195327759 }, { "auxiliary_loss_clip": 0.0121728, "auxiliary_loss_mlp": 0.01033671, "balance_loss_clip": 1.02145481, "balance_loss_mlp": 1.02340102, "epoch": 0.21932303252570193, "flos": 29242238734080.0, "grad_norm": 2.034291009408028, "language_loss": 0.80755448, "learning_rate": 3.635793134936044e-06, "loss": 0.83006406, "num_input_tokens_seen": 38840460, "step": 1824, "time_per_iteration": 2.663846969604492 }, { "auxiliary_loss_clip": 0.01213817, "auxiliary_loss_mlp": 0.01039265, "balance_loss_clip": 1.02216673, "balance_loss_mlp": 1.02912068, "epoch": 0.219443275416341, "flos": 20806857907200.0, "grad_norm": 1.6276119768515436, "language_loss": 0.73247528, "learning_rate": 3.635344817633494e-06, "loss": 0.75500607, "num_input_tokens_seen": 38859775, "step": 1825, "time_per_iteration": 2.664144515991211 }, { "auxiliary_loss_clip": 0.01210977, "auxiliary_loss_mlp": 0.01039067, "balance_loss_clip": 1.02109861, "balance_loss_mlp": 1.02913082, "epoch": 0.2195635183069801, "flos": 14501555458560.0, "grad_norm": 2.197761418158773, "language_loss": 0.75503671, "learning_rate": 3.634896252250436e-06, "loss": 0.77753711, "num_input_tokens_seen": 38876540, "step": 1826, "time_per_iteration": 2.599184274673462 }, { "auxiliary_loss_clip": 0.01219263, "auxiliary_loss_mlp": 0.010427, "balance_loss_clip": 1.06135607, "balance_loss_mlp": 1.03197098, "epoch": 0.2196837611976192, "flos": 24243473589120.0, "grad_norm": 2.2823022552300354, "language_loss": 0.82349694, "learning_rate": 3.6344474388549157e-06, "loss": 0.84611654, "num_input_tokens_seen": 38896195, "step": 1827, "time_per_iteration": 2.6023504734039307 }, { "auxiliary_loss_clip": 0.01215761, "auxiliary_loss_mlp": 0.01042002, "balance_loss_clip": 1.02356505, "balance_loss_mlp": 1.03143954, "epoch": 0.2198040040882583, "flos": 18074523168000.0, "grad_norm": 1.952364315215956, "language_loss": 0.79937774, "learning_rate": 3.6339983775150183e-06, "loss": 0.82195532, "num_input_tokens_seen": 38912755, "step": 1828, "time_per_iteration": 2.67764949798584 }, { "auxiliary_loss_clip": 0.01212974, "auxiliary_loss_mlp": 0.01033611, "balance_loss_clip": 1.02273989, "balance_loss_mlp": 1.0240798, "epoch": 0.21992424697889737, "flos": 17784185535360.0, "grad_norm": 2.2508993069988823, "language_loss": 0.84179997, "learning_rate": 3.6335490682988664e-06, "loss": 0.86426586, "num_input_tokens_seen": 38928365, "step": 1829, "time_per_iteration": 2.6375882625579834 }, { "auxiliary_loss_clip": 0.0119867, "auxiliary_loss_mlp": 0.01043792, "balance_loss_clip": 0.86685437, "balance_loss_mlp": 1.03259873, "epoch": 0.22004448986953645, "flos": 17638495971840.0, "grad_norm": 1.9482667420849256, "language_loss": 0.82896769, "learning_rate": 3.63309951127462e-06, "loss": 0.85139233, "num_input_tokens_seen": 38945275, "step": 1830, "time_per_iteration": 2.7066245079040527 }, { "auxiliary_loss_clip": 0.01211332, "auxiliary_loss_mlp": 0.0103897, "balance_loss_clip": 0.94761533, "balance_loss_mlp": 1.02864075, "epoch": 0.22016473276017556, "flos": 22275533203200.0, "grad_norm": 1.8837732589132605, "language_loss": 0.75169766, "learning_rate": 3.6326497065104757e-06, "loss": 0.77420068, "num_input_tokens_seen": 38965740, "step": 1831, "time_per_iteration": 2.7538487911224365 }, { "auxiliary_loss_clip": 0.01217279, "auxiliary_loss_mlp": 0.01047653, "balance_loss_clip": 1.02311635, "balance_loss_mlp": 1.03758013, "epoch": 0.22028497565081465, "flos": 25556259859200.0, "grad_norm": 1.945948600198489, "language_loss": 0.78088582, "learning_rate": 3.6321996540746697e-06, "loss": 0.8035351, "num_input_tokens_seen": 38984815, "step": 1832, "time_per_iteration": 2.7039451599121094 }, { "auxiliary_loss_clip": 0.01208956, "auxiliary_loss_mlp": 0.01039677, "balance_loss_clip": 0.94493586, "balance_loss_mlp": 1.02935362, "epoch": 0.22040521854145373, "flos": 36247332925440.0, "grad_norm": 1.7217540630995485, "language_loss": 0.80666232, "learning_rate": 3.6317493540354733e-06, "loss": 0.82914865, "num_input_tokens_seen": 39008230, "step": 1833, "time_per_iteration": 2.8332149982452393 }, { "auxiliary_loss_clip": 0.01205627, "auxiliary_loss_mlp": 0.01036448, "balance_loss_clip": 1.01927114, "balance_loss_mlp": 1.02572513, "epoch": 0.22052546143209284, "flos": 11838420270720.0, "grad_norm": 1.8493234616597611, "language_loss": 0.76860321, "learning_rate": 3.6312988064611976e-06, "loss": 0.79102397, "num_input_tokens_seen": 39026540, "step": 1834, "time_per_iteration": 2.656853199005127 }, { "auxiliary_loss_clip": 0.01208899, "auxiliary_loss_mlp": 0.01036695, "balance_loss_clip": 0.93949604, "balance_loss_mlp": 1.02666962, "epoch": 0.22064570432273192, "flos": 24209250906240.0, "grad_norm": 1.6890867257273277, "language_loss": 0.81296247, "learning_rate": 3.6308480114201896e-06, "loss": 0.83541846, "num_input_tokens_seen": 39048460, "step": 1835, "time_per_iteration": 2.767711877822876 }, { "auxiliary_loss_clip": 0.01218715, "auxiliary_loss_mlp": 0.01041569, "balance_loss_clip": 1.06401706, "balance_loss_mlp": 1.03110838, "epoch": 0.220765947213371, "flos": 17931347556480.0, "grad_norm": 1.8189641496204378, "language_loss": 0.76240432, "learning_rate": 3.630396968980835e-06, "loss": 0.78500712, "num_input_tokens_seen": 39066335, "step": 1836, "time_per_iteration": 3.5578157901763916 }, { "auxiliary_loss_clip": 0.01212128, "auxiliary_loss_mlp": 0.01042004, "balance_loss_clip": 0.98326302, "balance_loss_mlp": 1.0318538, "epoch": 0.2208861901040101, "flos": 26757040544640.0, "grad_norm": 2.831232890428992, "language_loss": 0.83589303, "learning_rate": 3.6299456792115575e-06, "loss": 0.85843444, "num_input_tokens_seen": 39087590, "step": 1837, "time_per_iteration": 2.7721405029296875 }, { "auxiliary_loss_clip": 0.01179491, "auxiliary_loss_mlp": 0.01041989, "balance_loss_clip": 0.82115376, "balance_loss_mlp": 1.03103924, "epoch": 0.2210064329946492, "flos": 17817977255040.0, "grad_norm": 1.8172465403117475, "language_loss": 0.80864716, "learning_rate": 3.629494142180815e-06, "loss": 0.83086193, "num_input_tokens_seen": 39106335, "step": 1838, "time_per_iteration": 2.8575026988983154 }, { "auxiliary_loss_clip": 0.01216499, "auxiliary_loss_mlp": 0.01037543, "balance_loss_clip": 1.06193662, "balance_loss_mlp": 1.0272727, "epoch": 0.22112667588528828, "flos": 17967401832960.0, "grad_norm": 2.5124383637260697, "language_loss": 0.85163391, "learning_rate": 3.6290423579571075e-06, "loss": 0.8741743, "num_input_tokens_seen": 39122875, "step": 1839, "time_per_iteration": 3.605942726135254 }, { "auxiliary_loss_clip": 0.01209219, "auxiliary_loss_mlp": 0.01043648, "balance_loss_clip": 1.02193022, "balance_loss_mlp": 1.03311634, "epoch": 0.22124691877592736, "flos": 18369206346240.0, "grad_norm": 1.7683885734943354, "language_loss": 0.8032465, "learning_rate": 3.6285903266089694e-06, "loss": 0.82577515, "num_input_tokens_seen": 39142150, "step": 1840, "time_per_iteration": 3.7141013145446777 }, { "auxiliary_loss_clip": 0.01213495, "auxiliary_loss_mlp": 0.01041631, "balance_loss_clip": 0.98493689, "balance_loss_mlp": 1.03168344, "epoch": 0.22136716166656648, "flos": 20813286441600.0, "grad_norm": 1.7599399503171453, "language_loss": 0.77491319, "learning_rate": 3.628138048204974e-06, "loss": 0.79746443, "num_input_tokens_seen": 39162835, "step": 1841, "time_per_iteration": 2.76835560798645 }, { "auxiliary_loss_clip": 0.01199839, "auxiliary_loss_mlp": 0.01035629, "balance_loss_clip": 0.90652144, "balance_loss_mlp": 1.02551425, "epoch": 0.22148740455720556, "flos": 17675699483520.0, "grad_norm": 1.763640338042746, "language_loss": 0.76257956, "learning_rate": 3.6276855228137304e-06, "loss": 0.78493428, "num_input_tokens_seen": 39181040, "step": 1842, "time_per_iteration": 2.7615461349487305 }, { "auxiliary_loss_clip": 0.01217172, "auxiliary_loss_mlp": 0.01128103, "balance_loss_clip": 1.06119382, "balance_loss_mlp": 0.0, "epoch": 0.22160764744784464, "flos": 21726710323200.0, "grad_norm": 2.0691856155511954, "language_loss": 0.81886888, "learning_rate": 3.6272327505038874e-06, "loss": 0.84232163, "num_input_tokens_seen": 39197505, "step": 1843, "time_per_iteration": 3.5144081115722656 }, { "auxiliary_loss_clip": 0.012115, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 0.90406448, "balance_loss_mlp": 1.02694225, "epoch": 0.22172789033848372, "flos": 23764712186880.0, "grad_norm": 1.8336112311166088, "language_loss": 0.783764, "learning_rate": 3.626779731344131e-06, "loss": 0.80624938, "num_input_tokens_seen": 39217295, "step": 1844, "time_per_iteration": 2.8388330936431885 }, { "auxiliary_loss_clip": 0.01212298, "auxiliary_loss_mlp": 0.01030304, "balance_loss_clip": 1.059484, "balance_loss_mlp": 1.02125573, "epoch": 0.22184813322912283, "flos": 16982300361600.0, "grad_norm": 1.9763987159070528, "language_loss": 0.85026228, "learning_rate": 3.6263264654031814e-06, "loss": 0.87268835, "num_input_tokens_seen": 39234195, "step": 1845, "time_per_iteration": 2.7265496253967285 }, { "auxiliary_loss_clip": 0.01109299, "auxiliary_loss_mlp": 0.01013196, "balance_loss_clip": 0.91263384, "balance_loss_mlp": 1.00885713, "epoch": 0.22196837611976192, "flos": 61823740314240.0, "grad_norm": 0.6998809269503627, "language_loss": 0.59202152, "learning_rate": 3.6258729527498008e-06, "loss": 0.6132465, "num_input_tokens_seen": 39295040, "step": 1846, "time_per_iteration": 3.30880069732666 }, { "auxiliary_loss_clip": 0.0121634, "auxiliary_loss_mlp": 0.01033741, "balance_loss_clip": 0.98395532, "balance_loss_mlp": 1.02353048, "epoch": 0.222088619010401, "flos": 25558019625600.0, "grad_norm": 2.2585590068907577, "language_loss": 0.64595646, "learning_rate": 3.6254191934527854e-06, "loss": 0.66845727, "num_input_tokens_seen": 39314395, "step": 1847, "time_per_iteration": 2.7554516792297363 }, { "auxiliary_loss_clip": 0.01209148, "auxiliary_loss_mlp": 0.01027789, "balance_loss_clip": 0.945907, "balance_loss_mlp": 1.01748335, "epoch": 0.2222088619010401, "flos": 19318612677120.0, "grad_norm": 1.7319219657268796, "language_loss": 0.6496588, "learning_rate": 3.6249651875809715e-06, "loss": 0.67202818, "num_input_tokens_seen": 39334275, "step": 1848, "time_per_iteration": 2.7513651847839355 }, { "auxiliary_loss_clip": 0.0120682, "auxiliary_loss_mlp": 0.01030498, "balance_loss_clip": 0.98448813, "balance_loss_mlp": 1.02115178, "epoch": 0.2223291047916792, "flos": 19099342103040.0, "grad_norm": 1.8578371014823223, "language_loss": 0.89549696, "learning_rate": 3.62451093520323e-06, "loss": 0.91787016, "num_input_tokens_seen": 39352180, "step": 1849, "time_per_iteration": 2.7715394496917725 }, { "auxiliary_loss_clip": 0.01198365, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 0.89989126, "balance_loss_mlp": 1.02451646, "epoch": 0.22244934768231828, "flos": 20850418126080.0, "grad_norm": 2.1156260495320334, "language_loss": 0.90252304, "learning_rate": 3.6240564363884714e-06, "loss": 0.92485148, "num_input_tokens_seen": 39372125, "step": 1850, "time_per_iteration": 2.806830883026123 }, { "auxiliary_loss_clip": 0.01215244, "auxiliary_loss_mlp": 0.01031492, "balance_loss_clip": 1.02007651, "balance_loss_mlp": 1.02138925, "epoch": 0.2225695905729574, "flos": 15632921111040.0, "grad_norm": 1.7210182726579653, "language_loss": 0.70477432, "learning_rate": 3.623601691205643e-06, "loss": 0.72724169, "num_input_tokens_seen": 39391200, "step": 1851, "time_per_iteration": 2.797985315322876 }, { "auxiliary_loss_clip": 0.01208599, "auxiliary_loss_mlp": 0.0103402, "balance_loss_clip": 1.01800346, "balance_loss_mlp": 1.02423918, "epoch": 0.22268983346359647, "flos": 25373582265600.0, "grad_norm": 2.08608173652339, "language_loss": 0.8140136, "learning_rate": 3.623146699723729e-06, "loss": 0.83643979, "num_input_tokens_seen": 39410660, "step": 1852, "time_per_iteration": 2.739868402481079 }, { "auxiliary_loss_clip": 0.01212209, "auxiliary_loss_mlp": 0.01039509, "balance_loss_clip": 0.98733306, "balance_loss_mlp": 1.02873898, "epoch": 0.22281007635423555, "flos": 13261452359040.0, "grad_norm": 1.5968036751042078, "language_loss": 0.77854759, "learning_rate": 3.6226914620117507e-06, "loss": 0.80106479, "num_input_tokens_seen": 39429280, "step": 1853, "time_per_iteration": 2.718656063079834 }, { "auxiliary_loss_clip": 0.01207486, "auxiliary_loss_mlp": 0.0103183, "balance_loss_clip": 0.93903673, "balance_loss_mlp": 1.02258492, "epoch": 0.22293031924487464, "flos": 15340536403200.0, "grad_norm": 1.9187332842224774, "language_loss": 0.80512542, "learning_rate": 3.622235978138768e-06, "loss": 0.82751858, "num_input_tokens_seen": 39446905, "step": 1854, "time_per_iteration": 2.755401849746704 }, { "auxiliary_loss_clip": 0.01212465, "auxiliary_loss_mlp": 0.01036312, "balance_loss_clip": 1.02298152, "balance_loss_mlp": 1.02679312, "epoch": 0.22305056213551375, "flos": 22564649773440.0, "grad_norm": 1.8954800358121255, "language_loss": 0.81292284, "learning_rate": 3.621780248173877e-06, "loss": 0.83541059, "num_input_tokens_seen": 39465105, "step": 1855, "time_per_iteration": 2.652951955795288 }, { "auxiliary_loss_clip": 0.01107736, "auxiliary_loss_mlp": 0.01004886, "balance_loss_clip": 0.98522222, "balance_loss_mlp": 1.00066555, "epoch": 0.22317080502615283, "flos": 64880419887360.0, "grad_norm": 0.8301963022591884, "language_loss": 0.61056173, "learning_rate": 3.6213242721862125e-06, "loss": 0.63168788, "num_input_tokens_seen": 39523560, "step": 1856, "time_per_iteration": 3.246316432952881 }, { "auxiliary_loss_clip": 0.01198009, "auxiliary_loss_mlp": 0.01034116, "balance_loss_clip": 0.98092687, "balance_loss_mlp": 1.02435303, "epoch": 0.2232910479167919, "flos": 25775997310080.0, "grad_norm": 1.4982272008611401, "language_loss": 0.75379902, "learning_rate": 3.620868050244945e-06, "loss": 0.77612031, "num_input_tokens_seen": 39544040, "step": 1857, "time_per_iteration": 2.6650164127349854 }, { "auxiliary_loss_clip": 0.01203268, "auxiliary_loss_mlp": 0.01032474, "balance_loss_clip": 0.97916126, "balance_loss_mlp": 1.02198374, "epoch": 0.22341129080743102, "flos": 23251799928960.0, "grad_norm": 1.8447589843287906, "language_loss": 0.77729905, "learning_rate": 3.6204115824192817e-06, "loss": 0.79965651, "num_input_tokens_seen": 39561515, "step": 1858, "time_per_iteration": 2.6877050399780273 }, { "auxiliary_loss_clip": 0.01197943, "auxiliary_loss_mlp": 0.01032165, "balance_loss_clip": 0.97790974, "balance_loss_mlp": 1.02118015, "epoch": 0.2235315336980701, "flos": 21214552250880.0, "grad_norm": 2.1297738151001457, "language_loss": 0.76831496, "learning_rate": 3.619954868778471e-06, "loss": 0.79061604, "num_input_tokens_seen": 39578210, "step": 1859, "time_per_iteration": 2.6728317737579346 }, { "auxiliary_loss_clip": 0.01206975, "auxiliary_loss_mlp": 0.01042507, "balance_loss_clip": 0.98125476, "balance_loss_mlp": 1.03221381, "epoch": 0.2236517765887092, "flos": 19901945548800.0, "grad_norm": 1.839314635052048, "language_loss": 0.82984686, "learning_rate": 3.6194979093917944e-06, "loss": 0.85234171, "num_input_tokens_seen": 39597625, "step": 1860, "time_per_iteration": 2.713763475418091 }, { "auxiliary_loss_clip": 0.01199896, "auxiliary_loss_mlp": 0.01036144, "balance_loss_clip": 0.98032743, "balance_loss_mlp": 1.02622604, "epoch": 0.22377201947934827, "flos": 23214847812480.0, "grad_norm": 1.7287552782953661, "language_loss": 0.8680169, "learning_rate": 3.6190407043285724e-06, "loss": 0.89037734, "num_input_tokens_seen": 39615360, "step": 1861, "time_per_iteration": 2.6717476844787598 }, { "auxiliary_loss_clip": 0.0121691, "auxiliary_loss_mlp": 0.01037044, "balance_loss_clip": 1.06171036, "balance_loss_mlp": 1.02706671, "epoch": 0.22389226236998738, "flos": 26794244056320.0, "grad_norm": 1.7052463129298254, "language_loss": 0.7565757, "learning_rate": 3.618583253658163e-06, "loss": 0.77911526, "num_input_tokens_seen": 39635460, "step": 1862, "time_per_iteration": 2.6320934295654297 }, { "auxiliary_loss_clip": 0.0120742, "auxiliary_loss_mlp": 0.01127456, "balance_loss_clip": 0.90363884, "balance_loss_mlp": 0.0, "epoch": 0.22401250526062647, "flos": 24170359455360.0, "grad_norm": 1.8148545791268653, "language_loss": 0.86238891, "learning_rate": 3.618125557449961e-06, "loss": 0.88573766, "num_input_tokens_seen": 39653515, "step": 1863, "time_per_iteration": 3.71750807762146 }, { "auxiliary_loss_clip": 0.01208845, "auxiliary_loss_mlp": 0.0102673, "balance_loss_clip": 1.02027047, "balance_loss_mlp": 1.01675177, "epoch": 0.22413274815126555, "flos": 16759761649920.0, "grad_norm": 1.844803698166738, "language_loss": 0.82768768, "learning_rate": 3.6176676157733983e-06, "loss": 0.85004348, "num_input_tokens_seen": 39668525, "step": 1864, "time_per_iteration": 2.658571481704712 }, { "auxiliary_loss_clip": 0.01197637, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 0.94066525, "balance_loss_mlp": 1.02488947, "epoch": 0.22425299104190466, "flos": 21360205900800.0, "grad_norm": 1.9464139185200222, "language_loss": 0.75993145, "learning_rate": 3.6172094286979443e-06, "loss": 0.78226316, "num_input_tokens_seen": 39685895, "step": 1865, "time_per_iteration": 3.6585910320281982 }, { "auxiliary_loss_clip": 0.01207313, "auxiliary_loss_mlp": 0.01035658, "balance_loss_clip": 0.97932088, "balance_loss_mlp": 1.02560878, "epoch": 0.22437323393254374, "flos": 32165547108480.0, "grad_norm": 1.4417725673619617, "language_loss": 0.81488031, "learning_rate": 3.6167509962931064e-06, "loss": 0.83730996, "num_input_tokens_seen": 39711595, "step": 1866, "time_per_iteration": 3.8107190132141113 }, { "auxiliary_loss_clip": 0.01215146, "auxiliary_loss_mlp": 0.01043939, "balance_loss_clip": 0.90816796, "balance_loss_mlp": 1.03352642, "epoch": 0.22449347682318282, "flos": 18002809664640.0, "grad_norm": 2.8755259062710667, "language_loss": 0.76940465, "learning_rate": 3.6162923186284276e-06, "loss": 0.79199553, "num_input_tokens_seen": 39727555, "step": 1867, "time_per_iteration": 2.695662021636963 }, { "auxiliary_loss_clip": 0.01205723, "auxiliary_loss_mlp": 0.01041836, "balance_loss_clip": 0.98039544, "balance_loss_mlp": 1.03203702, "epoch": 0.2246137197138219, "flos": 18697286194560.0, "grad_norm": 2.045296200668218, "language_loss": 0.85847104, "learning_rate": 3.6158333957734888e-06, "loss": 0.88094664, "num_input_tokens_seen": 39746145, "step": 1868, "time_per_iteration": 3.6315832138061523 }, { "auxiliary_loss_clip": 0.01208948, "auxiliary_loss_mlp": 0.01033586, "balance_loss_clip": 0.9400413, "balance_loss_mlp": 1.02342939, "epoch": 0.22473396260446102, "flos": 15590653781760.0, "grad_norm": 2.39019840441103, "language_loss": 0.82733518, "learning_rate": 3.6153742277979088e-06, "loss": 0.84976053, "num_input_tokens_seen": 39763575, "step": 1869, "time_per_iteration": 2.7070910930633545 }, { "auxiliary_loss_clip": 0.01211091, "auxiliary_loss_mlp": 0.01035724, "balance_loss_clip": 0.98202497, "balance_loss_mlp": 1.02528143, "epoch": 0.2248542054951001, "flos": 14465501182080.0, "grad_norm": 3.4591592179476036, "language_loss": 0.78545666, "learning_rate": 3.6149148147713434e-06, "loss": 0.80792487, "num_input_tokens_seen": 39781810, "step": 1870, "time_per_iteration": 2.681774377822876 }, { "auxiliary_loss_clip": 0.01220679, "auxiliary_loss_mlp": 0.01035301, "balance_loss_clip": 1.02652347, "balance_loss_mlp": 1.02521563, "epoch": 0.22497444838573918, "flos": 19243882431360.0, "grad_norm": 1.9488201760080943, "language_loss": 0.8707363, "learning_rate": 3.614455156763484e-06, "loss": 0.89329606, "num_input_tokens_seen": 39800115, "step": 1871, "time_per_iteration": 2.675969362258911 }, { "auxiliary_loss_clip": 0.01195543, "auxiliary_loss_mlp": 0.01038892, "balance_loss_clip": 0.89840984, "balance_loss_mlp": 1.02919459, "epoch": 0.2250946912763783, "flos": 16910299549440.0, "grad_norm": 2.275734181934113, "language_loss": 0.71446109, "learning_rate": 3.613995253844061e-06, "loss": 0.73680544, "num_input_tokens_seen": 39817795, "step": 1872, "time_per_iteration": 2.718625783920288 }, { "auxiliary_loss_clip": 0.01204889, "auxiliary_loss_mlp": 0.01038168, "balance_loss_clip": 1.01744282, "balance_loss_mlp": 1.02813697, "epoch": 0.22521493416701738, "flos": 24681368292480.0, "grad_norm": 2.0530997091131753, "language_loss": 0.80835503, "learning_rate": 3.6135351060828414e-06, "loss": 0.83078563, "num_input_tokens_seen": 39838270, "step": 1873, "time_per_iteration": 2.751965284347534 }, { "auxiliary_loss_clip": 0.01217913, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.06092143, "balance_loss_mlp": 1.02354503, "epoch": 0.22533517705765646, "flos": 17821963664640.0, "grad_norm": 2.8484741844510006, "language_loss": 0.69281769, "learning_rate": 3.6130747135496285e-06, "loss": 0.71534145, "num_input_tokens_seen": 39857270, "step": 1874, "time_per_iteration": 2.6714205741882324 }, { "auxiliary_loss_clip": 0.0121371, "auxiliary_loss_mlp": 0.01032566, "balance_loss_clip": 1.06013322, "balance_loss_mlp": 1.02234936, "epoch": 0.22545541994829554, "flos": 33691390899840.0, "grad_norm": 1.8564597674023737, "language_loss": 0.65993214, "learning_rate": 3.6126140763142646e-06, "loss": 0.68239492, "num_input_tokens_seen": 39882300, "step": 1875, "time_per_iteration": 2.7449913024902344 }, { "auxiliary_loss_clip": 0.01213317, "auxiliary_loss_mlp": 0.01042787, "balance_loss_clip": 1.06040192, "balance_loss_mlp": 1.03298783, "epoch": 0.22557566283893465, "flos": 19171594310400.0, "grad_norm": 2.4930602328636167, "language_loss": 0.85893649, "learning_rate": 3.6121531944466275e-06, "loss": 0.8814975, "num_input_tokens_seen": 39899625, "step": 1876, "time_per_iteration": 2.590020179748535 }, { "auxiliary_loss_clip": 0.01206813, "auxiliary_loss_mlp": 0.0103638, "balance_loss_clip": 1.0204165, "balance_loss_mlp": 1.02683127, "epoch": 0.22569590572957374, "flos": 20773281669120.0, "grad_norm": 2.074376100984681, "language_loss": 0.7877425, "learning_rate": 3.611692068016633e-06, "loss": 0.81017447, "num_input_tokens_seen": 39915955, "step": 1877, "time_per_iteration": 2.6824162006378174 }, { "auxiliary_loss_clip": 0.0120009, "auxiliary_loss_mlp": 0.01040794, "balance_loss_clip": 0.93910635, "balance_loss_mlp": 1.03061974, "epoch": 0.22581614862021282, "flos": 18442715529600.0, "grad_norm": 2.161253152585186, "language_loss": 0.74611914, "learning_rate": 3.611230697094233e-06, "loss": 0.76852798, "num_input_tokens_seen": 39932655, "step": 1878, "time_per_iteration": 2.693695545196533 }, { "auxiliary_loss_clip": 0.01209324, "auxiliary_loss_mlp": 0.01041209, "balance_loss_clip": 0.9804188, "balance_loss_mlp": 1.03125536, "epoch": 0.22593639151085193, "flos": 20048389297920.0, "grad_norm": 2.150118497703898, "language_loss": 0.87480235, "learning_rate": 3.6107690817494173e-06, "loss": 0.89730763, "num_input_tokens_seen": 39952875, "step": 1879, "time_per_iteration": 2.749594211578369 }, { "auxiliary_loss_clip": 0.0119565, "auxiliary_loss_mlp": 0.01037744, "balance_loss_clip": 0.90093046, "balance_loss_mlp": 1.02744389, "epoch": 0.226056634401491, "flos": 13115116350720.0, "grad_norm": 2.151880092763126, "language_loss": 0.7076723, "learning_rate": 3.6103072220522117e-06, "loss": 0.73000622, "num_input_tokens_seen": 39968405, "step": 1880, "time_per_iteration": 2.7514779567718506 }, { "auxiliary_loss_clip": 0.01210065, "auxiliary_loss_mlp": 0.01034585, "balance_loss_clip": 0.94379902, "balance_loss_mlp": 1.02497077, "epoch": 0.2261768772921301, "flos": 18988378012800.0, "grad_norm": 1.9089841664834843, "language_loss": 0.91856849, "learning_rate": 3.609845118072682e-06, "loss": 0.94101501, "num_input_tokens_seen": 39987075, "step": 1881, "time_per_iteration": 2.7703940868377686 }, { "auxiliary_loss_clip": 0.01217071, "auxiliary_loss_mlp": 0.01127887, "balance_loss_clip": 1.02264452, "balance_loss_mlp": 0.0, "epoch": 0.2262971201827692, "flos": 19974054101760.0, "grad_norm": 1.789435922194459, "language_loss": 0.80145985, "learning_rate": 3.6093827698809276e-06, "loss": 0.82490945, "num_input_tokens_seen": 40006175, "step": 1882, "time_per_iteration": 2.7591898441314697 }, { "auxiliary_loss_clip": 0.01209485, "auxiliary_loss_mlp": 0.01038351, "balance_loss_clip": 1.01945686, "balance_loss_mlp": 1.02926755, "epoch": 0.2264173630734083, "flos": 16654543735680.0, "grad_norm": 2.5504193068903, "language_loss": 0.8492015, "learning_rate": 3.6089201775470864e-06, "loss": 0.8716799, "num_input_tokens_seen": 40021630, "step": 1883, "time_per_iteration": 2.6441385746002197 }, { "auxiliary_loss_clip": 0.01193169, "auxiliary_loss_mlp": 0.01033072, "balance_loss_clip": 0.94047964, "balance_loss_mlp": 1.02374434, "epoch": 0.22653760596404737, "flos": 24389809597440.0, "grad_norm": 1.359873460227759, "language_loss": 0.77553368, "learning_rate": 3.6084573411413334e-06, "loss": 0.79779613, "num_input_tokens_seen": 40041025, "step": 1884, "time_per_iteration": 2.779238700866699 }, { "auxiliary_loss_clip": 0.01202707, "auxiliary_loss_mlp": 0.01037264, "balance_loss_clip": 0.94287819, "balance_loss_mlp": 1.02646339, "epoch": 0.22665784885468646, "flos": 18332541538560.0, "grad_norm": 2.0750082698607186, "language_loss": 0.80890119, "learning_rate": 3.607994260733881e-06, "loss": 0.83130091, "num_input_tokens_seen": 40060265, "step": 1885, "time_per_iteration": 2.704019784927368 }, { "auxiliary_loss_clip": 0.01195348, "auxiliary_loss_mlp": 0.01034813, "balance_loss_clip": 1.01601863, "balance_loss_mlp": 1.0254848, "epoch": 0.22677809174532557, "flos": 24058102475520.0, "grad_norm": 1.6652690124032232, "language_loss": 0.75002891, "learning_rate": 3.6075309363949776e-06, "loss": 0.77233052, "num_input_tokens_seen": 40079435, "step": 1886, "time_per_iteration": 2.7560272216796875 }, { "auxiliary_loss_clip": 0.0121265, "auxiliary_loss_mlp": 0.01040735, "balance_loss_clip": 1.05930734, "balance_loss_mlp": 1.03098989, "epoch": 0.22689833463596465, "flos": 20374242503040.0, "grad_norm": 2.3228800945337524, "language_loss": 0.81598735, "learning_rate": 3.6070673681949094e-06, "loss": 0.83852124, "num_input_tokens_seen": 40097800, "step": 1887, "time_per_iteration": 2.619534730911255 }, { "auxiliary_loss_clip": 0.01213943, "auxiliary_loss_mlp": 0.01127305, "balance_loss_clip": 0.98568559, "balance_loss_mlp": 0.0, "epoch": 0.22701857752660373, "flos": 30120398438400.0, "grad_norm": 1.591926207350638, "language_loss": 0.81462169, "learning_rate": 3.606603556203999e-06, "loss": 0.83803415, "num_input_tokens_seen": 40122745, "step": 1888, "time_per_iteration": 2.798651933670044 }, { "auxiliary_loss_clip": 0.0120969, "auxiliary_loss_mlp": 0.01040623, "balance_loss_clip": 1.01976657, "balance_loss_mlp": 1.03086603, "epoch": 0.22713882041724284, "flos": 22492182084480.0, "grad_norm": 1.7474489967435665, "language_loss": 0.83863151, "learning_rate": 3.6061395004926066e-06, "loss": 0.86113465, "num_input_tokens_seen": 40141680, "step": 1889, "time_per_iteration": 3.6784868240356445 }, { "auxiliary_loss_clip": 0.01213007, "auxiliary_loss_mlp": 0.01032085, "balance_loss_clip": 1.0591687, "balance_loss_mlp": 1.02133822, "epoch": 0.22725906330788193, "flos": 20521548178560.0, "grad_norm": 2.1062998088135783, "language_loss": 0.85178757, "learning_rate": 3.605675201131129e-06, "loss": 0.87423849, "num_input_tokens_seen": 40160140, "step": 1890, "time_per_iteration": 2.5678651332855225 }, { "auxiliary_loss_clip": 0.01216317, "auxiliary_loss_mlp": 0.01030783, "balance_loss_clip": 1.02257693, "balance_loss_mlp": 1.02051926, "epoch": 0.227379306198521, "flos": 18989922297600.0, "grad_norm": 2.300406281975205, "language_loss": 0.79664963, "learning_rate": 3.60521065819e-06, "loss": 0.81912065, "num_input_tokens_seen": 40177450, "step": 1891, "time_per_iteration": 3.6257827281951904 }, { "auxiliary_loss_clip": 0.01209743, "auxiliary_loss_mlp": 0.01041204, "balance_loss_clip": 0.98081303, "balance_loss_mlp": 1.03187037, "epoch": 0.2274995490891601, "flos": 21798351999360.0, "grad_norm": 1.7977168192263553, "language_loss": 0.8767978, "learning_rate": 3.60474587173969e-06, "loss": 0.89930725, "num_input_tokens_seen": 40195935, "step": 1892, "time_per_iteration": 2.706766366958618 }, { "auxiliary_loss_clip": 0.01210902, "auxiliary_loss_mlp": 0.01034937, "balance_loss_clip": 1.02249825, "balance_loss_mlp": 1.02482176, "epoch": 0.2276197919797992, "flos": 19058654972160.0, "grad_norm": 2.0888412604270044, "language_loss": 0.84380221, "learning_rate": 3.6042808418507084e-06, "loss": 0.86626065, "num_input_tokens_seen": 40213620, "step": 1893, "time_per_iteration": 3.6050846576690674 }, { "auxiliary_loss_clip": 0.01216104, "auxiliary_loss_mlp": 0.01044076, "balance_loss_clip": 1.02581632, "balance_loss_mlp": 1.03340721, "epoch": 0.22774003487043828, "flos": 18806777827200.0, "grad_norm": 1.8511063995963524, "language_loss": 0.76987898, "learning_rate": 3.6038155685935976e-06, "loss": 0.79248083, "num_input_tokens_seen": 40230190, "step": 1894, "time_per_iteration": 3.5428855419158936 }, { "auxiliary_loss_clip": 0.01209376, "auxiliary_loss_mlp": 0.01034054, "balance_loss_clip": 1.02127981, "balance_loss_mlp": 1.02356339, "epoch": 0.22786027776107737, "flos": 23002544476800.0, "grad_norm": 1.8131444054927452, "language_loss": 0.70467103, "learning_rate": 3.6033500520389404e-06, "loss": 0.72710538, "num_input_tokens_seen": 40246860, "step": 1895, "time_per_iteration": 2.6247663497924805 }, { "auxiliary_loss_clip": 0.01111009, "auxiliary_loss_mlp": 0.01003805, "balance_loss_clip": 0.87285703, "balance_loss_mlp": 0.99948943, "epoch": 0.22798052065171648, "flos": 66706872600960.0, "grad_norm": 0.7915168599982011, "language_loss": 0.64825463, "learning_rate": 3.6028842922573553e-06, "loss": 0.66940272, "num_input_tokens_seen": 40311005, "step": 1896, "time_per_iteration": 3.3941307067871094 }, { "auxiliary_loss_clip": 0.01111763, "auxiliary_loss_mlp": 0.01122326, "balance_loss_clip": 0.91055906, "balance_loss_mlp": 0.0, "epoch": 0.22810076354235556, "flos": 62080896758400.0, "grad_norm": 0.8625915952141087, "language_loss": 0.6292243, "learning_rate": 3.602418289319497e-06, "loss": 0.65156519, "num_input_tokens_seen": 40369560, "step": 1897, "time_per_iteration": 3.2578420639038086 }, { "auxiliary_loss_clip": 0.01201948, "auxiliary_loss_mlp": 0.01039968, "balance_loss_clip": 0.90467274, "balance_loss_mlp": 1.02989507, "epoch": 0.22822100643299464, "flos": 23876358635520.0, "grad_norm": 1.7594934875151573, "language_loss": 0.73077041, "learning_rate": 3.601952043296059e-06, "loss": 0.75318956, "num_input_tokens_seen": 40389555, "step": 1898, "time_per_iteration": 2.7543931007385254 }, { "auxiliary_loss_clip": 0.01216129, "auxiliary_loss_mlp": 0.01038921, "balance_loss_clip": 0.98355681, "balance_loss_mlp": 1.02908659, "epoch": 0.22834124932363373, "flos": 20991331180800.0, "grad_norm": 2.287716928285902, "language_loss": 0.80691826, "learning_rate": 3.6014855542577696e-06, "loss": 0.82946879, "num_input_tokens_seen": 40406765, "step": 1899, "time_per_iteration": 2.7486226558685303 }, { "auxiliary_loss_clip": 0.01207235, "auxiliary_loss_mlp": 0.01043422, "balance_loss_clip": 0.98328102, "balance_loss_mlp": 1.03273475, "epoch": 0.22846149221427284, "flos": 24901572620160.0, "grad_norm": 1.8355548491332054, "language_loss": 0.8453815, "learning_rate": 3.6010188222753943e-06, "loss": 0.86788809, "num_input_tokens_seen": 40427535, "step": 1900, "time_per_iteration": 2.7395660877227783 }, { "auxiliary_loss_clip": 0.01108763, "auxiliary_loss_mlp": 0.01003162, "balance_loss_clip": 0.95172405, "balance_loss_mlp": 0.99929965, "epoch": 0.22858173510491192, "flos": 56132294319360.0, "grad_norm": 0.9180039269587318, "language_loss": 0.64228964, "learning_rate": 3.6005518474197372e-06, "loss": 0.66340899, "num_input_tokens_seen": 40479580, "step": 1901, "time_per_iteration": 3.136599063873291 }, { "auxiliary_loss_clip": 0.0121173, "auxiliary_loss_mlp": 0.01038303, "balance_loss_clip": 1.02311599, "balance_loss_mlp": 1.02755606, "epoch": 0.228701977995551, "flos": 24170826332160.0, "grad_norm": 4.209610341129893, "language_loss": 0.78512824, "learning_rate": 3.6000846297616373e-06, "loss": 0.80762857, "num_input_tokens_seen": 40497880, "step": 1902, "time_per_iteration": 2.689096689224243 }, { "auxiliary_loss_clip": 0.01221497, "auxiliary_loss_mlp": 0.0104369, "balance_loss_clip": 1.06629217, "balance_loss_mlp": 1.03237104, "epoch": 0.22882222088619011, "flos": 21387892308480.0, "grad_norm": 2.123861814382286, "language_loss": 0.72649068, "learning_rate": 3.5996171693719717e-06, "loss": 0.74914253, "num_input_tokens_seen": 40513975, "step": 1903, "time_per_iteration": 2.669870138168335 }, { "auxiliary_loss_clip": 0.01110285, "auxiliary_loss_mlp": 0.01004261, "balance_loss_clip": 0.98855996, "balance_loss_mlp": 1.00025606, "epoch": 0.2289424637768292, "flos": 64589615377920.0, "grad_norm": 0.8361843826368187, "language_loss": 0.64860541, "learning_rate": 3.5991494663216528e-06, "loss": 0.66975087, "num_input_tokens_seen": 40576960, "step": 1904, "time_per_iteration": 3.2534613609313965 }, { "auxiliary_loss_clip": 0.01214313, "auxiliary_loss_mlp": 0.01038569, "balance_loss_clip": 1.06134009, "balance_loss_mlp": 1.02784598, "epoch": 0.22906270666746828, "flos": 22163419877760.0, "grad_norm": 2.120970771874738, "language_loss": 0.87565923, "learning_rate": 3.5986815206816314e-06, "loss": 0.89818799, "num_input_tokens_seen": 40595780, "step": 1905, "time_per_iteration": 2.6074204444885254 }, { "auxiliary_loss_clip": 0.0121461, "auxiliary_loss_mlp": 0.01035562, "balance_loss_clip": 1.0613035, "balance_loss_mlp": 1.02570963, "epoch": 0.2291829495581074, "flos": 25772334122880.0, "grad_norm": 1.7905733447391534, "language_loss": 0.74518585, "learning_rate": 3.598213332522895e-06, "loss": 0.76768756, "num_input_tokens_seen": 40615810, "step": 1906, "time_per_iteration": 2.671447515487671 }, { "auxiliary_loss_clip": 0.01212552, "auxiliary_loss_mlp": 0.01040461, "balance_loss_clip": 1.02379131, "balance_loss_mlp": 1.03117454, "epoch": 0.22930319244874647, "flos": 31172760126720.0, "grad_norm": 1.8832105609634828, "language_loss": 0.77469903, "learning_rate": 3.597744901916466e-06, "loss": 0.79722917, "num_input_tokens_seen": 40637095, "step": 1907, "time_per_iteration": 2.6968953609466553 }, { "auxiliary_loss_clip": 0.01216902, "auxiliary_loss_mlp": 0.01033516, "balance_loss_clip": 1.05949819, "balance_loss_mlp": 1.02266181, "epoch": 0.22942343533938556, "flos": 23254098399360.0, "grad_norm": 2.5621541135228245, "language_loss": 0.76885426, "learning_rate": 3.5972762289334058e-06, "loss": 0.79135847, "num_input_tokens_seen": 40656725, "step": 1908, "time_per_iteration": 2.646167755126953 }, { "auxiliary_loss_clip": 0.01205184, "auxiliary_loss_mlp": 0.01031048, "balance_loss_clip": 0.86976767, "balance_loss_mlp": 1.02092671, "epoch": 0.22954367823002464, "flos": 14610903436800.0, "grad_norm": 2.3177165365970085, "language_loss": 0.84775156, "learning_rate": 3.5968073136448116e-06, "loss": 0.87011385, "num_input_tokens_seen": 40674745, "step": 1909, "time_per_iteration": 2.843153715133667 }, { "auxiliary_loss_clip": 0.01218797, "auxiliary_loss_mlp": 0.01041796, "balance_loss_clip": 1.022228, "balance_loss_mlp": 1.0316695, "epoch": 0.22966392112066375, "flos": 16763604405120.0, "grad_norm": 1.6557264455502965, "language_loss": 0.9129039, "learning_rate": 3.596338156121818e-06, "loss": 0.9355098, "num_input_tokens_seen": 40693630, "step": 1910, "time_per_iteration": 2.819718837738037 }, { "auxiliary_loss_clip": 0.01107229, "auxiliary_loss_mlp": 0.01007136, "balance_loss_clip": 0.94934583, "balance_loss_mlp": 1.00320232, "epoch": 0.22978416401130283, "flos": 67474247783040.0, "grad_norm": 0.7495482510009539, "language_loss": 0.59409618, "learning_rate": 3.595868756435595e-06, "loss": 0.61523986, "num_input_tokens_seen": 40761310, "step": 1911, "time_per_iteration": 3.391648530960083 }, { "auxiliary_loss_clip": 0.01214515, "auxiliary_loss_mlp": 0.01036038, "balance_loss_clip": 0.94838965, "balance_loss_mlp": 1.02526712, "epoch": 0.22990440690194192, "flos": 19865137086720.0, "grad_norm": 2.3890903830609607, "language_loss": 0.80474883, "learning_rate": 3.5953991146573504e-06, "loss": 0.8272543, "num_input_tokens_seen": 40779955, "step": 1912, "time_per_iteration": 2.7522835731506348 }, { "auxiliary_loss_clip": 0.0121581, "auxiliary_loss_mlp": 0.0104594, "balance_loss_clip": 1.02124548, "balance_loss_mlp": 1.03483534, "epoch": 0.23002464979258103, "flos": 13289246507520.0, "grad_norm": 2.2970508950221, "language_loss": 0.83488172, "learning_rate": 3.5949292308583294e-06, "loss": 0.85749924, "num_input_tokens_seen": 40793200, "step": 1913, "time_per_iteration": 2.6767585277557373 }, { "auxiliary_loss_clip": 0.01216932, "auxiliary_loss_mlp": 0.01041521, "balance_loss_clip": 1.06387258, "balance_loss_mlp": 1.03103602, "epoch": 0.2301448926832201, "flos": 22163779013760.0, "grad_norm": 2.15434159653544, "language_loss": 0.80936754, "learning_rate": 3.594459105109811e-06, "loss": 0.83195198, "num_input_tokens_seen": 40812380, "step": 1914, "time_per_iteration": 2.6340324878692627 }, { "auxiliary_loss_clip": 0.01219083, "auxiliary_loss_mlp": 0.01034856, "balance_loss_clip": 1.02659619, "balance_loss_mlp": 1.02483678, "epoch": 0.2302651355738592, "flos": 20704477167360.0, "grad_norm": 1.9418885802737305, "language_loss": 0.81447983, "learning_rate": 3.593988737483115e-06, "loss": 0.83701921, "num_input_tokens_seen": 40832320, "step": 1915, "time_per_iteration": 3.6209588050842285 }, { "auxiliary_loss_clip": 0.01214624, "auxiliary_loss_mlp": 0.01035711, "balance_loss_clip": 0.98528099, "balance_loss_mlp": 1.02549529, "epoch": 0.23038537846449827, "flos": 18588943797120.0, "grad_norm": 2.7397030302524934, "language_loss": 0.78164363, "learning_rate": 3.5935181280495947e-06, "loss": 0.80414701, "num_input_tokens_seen": 40850900, "step": 1916, "time_per_iteration": 2.6929092407226562 }, { "auxiliary_loss_clip": 0.01100331, "auxiliary_loss_mlp": 0.01002839, "balance_loss_clip": 0.94505513, "balance_loss_mlp": 0.99864239, "epoch": 0.23050562135513739, "flos": 64224260190720.0, "grad_norm": 0.7951995229715979, "language_loss": 0.54248869, "learning_rate": 3.5930472768806412e-06, "loss": 0.56352043, "num_input_tokens_seen": 40909570, "step": 1917, "time_per_iteration": 3.2231850624084473 }, { "auxiliary_loss_clip": 0.01220507, "auxiliary_loss_mlp": 0.01048861, "balance_loss_clip": 1.06556368, "balance_loss_mlp": 1.03871, "epoch": 0.23062586424577647, "flos": 17313396952320.0, "grad_norm": 1.8950053797454351, "language_loss": 0.77210665, "learning_rate": 3.5925761840476826e-06, "loss": 0.79480028, "num_input_tokens_seen": 40928180, "step": 1918, "time_per_iteration": 3.6082534790039062 }, { "auxiliary_loss_clip": 0.01208453, "auxiliary_loss_mlp": 0.01035946, "balance_loss_clip": 0.98616678, "balance_loss_mlp": 1.02574146, "epoch": 0.23074610713641555, "flos": 27855979194240.0, "grad_norm": 1.8000883517938093, "language_loss": 0.81382006, "learning_rate": 3.592104849622183e-06, "loss": 0.83626401, "num_input_tokens_seen": 40950435, "step": 1919, "time_per_iteration": 3.676081657409668 }, { "auxiliary_loss_clip": 0.01197511, "auxiliary_loss_mlp": 0.01029505, "balance_loss_clip": 0.90662307, "balance_loss_mlp": 1.01922894, "epoch": 0.23086635002705466, "flos": 28841798937600.0, "grad_norm": 1.6655924494788856, "language_loss": 0.73133039, "learning_rate": 3.591633273675644e-06, "loss": 0.7536006, "num_input_tokens_seen": 40972670, "step": 1920, "time_per_iteration": 2.911912441253662 }, { "auxiliary_loss_clip": 0.01097285, "auxiliary_loss_mlp": 0.01006448, "balance_loss_clip": 0.91357875, "balance_loss_mlp": 1.00246608, "epoch": 0.23098659291769374, "flos": 62923681566720.0, "grad_norm": 0.9052104968799436, "language_loss": 0.58228505, "learning_rate": 3.591161456279602e-06, "loss": 0.60332239, "num_input_tokens_seen": 41018215, "step": 1921, "time_per_iteration": 4.1792333126068115 }, { "auxiliary_loss_clip": 0.01218856, "auxiliary_loss_mlp": 0.01037123, "balance_loss_clip": 0.98439419, "balance_loss_mlp": 1.02665687, "epoch": 0.23110683580833283, "flos": 23476816679040.0, "grad_norm": 1.5246830671181215, "language_loss": 0.80450225, "learning_rate": 3.590689397505633e-06, "loss": 0.82706201, "num_input_tokens_seen": 41039125, "step": 1922, "time_per_iteration": 2.7312982082366943 }, { "auxiliary_loss_clip": 0.0121218, "auxiliary_loss_mlp": 0.01042787, "balance_loss_clip": 1.06074178, "balance_loss_mlp": 1.03294039, "epoch": 0.2312270786989719, "flos": 27271066124160.0, "grad_norm": 2.765690762724468, "language_loss": 0.86725801, "learning_rate": 3.590217097425347e-06, "loss": 0.8898077, "num_input_tokens_seen": 41059025, "step": 1923, "time_per_iteration": 2.713315486907959 }, { "auxiliary_loss_clip": 0.01218806, "auxiliary_loss_mlp": 0.01043648, "balance_loss_clip": 1.06365716, "balance_loss_mlp": 1.03333092, "epoch": 0.23134732158961102, "flos": 13261344618240.0, "grad_norm": 1.9350444947636942, "language_loss": 0.71427858, "learning_rate": 3.589744556110391e-06, "loss": 0.73690313, "num_input_tokens_seen": 41077015, "step": 1924, "time_per_iteration": 2.6702818870544434 }, { "auxiliary_loss_clip": 0.01204979, "auxiliary_loss_mlp": 0.01037758, "balance_loss_clip": 0.98156762, "balance_loss_mlp": 1.02742279, "epoch": 0.2314675644802501, "flos": 36977648250240.0, "grad_norm": 1.5297989414576265, "language_loss": 0.84487468, "learning_rate": 3.58927177363245e-06, "loss": 0.86730206, "num_input_tokens_seen": 41099840, "step": 1925, "time_per_iteration": 2.867764472961426 }, { "auxiliary_loss_clip": 0.01200244, "auxiliary_loss_mlp": 0.01038612, "balance_loss_clip": 0.94162434, "balance_loss_mlp": 1.02779388, "epoch": 0.2315878073708892, "flos": 23842207779840.0, "grad_norm": 2.0825959887440146, "language_loss": 0.73046571, "learning_rate": 3.5887987500632447e-06, "loss": 0.75285423, "num_input_tokens_seen": 41117845, "step": 1926, "time_per_iteration": 2.8377954959869385 }, { "auxiliary_loss_clip": 0.01210946, "auxiliary_loss_mlp": 0.01040425, "balance_loss_clip": 0.94361222, "balance_loss_mlp": 1.03094184, "epoch": 0.2317080502615283, "flos": 23039424766080.0, "grad_norm": 1.7225063150212934, "language_loss": 0.8405844, "learning_rate": 3.5883254854745325e-06, "loss": 0.86309808, "num_input_tokens_seen": 41136235, "step": 1927, "time_per_iteration": 2.764893054962158 }, { "auxiliary_loss_clip": 0.01216139, "auxiliary_loss_mlp": 0.01034645, "balance_loss_clip": 1.02089596, "balance_loss_mlp": 1.02425003, "epoch": 0.23182829315216738, "flos": 11254656435840.0, "grad_norm": 1.9926998741013027, "language_loss": 0.74906623, "learning_rate": 3.587851979938107e-06, "loss": 0.77157414, "num_input_tokens_seen": 41153125, "step": 1928, "time_per_iteration": 2.699862003326416 }, { "auxiliary_loss_clip": 0.01210525, "auxiliary_loss_mlp": 0.01035625, "balance_loss_clip": 1.02199161, "balance_loss_mlp": 1.02559304, "epoch": 0.23194853604280646, "flos": 19828939155840.0, "grad_norm": 1.977545446063583, "language_loss": 0.77453309, "learning_rate": 3.5873782335257985e-06, "loss": 0.79699463, "num_input_tokens_seen": 41171290, "step": 1929, "time_per_iteration": 2.6282689571380615 }, { "auxiliary_loss_clip": 0.01208689, "auxiliary_loss_mlp": 0.0104047, "balance_loss_clip": 0.94624209, "balance_loss_mlp": 1.0304023, "epoch": 0.23206877893344555, "flos": 15305020830720.0, "grad_norm": 1.9793527487072846, "language_loss": 0.78422773, "learning_rate": 3.5869042463094744e-06, "loss": 0.80671936, "num_input_tokens_seen": 41189005, "step": 1930, "time_per_iteration": 2.7714149951934814 }, { "auxiliary_loss_clip": 0.01186727, "auxiliary_loss_mlp": 0.01036571, "balance_loss_clip": 0.90223694, "balance_loss_mlp": 1.0260036, "epoch": 0.23218902182408466, "flos": 22711488572160.0, "grad_norm": 1.825043157382343, "language_loss": 0.77403158, "learning_rate": 3.586430018361038e-06, "loss": 0.79626453, "num_input_tokens_seen": 41208775, "step": 1931, "time_per_iteration": 2.749321460723877 }, { "auxiliary_loss_clip": 0.01197897, "auxiliary_loss_mlp": 0.01036766, "balance_loss_clip": 0.98180401, "balance_loss_mlp": 1.02570343, "epoch": 0.23230926471472374, "flos": 22710734386560.0, "grad_norm": 3.009256429861296, "language_loss": 0.76522708, "learning_rate": 3.5859555497524283e-06, "loss": 0.7875737, "num_input_tokens_seen": 41226010, "step": 1932, "time_per_iteration": 2.755347728729248 }, { "auxiliary_loss_clip": 0.0121107, "auxiliary_loss_mlp": 0.01032439, "balance_loss_clip": 1.02326441, "balance_loss_mlp": 1.022789, "epoch": 0.23242950760536282, "flos": 20375499479040.0, "grad_norm": 2.3706007812011665, "language_loss": 0.92091715, "learning_rate": 3.5854808405556237e-06, "loss": 0.94335222, "num_input_tokens_seen": 41245245, "step": 1933, "time_per_iteration": 2.711890697479248 }, { "auxiliary_loss_clip": 0.01212414, "auxiliary_loss_mlp": 0.01038892, "balance_loss_clip": 0.94735098, "balance_loss_mlp": 1.02930176, "epoch": 0.23254975049600193, "flos": 16908324301440.0, "grad_norm": 3.0819686846166374, "language_loss": 0.7485857, "learning_rate": 3.5850058908426355e-06, "loss": 0.77109885, "num_input_tokens_seen": 41263795, "step": 1934, "time_per_iteration": 2.6879703998565674 }, { "auxiliary_loss_clip": 0.01210446, "auxiliary_loss_mlp": 0.01035674, "balance_loss_clip": 0.97998536, "balance_loss_mlp": 1.02567816, "epoch": 0.23266999338664102, "flos": 23294821443840.0, "grad_norm": 1.7246550135791279, "language_loss": 0.85913527, "learning_rate": 3.584530700685514e-06, "loss": 0.88159645, "num_input_tokens_seen": 41284055, "step": 1935, "time_per_iteration": 2.7534260749816895 }, { "auxiliary_loss_clip": 0.01207896, "auxiliary_loss_mlp": 0.01034483, "balance_loss_clip": 0.98708189, "balance_loss_mlp": 1.02473772, "epoch": 0.2327902362772801, "flos": 19569987031680.0, "grad_norm": 2.1257968753989487, "language_loss": 0.8903116, "learning_rate": 3.5840552701563448e-06, "loss": 0.9127354, "num_input_tokens_seen": 41300255, "step": 1936, "time_per_iteration": 2.6645336151123047 }, { "auxiliary_loss_clip": 0.01211395, "auxiliary_loss_mlp": 0.01034839, "balance_loss_clip": 1.06066084, "balance_loss_mlp": 1.02460432, "epoch": 0.2329104791679192, "flos": 16727514215040.0, "grad_norm": 2.0900391484499163, "language_loss": 0.81867445, "learning_rate": 3.5835795993272513e-06, "loss": 0.84113675, "num_input_tokens_seen": 41318540, "step": 1937, "time_per_iteration": 2.5995967388153076 }, { "auxiliary_loss_clip": 0.01198723, "auxiliary_loss_mlp": 0.01037215, "balance_loss_clip": 0.78847426, "balance_loss_mlp": 1.02750516, "epoch": 0.2330307220585583, "flos": 22163743100160.0, "grad_norm": 1.9655895647734132, "language_loss": 0.71239316, "learning_rate": 3.583103688270391e-06, "loss": 0.7347526, "num_input_tokens_seen": 41338320, "step": 1938, "time_per_iteration": 3.152052402496338 }, { "auxiliary_loss_clip": 0.01190255, "auxiliary_loss_mlp": 0.01039706, "balance_loss_clip": 0.97874486, "balance_loss_mlp": 1.02965629, "epoch": 0.23315096494919738, "flos": 19317319787520.0, "grad_norm": 2.0537167144046293, "language_loss": 0.89826846, "learning_rate": 3.58262753705796e-06, "loss": 0.92056805, "num_input_tokens_seen": 41353210, "step": 1939, "time_per_iteration": 3.066774368286133 }, { "auxiliary_loss_clip": 0.0109885, "auxiliary_loss_mlp": 0.01008437, "balance_loss_clip": 0.94438267, "balance_loss_mlp": 1.00419283, "epoch": 0.23327120783983646, "flos": 53031048946560.0, "grad_norm": 0.7596701838641039, "language_loss": 0.55527431, "learning_rate": 3.5821511457621902e-06, "loss": 0.57634723, "num_input_tokens_seen": 41410510, "step": 1940, "time_per_iteration": 3.2797293663024902 }, { "auxiliary_loss_clip": 0.01198294, "auxiliary_loss_mlp": 0.01039314, "balance_loss_clip": 0.98054564, "balance_loss_mlp": 1.02869248, "epoch": 0.23339145073047557, "flos": 17126984344320.0, "grad_norm": 2.8112759933013343, "language_loss": 0.81198716, "learning_rate": 3.5816745144553497e-06, "loss": 0.83436322, "num_input_tokens_seen": 41425830, "step": 1941, "time_per_iteration": 4.07315468788147 }, { "auxiliary_loss_clip": 0.01202748, "auxiliary_loss_mlp": 0.01029494, "balance_loss_clip": 0.90816832, "balance_loss_mlp": 1.01961803, "epoch": 0.23351169362111465, "flos": 13078918419840.0, "grad_norm": 2.1271894991696016, "language_loss": 0.74972361, "learning_rate": 3.5811976432097424e-06, "loss": 0.77204609, "num_input_tokens_seen": 41443500, "step": 1942, "time_per_iteration": 2.865281820297241 }, { "auxiliary_loss_clip": 0.0121597, "auxiliary_loss_mlp": 0.0112696, "balance_loss_clip": 1.02832878, "balance_loss_mlp": 0.0, "epoch": 0.23363193651175373, "flos": 15851257931520.0, "grad_norm": 1.9395984686344452, "language_loss": 0.84725404, "learning_rate": 3.58072053209771e-06, "loss": 0.87068337, "num_input_tokens_seen": 41460055, "step": 1943, "time_per_iteration": 2.6714277267456055 }, { "auxiliary_loss_clip": 0.01199957, "auxiliary_loss_mlp": 0.01032394, "balance_loss_clip": 0.9789086, "balance_loss_mlp": 1.02212977, "epoch": 0.23375217940239285, "flos": 21025769345280.0, "grad_norm": 1.8305796436490667, "language_loss": 0.78857219, "learning_rate": 3.5802431811916296e-06, "loss": 0.81089562, "num_input_tokens_seen": 41476665, "step": 1944, "time_per_iteration": 3.696798086166382 }, { "auxiliary_loss_clip": 0.01205499, "auxiliary_loss_mlp": 0.01039664, "balance_loss_clip": 0.98288053, "balance_loss_mlp": 1.03017461, "epoch": 0.23387242229303193, "flos": 20594698225920.0, "grad_norm": 2.1560942440523347, "language_loss": 0.80629176, "learning_rate": 3.579765590563916e-06, "loss": 0.82874346, "num_input_tokens_seen": 41496065, "step": 1945, "time_per_iteration": 3.5720059871673584 }, { "auxiliary_loss_clip": 0.0119963, "auxiliary_loss_mlp": 0.0102631, "balance_loss_clip": 1.02010441, "balance_loss_mlp": 1.01634455, "epoch": 0.233992665183671, "flos": 24279491952000.0, "grad_norm": 3.5146576091333572, "language_loss": 0.81755161, "learning_rate": 3.579287760287017e-06, "loss": 0.83981109, "num_input_tokens_seen": 41516815, "step": 1946, "time_per_iteration": 2.6988625526428223 }, { "auxiliary_loss_clip": 0.01211162, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.02358675, "balance_loss_mlp": 1.02557039, "epoch": 0.2341129080743101, "flos": 30154621121280.0, "grad_norm": 1.627994926204797, "language_loss": 0.72862756, "learning_rate": 3.578809690433421e-06, "loss": 0.75109148, "num_input_tokens_seen": 41538525, "step": 1947, "time_per_iteration": 2.742434501647949 }, { "auxiliary_loss_clip": 0.01217736, "auxiliary_loss_mlp": 0.01032463, "balance_loss_clip": 1.06266975, "balance_loss_mlp": 1.02207935, "epoch": 0.2342331509649492, "flos": 22784135829120.0, "grad_norm": 2.130592004755542, "language_loss": 0.81446749, "learning_rate": 3.578331381075651e-06, "loss": 0.83696949, "num_input_tokens_seen": 41559025, "step": 1948, "time_per_iteration": 3.5448403358459473 }, { "auxiliary_loss_clip": 0.01208736, "auxiliary_loss_mlp": 0.01029719, "balance_loss_clip": 1.01852369, "balance_loss_mlp": 1.01986074, "epoch": 0.2343533938555883, "flos": 23623152687360.0, "grad_norm": 2.501636664849308, "language_loss": 0.69176042, "learning_rate": 3.5778528322862646e-06, "loss": 0.71414495, "num_input_tokens_seen": 41577845, "step": 1949, "time_per_iteration": 2.6697030067443848 }, { "auxiliary_loss_clip": 0.01213178, "auxiliary_loss_mlp": 0.01037111, "balance_loss_clip": 1.02102947, "balance_loss_mlp": 1.02772307, "epoch": 0.23447363674622737, "flos": 24570332375040.0, "grad_norm": 1.5559506759633706, "language_loss": 0.86486912, "learning_rate": 3.5773740441378585e-06, "loss": 0.88737196, "num_input_tokens_seen": 41598600, "step": 1950, "time_per_iteration": 2.713487148284912 }, { "auxiliary_loss_clip": 0.01205862, "auxiliary_loss_mlp": 0.01037455, "balance_loss_clip": 1.02107978, "balance_loss_mlp": 1.02826989, "epoch": 0.23459387963686648, "flos": 53140322119680.0, "grad_norm": 1.5901028851714656, "language_loss": 0.73373502, "learning_rate": 3.5768950167030633e-06, "loss": 0.75616825, "num_input_tokens_seen": 41623300, "step": 1951, "time_per_iteration": 2.922863721847534 }, { "auxiliary_loss_clip": 0.01191471, "auxiliary_loss_mlp": 0.0103865, "balance_loss_clip": 0.97710896, "balance_loss_mlp": 1.02877319, "epoch": 0.23471412252750556, "flos": 23951412103680.0, "grad_norm": 1.787545677851132, "language_loss": 0.78545767, "learning_rate": 3.576415750054548e-06, "loss": 0.80775887, "num_input_tokens_seen": 41643420, "step": 1952, "time_per_iteration": 2.7186625003814697 }, { "auxiliary_loss_clip": 0.01198703, "auxiliary_loss_mlp": 0.0103743, "balance_loss_clip": 0.97991407, "balance_loss_mlp": 1.02730954, "epoch": 0.23483436541814465, "flos": 15706573948800.0, "grad_norm": 1.7828528873404648, "language_loss": 0.85655725, "learning_rate": 3.5759362442650172e-06, "loss": 0.87891853, "num_input_tokens_seen": 41660170, "step": 1953, "time_per_iteration": 2.7628142833709717 }, { "auxiliary_loss_clip": 0.01208653, "auxiliary_loss_mlp": 0.01038454, "balance_loss_clip": 1.02199316, "balance_loss_mlp": 1.02853572, "epoch": 0.23495460830878373, "flos": 24936262179840.0, "grad_norm": 1.8540477690132828, "language_loss": 0.85432911, "learning_rate": 3.5754564994072113e-06, "loss": 0.87680018, "num_input_tokens_seen": 41679010, "step": 1954, "time_per_iteration": 2.670884609222412 }, { "auxiliary_loss_clip": 0.01201314, "auxiliary_loss_mlp": 0.01034094, "balance_loss_clip": 0.98050117, "balance_loss_mlp": 1.02452159, "epoch": 0.23507485119942284, "flos": 30482665056000.0, "grad_norm": 2.2746366363089567, "language_loss": 0.60115147, "learning_rate": 3.5749765155539067e-06, "loss": 0.62350553, "num_input_tokens_seen": 41699495, "step": 1955, "time_per_iteration": 2.7545957565307617 }, { "auxiliary_loss_clip": 0.012034, "auxiliary_loss_mlp": 0.01036198, "balance_loss_clip": 0.94254684, "balance_loss_mlp": 1.02706099, "epoch": 0.23519509409006192, "flos": 18329129746560.0, "grad_norm": 2.23688694275574, "language_loss": 0.91885614, "learning_rate": 3.574496292777917e-06, "loss": 0.94125217, "num_input_tokens_seen": 41717705, "step": 1956, "time_per_iteration": 2.7750234603881836 }, { "auxiliary_loss_clip": 0.01213941, "auxiliary_loss_mlp": 0.01034284, "balance_loss_clip": 0.98179376, "balance_loss_mlp": 1.02446747, "epoch": 0.235315336980701, "flos": 29643217234560.0, "grad_norm": 1.9125493953461077, "language_loss": 0.71481812, "learning_rate": 3.574015831152092e-06, "loss": 0.7373004, "num_input_tokens_seen": 41738120, "step": 1957, "time_per_iteration": 2.7321228981018066 }, { "auxiliary_loss_clip": 0.01194564, "auxiliary_loss_mlp": 0.0103143, "balance_loss_clip": 0.98031902, "balance_loss_mlp": 1.02171421, "epoch": 0.23543557987134012, "flos": 18551704371840.0, "grad_norm": 2.1465886436509583, "language_loss": 0.83331645, "learning_rate": 3.573535130749316e-06, "loss": 0.8555764, "num_input_tokens_seen": 41756070, "step": 1958, "time_per_iteration": 2.6529908180236816 }, { "auxiliary_loss_clip": 0.01197707, "auxiliary_loss_mlp": 0.01032625, "balance_loss_clip": 0.98154098, "balance_loss_mlp": 1.02261782, "epoch": 0.2355558227619792, "flos": 24679033908480.0, "grad_norm": 1.6420502750304486, "language_loss": 0.73463786, "learning_rate": 3.5730541916425127e-06, "loss": 0.75694114, "num_input_tokens_seen": 41777550, "step": 1959, "time_per_iteration": 2.695159912109375 }, { "auxiliary_loss_clip": 0.01209184, "auxiliary_loss_mlp": 0.01032786, "balance_loss_clip": 0.94486439, "balance_loss_mlp": 1.02312982, "epoch": 0.23567606565261828, "flos": 21944795748480.0, "grad_norm": 1.828577287306282, "language_loss": 0.86565638, "learning_rate": 3.572573013904639e-06, "loss": 0.88807607, "num_input_tokens_seen": 41797460, "step": 1960, "time_per_iteration": 2.6791460514068604 }, { "auxiliary_loss_clip": 0.012101, "auxiliary_loss_mlp": 0.01034502, "balance_loss_clip": 1.05979657, "balance_loss_mlp": 1.02498293, "epoch": 0.2357963085432574, "flos": 13589352639360.0, "grad_norm": 1.8135752296191536, "language_loss": 0.92198318, "learning_rate": 3.572091597608689e-06, "loss": 0.94442928, "num_input_tokens_seen": 41815585, "step": 1961, "time_per_iteration": 2.6057419776916504 }, { "auxiliary_loss_clip": 0.01215656, "auxiliary_loss_mlp": 0.01044097, "balance_loss_clip": 0.9853459, "balance_loss_mlp": 1.03350496, "epoch": 0.23591655143389648, "flos": 22088689632000.0, "grad_norm": 2.8967518321344383, "language_loss": 0.73489058, "learning_rate": 3.571609942827694e-06, "loss": 0.75748813, "num_input_tokens_seen": 41834700, "step": 1962, "time_per_iteration": 2.6895735263824463 }, { "auxiliary_loss_clip": 0.01205944, "auxiliary_loss_mlp": 0.01034686, "balance_loss_clip": 0.98146367, "balance_loss_mlp": 1.02499986, "epoch": 0.23603679432453556, "flos": 17017349057280.0, "grad_norm": 1.8029577927204383, "language_loss": 0.88281524, "learning_rate": 3.57112804963472e-06, "loss": 0.90522152, "num_input_tokens_seen": 41852915, "step": 1963, "time_per_iteration": 2.6458144187927246 }, { "auxiliary_loss_clip": 0.01201369, "auxiliary_loss_mlp": 0.0103502, "balance_loss_clip": 0.94615757, "balance_loss_mlp": 1.0248158, "epoch": 0.23615703721517464, "flos": 19171307001600.0, "grad_norm": 1.7574844890329484, "language_loss": 0.76758689, "learning_rate": 3.57064591810287e-06, "loss": 0.78995079, "num_input_tokens_seen": 41870415, "step": 1964, "time_per_iteration": 2.7060909271240234 }, { "auxiliary_loss_clip": 0.01211377, "auxiliary_loss_mlp": 0.01126273, "balance_loss_clip": 1.06119204, "balance_loss_mlp": 0.0, "epoch": 0.23627728010581375, "flos": 19098803399040.0, "grad_norm": 2.75728107040267, "language_loss": 0.8046875, "learning_rate": 3.570163548305284e-06, "loss": 0.82806396, "num_input_tokens_seen": 41889345, "step": 1965, "time_per_iteration": 2.638603448867798 }, { "auxiliary_loss_clip": 0.0120617, "auxiliary_loss_mlp": 0.01030917, "balance_loss_clip": 0.98134792, "balance_loss_mlp": 1.02083182, "epoch": 0.23639752299645284, "flos": 14282213057280.0, "grad_norm": 2.0403266791622325, "language_loss": 0.69849443, "learning_rate": 3.569680940315135e-06, "loss": 0.72086537, "num_input_tokens_seen": 41905745, "step": 1966, "time_per_iteration": 2.7678043842315674 }, { "auxiliary_loss_clip": 0.01210643, "auxiliary_loss_mlp": 0.01036578, "balance_loss_clip": 0.94428951, "balance_loss_mlp": 1.02634358, "epoch": 0.23651776588709192, "flos": 22893411980160.0, "grad_norm": 1.8331229259817032, "language_loss": 0.81738806, "learning_rate": 3.5691980942056356e-06, "loss": 0.83986026, "num_input_tokens_seen": 41925115, "step": 1967, "time_per_iteration": 3.701821804046631 }, { "auxiliary_loss_clip": 0.01210238, "auxiliary_loss_mlp": 0.01033087, "balance_loss_clip": 1.01913321, "balance_loss_mlp": 1.02312708, "epoch": 0.23663800877773103, "flos": 18624531196800.0, "grad_norm": 2.8857030432455435, "language_loss": 0.7938754, "learning_rate": 3.5687150100500332e-06, "loss": 0.81630868, "num_input_tokens_seen": 41944815, "step": 1968, "time_per_iteration": 2.73474383354187 }, { "auxiliary_loss_clip": 0.01210972, "auxiliary_loss_mlp": 0.01036457, "balance_loss_clip": 1.02064061, "balance_loss_mlp": 1.02731943, "epoch": 0.2367582516683701, "flos": 25555828896000.0, "grad_norm": 1.5587543232498293, "language_loss": 0.74564958, "learning_rate": 3.568231687921611e-06, "loss": 0.76812387, "num_input_tokens_seen": 41964990, "step": 1969, "time_per_iteration": 2.6941936016082764 }, { "auxiliary_loss_clip": 0.01207531, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.05856431, "balance_loss_mlp": 1.0314697, "epoch": 0.2368784945590092, "flos": 23295072839040.0, "grad_norm": 1.551897521039097, "language_loss": 0.8035745, "learning_rate": 3.5677481278936883e-06, "loss": 0.82605112, "num_input_tokens_seen": 41984570, "step": 1970, "time_per_iteration": 3.6249213218688965 }, { "auxiliary_loss_clip": 0.01107333, "auxiliary_loss_mlp": 0.01005951, "balance_loss_clip": 0.95319927, "balance_loss_mlp": 1.00199366, "epoch": 0.23699873744964828, "flos": 69859291875840.0, "grad_norm": 0.8647442597834171, "language_loss": 0.57880497, "learning_rate": 3.5672643300396214e-06, "loss": 0.5999378, "num_input_tokens_seen": 42053715, "step": 1971, "time_per_iteration": 3.297166347503662 }, { "auxiliary_loss_clip": 0.01206633, "auxiliary_loss_mlp": 0.01030453, "balance_loss_clip": 0.94384176, "balance_loss_mlp": 1.02118993, "epoch": 0.2371189803402874, "flos": 21835052720640.0, "grad_norm": 2.2955765856366064, "language_loss": 0.67466295, "learning_rate": 3.566780294432802e-06, "loss": 0.69703376, "num_input_tokens_seen": 42070890, "step": 1972, "time_per_iteration": 3.6210803985595703 }, { "auxiliary_loss_clip": 0.01210685, "auxiliary_loss_mlp": 0.01036082, "balance_loss_clip": 1.06120741, "balance_loss_mlp": 1.02693319, "epoch": 0.23723922323092647, "flos": 21908490076800.0, "grad_norm": 2.4129168521884568, "language_loss": 0.74431872, "learning_rate": 3.566296021146657e-06, "loss": 0.7667864, "num_input_tokens_seen": 42090270, "step": 1973, "time_per_iteration": 3.5643422603607178 }, { "auxiliary_loss_clip": 0.01216796, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.06167591, "balance_loss_mlp": 1.02294517, "epoch": 0.23735946612156555, "flos": 32708803380480.0, "grad_norm": 1.7636364025984257, "language_loss": 0.73274374, "learning_rate": 3.565811510254652e-06, "loss": 0.75524187, "num_input_tokens_seen": 42111150, "step": 1974, "time_per_iteration": 2.6954214572906494 }, { "auxiliary_loss_clip": 0.01107995, "auxiliary_loss_mlp": 0.01003512, "balance_loss_clip": 0.99348986, "balance_loss_mlp": 0.99948233, "epoch": 0.23747970901220466, "flos": 70546944821760.0, "grad_norm": 0.8518779138210567, "language_loss": 0.58405733, "learning_rate": 3.5653267618302845e-06, "loss": 0.6051724, "num_input_tokens_seen": 42178730, "step": 1975, "time_per_iteration": 3.298980474472046 }, { "auxiliary_loss_clip": 0.01208419, "auxiliary_loss_mlp": 0.01031726, "balance_loss_clip": 1.05870807, "balance_loss_mlp": 1.02192068, "epoch": 0.23759995190284375, "flos": 20849807594880.0, "grad_norm": 1.7627973998841115, "language_loss": 0.85664028, "learning_rate": 3.564841775947093e-06, "loss": 0.87904167, "num_input_tokens_seen": 42199620, "step": 1976, "time_per_iteration": 2.6340091228485107 }, { "auxiliary_loss_clip": 0.01200336, "auxiliary_loss_mlp": 0.01037281, "balance_loss_clip": 0.94101286, "balance_loss_mlp": 1.02785802, "epoch": 0.23772019479348283, "flos": 32921645420160.0, "grad_norm": 2.1440648570078333, "language_loss": 0.76220375, "learning_rate": 3.5643565526786475e-06, "loss": 0.78457993, "num_input_tokens_seen": 42219560, "step": 1977, "time_per_iteration": 2.8141074180603027 }, { "auxiliary_loss_clip": 0.01212534, "auxiliary_loss_mlp": 0.01034124, "balance_loss_clip": 1.06013918, "balance_loss_mlp": 1.02462947, "epoch": 0.2378404376841219, "flos": 32342765834880.0, "grad_norm": 1.56691203657703, "language_loss": 0.77116048, "learning_rate": 3.5638710920985574e-06, "loss": 0.79362702, "num_input_tokens_seen": 42241020, "step": 1978, "time_per_iteration": 2.6768457889556885 }, { "auxiliary_loss_clip": 0.01215215, "auxiliary_loss_mlp": 0.01127278, "balance_loss_clip": 1.02018106, "balance_loss_mlp": 0.0, "epoch": 0.23796068057476102, "flos": 22997624313600.0, "grad_norm": 1.9444290995809281, "language_loss": 0.82411194, "learning_rate": 3.5633853942804655e-06, "loss": 0.84753692, "num_input_tokens_seen": 42259345, "step": 1979, "time_per_iteration": 2.6388957500457764 }, { "auxiliary_loss_clip": 0.01202177, "auxiliary_loss_mlp": 0.0103163, "balance_loss_clip": 0.93881279, "balance_loss_mlp": 1.02097321, "epoch": 0.2380809234654001, "flos": 13480938414720.0, "grad_norm": 2.6475035324173253, "language_loss": 0.76467484, "learning_rate": 3.5628994592980527e-06, "loss": 0.78701288, "num_input_tokens_seen": 42277250, "step": 1980, "time_per_iteration": 2.721461534500122 }, { "auxiliary_loss_clip": 0.01209874, "auxiliary_loss_mlp": 0.01034756, "balance_loss_clip": 1.05733514, "balance_loss_mlp": 1.02511787, "epoch": 0.2382011663560392, "flos": 16871803148160.0, "grad_norm": 1.9605441783979554, "language_loss": 0.70075393, "learning_rate": 3.562413287225034e-06, "loss": 0.7232002, "num_input_tokens_seen": 42295360, "step": 1981, "time_per_iteration": 2.733931303024292 }, { "auxiliary_loss_clip": 0.01200876, "auxiliary_loss_mlp": 0.01034742, "balance_loss_clip": 1.02041268, "balance_loss_mlp": 1.02564049, "epoch": 0.2383214092466783, "flos": 18441135331200.0, "grad_norm": 2.1228804200429066, "language_loss": 0.89145905, "learning_rate": 3.5619268781351623e-06, "loss": 0.91381526, "num_input_tokens_seen": 42313430, "step": 1982, "time_per_iteration": 2.61442232131958 }, { "auxiliary_loss_clip": 0.01201823, "auxiliary_loss_mlp": 0.01032542, "balance_loss_clip": 0.98317212, "balance_loss_mlp": 1.0231539, "epoch": 0.23844165213731738, "flos": 19755717281280.0, "grad_norm": 1.846370980521908, "language_loss": 0.76861405, "learning_rate": 3.5614402321022256e-06, "loss": 0.79095781, "num_input_tokens_seen": 42331260, "step": 1983, "time_per_iteration": 2.690269947052002 }, { "auxiliary_loss_clip": 0.01198274, "auxiliary_loss_mlp": 0.01031, "balance_loss_clip": 0.90572888, "balance_loss_mlp": 1.02183867, "epoch": 0.23856189502795647, "flos": 23367360960000.0, "grad_norm": 1.7005229654472211, "language_loss": 0.87061244, "learning_rate": 3.5609533492000463e-06, "loss": 0.89290524, "num_input_tokens_seen": 42350150, "step": 1984, "time_per_iteration": 2.707629919052124 }, { "auxiliary_loss_clip": 0.01205355, "auxiliary_loss_mlp": 0.01033384, "balance_loss_clip": 0.98399633, "balance_loss_mlp": 1.02377021, "epoch": 0.23868213791859555, "flos": 23475056912640.0, "grad_norm": 1.9245072465162634, "language_loss": 0.78631723, "learning_rate": 3.560466229502485e-06, "loss": 0.80870461, "num_input_tokens_seen": 42369495, "step": 1985, "time_per_iteration": 2.72959041595459 }, { "auxiliary_loss_clip": 0.01205422, "auxiliary_loss_mlp": 0.01126853, "balance_loss_clip": 0.98594809, "balance_loss_mlp": 0.0, "epoch": 0.23880238080923466, "flos": 16617340224000.0, "grad_norm": 2.0975783418912166, "language_loss": 0.89308393, "learning_rate": 3.5599788730834384e-06, "loss": 0.91640675, "num_input_tokens_seen": 42387455, "step": 1986, "time_per_iteration": 2.705643653869629 }, { "auxiliary_loss_clip": 0.0121252, "auxiliary_loss_mlp": 0.01032104, "balance_loss_clip": 1.02064228, "balance_loss_mlp": 1.02263927, "epoch": 0.23892262369987374, "flos": 17348409734400.0, "grad_norm": 2.2049693942747197, "language_loss": 0.78396034, "learning_rate": 3.559491280016836e-06, "loss": 0.80640662, "num_input_tokens_seen": 42405400, "step": 1987, "time_per_iteration": 2.646941661834717 }, { "auxiliary_loss_clip": 0.01207029, "auxiliary_loss_mlp": 0.0103786, "balance_loss_clip": 0.98173881, "balance_loss_mlp": 1.02828729, "epoch": 0.23904286659051283, "flos": 22309899540480.0, "grad_norm": 3.777917579477249, "language_loss": 0.70889181, "learning_rate": 3.5590034503766465e-06, "loss": 0.73134071, "num_input_tokens_seen": 42425065, "step": 1988, "time_per_iteration": 2.6871087551116943 }, { "auxiliary_loss_clip": 0.01212056, "auxiliary_loss_mlp": 0.01036657, "balance_loss_clip": 1.06129503, "balance_loss_mlp": 1.02712619, "epoch": 0.23916310948115194, "flos": 21178246579200.0, "grad_norm": 2.1725419078712966, "language_loss": 0.80704224, "learning_rate": 3.558515384236874e-06, "loss": 0.82952935, "num_input_tokens_seen": 42442495, "step": 1989, "time_per_iteration": 2.585860252380371 }, { "auxiliary_loss_clip": 0.0119388, "auxiliary_loss_mlp": 0.01126935, "balance_loss_clip": 0.94365597, "balance_loss_mlp": 0.0, "epoch": 0.23928335237179102, "flos": 14137349506560.0, "grad_norm": 1.9484225340322876, "language_loss": 0.83773017, "learning_rate": 3.558027081671556e-06, "loss": 0.86093837, "num_input_tokens_seen": 42459480, "step": 1990, "time_per_iteration": 2.666769504547119 }, { "auxiliary_loss_clip": 0.01210843, "auxiliary_loss_mlp": 0.01032186, "balance_loss_clip": 1.01952219, "balance_loss_mlp": 1.02228618, "epoch": 0.2394035952624301, "flos": 23769596436480.0, "grad_norm": 1.5852778228094264, "language_loss": 0.68374717, "learning_rate": 3.557538542754769e-06, "loss": 0.70617747, "num_input_tokens_seen": 42479175, "step": 1991, "time_per_iteration": 2.6625890731811523 }, { "auxiliary_loss_clip": 0.01212759, "auxiliary_loss_mlp": 0.01032738, "balance_loss_clip": 1.06158638, "balance_loss_mlp": 1.02281392, "epoch": 0.2395238381530692, "flos": 24206198250240.0, "grad_norm": 1.7374880014119651, "language_loss": 0.66494405, "learning_rate": 3.557049767560623e-06, "loss": 0.68739903, "num_input_tokens_seen": 42498090, "step": 1992, "time_per_iteration": 2.5661141872406006 }, { "auxiliary_loss_clip": 0.01207909, "auxiliary_loss_mlp": 0.01031892, "balance_loss_clip": 0.90923107, "balance_loss_mlp": 1.02221179, "epoch": 0.2396440810437083, "flos": 25295763450240.0, "grad_norm": 1.89471125420174, "language_loss": 0.85455143, "learning_rate": 3.5565607561632655e-06, "loss": 0.87694943, "num_input_tokens_seen": 42516930, "step": 1993, "time_per_iteration": 2.757378339767456 }, { "auxiliary_loss_clip": 0.01202676, "auxiliary_loss_mlp": 0.01030474, "balance_loss_clip": 0.9813295, "balance_loss_mlp": 1.02121162, "epoch": 0.23976432393434738, "flos": 28543093436160.0, "grad_norm": 2.916313140511152, "language_loss": 0.79589158, "learning_rate": 3.5560715086368787e-06, "loss": 0.81822312, "num_input_tokens_seen": 42534800, "step": 1994, "time_per_iteration": 3.618104934692383 }, { "auxiliary_loss_clip": 0.01204445, "auxiliary_loss_mlp": 0.01034024, "balance_loss_clip": 0.98472303, "balance_loss_mlp": 1.02414787, "epoch": 0.23988456682498646, "flos": 19494358945920.0, "grad_norm": 2.7170567730372213, "language_loss": 0.82348138, "learning_rate": 3.5555820250556816e-06, "loss": 0.84586608, "num_input_tokens_seen": 42552000, "step": 1995, "time_per_iteration": 2.6277949810028076 }, { "auxiliary_loss_clip": 0.01214153, "auxiliary_loss_mlp": 0.01031933, "balance_loss_clip": 0.98577142, "balance_loss_mlp": 1.02219963, "epoch": 0.24000480971562557, "flos": 20266331068800.0, "grad_norm": 2.1357994395246713, "language_loss": 0.69058406, "learning_rate": 3.5550923054939278e-06, "loss": 0.71304494, "num_input_tokens_seen": 42571455, "step": 1996, "time_per_iteration": 3.6749680042266846 }, { "auxiliary_loss_clip": 0.01195085, "auxiliary_loss_mlp": 0.01029736, "balance_loss_clip": 0.90141469, "balance_loss_mlp": 1.01978791, "epoch": 0.24012505260626466, "flos": 25443176866560.0, "grad_norm": 1.7332193811612189, "language_loss": 0.74163592, "learning_rate": 3.5546023500259083e-06, "loss": 0.76388407, "num_input_tokens_seen": 42592550, "step": 1997, "time_per_iteration": 3.64876651763916 }, { "auxiliary_loss_clip": 0.01207635, "auxiliary_loss_mlp": 0.01036718, "balance_loss_clip": 0.90382409, "balance_loss_mlp": 1.02641881, "epoch": 0.24024529549690374, "flos": 15553342529280.0, "grad_norm": 1.8941887931808712, "language_loss": 0.80497599, "learning_rate": 3.5541121587259477e-06, "loss": 0.82741952, "num_input_tokens_seen": 42610385, "step": 1998, "time_per_iteration": 2.705787181854248 }, { "auxiliary_loss_clip": 0.01098459, "auxiliary_loss_mlp": 0.01005236, "balance_loss_clip": 0.98472667, "balance_loss_mlp": 1.00106323, "epoch": 0.24036553838754285, "flos": 57122351867520.0, "grad_norm": 0.8437609731874023, "language_loss": 0.57933718, "learning_rate": 3.553621731668408e-06, "loss": 0.60037416, "num_input_tokens_seen": 42673595, "step": 1999, "time_per_iteration": 4.1306281089782715 }, { "auxiliary_loss_clip": 0.01199572, "auxiliary_loss_mlp": 0.01035614, "balance_loss_clip": 1.01733947, "balance_loss_mlp": 1.02579117, "epoch": 0.24048578127818193, "flos": 24969946158720.0, "grad_norm": 2.4106558020972835, "language_loss": 0.83204049, "learning_rate": 3.553131068927688e-06, "loss": 0.85439235, "num_input_tokens_seen": 42692000, "step": 2000, "time_per_iteration": 2.721031665802002 }, { "auxiliary_loss_clip": 0.01203742, "auxiliary_loss_mlp": 0.01032025, "balance_loss_clip": 0.9452405, "balance_loss_mlp": 1.02263784, "epoch": 0.24060602416882101, "flos": 23330947547520.0, "grad_norm": 1.8377323935345544, "language_loss": 0.80644834, "learning_rate": 3.552640170578219e-06, "loss": 0.82880598, "num_input_tokens_seen": 42712250, "step": 2001, "time_per_iteration": 2.7184345722198486 }, { "auxiliary_loss_clip": 0.01210632, "auxiliary_loss_mlp": 0.01028709, "balance_loss_clip": 0.98358858, "balance_loss_mlp": 1.01926756, "epoch": 0.2407262670594601, "flos": 14173260128640.0, "grad_norm": 2.0014161580785204, "language_loss": 0.78217226, "learning_rate": 3.5521490366944703e-06, "loss": 0.80456567, "num_input_tokens_seen": 42729900, "step": 2002, "time_per_iteration": 2.6809468269348145 }, { "auxiliary_loss_clip": 0.01205828, "auxiliary_loss_mlp": 0.01038519, "balance_loss_clip": 0.94362271, "balance_loss_mlp": 1.02859497, "epoch": 0.2408465099500992, "flos": 13663113217920.0, "grad_norm": 2.2157647230109676, "language_loss": 0.79591978, "learning_rate": 3.5516576673509474e-06, "loss": 0.81836331, "num_input_tokens_seen": 42747900, "step": 2003, "time_per_iteration": 2.682063341140747 }, { "auxiliary_loss_clip": 0.01212795, "auxiliary_loss_mlp": 0.01030699, "balance_loss_clip": 1.06065404, "balance_loss_mlp": 1.02044094, "epoch": 0.2409667528407383, "flos": 31248029076480.0, "grad_norm": 1.797257743130324, "language_loss": 0.86437595, "learning_rate": 3.5511660626221896e-06, "loss": 0.88681084, "num_input_tokens_seen": 42768540, "step": 2004, "time_per_iteration": 2.715782403945923 }, { "auxiliary_loss_clip": 0.01206517, "auxiliary_loss_mlp": 0.01127082, "balance_loss_clip": 0.98493695, "balance_loss_mlp": 0.0, "epoch": 0.24108699573137737, "flos": 22199941031040.0, "grad_norm": 2.19755403446841, "language_loss": 0.89060664, "learning_rate": 3.5506742225827744e-06, "loss": 0.91394269, "num_input_tokens_seen": 42785395, "step": 2005, "time_per_iteration": 2.656130075454712 }, { "auxiliary_loss_clip": 0.01205325, "auxiliary_loss_mlp": 0.01036325, "balance_loss_clip": 0.94532061, "balance_loss_mlp": 1.02614462, "epoch": 0.24120723862201648, "flos": 26103035664000.0, "grad_norm": 2.042177400651882, "language_loss": 0.90388811, "learning_rate": 3.5501821473073116e-06, "loss": 0.92630458, "num_input_tokens_seen": 42801980, "step": 2006, "time_per_iteration": 2.7583799362182617 }, { "auxiliary_loss_clip": 0.01199161, "auxiliary_loss_mlp": 0.01038923, "balance_loss_clip": 0.94220805, "balance_loss_mlp": 1.02879047, "epoch": 0.24132748151265557, "flos": 18624926246400.0, "grad_norm": 2.4076661779083843, "language_loss": 0.86778963, "learning_rate": 3.54968983687045e-06, "loss": 0.89017045, "num_input_tokens_seen": 42818850, "step": 2007, "time_per_iteration": 2.6498680114746094 }, { "auxiliary_loss_clip": 0.01210583, "auxiliary_loss_mlp": 0.01039554, "balance_loss_clip": 0.98160452, "balance_loss_mlp": 1.02927208, "epoch": 0.24144772440329465, "flos": 15267673664640.0, "grad_norm": 2.1596474975592215, "language_loss": 0.89473492, "learning_rate": 3.549197291346872e-06, "loss": 0.91723627, "num_input_tokens_seen": 42835375, "step": 2008, "time_per_iteration": 2.7770700454711914 }, { "auxiliary_loss_clip": 0.01212009, "auxiliary_loss_mlp": 0.01035891, "balance_loss_clip": 1.02202106, "balance_loss_mlp": 1.02542424, "epoch": 0.24156796729393373, "flos": 24024274842240.0, "grad_norm": 1.8962555513492712, "language_loss": 0.7941131, "learning_rate": 3.548704510811297e-06, "loss": 0.8165921, "num_input_tokens_seen": 42854570, "step": 2009, "time_per_iteration": 2.756913661956787 }, { "auxiliary_loss_clip": 0.0120711, "auxiliary_loss_mlp": 0.01034775, "balance_loss_clip": 0.90300494, "balance_loss_mlp": 1.02409935, "epoch": 0.24168821018457284, "flos": 26286790665600.0, "grad_norm": 3.307973537047587, "language_loss": 0.74971604, "learning_rate": 3.5482114953384787e-06, "loss": 0.7721349, "num_input_tokens_seen": 42873800, "step": 2010, "time_per_iteration": 2.943244218826294 }, { "auxiliary_loss_clip": 0.01213449, "auxiliary_loss_mlp": 0.01045654, "balance_loss_clip": 1.02147281, "balance_loss_mlp": 1.0347048, "epoch": 0.24180845307521193, "flos": 18223193560320.0, "grad_norm": 2.2206375469338306, "language_loss": 0.84246993, "learning_rate": 3.5477182450032077e-06, "loss": 0.86506104, "num_input_tokens_seen": 42892400, "step": 2011, "time_per_iteration": 2.7755966186523438 }, { "auxiliary_loss_clip": 0.01210606, "auxiliary_loss_mlp": 0.0103374, "balance_loss_clip": 1.02172947, "balance_loss_mlp": 1.0238874, "epoch": 0.241928695965851, "flos": 20449260057600.0, "grad_norm": 1.8873259615048978, "language_loss": 0.83347249, "learning_rate": 3.5472247598803097e-06, "loss": 0.8559159, "num_input_tokens_seen": 42911745, "step": 2012, "time_per_iteration": 2.6802282333374023 }, { "auxiliary_loss_clip": 0.01212662, "auxiliary_loss_mlp": 0.01031385, "balance_loss_clip": 1.05877972, "balance_loss_mlp": 1.02091289, "epoch": 0.24204893885649012, "flos": 25556475340800.0, "grad_norm": 2.1195710620846664, "language_loss": 0.85314071, "learning_rate": 3.546731040044645e-06, "loss": 0.8755812, "num_input_tokens_seen": 42926915, "step": 2013, "time_per_iteration": 2.6972367763519287 }, { "auxiliary_loss_clip": 0.01211723, "auxiliary_loss_mlp": 0.01036955, "balance_loss_clip": 1.06096363, "balance_loss_mlp": 1.02716756, "epoch": 0.2421691817471292, "flos": 30660207004800.0, "grad_norm": 1.7612513510694843, "language_loss": 0.7525084, "learning_rate": 3.546237085571112e-06, "loss": 0.77499521, "num_input_tokens_seen": 42945350, "step": 2014, "time_per_iteration": 2.7459871768951416 }, { "auxiliary_loss_clip": 0.01212243, "auxiliary_loss_mlp": 0.01032845, "balance_loss_clip": 1.02228725, "balance_loss_mlp": 1.0228194, "epoch": 0.24228942463776829, "flos": 21945011230080.0, "grad_norm": 2.199610950589123, "language_loss": 0.72109169, "learning_rate": 3.5457428965346425e-06, "loss": 0.74354255, "num_input_tokens_seen": 42964290, "step": 2015, "time_per_iteration": 2.720247745513916 }, { "auxiliary_loss_clip": 0.01199484, "auxiliary_loss_mlp": 0.01035974, "balance_loss_clip": 0.8649714, "balance_loss_mlp": 1.02560902, "epoch": 0.2424096675284074, "flos": 33984493879680.0, "grad_norm": 1.5529082809825987, "language_loss": 0.74759096, "learning_rate": 3.545248473010205e-06, "loss": 0.76994556, "num_input_tokens_seen": 42987095, "step": 2016, "time_per_iteration": 2.9322941303253174 }, { "auxiliary_loss_clip": 0.01215498, "auxiliary_loss_mlp": 0.0112787, "balance_loss_clip": 1.06050479, "balance_loss_mlp": 0.0, "epoch": 0.24252991041904648, "flos": 21653416621440.0, "grad_norm": 1.9871525167902535, "language_loss": 0.87590826, "learning_rate": 3.544753815072802e-06, "loss": 0.89934188, "num_input_tokens_seen": 43005750, "step": 2017, "time_per_iteration": 2.7011592388153076 }, { "auxiliary_loss_clip": 0.01188884, "auxiliary_loss_mlp": 0.01033258, "balance_loss_clip": 0.82024783, "balance_loss_mlp": 1.0234772, "epoch": 0.24265015330968556, "flos": 21870065502720.0, "grad_norm": 1.7732732032346314, "language_loss": 0.88463855, "learning_rate": 3.544258922797474e-06, "loss": 0.90685999, "num_input_tokens_seen": 43023870, "step": 2018, "time_per_iteration": 2.996454954147339 }, { "auxiliary_loss_clip": 0.0121194, "auxiliary_loss_mlp": 0.01028984, "balance_loss_clip": 1.06033754, "balance_loss_mlp": 1.01868439, "epoch": 0.24277039620032465, "flos": 25628260671360.0, "grad_norm": 1.6426917940395172, "language_loss": 0.7828486, "learning_rate": 3.543763796259295e-06, "loss": 0.8052578, "num_input_tokens_seen": 43043825, "step": 2019, "time_per_iteration": 2.9708003997802734 }, { "auxiliary_loss_clip": 0.01209431, "auxiliary_loss_mlp": 0.01036337, "balance_loss_clip": 1.02026367, "balance_loss_mlp": 1.02616882, "epoch": 0.24289063909096376, "flos": 26286575184000.0, "grad_norm": 4.044722657885507, "language_loss": 0.90652138, "learning_rate": 3.5432684355333754e-06, "loss": 0.92897904, "num_input_tokens_seen": 43062480, "step": 2020, "time_per_iteration": 4.003946542739868 }, { "auxiliary_loss_clip": 0.01205209, "auxiliary_loss_mlp": 0.0103416, "balance_loss_clip": 1.01779187, "balance_loss_mlp": 1.02392602, "epoch": 0.24301088198160284, "flos": 25075056332160.0, "grad_norm": 1.734700171456285, "language_loss": 0.76707029, "learning_rate": 3.5427728406948613e-06, "loss": 0.789464, "num_input_tokens_seen": 43081595, "step": 2021, "time_per_iteration": 3.7036006450653076 }, { "auxiliary_loss_clip": 0.01102869, "auxiliary_loss_mlp": 0.01005246, "balance_loss_clip": 0.94731772, "balance_loss_mlp": 1.00124061, "epoch": 0.24313112487224192, "flos": 69900948673920.0, "grad_norm": 0.7512318846014371, "language_loss": 0.57877916, "learning_rate": 3.542277011818934e-06, "loss": 0.59986031, "num_input_tokens_seen": 43145430, "step": 2022, "time_per_iteration": 3.4326343536376953 }, { "auxiliary_loss_clip": 0.01209243, "auxiliary_loss_mlp": 0.01038674, "balance_loss_clip": 0.98375094, "balance_loss_mlp": 1.0287559, "epoch": 0.24325136776288103, "flos": 40662334235520.0, "grad_norm": 2.344934283533545, "language_loss": 0.74009287, "learning_rate": 3.5417809489808104e-06, "loss": 0.76257205, "num_input_tokens_seen": 43167040, "step": 2023, "time_per_iteration": 3.8713412284851074 }, { "auxiliary_loss_clip": 0.01215144, "auxiliary_loss_mlp": 0.01033751, "balance_loss_clip": 1.02334821, "balance_loss_mlp": 1.02384496, "epoch": 0.24337161065352012, "flos": 25046400257280.0, "grad_norm": 1.7336695211372315, "language_loss": 0.7261281, "learning_rate": 3.5412846522557422e-06, "loss": 0.74861705, "num_input_tokens_seen": 43187930, "step": 2024, "time_per_iteration": 2.7504992485046387 }, { "auxiliary_loss_clip": 0.01214714, "auxiliary_loss_mlp": 0.01037689, "balance_loss_clip": 1.06131959, "balance_loss_mlp": 1.02755022, "epoch": 0.2434918535441592, "flos": 18661160090880.0, "grad_norm": 2.058402637752043, "language_loss": 0.74304259, "learning_rate": 3.540788121719018e-06, "loss": 0.76556659, "num_input_tokens_seen": 43206350, "step": 2025, "time_per_iteration": 3.507833957672119 }, { "auxiliary_loss_clip": 0.01203447, "auxiliary_loss_mlp": 0.01034136, "balance_loss_clip": 0.94440126, "balance_loss_mlp": 1.0247786, "epoch": 0.24361209643479828, "flos": 23915142345600.0, "grad_norm": 2.2225543535036576, "language_loss": 0.82104039, "learning_rate": 3.5402913574459604e-06, "loss": 0.84341627, "num_input_tokens_seen": 43226255, "step": 2026, "time_per_iteration": 2.7942538261413574 }, { "auxiliary_loss_clip": 0.0119346, "auxiliary_loss_mlp": 0.01041659, "balance_loss_clip": 0.8617636, "balance_loss_mlp": 1.03078079, "epoch": 0.2437323393254374, "flos": 28657505232000.0, "grad_norm": 1.7736528777518572, "language_loss": 0.86041451, "learning_rate": 3.5397943595119297e-06, "loss": 0.88276565, "num_input_tokens_seen": 43247675, "step": 2027, "time_per_iteration": 2.9448516368865967 }, { "auxiliary_loss_clip": 0.01206682, "auxiliary_loss_mlp": 0.01039632, "balance_loss_clip": 0.98353934, "balance_loss_mlp": 1.02916527, "epoch": 0.24385258221607647, "flos": 23550325862400.0, "grad_norm": 2.5922447509621342, "language_loss": 0.77191299, "learning_rate": 3.5392971279923177e-06, "loss": 0.79437613, "num_input_tokens_seen": 43265895, "step": 2028, "time_per_iteration": 2.744534492492676 }, { "auxiliary_loss_clip": 0.01197115, "auxiliary_loss_mlp": 0.01038727, "balance_loss_clip": 0.93913567, "balance_loss_mlp": 1.02803445, "epoch": 0.24397282510671556, "flos": 25336091445120.0, "grad_norm": 2.672280615006205, "language_loss": 0.83068585, "learning_rate": 3.5387996629625557e-06, "loss": 0.85304427, "num_input_tokens_seen": 43283485, "step": 2029, "time_per_iteration": 2.7704854011535645 }, { "auxiliary_loss_clip": 0.01097937, "auxiliary_loss_mlp": 0.0100874, "balance_loss_clip": 1.01923919, "balance_loss_mlp": 1.00473464, "epoch": 0.24409306799735467, "flos": 65187421430400.0, "grad_norm": 0.7989162428947055, "language_loss": 0.55044663, "learning_rate": 3.5383019644981083e-06, "loss": 0.57151341, "num_input_tokens_seen": 43347180, "step": 2030, "time_per_iteration": 3.2304675579071045 }, { "auxiliary_loss_clip": 0.01206944, "auxiliary_loss_mlp": 0.01035465, "balance_loss_clip": 0.98283648, "balance_loss_mlp": 1.02526665, "epoch": 0.24421331088799375, "flos": 19537093152000.0, "grad_norm": 2.1799924698345547, "language_loss": 0.72494543, "learning_rate": 3.5378040326744763e-06, "loss": 0.74736953, "num_input_tokens_seen": 43366665, "step": 2031, "time_per_iteration": 2.7093863487243652 }, { "auxiliary_loss_clip": 0.01210593, "auxiliary_loss_mlp": 0.01032014, "balance_loss_clip": 0.94727021, "balance_loss_mlp": 1.02245378, "epoch": 0.24433355377863283, "flos": 21068575378560.0, "grad_norm": 2.350709544636353, "language_loss": 0.85747433, "learning_rate": 3.5373058675671946e-06, "loss": 0.87990034, "num_input_tokens_seen": 43384670, "step": 2032, "time_per_iteration": 2.8138840198516846 }, { "auxiliary_loss_clip": 0.01196072, "auxiliary_loss_mlp": 0.01037506, "balance_loss_clip": 0.90387845, "balance_loss_mlp": 1.02640724, "epoch": 0.24445379666927192, "flos": 22637189289600.0, "grad_norm": 1.9224111434373632, "language_loss": 0.72079301, "learning_rate": 3.536807469251836e-06, "loss": 0.74312878, "num_input_tokens_seen": 43403825, "step": 2033, "time_per_iteration": 2.817232608795166 }, { "auxiliary_loss_clip": 0.01212414, "auxiliary_loss_mlp": 0.01043016, "balance_loss_clip": 0.94222164, "balance_loss_mlp": 1.03303814, "epoch": 0.24457403955991103, "flos": 21251612108160.0, "grad_norm": 2.2380833681407637, "language_loss": 0.82671916, "learning_rate": 3.5363088378040055e-06, "loss": 0.84927344, "num_input_tokens_seen": 43422715, "step": 2034, "time_per_iteration": 2.7257742881774902 }, { "auxiliary_loss_clip": 0.01095975, "auxiliary_loss_mlp": 0.01121873, "balance_loss_clip": 1.01793671, "balance_loss_mlp": 0.0, "epoch": 0.2446942824505501, "flos": 66997820764800.0, "grad_norm": 0.7579512173623888, "language_loss": 0.64441603, "learning_rate": 3.5358099732993463e-06, "loss": 0.66659451, "num_input_tokens_seen": 43481825, "step": 2035, "time_per_iteration": 3.134211778640747 }, { "auxiliary_loss_clip": 0.01217857, "auxiliary_loss_mlp": 0.01037283, "balance_loss_clip": 0.98493403, "balance_loss_mlp": 1.02748382, "epoch": 0.2448145253411892, "flos": 20411122792320.0, "grad_norm": 2.2415348663816017, "language_loss": 0.89688253, "learning_rate": 3.535310875813535e-06, "loss": 0.91943395, "num_input_tokens_seen": 43500220, "step": 2036, "time_per_iteration": 2.8277268409729004 }, { "auxiliary_loss_clip": 0.01211155, "auxiliary_loss_mlp": 0.01035132, "balance_loss_clip": 1.02271271, "balance_loss_mlp": 1.02524304, "epoch": 0.2449347682318283, "flos": 28804739080320.0, "grad_norm": 2.1607381834106247, "language_loss": 0.8184334, "learning_rate": 3.5348115454222843e-06, "loss": 0.84089625, "num_input_tokens_seen": 43522805, "step": 2037, "time_per_iteration": 2.7314867973327637 }, { "auxiliary_loss_clip": 0.01200435, "auxiliary_loss_mlp": 0.01033426, "balance_loss_clip": 0.9792279, "balance_loss_mlp": 1.02377021, "epoch": 0.2450550111224674, "flos": 22528990546560.0, "grad_norm": 1.8383910242816262, "language_loss": 0.85914183, "learning_rate": 3.5343119822013425e-06, "loss": 0.88148046, "num_input_tokens_seen": 43541915, "step": 2038, "time_per_iteration": 2.8153669834136963 }, { "auxiliary_loss_clip": 0.01218155, "auxiliary_loss_mlp": 0.01033107, "balance_loss_clip": 1.02306533, "balance_loss_mlp": 1.02289701, "epoch": 0.24517525401310647, "flos": 21759137326080.0, "grad_norm": 2.116930449001057, "language_loss": 0.77715528, "learning_rate": 3.533812186226493e-06, "loss": 0.79966784, "num_input_tokens_seen": 43562625, "step": 2039, "time_per_iteration": 2.686563730239868 }, { "auxiliary_loss_clip": 0.01208204, "auxiliary_loss_mlp": 0.01036378, "balance_loss_clip": 1.05719185, "balance_loss_mlp": 1.0267874, "epoch": 0.24529549690374555, "flos": 25043311687680.0, "grad_norm": 1.6420343726359952, "language_loss": 0.75903183, "learning_rate": 3.5333121575735545e-06, "loss": 0.78147769, "num_input_tokens_seen": 43582265, "step": 2040, "time_per_iteration": 2.7573282718658447 }, { "auxiliary_loss_clip": 0.01210378, "auxiliary_loss_mlp": 0.01036222, "balance_loss_clip": 0.98630714, "balance_loss_mlp": 1.02656007, "epoch": 0.24541573979438466, "flos": 32123638915200.0, "grad_norm": 1.7507122203496492, "language_loss": 0.75519371, "learning_rate": 3.532811896318381e-06, "loss": 0.77765965, "num_input_tokens_seen": 43604335, "step": 2041, "time_per_iteration": 2.7495954036712646 }, { "auxiliary_loss_clip": 0.01211569, "auxiliary_loss_mlp": 0.01041104, "balance_loss_clip": 0.9458878, "balance_loss_mlp": 1.03043532, "epoch": 0.24553598268502375, "flos": 31357556622720.0, "grad_norm": 2.077029119138681, "language_loss": 0.81481451, "learning_rate": 3.5323114025368615e-06, "loss": 0.83734125, "num_input_tokens_seen": 43619400, "step": 2042, "time_per_iteration": 2.7708146572113037 }, { "auxiliary_loss_clip": 0.01204284, "auxiliary_loss_mlp": 0.01032676, "balance_loss_clip": 1.01871717, "balance_loss_mlp": 1.02329397, "epoch": 0.24565622557566283, "flos": 14027462824320.0, "grad_norm": 2.194179962965572, "language_loss": 0.81739795, "learning_rate": 3.53181067630492e-06, "loss": 0.83976758, "num_input_tokens_seen": 43636870, "step": 2043, "time_per_iteration": 2.641692876815796 }, { "auxiliary_loss_clip": 0.01194432, "auxiliary_loss_mlp": 0.01041005, "balance_loss_clip": 0.97891057, "balance_loss_mlp": 1.03139687, "epoch": 0.24577646846630194, "flos": 16581465515520.0, "grad_norm": 1.7336122143621808, "language_loss": 0.76035231, "learning_rate": 3.5313097176985175e-06, "loss": 0.78270662, "num_input_tokens_seen": 43655180, "step": 2044, "time_per_iteration": 2.6629064083099365 }, { "auxiliary_loss_clip": 0.0120785, "auxiliary_loss_mlp": 0.01031497, "balance_loss_clip": 1.02024519, "balance_loss_mlp": 1.02208591, "epoch": 0.24589671135694102, "flos": 18807424272000.0, "grad_norm": 1.6770602208456649, "language_loss": 0.81028503, "learning_rate": 3.5308085267936482e-06, "loss": 0.8326785, "num_input_tokens_seen": 43672895, "step": 2045, "time_per_iteration": 2.68819522857666 }, { "auxiliary_loss_clip": 0.01207524, "auxiliary_loss_mlp": 0.0112665, "balance_loss_clip": 0.87041652, "balance_loss_mlp": 0.0, "epoch": 0.2460169542475801, "flos": 19938538529280.0, "grad_norm": 1.657533331850931, "language_loss": 0.8963623, "learning_rate": 3.530307103666342e-06, "loss": 0.91970402, "num_input_tokens_seen": 43691975, "step": 2046, "time_per_iteration": 3.685054302215576 }, { "auxiliary_loss_clip": 0.01213591, "auxiliary_loss_mlp": 0.0103172, "balance_loss_clip": 0.94706964, "balance_loss_mlp": 1.02240419, "epoch": 0.24613719713821922, "flos": 24171221381760.0, "grad_norm": 1.6635163363659655, "language_loss": 0.80076563, "learning_rate": 3.5298054483926658e-06, "loss": 0.82321882, "num_input_tokens_seen": 43712670, "step": 2047, "time_per_iteration": 3.7662594318389893 }, { "auxiliary_loss_clip": 0.01217895, "auxiliary_loss_mlp": 0.01036399, "balance_loss_clip": 1.02231419, "balance_loss_mlp": 1.02674317, "epoch": 0.2462574400288583, "flos": 30221055325440.0, "grad_norm": 2.1353444423697368, "language_loss": 0.83018672, "learning_rate": 3.5293035610487187e-06, "loss": 0.85272968, "num_input_tokens_seen": 43732035, "step": 2048, "time_per_iteration": 2.772832155227661 }, { "auxiliary_loss_clip": 0.01096477, "auxiliary_loss_mlp": 0.01005569, "balance_loss_clip": 0.94342303, "balance_loss_mlp": 1.00168276, "epoch": 0.24637768291949738, "flos": 68943030819840.0, "grad_norm": 0.7254184188327265, "language_loss": 0.61998963, "learning_rate": 3.5288014417106374e-06, "loss": 0.64101005, "num_input_tokens_seen": 43798055, "step": 2049, "time_per_iteration": 4.145323991775513 }, { "auxiliary_loss_clip": 0.01209367, "auxiliary_loss_mlp": 0.01038228, "balance_loss_clip": 0.94690299, "balance_loss_mlp": 1.02778554, "epoch": 0.24649792581013646, "flos": 34383999922560.0, "grad_norm": 1.8862779454989513, "language_loss": 0.75927883, "learning_rate": 3.528299090454593e-06, "loss": 0.78175479, "num_input_tokens_seen": 43818590, "step": 2050, "time_per_iteration": 2.8868303298950195 }, { "auxiliary_loss_clip": 0.01211174, "auxiliary_loss_mlp": 0.0103141, "balance_loss_clip": 1.02055609, "balance_loss_mlp": 1.02177179, "epoch": 0.24661816870077558, "flos": 19680448331520.0, "grad_norm": 2.1300638633757707, "language_loss": 0.82629037, "learning_rate": 3.527796507356792e-06, "loss": 0.8487162, "num_input_tokens_seen": 43832480, "step": 2051, "time_per_iteration": 3.6198537349700928 }, { "auxiliary_loss_clip": 0.01214156, "auxiliary_loss_mlp": 0.01034811, "balance_loss_clip": 1.02142119, "balance_loss_mlp": 1.02482092, "epoch": 0.24673841159141466, "flos": 20002279213440.0, "grad_norm": 2.34299138043537, "language_loss": 0.89954734, "learning_rate": 3.527293692493475e-06, "loss": 0.92203701, "num_input_tokens_seen": 43848345, "step": 2052, "time_per_iteration": 2.617431640625 }, { "auxiliary_loss_clip": 0.01215774, "auxiliary_loss_mlp": 0.01034084, "balance_loss_clip": 1.02182758, "balance_loss_mlp": 1.02342045, "epoch": 0.24685865448205374, "flos": 21646593037440.0, "grad_norm": 2.396182849457201, "language_loss": 0.73373985, "learning_rate": 3.52679064594092e-06, "loss": 0.75623846, "num_input_tokens_seen": 43865685, "step": 2053, "time_per_iteration": 2.680549144744873 }, { "auxiliary_loss_clip": 0.01182357, "auxiliary_loss_mlp": 0.01029814, "balance_loss_clip": 0.89531672, "balance_loss_mlp": 1.02040267, "epoch": 0.24697889737269285, "flos": 17960470508160.0, "grad_norm": 2.5951614277909094, "language_loss": 0.75062215, "learning_rate": 3.5262873677754375e-06, "loss": 0.77274388, "num_input_tokens_seen": 43883690, "step": 2054, "time_per_iteration": 2.7273223400115967 }, { "auxiliary_loss_clip": 0.01210984, "auxiliary_loss_mlp": 0.01034656, "balance_loss_clip": 1.06062293, "balance_loss_mlp": 1.02468967, "epoch": 0.24709914026333193, "flos": 27344611221120.0, "grad_norm": 1.6835703658090593, "language_loss": 0.80820024, "learning_rate": 3.5257838580733745e-06, "loss": 0.83065665, "num_input_tokens_seen": 43903295, "step": 2055, "time_per_iteration": 2.6425247192382812 }, { "auxiliary_loss_clip": 0.01216195, "auxiliary_loss_mlp": 0.01029455, "balance_loss_clip": 1.02330661, "balance_loss_mlp": 1.01962614, "epoch": 0.24721938315397102, "flos": 19275519335040.0, "grad_norm": 1.8576899491318886, "language_loss": 0.87500238, "learning_rate": 3.5252801169111138e-06, "loss": 0.89745891, "num_input_tokens_seen": 43920960, "step": 2056, "time_per_iteration": 2.6594161987304688 }, { "auxiliary_loss_clip": 0.01209959, "auxiliary_loss_mlp": 0.01039346, "balance_loss_clip": 0.98818022, "balance_loss_mlp": 1.03027987, "epoch": 0.2473396260446101, "flos": 23185796688000.0, "grad_norm": 1.7944709763312607, "language_loss": 0.79893613, "learning_rate": 3.524776144365072e-06, "loss": 0.82142919, "num_input_tokens_seen": 43939415, "step": 2057, "time_per_iteration": 2.733013153076172 }, { "auxiliary_loss_clip": 0.01204865, "auxiliary_loss_mlp": 0.0103525, "balance_loss_clip": 0.98606968, "balance_loss_mlp": 1.0258919, "epoch": 0.2474598689352492, "flos": 21142443697920.0, "grad_norm": 1.5025234200700375, "language_loss": 0.79140091, "learning_rate": 3.5242719405117016e-06, "loss": 0.81380212, "num_input_tokens_seen": 43959220, "step": 2058, "time_per_iteration": 2.8153765201568604 }, { "auxiliary_loss_clip": 0.01209449, "auxiliary_loss_mlp": 0.01126746, "balance_loss_clip": 0.9826262, "balance_loss_mlp": 0.0, "epoch": 0.2475801118258883, "flos": 21648352803840.0, "grad_norm": 2.3422623753109884, "language_loss": 0.7490654, "learning_rate": 3.5237675054274893e-06, "loss": 0.77242732, "num_input_tokens_seen": 43978420, "step": 2059, "time_per_iteration": 2.7395637035369873 }, { "auxiliary_loss_clip": 0.01211807, "auxiliary_loss_mlp": 0.0103555, "balance_loss_clip": 1.0224762, "balance_loss_mlp": 1.02505398, "epoch": 0.24770035471652738, "flos": 22674500542080.0, "grad_norm": 1.8542066978734073, "language_loss": 0.80490577, "learning_rate": 3.5232628391889584e-06, "loss": 0.82737935, "num_input_tokens_seen": 43996710, "step": 2060, "time_per_iteration": 2.6658411026000977 }, { "auxiliary_loss_clip": 0.01200361, "auxiliary_loss_mlp": 0.01031259, "balance_loss_clip": 0.90418577, "balance_loss_mlp": 1.02169204, "epoch": 0.2478205976071665, "flos": 22163814927360.0, "grad_norm": 2.1530275543680757, "language_loss": 0.64464694, "learning_rate": 3.522757941872666e-06, "loss": 0.66696316, "num_input_tokens_seen": 44014865, "step": 2061, "time_per_iteration": 2.745112180709839 }, { "auxiliary_loss_clip": 0.01215739, "auxiliary_loss_mlp": 0.01126873, "balance_loss_clip": 1.06433666, "balance_loss_mlp": 0.0, "epoch": 0.24794084049780557, "flos": 24973106555520.0, "grad_norm": 1.4852977216231333, "language_loss": 0.82836366, "learning_rate": 3.5222528135552042e-06, "loss": 0.85178983, "num_input_tokens_seen": 44036325, "step": 2062, "time_per_iteration": 2.705659866333008 }, { "auxiliary_loss_clip": 0.0120841, "auxiliary_loss_mlp": 0.0104401, "balance_loss_clip": 1.02283335, "balance_loss_mlp": 1.03421068, "epoch": 0.24806108338844465, "flos": 18296379521280.0, "grad_norm": 2.449608371944514, "language_loss": 0.80376256, "learning_rate": 3.521747454313201e-06, "loss": 0.82628673, "num_input_tokens_seen": 44055005, "step": 2063, "time_per_iteration": 2.6168456077575684 }, { "auxiliary_loss_clip": 0.01192622, "auxiliary_loss_mlp": 0.01031851, "balance_loss_clip": 0.93918836, "balance_loss_mlp": 1.02202797, "epoch": 0.24818132627908374, "flos": 19282163351040.0, "grad_norm": 2.0332629703209197, "language_loss": 0.66557342, "learning_rate": 3.521241864223319e-06, "loss": 0.68781811, "num_input_tokens_seen": 44073965, "step": 2064, "time_per_iteration": 2.743298053741455 }, { "auxiliary_loss_clip": 0.01107515, "auxiliary_loss_mlp": 0.01006717, "balance_loss_clip": 0.94847846, "balance_loss_mlp": 1.00256813, "epoch": 0.24830156916972285, "flos": 70285837881600.0, "grad_norm": 1.6419764200436167, "language_loss": 0.61970216, "learning_rate": 3.5207360433622552e-06, "loss": 0.64084446, "num_input_tokens_seen": 44135965, "step": 2065, "time_per_iteration": 3.2501697540283203 }, { "auxiliary_loss_clip": 0.0120243, "auxiliary_loss_mlp": 0.0103347, "balance_loss_clip": 0.98414892, "balance_loss_mlp": 1.02432084, "epoch": 0.24842181206036193, "flos": 40409128287360.0, "grad_norm": 2.647432441874539, "language_loss": 0.74689573, "learning_rate": 3.5202299918067437e-06, "loss": 0.76925474, "num_input_tokens_seen": 44159560, "step": 2066, "time_per_iteration": 2.888288974761963 }, { "auxiliary_loss_clip": 0.0120999, "auxiliary_loss_mlp": 0.01036712, "balance_loss_clip": 1.02366197, "balance_loss_mlp": 1.02718067, "epoch": 0.248542054951001, "flos": 20082432412800.0, "grad_norm": 2.0453315565993564, "language_loss": 0.69538456, "learning_rate": 3.519723709633551e-06, "loss": 0.71785152, "num_input_tokens_seen": 44178320, "step": 2067, "time_per_iteration": 2.7051033973693848 }, { "auxiliary_loss_clip": 0.01198042, "auxiliary_loss_mlp": 0.01044641, "balance_loss_clip": 0.98072773, "balance_loss_mlp": 1.03535438, "epoch": 0.24866229784164012, "flos": 23513948363520.0, "grad_norm": 1.91132867138579, "language_loss": 0.83948433, "learning_rate": 3.519217196919479e-06, "loss": 0.86191112, "num_input_tokens_seen": 44197305, "step": 2068, "time_per_iteration": 2.6918416023254395 }, { "auxiliary_loss_clip": 0.01216061, "auxiliary_loss_mlp": 0.0103939, "balance_loss_clip": 0.98799431, "balance_loss_mlp": 1.02976334, "epoch": 0.2487825407322792, "flos": 19865101173120.0, "grad_norm": 1.7594849288946168, "language_loss": 0.73587084, "learning_rate": 3.518710453741367e-06, "loss": 0.7584253, "num_input_tokens_seen": 44216505, "step": 2069, "time_per_iteration": 2.7214295864105225 }, { "auxiliary_loss_clip": 0.01196988, "auxiliary_loss_mlp": 0.01126919, "balance_loss_clip": 0.9802897, "balance_loss_mlp": 0.0, "epoch": 0.2489027836229183, "flos": 22017622573440.0, "grad_norm": 1.8458286977216558, "language_loss": 0.67174858, "learning_rate": 3.518203480176086e-06, "loss": 0.69498765, "num_input_tokens_seen": 44235435, "step": 2070, "time_per_iteration": 2.6556503772735596 }, { "auxiliary_loss_clip": 0.01199636, "auxiliary_loss_mlp": 0.01036307, "balance_loss_clip": 0.82641232, "balance_loss_mlp": 1.02619219, "epoch": 0.2490230265135574, "flos": 23294354567040.0, "grad_norm": 1.709545456386331, "language_loss": 0.80704391, "learning_rate": 3.517696276300545e-06, "loss": 0.82940334, "num_input_tokens_seen": 44256975, "step": 2071, "time_per_iteration": 3.7950212955474854 }, { "auxiliary_loss_clip": 0.01212946, "auxiliary_loss_mlp": 0.01038937, "balance_loss_clip": 1.02526736, "balance_loss_mlp": 1.02935219, "epoch": 0.24914326940419648, "flos": 19826784339840.0, "grad_norm": 2.4441314714636144, "language_loss": 0.69019783, "learning_rate": 3.517188842191685e-06, "loss": 0.7127167, "num_input_tokens_seen": 44275125, "step": 2072, "time_per_iteration": 2.894369602203369 }, { "auxiliary_loss_clip": 0.0120542, "auxiliary_loss_mlp": 0.01034955, "balance_loss_clip": 1.02011919, "balance_loss_mlp": 1.02449489, "epoch": 0.24926351229483557, "flos": 20229271211520.0, "grad_norm": 1.5301933281604791, "language_loss": 0.73674059, "learning_rate": 3.5166811779264837e-06, "loss": 0.75914431, "num_input_tokens_seen": 44295445, "step": 2073, "time_per_iteration": 3.5498745441436768 }, { "auxiliary_loss_clip": 0.01211696, "auxiliary_loss_mlp": 0.01031575, "balance_loss_clip": 1.0595727, "balance_loss_mlp": 1.02220476, "epoch": 0.24938375518547465, "flos": 23294570048640.0, "grad_norm": 2.0855764772466103, "language_loss": 0.77979892, "learning_rate": 3.5161732835819545e-06, "loss": 0.80223167, "num_input_tokens_seen": 44314755, "step": 2074, "time_per_iteration": 2.6742167472839355 }, { "auxiliary_loss_clip": 0.01210522, "auxiliary_loss_mlp": 0.01039874, "balance_loss_clip": 1.06110525, "balance_loss_mlp": 1.02996826, "epoch": 0.24950399807611376, "flos": 17311673099520.0, "grad_norm": 1.807864446588758, "language_loss": 0.83288741, "learning_rate": 3.515665159235143e-06, "loss": 0.85539126, "num_input_tokens_seen": 44333640, "step": 2075, "time_per_iteration": 3.459698438644409 }, { "auxiliary_loss_clip": 0.01202635, "auxiliary_loss_mlp": 0.01035653, "balance_loss_clip": 0.97899848, "balance_loss_mlp": 1.02658141, "epoch": 0.24962424096675284, "flos": 19024863252480.0, "grad_norm": 3.8722612385203683, "language_loss": 0.74761462, "learning_rate": 3.5151568049631318e-06, "loss": 0.76999748, "num_input_tokens_seen": 44352355, "step": 2076, "time_per_iteration": 2.7302725315093994 }, { "auxiliary_loss_clip": 0.01212605, "auxiliary_loss_mlp": 0.01032599, "balance_loss_clip": 1.06047201, "balance_loss_mlp": 1.02355099, "epoch": 0.24974448385739192, "flos": 33398790710400.0, "grad_norm": 1.664052621643117, "language_loss": 0.80392861, "learning_rate": 3.5146482208430385e-06, "loss": 0.82638073, "num_input_tokens_seen": 44374185, "step": 2077, "time_per_iteration": 3.616485834121704 }, { "auxiliary_loss_clip": 0.01184536, "auxiliary_loss_mlp": 0.01035715, "balance_loss_clip": 0.86091495, "balance_loss_mlp": 1.02554595, "epoch": 0.24986472674803104, "flos": 30007279532160.0, "grad_norm": 2.5343676886405886, "language_loss": 0.67989904, "learning_rate": 3.514139406952014e-06, "loss": 0.70210153, "num_input_tokens_seen": 44396210, "step": 2078, "time_per_iteration": 3.1919617652893066 }, { "auxiliary_loss_clip": 0.01206733, "auxiliary_loss_mlp": 0.01037847, "balance_loss_clip": 1.02172399, "balance_loss_mlp": 1.02763116, "epoch": 0.24998496963867012, "flos": 26613074833920.0, "grad_norm": 1.9342419458132785, "language_loss": 0.83680356, "learning_rate": 3.5136303633672454e-06, "loss": 0.85924935, "num_input_tokens_seen": 44416340, "step": 2079, "time_per_iteration": 3.2905945777893066 }, { "auxiliary_loss_clip": 0.01216763, "auxiliary_loss_mlp": 0.01126929, "balance_loss_clip": 0.947914, "balance_loss_mlp": 0.0, "epoch": 0.25010521252930923, "flos": 23553989049600.0, "grad_norm": 1.5060235980330963, "language_loss": 0.74460858, "learning_rate": 3.5131210901659544e-06, "loss": 0.76804554, "num_input_tokens_seen": 44438095, "step": 2080, "time_per_iteration": 2.8780815601348877 }, { "auxiliary_loss_clip": 0.01199089, "auxiliary_loss_mlp": 0.01042254, "balance_loss_clip": 0.94193208, "balance_loss_mlp": 1.03234172, "epoch": 0.2502254554199483, "flos": 23441193365760.0, "grad_norm": 2.260897106185283, "language_loss": 0.81965435, "learning_rate": 3.5126115874253967e-06, "loss": 0.84206772, "num_input_tokens_seen": 44457650, "step": 2081, "time_per_iteration": 2.7225847244262695 }, { "auxiliary_loss_clip": 0.0120621, "auxiliary_loss_mlp": 0.01039595, "balance_loss_clip": 0.94526237, "balance_loss_mlp": 1.02983212, "epoch": 0.2503456983105874, "flos": 28761681651840.0, "grad_norm": 2.2550224649061206, "language_loss": 0.80751193, "learning_rate": 3.5121018552228644e-06, "loss": 0.82996994, "num_input_tokens_seen": 44476155, "step": 2082, "time_per_iteration": 2.893188714981079 }, { "auxiliary_loss_clip": 0.01204034, "auxiliary_loss_mlp": 0.01030239, "balance_loss_clip": 0.94288117, "balance_loss_mlp": 1.02078009, "epoch": 0.2504659412012265, "flos": 18770256673920.0, "grad_norm": 1.9301046222221194, "language_loss": 0.76155543, "learning_rate": 3.5115918936356827e-06, "loss": 0.78389823, "num_input_tokens_seen": 44492910, "step": 2083, "time_per_iteration": 2.7424864768981934 }, { "auxiliary_loss_clip": 0.01182174, "auxiliary_loss_mlp": 0.01046404, "balance_loss_clip": 0.94123578, "balance_loss_mlp": 1.03713584, "epoch": 0.25058618409186556, "flos": 16873383346560.0, "grad_norm": 1.9575949611446293, "language_loss": 0.78416324, "learning_rate": 3.5110817027412123e-06, "loss": 0.80644906, "num_input_tokens_seen": 44512000, "step": 2084, "time_per_iteration": 2.7290802001953125 }, { "auxiliary_loss_clip": 0.01196646, "auxiliary_loss_mlp": 0.01033333, "balance_loss_clip": 0.93950957, "balance_loss_mlp": 1.02371907, "epoch": 0.25070642698250467, "flos": 24425540651520.0, "grad_norm": 2.063123452147371, "language_loss": 0.69126201, "learning_rate": 3.5105712826168493e-06, "loss": 0.71356183, "num_input_tokens_seen": 44531650, "step": 2085, "time_per_iteration": 2.689906358718872 }, { "auxiliary_loss_clip": 0.01202928, "auxiliary_loss_mlp": 0.01126102, "balance_loss_clip": 1.01812768, "balance_loss_mlp": 0.0, "epoch": 0.2508266698731437, "flos": 20260944028800.0, "grad_norm": 2.1091128885529433, "language_loss": 0.71170026, "learning_rate": 3.5100606333400235e-06, "loss": 0.73499054, "num_input_tokens_seen": 44548785, "step": 2086, "time_per_iteration": 2.6940767765045166 }, { "auxiliary_loss_clip": 0.01218207, "auxiliary_loss_mlp": 0.01040386, "balance_loss_clip": 0.98225343, "balance_loss_mlp": 1.030128, "epoch": 0.25094691276378284, "flos": 19245318975360.0, "grad_norm": 2.387041902974419, "language_loss": 0.76883632, "learning_rate": 3.5095497549882006e-06, "loss": 0.79142225, "num_input_tokens_seen": 44567230, "step": 2087, "time_per_iteration": 2.665356159210205 }, { "auxiliary_loss_clip": 0.01212017, "auxiliary_loss_mlp": 0.01032192, "balance_loss_clip": 1.02450919, "balance_loss_mlp": 1.02217829, "epoch": 0.25106715565442195, "flos": 26943237671040.0, "grad_norm": 3.5966978722091847, "language_loss": 0.72727752, "learning_rate": 3.50903864763888e-06, "loss": 0.74971962, "num_input_tokens_seen": 44588020, "step": 2088, "time_per_iteration": 2.7327752113342285 }, { "auxiliary_loss_clip": 0.01209266, "auxiliary_loss_mlp": 0.01034601, "balance_loss_clip": 1.01895666, "balance_loss_mlp": 1.0244863, "epoch": 0.251187398545061, "flos": 48359570572800.0, "grad_norm": 3.4600309545828134, "language_loss": 0.75788581, "learning_rate": 3.5085273113695965e-06, "loss": 0.78032446, "num_input_tokens_seen": 44612590, "step": 2089, "time_per_iteration": 2.9024386405944824 }, { "auxiliary_loss_clip": 0.01210249, "auxiliary_loss_mlp": 0.010359, "balance_loss_clip": 1.06020224, "balance_loss_mlp": 1.02620864, "epoch": 0.2513076414357001, "flos": 27016100409600.0, "grad_norm": 1.6497733362181726, "language_loss": 0.77961576, "learning_rate": 3.508015746257919e-06, "loss": 0.80207729, "num_input_tokens_seen": 44631630, "step": 2090, "time_per_iteration": 2.6504225730895996 }, { "auxiliary_loss_clip": 0.01208396, "auxiliary_loss_mlp": 0.01040018, "balance_loss_clip": 0.94371223, "balance_loss_mlp": 1.03029656, "epoch": 0.2514278843263392, "flos": 19463619882240.0, "grad_norm": 1.8320414077523788, "language_loss": 0.83397746, "learning_rate": 3.5075039523814518e-06, "loss": 0.85646164, "num_input_tokens_seen": 44650820, "step": 2091, "time_per_iteration": 2.7259299755096436 }, { "auxiliary_loss_clip": 0.01212304, "auxiliary_loss_mlp": 0.01032631, "balance_loss_clip": 1.0193727, "balance_loss_mlp": 1.02309418, "epoch": 0.2515481272169783, "flos": 16866092885760.0, "grad_norm": 1.9253695756529305, "language_loss": 0.81652701, "learning_rate": 3.506991929817834e-06, "loss": 0.83897638, "num_input_tokens_seen": 44667540, "step": 2092, "time_per_iteration": 2.6610617637634277 }, { "auxiliary_loss_clip": 0.0120915, "auxiliary_loss_mlp": 0.01040913, "balance_loss_clip": 1.06241417, "balance_loss_mlp": 1.03200758, "epoch": 0.2516683701076174, "flos": 23732464752000.0, "grad_norm": 2.9876520752072038, "language_loss": 0.82878947, "learning_rate": 3.506479678644738e-06, "loss": 0.85129011, "num_input_tokens_seen": 44687935, "step": 2093, "time_per_iteration": 2.678208827972412 }, { "auxiliary_loss_clip": 0.01189482, "auxiliary_loss_mlp": 0.01032936, "balance_loss_clip": 0.90065718, "balance_loss_mlp": 1.02345252, "epoch": 0.2517886129982565, "flos": 27635954434560.0, "grad_norm": 2.4110399123516144, "language_loss": 0.74108994, "learning_rate": 3.505967198939873e-06, "loss": 0.76331413, "num_input_tokens_seen": 44704975, "step": 2094, "time_per_iteration": 2.831470489501953 }, { "auxiliary_loss_clip": 0.01201235, "auxiliary_loss_mlp": 0.01032215, "balance_loss_clip": 0.97720057, "balance_loss_mlp": 1.02257133, "epoch": 0.25190885588889556, "flos": 38104596529920.0, "grad_norm": 2.009472590317466, "language_loss": 0.78201365, "learning_rate": 3.5054544907809813e-06, "loss": 0.80434817, "num_input_tokens_seen": 44725475, "step": 2095, "time_per_iteration": 2.841475248336792 }, { "auxiliary_loss_clip": 0.01205793, "auxiliary_loss_mlp": 0.01126632, "balance_loss_clip": 0.98494291, "balance_loss_mlp": 0.0, "epoch": 0.25202909877953467, "flos": 22269894768000.0, "grad_norm": 1.9718624752537304, "language_loss": 0.80336148, "learning_rate": 3.50494155424584e-06, "loss": 0.82668573, "num_input_tokens_seen": 44744380, "step": 2096, "time_per_iteration": 2.7041752338409424 }, { "auxiliary_loss_clip": 0.01210542, "auxiliary_loss_mlp": 0.01036036, "balance_loss_clip": 1.02065516, "balance_loss_mlp": 1.02646327, "epoch": 0.2521493416701738, "flos": 21761759018880.0, "grad_norm": 1.5471934796150149, "language_loss": 0.83101058, "learning_rate": 3.504428389412262e-06, "loss": 0.85347635, "num_input_tokens_seen": 44765190, "step": 2097, "time_per_iteration": 3.6215434074401855 }, { "auxiliary_loss_clip": 0.01207365, "auxiliary_loss_mlp": 0.01036205, "balance_loss_clip": 1.02115095, "balance_loss_mlp": 1.02591097, "epoch": 0.25226958456081283, "flos": 27746738956800.0, "grad_norm": 2.108820416585375, "language_loss": 0.72976434, "learning_rate": 3.5039149963580927e-06, "loss": 0.75220007, "num_input_tokens_seen": 44785210, "step": 2098, "time_per_iteration": 2.7341110706329346 }, { "auxiliary_loss_clip": 0.01207067, "auxiliary_loss_mlp": 0.01038964, "balance_loss_clip": 0.98561656, "balance_loss_mlp": 1.02944541, "epoch": 0.25238982745145194, "flos": 30732171903360.0, "grad_norm": 2.0874358101376584, "language_loss": 0.69580078, "learning_rate": 3.503401375161215e-06, "loss": 0.71826106, "num_input_tokens_seen": 44804955, "step": 2099, "time_per_iteration": 3.676455497741699 }, { "auxiliary_loss_clip": 0.01209633, "auxiliary_loss_mlp": 0.0103325, "balance_loss_clip": 1.06020522, "balance_loss_mlp": 1.02400529, "epoch": 0.252510070342091, "flos": 20266331068800.0, "grad_norm": 1.7614190266195446, "language_loss": 0.83575618, "learning_rate": 3.502887525899544e-06, "loss": 0.85818499, "num_input_tokens_seen": 44823935, "step": 2100, "time_per_iteration": 2.628675699234009 }, { "auxiliary_loss_clip": 0.01205116, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 0.98021972, "balance_loss_mlp": 1.0226922, "epoch": 0.2526303132327301, "flos": 22747399194240.0, "grad_norm": 2.0079350655219343, "language_loss": 0.83004379, "learning_rate": 3.50237344865103e-06, "loss": 0.85241461, "num_input_tokens_seen": 44844935, "step": 2101, "time_per_iteration": 3.6516616344451904 }, { "auxiliary_loss_clip": 0.01211356, "auxiliary_loss_mlp": 0.01040691, "balance_loss_clip": 1.060215, "balance_loss_mlp": 1.03062963, "epoch": 0.2527505561233692, "flos": 30263466309120.0, "grad_norm": 2.3665890014513025, "language_loss": 0.76128769, "learning_rate": 3.501859143493658e-06, "loss": 0.78380823, "num_input_tokens_seen": 44865565, "step": 2102, "time_per_iteration": 2.6899218559265137 }, { "auxiliary_loss_clip": 0.01101207, "auxiliary_loss_mlp": 0.01003581, "balance_loss_clip": 1.02348113, "balance_loss_mlp": 0.9996714, "epoch": 0.2528707990140083, "flos": 58492917164160.0, "grad_norm": 0.9017176317003461, "language_loss": 0.60573602, "learning_rate": 3.5013446105054488e-06, "loss": 0.62678397, "num_input_tokens_seen": 44918485, "step": 2103, "time_per_iteration": 3.8242404460906982 }, { "auxiliary_loss_clip": 0.01189196, "auxiliary_loss_mlp": 0.01030173, "balance_loss_clip": 0.94087559, "balance_loss_mlp": 1.0203557, "epoch": 0.2529910419046474, "flos": 24645134448000.0, "grad_norm": 1.8238278482777786, "language_loss": 0.74508095, "learning_rate": 3.5008298497644555e-06, "loss": 0.76727468, "num_input_tokens_seen": 44937530, "step": 2104, "time_per_iteration": 2.776047468185425 }, { "auxiliary_loss_clip": 0.01206676, "auxiliary_loss_mlp": 0.01037089, "balance_loss_clip": 0.94334173, "balance_loss_mlp": 1.02740872, "epoch": 0.2531112847952865, "flos": 23842135952640.0, "grad_norm": 1.9110046736728326, "language_loss": 0.87741435, "learning_rate": 3.500314861348767e-06, "loss": 0.89985198, "num_input_tokens_seen": 44958165, "step": 2105, "time_per_iteration": 2.7478013038635254 }, { "auxiliary_loss_clip": 0.01197198, "auxiliary_loss_mlp": 0.01035589, "balance_loss_clip": 0.94387537, "balance_loss_mlp": 1.02600491, "epoch": 0.25323152768592555, "flos": 16143822207360.0, "grad_norm": 1.9257193787341087, "language_loss": 0.77454066, "learning_rate": 3.499799645336507e-06, "loss": 0.79686856, "num_input_tokens_seen": 44975060, "step": 2106, "time_per_iteration": 2.7210426330566406 }, { "auxiliary_loss_clip": 0.01210202, "auxiliary_loss_mlp": 0.01030385, "balance_loss_clip": 1.02183962, "balance_loss_mlp": 1.02123606, "epoch": 0.25335177057656466, "flos": 28405161210240.0, "grad_norm": 1.5534309559802726, "language_loss": 0.8687582, "learning_rate": 3.4992842018058336e-06, "loss": 0.89116406, "num_input_tokens_seen": 44997960, "step": 2107, "time_per_iteration": 2.790175676345825 }, { "auxiliary_loss_clip": 0.01211926, "auxiliary_loss_mlp": 0.01039742, "balance_loss_clip": 0.9444446, "balance_loss_mlp": 1.03016937, "epoch": 0.25347201346720377, "flos": 18799666934400.0, "grad_norm": 2.032751631432209, "language_loss": 0.88293588, "learning_rate": 3.4987685308349384e-06, "loss": 0.90545255, "num_input_tokens_seen": 45015690, "step": 2108, "time_per_iteration": 2.7470223903656006 }, { "auxiliary_loss_clip": 0.01200347, "auxiliary_loss_mlp": 0.01034429, "balance_loss_clip": 0.9398607, "balance_loss_mlp": 1.0246954, "epoch": 0.2535922563578428, "flos": 15815490963840.0, "grad_norm": 2.390223569136515, "language_loss": 0.61307245, "learning_rate": 3.4982526325020497e-06, "loss": 0.6354202, "num_input_tokens_seen": 45032660, "step": 2109, "time_per_iteration": 2.7213878631591797 }, { "auxiliary_loss_clip": 0.01210055, "auxiliary_loss_mlp": 0.01033217, "balance_loss_clip": 0.98388362, "balance_loss_mlp": 1.02354908, "epoch": 0.25371249924848194, "flos": 16318922031360.0, "grad_norm": 2.1704261223026546, "language_loss": 0.81758368, "learning_rate": 3.4977365068854273e-06, "loss": 0.84001637, "num_input_tokens_seen": 45048280, "step": 2110, "time_per_iteration": 2.7118706703186035 }, { "auxiliary_loss_clip": 0.01194966, "auxiliary_loss_mlp": 0.01037361, "balance_loss_clip": 0.9799242, "balance_loss_mlp": 1.02752042, "epoch": 0.25383274213912105, "flos": 21761615364480.0, "grad_norm": 2.161799584818254, "language_loss": 0.73267877, "learning_rate": 3.4972201540633676e-06, "loss": 0.75500208, "num_input_tokens_seen": 45067635, "step": 2111, "time_per_iteration": 2.682037830352783 }, { "auxiliary_loss_clip": 0.01198306, "auxiliary_loss_mlp": 0.01036645, "balance_loss_clip": 0.98207045, "balance_loss_mlp": 1.02657723, "epoch": 0.2539529850297601, "flos": 21396870708480.0, "grad_norm": 1.8380057362141056, "language_loss": 0.85095716, "learning_rate": 3.4967035741142008e-06, "loss": 0.87330663, "num_input_tokens_seen": 45086455, "step": 2112, "time_per_iteration": 2.7195019721984863 }, { "auxiliary_loss_clip": 0.01198513, "auxiliary_loss_mlp": 0.0103233, "balance_loss_clip": 0.9846766, "balance_loss_mlp": 1.02309144, "epoch": 0.2540732279203992, "flos": 25228467319680.0, "grad_norm": 1.765049025404553, "language_loss": 0.81834817, "learning_rate": 3.4961867671162917e-06, "loss": 0.84065664, "num_input_tokens_seen": 45106385, "step": 2113, "time_per_iteration": 2.712526321411133 }, { "auxiliary_loss_clip": 0.0121332, "auxiliary_loss_mlp": 0.01035286, "balance_loss_clip": 1.06032205, "balance_loss_mlp": 1.02475953, "epoch": 0.2541934708110383, "flos": 19427386037760.0, "grad_norm": 2.5035011649946193, "language_loss": 0.77310604, "learning_rate": 3.4956697331480402e-06, "loss": 0.79559207, "num_input_tokens_seen": 45124955, "step": 2114, "time_per_iteration": 2.6190764904022217 }, { "auxiliary_loss_clip": 0.01207226, "auxiliary_loss_mlp": 0.0103311, "balance_loss_clip": 0.94133323, "balance_loss_mlp": 1.02378166, "epoch": 0.2543137137016774, "flos": 23949436855680.0, "grad_norm": 1.6695396068604949, "language_loss": 0.80168378, "learning_rate": 3.495152472287879e-06, "loss": 0.8240872, "num_input_tokens_seen": 45145665, "step": 2115, "time_per_iteration": 2.739814281463623 }, { "auxiliary_loss_clip": 0.01208773, "auxiliary_loss_mlp": 0.01032175, "balance_loss_clip": 0.94651937, "balance_loss_mlp": 1.02244806, "epoch": 0.2544339565923165, "flos": 25593283802880.0, "grad_norm": 2.9045469652640237, "language_loss": 0.73715109, "learning_rate": 3.4946349846142766e-06, "loss": 0.75956059, "num_input_tokens_seen": 45164805, "step": 2116, "time_per_iteration": 2.7668871879577637 }, { "auxiliary_loss_clip": 0.0121205, "auxiliary_loss_mlp": 0.01034465, "balance_loss_clip": 1.05959177, "balance_loss_mlp": 1.0246954, "epoch": 0.25455419948295555, "flos": 21689470897920.0, "grad_norm": 1.8498612567937358, "language_loss": 0.75818735, "learning_rate": 3.4941172702057353e-06, "loss": 0.78065252, "num_input_tokens_seen": 45184865, "step": 2117, "time_per_iteration": 2.652161121368408 }, { "auxiliary_loss_clip": 0.0120929, "auxiliary_loss_mlp": 0.01034386, "balance_loss_clip": 0.9862864, "balance_loss_mlp": 1.02510524, "epoch": 0.25467444237359466, "flos": 26250341339520.0, "grad_norm": 1.9812726278109136, "language_loss": 0.80611193, "learning_rate": 3.4935993291407924e-06, "loss": 0.82854867, "num_input_tokens_seen": 45203690, "step": 2118, "time_per_iteration": 2.7372193336486816 }, { "auxiliary_loss_clip": 0.01201975, "auxiliary_loss_mlp": 0.010349, "balance_loss_clip": 0.98279947, "balance_loss_mlp": 1.02511883, "epoch": 0.25479468526423377, "flos": 26979686997120.0, "grad_norm": 2.730617581092196, "language_loss": 0.7080282, "learning_rate": 3.4930811614980183e-06, "loss": 0.73039693, "num_input_tokens_seen": 45225385, "step": 2119, "time_per_iteration": 2.7442739009857178 }, { "auxiliary_loss_clip": 0.0120386, "auxiliary_loss_mlp": 0.01034235, "balance_loss_clip": 1.02215576, "balance_loss_mlp": 1.02419174, "epoch": 0.2549149281548728, "flos": 23475811098240.0, "grad_norm": 1.6360351459010283, "language_loss": 0.79333007, "learning_rate": 3.4925627673560198e-06, "loss": 0.81571102, "num_input_tokens_seen": 45246045, "step": 2120, "time_per_iteration": 2.719902515411377 }, { "auxiliary_loss_clip": 0.01206985, "auxiliary_loss_mlp": 0.01036271, "balance_loss_clip": 0.9442395, "balance_loss_mlp": 1.02650738, "epoch": 0.25503517104551193, "flos": 25812302981760.0, "grad_norm": 1.637844000825418, "language_loss": 0.88413072, "learning_rate": 3.4920441467934357e-06, "loss": 0.90656322, "num_input_tokens_seen": 45266560, "step": 2121, "time_per_iteration": 2.826120138168335 }, { "auxiliary_loss_clip": 0.0119416, "auxiliary_loss_mlp": 0.01040675, "balance_loss_clip": 0.94198471, "balance_loss_mlp": 1.03095901, "epoch": 0.25515541393615104, "flos": 26645106787200.0, "grad_norm": 1.9843717063649056, "language_loss": 0.83238012, "learning_rate": 3.491525299888941e-06, "loss": 0.8547284, "num_input_tokens_seen": 45285405, "step": 2122, "time_per_iteration": 2.7880430221557617 }, { "auxiliary_loss_clip": 0.01095965, "auxiliary_loss_mlp": 0.01121572, "balance_loss_clip": 0.94928157, "balance_loss_mlp": 0.0, "epoch": 0.2552756568267901, "flos": 65955945847680.0, "grad_norm": 0.8734430420284602, "language_loss": 0.62691104, "learning_rate": 3.491006226721244e-06, "loss": 0.64908648, "num_input_tokens_seen": 45349615, "step": 2123, "time_per_iteration": 4.250439405441284 }, { "auxiliary_loss_clip": 0.01211927, "auxiliary_loss_mlp": 0.01126816, "balance_loss_clip": 0.98558187, "balance_loss_mlp": 0.0, "epoch": 0.2553958997174292, "flos": 17931096161280.0, "grad_norm": 1.9102082932069804, "language_loss": 0.77489811, "learning_rate": 3.4904869273690882e-06, "loss": 0.79828554, "num_input_tokens_seen": 45367505, "step": 2124, "time_per_iteration": 2.6951069831848145 }, { "auxiliary_loss_clip": 0.01214638, "auxiliary_loss_mlp": 0.0104239, "balance_loss_clip": 1.02341962, "balance_loss_mlp": 1.03214431, "epoch": 0.2555161426080683, "flos": 23367791923200.0, "grad_norm": 1.7767481806758199, "language_loss": 0.88756597, "learning_rate": 3.489967401911251e-06, "loss": 0.91013622, "num_input_tokens_seen": 45386805, "step": 2125, "time_per_iteration": 3.558506488800049 }, { "auxiliary_loss_clip": 0.01214872, "auxiliary_loss_mlp": 0.01042218, "balance_loss_clip": 1.06132746, "balance_loss_mlp": 1.03141797, "epoch": 0.2556363854987074, "flos": 40625130723840.0, "grad_norm": 1.6517042355042284, "language_loss": 0.69501901, "learning_rate": 3.4894476504265428e-06, "loss": 0.71758991, "num_input_tokens_seen": 45411045, "step": 2126, "time_per_iteration": 2.8427038192749023 }, { "auxiliary_loss_clip": 0.01099813, "auxiliary_loss_mlp": 0.01013384, "balance_loss_clip": 0.98626471, "balance_loss_mlp": 1.00956941, "epoch": 0.2557566283893465, "flos": 68019443389440.0, "grad_norm": 0.7539604728525547, "language_loss": 0.54505104, "learning_rate": 3.4889276729938104e-06, "loss": 0.56618303, "num_input_tokens_seen": 45469575, "step": 2127, "time_per_iteration": 4.011860132217407 }, { "auxiliary_loss_clip": 0.01205643, "auxiliary_loss_mlp": 0.01036127, "balance_loss_clip": 0.98391652, "balance_loss_mlp": 1.02599466, "epoch": 0.2558768712799856, "flos": 22635645004800.0, "grad_norm": 2.5520521690912634, "language_loss": 0.80891472, "learning_rate": 3.488407469691934e-06, "loss": 0.83133239, "num_input_tokens_seen": 45490270, "step": 2128, "time_per_iteration": 2.7138102054595947 }, { "auxiliary_loss_clip": 0.01200841, "auxiliary_loss_mlp": 0.01040322, "balance_loss_clip": 0.97930926, "balance_loss_mlp": 1.02921128, "epoch": 0.25599711417062465, "flos": 26396354125440.0, "grad_norm": 1.9257106380017777, "language_loss": 0.807024, "learning_rate": 3.487887040599828e-06, "loss": 0.82943559, "num_input_tokens_seen": 45510070, "step": 2129, "time_per_iteration": 3.6912097930908203 }, { "auxiliary_loss_clip": 0.01214735, "auxiliary_loss_mlp": 0.01035264, "balance_loss_clip": 1.06302023, "balance_loss_mlp": 1.0245831, "epoch": 0.25611735706126376, "flos": 22852042490880.0, "grad_norm": 2.2442679830330596, "language_loss": 0.76100957, "learning_rate": 3.4873663857964407e-06, "loss": 0.78350949, "num_input_tokens_seen": 45527285, "step": 2130, "time_per_iteration": 2.6681714057922363 }, { "auxiliary_loss_clip": 0.01204815, "auxiliary_loss_mlp": 0.01038212, "balance_loss_clip": 0.90527171, "balance_loss_mlp": 1.02877092, "epoch": 0.2562375999519028, "flos": 23367863750400.0, "grad_norm": 1.695362424562407, "language_loss": 0.66459477, "learning_rate": 3.4868455053607556e-06, "loss": 0.68702501, "num_input_tokens_seen": 45546900, "step": 2131, "time_per_iteration": 2.795278310775757 }, { "auxiliary_loss_clip": 0.01215152, "auxiliary_loss_mlp": 0.01032181, "balance_loss_clip": 1.02219987, "balance_loss_mlp": 1.0220542, "epoch": 0.2563578428425419, "flos": 22856962654080.0, "grad_norm": 4.909337009611394, "language_loss": 0.71478993, "learning_rate": 3.486324399371789e-06, "loss": 0.73726332, "num_input_tokens_seen": 45566200, "step": 2132, "time_per_iteration": 2.699272394180298 }, { "auxiliary_loss_clip": 0.01199855, "auxiliary_loss_mlp": 0.0104358, "balance_loss_clip": 0.94461203, "balance_loss_mlp": 1.03375673, "epoch": 0.25647808573318104, "flos": 21653883498240.0, "grad_norm": 1.8312330770176346, "language_loss": 0.78704959, "learning_rate": 3.485803067908593e-06, "loss": 0.80948395, "num_input_tokens_seen": 45585710, "step": 2133, "time_per_iteration": 2.721095085144043 }, { "auxiliary_loss_clip": 0.01171708, "auxiliary_loss_mlp": 0.01040011, "balance_loss_clip": 0.86068308, "balance_loss_mlp": 1.02962744, "epoch": 0.2565983286238201, "flos": 33730569659520.0, "grad_norm": 1.7270512784528267, "language_loss": 0.79448992, "learning_rate": 3.485281511050253e-06, "loss": 0.81660718, "num_input_tokens_seen": 45607845, "step": 2134, "time_per_iteration": 3.028413772583008 }, { "auxiliary_loss_clip": 0.01211533, "auxiliary_loss_mlp": 0.01037179, "balance_loss_clip": 1.02215517, "balance_loss_mlp": 1.02769637, "epoch": 0.2567185715144592, "flos": 16216002587520.0, "grad_norm": 2.243605212233821, "language_loss": 0.8956629, "learning_rate": 3.484759728875889e-06, "loss": 0.91814995, "num_input_tokens_seen": 45623210, "step": 2135, "time_per_iteration": 2.99159836769104 }, { "auxiliary_loss_clip": 0.01194822, "auxiliary_loss_mlp": 0.01040296, "balance_loss_clip": 0.90683502, "balance_loss_mlp": 1.03015733, "epoch": 0.2568388144050983, "flos": 17458475984640.0, "grad_norm": 1.8012902433573421, "language_loss": 0.80851316, "learning_rate": 3.4842377214646543e-06, "loss": 0.83086437, "num_input_tokens_seen": 45641505, "step": 2136, "time_per_iteration": 2.759506940841675 }, { "auxiliary_loss_clip": 0.01212824, "auxiliary_loss_mlp": 0.01034662, "balance_loss_clip": 1.06309664, "balance_loss_mlp": 1.02502966, "epoch": 0.25695905729573737, "flos": 20887442069760.0, "grad_norm": 1.6444946794599864, "language_loss": 0.66880715, "learning_rate": 3.483715488895737e-06, "loss": 0.69128203, "num_input_tokens_seen": 45661835, "step": 2137, "time_per_iteration": 2.707887887954712 }, { "auxiliary_loss_clip": 0.01202242, "auxiliary_loss_mlp": 0.01034687, "balance_loss_clip": 0.90224916, "balance_loss_mlp": 1.02422071, "epoch": 0.2570793001863765, "flos": 24717278914560.0, "grad_norm": 1.832851424372693, "language_loss": 0.7836591, "learning_rate": 3.48319303124836e-06, "loss": 0.80602837, "num_input_tokens_seen": 45682215, "step": 2138, "time_per_iteration": 2.8011701107025146 }, { "auxiliary_loss_clip": 0.01202911, "auxiliary_loss_mlp": 0.01032905, "balance_loss_clip": 0.98372138, "balance_loss_mlp": 1.0236187, "epoch": 0.2571995430770156, "flos": 26906896085760.0, "grad_norm": 2.9085276614546247, "language_loss": 0.66833788, "learning_rate": 3.4826703486017798e-06, "loss": 0.690696, "num_input_tokens_seen": 45701840, "step": 2139, "time_per_iteration": 2.7325727939605713 }, { "auxiliary_loss_clip": 0.01209468, "auxiliary_loss_mlp": 0.01034223, "balance_loss_clip": 1.02409649, "balance_loss_mlp": 1.02448404, "epoch": 0.25731978596765465, "flos": 19792561656960.0, "grad_norm": 1.5198705294756183, "language_loss": 0.76738024, "learning_rate": 3.4821474410352867e-06, "loss": 0.78981721, "num_input_tokens_seen": 45720500, "step": 2140, "time_per_iteration": 2.7005488872528076 }, { "auxiliary_loss_clip": 0.01112009, "auxiliary_loss_mlp": 0.01004839, "balance_loss_clip": 0.91865098, "balance_loss_mlp": 1.00121534, "epoch": 0.25744002885829376, "flos": 70564970471040.0, "grad_norm": 0.9108694660418466, "language_loss": 0.62711382, "learning_rate": 3.481624308628205e-06, "loss": 0.64828229, "num_input_tokens_seen": 45781870, "step": 2141, "time_per_iteration": 3.3971505165100098 }, { "auxiliary_loss_clip": 0.01208966, "auxiliary_loss_mlp": 0.01045479, "balance_loss_clip": 0.98540139, "balance_loss_mlp": 1.0352453, "epoch": 0.25756027174893287, "flos": 18038181582720.0, "grad_norm": 3.424137120617332, "language_loss": 1.00074339, "learning_rate": 3.481100951459893e-06, "loss": 1.02328777, "num_input_tokens_seen": 45794890, "step": 2142, "time_per_iteration": 2.6352477073669434 }, { "auxiliary_loss_clip": 0.01209761, "auxiliary_loss_mlp": 0.01026015, "balance_loss_clip": 1.02400637, "balance_loss_mlp": 1.0171454, "epoch": 0.2576805146395719, "flos": 22674069578880.0, "grad_norm": 1.6337176687420827, "language_loss": 0.78772771, "learning_rate": 3.4805773696097453e-06, "loss": 0.81008542, "num_input_tokens_seen": 45815780, "step": 2143, "time_per_iteration": 2.6877546310424805 }, { "auxiliary_loss_clip": 0.01203827, "auxiliary_loss_mlp": 0.01036015, "balance_loss_clip": 0.9832778, "balance_loss_mlp": 1.02636456, "epoch": 0.25780075753021103, "flos": 16472225278080.0, "grad_norm": 2.1171998612923235, "language_loss": 0.8790462, "learning_rate": 3.4800535631571874e-06, "loss": 0.90144461, "num_input_tokens_seen": 45831310, "step": 2144, "time_per_iteration": 2.6205854415893555 }, { "auxiliary_loss_clip": 0.01219448, "auxiliary_loss_mlp": 0.01037063, "balance_loss_clip": 0.98743653, "balance_loss_mlp": 1.02665019, "epoch": 0.25792100042085014, "flos": 22820297846400.0, "grad_norm": 4.467135859079109, "language_loss": 0.76168519, "learning_rate": 3.4795295321816804e-06, "loss": 0.78425038, "num_input_tokens_seen": 45850135, "step": 2145, "time_per_iteration": 2.7030646800994873 }, { "auxiliary_loss_clip": 0.01198254, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 0.98412597, "balance_loss_mlp": 1.0242399, "epoch": 0.2580412433114892, "flos": 18697286194560.0, "grad_norm": 2.2467599381869774, "language_loss": 0.91035581, "learning_rate": 3.47900527676272e-06, "loss": 0.93267858, "num_input_tokens_seen": 45868470, "step": 2146, "time_per_iteration": 2.6511154174804688 }, { "auxiliary_loss_clip": 0.01217172, "auxiliary_loss_mlp": 0.0103367, "balance_loss_clip": 1.06661236, "balance_loss_mlp": 1.02416945, "epoch": 0.2581614862021283, "flos": 14283146810880.0, "grad_norm": 1.7798122890926995, "language_loss": 0.88500988, "learning_rate": 3.478480796979835e-06, "loss": 0.90751827, "num_input_tokens_seen": 45886355, "step": 2147, "time_per_iteration": 2.685845375061035 }, { "auxiliary_loss_clip": 0.01207943, "auxiliary_loss_mlp": 0.01031567, "balance_loss_clip": 0.98718083, "balance_loss_mlp": 1.02219701, "epoch": 0.25828172909276736, "flos": 29498281856640.0, "grad_norm": 1.6197000294546855, "language_loss": 0.77587748, "learning_rate": 3.4779560929125894e-06, "loss": 0.79827261, "num_input_tokens_seen": 45907900, "step": 2148, "time_per_iteration": 2.783418655395508 }, { "auxiliary_loss_clip": 0.01103709, "auxiliary_loss_mlp": 0.01002541, "balance_loss_clip": 0.9111973, "balance_loss_mlp": 0.99884558, "epoch": 0.2584019719834065, "flos": 67114387376640.0, "grad_norm": 0.6673111375177618, "language_loss": 0.56931531, "learning_rate": 3.4774311646405783e-06, "loss": 0.59037781, "num_input_tokens_seen": 45977805, "step": 2149, "time_per_iteration": 4.280874252319336 }, { "auxiliary_loss_clip": 0.01192417, "auxiliary_loss_mlp": 0.01040039, "balance_loss_clip": 0.94111246, "balance_loss_mlp": 1.03099072, "epoch": 0.2585222148740456, "flos": 22893555634560.0, "grad_norm": 1.9093458457080674, "language_loss": 0.83742982, "learning_rate": 3.476906012243435e-06, "loss": 0.85975432, "num_input_tokens_seen": 45996715, "step": 2150, "time_per_iteration": 2.780397415161133 }, { "auxiliary_loss_clip": 0.01201325, "auxiliary_loss_mlp": 0.01036816, "balance_loss_clip": 1.02359414, "balance_loss_mlp": 1.02665305, "epoch": 0.25864245776468464, "flos": 28909202808960.0, "grad_norm": 1.515350390709042, "language_loss": 0.81221133, "learning_rate": 3.476380635800824e-06, "loss": 0.8345927, "num_input_tokens_seen": 46017915, "step": 2151, "time_per_iteration": 3.666937828063965 }, { "auxiliary_loss_clip": 0.01208967, "auxiliary_loss_mlp": 0.01034888, "balance_loss_clip": 0.98704261, "balance_loss_mlp": 1.02483284, "epoch": 0.25876270065532375, "flos": 14793185980800.0, "grad_norm": 2.8130215005647043, "language_loss": 0.86471415, "learning_rate": 3.475855035392444e-06, "loss": 0.88715267, "num_input_tokens_seen": 46033235, "step": 2152, "time_per_iteration": 2.617405414581299 }, { "auxiliary_loss_clip": 0.0120237, "auxiliary_loss_mlp": 0.01038501, "balance_loss_clip": 0.86838663, "balance_loss_mlp": 1.02886295, "epoch": 0.25888294354596286, "flos": 60467821810560.0, "grad_norm": 1.827809315924596, "language_loss": 0.71323097, "learning_rate": 3.475329211098029e-06, "loss": 0.73563975, "num_input_tokens_seen": 46056390, "step": 2153, "time_per_iteration": 3.9949851036071777 }, { "auxiliary_loss_clip": 0.01212509, "auxiliary_loss_mlp": 0.01031322, "balance_loss_clip": 0.91019249, "balance_loss_mlp": 1.02191615, "epoch": 0.2590031864366019, "flos": 27851166771840.0, "grad_norm": 1.7783105582569263, "language_loss": 0.82025611, "learning_rate": 3.4748031629973453e-06, "loss": 0.8426944, "num_input_tokens_seen": 46077120, "step": 2154, "time_per_iteration": 2.8097712993621826 }, { "auxiliary_loss_clip": 0.01104007, "auxiliary_loss_mlp": 0.01006502, "balance_loss_clip": 0.8744539, "balance_loss_mlp": 1.00304461, "epoch": 0.25912342932724103, "flos": 62422444206720.0, "grad_norm": 0.9178917123252953, "language_loss": 0.56643653, "learning_rate": 3.4742768911701944e-06, "loss": 0.58754158, "num_input_tokens_seen": 46139815, "step": 2155, "time_per_iteration": 4.236000061035156 }, { "auxiliary_loss_clip": 0.01216135, "auxiliary_loss_mlp": 0.01038293, "balance_loss_clip": 1.02496397, "balance_loss_mlp": 1.02820802, "epoch": 0.25924367221788014, "flos": 12378839368320.0, "grad_norm": 3.1689167027685894, "language_loss": 0.70908463, "learning_rate": 3.4737503956964113e-06, "loss": 0.73162889, "num_input_tokens_seen": 46152120, "step": 2156, "time_per_iteration": 2.602689266204834 }, { "auxiliary_loss_clip": 0.01199348, "auxiliary_loss_mlp": 0.01035719, "balance_loss_clip": 0.98066008, "balance_loss_mlp": 1.02544868, "epoch": 0.2593639151085192, "flos": 14575208296320.0, "grad_norm": 2.4221179066058776, "language_loss": 0.67222631, "learning_rate": 3.473223676655865e-06, "loss": 0.69457686, "num_input_tokens_seen": 46170120, "step": 2157, "time_per_iteration": 2.6903045177459717 }, { "auxiliary_loss_clip": 0.01199373, "auxiliary_loss_mlp": 0.01038537, "balance_loss_clip": 0.98122638, "balance_loss_mlp": 1.02859449, "epoch": 0.2594841579991583, "flos": 15230937029760.0, "grad_norm": 1.7832567099666734, "language_loss": 0.80069089, "learning_rate": 3.472696734128459e-06, "loss": 0.82306999, "num_input_tokens_seen": 46187985, "step": 2158, "time_per_iteration": 2.6555209159851074 }, { "auxiliary_loss_clip": 0.01209801, "auxiliary_loss_mlp": 0.01041289, "balance_loss_clip": 1.02229869, "balance_loss_mlp": 1.03129935, "epoch": 0.2596044008897974, "flos": 23623583650560.0, "grad_norm": 1.6116927813786062, "language_loss": 0.75534934, "learning_rate": 3.4721695681941286e-06, "loss": 0.77786028, "num_input_tokens_seen": 46207025, "step": 2159, "time_per_iteration": 2.6999635696411133 }, { "auxiliary_loss_clip": 0.01203989, "auxiliary_loss_mlp": 0.01127114, "balance_loss_clip": 0.98097205, "balance_loss_mlp": 0.0, "epoch": 0.25972464378043647, "flos": 13772281628160.0, "grad_norm": 1.8430082220818174, "language_loss": 0.82363451, "learning_rate": 3.471642178932845e-06, "loss": 0.84694558, "num_input_tokens_seen": 46225670, "step": 2160, "time_per_iteration": 2.7454376220703125 }, { "auxiliary_loss_clip": 0.01209688, "auxiliary_loss_mlp": 0.01032593, "balance_loss_clip": 0.98240858, "balance_loss_mlp": 1.02266312, "epoch": 0.2598448866710756, "flos": 19573578391680.0, "grad_norm": 1.8931240000231084, "language_loss": 0.89480942, "learning_rate": 3.471114566424613e-06, "loss": 0.91723216, "num_input_tokens_seen": 46244130, "step": 2161, "time_per_iteration": 2.721705436706543 }, { "auxiliary_loss_clip": 0.01207878, "auxiliary_loss_mlp": 0.01031198, "balance_loss_clip": 0.98620605, "balance_loss_mlp": 1.02231073, "epoch": 0.25996512956171464, "flos": 21653237053440.0, "grad_norm": 1.797769581686244, "language_loss": 0.75817668, "learning_rate": 3.4705867307494715e-06, "loss": 0.78056741, "num_input_tokens_seen": 46263200, "step": 2162, "time_per_iteration": 2.781683921813965 }, { "auxiliary_loss_clip": 0.01213843, "auxiliary_loss_mlp": 0.01038613, "balance_loss_clip": 1.02317977, "balance_loss_mlp": 1.02843881, "epoch": 0.26008537245235375, "flos": 18223480869120.0, "grad_norm": 2.0311212443997606, "language_loss": 0.84771615, "learning_rate": 3.470058671987492e-06, "loss": 0.87024069, "num_input_tokens_seen": 46281465, "step": 2163, "time_per_iteration": 2.7412893772125244 }, { "auxiliary_loss_clip": 0.01215853, "auxiliary_loss_mlp": 0.01040412, "balance_loss_clip": 1.02340865, "balance_loss_mlp": 1.03065467, "epoch": 0.26020561534299286, "flos": 24645385843200.0, "grad_norm": 1.9089684099040418, "language_loss": 0.84300214, "learning_rate": 3.4695303902187805e-06, "loss": 0.86556476, "num_input_tokens_seen": 46301020, "step": 2164, "time_per_iteration": 2.6874618530273438 }, { "auxiliary_loss_clip": 0.01196145, "auxiliary_loss_mlp": 0.01036545, "balance_loss_clip": 0.93971503, "balance_loss_mlp": 1.0263164, "epoch": 0.2603258582336319, "flos": 25773662926080.0, "grad_norm": 1.84611136329881, "language_loss": 0.78434289, "learning_rate": 3.469001885523478e-06, "loss": 0.80666983, "num_input_tokens_seen": 46321740, "step": 2165, "time_per_iteration": 2.793020486831665 }, { "auxiliary_loss_clip": 0.01209856, "auxiliary_loss_mlp": 0.01033234, "balance_loss_clip": 1.06029344, "balance_loss_mlp": 1.02363157, "epoch": 0.260446101124271, "flos": 28766314506240.0, "grad_norm": 1.574590612994341, "language_loss": 0.81158555, "learning_rate": 3.4684731579817568e-06, "loss": 0.83401644, "num_input_tokens_seen": 46342730, "step": 2166, "time_per_iteration": 2.6638941764831543 }, { "auxiliary_loss_clip": 0.01199804, "auxiliary_loss_mlp": 0.01034119, "balance_loss_clip": 0.86998808, "balance_loss_mlp": 1.02416468, "epoch": 0.26056634401491013, "flos": 25666757072640.0, "grad_norm": 2.377708821575719, "language_loss": 0.76914573, "learning_rate": 3.4679442076738247e-06, "loss": 0.79148495, "num_input_tokens_seen": 46362445, "step": 2167, "time_per_iteration": 2.796387195587158 }, { "auxiliary_loss_clip": 0.01211898, "auxiliary_loss_mlp": 0.01037319, "balance_loss_clip": 1.06027687, "balance_loss_mlp": 1.02784252, "epoch": 0.2606865869055492, "flos": 27052765217280.0, "grad_norm": 1.9200869364221267, "language_loss": 0.83329749, "learning_rate": 3.4674150346799245e-06, "loss": 0.85578966, "num_input_tokens_seen": 46382145, "step": 2168, "time_per_iteration": 2.655673027038574 }, { "auxiliary_loss_clip": 0.01207775, "auxiliary_loss_mlp": 0.01040065, "balance_loss_clip": 0.98395669, "balance_loss_mlp": 1.02950883, "epoch": 0.2608068297961883, "flos": 17712615686400.0, "grad_norm": 1.9177399074078099, "language_loss": 0.79883766, "learning_rate": 3.4668856390803295e-06, "loss": 0.82131606, "num_input_tokens_seen": 46400025, "step": 2169, "time_per_iteration": 2.687455415725708 }, { "auxiliary_loss_clip": 0.01197333, "auxiliary_loss_mlp": 0.01034059, "balance_loss_clip": 1.02142191, "balance_loss_mlp": 1.02414072, "epoch": 0.2609270726868274, "flos": 18551632544640.0, "grad_norm": 1.8648758258500078, "language_loss": 0.9004311, "learning_rate": 3.4663560209553495e-06, "loss": 0.92274505, "num_input_tokens_seen": 46418090, "step": 2170, "time_per_iteration": 2.7164738178253174 }, { "auxiliary_loss_clip": 0.01197199, "auxiliary_loss_mlp": 0.01038176, "balance_loss_clip": 0.98251057, "balance_loss_mlp": 1.02785277, "epoch": 0.26104731557746647, "flos": 21835699165440.0, "grad_norm": 1.5912999678261013, "language_loss": 0.79123819, "learning_rate": 3.4658261803853267e-06, "loss": 0.81359196, "num_input_tokens_seen": 46436015, "step": 2171, "time_per_iteration": 2.650212049484253 }, { "auxiliary_loss_clip": 0.0120481, "auxiliary_loss_mlp": 0.01034795, "balance_loss_clip": 0.98478711, "balance_loss_mlp": 1.02477002, "epoch": 0.2611675584681056, "flos": 21689650465920.0, "grad_norm": 2.009818792991254, "language_loss": 0.80628753, "learning_rate": 3.4652961174506383e-06, "loss": 0.82868361, "num_input_tokens_seen": 46455885, "step": 2172, "time_per_iteration": 2.6738054752349854 }, { "auxiliary_loss_clip": 0.01095947, "auxiliary_loss_mlp": 0.01008201, "balance_loss_clip": 0.98481417, "balance_loss_mlp": 1.00491083, "epoch": 0.2612878013587447, "flos": 71862101389440.0, "grad_norm": 0.9709112143176121, "language_loss": 0.58157903, "learning_rate": 3.464765832231694e-06, "loss": 0.60262054, "num_input_tokens_seen": 46510050, "step": 2173, "time_per_iteration": 3.2452547550201416 }, { "auxiliary_loss_clip": 0.01214781, "auxiliary_loss_mlp": 0.01041638, "balance_loss_clip": 1.0255971, "balance_loss_mlp": 1.03173721, "epoch": 0.26140804424938374, "flos": 20227511445120.0, "grad_norm": 1.785391781796221, "language_loss": 0.70935929, "learning_rate": 3.4642353248089373e-06, "loss": 0.73192346, "num_input_tokens_seen": 46528810, "step": 2174, "time_per_iteration": 2.669532299041748 }, { "auxiliary_loss_clip": 0.01197615, "auxiliary_loss_mlp": 0.01041968, "balance_loss_clip": 0.98139608, "balance_loss_mlp": 1.03164411, "epoch": 0.26152828714002285, "flos": 25557085872000.0, "grad_norm": 1.6834884086122888, "language_loss": 0.8053292, "learning_rate": 3.463704595262846e-06, "loss": 0.82772505, "num_input_tokens_seen": 46549690, "step": 2175, "time_per_iteration": 3.750321865081787 }, { "auxiliary_loss_clip": 0.01201881, "auxiliary_loss_mlp": 0.01036157, "balance_loss_clip": 0.94392288, "balance_loss_mlp": 1.02576208, "epoch": 0.26164853003066196, "flos": 25446516831360.0, "grad_norm": 1.7531246221631684, "language_loss": 0.70659077, "learning_rate": 3.463173643673931e-06, "loss": 0.72897112, "num_input_tokens_seen": 46572215, "step": 2176, "time_per_iteration": 2.7849793434143066 }, { "auxiliary_loss_clip": 0.01102728, "auxiliary_loss_mlp": 0.01009869, "balance_loss_clip": 0.98672056, "balance_loss_mlp": 1.00619721, "epoch": 0.261768772921301, "flos": 53944580568960.0, "grad_norm": 0.9037392768892412, "language_loss": 0.63587344, "learning_rate": 3.4626424701227387e-06, "loss": 0.65699935, "num_input_tokens_seen": 46627275, "step": 2177, "time_per_iteration": 4.103297472000122 }, { "auxiliary_loss_clip": 0.01100257, "auxiliary_loss_mlp": 0.01004001, "balance_loss_clip": 1.02435541, "balance_loss_mlp": 1.00028121, "epoch": 0.26188901581194013, "flos": 70687606481280.0, "grad_norm": 0.8434043630258714, "language_loss": 0.55826384, "learning_rate": 3.4621110746898452e-06, "loss": 0.57930636, "num_input_tokens_seen": 46695135, "step": 2178, "time_per_iteration": 3.275162696838379 }, { "auxiliary_loss_clip": 0.01211727, "auxiliary_loss_mlp": 0.01034716, "balance_loss_clip": 1.02332139, "balance_loss_mlp": 1.02536392, "epoch": 0.2620092587025792, "flos": 21069580959360.0, "grad_norm": 2.1285991359726713, "language_loss": 0.74518263, "learning_rate": 3.4615794574558654e-06, "loss": 0.76764703, "num_input_tokens_seen": 46714145, "step": 2179, "time_per_iteration": 2.7138874530792236 }, { "auxiliary_loss_clip": 0.01205219, "auxiliary_loss_mlp": 0.01034717, "balance_loss_clip": 0.9812541, "balance_loss_mlp": 1.02541804, "epoch": 0.2621295015932183, "flos": 18369601395840.0, "grad_norm": 2.273241685222256, "language_loss": 0.84092647, "learning_rate": 3.4610476185014436e-06, "loss": 0.86332583, "num_input_tokens_seen": 46731405, "step": 2180, "time_per_iteration": 3.799924373626709 }, { "auxiliary_loss_clip": 0.01210053, "auxiliary_loss_mlp": 0.01034467, "balance_loss_clip": 1.05886149, "balance_loss_mlp": 1.0242393, "epoch": 0.2622497444838574, "flos": 23659997063040.0, "grad_norm": 2.0289648812122336, "language_loss": 0.79385394, "learning_rate": 3.4605155579072597e-06, "loss": 0.81629914, "num_input_tokens_seen": 46751260, "step": 2181, "time_per_iteration": 3.6121034622192383 }, { "auxiliary_loss_clip": 0.01193586, "auxiliary_loss_mlp": 0.01039176, "balance_loss_clip": 0.90161836, "balance_loss_mlp": 1.02957964, "epoch": 0.26236998737449646, "flos": 22123810154880.0, "grad_norm": 1.6234669917246332, "language_loss": 0.7109701, "learning_rate": 3.459983275754027e-06, "loss": 0.73329771, "num_input_tokens_seen": 46770155, "step": 2182, "time_per_iteration": 2.740567922592163 }, { "auxiliary_loss_clip": 0.01210431, "auxiliary_loss_mlp": 0.01040083, "balance_loss_clip": 1.06060719, "balance_loss_mlp": 1.03005767, "epoch": 0.26249023026513557, "flos": 17895185539200.0, "grad_norm": 2.8035771796143196, "language_loss": 0.80171061, "learning_rate": 3.4594507721224918e-06, "loss": 0.82421571, "num_input_tokens_seen": 46788805, "step": 2183, "time_per_iteration": 2.5949506759643555 }, { "auxiliary_loss_clip": 0.01210716, "auxiliary_loss_mlp": 0.01042246, "balance_loss_clip": 0.98455667, "balance_loss_mlp": 1.03251898, "epoch": 0.2626104731557747, "flos": 18332936588160.0, "grad_norm": 3.4314641915379296, "language_loss": 0.81745088, "learning_rate": 3.4589180470934353e-06, "loss": 0.83998048, "num_input_tokens_seen": 46808670, "step": 2184, "time_per_iteration": 2.686755418777466 }, { "auxiliary_loss_clip": 0.0121416, "auxiliary_loss_mlp": 0.01036883, "balance_loss_clip": 1.02002597, "balance_loss_mlp": 1.02654767, "epoch": 0.26273071604641374, "flos": 19317714837120.0, "grad_norm": 32.182704003772585, "language_loss": 0.76625365, "learning_rate": 3.4583851007476713e-06, "loss": 0.78876412, "num_input_tokens_seen": 46827140, "step": 2185, "time_per_iteration": 2.628180503845215 }, { "auxiliary_loss_clip": 0.01206381, "auxiliary_loss_mlp": 0.01038091, "balance_loss_clip": 0.94397807, "balance_loss_mlp": 1.02797627, "epoch": 0.26285095893705285, "flos": 18327477720960.0, "grad_norm": 2.1612118824078097, "language_loss": 0.69000483, "learning_rate": 3.4578519331660464e-06, "loss": 0.71244955, "num_input_tokens_seen": 46844135, "step": 2186, "time_per_iteration": 2.6950342655181885 }, { "auxiliary_loss_clip": 0.01207649, "auxiliary_loss_mlp": 0.01031377, "balance_loss_clip": 1.02430809, "balance_loss_mlp": 1.02226329, "epoch": 0.26297120182769196, "flos": 20193827466240.0, "grad_norm": 2.3663652599963982, "language_loss": 0.82424009, "learning_rate": 3.4573185444294426e-06, "loss": 0.84663033, "num_input_tokens_seen": 46862500, "step": 2187, "time_per_iteration": 2.6825640201568604 }, { "auxiliary_loss_clip": 0.01205022, "auxiliary_loss_mlp": 0.01127193, "balance_loss_clip": 0.98259962, "balance_loss_mlp": 0.0, "epoch": 0.263091444718331, "flos": 22418421505920.0, "grad_norm": 1.8230191390974537, "language_loss": 0.78854042, "learning_rate": 3.456784934618774e-06, "loss": 0.81186259, "num_input_tokens_seen": 46883665, "step": 2188, "time_per_iteration": 2.7093405723571777 }, { "auxiliary_loss_clip": 0.01204664, "auxiliary_loss_mlp": 0.01031071, "balance_loss_clip": 0.98327076, "balance_loss_mlp": 1.0212543, "epoch": 0.2632116876089701, "flos": 19024827338880.0, "grad_norm": 1.9894769097588203, "language_loss": 0.79958689, "learning_rate": 3.4562511038149897e-06, "loss": 0.82194424, "num_input_tokens_seen": 46899160, "step": 2189, "time_per_iteration": 2.69791316986084 }, { "auxiliary_loss_clip": 0.01094263, "auxiliary_loss_mlp": 0.0100314, "balance_loss_clip": 0.86840439, "balance_loss_mlp": 0.99946862, "epoch": 0.26333193049960923, "flos": 67308054531840.0, "grad_norm": 0.8662888932895882, "language_loss": 0.5778814, "learning_rate": 3.4557170520990705e-06, "loss": 0.59885544, "num_input_tokens_seen": 46959835, "step": 2190, "time_per_iteration": 3.414092540740967 }, { "auxiliary_loss_clip": 0.01206125, "auxiliary_loss_mlp": 0.01034409, "balance_loss_clip": 1.02250433, "balance_loss_mlp": 1.02575445, "epoch": 0.2634521733902483, "flos": 25048806468480.0, "grad_norm": 1.4541023805293876, "language_loss": 0.86565697, "learning_rate": 3.4551827795520324e-06, "loss": 0.88806236, "num_input_tokens_seen": 46982720, "step": 2191, "time_per_iteration": 2.9329166412353516 }, { "auxiliary_loss_clip": 0.01208801, "auxiliary_loss_mlp": 0.0103422, "balance_loss_clip": 1.02071857, "balance_loss_mlp": 1.02482677, "epoch": 0.2635724162808874, "flos": 20594985534720.0, "grad_norm": 1.655405567480732, "language_loss": 0.85229015, "learning_rate": 3.4546482862549226e-06, "loss": 0.87472034, "num_input_tokens_seen": 47003035, "step": 2192, "time_per_iteration": 2.8022208213806152 }, { "auxiliary_loss_clip": 0.01199432, "auxiliary_loss_mlp": 0.0103718, "balance_loss_clip": 0.94347286, "balance_loss_mlp": 1.02689254, "epoch": 0.2636926591715265, "flos": 19244636616960.0, "grad_norm": 2.0741035043804756, "language_loss": 0.7843498, "learning_rate": 3.4541135722888253e-06, "loss": 0.80671591, "num_input_tokens_seen": 47019625, "step": 2193, "time_per_iteration": 2.7085072994232178 }, { "auxiliary_loss_clip": 0.01210454, "auxiliary_loss_mlp": 0.0103153, "balance_loss_clip": 1.06029963, "balance_loss_mlp": 1.02257705, "epoch": 0.26381290206216557, "flos": 28804882734720.0, "grad_norm": 1.7384218555109279, "language_loss": 0.80132604, "learning_rate": 3.453578637734854e-06, "loss": 0.82374585, "num_input_tokens_seen": 47040815, "step": 2194, "time_per_iteration": 2.761719226837158 }, { "auxiliary_loss_clip": 0.0121449, "auxiliary_loss_mlp": 0.01042891, "balance_loss_clip": 1.06526864, "balance_loss_mlp": 1.03367603, "epoch": 0.2639331449528047, "flos": 25008909436800.0, "grad_norm": 1.525982118035156, "language_loss": 0.78377229, "learning_rate": 3.4530434826741605e-06, "loss": 0.80634606, "num_input_tokens_seen": 47061755, "step": 2195, "time_per_iteration": 2.6456234455108643 }, { "auxiliary_loss_clip": 0.01205835, "auxiliary_loss_mlp": 0.01033333, "balance_loss_clip": 0.98617673, "balance_loss_mlp": 1.02418375, "epoch": 0.26405338784344373, "flos": 46535775465600.0, "grad_norm": 1.674183890816899, "language_loss": 0.68803763, "learning_rate": 3.452508107187926e-06, "loss": 0.71042931, "num_input_tokens_seen": 47085130, "step": 2196, "time_per_iteration": 2.9638280868530273 }, { "auxiliary_loss_clip": 0.01198333, "auxiliary_loss_mlp": 0.01028544, "balance_loss_clip": 0.86464816, "balance_loss_mlp": 1.01812482, "epoch": 0.26417363073408284, "flos": 21179467641600.0, "grad_norm": 1.8093237270375608, "language_loss": 0.77213383, "learning_rate": 3.451972511357366e-06, "loss": 0.7944026, "num_input_tokens_seen": 47104675, "step": 2197, "time_per_iteration": 2.8049209117889404 }, { "auxiliary_loss_clip": 0.01207034, "auxiliary_loss_mlp": 0.010317, "balance_loss_clip": 1.02406144, "balance_loss_mlp": 1.02203822, "epoch": 0.26429387362472195, "flos": 22674751937280.0, "grad_norm": 1.8228196706823523, "language_loss": 0.85301781, "learning_rate": 3.45143669526373e-06, "loss": 0.87540519, "num_input_tokens_seen": 47124435, "step": 2198, "time_per_iteration": 2.6705691814422607 }, { "auxiliary_loss_clip": 0.01104842, "auxiliary_loss_mlp": 0.01002519, "balance_loss_clip": 0.94835806, "balance_loss_mlp": 0.9990142, "epoch": 0.264414116515361, "flos": 67180534272000.0, "grad_norm": 1.0076723071405032, "language_loss": 0.63222468, "learning_rate": 3.450900658988302e-06, "loss": 0.65329832, "num_input_tokens_seen": 47185985, "step": 2199, "time_per_iteration": 3.247112512588501 }, { "auxiliary_loss_clip": 0.01196652, "auxiliary_loss_mlp": 0.01040841, "balance_loss_clip": 0.9826771, "balance_loss_mlp": 1.03094029, "epoch": 0.2645343594060001, "flos": 25664709997440.0, "grad_norm": 1.8899357901707419, "language_loss": 0.77553189, "learning_rate": 3.450364402612397e-06, "loss": 0.79790688, "num_input_tokens_seen": 47203140, "step": 2200, "time_per_iteration": 2.7063212394714355 }, { "auxiliary_loss_clip": 0.01205259, "auxiliary_loss_mlp": 0.01035703, "balance_loss_clip": 0.98270011, "balance_loss_mlp": 1.02604723, "epoch": 0.26465460229663923, "flos": 22491822948480.0, "grad_norm": 2.0457211279844554, "language_loss": 0.83728182, "learning_rate": 3.449827926217366e-06, "loss": 0.85969144, "num_input_tokens_seen": 47222575, "step": 2201, "time_per_iteration": 4.518396377563477 }, { "auxiliary_loss_clip": 0.0120561, "auxiliary_loss_mlp": 0.01036194, "balance_loss_clip": 0.97731465, "balance_loss_mlp": 1.0258348, "epoch": 0.2647748451872783, "flos": 29388036038400.0, "grad_norm": 1.7562201095030658, "language_loss": 0.81242061, "learning_rate": 3.449291229884591e-06, "loss": 0.83483863, "num_input_tokens_seen": 47243815, "step": 2202, "time_per_iteration": 2.8744771480560303 }, { "auxiliary_loss_clip": 0.01208229, "auxiliary_loss_mlp": 0.01029561, "balance_loss_clip": 0.94446719, "balance_loss_mlp": 1.02014995, "epoch": 0.2648950880779174, "flos": 26797799502720.0, "grad_norm": 1.8356991342918316, "language_loss": 0.86499107, "learning_rate": 3.4487543136954887e-06, "loss": 0.88736904, "num_input_tokens_seen": 47263435, "step": 2203, "time_per_iteration": 3.693739175796509 }, { "auxiliary_loss_clip": 0.01203514, "auxiliary_loss_mlp": 0.01035279, "balance_loss_clip": 0.9443298, "balance_loss_mlp": 1.02603483, "epoch": 0.2650153309685565, "flos": 28841008838400.0, "grad_norm": 1.627664415120511, "language_loss": 0.91126037, "learning_rate": 3.448217177731509e-06, "loss": 0.93364829, "num_input_tokens_seen": 47283920, "step": 2204, "time_per_iteration": 2.864478588104248 }, { "auxiliary_loss_clip": 0.0120298, "auxiliary_loss_mlp": 0.01034437, "balance_loss_clip": 0.98584759, "balance_loss_mlp": 1.02497816, "epoch": 0.26513557385919556, "flos": 20303247271680.0, "grad_norm": 1.8867658998148815, "language_loss": 0.77729213, "learning_rate": 3.4476798220741348e-06, "loss": 0.79966629, "num_input_tokens_seen": 47302800, "step": 2205, "time_per_iteration": 3.789491653442383 }, { "auxiliary_loss_clip": 0.01212787, "auxiliary_loss_mlp": 0.01032802, "balance_loss_clip": 1.06434953, "balance_loss_mlp": 1.02371275, "epoch": 0.26525581674983467, "flos": 17676274101120.0, "grad_norm": 1.5952438753624787, "language_loss": 0.77972555, "learning_rate": 3.4471422468048826e-06, "loss": 0.80218142, "num_input_tokens_seen": 47321525, "step": 2206, "time_per_iteration": 2.7274551391601562 }, { "auxiliary_loss_clip": 0.01199818, "auxiliary_loss_mlp": 0.01031542, "balance_loss_clip": 1.02191663, "balance_loss_mlp": 1.02158844, "epoch": 0.2653760596404738, "flos": 26833746038400.0, "grad_norm": 2.362855930644652, "language_loss": 0.73069489, "learning_rate": 3.4466044520053022e-06, "loss": 0.75300848, "num_input_tokens_seen": 47340530, "step": 2207, "time_per_iteration": 3.6694419384002686 }, { "auxiliary_loss_clip": 0.0119191, "auxiliary_loss_mlp": 0.01030407, "balance_loss_clip": 0.98131883, "balance_loss_mlp": 1.02086473, "epoch": 0.26549630253111284, "flos": 22782160581120.0, "grad_norm": 1.6630361269012979, "language_loss": 0.59808576, "learning_rate": 3.446066437756977e-06, "loss": 0.62030894, "num_input_tokens_seen": 47359735, "step": 2208, "time_per_iteration": 2.961310863494873 }, { "auxiliary_loss_clip": 0.01205998, "auxiliary_loss_mlp": 0.01036629, "balance_loss_clip": 0.98485976, "balance_loss_mlp": 1.02737284, "epoch": 0.26561654542175195, "flos": 23550002640000.0, "grad_norm": 2.301086867791505, "language_loss": 0.7571612, "learning_rate": 3.4455282041415224e-06, "loss": 0.77958751, "num_input_tokens_seen": 47378945, "step": 2209, "time_per_iteration": 2.7277894020080566 }, { "auxiliary_loss_clip": 0.01206435, "auxiliary_loss_mlp": 0.01030578, "balance_loss_clip": 0.94529963, "balance_loss_mlp": 1.02116024, "epoch": 0.265736788312391, "flos": 26906680604160.0, "grad_norm": 2.089187233761193, "language_loss": 0.87449783, "learning_rate": 3.4449897512405894e-06, "loss": 0.89686799, "num_input_tokens_seen": 47398095, "step": 2210, "time_per_iteration": 2.822000741958618 }, { "auxiliary_loss_clip": 0.01190498, "auxiliary_loss_mlp": 0.01126654, "balance_loss_clip": 0.86064869, "balance_loss_mlp": 0.0, "epoch": 0.2658570312030301, "flos": 23477139901440.0, "grad_norm": 15.473837650117128, "language_loss": 0.74737364, "learning_rate": 3.444451079135859e-06, "loss": 0.77054513, "num_input_tokens_seen": 47417605, "step": 2211, "time_per_iteration": 2.863391399383545 }, { "auxiliary_loss_clip": 0.01190098, "auxiliary_loss_mlp": 0.01127008, "balance_loss_clip": 0.90246534, "balance_loss_mlp": 0.0, "epoch": 0.2659772740936692, "flos": 21866402315520.0, "grad_norm": 2.040816439340186, "language_loss": 0.74320859, "learning_rate": 3.4439121879090493e-06, "loss": 0.76637959, "num_input_tokens_seen": 47435385, "step": 2212, "time_per_iteration": 2.8568179607391357 }, { "auxiliary_loss_clip": 0.01212297, "auxiliary_loss_mlp": 0.01032529, "balance_loss_clip": 0.98379493, "balance_loss_mlp": 1.0225811, "epoch": 0.2660975169843083, "flos": 19793100360960.0, "grad_norm": 1.9054959910213913, "language_loss": 0.8334983, "learning_rate": 3.4433730776419082e-06, "loss": 0.85594654, "num_input_tokens_seen": 47454310, "step": 2213, "time_per_iteration": 2.7329509258270264 }, { "auxiliary_loss_clip": 0.0121004, "auxiliary_loss_mlp": 0.01126832, "balance_loss_clip": 1.02101564, "balance_loss_mlp": 0.0, "epoch": 0.2662177598749474, "flos": 29018981750400.0, "grad_norm": 2.6919157479236784, "language_loss": 0.80497003, "learning_rate": 3.4428337484162183e-06, "loss": 0.82833868, "num_input_tokens_seen": 47475120, "step": 2214, "time_per_iteration": 2.804258108139038 }, { "auxiliary_loss_clip": 0.01201504, "auxiliary_loss_mlp": 0.01038172, "balance_loss_clip": 0.98328018, "balance_loss_mlp": 1.02812839, "epoch": 0.2663380027655865, "flos": 21762549118080.0, "grad_norm": 1.8084565761319522, "language_loss": 0.84317034, "learning_rate": 3.442294200313797e-06, "loss": 0.86556709, "num_input_tokens_seen": 47493150, "step": 2215, "time_per_iteration": 2.765552520751953 }, { "auxiliary_loss_clip": 0.01101393, "auxiliary_loss_mlp": 0.01018219, "balance_loss_clip": 1.02643228, "balance_loss_mlp": 1.01469028, "epoch": 0.26645824565622556, "flos": 66980333819520.0, "grad_norm": 0.8468182209610944, "language_loss": 0.52726865, "learning_rate": 3.4417544334164916e-06, "loss": 0.54846478, "num_input_tokens_seen": 47557295, "step": 2216, "time_per_iteration": 3.2334909439086914 }, { "auxiliary_loss_clip": 0.01200536, "auxiliary_loss_mlp": 0.01034322, "balance_loss_clip": 0.94439697, "balance_loss_mlp": 1.02473748, "epoch": 0.26657848854686467, "flos": 25264198373760.0, "grad_norm": 1.6166670679131998, "language_loss": 0.77304566, "learning_rate": 3.4412144478061854e-06, "loss": 0.79539418, "num_input_tokens_seen": 47579705, "step": 2217, "time_per_iteration": 2.8569414615631104 }, { "auxiliary_loss_clip": 0.01194127, "auxiliary_loss_mlp": 0.01038419, "balance_loss_clip": 0.78849757, "balance_loss_mlp": 1.02785683, "epoch": 0.2666987314375038, "flos": 23696769611520.0, "grad_norm": 2.2918407374836205, "language_loss": 0.75420159, "learning_rate": 3.4406742435647925e-06, "loss": 0.77652705, "num_input_tokens_seen": 47599770, "step": 2218, "time_per_iteration": 3.1000351905822754 }, { "auxiliary_loss_clip": 0.01209214, "auxiliary_loss_mlp": 0.01041207, "balance_loss_clip": 1.0247848, "balance_loss_mlp": 1.03156257, "epoch": 0.26681897432814283, "flos": 27048958375680.0, "grad_norm": 1.7236040416427687, "language_loss": 0.79318106, "learning_rate": 3.440133820774263e-06, "loss": 0.81568527, "num_input_tokens_seen": 47619580, "step": 2219, "time_per_iteration": 2.999743700027466 }, { "auxiliary_loss_clip": 0.01213397, "auxiliary_loss_mlp": 0.01034553, "balance_loss_clip": 0.98505008, "balance_loss_mlp": 1.02377009, "epoch": 0.26693921721878194, "flos": 28985944216320.0, "grad_norm": 1.9750962836176396, "language_loss": 0.81755912, "learning_rate": 3.439593179516578e-06, "loss": 0.8400386, "num_input_tokens_seen": 47639490, "step": 2220, "time_per_iteration": 2.8750534057617188 }, { "auxiliary_loss_clip": 0.01213855, "auxiliary_loss_mlp": 0.01036665, "balance_loss_clip": 0.98521519, "balance_loss_mlp": 1.02567387, "epoch": 0.26705946010942105, "flos": 21507834798720.0, "grad_norm": 2.8481992146184654, "language_loss": 0.80722064, "learning_rate": 3.4390523198737524e-06, "loss": 0.82972586, "num_input_tokens_seen": 47658650, "step": 2221, "time_per_iteration": 2.8706490993499756 }, { "auxiliary_loss_clip": 0.0121222, "auxiliary_loss_mlp": 0.011264, "balance_loss_clip": 1.06222606, "balance_loss_mlp": 0.0, "epoch": 0.2671797030000601, "flos": 21471277731840.0, "grad_norm": 1.6860951399749857, "language_loss": 0.73822689, "learning_rate": 3.4385112419278333e-06, "loss": 0.76161301, "num_input_tokens_seen": 47679875, "step": 2222, "time_per_iteration": 2.747596025466919 }, { "auxiliary_loss_clip": 0.01105743, "auxiliary_loss_mlp": 0.01009968, "balance_loss_clip": 0.98963523, "balance_loss_mlp": 1.00651073, "epoch": 0.2672999458906992, "flos": 64189929767040.0, "grad_norm": 0.7891079115406868, "language_loss": 0.64850724, "learning_rate": 3.4379699457609033e-06, "loss": 0.66966432, "num_input_tokens_seen": 47737700, "step": 2223, "time_per_iteration": 3.140472650527954 }, { "auxiliary_loss_clip": 0.01196764, "auxiliary_loss_mlp": 0.0103347, "balance_loss_clip": 0.98103833, "balance_loss_mlp": 1.02378452, "epoch": 0.26742018878133833, "flos": 16909042573440.0, "grad_norm": 1.78649631640306, "language_loss": 0.89676642, "learning_rate": 3.4374284314550755e-06, "loss": 0.91906881, "num_input_tokens_seen": 47756740, "step": 2224, "time_per_iteration": 2.7592086791992188 }, { "auxiliary_loss_clip": 0.0121112, "auxiliary_loss_mlp": 0.0103352, "balance_loss_clip": 1.06278563, "balance_loss_mlp": 1.02323794, "epoch": 0.2675404316719774, "flos": 20667560964480.0, "grad_norm": 1.71380693791751, "language_loss": 0.8119505, "learning_rate": 3.436886699092498e-06, "loss": 0.8343969, "num_input_tokens_seen": 47775255, "step": 2225, "time_per_iteration": 2.727023124694824 }, { "auxiliary_loss_clip": 0.01214374, "auxiliary_loss_mlp": 0.01039147, "balance_loss_clip": 1.06178737, "balance_loss_mlp": 1.02876949, "epoch": 0.2676606745626165, "flos": 17485013157120.0, "grad_norm": 2.664054966503023, "language_loss": 0.71594667, "learning_rate": 3.4363447487553502e-06, "loss": 0.73848188, "num_input_tokens_seen": 47788570, "step": 2226, "time_per_iteration": 2.652543783187866 }, { "auxiliary_loss_clip": 0.01205389, "auxiliary_loss_mlp": 0.01032368, "balance_loss_clip": 0.98616374, "balance_loss_mlp": 1.02272451, "epoch": 0.26778091745325555, "flos": 27852675143040.0, "grad_norm": 1.9162753157081192, "language_loss": 0.78429484, "learning_rate": 3.4358025805258455e-06, "loss": 0.80667239, "num_input_tokens_seen": 47808275, "step": 2227, "time_per_iteration": 3.960226535797119 }, { "auxiliary_loss_clip": 0.01206737, "auxiliary_loss_mlp": 0.01040449, "balance_loss_clip": 0.90414059, "balance_loss_mlp": 1.03004253, "epoch": 0.26790116034389466, "flos": 20955995176320.0, "grad_norm": 1.8214104559277116, "language_loss": 0.83236885, "learning_rate": 3.435260194486232e-06, "loss": 0.85484076, "num_input_tokens_seen": 47826245, "step": 2228, "time_per_iteration": 2.846723794937134 }, { "auxiliary_loss_clip": 0.01209074, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 0.98532867, "balance_loss_mlp": 1.02412868, "epoch": 0.2680214032345338, "flos": 18040659621120.0, "grad_norm": 2.050344708547424, "language_loss": 0.82083189, "learning_rate": 3.4347175907187875e-06, "loss": 0.84326744, "num_input_tokens_seen": 47843235, "step": 2229, "time_per_iteration": 2.789783000946045 }, { "auxiliary_loss_clip": 0.01210106, "auxiliary_loss_mlp": 0.01041029, "balance_loss_clip": 1.024611, "balance_loss_mlp": 1.03143311, "epoch": 0.26814164612517283, "flos": 22419427086720.0, "grad_norm": 1.815458480799987, "language_loss": 0.88316351, "learning_rate": 3.4341747693058254e-06, "loss": 0.90567487, "num_input_tokens_seen": 47861710, "step": 2230, "time_per_iteration": 3.743311643600464 }, { "auxiliary_loss_clip": 0.01189435, "auxiliary_loss_mlp": 0.01029881, "balance_loss_clip": 0.7906059, "balance_loss_mlp": 1.02015924, "epoch": 0.26826188901581194, "flos": 35627371159680.0, "grad_norm": 1.6308478500909276, "language_loss": 0.77118421, "learning_rate": 3.4336317303296916e-06, "loss": 0.79337734, "num_input_tokens_seen": 47882685, "step": 2231, "time_per_iteration": 3.063040256500244 }, { "auxiliary_loss_clip": 0.0120809, "auxiliary_loss_mlp": 0.01036462, "balance_loss_clip": 1.02399397, "balance_loss_mlp": 1.02724755, "epoch": 0.26838213190645105, "flos": 17639788861440.0, "grad_norm": 2.3080017323999935, "language_loss": 0.75365591, "learning_rate": 3.4330884738727635e-06, "loss": 0.77610147, "num_input_tokens_seen": 47900860, "step": 2232, "time_per_iteration": 3.973482847213745 }, { "auxiliary_loss_clip": 0.01195016, "auxiliary_loss_mlp": 0.01041473, "balance_loss_clip": 0.90711886, "balance_loss_mlp": 1.03166771, "epoch": 0.2685023747970901, "flos": 22674823764480.0, "grad_norm": 1.7638625669471528, "language_loss": 0.70664322, "learning_rate": 3.4325450000174535e-06, "loss": 0.72900814, "num_input_tokens_seen": 47917500, "step": 2233, "time_per_iteration": 2.904266357421875 }, { "auxiliary_loss_clip": 0.01190631, "auxiliary_loss_mlp": 0.01039088, "balance_loss_clip": 0.90106797, "balance_loss_mlp": 1.02897942, "epoch": 0.2686226176877292, "flos": 20120533764480.0, "grad_norm": 3.8151673673898854, "language_loss": 0.74417657, "learning_rate": 3.4320013088462067e-06, "loss": 0.76647377, "num_input_tokens_seen": 47934860, "step": 2234, "time_per_iteration": 3.7527801990509033 }, { "auxiliary_loss_clip": 0.01209085, "auxiliary_loss_mlp": 0.01038929, "balance_loss_clip": 0.94408834, "balance_loss_mlp": 1.02928483, "epoch": 0.2687428605783683, "flos": 21872040750720.0, "grad_norm": 1.5179487156740175, "language_loss": 0.81840873, "learning_rate": 3.431457400441499e-06, "loss": 0.84088892, "num_input_tokens_seen": 47955255, "step": 2235, "time_per_iteration": 2.962275981903076 }, { "auxiliary_loss_clip": 0.01093953, "auxiliary_loss_mlp": 0.01003282, "balance_loss_clip": 0.8338446, "balance_loss_mlp": 0.99992001, "epoch": 0.2688631034690074, "flos": 69943320766080.0, "grad_norm": 0.9110523772065345, "language_loss": 0.60941386, "learning_rate": 3.4309132748858424e-06, "loss": 0.63038623, "num_input_tokens_seen": 48016245, "step": 2236, "time_per_iteration": 3.478878974914551 }, { "auxiliary_loss_clip": 0.01204804, "auxiliary_loss_mlp": 0.01037201, "balance_loss_clip": 1.02342129, "balance_loss_mlp": 1.02794456, "epoch": 0.2689833463596465, "flos": 22856639431680.0, "grad_norm": 1.7493382931789319, "language_loss": 0.83666581, "learning_rate": 3.430368932261779e-06, "loss": 0.85908586, "num_input_tokens_seen": 48036600, "step": 2237, "time_per_iteration": 2.859805107116699 }, { "auxiliary_loss_clip": 0.01205357, "auxiliary_loss_mlp": 0.0103248, "balance_loss_clip": 0.98433453, "balance_loss_mlp": 1.02277601, "epoch": 0.2691035892502856, "flos": 17200242132480.0, "grad_norm": 1.9175918674500723, "language_loss": 0.74736977, "learning_rate": 3.429824372651886e-06, "loss": 0.76974815, "num_input_tokens_seen": 48054750, "step": 2238, "time_per_iteration": 2.7947919368743896 }, { "auxiliary_loss_clip": 0.01212098, "auxiliary_loss_mlp": 0.01033047, "balance_loss_clip": 0.90708762, "balance_loss_mlp": 1.02312231, "epoch": 0.26922383214092466, "flos": 17747484814080.0, "grad_norm": 1.8967976062378902, "language_loss": 0.83224642, "learning_rate": 3.4292795961387732e-06, "loss": 0.85469782, "num_input_tokens_seen": 48072650, "step": 2239, "time_per_iteration": 2.8184633255004883 }, { "auxiliary_loss_clip": 0.01210387, "auxiliary_loss_mlp": 0.01036751, "balance_loss_clip": 1.06056654, "balance_loss_mlp": 1.02677894, "epoch": 0.26934407503156377, "flos": 16173376122240.0, "grad_norm": 2.0991371360245235, "language_loss": 0.869317, "learning_rate": 3.4287346028050818e-06, "loss": 0.89178842, "num_input_tokens_seen": 48088720, "step": 2240, "time_per_iteration": 2.7064013481140137 }, { "auxiliary_loss_clip": 0.01206076, "auxiliary_loss_mlp": 0.01034276, "balance_loss_clip": 0.98465931, "balance_loss_mlp": 1.02507257, "epoch": 0.2694643179222028, "flos": 23732895715200.0, "grad_norm": 1.5188989725225912, "language_loss": 0.79879951, "learning_rate": 3.4281893927334866e-06, "loss": 0.82120305, "num_input_tokens_seen": 48108630, "step": 2241, "time_per_iteration": 2.742399215698242 }, { "auxiliary_loss_clip": 0.01208459, "auxiliary_loss_mlp": 0.01036598, "balance_loss_clip": 1.02266121, "balance_loss_mlp": 1.02737069, "epoch": 0.26958456081284193, "flos": 24718140840960.0, "grad_norm": 1.7890118617440696, "language_loss": 0.75239456, "learning_rate": 3.4276439660066963e-06, "loss": 0.77484506, "num_input_tokens_seen": 48128330, "step": 2242, "time_per_iteration": 2.7729227542877197 }, { "auxiliary_loss_clip": 0.01208331, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.06174862, "balance_loss_mlp": 1.0252012, "epoch": 0.26970480370348104, "flos": 18112588606080.0, "grad_norm": 2.4253821291792215, "language_loss": 0.84028995, "learning_rate": 3.427098322707452e-06, "loss": 0.86271787, "num_input_tokens_seen": 48144295, "step": 2243, "time_per_iteration": 2.655189037322998 }, { "auxiliary_loss_clip": 0.01210567, "auxiliary_loss_mlp": 0.01047515, "balance_loss_clip": 1.02514136, "balance_loss_mlp": 1.03788304, "epoch": 0.2698250465941201, "flos": 10816546250880.0, "grad_norm": 1.8656633146000465, "language_loss": 0.89289939, "learning_rate": 3.426552462918526e-06, "loss": 0.91548026, "num_input_tokens_seen": 48162230, "step": 2244, "time_per_iteration": 2.763049602508545 }, { "auxiliary_loss_clip": 0.01210712, "auxiliary_loss_mlp": 0.01039157, "balance_loss_clip": 1.06435418, "balance_loss_mlp": 1.02976322, "epoch": 0.2699452894847592, "flos": 17308117653120.0, "grad_norm": 2.0885827912964694, "language_loss": 0.7351293, "learning_rate": 3.426006386722726e-06, "loss": 0.75762796, "num_input_tokens_seen": 48180290, "step": 2245, "time_per_iteration": 2.747924327850342 }, { "auxiliary_loss_clip": 0.01210982, "auxiliary_loss_mlp": 0.01035122, "balance_loss_clip": 0.95137858, "balance_loss_mlp": 1.02542448, "epoch": 0.2700655323753983, "flos": 18078150441600.0, "grad_norm": 2.5636393297952087, "language_loss": 0.92310011, "learning_rate": 3.4254600942028914e-06, "loss": 0.94556111, "num_input_tokens_seen": 48198165, "step": 2246, "time_per_iteration": 2.8180859088897705 }, { "auxiliary_loss_clip": 0.01206505, "auxiliary_loss_mlp": 0.01030289, "balance_loss_clip": 0.98756021, "balance_loss_mlp": 1.02087188, "epoch": 0.2701857752660374, "flos": 18186636493440.0, "grad_norm": 2.6110725874528256, "language_loss": 0.82210648, "learning_rate": 3.424913585441893e-06, "loss": 0.84447443, "num_input_tokens_seen": 48216000, "step": 2247, "time_per_iteration": 2.7517142295837402 }, { "auxiliary_loss_clip": 0.01206416, "auxiliary_loss_mlp": 0.01042288, "balance_loss_clip": 1.02372134, "balance_loss_mlp": 1.03331208, "epoch": 0.2703060181566765, "flos": 16319496648960.0, "grad_norm": 1.9975820311764305, "language_loss": 0.87182671, "learning_rate": 3.4243668605226374e-06, "loss": 0.89431375, "num_input_tokens_seen": 48233025, "step": 2248, "time_per_iteration": 2.6823909282684326 }, { "auxiliary_loss_clip": 0.01206272, "auxiliary_loss_mlp": 0.01036606, "balance_loss_clip": 0.94724882, "balance_loss_mlp": 1.02633619, "epoch": 0.2704262610473156, "flos": 19572357329280.0, "grad_norm": 2.1706944149252463, "language_loss": 0.82700455, "learning_rate": 3.423819919528061e-06, "loss": 0.84943336, "num_input_tokens_seen": 48251110, "step": 2249, "time_per_iteration": 2.8131089210510254 }, { "auxiliary_loss_clip": 0.01204852, "auxiliary_loss_mlp": 0.0103526, "balance_loss_clip": 0.90276319, "balance_loss_mlp": 1.02543116, "epoch": 0.27054650393795465, "flos": 20740746925440.0, "grad_norm": 1.6648014406015428, "language_loss": 0.78301007, "learning_rate": 3.4232727625411355e-06, "loss": 0.80541116, "num_input_tokens_seen": 48270215, "step": 2250, "time_per_iteration": 2.7739295959472656 }, { "auxiliary_loss_clip": 0.01193517, "auxiliary_loss_mlp": 0.01031983, "balance_loss_clip": 0.86699247, "balance_loss_mlp": 1.02201104, "epoch": 0.27066674682859376, "flos": 18658322916480.0, "grad_norm": 1.7530765979186083, "language_loss": 0.86393869, "learning_rate": 3.4227253896448626e-06, "loss": 0.88619375, "num_input_tokens_seen": 48288075, "step": 2251, "time_per_iteration": 2.966953754425049 }, { "auxiliary_loss_clip": 0.01208458, "auxiliary_loss_mlp": 0.01030638, "balance_loss_clip": 1.06132698, "balance_loss_mlp": 1.02146494, "epoch": 0.2707869897192329, "flos": 23002759958400.0, "grad_norm": 2.1924567465546274, "language_loss": 0.82460696, "learning_rate": 3.42217780092228e-06, "loss": 0.84699786, "num_input_tokens_seen": 48306415, "step": 2252, "time_per_iteration": 3.7514569759368896 }, { "auxiliary_loss_clip": 0.0110507, "auxiliary_loss_mlp": 0.01004029, "balance_loss_clip": 0.90974438, "balance_loss_mlp": 1.00071478, "epoch": 0.27090723260987193, "flos": 58323240293760.0, "grad_norm": 0.806645193581211, "language_loss": 0.60350442, "learning_rate": 3.421629996456456e-06, "loss": 0.6245954, "num_input_tokens_seen": 48365035, "step": 2253, "time_per_iteration": 3.268483877182007 }, { "auxiliary_loss_clip": 0.01203467, "auxiliary_loss_mlp": 0.01033282, "balance_loss_clip": 1.02108383, "balance_loss_mlp": 1.02356637, "epoch": 0.27102747550051104, "flos": 11984540797440.0, "grad_norm": 1.9449035894786153, "language_loss": 0.82822764, "learning_rate": 3.421081976330491e-06, "loss": 0.85059512, "num_input_tokens_seen": 48383550, "step": 2254, "time_per_iteration": 2.7599217891693115 }, { "auxiliary_loss_clip": 0.01202653, "auxiliary_loss_mlp": 0.01032769, "balance_loss_clip": 0.98390663, "balance_loss_mlp": 1.02283251, "epoch": 0.27114771839115015, "flos": 19900401264000.0, "grad_norm": 2.326379567580216, "language_loss": 0.87924516, "learning_rate": 3.4205337406275207e-06, "loss": 0.90159941, "num_input_tokens_seen": 48403670, "step": 2255, "time_per_iteration": 3.8132944107055664 }, { "auxiliary_loss_clip": 0.01206523, "auxiliary_loss_mlp": 0.01031882, "balance_loss_clip": 1.05994463, "balance_loss_mlp": 1.02247047, "epoch": 0.2712679612817892, "flos": 18331966920960.0, "grad_norm": 2.330961458624943, "language_loss": 0.753838, "learning_rate": 3.4199852894307114e-06, "loss": 0.77622211, "num_input_tokens_seen": 48420420, "step": 2256, "time_per_iteration": 2.6998753547668457 }, { "auxiliary_loss_clip": 0.01205835, "auxiliary_loss_mlp": 0.01041649, "balance_loss_clip": 0.86999071, "balance_loss_mlp": 1.03167701, "epoch": 0.2713882041724283, "flos": 24460302038400.0, "grad_norm": 1.9525466876296995, "language_loss": 0.78972995, "learning_rate": 3.419436622823262e-06, "loss": 0.81220484, "num_input_tokens_seen": 48441140, "step": 2257, "time_per_iteration": 3.8152706623077393 }, { "auxiliary_loss_clip": 0.01203383, "auxiliary_loss_mlp": 0.01035484, "balance_loss_clip": 0.984532, "balance_loss_mlp": 1.02639413, "epoch": 0.27150844706306737, "flos": 23039317025280.0, "grad_norm": 1.6332275785119594, "language_loss": 0.74426943, "learning_rate": 3.4188877408884063e-06, "loss": 0.76665807, "num_input_tokens_seen": 48461845, "step": 2258, "time_per_iteration": 2.8196496963500977 }, { "auxiliary_loss_clip": 0.01201724, "auxiliary_loss_mlp": 0.01036914, "balance_loss_clip": 0.98510003, "balance_loss_mlp": 1.02729952, "epoch": 0.2716286899537065, "flos": 22563644192640.0, "grad_norm": 2.6326593186087197, "language_loss": 0.65299094, "learning_rate": 3.4183386437094088e-06, "loss": 0.67537731, "num_input_tokens_seen": 48478510, "step": 2259, "time_per_iteration": 3.67911958694458 }, { "auxiliary_loss_clip": 0.01205319, "auxiliary_loss_mlp": 0.01032108, "balance_loss_clip": 0.98267645, "balance_loss_mlp": 1.02283382, "epoch": 0.2717489328443456, "flos": 13115044523520.0, "grad_norm": 1.9948356586058065, "language_loss": 0.81858945, "learning_rate": 3.417789331369565e-06, "loss": 0.84096366, "num_input_tokens_seen": 48494300, "step": 2260, "time_per_iteration": 2.7684051990509033 }, { "auxiliary_loss_clip": 0.01212555, "auxiliary_loss_mlp": 0.01039679, "balance_loss_clip": 1.06330085, "balance_loss_mlp": 1.02993333, "epoch": 0.27186917573498465, "flos": 29278688060160.0, "grad_norm": 2.169740407732504, "language_loss": 0.91244185, "learning_rate": 3.4172398039522088e-06, "loss": 0.93496418, "num_input_tokens_seen": 48515585, "step": 2261, "time_per_iteration": 2.7383079528808594 }, { "auxiliary_loss_clip": 0.01207858, "auxiliary_loss_mlp": 0.01035489, "balance_loss_clip": 1.02309072, "balance_loss_mlp": 1.02617896, "epoch": 0.27198941862562376, "flos": 26032220000640.0, "grad_norm": 1.7015354189568386, "language_loss": 0.79726261, "learning_rate": 3.4166900615407e-06, "loss": 0.81969607, "num_input_tokens_seen": 48533500, "step": 2262, "time_per_iteration": 2.8050103187561035 }, { "auxiliary_loss_clip": 0.01208302, "auxiliary_loss_mlp": 0.01031879, "balance_loss_clip": 1.02467871, "balance_loss_mlp": 1.02264071, "epoch": 0.27210966151626287, "flos": 32780983760640.0, "grad_norm": 1.810073029856845, "language_loss": 0.75258511, "learning_rate": 3.416140104218436e-06, "loss": 0.77498692, "num_input_tokens_seen": 48552865, "step": 2263, "time_per_iteration": 2.8310413360595703 }, { "auxiliary_loss_clip": 0.01102926, "auxiliary_loss_mlp": 0.0112073, "balance_loss_clip": 0.95294297, "balance_loss_mlp": 0.0, "epoch": 0.2722299044069019, "flos": 65471043219840.0, "grad_norm": 0.8502954202547464, "language_loss": 0.69702321, "learning_rate": 3.4155899320688437e-06, "loss": 0.7192598, "num_input_tokens_seen": 48618940, "step": 2264, "time_per_iteration": 3.362767457962036 }, { "auxiliary_loss_clip": 0.01204142, "auxiliary_loss_mlp": 0.0103477, "balance_loss_clip": 0.87062156, "balance_loss_mlp": 1.02534604, "epoch": 0.27235014729754103, "flos": 15334143782400.0, "grad_norm": 2.4927595166308207, "language_loss": 0.73787189, "learning_rate": 3.415039545175384e-06, "loss": 0.760261, "num_input_tokens_seen": 48634665, "step": 2265, "time_per_iteration": 2.8763625621795654 }, { "auxiliary_loss_clip": 0.01210141, "auxiliary_loss_mlp": 0.01034359, "balance_loss_clip": 1.02419424, "balance_loss_mlp": 1.02548981, "epoch": 0.27247039018818014, "flos": 21872363973120.0, "grad_norm": 2.0065799894966623, "language_loss": 0.65350997, "learning_rate": 3.414488943621551e-06, "loss": 0.67595494, "num_input_tokens_seen": 48653330, "step": 2266, "time_per_iteration": 2.6943275928497314 }, { "auxiliary_loss_clip": 0.01206534, "auxiliary_loss_mlp": 0.01031886, "balance_loss_clip": 1.02346361, "balance_loss_mlp": 1.02248049, "epoch": 0.2725906330788192, "flos": 18695490514560.0, "grad_norm": 4.924604489561619, "language_loss": 0.74051905, "learning_rate": 3.41393812749087e-06, "loss": 0.76290321, "num_input_tokens_seen": 48671375, "step": 2267, "time_per_iteration": 2.7531888484954834 }, { "auxiliary_loss_clip": 0.01204352, "auxiliary_loss_mlp": 0.01041684, "balance_loss_clip": 0.98664862, "balance_loss_mlp": 1.03249335, "epoch": 0.2727108759694583, "flos": 17886099398400.0, "grad_norm": 3.055708789425653, "language_loss": 0.71404701, "learning_rate": 3.4133870968668984e-06, "loss": 0.73650742, "num_input_tokens_seen": 48686175, "step": 2268, "time_per_iteration": 2.7513275146484375 }, { "auxiliary_loss_clip": 0.01207085, "auxiliary_loss_mlp": 0.01034795, "balance_loss_clip": 0.98604929, "balance_loss_mlp": 1.02482295, "epoch": 0.2728311188600974, "flos": 24461666755200.0, "grad_norm": 1.7567837552567662, "language_loss": 0.78601885, "learning_rate": 3.412835851833229e-06, "loss": 0.80843771, "num_input_tokens_seen": 48708370, "step": 2269, "time_per_iteration": 2.8303563594818115 }, { "auxiliary_loss_clip": 0.01205792, "auxiliary_loss_mlp": 0.01029315, "balance_loss_clip": 1.02343178, "balance_loss_mlp": 1.02006471, "epoch": 0.2729513617507365, "flos": 30993314757120.0, "grad_norm": 1.6794268230907894, "language_loss": 0.77808177, "learning_rate": 3.4122843924734834e-06, "loss": 0.80043286, "num_input_tokens_seen": 48730670, "step": 2270, "time_per_iteration": 2.7931671142578125 }, { "auxiliary_loss_clip": 0.01198284, "auxiliary_loss_mlp": 0.01042029, "balance_loss_clip": 0.98291278, "balance_loss_mlp": 1.0322063, "epoch": 0.2730716046413756, "flos": 19094637421440.0, "grad_norm": 1.910203908738862, "language_loss": 0.87994659, "learning_rate": 3.411732718871319e-06, "loss": 0.90234971, "num_input_tokens_seen": 48746510, "step": 2271, "time_per_iteration": 2.7714200019836426 }, { "auxiliary_loss_clip": 0.01212252, "auxiliary_loss_mlp": 0.01034111, "balance_loss_clip": 1.06861126, "balance_loss_mlp": 1.02487278, "epoch": 0.27319184753201464, "flos": 26944566474240.0, "grad_norm": 1.5190230912051035, "language_loss": 0.78732371, "learning_rate": 3.4111808311104227e-06, "loss": 0.80978739, "num_input_tokens_seen": 48768825, "step": 2272, "time_per_iteration": 2.864046335220337 }, { "auxiliary_loss_clip": 0.01209275, "auxiliary_loss_mlp": 0.0104062, "balance_loss_clip": 0.98275763, "balance_loss_mlp": 1.03078532, "epoch": 0.27331209042265375, "flos": 31759828012800.0, "grad_norm": 1.8248262896698275, "language_loss": 0.69483221, "learning_rate": 3.410628729274517e-06, "loss": 0.71733117, "num_input_tokens_seen": 48790345, "step": 2273, "time_per_iteration": 2.9991114139556885 }, { "auxiliary_loss_clip": 0.01202966, "auxiliary_loss_mlp": 0.0112653, "balance_loss_clip": 0.98335713, "balance_loss_mlp": 0.0, "epoch": 0.27343233331329286, "flos": 25739081107200.0, "grad_norm": 1.8054731746260075, "language_loss": 0.82217205, "learning_rate": 3.4100764134473546e-06, "loss": 0.84546697, "num_input_tokens_seen": 48809630, "step": 2274, "time_per_iteration": 2.8532121181488037 }, { "auxiliary_loss_clip": 0.01206912, "auxiliary_loss_mlp": 0.01028322, "balance_loss_clip": 1.06282282, "balance_loss_mlp": 1.0192976, "epoch": 0.2735525762039319, "flos": 24389414547840.0, "grad_norm": 2.4273998710823768, "language_loss": 0.84690142, "learning_rate": 3.4095238837127215e-06, "loss": 0.86925375, "num_input_tokens_seen": 48828770, "step": 2275, "time_per_iteration": 2.7715067863464355 }, { "auxiliary_loss_clip": 0.011986, "auxiliary_loss_mlp": 0.01040075, "balance_loss_clip": 0.94312966, "balance_loss_mlp": 1.03091967, "epoch": 0.27367281909457103, "flos": 14465357527680.0, "grad_norm": 2.0898230806010814, "language_loss": 0.79375434, "learning_rate": 3.4089711401544355e-06, "loss": 0.81614107, "num_input_tokens_seen": 48846365, "step": 2276, "time_per_iteration": 2.8703200817108154 }, { "auxiliary_loss_clip": 0.0120129, "auxiliary_loss_mlp": 0.01031158, "balance_loss_clip": 1.01824939, "balance_loss_mlp": 1.021788, "epoch": 0.27379306198521014, "flos": 23476996247040.0, "grad_norm": 2.324513850480093, "language_loss": 0.67901659, "learning_rate": 3.4084181828563486e-06, "loss": 0.70134103, "num_input_tokens_seen": 48863085, "step": 2277, "time_per_iteration": 2.7726387977600098 }, { "auxiliary_loss_clip": 0.01196169, "auxiliary_loss_mlp": 0.01038399, "balance_loss_clip": 0.90306336, "balance_loss_mlp": 1.02860618, "epoch": 0.2739133048758492, "flos": 17458152762240.0, "grad_norm": 1.6159710428301837, "language_loss": 0.70680499, "learning_rate": 3.4078650119023428e-06, "loss": 0.72915071, "num_input_tokens_seen": 48881400, "step": 2278, "time_per_iteration": 3.803751230239868 }, { "auxiliary_loss_clip": 0.01194223, "auxiliary_loss_mlp": 0.01038407, "balance_loss_clip": 0.86139834, "balance_loss_mlp": 1.02858472, "epoch": 0.2740335477664883, "flos": 19273113123840.0, "grad_norm": 1.9817820378281277, "language_loss": 0.74225807, "learning_rate": 3.4073116273763337e-06, "loss": 0.76458442, "num_input_tokens_seen": 48895845, "step": 2279, "time_per_iteration": 2.819650888442993 }, { "auxiliary_loss_clip": 0.01207149, "auxiliary_loss_mlp": 0.01034577, "balance_loss_clip": 0.98173583, "balance_loss_mlp": 1.02549887, "epoch": 0.2741537906571274, "flos": 26104723603200.0, "grad_norm": 1.9370489565816411, "language_loss": 0.81340176, "learning_rate": 3.40675802936227e-06, "loss": 0.83581901, "num_input_tokens_seen": 48916630, "step": 2280, "time_per_iteration": 2.842355728149414 }, { "auxiliary_loss_clip": 0.01194695, "auxiliary_loss_mlp": 0.01029579, "balance_loss_clip": 0.98290682, "balance_loss_mlp": 1.01961339, "epoch": 0.27427403354776647, "flos": 34164190644480.0, "grad_norm": 1.9924123777032519, "language_loss": 0.72195804, "learning_rate": 3.4062042179441318e-06, "loss": 0.74420083, "num_input_tokens_seen": 48937100, "step": 2281, "time_per_iteration": 3.775269031524658 }, { "auxiliary_loss_clip": 0.01204594, "auxiliary_loss_mlp": 0.01036836, "balance_loss_clip": 1.02406669, "balance_loss_mlp": 1.02760303, "epoch": 0.2743942764384056, "flos": 18766988536320.0, "grad_norm": 1.8258884439116658, "language_loss": 0.80558479, "learning_rate": 3.4056501932059314e-06, "loss": 0.82799911, "num_input_tokens_seen": 48955175, "step": 2282, "time_per_iteration": 2.7307796478271484 }, { "auxiliary_loss_clip": 0.01099777, "auxiliary_loss_mlp": 0.01007477, "balance_loss_clip": 1.02584171, "balance_loss_mlp": 1.00446129, "epoch": 0.2745145193290447, "flos": 64904048058240.0, "grad_norm": 0.782425265247041, "language_loss": 0.58146775, "learning_rate": 3.405095955231715e-06, "loss": 0.60254025, "num_input_tokens_seen": 49006830, "step": 2283, "time_per_iteration": 4.144571542739868 }, { "auxiliary_loss_clip": 0.01208023, "auxiliary_loss_mlp": 0.01030808, "balance_loss_clip": 1.01985502, "balance_loss_mlp": 1.02209949, "epoch": 0.27463476221968375, "flos": 16136926796160.0, "grad_norm": 11.56305535777128, "language_loss": 0.94360435, "learning_rate": 3.4045415041055585e-06, "loss": 0.96599269, "num_input_tokens_seen": 49022470, "step": 2284, "time_per_iteration": 2.704148292541504 }, { "auxiliary_loss_clip": 0.0120694, "auxiliary_loss_mlp": 0.01032092, "balance_loss_clip": 0.98243183, "balance_loss_mlp": 1.02223945, "epoch": 0.27475500511032286, "flos": 10376712213120.0, "grad_norm": 2.6351740194886877, "language_loss": 0.78612912, "learning_rate": 3.4039868399115728e-06, "loss": 0.80851942, "num_input_tokens_seen": 49037110, "step": 2285, "time_per_iteration": 3.688283920288086 }, { "auxiliary_loss_clip": 0.01203317, "auxiliary_loss_mlp": 0.01031392, "balance_loss_clip": 0.8720572, "balance_loss_mlp": 1.02181363, "epoch": 0.27487524800096197, "flos": 17311062568320.0, "grad_norm": 1.8096740431158096, "language_loss": 0.80534601, "learning_rate": 3.4034319627339003e-06, "loss": 0.8276931, "num_input_tokens_seen": 49053975, "step": 2286, "time_per_iteration": 2.8836238384246826 }, { "auxiliary_loss_clip": 0.01208656, "auxiliary_loss_mlp": 0.01039513, "balance_loss_clip": 0.98564577, "balance_loss_mlp": 1.02997637, "epoch": 0.274995490891601, "flos": 27120205002240.0, "grad_norm": 2.3931138988466034, "language_loss": 0.69397473, "learning_rate": 3.402876872656715e-06, "loss": 0.71645641, "num_input_tokens_seen": 49072295, "step": 2287, "time_per_iteration": 2.8993759155273438 }, { "auxiliary_loss_clip": 0.01205192, "auxiliary_loss_mlp": 0.01034177, "balance_loss_clip": 0.98528218, "balance_loss_mlp": 1.024593, "epoch": 0.27511573378224013, "flos": 23436093634560.0, "grad_norm": 1.9552447396442891, "language_loss": 0.89817834, "learning_rate": 3.402321569764223e-06, "loss": 0.92057198, "num_input_tokens_seen": 49091600, "step": 2288, "time_per_iteration": 2.7694194316864014 }, { "auxiliary_loss_clip": 0.0120382, "auxiliary_loss_mlp": 0.01126647, "balance_loss_clip": 0.90523785, "balance_loss_mlp": 0.0, "epoch": 0.2752359766728792, "flos": 16722019434240.0, "grad_norm": 1.7960155228369694, "language_loss": 0.83626759, "learning_rate": 3.4017660541406635e-06, "loss": 0.85957229, "num_input_tokens_seen": 49107665, "step": 2289, "time_per_iteration": 2.8484573364257812 }, { "auxiliary_loss_clip": 0.01208244, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 0.98026162, "balance_loss_mlp": 1.02920389, "epoch": 0.2753562195635183, "flos": 25297738698240.0, "grad_norm": 2.080697222237044, "language_loss": 0.73689127, "learning_rate": 3.4012103258703092e-06, "loss": 0.75936526, "num_input_tokens_seen": 49126420, "step": 2290, "time_per_iteration": 2.801345109939575 }, { "auxiliary_loss_clip": 0.0120008, "auxiliary_loss_mlp": 0.01032286, "balance_loss_clip": 0.94320369, "balance_loss_mlp": 1.02295232, "epoch": 0.2754764624541574, "flos": 27338972785920.0, "grad_norm": 1.9174088258450157, "language_loss": 0.8285501, "learning_rate": 3.4006543850374616e-06, "loss": 0.85087377, "num_input_tokens_seen": 49141470, "step": 2291, "time_per_iteration": 2.934187650680542 }, { "auxiliary_loss_clip": 0.0120499, "auxiliary_loss_mlp": 0.01034092, "balance_loss_clip": 1.01936269, "balance_loss_mlp": 1.02438807, "epoch": 0.27559670534479647, "flos": 17238379397760.0, "grad_norm": 1.9289038992986438, "language_loss": 0.75373513, "learning_rate": 3.400098231726458e-06, "loss": 0.77612591, "num_input_tokens_seen": 49158570, "step": 2292, "time_per_iteration": 2.9695558547973633 }, { "auxiliary_loss_clip": 0.01202247, "auxiliary_loss_mlp": 0.01035748, "balance_loss_clip": 0.9393298, "balance_loss_mlp": 1.02643156, "epoch": 0.2757169482354356, "flos": 21939085486080.0, "grad_norm": 1.8316735893935578, "language_loss": 0.8668021, "learning_rate": 3.3995418660216657e-06, "loss": 0.88918209, "num_input_tokens_seen": 49176025, "step": 2293, "time_per_iteration": 2.7849349975585938 }, { "auxiliary_loss_clip": 0.01214569, "auxiliary_loss_mlp": 0.0103626, "balance_loss_clip": 1.06163836, "balance_loss_mlp": 1.02571642, "epoch": 0.2758371911260747, "flos": 20850669521280.0, "grad_norm": 2.6699295356434107, "language_loss": 0.80535954, "learning_rate": 3.3989852880074848e-06, "loss": 0.82786781, "num_input_tokens_seen": 49197455, "step": 2294, "time_per_iteration": 2.797367572784424 }, { "auxiliary_loss_clip": 0.01099156, "auxiliary_loss_mlp": 0.01018132, "balance_loss_clip": 0.95097291, "balance_loss_mlp": 1.01488996, "epoch": 0.27595743401671374, "flos": 69269063592960.0, "grad_norm": 0.75049510380798, "language_loss": 0.6063484, "learning_rate": 3.398428497768348e-06, "loss": 0.62752128, "num_input_tokens_seen": 49262625, "step": 2295, "time_per_iteration": 3.4091081619262695 }, { "auxiliary_loss_clip": 0.012083, "auxiliary_loss_mlp": 0.01026872, "balance_loss_clip": 0.94219851, "balance_loss_mlp": 1.01741898, "epoch": 0.27607767690735285, "flos": 21215019127680.0, "grad_norm": 1.7885440488632327, "language_loss": 0.72056484, "learning_rate": 3.3978714953887205e-06, "loss": 0.74291652, "num_input_tokens_seen": 49282380, "step": 2296, "time_per_iteration": 2.8384876251220703 }, { "auxiliary_loss_clip": 0.01184568, "auxiliary_loss_mlp": 0.01031206, "balance_loss_clip": 0.90247643, "balance_loss_mlp": 1.02168131, "epoch": 0.27619791979799196, "flos": 24825334003200.0, "grad_norm": 1.7839844618151754, "language_loss": 0.86523283, "learning_rate": 3.397314280953098e-06, "loss": 0.88739055, "num_input_tokens_seen": 49303205, "step": 2297, "time_per_iteration": 2.869396686553955 }, { "auxiliary_loss_clip": 0.01196113, "auxiliary_loss_mlp": 0.01036007, "balance_loss_clip": 0.98160881, "balance_loss_mlp": 1.02663684, "epoch": 0.276318162688631, "flos": 24753548672640.0, "grad_norm": 1.8706762997156186, "language_loss": 0.80594575, "learning_rate": 3.3967568545460108e-06, "loss": 0.82826698, "num_input_tokens_seen": 49322745, "step": 2298, "time_per_iteration": 2.7911384105682373 }, { "auxiliary_loss_clip": 0.01201632, "auxiliary_loss_mlp": 0.01034333, "balance_loss_clip": 1.01991224, "balance_loss_mlp": 1.02483165, "epoch": 0.27643840557927013, "flos": 18150007599360.0, "grad_norm": 2.0717942198747314, "language_loss": 0.80946755, "learning_rate": 3.3961992162520185e-06, "loss": 0.83182716, "num_input_tokens_seen": 49341370, "step": 2299, "time_per_iteration": 2.745640516281128 }, { "auxiliary_loss_clip": 0.01206512, "auxiliary_loss_mlp": 0.01043403, "balance_loss_clip": 1.02225065, "balance_loss_mlp": 1.03402162, "epoch": 0.27655864846990924, "flos": 24823933372800.0, "grad_norm": 2.1464950854489793, "language_loss": 0.71780759, "learning_rate": 3.3956413661557156e-06, "loss": 0.74030674, "num_input_tokens_seen": 49361545, "step": 2300, "time_per_iteration": 2.879973888397217 }, { "auxiliary_loss_clip": 0.01207368, "auxiliary_loss_mlp": 0.01039592, "balance_loss_clip": 0.94279045, "balance_loss_mlp": 1.03037667, "epoch": 0.2766788913605483, "flos": 20266582464000.0, "grad_norm": 2.2239903535208225, "language_loss": 0.66226274, "learning_rate": 3.3950833043417273e-06, "loss": 0.68473232, "num_input_tokens_seen": 49379690, "step": 2301, "time_per_iteration": 2.8318564891815186 }, { "auxiliary_loss_clip": 0.01208267, "auxiliary_loss_mlp": 0.01035122, "balance_loss_clip": 1.02510166, "balance_loss_mlp": 1.02560949, "epoch": 0.2767991342511874, "flos": 21470272151040.0, "grad_norm": 2.6154313409293075, "language_loss": 0.72930855, "learning_rate": 3.3945250308947105e-06, "loss": 0.75174242, "num_input_tokens_seen": 49395995, "step": 2302, "time_per_iteration": 2.733745574951172 }, { "auxiliary_loss_clip": 0.01105386, "auxiliary_loss_mlp": 0.01012593, "balance_loss_clip": 0.98930323, "balance_loss_mlp": 1.00939798, "epoch": 0.2769193771418265, "flos": 66002627571840.0, "grad_norm": 1.2493146732827696, "language_loss": 0.68254048, "learning_rate": 3.3939665458993556e-06, "loss": 0.70372027, "num_input_tokens_seen": 49450415, "step": 2303, "time_per_iteration": 3.184849262237549 }, { "auxiliary_loss_clip": 0.01206397, "auxiliary_loss_mlp": 0.01036923, "balance_loss_clip": 0.94114774, "balance_loss_mlp": 1.02779758, "epoch": 0.27703962003246557, "flos": 20704441253760.0, "grad_norm": 1.8608339701512995, "language_loss": 0.76759899, "learning_rate": 3.3934078494403843e-06, "loss": 0.79003215, "num_input_tokens_seen": 49469990, "step": 2304, "time_per_iteration": 3.809687376022339 }, { "auxiliary_loss_clip": 0.01187482, "auxiliary_loss_mlp": 0.01127173, "balance_loss_clip": 0.82591009, "balance_loss_mlp": 0.0, "epoch": 0.2771598629231047, "flos": 22929897219840.0, "grad_norm": 1.5888521668783364, "language_loss": 0.81066501, "learning_rate": 3.3928489416025495e-06, "loss": 0.83381152, "num_input_tokens_seen": 49490835, "step": 2305, "time_per_iteration": 2.828202247619629 }, { "auxiliary_loss_clip": 0.01201097, "auxiliary_loss_mlp": 0.01034983, "balance_loss_clip": 0.98216003, "balance_loss_mlp": 1.02479017, "epoch": 0.27728010581374374, "flos": 18369457741440.0, "grad_norm": 2.89758411094116, "language_loss": 0.79044056, "learning_rate": 3.392289822470638e-06, "loss": 0.81280136, "num_input_tokens_seen": 49508815, "step": 2306, "time_per_iteration": 2.6976194381713867 }, { "auxiliary_loss_clip": 0.01198076, "auxiliary_loss_mlp": 0.01032035, "balance_loss_clip": 0.97984254, "balance_loss_mlp": 1.02252793, "epoch": 0.27740034870438285, "flos": 19427637432960.0, "grad_norm": 2.9012136515718927, "language_loss": 0.7601403, "learning_rate": 3.3917304921294674e-06, "loss": 0.78244138, "num_input_tokens_seen": 49526980, "step": 2307, "time_per_iteration": 3.674920082092285 }, { "auxiliary_loss_clip": 0.01204521, "auxiliary_loss_mlp": 0.01039366, "balance_loss_clip": 1.01930797, "balance_loss_mlp": 1.02957928, "epoch": 0.27752059159502196, "flos": 21614776565760.0, "grad_norm": 1.662839503412153, "language_loss": 0.80694288, "learning_rate": 3.3911709506638876e-06, "loss": 0.8293817, "num_input_tokens_seen": 49546290, "step": 2308, "time_per_iteration": 2.7220659255981445 }, { "auxiliary_loss_clip": 0.01186131, "auxiliary_loss_mlp": 0.01126598, "balance_loss_clip": 0.93856561, "balance_loss_mlp": 0.0, "epoch": 0.277640834485661, "flos": 26608011016320.0, "grad_norm": 5.357890082691454, "language_loss": 0.8146742, "learning_rate": 3.390611198158781e-06, "loss": 0.83780152, "num_input_tokens_seen": 49564165, "step": 2309, "time_per_iteration": 3.755364418029785 }, { "auxiliary_loss_clip": 0.01210773, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.06107318, "balance_loss_mlp": 1.02372658, "epoch": 0.2777610773763001, "flos": 19492814661120.0, "grad_norm": 2.0160104188669847, "language_loss": 0.90203732, "learning_rate": 3.3900512346990612e-06, "loss": 0.92448151, "num_input_tokens_seen": 49580155, "step": 2310, "time_per_iteration": 2.8083724975585938 }, { "auxiliary_loss_clip": 0.01193805, "auxiliary_loss_mlp": 0.01036299, "balance_loss_clip": 0.89934599, "balance_loss_mlp": 1.02617264, "epoch": 0.27788132026693924, "flos": 38290650001920.0, "grad_norm": 1.8610533670624425, "language_loss": 0.65804124, "learning_rate": 3.389491060369674e-06, "loss": 0.68034232, "num_input_tokens_seen": 49605830, "step": 2311, "time_per_iteration": 3.769620180130005 }, { "auxiliary_loss_clip": 0.01189516, "auxiliary_loss_mlp": 0.01043787, "balance_loss_clip": 0.9035629, "balance_loss_mlp": 1.03437543, "epoch": 0.2780015631575783, "flos": 22382546797440.0, "grad_norm": 1.8714172958054678, "language_loss": 0.89559484, "learning_rate": 3.388930675255598e-06, "loss": 0.9179278, "num_input_tokens_seen": 49625680, "step": 2312, "time_per_iteration": 2.8648366928100586 }, { "auxiliary_loss_clip": 0.01206225, "auxiliary_loss_mlp": 0.01034694, "balance_loss_clip": 0.9806428, "balance_loss_mlp": 1.02461505, "epoch": 0.2781218060482174, "flos": 12203200840320.0, "grad_norm": 2.2790550011212236, "language_loss": 0.79243112, "learning_rate": 3.388370079441843e-06, "loss": 0.81484026, "num_input_tokens_seen": 49641195, "step": 2313, "time_per_iteration": 2.8190042972564697 }, { "auxiliary_loss_clip": 0.0120004, "auxiliary_loss_mlp": 0.01033937, "balance_loss_clip": 0.94409776, "balance_loss_mlp": 1.02473426, "epoch": 0.2782420489388565, "flos": 18107632529280.0, "grad_norm": 2.0431742727291273, "language_loss": 0.9232288, "learning_rate": 3.3878092730134505e-06, "loss": 0.94556856, "num_input_tokens_seen": 49659180, "step": 2314, "time_per_iteration": 2.7222740650177 }, { "auxiliary_loss_clip": 0.01194264, "auxiliary_loss_mlp": 0.01037887, "balance_loss_clip": 1.0188843, "balance_loss_mlp": 1.02782619, "epoch": 0.27836229182949557, "flos": 18514752255360.0, "grad_norm": 1.897063826943328, "language_loss": 0.80782264, "learning_rate": 3.3872482560554947e-06, "loss": 0.83014417, "num_input_tokens_seen": 49677955, "step": 2315, "time_per_iteration": 2.671287775039673 }, { "auxiliary_loss_clip": 0.01103236, "auxiliary_loss_mlp": 0.01009765, "balance_loss_clip": 0.98841751, "balance_loss_mlp": 1.00652206, "epoch": 0.2784825347201347, "flos": 67079230940160.0, "grad_norm": 0.7941316031030269, "language_loss": 0.57033277, "learning_rate": 3.386687028653082e-06, "loss": 0.59146273, "num_input_tokens_seen": 49740800, "step": 2316, "time_per_iteration": 3.2953004837036133 }, { "auxiliary_loss_clip": 0.01200534, "auxiliary_loss_mlp": 0.01032674, "balance_loss_clip": 0.90553093, "balance_loss_mlp": 1.02338767, "epoch": 0.2786027776107738, "flos": 22631119891200.0, "grad_norm": 1.7311325650041343, "language_loss": 0.85304368, "learning_rate": 3.386125590891349e-06, "loss": 0.87537575, "num_input_tokens_seen": 49757675, "step": 2317, "time_per_iteration": 2.734788656234741 }, { "auxiliary_loss_clip": 0.01188192, "auxiliary_loss_mlp": 0.01037157, "balance_loss_clip": 0.9782905, "balance_loss_mlp": 1.02805519, "epoch": 0.27872302050141284, "flos": 15778826156160.0, "grad_norm": 2.044157064328177, "language_loss": 0.82792872, "learning_rate": 3.3855639428554657e-06, "loss": 0.85018224, "num_input_tokens_seen": 49775205, "step": 2318, "time_per_iteration": 2.744917392730713 }, { "auxiliary_loss_clip": 0.01189445, "auxiliary_loss_mlp": 0.01036964, "balance_loss_clip": 0.94112146, "balance_loss_mlp": 1.02751064, "epoch": 0.27884326339205195, "flos": 22126970551680.0, "grad_norm": 1.7428750773227486, "language_loss": 0.80249923, "learning_rate": 3.385002084630635e-06, "loss": 0.8247633, "num_input_tokens_seen": 49794175, "step": 2319, "time_per_iteration": 2.746647357940674 }, { "auxiliary_loss_clip": 0.01207679, "auxiliary_loss_mlp": 0.0103543, "balance_loss_clip": 1.02027631, "balance_loss_mlp": 1.02564931, "epoch": 0.278963506282691, "flos": 20558715776640.0, "grad_norm": 1.9540066677087273, "language_loss": 0.84892339, "learning_rate": 3.384440016302088e-06, "loss": 0.87135446, "num_input_tokens_seen": 49812850, "step": 2320, "time_per_iteration": 2.8453307151794434 }, { "auxiliary_loss_clip": 0.01201221, "auxiliary_loss_mlp": 0.0103345, "balance_loss_clip": 1.02010059, "balance_loss_mlp": 1.02359164, "epoch": 0.2790837491733301, "flos": 21942928241280.0, "grad_norm": 2.0068311874221525, "language_loss": 0.61964762, "learning_rate": 3.3838777379550923e-06, "loss": 0.64199436, "num_input_tokens_seen": 49832295, "step": 2321, "time_per_iteration": 2.7226016521453857 }, { "auxiliary_loss_clip": 0.01205975, "auxiliary_loss_mlp": 0.01033313, "balance_loss_clip": 0.98386425, "balance_loss_mlp": 1.02369881, "epoch": 0.27920399206396923, "flos": 26286790665600.0, "grad_norm": 2.1000884118270786, "language_loss": 0.78315818, "learning_rate": 3.383315249674944e-06, "loss": 0.80555105, "num_input_tokens_seen": 49850860, "step": 2322, "time_per_iteration": 2.7706358432769775 }, { "auxiliary_loss_clip": 0.01200976, "auxiliary_loss_mlp": 0.010324, "balance_loss_clip": 0.94274759, "balance_loss_mlp": 1.02270222, "epoch": 0.2793242349546083, "flos": 25400981364480.0, "grad_norm": 2.1841494688133203, "language_loss": 0.86197567, "learning_rate": 3.3827525515469715e-06, "loss": 0.88430947, "num_input_tokens_seen": 49865765, "step": 2323, "time_per_iteration": 2.7751827239990234 }, { "auxiliary_loss_clip": 0.01190619, "auxiliary_loss_mlp": 0.01035521, "balance_loss_clip": 0.94131845, "balance_loss_mlp": 1.02582943, "epoch": 0.2794444778452474, "flos": 20850346298880.0, "grad_norm": 1.913803660922693, "language_loss": 0.70860887, "learning_rate": 3.3821896436565367e-06, "loss": 0.73087025, "num_input_tokens_seen": 49885425, "step": 2324, "time_per_iteration": 2.847914457321167 }, { "auxiliary_loss_clip": 0.01206966, "auxiliary_loss_mlp": 0.01037272, "balance_loss_clip": 1.02420068, "balance_loss_mlp": 1.0275743, "epoch": 0.2795647207358865, "flos": 21576244250880.0, "grad_norm": 1.5972738179154966, "language_loss": 0.70321119, "learning_rate": 3.381626526089032e-06, "loss": 0.72565353, "num_input_tokens_seen": 49904990, "step": 2325, "time_per_iteration": 2.734219789505005 }, { "auxiliary_loss_clip": 0.01197375, "auxiliary_loss_mlp": 0.01032496, "balance_loss_clip": 0.9786917, "balance_loss_mlp": 1.02283406, "epoch": 0.27968496362652556, "flos": 21471744608640.0, "grad_norm": 1.8493573838633472, "language_loss": 0.78949034, "learning_rate": 3.3810631989298815e-06, "loss": 0.81178904, "num_input_tokens_seen": 49924600, "step": 2326, "time_per_iteration": 2.8467023372650146 }, { "auxiliary_loss_clip": 0.01204291, "auxiliary_loss_mlp": 0.01035129, "balance_loss_clip": 0.90589929, "balance_loss_mlp": 1.02511537, "epoch": 0.2798052065171647, "flos": 23258695340160.0, "grad_norm": 2.962633374181734, "language_loss": 0.8421725, "learning_rate": 3.3804996622645423e-06, "loss": 0.86456674, "num_input_tokens_seen": 49942600, "step": 2327, "time_per_iteration": 2.825985908508301 }, { "auxiliary_loss_clip": 0.01206672, "auxiliary_loss_mlp": 0.01032775, "balance_loss_clip": 1.06054473, "balance_loss_mlp": 1.0229938, "epoch": 0.2799254494078038, "flos": 21539328048000.0, "grad_norm": 1.790533668062006, "language_loss": 0.89302027, "learning_rate": 3.3799359161785015e-06, "loss": 0.91541469, "num_input_tokens_seen": 49962250, "step": 2328, "time_per_iteration": 2.710679769515991 }, { "auxiliary_loss_clip": 0.01203425, "auxiliary_loss_mlp": 0.01032079, "balance_loss_clip": 1.02032351, "balance_loss_mlp": 1.02298331, "epoch": 0.28004569229844284, "flos": 26393912000640.0, "grad_norm": 1.5422917978635073, "language_loss": 0.85691297, "learning_rate": 3.3793719607572798e-06, "loss": 0.87926805, "num_input_tokens_seen": 49983215, "step": 2329, "time_per_iteration": 2.8056585788726807 }, { "auxiliary_loss_clip": 0.01181667, "auxiliary_loss_mlp": 0.01039197, "balance_loss_clip": 0.97708297, "balance_loss_mlp": 1.02971399, "epoch": 0.28016593518908195, "flos": 33547676584320.0, "grad_norm": 1.8668187987972122, "language_loss": 0.76907825, "learning_rate": 3.378807796086428e-06, "loss": 0.79128683, "num_input_tokens_seen": 50006075, "step": 2330, "time_per_iteration": 3.7492730617523193 }, { "auxiliary_loss_clip": 0.01205123, "auxiliary_loss_mlp": 0.01036525, "balance_loss_clip": 1.05943656, "balance_loss_mlp": 1.02684569, "epoch": 0.28028617807972106, "flos": 15340823712000.0, "grad_norm": 2.776014157410966, "language_loss": 0.76720721, "learning_rate": 3.37824342225153e-06, "loss": 0.78962368, "num_input_tokens_seen": 50022495, "step": 2331, "time_per_iteration": 2.64748215675354 }, { "auxiliary_loss_clip": 0.01203037, "auxiliary_loss_mlp": 0.01033448, "balance_loss_clip": 0.90748715, "balance_loss_mlp": 1.02467966, "epoch": 0.2804064209703601, "flos": 25520277409920.0, "grad_norm": 1.8555533173559013, "language_loss": 0.78030109, "learning_rate": 3.3776788393382006e-06, "loss": 0.80266595, "num_input_tokens_seen": 50041975, "step": 2332, "time_per_iteration": 2.7963924407958984 }, { "auxiliary_loss_clip": 0.01206937, "auxiliary_loss_mlp": 0.01030862, "balance_loss_clip": 1.05971205, "balance_loss_mlp": 1.02098525, "epoch": 0.2805266638609992, "flos": 29351766280320.0, "grad_norm": 4.315560577383983, "language_loss": 0.77414197, "learning_rate": 3.3771140474320872e-06, "loss": 0.79651994, "num_input_tokens_seen": 50061925, "step": 2333, "time_per_iteration": 3.698664903640747 }, { "auxiliary_loss_clip": 0.01208611, "auxiliary_loss_mlp": 0.01034529, "balance_loss_clip": 0.94431925, "balance_loss_mlp": 1.02450943, "epoch": 0.28064690675163834, "flos": 21463735875840.0, "grad_norm": 2.021452535811851, "language_loss": 0.79639584, "learning_rate": 3.3765490466188664e-06, "loss": 0.81882727, "num_input_tokens_seen": 50079325, "step": 2334, "time_per_iteration": 2.8130719661712646 }, { "auxiliary_loss_clip": 0.01195263, "auxiliary_loss_mlp": 0.01031535, "balance_loss_clip": 0.93968493, "balance_loss_mlp": 1.0221169, "epoch": 0.2807671496422774, "flos": 20995640812800.0, "grad_norm": 3.140744780477653, "language_loss": 0.73532116, "learning_rate": 3.3759838369842508e-06, "loss": 0.75758922, "num_input_tokens_seen": 50097400, "step": 2335, "time_per_iteration": 2.746267080307007 }, { "auxiliary_loss_clip": 0.01198093, "auxiliary_loss_mlp": 0.0103566, "balance_loss_clip": 0.9416917, "balance_loss_mlp": 1.02592063, "epoch": 0.2808873925329165, "flos": 21506577822720.0, "grad_norm": 4.233732726916911, "language_loss": 0.72752941, "learning_rate": 3.375418418613981e-06, "loss": 0.74986696, "num_input_tokens_seen": 50116425, "step": 2336, "time_per_iteration": 3.7490127086639404 }, { "auxiliary_loss_clip": 0.01202502, "auxiliary_loss_mlp": 0.01036757, "balance_loss_clip": 0.98228592, "balance_loss_mlp": 1.02742314, "epoch": 0.28100763542355556, "flos": 16070815814400.0, "grad_norm": 2.8339990539447792, "language_loss": 0.83606398, "learning_rate": 3.374852791593831e-06, "loss": 0.85845655, "num_input_tokens_seen": 50132625, "step": 2337, "time_per_iteration": 3.692861318588257 }, { "auxiliary_loss_clip": 0.01207989, "auxiliary_loss_mlp": 0.0103298, "balance_loss_clip": 0.90421069, "balance_loss_mlp": 1.02322841, "epoch": 0.28112787831419467, "flos": 19062605468160.0, "grad_norm": 2.5146872357936405, "language_loss": 0.54177457, "learning_rate": 3.374286956009605e-06, "loss": 0.56418425, "num_input_tokens_seen": 50151190, "step": 2338, "time_per_iteration": 2.740969657897949 }, { "auxiliary_loss_clip": 0.01203503, "auxiliary_loss_mlp": 0.01039599, "balance_loss_clip": 1.02347076, "balance_loss_mlp": 1.03006792, "epoch": 0.2812481212048338, "flos": 12823629482880.0, "grad_norm": 2.4096695815022127, "language_loss": 0.75048614, "learning_rate": 3.3737209119471405e-06, "loss": 0.77291715, "num_input_tokens_seen": 50167700, "step": 2339, "time_per_iteration": 2.7696175575256348 }, { "auxiliary_loss_clip": 0.01210782, "auxiliary_loss_mlp": 0.01033905, "balance_loss_clip": 1.02172899, "balance_loss_mlp": 1.02401614, "epoch": 0.28136836409547283, "flos": 15633064765440.0, "grad_norm": 2.1641636656604, "language_loss": 0.63629663, "learning_rate": 3.373154659492306e-06, "loss": 0.6587435, "num_input_tokens_seen": 50185840, "step": 2340, "time_per_iteration": 2.635300874710083 }, { "auxiliary_loss_clip": 0.01207212, "auxiliary_loss_mlp": 0.01033014, "balance_loss_clip": 0.98262805, "balance_loss_mlp": 1.02297091, "epoch": 0.28148860698611194, "flos": 19933726106880.0, "grad_norm": 1.9208888004294757, "language_loss": 0.85416275, "learning_rate": 3.3725881987310016e-06, "loss": 0.87656498, "num_input_tokens_seen": 50203375, "step": 2341, "time_per_iteration": 2.7218565940856934 }, { "auxiliary_loss_clip": 0.01202447, "auxiliary_loss_mlp": 0.0103277, "balance_loss_clip": 0.98132026, "balance_loss_mlp": 1.02401423, "epoch": 0.28160884987675106, "flos": 17457219008640.0, "grad_norm": 1.6986524338791158, "language_loss": 0.87419897, "learning_rate": 3.372021529749159e-06, "loss": 0.89655113, "num_input_tokens_seen": 50222435, "step": 2342, "time_per_iteration": 2.740952968597412 }, { "auxiliary_loss_clip": 0.01200725, "auxiliary_loss_mlp": 0.01030134, "balance_loss_clip": 0.8682493, "balance_loss_mlp": 1.02090728, "epoch": 0.2817290927673901, "flos": 16834743290880.0, "grad_norm": 2.0457535931584134, "language_loss": 0.92741776, "learning_rate": 3.3714546526327405e-06, "loss": 0.94972634, "num_input_tokens_seen": 50240435, "step": 2343, "time_per_iteration": 2.7873501777648926 }, { "auxiliary_loss_clip": 0.01204784, "auxiliary_loss_mlp": 0.01033249, "balance_loss_clip": 0.94169861, "balance_loss_mlp": 1.0238142, "epoch": 0.2818493356580292, "flos": 15414081500160.0, "grad_norm": 2.0858937586322046, "language_loss": 0.88036919, "learning_rate": 3.3708875674677423e-06, "loss": 0.90274942, "num_input_tokens_seen": 50258410, "step": 2344, "time_per_iteration": 2.7589240074157715 }, { "auxiliary_loss_clip": 0.01213412, "auxiliary_loss_mlp": 0.01036986, "balance_loss_clip": 0.98621356, "balance_loss_mlp": 1.02718127, "epoch": 0.28196957854866833, "flos": 20412451595520.0, "grad_norm": 1.8587776220229726, "language_loss": 0.83590543, "learning_rate": 3.37032027434019e-06, "loss": 0.8584094, "num_input_tokens_seen": 50277930, "step": 2345, "time_per_iteration": 2.695981502532959 }, { "auxiliary_loss_clip": 0.01212878, "auxiliary_loss_mlp": 0.01038863, "balance_loss_clip": 1.01956499, "balance_loss_mlp": 1.02838433, "epoch": 0.2820898214393074, "flos": 19973120348160.0, "grad_norm": 1.633875239145064, "language_loss": 0.82724297, "learning_rate": 3.369752773336141e-06, "loss": 0.84976035, "num_input_tokens_seen": 50297410, "step": 2346, "time_per_iteration": 2.714031934738159 }, { "auxiliary_loss_clip": 0.01203648, "auxiliary_loss_mlp": 0.01031263, "balance_loss_clip": 0.98214352, "balance_loss_mlp": 1.02133322, "epoch": 0.2822100643299465, "flos": 22528308188160.0, "grad_norm": 4.435341867021555, "language_loss": 0.78424478, "learning_rate": 3.3691850645416864e-06, "loss": 0.80659389, "num_input_tokens_seen": 50317120, "step": 2347, "time_per_iteration": 2.7070724964141846 }, { "auxiliary_loss_clip": 0.01209938, "auxiliary_loss_mlp": 0.01038229, "balance_loss_clip": 1.02059269, "balance_loss_mlp": 1.02803111, "epoch": 0.2823303072205856, "flos": 11546682007680.0, "grad_norm": 1.9430520891192118, "language_loss": 0.8280046, "learning_rate": 3.368617148042945e-06, "loss": 0.85048628, "num_input_tokens_seen": 50334790, "step": 2348, "time_per_iteration": 2.663074493408203 }, { "auxiliary_loss_clip": 0.01192534, "auxiliary_loss_mlp": 0.01032334, "balance_loss_clip": 0.97738695, "balance_loss_mlp": 1.02273726, "epoch": 0.28245055011122466, "flos": 18259894281600.0, "grad_norm": 1.9588566291031908, "language_loss": 0.84558666, "learning_rate": 3.368049023926071e-06, "loss": 0.8678354, "num_input_tokens_seen": 50353785, "step": 2349, "time_per_iteration": 2.6334869861602783 }, { "auxiliary_loss_clip": 0.01203661, "auxiliary_loss_mlp": 0.0103125, "balance_loss_clip": 1.02146101, "balance_loss_mlp": 1.02263141, "epoch": 0.2825707930018638, "flos": 24608110504320.0, "grad_norm": 1.5325928252574272, "language_loss": 0.8380546, "learning_rate": 3.3674806922772476e-06, "loss": 0.86040372, "num_input_tokens_seen": 50374670, "step": 2350, "time_per_iteration": 2.719998836517334 }, { "auxiliary_loss_clip": 0.01203827, "auxiliary_loss_mlp": 0.01035362, "balance_loss_clip": 0.94143552, "balance_loss_mlp": 1.02522993, "epoch": 0.28269103589250283, "flos": 25226994862080.0, "grad_norm": 2.3454314032624555, "language_loss": 0.74910057, "learning_rate": 3.3669121531826904e-06, "loss": 0.77149248, "num_input_tokens_seen": 50395650, "step": 2351, "time_per_iteration": 2.927945375442505 }, { "auxiliary_loss_clip": 0.01197494, "auxiliary_loss_mlp": 0.01033817, "balance_loss_clip": 0.94631141, "balance_loss_mlp": 1.02421451, "epoch": 0.28281127878314194, "flos": 19281552819840.0, "grad_norm": 2.3239525532302525, "language_loss": 0.8307991, "learning_rate": 3.366343406728647e-06, "loss": 0.85311222, "num_input_tokens_seen": 50415100, "step": 2352, "time_per_iteration": 2.753730297088623 }, { "auxiliary_loss_clip": 0.01190024, "auxiliary_loss_mlp": 0.01032742, "balance_loss_clip": 1.01468849, "balance_loss_mlp": 1.02313328, "epoch": 0.28293152167378105, "flos": 23878405710720.0, "grad_norm": 1.9895444484857585, "language_loss": 0.68316048, "learning_rate": 3.3657744530013946e-06, "loss": 0.70538819, "num_input_tokens_seen": 50434335, "step": 2353, "time_per_iteration": 2.722412586212158 }, { "auxiliary_loss_clip": 0.01210616, "auxiliary_loss_mlp": 0.01036182, "balance_loss_clip": 1.02332854, "balance_loss_mlp": 1.02632928, "epoch": 0.2830517645644201, "flos": 43866965928960.0, "grad_norm": 1.817911789469092, "language_loss": 0.71095902, "learning_rate": 3.3652052920872437e-06, "loss": 0.73342699, "num_input_tokens_seen": 50457200, "step": 2354, "time_per_iteration": 2.9237892627716064 }, { "auxiliary_loss_clip": 0.01201672, "auxiliary_loss_mlp": 0.01036986, "balance_loss_clip": 0.98028237, "balance_loss_mlp": 1.02731788, "epoch": 0.2831720074550592, "flos": 26651750803200.0, "grad_norm": 1.8412963333429704, "language_loss": 0.85318178, "learning_rate": 3.3646359240725355e-06, "loss": 0.87556839, "num_input_tokens_seen": 50476390, "step": 2355, "time_per_iteration": 2.775118350982666 }, { "auxiliary_loss_clip": 0.01196606, "auxiliary_loss_mlp": 0.01126449, "balance_loss_clip": 1.01929891, "balance_loss_mlp": 0.0, "epoch": 0.2832922503456983, "flos": 31029979564800.0, "grad_norm": 2.2637437968360032, "language_loss": 0.67618024, "learning_rate": 3.364066349043643e-06, "loss": 0.6994108, "num_input_tokens_seen": 50497595, "step": 2356, "time_per_iteration": 4.205892562866211 }, { "auxiliary_loss_clip": 0.01199087, "auxiliary_loss_mlp": 0.01031035, "balance_loss_clip": 0.97976923, "balance_loss_mlp": 1.02237439, "epoch": 0.2834124932363374, "flos": 20405699838720.0, "grad_norm": 1.683918624744205, "language_loss": 0.82186049, "learning_rate": 3.363496567086969e-06, "loss": 0.84416169, "num_input_tokens_seen": 50514690, "step": 2357, "time_per_iteration": 2.8536882400512695 }, { "auxiliary_loss_clip": 0.01206582, "auxiliary_loss_mlp": 0.01035461, "balance_loss_clip": 1.05956864, "balance_loss_mlp": 1.02579284, "epoch": 0.2835327361269765, "flos": 39384848056320.0, "grad_norm": 2.214202045897326, "language_loss": 0.75643522, "learning_rate": 3.3629265782889506e-06, "loss": 0.77885562, "num_input_tokens_seen": 50536515, "step": 2358, "time_per_iteration": 2.8227498531341553 }, { "auxiliary_loss_clip": 0.01193942, "auxiliary_loss_mlp": 0.01034277, "balance_loss_clip": 0.94000459, "balance_loss_mlp": 1.02451408, "epoch": 0.2836529790176156, "flos": 30261598801920.0, "grad_norm": 2.1878680204599035, "language_loss": 0.71915132, "learning_rate": 3.362356382736054e-06, "loss": 0.7414335, "num_input_tokens_seen": 50557120, "step": 2359, "time_per_iteration": 2.7837016582489014 }, { "auxiliary_loss_clip": 0.01196786, "auxiliary_loss_mlp": 0.01038402, "balance_loss_clip": 0.93770528, "balance_loss_mlp": 1.02943122, "epoch": 0.28377322190825466, "flos": 12677796264960.0, "grad_norm": 2.260668189011596, "language_loss": 0.9113239, "learning_rate": 3.361785980514777e-06, "loss": 0.93367577, "num_input_tokens_seen": 50573320, "step": 2360, "time_per_iteration": 3.6098275184631348 }, { "auxiliary_loss_clip": 0.01187887, "auxiliary_loss_mlp": 0.01033279, "balance_loss_clip": 0.86256588, "balance_loss_mlp": 1.02306843, "epoch": 0.28389346479889377, "flos": 18296666830080.0, "grad_norm": 2.1470381215072645, "language_loss": 0.76351786, "learning_rate": 3.361215371711649e-06, "loss": 0.78572947, "num_input_tokens_seen": 50592415, "step": 2361, "time_per_iteration": 2.773604154586792 }, { "auxiliary_loss_clip": 0.0119437, "auxiliary_loss_mlp": 0.01026387, "balance_loss_clip": 0.94313085, "balance_loss_mlp": 1.01761878, "epoch": 0.2840137076895329, "flos": 20406992728320.0, "grad_norm": 1.678465778283045, "language_loss": 0.83332217, "learning_rate": 3.3606445564132326e-06, "loss": 0.85552979, "num_input_tokens_seen": 50609710, "step": 2362, "time_per_iteration": 3.7224810123443604 }, { "auxiliary_loss_clip": 0.01208788, "auxiliary_loss_mlp": 0.01125979, "balance_loss_clip": 1.06193781, "balance_loss_mlp": 0.0, "epoch": 0.28413395058017193, "flos": 20048030161920.0, "grad_norm": 1.9737019634561825, "language_loss": 0.82118243, "learning_rate": 3.360073534706118e-06, "loss": 0.84453011, "num_input_tokens_seen": 50626865, "step": 2363, "time_per_iteration": 3.5307416915893555 }, { "auxiliary_loss_clip": 0.01205945, "auxiliary_loss_mlp": 0.01037804, "balance_loss_clip": 0.98442036, "balance_loss_mlp": 1.0282793, "epoch": 0.28425419347081105, "flos": 37663613256960.0, "grad_norm": 1.9397081092763333, "language_loss": 0.75992703, "learning_rate": 3.35950230667693e-06, "loss": 0.78236449, "num_input_tokens_seen": 50648560, "step": 2364, "time_per_iteration": 2.8267276287078857 }, { "auxiliary_loss_clip": 0.01206201, "auxiliary_loss_mlp": 0.01031291, "balance_loss_clip": 1.02078521, "balance_loss_mlp": 1.02218962, "epoch": 0.28437443636145016, "flos": 13845072539520.0, "grad_norm": 1.9546801490206054, "language_loss": 0.86082965, "learning_rate": 3.358930872412323e-06, "loss": 0.88320464, "num_input_tokens_seen": 50665725, "step": 2365, "time_per_iteration": 2.652449131011963 }, { "auxiliary_loss_clip": 0.01202436, "auxiliary_loss_mlp": 0.01034336, "balance_loss_clip": 1.02139735, "balance_loss_mlp": 1.02567506, "epoch": 0.2844946792520892, "flos": 22747794243840.0, "grad_norm": 1.6056510185274333, "language_loss": 0.80815578, "learning_rate": 3.3583592319989825e-06, "loss": 0.83052349, "num_input_tokens_seen": 50685095, "step": 2366, "time_per_iteration": 2.698108673095703 }, { "auxiliary_loss_clip": 0.01211229, "auxiliary_loss_mlp": 0.01032834, "balance_loss_clip": 1.02171481, "balance_loss_mlp": 1.02345777, "epoch": 0.2846149221427283, "flos": 32415987709440.0, "grad_norm": 2.139862749318847, "language_loss": 0.68464369, "learning_rate": 3.357787385523627e-06, "loss": 0.7070843, "num_input_tokens_seen": 50706500, "step": 2367, "time_per_iteration": 2.7305123805999756 }, { "auxiliary_loss_clip": 0.01203396, "auxiliary_loss_mlp": 0.01034513, "balance_loss_clip": 0.86550504, "balance_loss_mlp": 1.02571499, "epoch": 0.2847351650333674, "flos": 28475976873600.0, "grad_norm": 1.7997450749248542, "language_loss": 0.82411921, "learning_rate": 3.3572153330730048e-06, "loss": 0.84649825, "num_input_tokens_seen": 50727595, "step": 2368, "time_per_iteration": 2.8618040084838867 }, { "auxiliary_loss_clip": 0.01118949, "auxiliary_loss_mlp": 0.01003018, "balance_loss_clip": 0.92382199, "balance_loss_mlp": 1.00027597, "epoch": 0.2848554079240065, "flos": 55753399704960.0, "grad_norm": 1.1266226278611848, "language_loss": 0.64727449, "learning_rate": 3.3566430747338956e-06, "loss": 0.66849416, "num_input_tokens_seen": 50782800, "step": 2369, "time_per_iteration": 3.201826572418213 }, { "auxiliary_loss_clip": 0.01204087, "auxiliary_loss_mlp": 0.01032115, "balance_loss_clip": 1.01912141, "balance_loss_mlp": 1.02300167, "epoch": 0.2849756508146456, "flos": 11836875985920.0, "grad_norm": 1.8719312651348474, "language_loss": 0.86606431, "learning_rate": 3.35607061059311e-06, "loss": 0.88842642, "num_input_tokens_seen": 50797730, "step": 2370, "time_per_iteration": 2.6548044681549072 }, { "auxiliary_loss_clip": 0.01203482, "auxiliary_loss_mlp": 0.01039559, "balance_loss_clip": 1.0598886, "balance_loss_mlp": 1.03095794, "epoch": 0.28509589370528465, "flos": 25155209531520.0, "grad_norm": 1.7607677470068772, "language_loss": 0.75337654, "learning_rate": 3.3554979407374917e-06, "loss": 0.77580696, "num_input_tokens_seen": 50819840, "step": 2371, "time_per_iteration": 2.702402114868164 }, { "auxiliary_loss_clip": 0.01199768, "auxiliary_loss_mlp": 0.01035998, "balance_loss_clip": 1.01810765, "balance_loss_mlp": 1.02706909, "epoch": 0.28521613659592376, "flos": 19974808287360.0, "grad_norm": 2.3929544054630982, "language_loss": 0.73545736, "learning_rate": 3.3549250652539134e-06, "loss": 0.757815, "num_input_tokens_seen": 50838935, "step": 2372, "time_per_iteration": 2.7088730335235596 }, { "auxiliary_loss_clip": 0.01197449, "auxiliary_loss_mlp": 0.0103492, "balance_loss_clip": 0.9777106, "balance_loss_mlp": 1.02581835, "epoch": 0.2853363794865629, "flos": 23367971491200.0, "grad_norm": 2.2526585376363233, "language_loss": 0.81513155, "learning_rate": 3.3543519842292794e-06, "loss": 0.83745527, "num_input_tokens_seen": 50858590, "step": 2373, "time_per_iteration": 2.7330660820007324 }, { "auxiliary_loss_clip": 0.012052, "auxiliary_loss_mlp": 0.01125942, "balance_loss_clip": 1.05954361, "balance_loss_mlp": 0.0, "epoch": 0.28545662237720193, "flos": 19861940776320.0, "grad_norm": 1.8687607100077, "language_loss": 0.83606088, "learning_rate": 3.353778697750527e-06, "loss": 0.85937238, "num_input_tokens_seen": 50876995, "step": 2374, "time_per_iteration": 2.640075206756592 }, { "auxiliary_loss_clip": 0.01193536, "auxiliary_loss_mlp": 0.01028124, "balance_loss_clip": 0.97891653, "balance_loss_mlp": 1.01935613, "epoch": 0.28557686526784104, "flos": 23879016241920.0, "grad_norm": 1.5424381660185806, "language_loss": 0.89537895, "learning_rate": 3.353205205904622e-06, "loss": 0.91759551, "num_input_tokens_seen": 50896105, "step": 2375, "time_per_iteration": 2.7241549491882324 }, { "auxiliary_loss_clip": 0.01199623, "auxiliary_loss_mlp": 0.01036241, "balance_loss_clip": 0.98188657, "balance_loss_mlp": 1.02661514, "epoch": 0.28569710815848015, "flos": 44890384233600.0, "grad_norm": 2.045466212008437, "language_loss": 0.71738315, "learning_rate": 3.3526315087785637e-06, "loss": 0.73974174, "num_input_tokens_seen": 50917220, "step": 2376, "time_per_iteration": 2.8514676094055176 }, { "auxiliary_loss_clip": 0.01185017, "auxiliary_loss_mlp": 0.01035295, "balance_loss_clip": 0.90681696, "balance_loss_mlp": 1.02600253, "epoch": 0.2858173510491192, "flos": 26829759628800.0, "grad_norm": 1.7328963503466122, "language_loss": 0.80859566, "learning_rate": 3.3520576064593805e-06, "loss": 0.83079886, "num_input_tokens_seen": 50937175, "step": 2377, "time_per_iteration": 2.7943921089172363 }, { "auxiliary_loss_clip": 0.01211101, "auxiliary_loss_mlp": 0.01040094, "balance_loss_clip": 1.02414203, "balance_loss_mlp": 1.03134966, "epoch": 0.2859375939397583, "flos": 23148916398720.0, "grad_norm": 1.4926614864873184, "language_loss": 0.81936944, "learning_rate": 3.3514834990341337e-06, "loss": 0.84188139, "num_input_tokens_seen": 50957500, "step": 2378, "time_per_iteration": 2.6943399906158447 }, { "auxiliary_loss_clip": 0.01209826, "auxiliary_loss_mlp": 0.01030767, "balance_loss_clip": 0.98329657, "balance_loss_mlp": 1.02191615, "epoch": 0.2860578368303974, "flos": 12129799397760.0, "grad_norm": 2.4446705483028963, "language_loss": 0.92906272, "learning_rate": 3.3509091865899144e-06, "loss": 0.95146859, "num_input_tokens_seen": 50972690, "step": 2379, "time_per_iteration": 2.6943278312683105 }, { "auxiliary_loss_clip": 0.01206214, "auxiliary_loss_mlp": 0.01031029, "balance_loss_clip": 1.06024933, "balance_loss_mlp": 1.02168918, "epoch": 0.2861780797210365, "flos": 19938035738880.0, "grad_norm": 2.037712314232414, "language_loss": 0.70656323, "learning_rate": 3.350334669213846e-06, "loss": 0.72893572, "num_input_tokens_seen": 50990095, "step": 2380, "time_per_iteration": 2.6207761764526367 }, { "auxiliary_loss_clip": 0.01202637, "auxiliary_loss_mlp": 0.01046696, "balance_loss_clip": 1.02439713, "balance_loss_mlp": 1.0374279, "epoch": 0.2862983226116756, "flos": 27563127609600.0, "grad_norm": 2.5327613849925354, "language_loss": 0.75854057, "learning_rate": 3.3497599469930816e-06, "loss": 0.78103393, "num_input_tokens_seen": 51008305, "step": 2381, "time_per_iteration": 2.7032575607299805 }, { "auxiliary_loss_clip": 0.01205639, "auxiliary_loss_mlp": 0.01034092, "balance_loss_clip": 1.05960906, "balance_loss_mlp": 1.02405477, "epoch": 0.28641856550231465, "flos": 22053964158720.0, "grad_norm": 2.29608399965473, "language_loss": 0.83388495, "learning_rate": 3.349185020014807e-06, "loss": 0.85628223, "num_input_tokens_seen": 51025570, "step": 2382, "time_per_iteration": 2.703402280807495 }, { "auxiliary_loss_clip": 0.01203544, "auxiliary_loss_mlp": 0.01037691, "balance_loss_clip": 1.01992011, "balance_loss_mlp": 1.02857161, "epoch": 0.28653880839295376, "flos": 22378775869440.0, "grad_norm": 2.2246780300790667, "language_loss": 0.74610925, "learning_rate": 3.348609888366237e-06, "loss": 0.76852161, "num_input_tokens_seen": 51044585, "step": 2383, "time_per_iteration": 3.781247138977051 }, { "auxiliary_loss_clip": 0.01185965, "auxiliary_loss_mlp": 0.01029241, "balance_loss_clip": 0.86138427, "balance_loss_mlp": 1.02020526, "epoch": 0.28665905128359287, "flos": 23367971491200.0, "grad_norm": 2.0710232676035316, "language_loss": 0.62310183, "learning_rate": 3.348034552134619e-06, "loss": 0.6452539, "num_input_tokens_seen": 51063990, "step": 2384, "time_per_iteration": 2.92177677154541 }, { "auxiliary_loss_clip": 0.01190033, "auxiliary_loss_mlp": 0.01030873, "balance_loss_clip": 0.90404987, "balance_loss_mlp": 1.0221765, "epoch": 0.2867792941742319, "flos": 20881695893760.0, "grad_norm": 3.6442356763263186, "language_loss": 0.84303808, "learning_rate": 3.3474590114072316e-06, "loss": 0.86524713, "num_input_tokens_seen": 51081990, "step": 2385, "time_per_iteration": 3.724660634994507 }, { "auxiliary_loss_clip": 0.01196185, "auxiliary_loss_mlp": 0.01030198, "balance_loss_clip": 0.94375461, "balance_loss_mlp": 1.02107263, "epoch": 0.28689953706487104, "flos": 20664005518080.0, "grad_norm": 2.0690843509884616, "language_loss": 0.83110082, "learning_rate": 3.3468832662713836e-06, "loss": 0.85336465, "num_input_tokens_seen": 51100235, "step": 2386, "time_per_iteration": 2.7493810653686523 }, { "auxiliary_loss_clip": 0.01197957, "auxiliary_loss_mlp": 0.0103287, "balance_loss_clip": 0.94487607, "balance_loss_mlp": 1.02366066, "epoch": 0.28701977995551015, "flos": 12675533708160.0, "grad_norm": 2.1366878509376943, "language_loss": 0.83878565, "learning_rate": 3.346307316814415e-06, "loss": 0.86109394, "num_input_tokens_seen": 51115405, "step": 2387, "time_per_iteration": 2.705498456954956 }, { "auxiliary_loss_clip": 0.01201733, "auxiliary_loss_mlp": 0.01033255, "balance_loss_clip": 1.02199566, "balance_loss_mlp": 1.02395725, "epoch": 0.2871400228461492, "flos": 21252366293760.0, "grad_norm": 2.4154334890505518, "language_loss": 0.75632328, "learning_rate": 3.3457311631236965e-06, "loss": 0.77867317, "num_input_tokens_seen": 51136390, "step": 2388, "time_per_iteration": 3.685453414916992 }, { "auxiliary_loss_clip": 0.01187649, "auxiliary_loss_mlp": 0.0103483, "balance_loss_clip": 0.97922397, "balance_loss_mlp": 1.02548361, "epoch": 0.2872602657367883, "flos": 25119262995840.0, "grad_norm": 1.8731725794791765, "language_loss": 0.84137189, "learning_rate": 3.345154805286631e-06, "loss": 0.86359674, "num_input_tokens_seen": 51156650, "step": 2389, "time_per_iteration": 3.623870372772217 }, { "auxiliary_loss_clip": 0.01195346, "auxiliary_loss_mlp": 0.01027369, "balance_loss_clip": 1.01919651, "balance_loss_mlp": 1.01827312, "epoch": 0.2873805086274274, "flos": 16646606830080.0, "grad_norm": 2.1343064189498957, "language_loss": 0.76710796, "learning_rate": 3.344578243390651e-06, "loss": 0.78933519, "num_input_tokens_seen": 51172210, "step": 2390, "time_per_iteration": 2.6296117305755615 }, { "auxiliary_loss_clip": 0.01198944, "auxiliary_loss_mlp": 0.0103632, "balance_loss_clip": 0.98315489, "balance_loss_mlp": 1.0278089, "epoch": 0.2875007515180665, "flos": 17420123237760.0, "grad_norm": 2.2253053238613196, "language_loss": 0.78723848, "learning_rate": 3.3440014775232206e-06, "loss": 0.80959117, "num_input_tokens_seen": 51190265, "step": 2391, "time_per_iteration": 2.6418826580047607 }, { "auxiliary_loss_clip": 0.0120035, "auxiliary_loss_mlp": 0.01034319, "balance_loss_clip": 0.94058061, "balance_loss_mlp": 1.02598059, "epoch": 0.2876209944087056, "flos": 23434190213760.0, "grad_norm": 2.4064792427763586, "language_loss": 0.70824325, "learning_rate": 3.343424507771834e-06, "loss": 0.73058999, "num_input_tokens_seen": 51208475, "step": 2392, "time_per_iteration": 2.7750136852264404 }, { "auxiliary_loss_clip": 0.01198346, "auxiliary_loss_mlp": 0.01030982, "balance_loss_clip": 0.94381565, "balance_loss_mlp": 1.02248788, "epoch": 0.2877412372993447, "flos": 13735509079680.0, "grad_norm": 1.8549627558703807, "language_loss": 0.8662951, "learning_rate": 3.342847334224018e-06, "loss": 0.88858831, "num_input_tokens_seen": 51225875, "step": 2393, "time_per_iteration": 2.7396252155303955 }, { "auxiliary_loss_clip": 0.01110148, "auxiliary_loss_mlp": 0.01001758, "balance_loss_clip": 0.99787945, "balance_loss_mlp": 0.99888462, "epoch": 0.28786148018998375, "flos": 58079695104000.0, "grad_norm": 0.9492952517108594, "language_loss": 0.62437803, "learning_rate": 3.342269956967329e-06, "loss": 0.64549708, "num_input_tokens_seen": 51287780, "step": 2394, "time_per_iteration": 3.3745508193969727 }, { "auxiliary_loss_clip": 0.01204432, "auxiliary_loss_mlp": 0.01031941, "balance_loss_clip": 1.01942205, "balance_loss_mlp": 1.02257133, "epoch": 0.28798172308062286, "flos": 23435052140160.0, "grad_norm": 4.062011407758059, "language_loss": 0.71276677, "learning_rate": 3.341692376089355e-06, "loss": 0.73513055, "num_input_tokens_seen": 51303335, "step": 2395, "time_per_iteration": 2.635171890258789 }, { "auxiliary_loss_clip": 0.01199159, "auxiliary_loss_mlp": 0.01029856, "balance_loss_clip": 1.02042556, "balance_loss_mlp": 1.02118957, "epoch": 0.288101965971262, "flos": 25110033200640.0, "grad_norm": 4.176897819375736, "language_loss": 0.84344918, "learning_rate": 3.3411145916777146e-06, "loss": 0.86573929, "num_input_tokens_seen": 51317495, "step": 2396, "time_per_iteration": 2.706068754196167 }, { "auxiliary_loss_clip": 0.01191728, "auxiliary_loss_mlp": 0.0104214, "balance_loss_clip": 0.97928071, "balance_loss_mlp": 1.0330677, "epoch": 0.28822220886190103, "flos": 16252559654400.0, "grad_norm": 2.5004566800792065, "language_loss": 0.91110086, "learning_rate": 3.3405366038200566e-06, "loss": 0.93343961, "num_input_tokens_seen": 51336430, "step": 2397, "time_per_iteration": 2.7195515632629395 }, { "auxiliary_loss_clip": 0.01204775, "auxiliary_loss_mlp": 0.0103478, "balance_loss_clip": 0.98570335, "balance_loss_mlp": 1.02514768, "epoch": 0.28834245175254014, "flos": 24535642815360.0, "grad_norm": 2.40301043301708, "language_loss": 0.85099769, "learning_rate": 3.3399584126040617e-06, "loss": 0.87339324, "num_input_tokens_seen": 51355930, "step": 2398, "time_per_iteration": 2.705413341522217 }, { "auxiliary_loss_clip": 0.01205447, "auxiliary_loss_mlp": 0.0112536, "balance_loss_clip": 1.06098986, "balance_loss_mlp": 0.0, "epoch": 0.2884626946431792, "flos": 24571445696640.0, "grad_norm": 2.0995941933331297, "language_loss": 0.90633404, "learning_rate": 3.339380018117441e-06, "loss": 0.92964214, "num_input_tokens_seen": 51376765, "step": 2399, "time_per_iteration": 2.658223867416382 }, { "auxiliary_loss_clip": 0.01198034, "auxiliary_loss_mlp": 0.01033621, "balance_loss_clip": 1.02104282, "balance_loss_mlp": 1.02429914, "epoch": 0.2885829375338183, "flos": 16544657053440.0, "grad_norm": 3.0604406662484487, "language_loss": 0.77896667, "learning_rate": 3.3388014204479366e-06, "loss": 0.80128324, "num_input_tokens_seen": 51394570, "step": 2400, "time_per_iteration": 2.6402950286865234 }, { "auxiliary_loss_clip": 0.01205578, "auxiliary_loss_mlp": 0.01029811, "balance_loss_clip": 1.06014967, "balance_loss_mlp": 1.02082264, "epoch": 0.2887031804244574, "flos": 24061226958720.0, "grad_norm": 2.224032966905301, "language_loss": 0.91512859, "learning_rate": 3.338222619683321e-06, "loss": 0.93748248, "num_input_tokens_seen": 51414535, "step": 2401, "time_per_iteration": 2.7073590755462646 }, { "auxiliary_loss_clip": 0.01203605, "auxiliary_loss_mlp": 0.0104384, "balance_loss_clip": 0.98403895, "balance_loss_mlp": 1.03441691, "epoch": 0.2888234233150965, "flos": 23330696152320.0, "grad_norm": 5.73240360066121, "language_loss": 0.7363233, "learning_rate": 3.337643615911398e-06, "loss": 0.75879776, "num_input_tokens_seen": 51434160, "step": 2402, "time_per_iteration": 2.684750556945801 }, { "auxiliary_loss_clip": 0.01202936, "auxiliary_loss_mlp": 0.01040195, "balance_loss_clip": 1.01994801, "balance_loss_mlp": 1.03063428, "epoch": 0.2889436662057356, "flos": 22272767856000.0, "grad_norm": 1.9277411131622397, "language_loss": 0.78943706, "learning_rate": 3.3370644092200026e-06, "loss": 0.81186837, "num_input_tokens_seen": 51451435, "step": 2403, "time_per_iteration": 2.7303850650787354 }, { "auxiliary_loss_clip": 0.01183465, "auxiliary_loss_mlp": 0.01032424, "balance_loss_clip": 0.9399789, "balance_loss_mlp": 1.0236268, "epoch": 0.2890639090963747, "flos": 21616931381760.0, "grad_norm": 4.015439821689167, "language_loss": 0.78377676, "learning_rate": 3.3364849996969985e-06, "loss": 0.80593568, "num_input_tokens_seen": 51471455, "step": 2404, "time_per_iteration": 2.6993701457977295 }, { "auxiliary_loss_clip": 0.01200672, "auxiliary_loss_mlp": 0.01034405, "balance_loss_clip": 1.02190566, "balance_loss_mlp": 1.0254643, "epoch": 0.28918415198701375, "flos": 28585540333440.0, "grad_norm": 1.9820609799723872, "language_loss": 0.8547979, "learning_rate": 3.335905387430283e-06, "loss": 0.87714863, "num_input_tokens_seen": 51492890, "step": 2405, "time_per_iteration": 2.802813768386841 }, { "auxiliary_loss_clip": 0.01205813, "auxiliary_loss_mlp": 0.01034625, "balance_loss_clip": 0.98184776, "balance_loss_mlp": 1.02582133, "epoch": 0.28930439487765286, "flos": 21944688007680.0, "grad_norm": 1.9766911700340488, "language_loss": 0.82663321, "learning_rate": 3.335325572507782e-06, "loss": 0.84903765, "num_input_tokens_seen": 51513390, "step": 2406, "time_per_iteration": 2.726977825164795 }, { "auxiliary_loss_clip": 0.01204698, "auxiliary_loss_mlp": 0.01125371, "balance_loss_clip": 1.06165874, "balance_loss_mlp": 0.0, "epoch": 0.28942463776829197, "flos": 19281911955840.0, "grad_norm": 1.596154639669897, "language_loss": 0.73801506, "learning_rate": 3.3347455550174537e-06, "loss": 0.76131576, "num_input_tokens_seen": 51532730, "step": 2407, "time_per_iteration": 2.7225866317749023 }, { "auxiliary_loss_clip": 0.01190512, "auxiliary_loss_mlp": 0.01032006, "balance_loss_clip": 0.94038177, "balance_loss_mlp": 1.02280343, "epoch": 0.289544880658931, "flos": 14645700737280.0, "grad_norm": 2.019715917040385, "language_loss": 0.67993283, "learning_rate": 3.3341653350472864e-06, "loss": 0.70215797, "num_input_tokens_seen": 51549560, "step": 2408, "time_per_iteration": 3.7701423168182373 }, { "auxiliary_loss_clip": 0.01210706, "auxiliary_loss_mlp": 0.01036446, "balance_loss_clip": 1.05957544, "balance_loss_mlp": 1.02648592, "epoch": 0.28966512354957014, "flos": 28621881918720.0, "grad_norm": 2.204439855528983, "language_loss": 0.68848032, "learning_rate": 3.333584912685298e-06, "loss": 0.71095181, "num_input_tokens_seen": 51568180, "step": 2409, "time_per_iteration": 2.679001569747925 }, { "auxiliary_loss_clip": 0.01098986, "auxiliary_loss_mlp": 0.01006491, "balance_loss_clip": 0.91046256, "balance_loss_mlp": 1.00370169, "epoch": 0.28978536644020925, "flos": 64711784511360.0, "grad_norm": 0.8836634804989233, "language_loss": 0.5559774, "learning_rate": 3.3330042880195385e-06, "loss": 0.57703215, "num_input_tokens_seen": 51622530, "step": 2410, "time_per_iteration": 3.270258665084839 }, { "auxiliary_loss_clip": 0.01196605, "auxiliary_loss_mlp": 0.01031819, "balance_loss_clip": 0.97838187, "balance_loss_mlp": 1.02243078, "epoch": 0.2899056093308483, "flos": 18624638937600.0, "grad_norm": 1.853307281025987, "language_loss": 0.79069757, "learning_rate": 3.3324234611380888e-06, "loss": 0.81298184, "num_input_tokens_seen": 51641260, "step": 2411, "time_per_iteration": 2.7274436950683594 }, { "auxiliary_loss_clip": 0.0119427, "auxiliary_loss_mlp": 0.01036878, "balance_loss_clip": 0.9456166, "balance_loss_mlp": 1.02818751, "epoch": 0.2900258522214874, "flos": 22893735202560.0, "grad_norm": 1.49105590308866, "language_loss": 0.82073045, "learning_rate": 3.3318424321290596e-06, "loss": 0.8430419, "num_input_tokens_seen": 51660975, "step": 2412, "time_per_iteration": 3.713630199432373 }, { "auxiliary_loss_clip": 0.01096935, "auxiliary_loss_mlp": 0.01007685, "balance_loss_clip": 0.90980041, "balance_loss_mlp": 1.00474095, "epoch": 0.2901460951121265, "flos": 71106036013440.0, "grad_norm": 0.833116175910257, "language_loss": 0.6000728, "learning_rate": 3.3312612010805917e-06, "loss": 0.62111902, "num_input_tokens_seen": 51720550, "step": 2413, "time_per_iteration": 3.369288206100464 }, { "auxiliary_loss_clip": 0.01189116, "auxiliary_loss_mlp": 0.01032714, "balance_loss_clip": 0.97975039, "balance_loss_mlp": 1.0236783, "epoch": 0.2902663380027656, "flos": 32160986081280.0, "grad_norm": 1.729226987635877, "language_loss": 0.70376563, "learning_rate": 3.330679768080858e-06, "loss": 0.72598398, "num_input_tokens_seen": 51744435, "step": 2414, "time_per_iteration": 3.772744655609131 }, { "auxiliary_loss_clip": 0.01204101, "auxiliary_loss_mlp": 0.01032359, "balance_loss_clip": 1.02489042, "balance_loss_mlp": 1.02313209, "epoch": 0.2903865808934047, "flos": 29351658539520.0, "grad_norm": 1.9542427853048498, "language_loss": 0.83341467, "learning_rate": 3.3300981332180627e-06, "loss": 0.85577929, "num_input_tokens_seen": 51763640, "step": 2415, "time_per_iteration": 3.7222962379455566 }, { "auxiliary_loss_clip": 0.01205651, "auxiliary_loss_mlp": 0.01034214, "balance_loss_clip": 0.94430631, "balance_loss_mlp": 1.02536845, "epoch": 0.29050682378404374, "flos": 17089026647040.0, "grad_norm": 2.3941731077986335, "language_loss": 0.80126631, "learning_rate": 3.3295162965804373e-06, "loss": 0.82366502, "num_input_tokens_seen": 51782135, "step": 2416, "time_per_iteration": 2.8246119022369385 }, { "auxiliary_loss_clip": 0.01197638, "auxiliary_loss_mlp": 0.01027333, "balance_loss_clip": 0.94564551, "balance_loss_mlp": 1.01898873, "epoch": 0.29062706667468285, "flos": 17858233422720.0, "grad_norm": 2.151917577656584, "language_loss": 0.78387165, "learning_rate": 3.328934258256247e-06, "loss": 0.80612135, "num_input_tokens_seen": 51800200, "step": 2417, "time_per_iteration": 2.80025577545166 }, { "auxiliary_loss_clip": 0.01199905, "auxiliary_loss_mlp": 0.01032708, "balance_loss_clip": 1.01831985, "balance_loss_mlp": 1.02303433, "epoch": 0.29074730956532197, "flos": 24279815174400.0, "grad_norm": 2.7254266433149086, "language_loss": 0.66880357, "learning_rate": 3.3283520183337856e-06, "loss": 0.69112968, "num_input_tokens_seen": 51819905, "step": 2418, "time_per_iteration": 2.658466100692749 }, { "auxiliary_loss_clip": 0.01198323, "auxiliary_loss_mlp": 0.01031147, "balance_loss_clip": 0.98184073, "balance_loss_mlp": 1.02186024, "epoch": 0.290867552455961, "flos": 22340961826560.0, "grad_norm": 1.7817093381438038, "language_loss": 0.69625437, "learning_rate": 3.3277695769013797e-06, "loss": 0.71854907, "num_input_tokens_seen": 51839350, "step": 2419, "time_per_iteration": 2.6657474040985107 }, { "auxiliary_loss_clip": 0.01202234, "auxiliary_loss_mlp": 0.01037321, "balance_loss_clip": 1.02198291, "balance_loss_mlp": 1.0277071, "epoch": 0.29098779534660013, "flos": 23186155824000.0, "grad_norm": 1.8171454361945936, "language_loss": 0.77495182, "learning_rate": 3.327186934047385e-06, "loss": 0.79734737, "num_input_tokens_seen": 51858045, "step": 2420, "time_per_iteration": 2.6761014461517334 }, { "auxiliary_loss_clip": 0.01188899, "auxiliary_loss_mlp": 0.01034029, "balance_loss_clip": 0.97830713, "balance_loss_mlp": 1.02526093, "epoch": 0.29110803823723924, "flos": 15304194817920.0, "grad_norm": 1.7835600168804389, "language_loss": 0.65478384, "learning_rate": 3.3266040898601877e-06, "loss": 0.67701316, "num_input_tokens_seen": 51875880, "step": 2421, "time_per_iteration": 2.642688035964966 }, { "auxiliary_loss_clip": 0.01187227, "auxiliary_loss_mlp": 0.01037086, "balance_loss_clip": 0.90148419, "balance_loss_mlp": 1.02785349, "epoch": 0.2912282811278783, "flos": 22595352923520.0, "grad_norm": 1.7600100524831097, "language_loss": 0.7800808, "learning_rate": 3.3260210444282045e-06, "loss": 0.80232388, "num_input_tokens_seen": 51893835, "step": 2422, "time_per_iteration": 2.800309658050537 }, { "auxiliary_loss_clip": 0.01197516, "auxiliary_loss_mlp": 0.01035986, "balance_loss_clip": 1.01999092, "balance_loss_mlp": 1.02706909, "epoch": 0.2913485240185174, "flos": 24497900599680.0, "grad_norm": 2.124257777336987, "language_loss": 0.72899973, "learning_rate": 3.325437797839883e-06, "loss": 0.75133473, "num_input_tokens_seen": 51912205, "step": 2423, "time_per_iteration": 2.728722095489502 }, { "auxiliary_loss_clip": 0.01203103, "auxiliary_loss_mlp": 0.01034402, "balance_loss_clip": 1.05802476, "balance_loss_mlp": 1.02568221, "epoch": 0.2914687669091565, "flos": 17931024334080.0, "grad_norm": 2.285822695229735, "language_loss": 0.74893665, "learning_rate": 3.3248543501837015e-06, "loss": 0.77131164, "num_input_tokens_seen": 51929410, "step": 2424, "time_per_iteration": 2.556746244430542 }, { "auxiliary_loss_clip": 0.01205778, "auxiliary_loss_mlp": 0.01034644, "balance_loss_clip": 0.90838438, "balance_loss_mlp": 1.02550638, "epoch": 0.2915890097997956, "flos": 22529313768960.0, "grad_norm": 2.0333809223822708, "language_loss": 0.77478653, "learning_rate": 3.3242707015481684e-06, "loss": 0.79719079, "num_input_tokens_seen": 51949345, "step": 2425, "time_per_iteration": 2.8218865394592285 }, { "auxiliary_loss_clip": 0.01198902, "auxiliary_loss_mlp": 0.01043426, "balance_loss_clip": 0.97864604, "balance_loss_mlp": 1.03419924, "epoch": 0.2917092526904347, "flos": 13845216193920.0, "grad_norm": 1.677297984988179, "language_loss": 0.80714118, "learning_rate": 3.323686852021823e-06, "loss": 0.82956451, "num_input_tokens_seen": 51966855, "step": 2426, "time_per_iteration": 2.738102912902832 }, { "auxiliary_loss_clip": 0.01201316, "auxiliary_loss_mlp": 0.01038545, "balance_loss_clip": 0.93968934, "balance_loss_mlp": 1.02901411, "epoch": 0.2918294955810738, "flos": 22674859678080.0, "grad_norm": 1.9428497664060231, "language_loss": 0.79725564, "learning_rate": 3.323102801693235e-06, "loss": 0.81965423, "num_input_tokens_seen": 51985620, "step": 2427, "time_per_iteration": 2.875330924987793 }, { "auxiliary_loss_clip": 0.0119377, "auxiliary_loss_mlp": 0.0103155, "balance_loss_clip": 1.01803458, "balance_loss_mlp": 1.02200699, "epoch": 0.29194973847171285, "flos": 23438284364160.0, "grad_norm": 1.9785685117453184, "language_loss": 0.80580401, "learning_rate": 3.322518550651003e-06, "loss": 0.82805717, "num_input_tokens_seen": 52004930, "step": 2428, "time_per_iteration": 2.730367422103882 }, { "auxiliary_loss_clip": 0.01206745, "auxiliary_loss_mlp": 0.01031411, "balance_loss_clip": 0.98113787, "balance_loss_mlp": 1.02186823, "epoch": 0.29206998136235196, "flos": 21909064694400.0, "grad_norm": 1.6365166183799285, "language_loss": 0.81303692, "learning_rate": 3.3219340989837586e-06, "loss": 0.83541846, "num_input_tokens_seen": 52024920, "step": 2429, "time_per_iteration": 2.696115016937256 }, { "auxiliary_loss_clip": 0.01201613, "auxiliary_loss_mlp": 0.01032064, "balance_loss_clip": 0.98296881, "balance_loss_mlp": 1.02373147, "epoch": 0.292190224252991, "flos": 23215925220480.0, "grad_norm": 1.7780775213526214, "language_loss": 0.80626166, "learning_rate": 3.3213494467801625e-06, "loss": 0.82859838, "num_input_tokens_seen": 52044095, "step": 2430, "time_per_iteration": 2.676456928253174 }, { "auxiliary_loss_clip": 0.01187077, "auxiliary_loss_mlp": 0.01028753, "balance_loss_clip": 0.82142216, "balance_loss_mlp": 1.01950252, "epoch": 0.2923104671436301, "flos": 20740818752640.0, "grad_norm": 2.19017024495622, "language_loss": 0.70892727, "learning_rate": 3.3207645941289063e-06, "loss": 0.73108554, "num_input_tokens_seen": 52062440, "step": 2431, "time_per_iteration": 2.9399492740631104 }, { "auxiliary_loss_clip": 0.01201342, "auxiliary_loss_mlp": 0.01126281, "balance_loss_clip": 1.02136064, "balance_loss_mlp": 0.0, "epoch": 0.29243071003426924, "flos": 35809114999680.0, "grad_norm": 2.018027086924839, "language_loss": 0.80012822, "learning_rate": 3.320179541118711e-06, "loss": 0.82340455, "num_input_tokens_seen": 52084940, "step": 2432, "time_per_iteration": 3.137984037399292 }, { "auxiliary_loss_clip": 0.01096972, "auxiliary_loss_mlp": 0.01003822, "balance_loss_clip": 0.98433983, "balance_loss_mlp": 1.00086582, "epoch": 0.2925509529249083, "flos": 58081598524800.0, "grad_norm": 1.0031675562425986, "language_loss": 0.60326523, "learning_rate": 3.3195942878383293e-06, "loss": 0.62427312, "num_input_tokens_seen": 52141040, "step": 2433, "time_per_iteration": 3.2322158813476562 }, { "auxiliary_loss_clip": 0.01205562, "auxiliary_loss_mlp": 0.01032117, "balance_loss_clip": 1.02097213, "balance_loss_mlp": 1.02255678, "epoch": 0.2926711958155474, "flos": 21397122103680.0, "grad_norm": 1.9967971341313069, "language_loss": 0.778979, "learning_rate": 3.319008834376543e-06, "loss": 0.80135578, "num_input_tokens_seen": 52160730, "step": 2434, "time_per_iteration": 2.7398428916931152 }, { "auxiliary_loss_clip": 0.0120217, "auxiliary_loss_mlp": 0.01034173, "balance_loss_clip": 0.93840027, "balance_loss_mlp": 1.02441609, "epoch": 0.2927914387061865, "flos": 23185796688000.0, "grad_norm": 2.327100187491823, "language_loss": 0.88771963, "learning_rate": 3.3184231808221654e-06, "loss": 0.910083, "num_input_tokens_seen": 52175055, "step": 2435, "time_per_iteration": 3.8159923553466797 }, { "auxiliary_loss_clip": 0.01200866, "auxiliary_loss_mlp": 0.01029974, "balance_loss_clip": 0.94214571, "balance_loss_mlp": 1.0209614, "epoch": 0.29291168159682557, "flos": 22455553190400.0, "grad_norm": 1.7594368498036443, "language_loss": 0.62845689, "learning_rate": 3.3178373272640394e-06, "loss": 0.6507653, "num_input_tokens_seen": 52194150, "step": 2436, "time_per_iteration": 2.699065923690796 }, { "auxiliary_loss_clip": 0.01201407, "auxiliary_loss_mlp": 0.0103378, "balance_loss_clip": 1.05769062, "balance_loss_mlp": 1.02455902, "epoch": 0.2930319244874647, "flos": 21170632896000.0, "grad_norm": 2.230385509986443, "language_loss": 0.85257578, "learning_rate": 3.3172512737910387e-06, "loss": 0.8749277, "num_input_tokens_seen": 52211660, "step": 2437, "time_per_iteration": 3.8513903617858887 }, { "auxiliary_loss_clip": 0.01201212, "auxiliary_loss_mlp": 0.01038081, "balance_loss_clip": 1.01982021, "balance_loss_mlp": 1.02907443, "epoch": 0.2931521673781038, "flos": 31357843931520.0, "grad_norm": 2.0246720609298947, "language_loss": 0.88399184, "learning_rate": 3.3166650204920674e-06, "loss": 0.90638471, "num_input_tokens_seen": 52232830, "step": 2438, "time_per_iteration": 2.778045892715454 }, { "auxiliary_loss_clip": 0.01203516, "auxiliary_loss_mlp": 0.01040229, "balance_loss_clip": 1.02242851, "balance_loss_mlp": 1.03082931, "epoch": 0.29327241026874284, "flos": 24200990778240.0, "grad_norm": 1.889456933769567, "language_loss": 0.81639171, "learning_rate": 3.316078567456059e-06, "loss": 0.83882916, "num_input_tokens_seen": 52250670, "step": 2439, "time_per_iteration": 2.7425220012664795 }, { "auxiliary_loss_clip": 0.01200836, "auxiliary_loss_mlp": 0.01029593, "balance_loss_clip": 0.86679411, "balance_loss_mlp": 1.0205276, "epoch": 0.29339265315938196, "flos": 24242611662720.0, "grad_norm": 1.503934416152367, "language_loss": 0.76491988, "learning_rate": 3.3154919147719786e-06, "loss": 0.78722417, "num_input_tokens_seen": 52271685, "step": 2440, "time_per_iteration": 3.96586275100708 }, { "auxiliary_loss_clip": 0.01202357, "auxiliary_loss_mlp": 0.01032315, "balance_loss_clip": 1.02013707, "balance_loss_mlp": 1.022367, "epoch": 0.29351289605002107, "flos": 16946641134720.0, "grad_norm": 1.8728594433772463, "language_loss": 0.86329931, "learning_rate": 3.31490506252882e-06, "loss": 0.88564605, "num_input_tokens_seen": 52291065, "step": 2441, "time_per_iteration": 3.4703354835510254 }, { "auxiliary_loss_clip": 0.01190139, "auxiliary_loss_mlp": 0.01034115, "balance_loss_clip": 0.94093513, "balance_loss_mlp": 1.0254066, "epoch": 0.2936331389406601, "flos": 19829082810240.0, "grad_norm": 1.8487965823988575, "language_loss": 0.84181428, "learning_rate": 3.31431801081561e-06, "loss": 0.86405683, "num_input_tokens_seen": 52310000, "step": 2442, "time_per_iteration": 2.6593286991119385 }, { "auxiliary_loss_clip": 0.01090417, "auxiliary_loss_mlp": 0.01007809, "balance_loss_clip": 0.94427252, "balance_loss_mlp": 1.00473309, "epoch": 0.29375338183129923, "flos": 71416844398080.0, "grad_norm": 0.903727202631929, "language_loss": 0.67899871, "learning_rate": 3.313730759721402e-06, "loss": 0.69998097, "num_input_tokens_seen": 52372930, "step": 2443, "time_per_iteration": 3.293757200241089 }, { "auxiliary_loss_clip": 0.01198613, "auxiliary_loss_mlp": 0.01036154, "balance_loss_clip": 0.98201621, "balance_loss_mlp": 1.0277493, "epoch": 0.29387362472193834, "flos": 22054502862720.0, "grad_norm": 2.201870393733751, "language_loss": 0.86134672, "learning_rate": 3.313143309335282e-06, "loss": 0.88369435, "num_input_tokens_seen": 52391420, "step": 2444, "time_per_iteration": 2.6955909729003906 }, { "auxiliary_loss_clip": 0.01198243, "auxiliary_loss_mlp": 0.01030814, "balance_loss_clip": 0.94451177, "balance_loss_mlp": 1.02109277, "epoch": 0.2939938676125774, "flos": 22966418373120.0, "grad_norm": 1.777485230454883, "language_loss": 0.84928393, "learning_rate": 3.3125556597463665e-06, "loss": 0.87157446, "num_input_tokens_seen": 52410725, "step": 2445, "time_per_iteration": 2.7368342876434326 }, { "auxiliary_loss_clip": 0.01200909, "auxiliary_loss_mlp": 0.01035827, "balance_loss_clip": 1.02216625, "balance_loss_mlp": 1.02701116, "epoch": 0.2941141105032165, "flos": 31358705857920.0, "grad_norm": 1.9639037864391948, "language_loss": 0.66421354, "learning_rate": 3.311967811043801e-06, "loss": 0.6865809, "num_input_tokens_seen": 52432645, "step": 2446, "time_per_iteration": 2.736133337020874 }, { "auxiliary_loss_clip": 0.01202123, "auxiliary_loss_mlp": 0.01049691, "balance_loss_clip": 1.02197254, "balance_loss_mlp": 1.03973091, "epoch": 0.29423435339385556, "flos": 23222138273280.0, "grad_norm": 2.1080205978190394, "language_loss": 0.81826079, "learning_rate": 3.3113797633167617e-06, "loss": 0.84077895, "num_input_tokens_seen": 52450940, "step": 2447, "time_per_iteration": 2.8068342208862305 }, { "auxiliary_loss_clip": 0.01202065, "auxiliary_loss_mlp": 0.01032663, "balance_loss_clip": 1.05860281, "balance_loss_mlp": 1.02402639, "epoch": 0.2943545962844947, "flos": 26864054138880.0, "grad_norm": 2.481595914335354, "language_loss": 0.69052368, "learning_rate": 3.310791516654455e-06, "loss": 0.71287102, "num_input_tokens_seen": 52468000, "step": 2448, "time_per_iteration": 2.682800531387329 }, { "auxiliary_loss_clip": 0.01205669, "auxiliary_loss_mlp": 0.01034084, "balance_loss_clip": 0.94289839, "balance_loss_mlp": 1.02408838, "epoch": 0.2944748391751338, "flos": 20231677422720.0, "grad_norm": 2.3077656519850724, "language_loss": 0.79612857, "learning_rate": 3.3102030711461177e-06, "loss": 0.81852615, "num_input_tokens_seen": 52487575, "step": 2449, "time_per_iteration": 2.728415012359619 }, { "auxiliary_loss_clip": 0.01201232, "auxiliary_loss_mlp": 0.01028467, "balance_loss_clip": 0.94292361, "balance_loss_mlp": 1.0194726, "epoch": 0.29459508206577284, "flos": 15960965045760.0, "grad_norm": 1.7341341036939817, "language_loss": 0.67562431, "learning_rate": 3.3096144268810156e-06, "loss": 0.69792128, "num_input_tokens_seen": 52506335, "step": 2450, "time_per_iteration": 2.6496734619140625 }, { "auxiliary_loss_clip": 0.01189732, "auxiliary_loss_mlp": 0.01030575, "balance_loss_clip": 1.01734579, "balance_loss_mlp": 1.02076983, "epoch": 0.29471532495641195, "flos": 20412882558720.0, "grad_norm": 1.9409369400225738, "language_loss": 0.72666728, "learning_rate": 3.3090255839484462e-06, "loss": 0.74887031, "num_input_tokens_seen": 52524330, "step": 2451, "time_per_iteration": 2.6447713375091553 }, { "auxiliary_loss_clip": 0.01201683, "auxiliary_loss_mlp": 0.01029338, "balance_loss_clip": 0.98053408, "balance_loss_mlp": 1.01938987, "epoch": 0.29483556784705106, "flos": 20376576887040.0, "grad_norm": 1.8513338048901606, "language_loss": 0.8532908, "learning_rate": 3.3084365424377366e-06, "loss": 0.87560105, "num_input_tokens_seen": 52543095, "step": 2452, "time_per_iteration": 2.684168815612793 }, { "auxiliary_loss_clip": 0.01105909, "auxiliary_loss_mlp": 0.01004383, "balance_loss_clip": 0.87787747, "balance_loss_mlp": 1.00102139, "epoch": 0.2949558107376901, "flos": 68555660595840.0, "grad_norm": 0.7285652910090866, "language_loss": 0.55987179, "learning_rate": 3.307847302438245e-06, "loss": 0.58097476, "num_input_tokens_seen": 52597075, "step": 2453, "time_per_iteration": 3.3153493404388428 }, { "auxiliary_loss_clip": 0.01179412, "auxiliary_loss_mlp": 0.01038621, "balance_loss_clip": 0.89955598, "balance_loss_mlp": 1.02894664, "epoch": 0.2950760536283292, "flos": 16107085572480.0, "grad_norm": 1.9887050759693945, "language_loss": 0.77646279, "learning_rate": 3.3072578640393562e-06, "loss": 0.79864311, "num_input_tokens_seen": 52614410, "step": 2454, "time_per_iteration": 2.936155319213867 }, { "auxiliary_loss_clip": 0.0120276, "auxiliary_loss_mlp": 0.01030795, "balance_loss_clip": 0.98284888, "balance_loss_mlp": 1.02168727, "epoch": 0.29519629651896834, "flos": 20483626394880.0, "grad_norm": 2.2737932809834636, "language_loss": 0.79274201, "learning_rate": 3.3066682273304886e-06, "loss": 0.81507754, "num_input_tokens_seen": 52632055, "step": 2455, "time_per_iteration": 2.688434362411499 }, { "auxiliary_loss_clip": 0.01207471, "auxiliary_loss_mlp": 0.01126288, "balance_loss_clip": 1.02040815, "balance_loss_mlp": 0.0, "epoch": 0.2953165394096074, "flos": 18916484941440.0, "grad_norm": 2.2534014095874024, "language_loss": 0.78800666, "learning_rate": 3.3060783924010904e-06, "loss": 0.81134427, "num_input_tokens_seen": 52649980, "step": 2456, "time_per_iteration": 2.754831552505493 }, { "auxiliary_loss_clip": 0.01200037, "auxiliary_loss_mlp": 0.01031688, "balance_loss_clip": 0.9447881, "balance_loss_mlp": 1.02246165, "epoch": 0.2954367823002465, "flos": 20624467622400.0, "grad_norm": 2.0942555824080182, "language_loss": 0.84890711, "learning_rate": 3.3054883593406387e-06, "loss": 0.87122428, "num_input_tokens_seen": 52664730, "step": 2457, "time_per_iteration": 2.7823634147644043 }, { "auxiliary_loss_clip": 0.01202346, "auxiliary_loss_mlp": 0.01033854, "balance_loss_clip": 0.98144472, "balance_loss_mlp": 1.02484822, "epoch": 0.2955570251908856, "flos": 31175525473920.0, "grad_norm": 3.0479852718182503, "language_loss": 0.64803386, "learning_rate": 3.3048981282386404e-06, "loss": 0.67039585, "num_input_tokens_seen": 52686040, "step": 2458, "time_per_iteration": 2.884507894515991 }, { "auxiliary_loss_clip": 0.01185103, "auxiliary_loss_mlp": 0.01034425, "balance_loss_clip": 0.94343776, "balance_loss_mlp": 1.02520418, "epoch": 0.29567726808152467, "flos": 21650328051840.0, "grad_norm": 2.0643115995627697, "language_loss": 0.82558215, "learning_rate": 3.304307699184634e-06, "loss": 0.84777749, "num_input_tokens_seen": 52704630, "step": 2459, "time_per_iteration": 2.784229278564453 }, { "auxiliary_loss_clip": 0.01202778, "auxiliary_loss_mlp": 0.01038938, "balance_loss_clip": 0.98418808, "balance_loss_mlp": 1.02991998, "epoch": 0.2957975109721638, "flos": 24243868638720.0, "grad_norm": 1.574941683767417, "language_loss": 0.79004169, "learning_rate": 3.3037170722681866e-06, "loss": 0.81245887, "num_input_tokens_seen": 52725465, "step": 2460, "time_per_iteration": 3.7397711277008057 }, { "auxiliary_loss_clip": 0.01185888, "auxiliary_loss_mlp": 0.0103123, "balance_loss_clip": 0.94139153, "balance_loss_mlp": 1.02226543, "epoch": 0.29591775386280283, "flos": 13479717352320.0, "grad_norm": 2.3735329455128644, "language_loss": 0.68782675, "learning_rate": 3.3031262475788956e-06, "loss": 0.70999789, "num_input_tokens_seen": 52742405, "step": 2461, "time_per_iteration": 2.7413711547851562 }, { "auxiliary_loss_clip": 0.01196628, "auxiliary_loss_mlp": 0.01037015, "balance_loss_clip": 0.98117596, "balance_loss_mlp": 1.02837217, "epoch": 0.29603799675344195, "flos": 17749783284480.0, "grad_norm": 1.7122735204189754, "language_loss": 0.73233312, "learning_rate": 3.3025352252063897e-06, "loss": 0.75466955, "num_input_tokens_seen": 52761100, "step": 2462, "time_per_iteration": 2.699871778488159 }, { "auxiliary_loss_clip": 0.01199433, "auxiliary_loss_mlp": 0.01033459, "balance_loss_clip": 1.02181125, "balance_loss_mlp": 1.02416039, "epoch": 0.29615823964408106, "flos": 22783920347520.0, "grad_norm": 2.1775419711571518, "language_loss": 0.75165921, "learning_rate": 3.3019440052403252e-06, "loss": 0.77398819, "num_input_tokens_seen": 52780965, "step": 2463, "time_per_iteration": 2.7435853481292725 }, { "auxiliary_loss_clip": 0.01202476, "auxiliary_loss_mlp": 0.01034913, "balance_loss_clip": 0.98227227, "balance_loss_mlp": 1.02628803, "epoch": 0.2962784825347201, "flos": 23514199758720.0, "grad_norm": 2.466974694583679, "language_loss": 0.70510042, "learning_rate": 3.30135258777039e-06, "loss": 0.72747427, "num_input_tokens_seen": 52800335, "step": 2464, "time_per_iteration": 3.820847749710083 }, { "auxiliary_loss_clip": 0.01203952, "auxiliary_loss_mlp": 0.01125943, "balance_loss_clip": 1.01804233, "balance_loss_mlp": 0.0, "epoch": 0.2963987254253592, "flos": 16362769559040.0, "grad_norm": 1.8016935677675059, "language_loss": 0.70138526, "learning_rate": 3.3007609728863024e-06, "loss": 0.72468424, "num_input_tokens_seen": 52818425, "step": 2465, "time_per_iteration": 2.678868293762207 }, { "auxiliary_loss_clip": 0.01189359, "auxiliary_loss_mlp": 0.01036213, "balance_loss_clip": 0.86990631, "balance_loss_mlp": 1.02752256, "epoch": 0.29651896831599833, "flos": 33472263980160.0, "grad_norm": 3.220208408274187, "language_loss": 0.72837782, "learning_rate": 3.300169160677809e-06, "loss": 0.7506336, "num_input_tokens_seen": 52842340, "step": 2466, "time_per_iteration": 3.8990540504455566 }, { "auxiliary_loss_clip": 0.01205318, "auxiliary_loss_mlp": 0.01030809, "balance_loss_clip": 0.94368571, "balance_loss_mlp": 1.02109933, "epoch": 0.2966392112066374, "flos": 23805363404160.0, "grad_norm": 2.4734518299111135, "language_loss": 0.77664697, "learning_rate": 3.2995771512346878e-06, "loss": 0.79900819, "num_input_tokens_seen": 52860690, "step": 2467, "time_per_iteration": 3.640841245651245 }, { "auxiliary_loss_clip": 0.01206787, "auxiliary_loss_mlp": 0.01125875, "balance_loss_clip": 1.06017876, "balance_loss_mlp": 0.0, "epoch": 0.2967594540972765, "flos": 19938466702080.0, "grad_norm": 2.1220830331470175, "language_loss": 0.73212671, "learning_rate": 3.298984944646746e-06, "loss": 0.75545335, "num_input_tokens_seen": 52879370, "step": 2468, "time_per_iteration": 2.707937240600586 }, { "auxiliary_loss_clip": 0.01206759, "auxiliary_loss_mlp": 0.01125308, "balance_loss_clip": 1.02409244, "balance_loss_mlp": 0.0, "epoch": 0.2968796969879156, "flos": 23732823888000.0, "grad_norm": 1.7497199819854754, "language_loss": 0.81558919, "learning_rate": 3.298392541003822e-06, "loss": 0.83890986, "num_input_tokens_seen": 52898775, "step": 2469, "time_per_iteration": 2.7965846061706543 }, { "auxiliary_loss_clip": 0.01201808, "auxiliary_loss_mlp": 0.01033215, "balance_loss_clip": 0.98454499, "balance_loss_mlp": 1.02379751, "epoch": 0.29699993987855466, "flos": 22893699288960.0, "grad_norm": 1.71655433160864, "language_loss": 0.89487934, "learning_rate": 3.2977999403957806e-06, "loss": 0.91722953, "num_input_tokens_seen": 52917535, "step": 2470, "time_per_iteration": 2.6909642219543457 }, { "auxiliary_loss_clip": 0.01205255, "auxiliary_loss_mlp": 0.010311, "balance_loss_clip": 1.06148994, "balance_loss_mlp": 1.02198648, "epoch": 0.2971201827691938, "flos": 33832555349760.0, "grad_norm": 1.8115312894128641, "language_loss": 0.66948485, "learning_rate": 3.2972071429125207e-06, "loss": 0.6918484, "num_input_tokens_seen": 52938755, "step": 2471, "time_per_iteration": 2.8525869846343994 }, { "auxiliary_loss_clip": 0.01192874, "auxiliary_loss_mlp": 0.01033307, "balance_loss_clip": 0.94253665, "balance_loss_mlp": 1.02421117, "epoch": 0.2972404256598329, "flos": 22054359208320.0, "grad_norm": 2.1950383873453916, "language_loss": 0.88405865, "learning_rate": 3.2966141486439682e-06, "loss": 0.90632045, "num_input_tokens_seen": 52957945, "step": 2472, "time_per_iteration": 2.7819738388061523 }, { "auxiliary_loss_clip": 0.01195372, "auxiliary_loss_mlp": 0.0103129, "balance_loss_clip": 0.86339658, "balance_loss_mlp": 1.02126408, "epoch": 0.29736066855047194, "flos": 31978595796480.0, "grad_norm": 2.247549233992048, "language_loss": 0.6422379, "learning_rate": 3.29602095768008e-06, "loss": 0.66450453, "num_input_tokens_seen": 52978460, "step": 2473, "time_per_iteration": 2.869983196258545 }, { "auxiliary_loss_clip": 0.01194344, "auxiliary_loss_mlp": 0.01033196, "balance_loss_clip": 0.98327458, "balance_loss_mlp": 1.02423787, "epoch": 0.29748091144111105, "flos": 33510401245440.0, "grad_norm": 1.859930713198558, "language_loss": 0.63866723, "learning_rate": 3.2954275701108437e-06, "loss": 0.66094261, "num_input_tokens_seen": 52999640, "step": 2474, "time_per_iteration": 2.796905279159546 }, { "auxiliary_loss_clip": 0.01190292, "auxiliary_loss_mlp": 0.01037347, "balance_loss_clip": 0.90245891, "balance_loss_mlp": 1.02790594, "epoch": 0.29760115433175016, "flos": 41283373409280.0, "grad_norm": 1.7312871061377877, "language_loss": 0.68548369, "learning_rate": 3.294833986026275e-06, "loss": 0.7077601, "num_input_tokens_seen": 53022880, "step": 2475, "time_per_iteration": 2.9326674938201904 }, { "auxiliary_loss_clip": 0.01189626, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 0.94361782, "balance_loss_mlp": 1.02589834, "epoch": 0.2977213972223892, "flos": 24493339572480.0, "grad_norm": 2.34863905846847, "language_loss": 0.85116303, "learning_rate": 3.29424020551642e-06, "loss": 0.87340611, "num_input_tokens_seen": 53041515, "step": 2476, "time_per_iteration": 2.7818195819854736 }, { "auxiliary_loss_clip": 0.01207495, "auxiliary_loss_mlp": 0.01044936, "balance_loss_clip": 1.06107712, "balance_loss_mlp": 1.03547049, "epoch": 0.2978416401130283, "flos": 21285116519040.0, "grad_norm": 2.040246926914697, "language_loss": 0.72036082, "learning_rate": 3.2936462286713546e-06, "loss": 0.74288511, "num_input_tokens_seen": 53059865, "step": 2477, "time_per_iteration": 2.6605374813079834 }, { "auxiliary_loss_clip": 0.01201802, "auxiliary_loss_mlp": 0.01030651, "balance_loss_clip": 1.02071238, "balance_loss_mlp": 1.02057219, "epoch": 0.2979618830036674, "flos": 25772154554880.0, "grad_norm": 2.073837195105151, "language_loss": 0.77436364, "learning_rate": 3.2930520555811846e-06, "loss": 0.7966882, "num_input_tokens_seen": 53079490, "step": 2478, "time_per_iteration": 2.7456867694854736 }, { "auxiliary_loss_clip": 0.01163707, "auxiliary_loss_mlp": 0.01126485, "balance_loss_clip": 0.81932378, "balance_loss_mlp": 0.0, "epoch": 0.2980821258943065, "flos": 23476996247040.0, "grad_norm": 1.8896566280984528, "language_loss": 0.80040598, "learning_rate": 3.292457686336046e-06, "loss": 0.82330787, "num_input_tokens_seen": 53098810, "step": 2479, "time_per_iteration": 2.8743367195129395 }, { "auxiliary_loss_clip": 0.01103317, "auxiliary_loss_mlp": 0.0101665, "balance_loss_clip": 0.95113748, "balance_loss_mlp": 1.01377714, "epoch": 0.2982023687849456, "flos": 69752314195200.0, "grad_norm": 0.8614690619440338, "language_loss": 0.61253589, "learning_rate": 3.291863121026105e-06, "loss": 0.63373554, "num_input_tokens_seen": 53162590, "step": 2480, "time_per_iteration": 3.665418863296509 }, { "auxiliary_loss_clip": 0.01203006, "auxiliary_loss_mlp": 0.01033533, "balance_loss_clip": 1.02300811, "balance_loss_mlp": 1.02397883, "epoch": 0.29832261167558466, "flos": 29825930741760.0, "grad_norm": 1.8574983769015148, "language_loss": 0.76762569, "learning_rate": 3.2912683597415547e-06, "loss": 0.78999102, "num_input_tokens_seen": 53186675, "step": 2481, "time_per_iteration": 2.7899703979492188 }, { "auxiliary_loss_clip": 0.0120156, "auxiliary_loss_mlp": 0.01031584, "balance_loss_clip": 0.94362998, "balance_loss_mlp": 1.02232111, "epoch": 0.29844285456622377, "flos": 33910158683520.0, "grad_norm": 2.1209514470410538, "language_loss": 0.77556515, "learning_rate": 3.2906734025726213e-06, "loss": 0.79789662, "num_input_tokens_seen": 53205940, "step": 2482, "time_per_iteration": 3.1277096271514893 }, { "auxiliary_loss_clip": 0.01208627, "auxiliary_loss_mlp": 0.01031397, "balance_loss_clip": 1.02271068, "balance_loss_mlp": 1.02148485, "epoch": 0.2985630974568629, "flos": 23876933253120.0, "grad_norm": 1.8619188802828006, "language_loss": 0.87776619, "learning_rate": 3.290078249609559e-06, "loss": 0.90016639, "num_input_tokens_seen": 53225360, "step": 2483, "time_per_iteration": 2.7451171875 }, { "auxiliary_loss_clip": 0.01201998, "auxiliary_loss_mlp": 0.01036305, "balance_loss_clip": 1.02322304, "balance_loss_mlp": 1.02737665, "epoch": 0.29868334034750194, "flos": 21799106184960.0, "grad_norm": 2.120015354603101, "language_loss": 0.88423288, "learning_rate": 3.2894829009426514e-06, "loss": 0.90661585, "num_input_tokens_seen": 53243195, "step": 2484, "time_per_iteration": 2.6420230865478516 }, { "auxiliary_loss_clip": 0.01198081, "auxiliary_loss_mlp": 0.010343, "balance_loss_clip": 1.02044821, "balance_loss_mlp": 1.02507341, "epoch": 0.29880358323814105, "flos": 25666649331840.0, "grad_norm": 1.8625766536943218, "language_loss": 0.77875543, "learning_rate": 3.288887356662213e-06, "loss": 0.80107927, "num_input_tokens_seen": 53264530, "step": 2485, "time_per_iteration": 2.677917003631592 }, { "auxiliary_loss_clip": 0.01092006, "auxiliary_loss_mlp": 0.01008137, "balance_loss_clip": 0.98480064, "balance_loss_mlp": 1.00525224, "epoch": 0.29892382612878016, "flos": 71005846003200.0, "grad_norm": 0.7880549536188021, "language_loss": 0.59718138, "learning_rate": 3.288291616858588e-06, "loss": 0.61818278, "num_input_tokens_seen": 53319920, "step": 2486, "time_per_iteration": 4.477405548095703 }, { "auxiliary_loss_clip": 0.01197191, "auxiliary_loss_mlp": 0.01041192, "balance_loss_clip": 0.9087317, "balance_loss_mlp": 1.03210247, "epoch": 0.2990440690194192, "flos": 25481134563840.0, "grad_norm": 1.8417112176132298, "language_loss": 0.76424766, "learning_rate": 3.287695681622149e-06, "loss": 0.78663152, "num_input_tokens_seen": 53339270, "step": 2487, "time_per_iteration": 2.799269914627075 }, { "auxiliary_loss_clip": 0.01207488, "auxiliary_loss_mlp": 0.01031559, "balance_loss_clip": 0.9847278, "balance_loss_mlp": 1.02266598, "epoch": 0.2991643119100583, "flos": 23732357011200.0, "grad_norm": 1.6788588881106112, "language_loss": 0.80842358, "learning_rate": 3.2870995510432982e-06, "loss": 0.830814, "num_input_tokens_seen": 53357750, "step": 2488, "time_per_iteration": 2.7238893508911133 }, { "auxiliary_loss_clip": 0.01194049, "auxiliary_loss_mlp": 0.01034868, "balance_loss_clip": 1.01958251, "balance_loss_mlp": 1.02599239, "epoch": 0.29928455480069743, "flos": 27417545786880.0, "grad_norm": 1.7799450789387796, "language_loss": 0.76993585, "learning_rate": 3.2865032252124697e-06, "loss": 0.792225, "num_input_tokens_seen": 53378265, "step": 2489, "time_per_iteration": 3.7109577655792236 }, { "auxiliary_loss_clip": 0.01194263, "auxiliary_loss_mlp": 0.01037263, "balance_loss_clip": 0.97823477, "balance_loss_mlp": 1.02878702, "epoch": 0.2994047976913365, "flos": 33692935184640.0, "grad_norm": 2.008621665356517, "language_loss": 0.77370518, "learning_rate": 3.2859067042201243e-06, "loss": 0.79602045, "num_input_tokens_seen": 53400305, "step": 2490, "time_per_iteration": 2.7655258178710938 }, { "auxiliary_loss_clip": 0.01169481, "auxiliary_loss_mlp": 0.01029274, "balance_loss_clip": 0.86329615, "balance_loss_mlp": 1.020715, "epoch": 0.2995250405819756, "flos": 16763963541120.0, "grad_norm": 1.8282945977925398, "language_loss": 0.77953595, "learning_rate": 3.2853099881567544e-06, "loss": 0.80152351, "num_input_tokens_seen": 53418705, "step": 2491, "time_per_iteration": 2.8433711528778076 }, { "auxiliary_loss_clip": 0.01202636, "auxiliary_loss_mlp": 0.01037586, "balance_loss_clip": 1.06064618, "balance_loss_mlp": 1.02918744, "epoch": 0.29964528347261465, "flos": 22963976248320.0, "grad_norm": 1.6475391816449279, "language_loss": 0.79336846, "learning_rate": 3.284713077112881e-06, "loss": 0.81577069, "num_input_tokens_seen": 53438135, "step": 2492, "time_per_iteration": 2.6849985122680664 }, { "auxiliary_loss_clip": 0.01208662, "auxiliary_loss_mlp": 0.0104373, "balance_loss_clip": 0.94843721, "balance_loss_mlp": 1.03419328, "epoch": 0.29976552636325376, "flos": 16938021870720.0, "grad_norm": 4.366687555616971, "language_loss": 0.86883491, "learning_rate": 3.284115971179056e-06, "loss": 0.89135879, "num_input_tokens_seen": 53452165, "step": 2493, "time_per_iteration": 3.6348865032196045 }, { "auxiliary_loss_clip": 0.01200318, "auxiliary_loss_mlp": 0.01037235, "balance_loss_clip": 0.86817968, "balance_loss_mlp": 1.02857399, "epoch": 0.2998857692538929, "flos": 17056455989760.0, "grad_norm": 1.7185449194200753, "language_loss": 0.78626579, "learning_rate": 3.283518670445859e-06, "loss": 0.80864131, "num_input_tokens_seen": 53470075, "step": 2494, "time_per_iteration": 2.7967872619628906 }, { "auxiliary_loss_clip": 0.01088757, "auxiliary_loss_mlp": 0.01120699, "balance_loss_clip": 0.94551504, "balance_loss_mlp": 0.0, "epoch": 0.30000601214453193, "flos": 68831528025600.0, "grad_norm": 0.6818989561709633, "language_loss": 0.54306567, "learning_rate": 3.2829211750038995e-06, "loss": 0.56516021, "num_input_tokens_seen": 53538705, "step": 2495, "time_per_iteration": 3.3285820484161377 }, { "auxiliary_loss_clip": 0.01195693, "auxiliary_loss_mlp": 0.01035052, "balance_loss_clip": 0.94402885, "balance_loss_mlp": 1.02580702, "epoch": 0.30012625503517104, "flos": 17603267708160.0, "grad_norm": 2.0550035676429923, "language_loss": 0.89191341, "learning_rate": 3.2823234849438183e-06, "loss": 0.91422093, "num_input_tokens_seen": 53556740, "step": 2496, "time_per_iteration": 2.8098878860473633 }, { "auxiliary_loss_clip": 0.01205367, "auxiliary_loss_mlp": 0.01030139, "balance_loss_clip": 0.98405111, "balance_loss_mlp": 1.02174652, "epoch": 0.30024649792581015, "flos": 21252581775360.0, "grad_norm": 1.935030441223049, "language_loss": 0.75913537, "learning_rate": 3.2817256003562836e-06, "loss": 0.78149045, "num_input_tokens_seen": 53577115, "step": 2497, "time_per_iteration": 2.810328722000122 }, { "auxiliary_loss_clip": 0.01199089, "auxiliary_loss_mlp": 0.01033575, "balance_loss_clip": 0.86701107, "balance_loss_mlp": 1.02298284, "epoch": 0.3003667408164492, "flos": 23003262748800.0, "grad_norm": 1.8571487563919657, "language_loss": 0.6623081, "learning_rate": 3.281127521331995e-06, "loss": 0.68463469, "num_input_tokens_seen": 53598295, "step": 2498, "time_per_iteration": 2.8498120307922363 }, { "auxiliary_loss_clip": 0.01089588, "auxiliary_loss_mlp": 0.01004007, "balance_loss_clip": 1.01973808, "balance_loss_mlp": 1.001122, "epoch": 0.3004869837070883, "flos": 64232340750720.0, "grad_norm": 0.8826277316212152, "language_loss": 0.60714608, "learning_rate": 3.2805292479616798e-06, "loss": 0.62808198, "num_input_tokens_seen": 53657160, "step": 2499, "time_per_iteration": 3.0844039916992188 }, { "auxiliary_loss_clip": 0.01199805, "auxiliary_loss_mlp": 0.01044721, "balance_loss_clip": 0.98098713, "balance_loss_mlp": 1.03588748, "epoch": 0.30060722659772743, "flos": 26248653400320.0, "grad_norm": 2.3799216428694447, "language_loss": 0.91487384, "learning_rate": 3.2799307803360955e-06, "loss": 0.93731916, "num_input_tokens_seen": 53673090, "step": 2500, "time_per_iteration": 2.724062204360962 }, { "auxiliary_loss_clip": 0.01197926, "auxiliary_loss_mlp": 0.0102635, "balance_loss_clip": 1.05670762, "balance_loss_mlp": 1.01743889, "epoch": 0.3007274694883665, "flos": 24970879912320.0, "grad_norm": 1.488374718308427, "language_loss": 0.81621879, "learning_rate": 3.27933211854603e-06, "loss": 0.83846158, "num_input_tokens_seen": 53692145, "step": 2501, "time_per_iteration": 2.7213189601898193 }, { "auxiliary_loss_clip": 0.01199674, "auxiliary_loss_mlp": 0.01027654, "balance_loss_clip": 0.98191142, "balance_loss_mlp": 1.0188266, "epoch": 0.3008477123790056, "flos": 17055845458560.0, "grad_norm": 1.575782011488699, "language_loss": 0.8721993, "learning_rate": 3.278733262682299e-06, "loss": 0.8944726, "num_input_tokens_seen": 53710000, "step": 2502, "time_per_iteration": 2.6547138690948486 }, { "auxiliary_loss_clip": 0.0120181, "auxiliary_loss_mlp": 0.01039996, "balance_loss_clip": 1.05818343, "balance_loss_mlp": 1.02979159, "epoch": 0.3009679552696447, "flos": 21506398254720.0, "grad_norm": 2.1961457323481226, "language_loss": 0.82698882, "learning_rate": 3.2781342128357484e-06, "loss": 0.84940684, "num_input_tokens_seen": 53729355, "step": 2503, "time_per_iteration": 2.67177677154541 }, { "auxiliary_loss_clip": 0.01196204, "auxiliary_loss_mlp": 0.01030982, "balance_loss_clip": 0.94037008, "balance_loss_mlp": 1.02282166, "epoch": 0.30108819816028376, "flos": 21134004001920.0, "grad_norm": 3.159848511694097, "language_loss": 0.80934417, "learning_rate": 3.2775349690972547e-06, "loss": 0.83161604, "num_input_tokens_seen": 53743505, "step": 2504, "time_per_iteration": 2.77787446975708 }, { "auxiliary_loss_clip": 0.01084094, "auxiliary_loss_mlp": 0.01002931, "balance_loss_clip": 0.97875535, "balance_loss_mlp": 0.99983138, "epoch": 0.30120844105092287, "flos": 71126434938240.0, "grad_norm": 0.7583358183491319, "language_loss": 0.51853579, "learning_rate": 3.276935531557722e-06, "loss": 0.53940606, "num_input_tokens_seen": 53808725, "step": 2505, "time_per_iteration": 3.3835220336914062 }, { "auxiliary_loss_clip": 0.01202565, "auxiliary_loss_mlp": 0.01039509, "balance_loss_clip": 0.90619886, "balance_loss_mlp": 1.02971029, "epoch": 0.301328683941562, "flos": 20264571302400.0, "grad_norm": 2.2566146738923645, "language_loss": 0.79597068, "learning_rate": 3.2763359003080837e-06, "loss": 0.81839144, "num_input_tokens_seen": 53825680, "step": 2506, "time_per_iteration": 2.8408665657043457 }, { "auxiliary_loss_clip": 0.01093675, "auxiliary_loss_mlp": 0.01003089, "balance_loss_clip": 0.94512665, "balance_loss_mlp": 1.0001086, "epoch": 0.30144892683220104, "flos": 70648212240000.0, "grad_norm": 0.8264307667147652, "language_loss": 0.62559074, "learning_rate": 3.2757360754393047e-06, "loss": 0.64655834, "num_input_tokens_seen": 53889750, "step": 2507, "time_per_iteration": 3.3496391773223877 }, { "auxiliary_loss_clip": 0.01199987, "auxiliary_loss_mlp": 0.01037062, "balance_loss_clip": 1.02083611, "balance_loss_mlp": 1.02758455, "epoch": 0.30156916972284015, "flos": 22820549241600.0, "grad_norm": 2.370918851016458, "language_loss": 0.63617659, "learning_rate": 3.2751360570423767e-06, "loss": 0.65854704, "num_input_tokens_seen": 53908135, "step": 2508, "time_per_iteration": 2.7631657123565674 }, { "auxiliary_loss_clip": 0.01197262, "auxiliary_loss_mlp": 0.01025311, "balance_loss_clip": 0.98188281, "balance_loss_mlp": 1.01651919, "epoch": 0.3016894126134792, "flos": 29899188529920.0, "grad_norm": 1.8508804461743922, "language_loss": 0.75653291, "learning_rate": 3.2745358452083236e-06, "loss": 0.77875865, "num_input_tokens_seen": 53931035, "step": 2509, "time_per_iteration": 2.7924644947052 }, { "auxiliary_loss_clip": 0.01198834, "auxiliary_loss_mlp": 0.01029831, "balance_loss_clip": 1.01979232, "balance_loss_mlp": 1.02187419, "epoch": 0.3018096555041183, "flos": 21546331200000.0, "grad_norm": 1.7993305855932253, "language_loss": 0.82220763, "learning_rate": 3.2739354400281955e-06, "loss": 0.84449428, "num_input_tokens_seen": 53952255, "step": 2510, "time_per_iteration": 2.8368849754333496 }, { "auxiliary_loss_clip": 0.01093297, "auxiliary_loss_mlp": 0.01120638, "balance_loss_clip": 0.9058888, "balance_loss_mlp": 0.0, "epoch": 0.3019298983947574, "flos": 59136294597120.0, "grad_norm": 0.8775179114805267, "language_loss": 0.63749731, "learning_rate": 3.2733348415930744e-06, "loss": 0.65963668, "num_input_tokens_seen": 54014125, "step": 2511, "time_per_iteration": 3.3587701320648193 }, { "auxiliary_loss_clip": 0.01190312, "auxiliary_loss_mlp": 0.01028727, "balance_loss_clip": 0.94046247, "balance_loss_mlp": 1.02013779, "epoch": 0.3020501412853965, "flos": 34423070941440.0, "grad_norm": 2.0563999039336998, "language_loss": 0.80852342, "learning_rate": 3.27273404999407e-06, "loss": 0.83071375, "num_input_tokens_seen": 54036345, "step": 2512, "time_per_iteration": 3.8780314922332764 }, { "auxiliary_loss_clip": 0.01094049, "auxiliary_loss_mlp": 0.01005781, "balance_loss_clip": 0.94365668, "balance_loss_mlp": 1.00293207, "epoch": 0.3021703841760356, "flos": 71008288128000.0, "grad_norm": 0.8144689357071289, "language_loss": 0.60472643, "learning_rate": 3.272133065322322e-06, "loss": 0.62572467, "num_input_tokens_seen": 54094615, "step": 2513, "time_per_iteration": 3.269415855407715 }, { "auxiliary_loss_clip": 0.0119766, "auxiliary_loss_mlp": 0.01030882, "balance_loss_clip": 1.05664456, "balance_loss_mlp": 1.02236462, "epoch": 0.3022906270666747, "flos": 21510528318720.0, "grad_norm": 1.6523833163470767, "language_loss": 0.79166663, "learning_rate": 3.271531887669e-06, "loss": 0.81395209, "num_input_tokens_seen": 54114675, "step": 2514, "time_per_iteration": 2.6928746700286865 }, { "auxiliary_loss_clip": 0.01200322, "auxiliary_loss_mlp": 0.01035625, "balance_loss_clip": 0.90253854, "balance_loss_mlp": 1.0261538, "epoch": 0.30241086995731375, "flos": 31132001168640.0, "grad_norm": 2.099696673318503, "language_loss": 0.63388324, "learning_rate": 3.2709305171253015e-06, "loss": 0.65624273, "num_input_tokens_seen": 54134795, "step": 2515, "time_per_iteration": 3.8674163818359375 }, { "auxiliary_loss_clip": 0.01200426, "auxiliary_loss_mlp": 0.01038985, "balance_loss_clip": 1.02270722, "balance_loss_mlp": 1.03033042, "epoch": 0.30253111284795287, "flos": 23511542152320.0, "grad_norm": 1.7729458924205288, "language_loss": 0.7750476, "learning_rate": 3.2703289537824536e-06, "loss": 0.79744172, "num_input_tokens_seen": 54154595, "step": 2516, "time_per_iteration": 2.7170534133911133 }, { "auxiliary_loss_clip": 0.01200484, "auxiliary_loss_mlp": 0.01039314, "balance_loss_clip": 0.90531665, "balance_loss_mlp": 1.02969348, "epoch": 0.302651355738592, "flos": 18725367651840.0, "grad_norm": 2.584998289120869, "language_loss": 0.78736216, "learning_rate": 3.269727197731714e-06, "loss": 0.80976009, "num_input_tokens_seen": 54167360, "step": 2517, "time_per_iteration": 2.689401865005493 }, { "auxiliary_loss_clip": 0.01188973, "auxiliary_loss_mlp": 0.01034226, "balance_loss_clip": 0.90403742, "balance_loss_mlp": 1.0252676, "epoch": 0.30277159862923103, "flos": 22418888382720.0, "grad_norm": 1.6953506098381024, "language_loss": 0.78145057, "learning_rate": 3.269125249064367e-06, "loss": 0.80368257, "num_input_tokens_seen": 54187055, "step": 2518, "time_per_iteration": 2.8256256580352783 }, { "auxiliary_loss_clip": 0.01203626, "auxiliary_loss_mlp": 0.01032449, "balance_loss_clip": 1.05737591, "balance_loss_mlp": 1.02322769, "epoch": 0.30289184151987014, "flos": 22273126992000.0, "grad_norm": 2.1556237419120774, "language_loss": 0.83396816, "learning_rate": 3.2685231078717297e-06, "loss": 0.85632896, "num_input_tokens_seen": 54207245, "step": 2519, "time_per_iteration": 5.079745531082153 }, { "auxiliary_loss_clip": 0.01186848, "auxiliary_loss_mlp": 0.01125618, "balance_loss_clip": 0.94240153, "balance_loss_mlp": 0.0, "epoch": 0.30301208441050925, "flos": 25225594231680.0, "grad_norm": 3.1268790334952734, "language_loss": 0.75303757, "learning_rate": 3.267920774245145e-06, "loss": 0.77616227, "num_input_tokens_seen": 54226650, "step": 2520, "time_per_iteration": 2.920599937438965 }, { "auxiliary_loss_clip": 0.01197812, "auxiliary_loss_mlp": 0.01037128, "balance_loss_clip": 1.01943517, "balance_loss_mlp": 1.02768028, "epoch": 0.3031323273011483, "flos": 23039245198080.0, "grad_norm": 1.9843053786698657, "language_loss": 0.84594959, "learning_rate": 3.2673182482759876e-06, "loss": 0.86829901, "num_input_tokens_seen": 54245765, "step": 2521, "time_per_iteration": 2.6330037117004395 }, { "auxiliary_loss_clip": 0.01198699, "auxiliary_loss_mlp": 0.01028113, "balance_loss_clip": 1.02005744, "balance_loss_mlp": 1.0190289, "epoch": 0.3032525701917874, "flos": 18876695650560.0, "grad_norm": 2.5828176888581913, "language_loss": 0.66075075, "learning_rate": 3.266715530055659e-06, "loss": 0.68301892, "num_input_tokens_seen": 54263915, "step": 2522, "time_per_iteration": 2.6052918434143066 }, { "auxiliary_loss_clip": 0.01189581, "auxiliary_loss_mlp": 0.01023508, "balance_loss_clip": 1.01778412, "balance_loss_mlp": 1.01465058, "epoch": 0.30337281308242653, "flos": 17782641250560.0, "grad_norm": 1.8177780647057802, "language_loss": 0.80540693, "learning_rate": 3.2661126196755927e-06, "loss": 0.82753778, "num_input_tokens_seen": 54283025, "step": 2523, "time_per_iteration": 2.6552140712738037 }, { "auxiliary_loss_clip": 0.01087541, "auxiliary_loss_mlp": 0.01006959, "balance_loss_clip": 1.01799834, "balance_loss_mlp": 1.00419331, "epoch": 0.3034930559730656, "flos": 57824298426240.0, "grad_norm": 0.7836613297682147, "language_loss": 0.55990899, "learning_rate": 3.265509517227248e-06, "loss": 0.58085406, "num_input_tokens_seen": 54339840, "step": 2524, "time_per_iteration": 3.198430299758911 }, { "auxiliary_loss_clip": 0.01195303, "auxiliary_loss_mlp": 0.01031621, "balance_loss_clip": 0.97791243, "balance_loss_mlp": 1.02285933, "epoch": 0.3036132988637047, "flos": 14755587419520.0, "grad_norm": 1.643831767967308, "language_loss": 0.80692941, "learning_rate": 3.264906222802115e-06, "loss": 0.82919866, "num_input_tokens_seen": 54357690, "step": 2525, "time_per_iteration": 2.7509541511535645 }, { "auxiliary_loss_clip": 0.01199513, "auxiliary_loss_mlp": 0.01035126, "balance_loss_clip": 1.05560553, "balance_loss_mlp": 1.02590513, "epoch": 0.30373354175434375, "flos": 21033203460480.0, "grad_norm": 2.008680948641446, "language_loss": 0.78060305, "learning_rate": 3.264302736491715e-06, "loss": 0.80294949, "num_input_tokens_seen": 54377810, "step": 2526, "time_per_iteration": 2.6323883533477783 }, { "auxiliary_loss_clip": 0.01193631, "auxiliary_loss_mlp": 0.01039363, "balance_loss_clip": 1.01886868, "balance_loss_mlp": 1.03093505, "epoch": 0.30385378464498286, "flos": 21143233797120.0, "grad_norm": 1.8730972980649114, "language_loss": 0.87618113, "learning_rate": 3.263699058387594e-06, "loss": 0.89851105, "num_input_tokens_seen": 54395245, "step": 2527, "time_per_iteration": 2.6918678283691406 }, { "auxiliary_loss_clip": 0.01187308, "auxiliary_loss_mlp": 0.01033891, "balance_loss_clip": 0.93809992, "balance_loss_mlp": 1.02461684, "epoch": 0.30397402753562197, "flos": 20629244131200.0, "grad_norm": 2.060883276419869, "language_loss": 0.9047879, "learning_rate": 3.2630951885813315e-06, "loss": 0.92699987, "num_input_tokens_seen": 54412640, "step": 2528, "time_per_iteration": 2.773925542831421 }, { "auxiliary_loss_clip": 0.01196795, "auxiliary_loss_mlp": 0.01029143, "balance_loss_clip": 0.97826064, "balance_loss_mlp": 1.02013123, "epoch": 0.304094270426261, "flos": 15085678429440.0, "grad_norm": 1.8964648697326467, "language_loss": 0.78127587, "learning_rate": 3.262491127164533e-06, "loss": 0.80353522, "num_input_tokens_seen": 54431455, "step": 2529, "time_per_iteration": 2.689737319946289 }, { "auxiliary_loss_clip": 0.01202315, "auxiliary_loss_mlp": 0.01125276, "balance_loss_clip": 0.97841406, "balance_loss_mlp": 0.0, "epoch": 0.30421451331690014, "flos": 13845216193920.0, "grad_norm": 2.553255394957389, "language_loss": 0.80129182, "learning_rate": 3.2618868742288337e-06, "loss": 0.82456779, "num_input_tokens_seen": 54448380, "step": 2530, "time_per_iteration": 2.7300171852111816 }, { "auxiliary_loss_clip": 0.01196006, "auxiliary_loss_mlp": 0.01034467, "balance_loss_clip": 1.01902699, "balance_loss_mlp": 1.02539515, "epoch": 0.30433475620753925, "flos": 17384212615680.0, "grad_norm": 1.7274544898117525, "language_loss": 0.72279215, "learning_rate": 3.261282429865899e-06, "loss": 0.7450968, "num_input_tokens_seen": 54466385, "step": 2531, "time_per_iteration": 2.690270185470581 }, { "auxiliary_loss_clip": 0.01203151, "auxiliary_loss_mlp": 0.01125427, "balance_loss_clip": 0.98169452, "balance_loss_mlp": 0.0, "epoch": 0.3044549990981783, "flos": 18916951818240.0, "grad_norm": 1.5086789985316011, "language_loss": 0.72142076, "learning_rate": 3.2606777941674225e-06, "loss": 0.74470651, "num_input_tokens_seen": 54485040, "step": 2532, "time_per_iteration": 2.7653791904449463 }, { "auxiliary_loss_clip": 0.01185669, "auxiliary_loss_mlp": 0.01039451, "balance_loss_clip": 0.90382421, "balance_loss_mlp": 1.03021812, "epoch": 0.3045752419888174, "flos": 21068431724160.0, "grad_norm": 2.032405368762612, "language_loss": 0.84664178, "learning_rate": 3.2600729672251276e-06, "loss": 0.86889303, "num_input_tokens_seen": 54502755, "step": 2533, "time_per_iteration": 2.7615163326263428 }, { "auxiliary_loss_clip": 0.01199334, "auxiliary_loss_mlp": 0.01125454, "balance_loss_clip": 1.05707622, "balance_loss_mlp": 0.0, "epoch": 0.3046954848794565, "flos": 29096405516160.0, "grad_norm": 1.9778987605420246, "language_loss": 0.65446889, "learning_rate": 3.259467949130765e-06, "loss": 0.67771673, "num_input_tokens_seen": 54524165, "step": 2534, "time_per_iteration": 2.6956615447998047 }, { "auxiliary_loss_clip": 0.01199361, "auxiliary_loss_mlp": 0.01030833, "balance_loss_clip": 0.98148066, "balance_loss_mlp": 1.02213717, "epoch": 0.3048157277700956, "flos": 20295346279680.0, "grad_norm": 2.109860494158819, "language_loss": 0.8321991, "learning_rate": 3.2588627399761164e-06, "loss": 0.85450101, "num_input_tokens_seen": 54540160, "step": 2535, "time_per_iteration": 2.745185613632202 }, { "auxiliary_loss_clip": 0.01196018, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 0.98111105, "balance_loss_mlp": 1.02711141, "epoch": 0.3049359706607347, "flos": 22739929165440.0, "grad_norm": 1.6637417687388494, "language_loss": 0.7058534, "learning_rate": 3.2582573398529903e-06, "loss": 0.72816867, "num_input_tokens_seen": 54557515, "step": 2536, "time_per_iteration": 2.7310004234313965 }, { "auxiliary_loss_clip": 0.01196441, "auxiliary_loss_mlp": 0.01031635, "balance_loss_clip": 0.94153738, "balance_loss_mlp": 1.02238989, "epoch": 0.3050562135513738, "flos": 18434634969600.0, "grad_norm": 2.13491841480153, "language_loss": 0.73852056, "learning_rate": 3.2576517488532265e-06, "loss": 0.76080132, "num_input_tokens_seen": 54573865, "step": 2537, "time_per_iteration": 2.7467784881591797 }, { "auxiliary_loss_clip": 0.01194649, "auxiliary_loss_mlp": 0.01036904, "balance_loss_clip": 1.01674414, "balance_loss_mlp": 1.0286901, "epoch": 0.30517645644201286, "flos": 20370327920640.0, "grad_norm": 1.7808018930853966, "language_loss": 0.87396967, "learning_rate": 3.257045967068692e-06, "loss": 0.89628518, "num_input_tokens_seen": 54593120, "step": 2538, "time_per_iteration": 3.646397113800049 }, { "auxiliary_loss_clip": 0.01202793, "auxiliary_loss_mlp": 0.01031508, "balance_loss_clip": 1.05868495, "balance_loss_mlp": 1.02243638, "epoch": 0.30529669933265197, "flos": 21945118970880.0, "grad_norm": 1.5837946126567262, "language_loss": 0.81943733, "learning_rate": 3.2564399945912848e-06, "loss": 0.8417803, "num_input_tokens_seen": 54612910, "step": 2539, "time_per_iteration": 2.6230483055114746 }, { "auxiliary_loss_clip": 0.0119672, "auxiliary_loss_mlp": 0.01027288, "balance_loss_clip": 0.9021796, "balance_loss_mlp": 1.01854968, "epoch": 0.305416942223291, "flos": 21835411856640.0, "grad_norm": 2.095116758845134, "language_loss": 0.81924212, "learning_rate": 3.2558338315129287e-06, "loss": 0.84148228, "num_input_tokens_seen": 54631055, "step": 2540, "time_per_iteration": 2.728867769241333 }, { "auxiliary_loss_clip": 0.01191621, "auxiliary_loss_mlp": 0.0103824, "balance_loss_clip": 1.01886225, "balance_loss_mlp": 1.0299077, "epoch": 0.30553718511393013, "flos": 33911810709120.0, "grad_norm": 3.022430103254836, "language_loss": 0.76024395, "learning_rate": 3.2552274779255785e-06, "loss": 0.78254259, "num_input_tokens_seen": 54651985, "step": 2541, "time_per_iteration": 3.6939854621887207 }, { "auxiliary_loss_clip": 0.01196663, "auxiliary_loss_mlp": 0.01036911, "balance_loss_clip": 1.01903915, "balance_loss_mlp": 1.02854812, "epoch": 0.30565742800456924, "flos": 22268530051200.0, "grad_norm": 2.3732172979784796, "language_loss": 0.76624978, "learning_rate": 3.2546209339212184e-06, "loss": 0.78858554, "num_input_tokens_seen": 54671005, "step": 2542, "time_per_iteration": 2.6758830547332764 }, { "auxiliary_loss_clip": 0.01195788, "auxiliary_loss_mlp": 0.01035703, "balance_loss_clip": 0.97809827, "balance_loss_mlp": 1.02696538, "epoch": 0.3057776708952083, "flos": 22565044823040.0, "grad_norm": 1.4760882940942166, "language_loss": 0.77576512, "learning_rate": 3.25401419959186e-06, "loss": 0.79808009, "num_input_tokens_seen": 54691615, "step": 2543, "time_per_iteration": 2.775662660598755 }, { "auxiliary_loss_clip": 0.01208932, "auxiliary_loss_mlp": 0.01035162, "balance_loss_clip": 0.98376566, "balance_loss_mlp": 1.02614355, "epoch": 0.3058979137858474, "flos": 21799213925760.0, "grad_norm": 2.0487287976390074, "language_loss": 0.7591573, "learning_rate": 3.253407275029545e-06, "loss": 0.78159833, "num_input_tokens_seen": 54710520, "step": 2544, "time_per_iteration": 3.638982057571411 }, { "auxiliary_loss_clip": 0.01200068, "auxiliary_loss_mlp": 0.0103326, "balance_loss_clip": 0.94437081, "balance_loss_mlp": 1.0231626, "epoch": 0.3060181566764865, "flos": 26979435601920.0, "grad_norm": 3.3588385952861857, "language_loss": 0.80006945, "learning_rate": 3.2528001603263425e-06, "loss": 0.82240278, "num_input_tokens_seen": 54732590, "step": 2545, "time_per_iteration": 3.5522475242614746 }, { "auxiliary_loss_clip": 0.01195506, "auxiliary_loss_mlp": 0.01032614, "balance_loss_clip": 1.01719093, "balance_loss_mlp": 1.02347088, "epoch": 0.3061383995671256, "flos": 19865101173120.0, "grad_norm": 2.2724577979611866, "language_loss": 0.81455475, "learning_rate": 3.2521928555743514e-06, "loss": 0.83683592, "num_input_tokens_seen": 54749935, "step": 2546, "time_per_iteration": 2.611419439315796 }, { "auxiliary_loss_clip": 0.01188742, "auxiliary_loss_mlp": 0.01125587, "balance_loss_clip": 0.9784416, "balance_loss_mlp": 0.0, "epoch": 0.3062586424577647, "flos": 22127509255680.0, "grad_norm": 1.9124676154456504, "language_loss": 0.67571199, "learning_rate": 3.2515853608657e-06, "loss": 0.69885528, "num_input_tokens_seen": 54767935, "step": 2547, "time_per_iteration": 2.7056756019592285 }, { "auxiliary_loss_clip": 0.01190871, "auxiliary_loss_mlp": 0.01036313, "balance_loss_clip": 1.01722908, "balance_loss_mlp": 1.02713394, "epoch": 0.3063788853484038, "flos": 20845497962880.0, "grad_norm": 2.2411522471879053, "language_loss": 0.75313401, "learning_rate": 3.250977676292545e-06, "loss": 0.77540588, "num_input_tokens_seen": 54786175, "step": 2548, "time_per_iteration": 2.6700000762939453 }, { "auxiliary_loss_clip": 0.0119679, "auxiliary_loss_mlp": 0.01031697, "balance_loss_clip": 0.97791135, "balance_loss_mlp": 1.02276206, "epoch": 0.30649912823904285, "flos": 16209717707520.0, "grad_norm": 2.2127716347319177, "language_loss": 0.79342401, "learning_rate": 3.2503698019470712e-06, "loss": 0.81570888, "num_input_tokens_seen": 54801945, "step": 2549, "time_per_iteration": 2.660386562347412 }, { "auxiliary_loss_clip": 0.01194169, "auxiliary_loss_mlp": 0.01033765, "balance_loss_clip": 1.01505566, "balance_loss_mlp": 1.0253365, "epoch": 0.30661937112968196, "flos": 18617815353600.0, "grad_norm": 2.5436699901502324, "language_loss": 0.78523719, "learning_rate": 3.249761737921492e-06, "loss": 0.80751652, "num_input_tokens_seen": 54818475, "step": 2550, "time_per_iteration": 2.653716802597046 }, { "auxiliary_loss_clip": 0.01196274, "auxiliary_loss_mlp": 0.01036073, "balance_loss_clip": 0.98167604, "balance_loss_mlp": 1.02702451, "epoch": 0.30673961402032107, "flos": 31390809638400.0, "grad_norm": 1.9497840690611346, "language_loss": 0.73846751, "learning_rate": 3.249153484308051e-06, "loss": 0.76079094, "num_input_tokens_seen": 54837090, "step": 2551, "time_per_iteration": 2.768099069595337 }, { "auxiliary_loss_clip": 0.01181391, "auxiliary_loss_mlp": 0.01026801, "balance_loss_clip": 0.90175581, "balance_loss_mlp": 1.01771688, "epoch": 0.3068598569109601, "flos": 20229809915520.0, "grad_norm": 1.9298319510118285, "language_loss": 0.77724016, "learning_rate": 3.2485450411990194e-06, "loss": 0.79932201, "num_input_tokens_seen": 54856445, "step": 2552, "time_per_iteration": 2.7745518684387207 }, { "auxiliary_loss_clip": 0.01197937, "auxiliary_loss_mlp": 0.01031038, "balance_loss_clip": 1.05493283, "balance_loss_mlp": 1.02184701, "epoch": 0.30698009980159924, "flos": 29601991399680.0, "grad_norm": 1.7413902213022643, "language_loss": 0.82454681, "learning_rate": 3.2479364086866983e-06, "loss": 0.84683657, "num_input_tokens_seen": 54876700, "step": 2553, "time_per_iteration": 2.714779853820801 }, { "auxiliary_loss_clip": 0.0119529, "auxiliary_loss_mlp": 0.0112568, "balance_loss_clip": 0.9791435, "balance_loss_mlp": 0.0, "epoch": 0.30710034269223835, "flos": 23842423261440.0, "grad_norm": 1.6757478817663007, "language_loss": 0.81512415, "learning_rate": 3.247327586863416e-06, "loss": 0.83833385, "num_input_tokens_seen": 54897580, "step": 2554, "time_per_iteration": 2.7161710262298584 }, { "auxiliary_loss_clip": 0.01198672, "auxiliary_loss_mlp": 0.0102577, "balance_loss_clip": 0.94047987, "balance_loss_mlp": 1.01687145, "epoch": 0.3072205855828774, "flos": 25884986152320.0, "grad_norm": 2.4088489847989942, "language_loss": 0.77243543, "learning_rate": 3.2467185758215304e-06, "loss": 0.79467982, "num_input_tokens_seen": 54917320, "step": 2555, "time_per_iteration": 2.7446823120117188 }, { "auxiliary_loss_clip": 0.01200162, "auxiliary_loss_mlp": 0.01125202, "balance_loss_clip": 0.94327235, "balance_loss_mlp": 0.0, "epoch": 0.3073408284735165, "flos": 22236390357120.0, "grad_norm": 2.306785504506554, "language_loss": 0.85349894, "learning_rate": 3.246109375653428e-06, "loss": 0.87675256, "num_input_tokens_seen": 54934085, "step": 2556, "time_per_iteration": 2.664123058319092 }, { "auxiliary_loss_clip": 0.01202117, "auxiliary_loss_mlp": 0.01032192, "balance_loss_clip": 1.05836964, "balance_loss_mlp": 1.02350712, "epoch": 0.30746107136415557, "flos": 19500284689920.0, "grad_norm": 1.695463508407054, "language_loss": 0.78665954, "learning_rate": 3.2454999864515243e-06, "loss": 0.80900264, "num_input_tokens_seen": 54953460, "step": 2557, "time_per_iteration": 2.627285957336426 }, { "auxiliary_loss_clip": 0.01191102, "auxiliary_loss_mlp": 0.01125388, "balance_loss_clip": 0.97827983, "balance_loss_mlp": 0.0, "epoch": 0.3075813142547947, "flos": 21724806902400.0, "grad_norm": 2.0712214380281733, "language_loss": 0.69475096, "learning_rate": 3.244890408308263e-06, "loss": 0.71791583, "num_input_tokens_seen": 54974165, "step": 2558, "time_per_iteration": 2.6978375911712646 }, { "auxiliary_loss_clip": 0.01192506, "auxiliary_loss_mlp": 0.01035374, "balance_loss_clip": 0.90006804, "balance_loss_mlp": 1.02670169, "epoch": 0.3077015571454338, "flos": 24097963593600.0, "grad_norm": 1.923792727630622, "language_loss": 0.60680521, "learning_rate": 3.2442806413161165e-06, "loss": 0.62908399, "num_input_tokens_seen": 54993810, "step": 2559, "time_per_iteration": 2.79121732711792 }, { "auxiliary_loss_clip": 0.01198256, "auxiliary_loss_mlp": 0.01031318, "balance_loss_clip": 0.90262473, "balance_loss_mlp": 1.02221584, "epoch": 0.30782180003607285, "flos": 18405476104320.0, "grad_norm": 1.880895176354405, "language_loss": 0.76033849, "learning_rate": 3.243670685567586e-06, "loss": 0.7826342, "num_input_tokens_seen": 55011210, "step": 2560, "time_per_iteration": 2.815709352493286 }, { "auxiliary_loss_clip": 0.01194446, "auxiliary_loss_mlp": 0.01125237, "balance_loss_clip": 0.97888607, "balance_loss_mlp": 0.0, "epoch": 0.30794204292671196, "flos": 23878549365120.0, "grad_norm": 2.162797629038878, "language_loss": 0.80073214, "learning_rate": 3.2430605411552012e-06, "loss": 0.82392901, "num_input_tokens_seen": 55031325, "step": 2561, "time_per_iteration": 2.763310670852661 }, { "auxiliary_loss_clip": 0.0109802, "auxiliary_loss_mlp": 0.0099984, "balance_loss_clip": 0.90759331, "balance_loss_mlp": 0.9972412, "epoch": 0.30806228581735107, "flos": 67927800816000.0, "grad_norm": 0.8961493335963514, "language_loss": 0.70595104, "learning_rate": 3.2424502081715205e-06, "loss": 0.72692961, "num_input_tokens_seen": 55094440, "step": 2562, "time_per_iteration": 3.347461223602295 }, { "auxiliary_loss_clip": 0.01198941, "auxiliary_loss_mlp": 0.01031989, "balance_loss_clip": 0.97952771, "balance_loss_mlp": 1.02328038, "epoch": 0.3081825287079901, "flos": 23843213360640.0, "grad_norm": 1.5792377981648578, "language_loss": 0.77920431, "learning_rate": 3.241839686709132e-06, "loss": 0.80151361, "num_input_tokens_seen": 55115375, "step": 2563, "time_per_iteration": 2.82491135597229 }, { "auxiliary_loss_clip": 0.01196292, "auxiliary_loss_mlp": 0.01033452, "balance_loss_clip": 1.01808643, "balance_loss_mlp": 1.02423096, "epoch": 0.30830277159862923, "flos": 16209969102720.0, "grad_norm": 2.1076071293907885, "language_loss": 0.82041097, "learning_rate": 3.2412289768606495e-06, "loss": 0.84270835, "num_input_tokens_seen": 55131945, "step": 2564, "time_per_iteration": 3.809936285018921 }, { "auxiliary_loss_clip": 0.01202297, "auxiliary_loss_mlp": 0.01031535, "balance_loss_clip": 1.02035213, "balance_loss_mlp": 1.02316666, "epoch": 0.30842301448926834, "flos": 29349503723520.0, "grad_norm": 2.1226187399110485, "language_loss": 0.82722688, "learning_rate": 3.240618078718718e-06, "loss": 0.84956521, "num_input_tokens_seen": 55153405, "step": 2565, "time_per_iteration": 2.787773370742798 }, { "auxiliary_loss_clip": 0.01189682, "auxiliary_loss_mlp": 0.01039022, "balance_loss_clip": 0.93868148, "balance_loss_mlp": 1.02973545, "epoch": 0.3085432573799074, "flos": 21945190798080.0, "grad_norm": 1.8488296921168805, "language_loss": 0.74222082, "learning_rate": 3.240006992376011e-06, "loss": 0.76450783, "num_input_tokens_seen": 55173030, "step": 2566, "time_per_iteration": 3.7990808486938477 }, { "auxiliary_loss_clip": 0.01205148, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 0.983899, "balance_loss_mlp": 1.02575815, "epoch": 0.3086635002705465, "flos": 22054718344320.0, "grad_norm": 2.150090295507856, "language_loss": 0.75510526, "learning_rate": 3.2393957179252284e-06, "loss": 0.7774983, "num_input_tokens_seen": 55189565, "step": 2567, "time_per_iteration": 2.8107502460479736 }, { "auxiliary_loss_clip": 0.01204692, "auxiliary_loss_mlp": 0.01032262, "balance_loss_clip": 1.06071401, "balance_loss_mlp": 1.02363122, "epoch": 0.3087837431611856, "flos": 32665925520000.0, "grad_norm": 2.2299184578558453, "language_loss": 0.8075369, "learning_rate": 3.2387842554591016e-06, "loss": 0.8299064, "num_input_tokens_seen": 55210380, "step": 2568, "time_per_iteration": 2.751105785369873 }, { "auxiliary_loss_clip": 0.01203246, "auxiliary_loss_mlp": 0.01030923, "balance_loss_clip": 1.05998969, "balance_loss_mlp": 1.02118349, "epoch": 0.3089039860518247, "flos": 17599245384960.0, "grad_norm": 2.33284539798361, "language_loss": 0.87561834, "learning_rate": 3.238172605070388e-06, "loss": 0.89796007, "num_input_tokens_seen": 55225795, "step": 2569, "time_per_iteration": 2.648090124130249 }, { "auxiliary_loss_clip": 0.01200628, "auxiliary_loss_mlp": 0.0112613, "balance_loss_clip": 1.02140975, "balance_loss_mlp": 0.0, "epoch": 0.3090242289424638, "flos": 14383839611520.0, "grad_norm": 2.03710316640393, "language_loss": 0.78448367, "learning_rate": 3.2375607668518745e-06, "loss": 0.8077513, "num_input_tokens_seen": 55238830, "step": 2570, "time_per_iteration": 3.4798779487609863 }, { "auxiliary_loss_clip": 0.01186932, "auxiliary_loss_mlp": 0.01031842, "balance_loss_clip": 0.98036951, "balance_loss_mlp": 1.02308047, "epoch": 0.30914447183310284, "flos": 16068625084800.0, "grad_norm": 2.3304332602302154, "language_loss": 0.89829361, "learning_rate": 3.236948740896377e-06, "loss": 0.92048138, "num_input_tokens_seen": 55253630, "step": 2571, "time_per_iteration": 3.5683071613311768 }, { "auxiliary_loss_clip": 0.01200396, "auxiliary_loss_mlp": 0.01034945, "balance_loss_clip": 1.02032399, "balance_loss_mlp": 1.0255928, "epoch": 0.30926471472374195, "flos": 32230221546240.0, "grad_norm": 1.4535808625957451, "language_loss": 0.84240103, "learning_rate": 3.2363365272967384e-06, "loss": 0.8647545, "num_input_tokens_seen": 55276200, "step": 2572, "time_per_iteration": 2.757564067840576 }, { "auxiliary_loss_clip": 0.01198034, "auxiliary_loss_mlp": 0.01037334, "balance_loss_clip": 1.02005303, "balance_loss_mlp": 1.0283637, "epoch": 0.30938495761438106, "flos": 20370722970240.0, "grad_norm": 1.8308670940902592, "language_loss": 0.81663263, "learning_rate": 3.235724126145832e-06, "loss": 0.83898628, "num_input_tokens_seen": 55292235, "step": 2573, "time_per_iteration": 2.6554839611053467 }, { "auxiliary_loss_clip": 0.0118741, "auxiliary_loss_mlp": 0.01041117, "balance_loss_clip": 1.01646101, "balance_loss_mlp": 1.03219426, "epoch": 0.3095052005050201, "flos": 24061155131520.0, "grad_norm": 1.5152123317704538, "language_loss": 0.77553988, "learning_rate": 3.235111537536558e-06, "loss": 0.7978251, "num_input_tokens_seen": 55313050, "step": 2574, "time_per_iteration": 2.7347195148468018 }, { "auxiliary_loss_clip": 0.01201484, "auxiliary_loss_mlp": 0.0102883, "balance_loss_clip": 1.02000761, "balance_loss_mlp": 1.02054501, "epoch": 0.30962544339565923, "flos": 23401547729280.0, "grad_norm": 2.124439441851099, "language_loss": 0.82677257, "learning_rate": 3.2344987615618456e-06, "loss": 0.84907568, "num_input_tokens_seen": 55332885, "step": 2575, "time_per_iteration": 2.7479422092437744 }, { "auxiliary_loss_clip": 0.01195289, "auxiliary_loss_mlp": 0.01031588, "balance_loss_clip": 0.94265777, "balance_loss_mlp": 1.02306485, "epoch": 0.30974568628629834, "flos": 33799984692480.0, "grad_norm": 1.5451413605392692, "language_loss": 0.78337348, "learning_rate": 3.2338857983146533e-06, "loss": 0.80564225, "num_input_tokens_seen": 55354385, "step": 2576, "time_per_iteration": 2.876481771469116 }, { "auxiliary_loss_clip": 0.01191169, "auxiliary_loss_mlp": 0.01030631, "balance_loss_clip": 0.98414069, "balance_loss_mlp": 1.02117813, "epoch": 0.3098659291769374, "flos": 20229594433920.0, "grad_norm": 1.8627035624138757, "language_loss": 0.76304102, "learning_rate": 3.233272647887966e-06, "loss": 0.78525895, "num_input_tokens_seen": 55373275, "step": 2577, "time_per_iteration": 2.68685245513916 }, { "auxiliary_loss_clip": 0.01203123, "auxiliary_loss_mlp": 0.01036573, "balance_loss_clip": 1.05942023, "balance_loss_mlp": 1.02715528, "epoch": 0.3099861720675765, "flos": 24748556682240.0, "grad_norm": 1.5550993651939122, "language_loss": 0.90174645, "learning_rate": 3.2326593103747985e-06, "loss": 0.92414343, "num_input_tokens_seen": 55392290, "step": 2578, "time_per_iteration": 2.68536114692688 }, { "auxiliary_loss_clip": 0.01202023, "auxiliary_loss_mlp": 0.01032657, "balance_loss_clip": 1.02288461, "balance_loss_mlp": 1.02272666, "epoch": 0.3101064149582156, "flos": 11765485704960.0, "grad_norm": 1.99663144475044, "language_loss": 0.85118884, "learning_rate": 3.2320457858681936e-06, "loss": 0.87353563, "num_input_tokens_seen": 55410680, "step": 2579, "time_per_iteration": 2.6148226261138916 }, { "auxiliary_loss_clip": 0.01197212, "auxiliary_loss_mlp": 0.0103636, "balance_loss_clip": 0.98017716, "balance_loss_mlp": 1.02730012, "epoch": 0.31022665784885467, "flos": 23033247626880.0, "grad_norm": 2.309097260546068, "language_loss": 0.85317504, "learning_rate": 3.2314320744612228e-06, "loss": 0.87551069, "num_input_tokens_seen": 55425980, "step": 2580, "time_per_iteration": 2.695842742919922 }, { "auxiliary_loss_clip": 0.01195684, "auxiliary_loss_mlp": 0.01031476, "balance_loss_clip": 1.01969743, "balance_loss_mlp": 1.02264237, "epoch": 0.3103469007394938, "flos": 16289188548480.0, "grad_norm": 1.7310032771460764, "language_loss": 0.76066709, "learning_rate": 3.2308181762469854e-06, "loss": 0.78293872, "num_input_tokens_seen": 55443925, "step": 2581, "time_per_iteration": 2.6494381427764893 }, { "auxiliary_loss_clip": 0.01203303, "auxiliary_loss_mlp": 0.01032746, "balance_loss_clip": 1.05803037, "balance_loss_mlp": 1.02337611, "epoch": 0.3104671436301329, "flos": 30515271626880.0, "grad_norm": 1.8999650495591898, "language_loss": 0.78798342, "learning_rate": 3.230204091318609e-06, "loss": 0.81034386, "num_input_tokens_seen": 55464465, "step": 2582, "time_per_iteration": 2.6672892570495605 }, { "auxiliary_loss_clip": 0.01197064, "auxiliary_loss_mlp": 0.01125469, "balance_loss_clip": 1.0555594, "balance_loss_mlp": 0.0, "epoch": 0.31058738652077195, "flos": 20047240062720.0, "grad_norm": 1.9952966078674284, "language_loss": 0.84689802, "learning_rate": 3.2295898197692503e-06, "loss": 0.87012339, "num_input_tokens_seen": 55483425, "step": 2583, "time_per_iteration": 2.6238064765930176 }, { "auxiliary_loss_clip": 0.01199585, "auxiliary_loss_mlp": 0.0103474, "balance_loss_clip": 1.05743992, "balance_loss_mlp": 1.025859, "epoch": 0.31070762941141106, "flos": 28074639237120.0, "grad_norm": 1.7855309865420268, "language_loss": 0.79114288, "learning_rate": 3.228975361692094e-06, "loss": 0.81348616, "num_input_tokens_seen": 55504445, "step": 2584, "time_per_iteration": 2.627662420272827 }, { "auxiliary_loss_clip": 0.0120393, "auxiliary_loss_mlp": 0.0112597, "balance_loss_clip": 1.01886916, "balance_loss_mlp": 0.0, "epoch": 0.31082787230205017, "flos": 20521907314560.0, "grad_norm": 2.1194764008645377, "language_loss": 0.80022633, "learning_rate": 3.228360717180352e-06, "loss": 0.82352531, "num_input_tokens_seen": 55521970, "step": 2585, "time_per_iteration": 2.6651854515075684 }, { "auxiliary_loss_clip": 0.01087948, "auxiliary_loss_mlp": 0.01120423, "balance_loss_clip": 1.01842225, "balance_loss_mlp": 0.0, "epoch": 0.3109481151926892, "flos": 62445928723200.0, "grad_norm": 0.8238776975640061, "language_loss": 0.59395629, "learning_rate": 3.227745886327266e-06, "loss": 0.61603999, "num_input_tokens_seen": 55580665, "step": 2586, "time_per_iteration": 3.1187071800231934 }, { "auxiliary_loss_clip": 0.01088568, "auxiliary_loss_mlp": 0.01002234, "balance_loss_clip": 1.01906419, "balance_loss_mlp": 0.99955177, "epoch": 0.31106835808332833, "flos": 44746744723200.0, "grad_norm": 0.8058883105348184, "language_loss": 0.55854583, "learning_rate": 3.227130869226105e-06, "loss": 0.57945383, "num_input_tokens_seen": 55637825, "step": 2587, "time_per_iteration": 3.222491502761841 }, { "auxiliary_loss_clip": 0.01198962, "auxiliary_loss_mlp": 0.01032444, "balance_loss_clip": 1.01793027, "balance_loss_mlp": 1.02386737, "epoch": 0.3111886009739674, "flos": 23403056100480.0, "grad_norm": 6.774526831813159, "language_loss": 0.82485461, "learning_rate": 3.226515665970167e-06, "loss": 0.84716868, "num_input_tokens_seen": 55655365, "step": 2588, "time_per_iteration": 2.6238362789154053 }, { "auxiliary_loss_clip": 0.01194772, "auxiliary_loss_mlp": 0.0103554, "balance_loss_clip": 1.01851535, "balance_loss_mlp": 1.02669466, "epoch": 0.3113088438646065, "flos": 17530728192000.0, "grad_norm": 2.4640252447565003, "language_loss": 0.87002242, "learning_rate": 3.225900276652777e-06, "loss": 0.89232552, "num_input_tokens_seen": 55672140, "step": 2589, "time_per_iteration": 2.7546768188476562 }, { "auxiliary_loss_clip": 0.01199838, "auxiliary_loss_mlp": 0.01024468, "balance_loss_clip": 0.97765559, "balance_loss_mlp": 1.01594472, "epoch": 0.3114290867552456, "flos": 28365802882560.0, "grad_norm": 1.5027728287343338, "language_loss": 0.7567789, "learning_rate": 3.2252847013672906e-06, "loss": 0.77902198, "num_input_tokens_seen": 55694800, "step": 2590, "time_per_iteration": 3.821995973587036 }, { "auxiliary_loss_clip": 0.011844, "auxiliary_loss_mlp": 0.01029738, "balance_loss_clip": 0.93820846, "balance_loss_mlp": 1.02097011, "epoch": 0.31154932964588467, "flos": 27379157126400.0, "grad_norm": 2.085303928155695, "language_loss": 0.75947392, "learning_rate": 3.224668940207089e-06, "loss": 0.78161538, "num_input_tokens_seen": 55713785, "step": 2591, "time_per_iteration": 2.8025834560394287 }, { "auxiliary_loss_clip": 0.01182172, "auxiliary_loss_mlp": 0.01035965, "balance_loss_clip": 0.89801663, "balance_loss_mlp": 1.02677441, "epoch": 0.3116695725365238, "flos": 26541864120960.0, "grad_norm": 1.7438086038554792, "language_loss": 0.86801016, "learning_rate": 3.2240529932655828e-06, "loss": 0.8901915, "num_input_tokens_seen": 55733050, "step": 2592, "time_per_iteration": 3.8267288208007812 }, { "auxiliary_loss_clip": 0.01196909, "auxiliary_loss_mlp": 0.01025706, "balance_loss_clip": 0.98223495, "balance_loss_mlp": 1.01626444, "epoch": 0.3117898154271629, "flos": 21177600134400.0, "grad_norm": 3.0535873455094658, "language_loss": 0.88661242, "learning_rate": 3.223436860636211e-06, "loss": 0.90883851, "num_input_tokens_seen": 55748685, "step": 2593, "time_per_iteration": 2.719700574874878 }, { "auxiliary_loss_clip": 0.01200255, "auxiliary_loss_mlp": 0.01031859, "balance_loss_clip": 1.05884004, "balance_loss_mlp": 1.02332926, "epoch": 0.31191005831780194, "flos": 27272430840960.0, "grad_norm": 1.5601685720434828, "language_loss": 0.74347872, "learning_rate": 3.2228205424124403e-06, "loss": 0.76579988, "num_input_tokens_seen": 55771840, "step": 2594, "time_per_iteration": 2.6738650798797607 }, { "auxiliary_loss_clip": 0.0118335, "auxiliary_loss_mlp": 0.01033856, "balance_loss_clip": 0.97913641, "balance_loss_mlp": 1.02535069, "epoch": 0.31203030120844105, "flos": 12963501043200.0, "grad_norm": 2.1489554271404656, "language_loss": 0.74607283, "learning_rate": 3.222204038687765e-06, "loss": 0.76824492, "num_input_tokens_seen": 55784975, "step": 2595, "time_per_iteration": 2.6811118125915527 }, { "auxiliary_loss_clip": 0.01194281, "auxiliary_loss_mlp": 0.01028857, "balance_loss_clip": 1.01853991, "balance_loss_mlp": 1.02049446, "epoch": 0.31215054409908016, "flos": 27562014288000.0, "grad_norm": 1.7961441837694356, "language_loss": 0.87870371, "learning_rate": 3.221587349555709e-06, "loss": 0.90093505, "num_input_tokens_seen": 55805235, "step": 2596, "time_per_iteration": 3.6263251304626465 }, { "auxiliary_loss_clip": 0.01199843, "auxiliary_loss_mlp": 0.01030345, "balance_loss_clip": 0.98051113, "balance_loss_mlp": 1.02172613, "epoch": 0.3122707869897192, "flos": 21506326427520.0, "grad_norm": 1.5961154211126363, "language_loss": 0.69360596, "learning_rate": 3.2209704751098236e-06, "loss": 0.71590781, "num_input_tokens_seen": 55824265, "step": 2597, "time_per_iteration": 2.737704038619995 }, { "auxiliary_loss_clip": 0.01198841, "auxiliary_loss_mlp": 0.01032997, "balance_loss_clip": 0.9806757, "balance_loss_mlp": 1.02424109, "epoch": 0.31239102988035833, "flos": 15187017674880.0, "grad_norm": 1.9920860536207576, "language_loss": 0.82864308, "learning_rate": 3.2203534154436875e-06, "loss": 0.85096145, "num_input_tokens_seen": 55838620, "step": 2598, "time_per_iteration": 2.6216580867767334 }, { "auxiliary_loss_clip": 0.011877, "auxiliary_loss_mlp": 0.01033426, "balance_loss_clip": 0.86230457, "balance_loss_mlp": 1.02458072, "epoch": 0.31251127277099744, "flos": 22053712763520.0, "grad_norm": 1.7054214867910136, "language_loss": 0.75522828, "learning_rate": 3.2197361706509084e-06, "loss": 0.77743959, "num_input_tokens_seen": 55859375, "step": 2599, "time_per_iteration": 2.831193447113037 }, { "auxiliary_loss_clip": 0.01203877, "auxiliary_loss_mlp": 0.01041217, "balance_loss_clip": 1.058568, "balance_loss_mlp": 1.03237736, "epoch": 0.3126315156616365, "flos": 15193984913280.0, "grad_norm": 2.6741834542924985, "language_loss": 0.8357023, "learning_rate": 3.2191187408251228e-06, "loss": 0.85815328, "num_input_tokens_seen": 55876535, "step": 2600, "time_per_iteration": 2.6435112953186035 }, { "auxiliary_loss_clip": 0.01202779, "auxiliary_loss_mlp": 0.01033274, "balance_loss_clip": 1.01740968, "balance_loss_mlp": 1.0241251, "epoch": 0.3127517585522756, "flos": 18145338831360.0, "grad_norm": 2.258516099628326, "language_loss": 0.7897504, "learning_rate": 3.218501126059993e-06, "loss": 0.8121109, "num_input_tokens_seen": 55891930, "step": 2601, "time_per_iteration": 2.6459550857543945 }, { "auxiliary_loss_clip": 0.01197522, "auxiliary_loss_mlp": 0.01034534, "balance_loss_clip": 1.01640892, "balance_loss_mlp": 1.02506256, "epoch": 0.31287200144291466, "flos": 21908633731200.0, "grad_norm": 1.803180139150711, "language_loss": 0.81424177, "learning_rate": 3.2178833264492116e-06, "loss": 0.83656234, "num_input_tokens_seen": 55910635, "step": 2602, "time_per_iteration": 2.7046353816986084 }, { "auxiliary_loss_clip": 0.01203886, "auxiliary_loss_mlp": 0.01030633, "balance_loss_clip": 1.01892257, "balance_loss_mlp": 1.021806, "epoch": 0.31299224433355377, "flos": 29896997800320.0, "grad_norm": 1.9154680783688196, "language_loss": 0.75901961, "learning_rate": 3.217265342086498e-06, "loss": 0.7813648, "num_input_tokens_seen": 55931125, "step": 2603, "time_per_iteration": 2.6707499027252197 }, { "auxiliary_loss_clip": 0.01200647, "auxiliary_loss_mlp": 0.0112604, "balance_loss_clip": 0.94354659, "balance_loss_mlp": 0.0, "epoch": 0.3131124872241929, "flos": 11655886331520.0, "grad_norm": 2.9851234007070673, "language_loss": 0.73542154, "learning_rate": 3.216647173065599e-06, "loss": 0.75868845, "num_input_tokens_seen": 55946590, "step": 2604, "time_per_iteration": 2.7341458797454834 }, { "auxiliary_loss_clip": 0.011978, "auxiliary_loss_mlp": 0.01032905, "balance_loss_clip": 0.98421973, "balance_loss_mlp": 1.02418458, "epoch": 0.31323273011483194, "flos": 49848785470080.0, "grad_norm": 1.83376112509116, "language_loss": 0.7352066, "learning_rate": 3.216028819480292e-06, "loss": 0.75751364, "num_input_tokens_seen": 55967930, "step": 2605, "time_per_iteration": 2.9351377487182617 }, { "auxiliary_loss_clip": 0.01182253, "auxiliary_loss_mlp": 0.01028726, "balance_loss_clip": 0.97873253, "balance_loss_mlp": 1.02024412, "epoch": 0.31335297300547105, "flos": 22601278667520.0, "grad_norm": 2.1370661553326564, "language_loss": 0.75424653, "learning_rate": 3.2154102814243793e-06, "loss": 0.77635634, "num_input_tokens_seen": 55987070, "step": 2606, "time_per_iteration": 2.7488327026367188 }, { "auxiliary_loss_clip": 0.01200903, "auxiliary_loss_mlp": 0.01041188, "balance_loss_clip": 0.94368315, "balance_loss_mlp": 1.03268814, "epoch": 0.31347321589611016, "flos": 34710858708480.0, "grad_norm": 2.7376007849210966, "language_loss": 0.66843712, "learning_rate": 3.2147915589916937e-06, "loss": 0.69085801, "num_input_tokens_seen": 56008630, "step": 2607, "time_per_iteration": 2.8428244590759277 }, { "auxiliary_loss_clip": 0.01188195, "auxiliary_loss_mlp": 0.01034039, "balance_loss_clip": 0.97796714, "balance_loss_mlp": 1.02503252, "epoch": 0.3135934587867492, "flos": 19755789108480.0, "grad_norm": 1.9024505824369993, "language_loss": 0.82565558, "learning_rate": 3.2141726522760938e-06, "loss": 0.84787792, "num_input_tokens_seen": 56026690, "step": 2608, "time_per_iteration": 2.67341685295105 }, { "auxiliary_loss_clip": 0.01089797, "auxiliary_loss_mlp": 0.01003818, "balance_loss_clip": 0.98371744, "balance_loss_mlp": 1.00113618, "epoch": 0.3137137016773883, "flos": 65815535583360.0, "grad_norm": 0.7224573323840551, "language_loss": 0.52667248, "learning_rate": 3.213553561371469e-06, "loss": 0.54760867, "num_input_tokens_seen": 56090425, "step": 2609, "time_per_iteration": 3.3452513217926025 }, { "auxiliary_loss_clip": 0.01192323, "auxiliary_loss_mlp": 0.01032658, "balance_loss_clip": 0.90482211, "balance_loss_mlp": 1.02414596, "epoch": 0.31383394456802743, "flos": 16252739222400.0, "grad_norm": 2.189950922837858, "language_loss": 0.95580459, "learning_rate": 3.212934286371733e-06, "loss": 0.97805434, "num_input_tokens_seen": 56107135, "step": 2610, "time_per_iteration": 2.7326838970184326 }, { "auxiliary_loss_clip": 0.01195395, "auxiliary_loss_mlp": 0.01046136, "balance_loss_clip": 1.01813686, "balance_loss_mlp": 1.03721368, "epoch": 0.3139541874586665, "flos": 38795517613440.0, "grad_norm": 1.9451955803524694, "language_loss": 0.83506638, "learning_rate": 3.2123148273708304e-06, "loss": 0.85748166, "num_input_tokens_seen": 56127325, "step": 2611, "time_per_iteration": 2.8346362113952637 }, { "auxiliary_loss_clip": 0.01199226, "auxiliary_loss_mlp": 0.01038015, "balance_loss_clip": 1.0588541, "balance_loss_mlp": 1.02949178, "epoch": 0.3140744303493056, "flos": 25046328430080.0, "grad_norm": 1.741869792010861, "language_loss": 0.76684201, "learning_rate": 3.211695184462733e-06, "loss": 0.78921437, "num_input_tokens_seen": 56148500, "step": 2612, "time_per_iteration": 2.6264421939849854 }, { "auxiliary_loss_clip": 0.01094974, "auxiliary_loss_mlp": 0.01001526, "balance_loss_clip": 0.90775639, "balance_loss_mlp": 0.99892753, "epoch": 0.3141946732399447, "flos": 72504254782080.0, "grad_norm": 0.891958062546224, "language_loss": 0.60496575, "learning_rate": 3.2110753577414383e-06, "loss": 0.62593073, "num_input_tokens_seen": 56210080, "step": 2613, "time_per_iteration": 3.271017074584961 }, { "auxiliary_loss_clip": 0.01199217, "auxiliary_loss_mlp": 0.01032214, "balance_loss_clip": 0.97976428, "balance_loss_mlp": 1.0226717, "epoch": 0.31431491613058377, "flos": 19239788280960.0, "grad_norm": 1.861240459605536, "language_loss": 0.78652728, "learning_rate": 3.2104553473009757e-06, "loss": 0.80884165, "num_input_tokens_seen": 56228200, "step": 2614, "time_per_iteration": 2.7060179710388184 }, { "auxiliary_loss_clip": 0.01189742, "auxiliary_loss_mlp": 0.01033888, "balance_loss_clip": 0.90245372, "balance_loss_mlp": 1.02531111, "epoch": 0.3144351590212229, "flos": 36210596290560.0, "grad_norm": 2.387189037098938, "language_loss": 0.67854887, "learning_rate": 3.209835153235399e-06, "loss": 0.70078516, "num_input_tokens_seen": 56249755, "step": 2615, "time_per_iteration": 2.807129383087158 }, { "auxiliary_loss_clip": 0.01183699, "auxiliary_loss_mlp": 0.0103358, "balance_loss_clip": 0.94019449, "balance_loss_mlp": 1.02534902, "epoch": 0.314555401911862, "flos": 18551740285440.0, "grad_norm": 1.686553075637883, "language_loss": 0.67899787, "learning_rate": 3.2092147756387916e-06, "loss": 0.70117068, "num_input_tokens_seen": 56270080, "step": 2616, "time_per_iteration": 3.737443447113037 }, { "auxiliary_loss_clip": 0.01184917, "auxiliary_loss_mlp": 0.01033913, "balance_loss_clip": 0.97642338, "balance_loss_mlp": 1.0249598, "epoch": 0.31467564480250104, "flos": 16362877299840.0, "grad_norm": 1.799030454886866, "language_loss": 0.83851135, "learning_rate": 3.208594214605264e-06, "loss": 0.86069965, "num_input_tokens_seen": 56288625, "step": 2617, "time_per_iteration": 2.635493516921997 }, { "auxiliary_loss_clip": 0.011839, "auxiliary_loss_mlp": 0.01037214, "balance_loss_clip": 0.97909337, "balance_loss_mlp": 1.02867293, "epoch": 0.31479588769314015, "flos": 21652375127040.0, "grad_norm": 1.8665338039362411, "language_loss": 0.7673682, "learning_rate": 3.2079734702289553e-06, "loss": 0.78957939, "num_input_tokens_seen": 56307520, "step": 2618, "time_per_iteration": 3.5476083755493164 }, { "auxiliary_loss_clip": 0.01084369, "auxiliary_loss_mlp": 0.01119858, "balance_loss_clip": 0.98066092, "balance_loss_mlp": 0.0, "epoch": 0.3149161305837792, "flos": 66051072040320.0, "grad_norm": 0.8095867177678633, "language_loss": 0.60392118, "learning_rate": 3.207352542604031e-06, "loss": 0.62596345, "num_input_tokens_seen": 56369855, "step": 2619, "time_per_iteration": 3.3244404792785645 }, { "auxiliary_loss_clip": 0.01181728, "auxiliary_loss_mlp": 0.01026074, "balance_loss_clip": 0.93915069, "balance_loss_mlp": 1.01790833, "epoch": 0.3150363734744183, "flos": 28987201192320.0, "grad_norm": 1.548201592164931, "language_loss": 0.7836858, "learning_rate": 3.2067314318246864e-06, "loss": 0.80576384, "num_input_tokens_seen": 56390570, "step": 2620, "time_per_iteration": 2.733959197998047 }, { "auxiliary_loss_clip": 0.01194484, "auxiliary_loss_mlp": 0.01036794, "balance_loss_clip": 0.94250405, "balance_loss_mlp": 1.02775145, "epoch": 0.31515661636505743, "flos": 27636600879360.0, "grad_norm": 1.6895790224775529, "language_loss": 0.77563244, "learning_rate": 3.206110137985143e-06, "loss": 0.7979452, "num_input_tokens_seen": 56410775, "step": 2621, "time_per_iteration": 2.80682110786438 }, { "auxiliary_loss_clip": 0.01182208, "auxiliary_loss_mlp": 0.01034192, "balance_loss_clip": 0.94009244, "balance_loss_mlp": 1.02503049, "epoch": 0.3152768592556965, "flos": 24605632465920.0, "grad_norm": 2.2897445874552482, "language_loss": 0.92209852, "learning_rate": 3.2054886611796505e-06, "loss": 0.9442625, "num_input_tokens_seen": 56429770, "step": 2622, "time_per_iteration": 3.6828696727752686 }, { "auxiliary_loss_clip": 0.01086573, "auxiliary_loss_mlp": 0.01002875, "balance_loss_clip": 1.01907933, "balance_loss_mlp": 1.00031173, "epoch": 0.3153971021463356, "flos": 68476908026880.0, "grad_norm": 0.8843023758982012, "language_loss": 0.63540673, "learning_rate": 3.204867001502487e-06, "loss": 0.6563012, "num_input_tokens_seen": 56488425, "step": 2623, "time_per_iteration": 4.117944002151489 }, { "auxiliary_loss_clip": 0.01203542, "auxiliary_loss_mlp": 0.01031749, "balance_loss_clip": 1.06093907, "balance_loss_mlp": 1.02285624, "epoch": 0.3155173450369747, "flos": 25593714766080.0, "grad_norm": 1.7908098076440382, "language_loss": 0.80954689, "learning_rate": 3.2042451590479567e-06, "loss": 0.83189982, "num_input_tokens_seen": 56508940, "step": 2624, "time_per_iteration": 2.7048697471618652 }, { "auxiliary_loss_clip": 0.0119593, "auxiliary_loss_mlp": 0.0103319, "balance_loss_clip": 1.05814433, "balance_loss_mlp": 1.02468479, "epoch": 0.31563758792761376, "flos": 24309333175680.0, "grad_norm": 1.9643649504550773, "language_loss": 0.86760807, "learning_rate": 3.203623133910394e-06, "loss": 0.88989931, "num_input_tokens_seen": 56527245, "step": 2625, "time_per_iteration": 2.5951082706451416 }, { "auxiliary_loss_clip": 0.01195987, "auxiliary_loss_mlp": 0.01025061, "balance_loss_clip": 0.86436981, "balance_loss_mlp": 1.01650798, "epoch": 0.31575783081825287, "flos": 31903865550720.0, "grad_norm": 2.8596151288676133, "language_loss": 0.76918477, "learning_rate": 3.203000926184158e-06, "loss": 0.79139531, "num_input_tokens_seen": 56546170, "step": 2626, "time_per_iteration": 2.8716142177581787 }, { "auxiliary_loss_clip": 0.01197471, "auxiliary_loss_mlp": 0.01042341, "balance_loss_clip": 1.05856359, "balance_loss_mlp": 1.03406143, "epoch": 0.315878073708892, "flos": 30810960385920.0, "grad_norm": 1.5932012514595943, "language_loss": 0.77533913, "learning_rate": 3.202378535963639e-06, "loss": 0.79773724, "num_input_tokens_seen": 56567085, "step": 2627, "time_per_iteration": 2.7141387462615967 }, { "auxiliary_loss_clip": 0.01183605, "auxiliary_loss_mlp": 0.01125702, "balance_loss_clip": 0.97728479, "balance_loss_mlp": 0.0, "epoch": 0.31599831659953104, "flos": 22200264253440.0, "grad_norm": 1.6859781956319075, "language_loss": 0.83870983, "learning_rate": 3.2017559633432516e-06, "loss": 0.86180294, "num_input_tokens_seen": 56586715, "step": 2628, "time_per_iteration": 2.7679264545440674 }, { "auxiliary_loss_clip": 0.01204665, "auxiliary_loss_mlp": 0.0103686, "balance_loss_clip": 0.98017162, "balance_loss_mlp": 1.02802026, "epoch": 0.31611855949017015, "flos": 25593463370880.0, "grad_norm": 1.7542504107278984, "language_loss": 0.66471159, "learning_rate": 3.2011332084174398e-06, "loss": 0.68712682, "num_input_tokens_seen": 56607585, "step": 2629, "time_per_iteration": 2.711876630783081 }, { "auxiliary_loss_clip": 0.01195464, "auxiliary_loss_mlp": 0.0102986, "balance_loss_clip": 1.01944232, "balance_loss_mlp": 1.02079976, "epoch": 0.31623880238080926, "flos": 20594087694720.0, "grad_norm": 1.490388166614282, "language_loss": 0.89256352, "learning_rate": 3.2005102712806756e-06, "loss": 0.91481674, "num_input_tokens_seen": 56626415, "step": 2630, "time_per_iteration": 2.6855461597442627 }, { "auxiliary_loss_clip": 0.01200188, "auxiliary_loss_mlp": 0.01031874, "balance_loss_clip": 1.01879334, "balance_loss_mlp": 1.02296352, "epoch": 0.3163590452714483, "flos": 12784917600000.0, "grad_norm": 1.9615810896359533, "language_loss": 0.73253369, "learning_rate": 3.1998871520274575e-06, "loss": 0.75485432, "num_input_tokens_seen": 56641750, "step": 2631, "time_per_iteration": 2.6582908630371094 }, { "auxiliary_loss_clip": 0.01194359, "auxiliary_loss_mlp": 0.01030018, "balance_loss_clip": 0.97717071, "balance_loss_mlp": 1.02111948, "epoch": 0.3164792881620874, "flos": 23041292273280.0, "grad_norm": 1.6522134433443783, "language_loss": 0.84602296, "learning_rate": 3.199263850752312e-06, "loss": 0.8682667, "num_input_tokens_seen": 56662585, "step": 2632, "time_per_iteration": 2.726346015930176 }, { "auxiliary_loss_clip": 0.01200514, "auxiliary_loss_mlp": 0.01030364, "balance_loss_clip": 1.0202204, "balance_loss_mlp": 1.02130437, "epoch": 0.31659953105272653, "flos": 18296271780480.0, "grad_norm": 2.1450050391690483, "language_loss": 0.84981871, "learning_rate": 3.198640367549795e-06, "loss": 0.87212753, "num_input_tokens_seen": 56681480, "step": 2633, "time_per_iteration": 2.703623056411743 }, { "auxiliary_loss_clip": 0.0119907, "auxiliary_loss_mlp": 0.01125248, "balance_loss_clip": 1.02006364, "balance_loss_mlp": 0.0, "epoch": 0.3167197739433656, "flos": 25703421880320.0, "grad_norm": 1.686295672394898, "language_loss": 0.85840845, "learning_rate": 3.198016702514487e-06, "loss": 0.88165164, "num_input_tokens_seen": 56701760, "step": 2634, "time_per_iteration": 2.717508316040039 }, { "auxiliary_loss_clip": 0.01197215, "auxiliary_loss_mlp": 0.01029499, "balance_loss_clip": 1.05764914, "balance_loss_mlp": 1.02085578, "epoch": 0.3168400168340047, "flos": 23546016230400.0, "grad_norm": 1.683068503120339, "language_loss": 0.84577107, "learning_rate": 3.1973928557409972e-06, "loss": 0.86803818, "num_input_tokens_seen": 56719800, "step": 2635, "time_per_iteration": 2.663421154022217 }, { "auxiliary_loss_clip": 0.01198528, "auxiliary_loss_mlp": 0.01029334, "balance_loss_clip": 1.06016517, "balance_loss_mlp": 1.02110887, "epoch": 0.31696025972464376, "flos": 28366449327360.0, "grad_norm": 1.9635200173568403, "language_loss": 0.71729219, "learning_rate": 3.1967688273239636e-06, "loss": 0.7395708, "num_input_tokens_seen": 56739605, "step": 2636, "time_per_iteration": 2.6736199855804443 }, { "auxiliary_loss_clip": 0.01192207, "auxiliary_loss_mlp": 0.01027002, "balance_loss_clip": 0.94155419, "balance_loss_mlp": 1.01848435, "epoch": 0.31708050261528287, "flos": 16399111144320.0, "grad_norm": 1.7833376402516696, "language_loss": 0.82270896, "learning_rate": 3.1961446173580503e-06, "loss": 0.84490103, "num_input_tokens_seen": 56756545, "step": 2637, "time_per_iteration": 2.7116336822509766 }, { "auxiliary_loss_clip": 0.01195154, "auxiliary_loss_mlp": 0.0103338, "balance_loss_clip": 0.98239887, "balance_loss_mlp": 1.0247438, "epoch": 0.317200745505922, "flos": 26212347728640.0, "grad_norm": 1.5954954704090762, "language_loss": 0.77362001, "learning_rate": 3.1955202259379502e-06, "loss": 0.79590535, "num_input_tokens_seen": 56778275, "step": 2638, "time_per_iteration": 2.714571475982666 }, { "auxiliary_loss_clip": 0.01191054, "auxiliary_loss_mlp": 0.01036177, "balance_loss_clip": 1.01612616, "balance_loss_mlp": 1.02761197, "epoch": 0.31732098839656103, "flos": 31350876693120.0, "grad_norm": 2.426899939526433, "language_loss": 0.83083409, "learning_rate": 3.194895653158381e-06, "loss": 0.85310644, "num_input_tokens_seen": 56797215, "step": 2639, "time_per_iteration": 2.7818562984466553 }, { "auxiliary_loss_clip": 0.01085529, "auxiliary_loss_mlp": 0.01008157, "balance_loss_clip": 1.01786137, "balance_loss_mlp": 1.00559366, "epoch": 0.31744123128720014, "flos": 58989024835200.0, "grad_norm": 0.7833184841010761, "language_loss": 0.55569315, "learning_rate": 3.194270899114093e-06, "loss": 0.57663, "num_input_tokens_seen": 56863010, "step": 2640, "time_per_iteration": 3.2704920768737793 }, { "auxiliary_loss_clip": 0.012051, "auxiliary_loss_mlp": 0.01028959, "balance_loss_clip": 1.02115989, "balance_loss_mlp": 1.02041197, "epoch": 0.31756147417783925, "flos": 17417573372160.0, "grad_norm": 2.013331526730329, "language_loss": 0.82167852, "learning_rate": 3.193645963899858e-06, "loss": 0.84401911, "num_input_tokens_seen": 56880625, "step": 2641, "time_per_iteration": 2.664198160171509 }, { "auxiliary_loss_clip": 0.01187811, "auxiliary_loss_mlp": 0.01024483, "balance_loss_clip": 0.97854173, "balance_loss_mlp": 1.01604903, "epoch": 0.3176817170684783, "flos": 25481673267840.0, "grad_norm": 1.7642122686250585, "language_loss": 0.84019005, "learning_rate": 3.193020847610479e-06, "loss": 0.86231291, "num_input_tokens_seen": 56900945, "step": 2642, "time_per_iteration": 3.778594493865967 }, { "auxiliary_loss_clip": 0.01190414, "auxiliary_loss_mlp": 0.01031604, "balance_loss_clip": 0.9831562, "balance_loss_mlp": 1.02260983, "epoch": 0.3178019599591174, "flos": 24972603765120.0, "grad_norm": 3.943051366435319, "language_loss": 0.71078408, "learning_rate": 3.192395550340787e-06, "loss": 0.73300421, "num_input_tokens_seen": 56918895, "step": 2643, "time_per_iteration": 2.76035737991333 }, { "auxiliary_loss_clip": 0.0119433, "auxiliary_loss_mlp": 0.01032619, "balance_loss_clip": 1.02058744, "balance_loss_mlp": 1.02397656, "epoch": 0.31792220284975653, "flos": 12422220019200.0, "grad_norm": 2.2799740165708924, "language_loss": 0.76819414, "learning_rate": 3.191770072185638e-06, "loss": 0.79046363, "num_input_tokens_seen": 56935890, "step": 2644, "time_per_iteration": 2.6408538818359375 }, { "auxiliary_loss_clip": 0.011958, "auxiliary_loss_mlp": 0.01029082, "balance_loss_clip": 1.02010202, "balance_loss_mlp": 1.0206418, "epoch": 0.3180424457403956, "flos": 15485759089920.0, "grad_norm": 3.4178885190595163, "language_loss": 0.72366989, "learning_rate": 3.191144413239916e-06, "loss": 0.74591869, "num_input_tokens_seen": 56952460, "step": 2645, "time_per_iteration": 3.5793097019195557 }, { "auxiliary_loss_clip": 0.01196801, "auxiliary_loss_mlp": 0.01029633, "balance_loss_clip": 0.98136961, "balance_loss_mlp": 1.02070987, "epoch": 0.3181626886310347, "flos": 26174964648960.0, "grad_norm": 2.0400880962700136, "language_loss": 0.88096577, "learning_rate": 3.190518573598534e-06, "loss": 0.90323013, "num_input_tokens_seen": 56969065, "step": 2646, "time_per_iteration": 2.6811673641204834 }, { "auxiliary_loss_clip": 0.01201001, "auxiliary_loss_mlp": 0.01038629, "balance_loss_clip": 0.94131458, "balance_loss_mlp": 1.02942038, "epoch": 0.3182829315216738, "flos": 25483109811840.0, "grad_norm": 1.7311825287224052, "language_loss": 0.77624941, "learning_rate": 3.1898925533564308e-06, "loss": 0.79864573, "num_input_tokens_seen": 56990535, "step": 2647, "time_per_iteration": 2.757467746734619 }, { "auxiliary_loss_clip": 0.01180931, "auxiliary_loss_mlp": 0.01037662, "balance_loss_clip": 0.93826538, "balance_loss_mlp": 1.02866757, "epoch": 0.31840317441231286, "flos": 18113701927680.0, "grad_norm": 2.0994033622986383, "language_loss": 0.64319289, "learning_rate": 3.1892663526085733e-06, "loss": 0.66537881, "num_input_tokens_seen": 57008910, "step": 2648, "time_per_iteration": 4.603882789611816 }, { "auxiliary_loss_clip": 0.01085359, "auxiliary_loss_mlp": 0.01003039, "balance_loss_clip": 1.0183574, "balance_loss_mlp": 1.00055933, "epoch": 0.31852341730295197, "flos": 64741948957440.0, "grad_norm": 0.7516635338997695, "language_loss": 0.56971872, "learning_rate": 3.188639971449956e-06, "loss": 0.5906027, "num_input_tokens_seen": 57074960, "step": 2649, "time_per_iteration": 3.180396556854248 }, { "auxiliary_loss_clip": 0.01203426, "auxiliary_loss_mlp": 0.01030815, "balance_loss_clip": 1.06111193, "balance_loss_mlp": 1.02177262, "epoch": 0.318643660193591, "flos": 20668135582080.0, "grad_norm": 2.153067798562823, "language_loss": 0.72043478, "learning_rate": 3.1880134099756e-06, "loss": 0.74277717, "num_input_tokens_seen": 57094595, "step": 2650, "time_per_iteration": 2.7090694904327393 }, { "auxiliary_loss_clip": 0.01194037, "auxiliary_loss_mlp": 0.01029987, "balance_loss_clip": 1.01770914, "balance_loss_mlp": 1.02160597, "epoch": 0.31876390308423014, "flos": 26943345411840.0, "grad_norm": 1.7543987475681038, "language_loss": 0.69625747, "learning_rate": 3.1873866682805535e-06, "loss": 0.71849769, "num_input_tokens_seen": 57115290, "step": 2651, "time_per_iteration": 2.6820385456085205 }, { "auxiliary_loss_clip": 0.01202776, "auxiliary_loss_mlp": 0.01031171, "balance_loss_clip": 0.98441553, "balance_loss_mlp": 1.02291, "epoch": 0.31888414597486925, "flos": 18041916597120.0, "grad_norm": 1.7048653599461716, "language_loss": 0.88473892, "learning_rate": 3.186759746459894e-06, "loss": 0.90707839, "num_input_tokens_seen": 57134400, "step": 2652, "time_per_iteration": 2.740013599395752 }, { "auxiliary_loss_clip": 0.01194058, "auxiliary_loss_mlp": 0.01028834, "balance_loss_clip": 0.98004907, "balance_loss_mlp": 1.02044749, "epoch": 0.3190043888655083, "flos": 25149319701120.0, "grad_norm": 1.7195432639260475, "language_loss": 0.79806775, "learning_rate": 3.1861326446087246e-06, "loss": 0.82029665, "num_input_tokens_seen": 57153140, "step": 2653, "time_per_iteration": 2.7858870029449463 }, { "auxiliary_loss_clip": 0.01203258, "auxiliary_loss_mlp": 0.01032711, "balance_loss_clip": 1.02197278, "balance_loss_mlp": 1.02369845, "epoch": 0.3191246317561474, "flos": 22053892331520.0, "grad_norm": 2.154811092128439, "language_loss": 0.71981919, "learning_rate": 3.1855053628221763e-06, "loss": 0.74217892, "num_input_tokens_seen": 57172395, "step": 2654, "time_per_iteration": 2.6849985122680664 }, { "auxiliary_loss_clip": 0.01185756, "auxiliary_loss_mlp": 0.01033808, "balance_loss_clip": 0.93976641, "balance_loss_mlp": 1.02535021, "epoch": 0.3192448746467865, "flos": 14901815687040.0, "grad_norm": 2.8766111222829664, "language_loss": 0.8975966, "learning_rate": 3.184877901195407e-06, "loss": 0.91979218, "num_input_tokens_seen": 57189090, "step": 2655, "time_per_iteration": 2.7670774459838867 }, { "auxiliary_loss_clip": 0.01090693, "auxiliary_loss_mlp": 0.01007985, "balance_loss_clip": 0.9475863, "balance_loss_mlp": 1.00512362, "epoch": 0.3193651175374256, "flos": 67234832657280.0, "grad_norm": 0.7977846517182942, "language_loss": 0.62833011, "learning_rate": 3.184250259823602e-06, "loss": 0.64931691, "num_input_tokens_seen": 57251620, "step": 2656, "time_per_iteration": 3.3349130153656006 }, { "auxiliary_loss_clip": 0.01195776, "auxiliary_loss_mlp": 0.01032913, "balance_loss_clip": 0.94180918, "balance_loss_mlp": 1.02440178, "epoch": 0.3194853604280647, "flos": 12233077977600.0, "grad_norm": 2.2351878845169875, "language_loss": 0.81562185, "learning_rate": 3.183622438801974e-06, "loss": 0.83790869, "num_input_tokens_seen": 57266910, "step": 2657, "time_per_iteration": 2.7295897006988525 }, { "auxiliary_loss_clip": 0.01201361, "auxiliary_loss_mlp": 0.01042277, "balance_loss_clip": 1.06055629, "balance_loss_mlp": 1.03391421, "epoch": 0.3196056033187038, "flos": 14939917038720.0, "grad_norm": 1.7122434061298626, "language_loss": 0.75414789, "learning_rate": 3.1829944382257637e-06, "loss": 0.77658427, "num_input_tokens_seen": 57285040, "step": 2658, "time_per_iteration": 2.593919515609741 }, { "auxiliary_loss_clip": 0.01194825, "auxiliary_loss_mlp": 0.01025402, "balance_loss_clip": 1.02019, "balance_loss_mlp": 1.01739669, "epoch": 0.31972584620934286, "flos": 23768878164480.0, "grad_norm": 2.4225600617529346, "language_loss": 0.81546628, "learning_rate": 3.1823662581902373e-06, "loss": 0.83766854, "num_input_tokens_seen": 57302725, "step": 2659, "time_per_iteration": 2.7131662368774414 }, { "auxiliary_loss_clip": 0.01177466, "auxiliary_loss_mlp": 0.01038775, "balance_loss_clip": 0.93740714, "balance_loss_mlp": 1.02948308, "epoch": 0.31984608909998197, "flos": 21251540280960.0, "grad_norm": 3.2845327131746864, "language_loss": 0.74541634, "learning_rate": 3.1817378987906896e-06, "loss": 0.76757872, "num_input_tokens_seen": 57322230, "step": 2660, "time_per_iteration": 2.706059455871582 }, { "auxiliary_loss_clip": 0.01189509, "auxiliary_loss_mlp": 0.01031076, "balance_loss_clip": 0.90619767, "balance_loss_mlp": 1.02283323, "epoch": 0.3199663319906211, "flos": 18296235866880.0, "grad_norm": 1.9173520242765674, "language_loss": 0.79864359, "learning_rate": 3.181109360122442e-06, "loss": 0.82084948, "num_input_tokens_seen": 57339820, "step": 2661, "time_per_iteration": 2.7146835327148438 }, { "auxiliary_loss_clip": 0.01191554, "auxiliary_loss_mlp": 0.01033965, "balance_loss_clip": 0.94063061, "balance_loss_mlp": 1.02455318, "epoch": 0.32008657488126013, "flos": 18733627779840.0, "grad_norm": 1.9788768879284193, "language_loss": 0.77950794, "learning_rate": 3.1804806422808445e-06, "loss": 0.80176318, "num_input_tokens_seen": 57356955, "step": 2662, "time_per_iteration": 2.7102324962615967 }, { "auxiliary_loss_clip": 0.01185741, "auxiliary_loss_mlp": 0.01027665, "balance_loss_clip": 0.97835815, "balance_loss_mlp": 1.01902246, "epoch": 0.32020681777189924, "flos": 20595344670720.0, "grad_norm": 1.9890600127717268, "language_loss": 0.73007387, "learning_rate": 3.1798517453612714e-06, "loss": 0.75220799, "num_input_tokens_seen": 57376760, "step": 2663, "time_per_iteration": 2.68745756149292 }, { "auxiliary_loss_clip": 0.01194624, "auxiliary_loss_mlp": 0.01023792, "balance_loss_clip": 1.01984215, "balance_loss_mlp": 1.01538765, "epoch": 0.32032706066253835, "flos": 35261692750080.0, "grad_norm": 1.664397843604086, "language_loss": 0.75198913, "learning_rate": 3.1792226694591265e-06, "loss": 0.77417332, "num_input_tokens_seen": 57398145, "step": 2664, "time_per_iteration": 2.838449478149414 }, { "auxiliary_loss_clip": 0.01193589, "auxiliary_loss_mlp": 0.01040763, "balance_loss_clip": 0.94269985, "balance_loss_mlp": 1.03244185, "epoch": 0.3204473035531774, "flos": 15304230731520.0, "grad_norm": 1.9097852825658226, "language_loss": 0.80670828, "learning_rate": 3.178593414669841e-06, "loss": 0.82905179, "num_input_tokens_seen": 57416730, "step": 2665, "time_per_iteration": 2.7118124961853027 }, { "auxiliary_loss_clip": 0.01200867, "auxiliary_loss_mlp": 0.01031797, "balance_loss_clip": 1.02104318, "balance_loss_mlp": 1.02227807, "epoch": 0.3205675464438165, "flos": 24462564595200.0, "grad_norm": 3.760761831012303, "language_loss": 0.70347941, "learning_rate": 3.1779639810888707e-06, "loss": 0.72580606, "num_input_tokens_seen": 57436325, "step": 2666, "time_per_iteration": 2.6654884815216064 }, { "auxiliary_loss_clip": 0.01195786, "auxiliary_loss_mlp": 0.01031125, "balance_loss_clip": 1.02146602, "balance_loss_mlp": 1.02267933, "epoch": 0.3206877893344556, "flos": 22456235548800.0, "grad_norm": 1.6671979682321747, "language_loss": 0.75668919, "learning_rate": 3.1773343688117013e-06, "loss": 0.77895832, "num_input_tokens_seen": 57457235, "step": 2667, "time_per_iteration": 2.6455044746398926 }, { "auxiliary_loss_clip": 0.011972, "auxiliary_loss_mlp": 0.01124841, "balance_loss_clip": 0.97926366, "balance_loss_mlp": 0.0, "epoch": 0.3208080322250947, "flos": 20412236113920.0, "grad_norm": 1.9846249834741, "language_loss": 0.84146613, "learning_rate": 3.1767045779338445e-06, "loss": 0.86468649, "num_input_tokens_seen": 57474895, "step": 2668, "time_per_iteration": 3.7111761569976807 }, { "auxiliary_loss_clip": 0.01192127, "auxiliary_loss_mlp": 0.01030708, "balance_loss_clip": 1.01549208, "balance_loss_mlp": 1.02270365, "epoch": 0.3209282751157338, "flos": 21762118154880.0, "grad_norm": 2.5226064444318586, "language_loss": 0.91256666, "learning_rate": 3.176074608550839e-06, "loss": 0.93479502, "num_input_tokens_seen": 57490715, "step": 2669, "time_per_iteration": 2.6396546363830566 }, { "auxiliary_loss_clip": 0.01186882, "auxiliary_loss_mlp": 0.01031242, "balance_loss_clip": 0.86393988, "balance_loss_mlp": 1.02305269, "epoch": 0.32104851800637285, "flos": 22055041566720.0, "grad_norm": 2.235544131876833, "language_loss": 0.82686931, "learning_rate": 3.17544446075825e-06, "loss": 0.84905052, "num_input_tokens_seen": 57509880, "step": 2670, "time_per_iteration": 2.782479763031006 }, { "auxiliary_loss_clip": 0.01200883, "auxiliary_loss_mlp": 0.01037241, "balance_loss_clip": 0.98083717, "balance_loss_mlp": 1.02917087, "epoch": 0.32116876089701196, "flos": 37012301896320.0, "grad_norm": 1.6337024468922967, "language_loss": 0.70683193, "learning_rate": 3.174814134651671e-06, "loss": 0.72921318, "num_input_tokens_seen": 57532430, "step": 2671, "time_per_iteration": 3.798234462738037 }, { "auxiliary_loss_clip": 0.01194434, "auxiliary_loss_mlp": 0.01034568, "balance_loss_clip": 1.05738473, "balance_loss_mlp": 1.02660525, "epoch": 0.3212890037876511, "flos": 21979233912960.0, "grad_norm": 1.7373863016908948, "language_loss": 0.80352914, "learning_rate": 3.1741836303267215e-06, "loss": 0.82581925, "num_input_tokens_seen": 57551965, "step": 2672, "time_per_iteration": 2.5691184997558594 }, { "auxiliary_loss_clip": 0.01196997, "auxiliary_loss_mlp": 0.01035981, "balance_loss_clip": 1.05933797, "balance_loss_mlp": 1.02797627, "epoch": 0.32140924667829013, "flos": 10342345875840.0, "grad_norm": 1.8396258265062924, "language_loss": 0.75437248, "learning_rate": 3.1735529478790496e-06, "loss": 0.77670228, "num_input_tokens_seen": 57569955, "step": 2673, "time_per_iteration": 2.634591579437256 }, { "auxiliary_loss_clip": 0.01197101, "auxiliary_loss_mlp": 0.01030832, "balance_loss_clip": 1.0190649, "balance_loss_mlp": 1.02238619, "epoch": 0.32152948956892924, "flos": 50798910072960.0, "grad_norm": 1.7809910870335435, "language_loss": 0.7972883, "learning_rate": 3.172922087404328e-06, "loss": 0.81956756, "num_input_tokens_seen": 57592215, "step": 2674, "time_per_iteration": 4.747631311416626 }, { "auxiliary_loss_clip": 0.01086249, "auxiliary_loss_mlp": 0.01011942, "balance_loss_clip": 1.01878035, "balance_loss_mlp": 1.00930774, "epoch": 0.32164973245956835, "flos": 63863250549120.0, "grad_norm": 0.7748990791787259, "language_loss": 0.55257213, "learning_rate": 3.1722910489982586e-06, "loss": 0.57355404, "num_input_tokens_seen": 57652575, "step": 2675, "time_per_iteration": 3.3284995555877686 }, { "auxiliary_loss_clip": 0.01189919, "auxiliary_loss_mlp": 0.01025028, "balance_loss_clip": 0.97952664, "balance_loss_mlp": 1.01686192, "epoch": 0.3217699753502074, "flos": 23513948363520.0, "grad_norm": 1.5850539553902196, "language_loss": 0.79900289, "learning_rate": 3.1716598327565694e-06, "loss": 0.82115239, "num_input_tokens_seen": 57672215, "step": 2676, "time_per_iteration": 2.765580654144287 }, { "auxiliary_loss_clip": 0.01196549, "auxiliary_loss_mlp": 0.0103023, "balance_loss_clip": 1.05725384, "balance_loss_mlp": 1.02203393, "epoch": 0.3218902182408465, "flos": 19062533640960.0, "grad_norm": 1.5364599936901047, "language_loss": 0.84188241, "learning_rate": 3.171028438775015e-06, "loss": 0.86415017, "num_input_tokens_seen": 57691410, "step": 2677, "time_per_iteration": 2.710705280303955 }, { "auxiliary_loss_clip": 0.01196904, "auxiliary_loss_mlp": 0.01031941, "balance_loss_clip": 1.05700362, "balance_loss_mlp": 1.02323866, "epoch": 0.3220104611314856, "flos": 20375571306240.0, "grad_norm": 1.870088524686712, "language_loss": 0.84383696, "learning_rate": 3.170396867149377e-06, "loss": 0.86612546, "num_input_tokens_seen": 57709415, "step": 2678, "time_per_iteration": 2.67006516456604 }, { "auxiliary_loss_clip": 0.01179412, "auxiliary_loss_mlp": 0.01029776, "balance_loss_clip": 0.90056634, "balance_loss_mlp": 1.02109718, "epoch": 0.3221307040221247, "flos": 20117014231680.0, "grad_norm": 1.739775049660078, "language_loss": 0.86481071, "learning_rate": 3.1697651179754653e-06, "loss": 0.88690263, "num_input_tokens_seen": 57728075, "step": 2679, "time_per_iteration": 2.8048036098480225 }, { "auxiliary_loss_clip": 0.01193325, "auxiliary_loss_mlp": 0.01027454, "balance_loss_clip": 0.94338584, "balance_loss_mlp": 1.01928854, "epoch": 0.3222509469127638, "flos": 23987789602560.0, "grad_norm": 1.96339292410903, "language_loss": 0.72942346, "learning_rate": 3.1691331913491153e-06, "loss": 0.75163126, "num_input_tokens_seen": 57750645, "step": 2680, "time_per_iteration": 2.702547550201416 }, { "auxiliary_loss_clip": 0.01199214, "auxiliary_loss_mlp": 0.01030583, "balance_loss_clip": 1.05777681, "balance_loss_mlp": 1.0220958, "epoch": 0.32237118980340285, "flos": 17675735397120.0, "grad_norm": 1.8516342711629137, "language_loss": 0.84575307, "learning_rate": 3.1685010873661898e-06, "loss": 0.86805105, "num_input_tokens_seen": 57769820, "step": 2681, "time_per_iteration": 2.6240601539611816 }, { "auxiliary_loss_clip": 0.01191483, "auxiliary_loss_mlp": 0.01035816, "balance_loss_clip": 1.01771986, "balance_loss_mlp": 1.02668452, "epoch": 0.32249143269404196, "flos": 23147982645120.0, "grad_norm": 1.8910099001854472, "language_loss": 0.79742664, "learning_rate": 3.167868806122578e-06, "loss": 0.81969965, "num_input_tokens_seen": 57788870, "step": 2682, "time_per_iteration": 2.645326852798462 }, { "auxiliary_loss_clip": 0.01199631, "auxiliary_loss_mlp": 0.0104089, "balance_loss_clip": 0.98187178, "balance_loss_mlp": 1.03233659, "epoch": 0.32261167558468107, "flos": 24422308427520.0, "grad_norm": 5.413212469594398, "language_loss": 0.65971625, "learning_rate": 3.1672363477141968e-06, "loss": 0.6821214, "num_input_tokens_seen": 57808165, "step": 2683, "time_per_iteration": 2.820629835128784 }, { "auxiliary_loss_clip": 0.01198407, "auxiliary_loss_mlp": 0.01030434, "balance_loss_clip": 0.9778353, "balance_loss_mlp": 1.0213387, "epoch": 0.3227319184753201, "flos": 30367175852160.0, "grad_norm": 1.9563002142953658, "language_loss": 0.84830201, "learning_rate": 3.1666037122369903e-06, "loss": 0.87059033, "num_input_tokens_seen": 57828825, "step": 2684, "time_per_iteration": 2.7471227645874023 }, { "auxiliary_loss_clip": 0.01190666, "auxiliary_loss_mlp": 0.0102678, "balance_loss_clip": 1.01504803, "balance_loss_mlp": 1.01783919, "epoch": 0.32285216136595923, "flos": 16946174257920.0, "grad_norm": 2.0359078980004637, "language_loss": 0.86778444, "learning_rate": 3.165970899786928e-06, "loss": 0.88995886, "num_input_tokens_seen": 57846740, "step": 2685, "time_per_iteration": 2.721558094024658 }, { "auxiliary_loss_clip": 0.01196564, "auxiliary_loss_mlp": 0.01030865, "balance_loss_clip": 0.94033396, "balance_loss_mlp": 1.02203751, "epoch": 0.32297240425659834, "flos": 21981532383360.0, "grad_norm": 1.5549195859291973, "language_loss": 0.75466108, "learning_rate": 3.1653379104600067e-06, "loss": 0.77693534, "num_input_tokens_seen": 57866885, "step": 2686, "time_per_iteration": 2.8533473014831543 }, { "auxiliary_loss_clip": 0.01197401, "auxiliary_loss_mlp": 0.01036742, "balance_loss_clip": 1.01911426, "balance_loss_mlp": 1.0276283, "epoch": 0.3230926471472374, "flos": 22748045639040.0, "grad_norm": 1.7845738256338013, "language_loss": 0.69382358, "learning_rate": 3.164704744352251e-06, "loss": 0.71616507, "num_input_tokens_seen": 57887690, "step": 2687, "time_per_iteration": 2.737701654434204 }, { "auxiliary_loss_clip": 0.01190344, "auxiliary_loss_mlp": 0.0102788, "balance_loss_clip": 1.01624072, "balance_loss_mlp": 1.01978528, "epoch": 0.3232128900378765, "flos": 16942977947520.0, "grad_norm": 1.7064826070102344, "language_loss": 0.80385232, "learning_rate": 3.164071401559713e-06, "loss": 0.82603455, "num_input_tokens_seen": 57905090, "step": 2688, "time_per_iteration": 2.7543203830718994 }, { "auxiliary_loss_clip": 0.01197057, "auxiliary_loss_mlp": 0.01037229, "balance_loss_clip": 0.98150873, "balance_loss_mlp": 1.02868211, "epoch": 0.3233331329285156, "flos": 24023736138240.0, "grad_norm": 1.6998746912008595, "language_loss": 0.70746797, "learning_rate": 3.1634378821784674e-06, "loss": 0.72981083, "num_input_tokens_seen": 57925305, "step": 2689, "time_per_iteration": 2.679203510284424 }, { "auxiliary_loss_clip": 0.01196231, "auxiliary_loss_mlp": 0.01033085, "balance_loss_clip": 0.94219685, "balance_loss_mlp": 1.02482414, "epoch": 0.3234533758191547, "flos": 18113845582080.0, "grad_norm": 3.5446110951012826, "language_loss": 0.73851812, "learning_rate": 3.1628041863046208e-06, "loss": 0.76081127, "num_input_tokens_seen": 57942720, "step": 2690, "time_per_iteration": 2.7140355110168457 }, { "auxiliary_loss_clip": 0.01198163, "auxiliary_loss_mlp": 0.01030047, "balance_loss_clip": 1.05356455, "balance_loss_mlp": 1.02063572, "epoch": 0.3235736187097938, "flos": 16946138344320.0, "grad_norm": 2.1024226532080017, "language_loss": 0.91018236, "learning_rate": 3.162170314034304e-06, "loss": 0.93246436, "num_input_tokens_seen": 57960135, "step": 2691, "time_per_iteration": 2.6393237113952637 }, { "auxiliary_loss_clip": 0.01200992, "auxiliary_loss_mlp": 0.01032295, "balance_loss_clip": 1.05821848, "balance_loss_mlp": 1.02296066, "epoch": 0.3236938616004329, "flos": 22127150119680.0, "grad_norm": 1.6390615168163505, "language_loss": 0.81036752, "learning_rate": 3.1615362654636738e-06, "loss": 0.83270037, "num_input_tokens_seen": 57980875, "step": 2692, "time_per_iteration": 2.658038377761841 }, { "auxiliary_loss_clip": 0.01193297, "auxiliary_loss_mlp": 0.01042533, "balance_loss_clip": 0.9468689, "balance_loss_mlp": 1.03415906, "epoch": 0.32381410449107195, "flos": 17164618819200.0, "grad_norm": 1.749257857649044, "language_loss": 0.87269795, "learning_rate": 3.1609020406889163e-06, "loss": 0.89505625, "num_input_tokens_seen": 57998310, "step": 2693, "time_per_iteration": 2.686155080795288 }, { "auxiliary_loss_clip": 0.01195361, "auxiliary_loss_mlp": 0.01039692, "balance_loss_clip": 0.98029429, "balance_loss_mlp": 1.03025031, "epoch": 0.32393434738171106, "flos": 16578125550720.0, "grad_norm": 1.6854328602852193, "language_loss": 0.85234243, "learning_rate": 3.1602676398062416e-06, "loss": 0.87469292, "num_input_tokens_seen": 58017220, "step": 2694, "time_per_iteration": 3.65584397315979 }, { "auxiliary_loss_clip": 0.01195271, "auxiliary_loss_mlp": 0.01028791, "balance_loss_clip": 1.01827228, "balance_loss_mlp": 1.02011275, "epoch": 0.3240545902723502, "flos": 25483612602240.0, "grad_norm": 2.359658496424881, "language_loss": 0.6121242, "learning_rate": 3.1596330629118886e-06, "loss": 0.63436484, "num_input_tokens_seen": 58037190, "step": 2695, "time_per_iteration": 2.6777963638305664 }, { "auxiliary_loss_clip": 0.0118524, "auxiliary_loss_mlp": 0.01034355, "balance_loss_clip": 0.9032644, "balance_loss_mlp": 1.02568889, "epoch": 0.32417483316298923, "flos": 35845851634560.0, "grad_norm": 2.0610533526692003, "language_loss": 0.73066628, "learning_rate": 3.1589983101021223e-06, "loss": 0.75286222, "num_input_tokens_seen": 58055820, "step": 2696, "time_per_iteration": 2.857572555541992 }, { "auxiliary_loss_clip": 0.01196123, "auxiliary_loss_mlp": 0.01029095, "balance_loss_clip": 0.97963762, "balance_loss_mlp": 1.02078605, "epoch": 0.32429507605362834, "flos": 30080501406720.0, "grad_norm": 2.2413979834020874, "language_loss": 0.84736955, "learning_rate": 3.1583633814732337e-06, "loss": 0.86962175, "num_input_tokens_seen": 58075340, "step": 2697, "time_per_iteration": 3.7115864753723145 }, { "auxiliary_loss_clip": 0.01196548, "auxiliary_loss_mlp": 0.01026972, "balance_loss_clip": 1.05511773, "balance_loss_mlp": 1.01833487, "epoch": 0.3244153189442674, "flos": 18223265387520.0, "grad_norm": 2.3257022300253967, "language_loss": 0.71807748, "learning_rate": 3.157728277121541e-06, "loss": 0.74031264, "num_input_tokens_seen": 58093515, "step": 2698, "time_per_iteration": 2.556215763092041 }, { "auxiliary_loss_clip": 0.01198253, "auxiliary_loss_mlp": 0.01034874, "balance_loss_clip": 1.05557382, "balance_loss_mlp": 1.02597463, "epoch": 0.3245355618349065, "flos": 17710317216000.0, "grad_norm": 2.572914387148002, "language_loss": 0.78492302, "learning_rate": 3.1570929971433897e-06, "loss": 0.80725425, "num_input_tokens_seen": 58109300, "step": 2699, "time_per_iteration": 2.6347668170928955 }, { "auxiliary_loss_clip": 0.01197741, "auxiliary_loss_mlp": 0.01040447, "balance_loss_clip": 1.02139688, "balance_loss_mlp": 1.03094041, "epoch": 0.3246558047255456, "flos": 23440798316160.0, "grad_norm": 1.9334589778083475, "language_loss": 0.8345378, "learning_rate": 3.1564575416351504e-06, "loss": 0.85691965, "num_input_tokens_seen": 58128000, "step": 2700, "time_per_iteration": 3.6229937076568604 }, { "auxiliary_loss_clip": 0.01200645, "auxiliary_loss_mlp": 0.01029631, "balance_loss_clip": 1.05976355, "balance_loss_mlp": 1.02054715, "epoch": 0.32477604761618467, "flos": 21760861178880.0, "grad_norm": 2.5218561999818405, "language_loss": 0.74158597, "learning_rate": 3.155821910693221e-06, "loss": 0.76388872, "num_input_tokens_seen": 58147415, "step": 2701, "time_per_iteration": 3.726641893386841 }, { "auxiliary_loss_clip": 0.01189819, "auxiliary_loss_mlp": 0.01026547, "balance_loss_clip": 0.97678602, "balance_loss_mlp": 1.0180769, "epoch": 0.3248962905068238, "flos": 19828328624640.0, "grad_norm": 1.8579255477499392, "language_loss": 0.85841101, "learning_rate": 3.1551861044140275e-06, "loss": 0.8805747, "num_input_tokens_seen": 58167050, "step": 2702, "time_per_iteration": 2.709372043609619 }, { "auxiliary_loss_clip": 0.01182582, "auxiliary_loss_mlp": 0.01033225, "balance_loss_clip": 0.90206641, "balance_loss_mlp": 1.02481508, "epoch": 0.3250165333974629, "flos": 23948215793280.0, "grad_norm": 1.5859411314781993, "language_loss": 0.77633214, "learning_rate": 3.15455012289402e-06, "loss": 0.79849029, "num_input_tokens_seen": 58186695, "step": 2703, "time_per_iteration": 2.7575464248657227 }, { "auxiliary_loss_clip": 0.01200438, "auxiliary_loss_mlp": 0.0102992, "balance_loss_clip": 1.02245033, "balance_loss_mlp": 1.02106881, "epoch": 0.32513677628810195, "flos": 23989333887360.0, "grad_norm": 2.0633041971329504, "language_loss": 0.84331936, "learning_rate": 3.153913966229677e-06, "loss": 0.86562294, "num_input_tokens_seen": 58205815, "step": 2704, "time_per_iteration": 2.664201498031616 }, { "auxiliary_loss_clip": 0.01090326, "auxiliary_loss_mlp": 0.01003041, "balance_loss_clip": 0.98321128, "balance_loss_mlp": 1.00033462, "epoch": 0.32525701917874106, "flos": 70655790009600.0, "grad_norm": 0.6383996561825012, "language_loss": 0.50333011, "learning_rate": 3.1532776345175027e-06, "loss": 0.52426374, "num_input_tokens_seen": 58270960, "step": 2705, "time_per_iteration": 3.2691726684570312 }, { "auxiliary_loss_clip": 0.01196345, "auxiliary_loss_mlp": 0.01031467, "balance_loss_clip": 1.05682659, "balance_loss_mlp": 1.02364135, "epoch": 0.32537726206938017, "flos": 19682639061120.0, "grad_norm": 2.009581585115112, "language_loss": 0.78869289, "learning_rate": 3.1526411278540285e-06, "loss": 0.81097096, "num_input_tokens_seen": 58289390, "step": 2706, "time_per_iteration": 2.573209047317505 }, { "auxiliary_loss_clip": 0.0119732, "auxiliary_loss_mlp": 0.01032184, "balance_loss_clip": 0.9770245, "balance_loss_mlp": 1.02344012, "epoch": 0.3254975049600192, "flos": 28760999293440.0, "grad_norm": 2.07994636163549, "language_loss": 0.81120968, "learning_rate": 3.1520044463358116e-06, "loss": 0.83350468, "num_input_tokens_seen": 58306120, "step": 2707, "time_per_iteration": 2.7539114952087402 }, { "auxiliary_loss_clip": 0.01193325, "auxiliary_loss_mlp": 0.01033175, "balance_loss_clip": 1.01784921, "balance_loss_mlp": 1.0248785, "epoch": 0.32561774785065833, "flos": 18877378008960.0, "grad_norm": 1.4697515232092626, "language_loss": 0.80236, "learning_rate": 3.151367590059436e-06, "loss": 0.82462502, "num_input_tokens_seen": 58324545, "step": 2708, "time_per_iteration": 2.6878631114959717 }, { "auxiliary_loss_clip": 0.01198374, "auxiliary_loss_mlp": 0.01125163, "balance_loss_clip": 1.0580194, "balance_loss_mlp": 0.0, "epoch": 0.32573799074129745, "flos": 23112107936640.0, "grad_norm": 1.965101176638041, "language_loss": 0.86651605, "learning_rate": 3.1507305591215117e-06, "loss": 0.88975143, "num_input_tokens_seen": 58342455, "step": 2709, "time_per_iteration": 2.6179237365722656 }, { "auxiliary_loss_clip": 0.01091056, "auxiliary_loss_mlp": 0.01000251, "balance_loss_clip": 0.98306119, "balance_loss_mlp": 0.99760437, "epoch": 0.3258582336319365, "flos": 71237650423680.0, "grad_norm": 0.6717828506562884, "language_loss": 0.55734861, "learning_rate": 3.150093353618677e-06, "loss": 0.57826167, "num_input_tokens_seen": 58407185, "step": 2710, "time_per_iteration": 3.282228708267212 }, { "auxiliary_loss_clip": 0.0119946, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.01714945, "balance_loss_mlp": 1.02357495, "epoch": 0.3259784765225756, "flos": 22456020067200.0, "grad_norm": 2.8608385252823316, "language_loss": 0.88688916, "learning_rate": 3.149455973647596e-06, "loss": 0.90920389, "num_input_tokens_seen": 58425245, "step": 2711, "time_per_iteration": 2.6531803607940674 }, { "auxiliary_loss_clip": 0.01180544, "auxiliary_loss_mlp": 0.01033126, "balance_loss_clip": 0.9357456, "balance_loss_mlp": 1.02413213, "epoch": 0.32609871941321467, "flos": 20484811543680.0, "grad_norm": 1.7597020215364407, "language_loss": 0.77424413, "learning_rate": 3.1488184193049563e-06, "loss": 0.79638082, "num_input_tokens_seen": 58444780, "step": 2712, "time_per_iteration": 2.698190927505493 }, { "auxiliary_loss_clip": 0.01197339, "auxiliary_loss_mlp": 0.01032177, "balance_loss_clip": 1.0583632, "balance_loss_mlp": 1.02425516, "epoch": 0.3262189623038538, "flos": 22416805393920.0, "grad_norm": 1.4892516376177787, "language_loss": 0.71901882, "learning_rate": 3.1481806906874767e-06, "loss": 0.74131399, "num_input_tokens_seen": 58466090, "step": 2713, "time_per_iteration": 2.6661019325256348 }, { "auxiliary_loss_clip": 0.01196202, "auxiliary_loss_mlp": 0.01034614, "balance_loss_clip": 1.05750895, "balance_loss_mlp": 1.02657354, "epoch": 0.3263392051944929, "flos": 20923496346240.0, "grad_norm": 1.5723743600642301, "language_loss": 0.87928534, "learning_rate": 3.147542787891899e-06, "loss": 0.90159345, "num_input_tokens_seen": 58485435, "step": 2714, "time_per_iteration": 2.648273468017578 }, { "auxiliary_loss_clip": 0.0119587, "auxiliary_loss_mlp": 0.01030259, "balance_loss_clip": 0.97997952, "balance_loss_mlp": 1.02202153, "epoch": 0.32645944808513194, "flos": 24025172682240.0, "grad_norm": 1.74167743693137, "language_loss": 0.75357115, "learning_rate": 3.1469047110149926e-06, "loss": 0.77583247, "num_input_tokens_seen": 58504175, "step": 2715, "time_per_iteration": 2.6991007328033447 }, { "auxiliary_loss_clip": 0.01182362, "auxiliary_loss_mlp": 0.0102634, "balance_loss_clip": 0.90262091, "balance_loss_mlp": 1.01744068, "epoch": 0.32657969097577105, "flos": 21032413361280.0, "grad_norm": 1.7706247625845146, "language_loss": 0.85265064, "learning_rate": 3.146266460153554e-06, "loss": 0.87473768, "num_input_tokens_seen": 58523885, "step": 2716, "time_per_iteration": 2.759361743927002 }, { "auxiliary_loss_clip": 0.01190646, "auxiliary_loss_mlp": 0.01124953, "balance_loss_clip": 0.97940886, "balance_loss_mlp": 0.0, "epoch": 0.32669993386641016, "flos": 22710267509760.0, "grad_norm": 1.7870039982277781, "language_loss": 0.79932678, "learning_rate": 3.145628035404404e-06, "loss": 0.82248271, "num_input_tokens_seen": 58543085, "step": 2717, "time_per_iteration": 2.664414882659912 }, { "auxiliary_loss_clip": 0.01087181, "auxiliary_loss_mlp": 0.01004117, "balance_loss_clip": 0.98063147, "balance_loss_mlp": 1.0014466, "epoch": 0.3268201767570492, "flos": 72105718406400.0, "grad_norm": 0.8707159103910906, "language_loss": 0.57551908, "learning_rate": 3.1449894368643922e-06, "loss": 0.59643209, "num_input_tokens_seen": 58605400, "step": 2718, "time_per_iteration": 3.3206422328948975 }, { "auxiliary_loss_clip": 0.01193937, "auxiliary_loss_mlp": 0.010296, "balance_loss_clip": 0.94420063, "balance_loss_mlp": 1.02179778, "epoch": 0.32694041964768833, "flos": 24535175938560.0, "grad_norm": 1.5545335511991756, "language_loss": 0.71211141, "learning_rate": 3.1443506646303934e-06, "loss": 0.73434675, "num_input_tokens_seen": 58626700, "step": 2719, "time_per_iteration": 2.817732334136963 }, { "auxiliary_loss_clip": 0.01194819, "auxiliary_loss_mlp": 0.01034205, "balance_loss_clip": 1.01642358, "balance_loss_mlp": 1.02609837, "epoch": 0.32706066253832744, "flos": 33183003755520.0, "grad_norm": 1.745228551633673, "language_loss": 0.66884208, "learning_rate": 3.1437117187993086e-06, "loss": 0.69113231, "num_input_tokens_seen": 58649020, "step": 2720, "time_per_iteration": 3.77008056640625 }, { "auxiliary_loss_clip": 0.01184912, "auxiliary_loss_mlp": 0.0103731, "balance_loss_clip": 0.93848163, "balance_loss_mlp": 1.02845871, "epoch": 0.3271809054289665, "flos": 24061622008320.0, "grad_norm": 1.7965958378046227, "language_loss": 0.79929829, "learning_rate": 3.143072599468065e-06, "loss": 0.82152051, "num_input_tokens_seen": 58668845, "step": 2721, "time_per_iteration": 2.8337464332580566 }, { "auxiliary_loss_clip": 0.01195278, "auxiliary_loss_mlp": 0.01036267, "balance_loss_clip": 0.98377621, "balance_loss_mlp": 1.02749336, "epoch": 0.3273011483196056, "flos": 38253769712640.0, "grad_norm": 1.504145140112092, "language_loss": 0.75797164, "learning_rate": 3.1424333067336174e-06, "loss": 0.78028703, "num_input_tokens_seen": 58691610, "step": 2722, "time_per_iteration": 2.857879638671875 }, { "auxiliary_loss_clip": 0.0120046, "auxiliary_loss_mlp": 0.01030675, "balance_loss_clip": 1.01840186, "balance_loss_mlp": 1.02208602, "epoch": 0.3274213912102447, "flos": 29054389582080.0, "grad_norm": 1.7635046955147031, "language_loss": 0.78081381, "learning_rate": 3.141793840692945e-06, "loss": 0.8031252, "num_input_tokens_seen": 58712360, "step": 2723, "time_per_iteration": 2.728201389312744 }, { "auxiliary_loss_clip": 0.01182389, "auxiliary_loss_mlp": 0.01033727, "balance_loss_clip": 0.97740561, "balance_loss_mlp": 1.02475011, "epoch": 0.32754163410088377, "flos": 29133249891840.0, "grad_norm": 3.2399400261509443, "language_loss": 0.61047643, "learning_rate": 3.1411542014430553e-06, "loss": 0.63263762, "num_input_tokens_seen": 58733440, "step": 2724, "time_per_iteration": 3.6654562950134277 }, { "auxiliary_loss_clip": 0.01186275, "auxiliary_loss_mlp": 0.01031008, "balance_loss_clip": 0.93572313, "balance_loss_mlp": 1.02287245, "epoch": 0.3276618769915229, "flos": 20631075724800.0, "grad_norm": 2.1808756922742, "language_loss": 0.81760663, "learning_rate": 3.1405143890809804e-06, "loss": 0.8397795, "num_input_tokens_seen": 58752735, "step": 2725, "time_per_iteration": 2.7309257984161377 }, { "auxiliary_loss_clip": 0.0118454, "auxiliary_loss_mlp": 0.01027628, "balance_loss_clip": 0.97701859, "balance_loss_mlp": 1.01963496, "epoch": 0.327782119882162, "flos": 18657425076480.0, "grad_norm": 1.9627602703199116, "language_loss": 0.70207012, "learning_rate": 3.1398744037037796e-06, "loss": 0.72419184, "num_input_tokens_seen": 58772070, "step": 2726, "time_per_iteration": 3.6542177200317383 }, { "auxiliary_loss_clip": 0.01194601, "auxiliary_loss_mlp": 0.01034146, "balance_loss_clip": 0.98197949, "balance_loss_mlp": 1.02636743, "epoch": 0.32790236277280105, "flos": 21795802133760.0, "grad_norm": 2.1950260729291085, "language_loss": 0.84111035, "learning_rate": 3.139234245408538e-06, "loss": 0.86339784, "num_input_tokens_seen": 58790950, "step": 2727, "time_per_iteration": 3.7023699283599854 }, { "auxiliary_loss_clip": 0.01192125, "auxiliary_loss_mlp": 0.01124475, "balance_loss_clip": 0.94313216, "balance_loss_mlp": 0.0, "epoch": 0.32802260566344016, "flos": 23331414424320.0, "grad_norm": 1.2933536426177705, "language_loss": 0.76075941, "learning_rate": 3.1385939142923666e-06, "loss": 0.78392541, "num_input_tokens_seen": 58813340, "step": 2728, "time_per_iteration": 2.7150204181671143 }, { "auxiliary_loss_clip": 0.01190882, "auxiliary_loss_mlp": 0.01033077, "balance_loss_clip": 0.97689497, "balance_loss_mlp": 1.02449989, "epoch": 0.3281428485540792, "flos": 24206988349440.0, "grad_norm": 3.1662062422911537, "language_loss": 0.77823305, "learning_rate": 3.137953410452405e-06, "loss": 0.80047262, "num_input_tokens_seen": 58833610, "step": 2729, "time_per_iteration": 2.713268280029297 }, { "auxiliary_loss_clip": 0.01187949, "auxiliary_loss_mlp": 0.0102703, "balance_loss_clip": 0.97641897, "balance_loss_mlp": 1.01892996, "epoch": 0.3282630914447183, "flos": 34128962380800.0, "grad_norm": 1.7228917073530667, "language_loss": 0.74400944, "learning_rate": 3.1373127339858146e-06, "loss": 0.76615924, "num_input_tokens_seen": 58856210, "step": 2730, "time_per_iteration": 2.9181666374206543 }, { "auxiliary_loss_clip": 0.01180899, "auxiliary_loss_mlp": 0.01025546, "balance_loss_clip": 0.93735993, "balance_loss_mlp": 1.01810741, "epoch": 0.32838333433535744, "flos": 27600726170880.0, "grad_norm": 1.880682831894626, "language_loss": 0.74633372, "learning_rate": 3.136671884989787e-06, "loss": 0.76839817, "num_input_tokens_seen": 58876120, "step": 2731, "time_per_iteration": 2.7917380332946777 }, { "auxiliary_loss_clip": 0.01188949, "auxiliary_loss_mlp": 0.01029325, "balance_loss_clip": 0.86323774, "balance_loss_mlp": 1.02024162, "epoch": 0.3285035772259965, "flos": 12349500935040.0, "grad_norm": 2.197110621056796, "language_loss": 0.87487769, "learning_rate": 3.1360308635615383e-06, "loss": 0.89706051, "num_input_tokens_seen": 58894660, "step": 2732, "time_per_iteration": 2.7609450817108154 }, { "auxiliary_loss_clip": 0.01202429, "auxiliary_loss_mlp": 0.01035581, "balance_loss_clip": 0.98145592, "balance_loss_mlp": 1.0261749, "epoch": 0.3286238201166356, "flos": 24316084932480.0, "grad_norm": 1.8660495748991532, "language_loss": 0.78854662, "learning_rate": 3.135389669798311e-06, "loss": 0.81092674, "num_input_tokens_seen": 58912720, "step": 2733, "time_per_iteration": 2.7068049907684326 }, { "auxiliary_loss_clip": 0.01192064, "auxiliary_loss_mlp": 0.01124783, "balance_loss_clip": 1.01606178, "balance_loss_mlp": 0.0, "epoch": 0.3287440630072747, "flos": 21392812471680.0, "grad_norm": 1.7912303299043557, "language_loss": 0.80000329, "learning_rate": 3.134748303797373e-06, "loss": 0.82317179, "num_input_tokens_seen": 58930090, "step": 2734, "time_per_iteration": 2.61887788772583 }, { "auxiliary_loss_clip": 0.01186003, "auxiliary_loss_mlp": 0.01034991, "balance_loss_clip": 0.90004623, "balance_loss_mlp": 1.02574635, "epoch": 0.32886430589791377, "flos": 23732536579200.0, "grad_norm": 2.1794337024928425, "language_loss": 0.8110069, "learning_rate": 3.1341067656560203e-06, "loss": 0.83321679, "num_input_tokens_seen": 58947935, "step": 2735, "time_per_iteration": 2.7741012573242188 }, { "auxiliary_loss_clip": 0.01199214, "auxiliary_loss_mlp": 0.01029925, "balance_loss_clip": 0.97606856, "balance_loss_mlp": 1.02178335, "epoch": 0.3289845487885529, "flos": 22418708814720.0, "grad_norm": 1.9296677824740698, "language_loss": 0.86394858, "learning_rate": 3.133465055471572e-06, "loss": 0.88623995, "num_input_tokens_seen": 58967720, "step": 2736, "time_per_iteration": 2.712097406387329 }, { "auxiliary_loss_clip": 0.0118488, "auxiliary_loss_mlp": 0.01031983, "balance_loss_clip": 0.93921101, "balance_loss_mlp": 1.02388835, "epoch": 0.329104791679192, "flos": 19682603147520.0, "grad_norm": 3.379061891487731, "language_loss": 0.66329408, "learning_rate": 3.1328231733413767e-06, "loss": 0.68546265, "num_input_tokens_seen": 58984360, "step": 2737, "time_per_iteration": 2.7502083778381348 }, { "auxiliary_loss_clip": 0.01189477, "auxiliary_loss_mlp": 0.0102892, "balance_loss_clip": 1.01728737, "balance_loss_mlp": 1.02007425, "epoch": 0.32922503456983104, "flos": 15997234803840.0, "grad_norm": 1.964659122070503, "language_loss": 0.90891373, "learning_rate": 3.1321811193628067e-06, "loss": 0.93109775, "num_input_tokens_seen": 59002505, "step": 2738, "time_per_iteration": 2.7354793548583984 }, { "auxiliary_loss_clip": 0.01194279, "auxiliary_loss_mlp": 0.01125506, "balance_loss_clip": 1.01874566, "balance_loss_mlp": 0.0, "epoch": 0.32934527746047015, "flos": 26834069260800.0, "grad_norm": 1.909747061371216, "language_loss": 0.7025193, "learning_rate": 3.131538893633261e-06, "loss": 0.72571719, "num_input_tokens_seen": 59022065, "step": 2739, "time_per_iteration": 2.770869731903076 }, { "auxiliary_loss_clip": 0.01196101, "auxiliary_loss_mlp": 0.01024923, "balance_loss_clip": 1.05581081, "balance_loss_mlp": 1.01655447, "epoch": 0.32946552035110926, "flos": 23403774372480.0, "grad_norm": 2.349619804447457, "language_loss": 0.78272808, "learning_rate": 3.130896496250165e-06, "loss": 0.80493832, "num_input_tokens_seen": 59041890, "step": 2740, "time_per_iteration": 2.6663854122161865 }, { "auxiliary_loss_clip": 0.01193858, "auxiliary_loss_mlp": 0.01030431, "balance_loss_clip": 1.05316722, "balance_loss_mlp": 1.02174091, "epoch": 0.3295857632417483, "flos": 14172470029440.0, "grad_norm": 2.1669050792432505, "language_loss": 0.86457348, "learning_rate": 3.1302539273109693e-06, "loss": 0.88681638, "num_input_tokens_seen": 59058715, "step": 2741, "time_per_iteration": 2.656998634338379 }, { "auxiliary_loss_clip": 0.01182493, "auxiliary_loss_mlp": 0.01033974, "balance_loss_clip": 0.97803205, "balance_loss_mlp": 1.02582002, "epoch": 0.32970600613238743, "flos": 22196708807040.0, "grad_norm": 1.5772663886151435, "language_loss": 0.8029294, "learning_rate": 3.1296111869131513e-06, "loss": 0.8250941, "num_input_tokens_seen": 59076140, "step": 2742, "time_per_iteration": 2.710123062133789 }, { "auxiliary_loss_clip": 0.01193533, "auxiliary_loss_mlp": 0.01027512, "balance_loss_clip": 1.05342412, "balance_loss_mlp": 1.0193882, "epoch": 0.32982624902302654, "flos": 22053784590720.0, "grad_norm": 1.919817962933458, "language_loss": 0.85802698, "learning_rate": 3.1289682751542153e-06, "loss": 0.88023746, "num_input_tokens_seen": 59095700, "step": 2743, "time_per_iteration": 2.6138858795166016 }, { "auxiliary_loss_clip": 0.01190614, "auxiliary_loss_mlp": 0.01031063, "balance_loss_clip": 1.01716971, "balance_loss_mlp": 1.02320743, "epoch": 0.3299464919136656, "flos": 18661626967680.0, "grad_norm": 1.9170719495322754, "language_loss": 0.71216953, "learning_rate": 3.1283251921316883e-06, "loss": 0.73438632, "num_input_tokens_seen": 59113445, "step": 2744, "time_per_iteration": 2.631021022796631 }, { "auxiliary_loss_clip": 0.01186068, "auxiliary_loss_mlp": 0.01032445, "balance_loss_clip": 0.90249717, "balance_loss_mlp": 1.02399945, "epoch": 0.3300667348043047, "flos": 13407357404160.0, "grad_norm": 1.9533140873575185, "language_loss": 0.80961251, "learning_rate": 3.1276819379431277e-06, "loss": 0.83179772, "num_input_tokens_seen": 59131535, "step": 2745, "time_per_iteration": 2.730018138885498 }, { "auxiliary_loss_clip": 0.01202881, "auxiliary_loss_mlp": 0.01125231, "balance_loss_clip": 0.98228061, "balance_loss_mlp": 0.0, "epoch": 0.33018697769494376, "flos": 15742556398080.0, "grad_norm": 2.0378225880264136, "language_loss": 0.75198734, "learning_rate": 3.1270385126861134e-06, "loss": 0.77526844, "num_input_tokens_seen": 59149520, "step": 2746, "time_per_iteration": 3.7310445308685303 }, { "auxiliary_loss_clip": 0.01198068, "auxiliary_loss_mlp": 0.0103368, "balance_loss_clip": 1.05630434, "balance_loss_mlp": 1.02513909, "epoch": 0.3303072205855829, "flos": 18258601392000.0, "grad_norm": 2.0320906919119115, "language_loss": 0.82159889, "learning_rate": 3.1263949164582533e-06, "loss": 0.84391636, "num_input_tokens_seen": 59169170, "step": 2747, "time_per_iteration": 2.635007619857788 }, { "auxiliary_loss_clip": 0.01195205, "auxiliary_loss_mlp": 0.01025596, "balance_loss_clip": 1.05406094, "balance_loss_mlp": 1.01725698, "epoch": 0.330427463476222, "flos": 17749424148480.0, "grad_norm": 1.9953414215602232, "language_loss": 0.78373992, "learning_rate": 3.1257511493571797e-06, "loss": 0.80594796, "num_input_tokens_seen": 59187675, "step": 2748, "time_per_iteration": 2.655689001083374 }, { "auxiliary_loss_clip": 0.01193003, "auxiliary_loss_mlp": 0.01032112, "balance_loss_clip": 0.94114506, "balance_loss_mlp": 1.02371943, "epoch": 0.33054770636686104, "flos": 27162580072320.0, "grad_norm": 1.6820078048522606, "language_loss": 0.78085613, "learning_rate": 3.125107211480552e-06, "loss": 0.80310726, "num_input_tokens_seen": 59207610, "step": 2749, "time_per_iteration": 3.787700653076172 }, { "auxiliary_loss_clip": 0.0118457, "auxiliary_loss_mlp": 0.01037601, "balance_loss_clip": 0.86225003, "balance_loss_mlp": 1.02874947, "epoch": 0.33066794925750015, "flos": 20117193799680.0, "grad_norm": 1.5899938357909527, "language_loss": 0.79829812, "learning_rate": 3.124463102926054e-06, "loss": 0.82051986, "num_input_tokens_seen": 59226945, "step": 2750, "time_per_iteration": 2.7607080936431885 }, { "auxiliary_loss_clip": 0.01084676, "auxiliary_loss_mlp": 0.01003643, "balance_loss_clip": 0.98152179, "balance_loss_mlp": 1.00106812, "epoch": 0.33078819214813926, "flos": 70642609718400.0, "grad_norm": 0.759169586110985, "language_loss": 0.61579883, "learning_rate": 3.1238188237913984e-06, "loss": 0.63668203, "num_input_tokens_seen": 59291485, "step": 2751, "time_per_iteration": 3.3506836891174316 }, { "auxiliary_loss_clip": 0.01201802, "auxiliary_loss_mlp": 0.01036598, "balance_loss_clip": 1.05748868, "balance_loss_mlp": 1.02779436, "epoch": 0.3309084350387783, "flos": 21141940907520.0, "grad_norm": 2.459289773594848, "language_loss": 0.76548809, "learning_rate": 3.1231743741743202e-06, "loss": 0.78787208, "num_input_tokens_seen": 59310990, "step": 2752, "time_per_iteration": 2.6427016258239746 }, { "auxiliary_loss_clip": 0.01187291, "auxiliary_loss_mlp": 0.0103032, "balance_loss_clip": 1.01428306, "balance_loss_mlp": 1.02195787, "epoch": 0.3310286779294174, "flos": 14209350318720.0, "grad_norm": 2.136211264194758, "language_loss": 0.83471322, "learning_rate": 3.122529754172582e-06, "loss": 0.85688931, "num_input_tokens_seen": 59327875, "step": 2753, "time_per_iteration": 4.415686130523682 }, { "auxiliary_loss_clip": 0.01192205, "auxiliary_loss_mlp": 0.01031826, "balance_loss_clip": 1.0177542, "balance_loss_mlp": 1.02348161, "epoch": 0.33114892082005654, "flos": 20778130005120.0, "grad_norm": 1.8049544115852616, "language_loss": 0.72334582, "learning_rate": 3.1218849638839736e-06, "loss": 0.7455861, "num_input_tokens_seen": 59347135, "step": 2754, "time_per_iteration": 2.693248987197876 }, { "auxiliary_loss_clip": 0.01174429, "auxiliary_loss_mlp": 0.0103515, "balance_loss_clip": 0.93453228, "balance_loss_mlp": 1.0251658, "epoch": 0.3312691637106956, "flos": 17090750499840.0, "grad_norm": 1.7236092596995947, "language_loss": 0.78509808, "learning_rate": 3.121240003406307e-06, "loss": 0.80719382, "num_input_tokens_seen": 59365985, "step": 2755, "time_per_iteration": 2.7173678874969482 }, { "auxiliary_loss_clip": 0.01199264, "auxiliary_loss_mlp": 0.01036608, "balance_loss_clip": 0.94473702, "balance_loss_mlp": 1.02727962, "epoch": 0.3313894066013347, "flos": 29456230008960.0, "grad_norm": 2.0272307582098454, "language_loss": 0.72586441, "learning_rate": 3.120594872837425e-06, "loss": 0.74822313, "num_input_tokens_seen": 59384655, "step": 2756, "time_per_iteration": 2.766071319580078 }, { "auxiliary_loss_clip": 0.01090825, "auxiliary_loss_mlp": 0.01119865, "balance_loss_clip": 0.98368758, "balance_loss_mlp": 0.0, "epoch": 0.3315096494919738, "flos": 61419242280960.0, "grad_norm": 0.828373911334382, "language_loss": 0.62414557, "learning_rate": 3.1199495722751906e-06, "loss": 0.64625245, "num_input_tokens_seen": 59444185, "step": 2757, "time_per_iteration": 3.3409008979797363 }, { "auxiliary_loss_clip": 0.01190062, "auxiliary_loss_mlp": 0.01031205, "balance_loss_clip": 0.90003085, "balance_loss_mlp": 1.0222944, "epoch": 0.33162989238261287, "flos": 21653057485440.0, "grad_norm": 3.7274351452211554, "language_loss": 0.8374548, "learning_rate": 3.1193041018174972e-06, "loss": 0.85966742, "num_input_tokens_seen": 59464900, "step": 2758, "time_per_iteration": 2.785278558731079 }, { "auxiliary_loss_clip": 0.01197842, "auxiliary_loss_mlp": 0.010388, "balance_loss_clip": 1.01897097, "balance_loss_mlp": 1.02985954, "epoch": 0.331750135273252, "flos": 22674787850880.0, "grad_norm": 1.9947455448575917, "language_loss": 0.94827366, "learning_rate": 3.118658461562261e-06, "loss": 0.97064006, "num_input_tokens_seen": 59481000, "step": 2759, "time_per_iteration": 2.658219337463379 }, { "auxiliary_loss_clip": 0.01196263, "auxiliary_loss_mlp": 0.01032846, "balance_loss_clip": 0.98311043, "balance_loss_mlp": 1.02429271, "epoch": 0.33187037816389103, "flos": 22746896403840.0, "grad_norm": 1.3163312730859518, "language_loss": 0.84747887, "learning_rate": 3.118012651607426e-06, "loss": 0.86976999, "num_input_tokens_seen": 59502605, "step": 2760, "time_per_iteration": 2.6850943565368652 }, { "auxiliary_loss_clip": 0.01198614, "auxiliary_loss_mlp": 0.01033789, "balance_loss_clip": 1.0571382, "balance_loss_mlp": 1.02546883, "epoch": 0.33199062105453014, "flos": 19203769918080.0, "grad_norm": 1.985300768667301, "language_loss": 0.83502567, "learning_rate": 3.1173666720509603e-06, "loss": 0.85734975, "num_input_tokens_seen": 59519540, "step": 2761, "time_per_iteration": 2.6231491565704346 }, { "auxiliary_loss_clip": 0.01197943, "auxiliary_loss_mlp": 0.01030164, "balance_loss_clip": 0.98022527, "balance_loss_mlp": 1.02143228, "epoch": 0.33211086394516925, "flos": 31577006764800.0, "grad_norm": 1.8854633026931265, "language_loss": 0.68241334, "learning_rate": 3.116720522990859e-06, "loss": 0.70469439, "num_input_tokens_seen": 59540415, "step": 2762, "time_per_iteration": 2.7793338298797607 }, { "auxiliary_loss_clip": 0.01188976, "auxiliary_loss_mlp": 0.01031179, "balance_loss_clip": 0.86365747, "balance_loss_mlp": 1.02322817, "epoch": 0.3322311068358083, "flos": 17932496791680.0, "grad_norm": 2.727639371690942, "language_loss": 0.62202942, "learning_rate": 3.116074204525142e-06, "loss": 0.6442309, "num_input_tokens_seen": 59558590, "step": 2763, "time_per_iteration": 2.729443311691284 }, { "auxiliary_loss_clip": 0.01184972, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.01625454, "balance_loss_mlp": 1.02388835, "epoch": 0.3323513497264474, "flos": 32269831269120.0, "grad_norm": 1.471665612781208, "language_loss": 0.83283341, "learning_rate": 3.1154277167518553e-06, "loss": 0.85501349, "num_input_tokens_seen": 59580205, "step": 2764, "time_per_iteration": 2.763153076171875 }, { "auxiliary_loss_clip": 0.01087439, "auxiliary_loss_mlp": 0.0100611, "balance_loss_clip": 0.94511956, "balance_loss_mlp": 1.00354719, "epoch": 0.33247159261708653, "flos": 52668674588160.0, "grad_norm": 0.7804041468815756, "language_loss": 0.59496146, "learning_rate": 3.114781059769072e-06, "loss": 0.61589694, "num_input_tokens_seen": 59631530, "step": 2765, "time_per_iteration": 3.139488458633423 }, { "auxiliary_loss_clip": 0.01191983, "auxiliary_loss_mlp": 0.01035735, "balance_loss_clip": 0.97913557, "balance_loss_mlp": 1.02725875, "epoch": 0.3325918355077256, "flos": 27125232906240.0, "grad_norm": 2.364686317776599, "language_loss": 0.67776078, "learning_rate": 3.1141342336748874e-06, "loss": 0.70003796, "num_input_tokens_seen": 59651090, "step": 2766, "time_per_iteration": 2.798790693283081 }, { "auxiliary_loss_clip": 0.01187548, "auxiliary_loss_mlp": 0.01023763, "balance_loss_clip": 1.01675463, "balance_loss_mlp": 1.01614237, "epoch": 0.3327120783983647, "flos": 23664414435840.0, "grad_norm": 1.6215867041649854, "language_loss": 0.81720203, "learning_rate": 3.1134872385674253e-06, "loss": 0.83931518, "num_input_tokens_seen": 59675245, "step": 2767, "time_per_iteration": 2.7453219890594482 }, { "auxiliary_loss_clip": 0.01196237, "auxiliary_loss_mlp": 0.01034802, "balance_loss_clip": 0.97745854, "balance_loss_mlp": 1.02626681, "epoch": 0.3328323212890038, "flos": 19171378828800.0, "grad_norm": 1.940100661435853, "language_loss": 0.856305, "learning_rate": 3.1128400745448353e-06, "loss": 0.87861544, "num_input_tokens_seen": 59694625, "step": 2768, "time_per_iteration": 2.7258057594299316 }, { "auxiliary_loss_clip": 0.01200083, "auxiliary_loss_mlp": 0.01040207, "balance_loss_clip": 1.02086258, "balance_loss_mlp": 1.03164792, "epoch": 0.33295256417964286, "flos": 37706347463040.0, "grad_norm": 2.2253989432356307, "language_loss": 0.62849492, "learning_rate": 3.11219274170529e-06, "loss": 0.65089786, "num_input_tokens_seen": 59716435, "step": 2769, "time_per_iteration": 2.866410255432129 }, { "auxiliary_loss_clip": 0.01186073, "auxiliary_loss_mlp": 0.01027642, "balance_loss_clip": 0.97738385, "balance_loss_mlp": 1.01918387, "epoch": 0.333072807070282, "flos": 26505989412480.0, "grad_norm": 1.8715328338513073, "language_loss": 0.81509429, "learning_rate": 3.1115452401469903e-06, "loss": 0.83723146, "num_input_tokens_seen": 59736835, "step": 2770, "time_per_iteration": 2.704596996307373 }, { "auxiliary_loss_clip": 0.01178733, "auxiliary_loss_mlp": 0.01042685, "balance_loss_clip": 0.89933693, "balance_loss_mlp": 1.03429854, "epoch": 0.3331930499609211, "flos": 21430913823360.0, "grad_norm": 2.50489195233826, "language_loss": 0.86535215, "learning_rate": 3.1108975699681613e-06, "loss": 0.88756633, "num_input_tokens_seen": 59754230, "step": 2771, "time_per_iteration": 2.7876410484313965 }, { "auxiliary_loss_clip": 0.0118901, "auxiliary_loss_mlp": 0.01027381, "balance_loss_clip": 0.93986017, "balance_loss_mlp": 1.0189352, "epoch": 0.33331329285156014, "flos": 20659947281280.0, "grad_norm": 1.7564421829474461, "language_loss": 0.71688867, "learning_rate": 3.1102497312670542e-06, "loss": 0.73905265, "num_input_tokens_seen": 59772235, "step": 2772, "time_per_iteration": 2.770764112472534 }, { "auxiliary_loss_clip": 0.01181243, "auxiliary_loss_mlp": 0.01035303, "balance_loss_clip": 0.97772706, "balance_loss_mlp": 1.02635074, "epoch": 0.33343353574219925, "flos": 28001596930560.0, "grad_norm": 2.240545104412698, "language_loss": 0.80689394, "learning_rate": 3.109601724141946e-06, "loss": 0.82905936, "num_input_tokens_seen": 59791230, "step": 2773, "time_per_iteration": 3.7208714485168457 }, { "auxiliary_loss_clip": 0.011869, "auxiliary_loss_mlp": 0.01029191, "balance_loss_clip": 0.9770807, "balance_loss_mlp": 1.02069736, "epoch": 0.33355377863283836, "flos": 23764963582080.0, "grad_norm": 1.7179417655666636, "language_loss": 0.67910433, "learning_rate": 3.108953548691138e-06, "loss": 0.70126528, "num_input_tokens_seen": 59811315, "step": 2774, "time_per_iteration": 2.819744825363159 }, { "auxiliary_loss_clip": 0.01195386, "auxiliary_loss_mlp": 0.01032608, "balance_loss_clip": 1.05586839, "balance_loss_mlp": 1.02293992, "epoch": 0.3336740215234774, "flos": 37779677078400.0, "grad_norm": 2.347657287657787, "language_loss": 0.72712344, "learning_rate": 3.108305205012959e-06, "loss": 0.74940336, "num_input_tokens_seen": 59832010, "step": 2775, "time_per_iteration": 3.659087657928467 }, { "auxiliary_loss_clip": 0.01191086, "auxiliary_loss_mlp": 0.01034265, "balance_loss_clip": 0.97911865, "balance_loss_mlp": 1.02615881, "epoch": 0.3337942644141165, "flos": 25519056347520.0, "grad_norm": 1.9488707362031832, "language_loss": 0.87605083, "learning_rate": 3.107656693205761e-06, "loss": 0.89830434, "num_input_tokens_seen": 59851450, "step": 2776, "time_per_iteration": 2.7247395515441895 }, { "auxiliary_loss_clip": 0.01199664, "auxiliary_loss_mlp": 0.01034271, "balance_loss_clip": 1.05548286, "balance_loss_mlp": 1.02478826, "epoch": 0.3339145073047556, "flos": 25989844930560.0, "grad_norm": 2.5037195184364833, "language_loss": 0.70571762, "learning_rate": 3.107008013367924e-06, "loss": 0.72805703, "num_input_tokens_seen": 59870245, "step": 2777, "time_per_iteration": 2.636394500732422 }, { "auxiliary_loss_clip": 0.01184536, "auxiliary_loss_mlp": 0.01028499, "balance_loss_clip": 0.93743193, "balance_loss_mlp": 1.0199641, "epoch": 0.3340347501953947, "flos": 19062569554560.0, "grad_norm": 27.860795477029747, "language_loss": 0.86638206, "learning_rate": 3.1063591655978507e-06, "loss": 0.88851237, "num_input_tokens_seen": 59886195, "step": 2778, "time_per_iteration": 3.6602585315704346 }, { "auxiliary_loss_clip": 0.01174022, "auxiliary_loss_mlp": 0.0102885, "balance_loss_clip": 0.89736831, "balance_loss_mlp": 1.0203917, "epoch": 0.3341549930860338, "flos": 18109715518080.0, "grad_norm": 1.800206942164576, "language_loss": 0.79498488, "learning_rate": 3.105710149993972e-06, "loss": 0.81701362, "num_input_tokens_seen": 59905525, "step": 2779, "time_per_iteration": 3.664416790008545 }, { "auxiliary_loss_clip": 0.01196837, "auxiliary_loss_mlp": 0.01032294, "balance_loss_clip": 1.05637765, "balance_loss_mlp": 1.02384758, "epoch": 0.33427523597667286, "flos": 22674967418880.0, "grad_norm": 2.0876145605267094, "language_loss": 0.85625398, "learning_rate": 3.1050609666547427e-06, "loss": 0.87854522, "num_input_tokens_seen": 59925085, "step": 2780, "time_per_iteration": 2.6202101707458496 }, { "auxiliary_loss_clip": 0.01200014, "auxiliary_loss_mlp": 0.01028667, "balance_loss_clip": 0.94222409, "balance_loss_mlp": 1.01951134, "epoch": 0.33439547886731197, "flos": 22638338524800.0, "grad_norm": 2.0819049777496574, "language_loss": 0.77559131, "learning_rate": 3.104411615678644e-06, "loss": 0.79787815, "num_input_tokens_seen": 59943935, "step": 2781, "time_per_iteration": 2.7241883277893066 }, { "auxiliary_loss_clip": 0.01186867, "auxiliary_loss_mlp": 0.01025484, "balance_loss_clip": 0.97791725, "balance_loss_mlp": 1.01665676, "epoch": 0.3345157217579511, "flos": 24096383395200.0, "grad_norm": 2.265500129112915, "language_loss": 0.73449546, "learning_rate": 3.1037620971641803e-06, "loss": 0.75661898, "num_input_tokens_seen": 59963725, "step": 2782, "time_per_iteration": 2.7175300121307373 }, { "auxiliary_loss_clip": 0.0119777, "auxiliary_loss_mlp": 0.01032252, "balance_loss_clip": 1.05627394, "balance_loss_mlp": 1.02346051, "epoch": 0.33463596464859013, "flos": 18989491334400.0, "grad_norm": 2.5711876366675264, "language_loss": 0.64737844, "learning_rate": 3.1031124112098844e-06, "loss": 0.66967869, "num_input_tokens_seen": 59981935, "step": 2783, "time_per_iteration": 2.690131425857544 }, { "auxiliary_loss_clip": 0.01194012, "auxiliary_loss_mlp": 0.01030808, "balance_loss_clip": 0.97963554, "balance_loss_mlp": 1.02230835, "epoch": 0.33475620753922924, "flos": 20375607219840.0, "grad_norm": 1.8711979837091615, "language_loss": 0.72142637, "learning_rate": 3.1024625579143127e-06, "loss": 0.74367464, "num_input_tokens_seen": 59999455, "step": 2784, "time_per_iteration": 2.689628839492798 }, { "auxiliary_loss_clip": 0.01196081, "auxiliary_loss_mlp": 0.01029556, "balance_loss_clip": 1.05649734, "balance_loss_mlp": 1.02125287, "epoch": 0.33487645042986836, "flos": 18182578256640.0, "grad_norm": 1.8600008279299054, "language_loss": 0.73114258, "learning_rate": 3.101812537376048e-06, "loss": 0.75339895, "num_input_tokens_seen": 60018475, "step": 2785, "time_per_iteration": 2.6890952587127686 }, { "auxiliary_loss_clip": 0.01175714, "auxiliary_loss_mlp": 0.01124823, "balance_loss_clip": 0.97355044, "balance_loss_mlp": 0.0, "epoch": 0.3349966933205074, "flos": 25848824135040.0, "grad_norm": 2.2502881387318743, "language_loss": 0.84417897, "learning_rate": 3.1011623496936973e-06, "loss": 0.86718434, "num_input_tokens_seen": 60036770, "step": 2786, "time_per_iteration": 2.7564890384674072 }, { "auxiliary_loss_clip": 0.01195194, "auxiliary_loss_mlp": 0.01030603, "balance_loss_clip": 1.05665779, "balance_loss_mlp": 1.02219272, "epoch": 0.3351169362111465, "flos": 28111447699200.0, "grad_norm": 1.7737832486034848, "language_loss": 0.69677913, "learning_rate": 3.100511994965893e-06, "loss": 0.71903712, "num_input_tokens_seen": 60056725, "step": 2787, "time_per_iteration": 2.651423454284668 }, { "auxiliary_loss_clip": 0.01191034, "auxiliary_loss_mlp": 0.01028441, "balance_loss_clip": 1.01926661, "balance_loss_mlp": 1.02039433, "epoch": 0.33523717910178563, "flos": 22673315393280.0, "grad_norm": 1.6373008505004445, "language_loss": 0.84444326, "learning_rate": 3.0998614732912947e-06, "loss": 0.866638, "num_input_tokens_seen": 60076100, "step": 2788, "time_per_iteration": 2.6597650051116943 }, { "auxiliary_loss_clip": 0.01192841, "auxiliary_loss_mlp": 0.01037688, "balance_loss_clip": 1.01942515, "balance_loss_mlp": 1.02840781, "epoch": 0.3353574219924247, "flos": 15669801400320.0, "grad_norm": 1.830324551749091, "language_loss": 0.6823144, "learning_rate": 3.0992107847685855e-06, "loss": 0.70461965, "num_input_tokens_seen": 60093815, "step": 2789, "time_per_iteration": 2.630260467529297 }, { "auxiliary_loss_clip": 0.0119575, "auxiliary_loss_mlp": 0.01038758, "balance_loss_clip": 0.98266119, "balance_loss_mlp": 1.02908421, "epoch": 0.3354776648830638, "flos": 24790644443520.0, "grad_norm": 1.6723859640083363, "language_loss": 0.79034376, "learning_rate": 3.0985599294964736e-06, "loss": 0.81268883, "num_input_tokens_seen": 60113370, "step": 2790, "time_per_iteration": 2.717601776123047 }, { "auxiliary_loss_clip": 0.01201617, "auxiliary_loss_mlp": 0.010271, "balance_loss_clip": 0.9405511, "balance_loss_mlp": 1.01882076, "epoch": 0.33559790777370285, "flos": 28694852398080.0, "grad_norm": 1.8296431103720563, "language_loss": 0.69944966, "learning_rate": 3.097908907573695e-06, "loss": 0.72173679, "num_input_tokens_seen": 60131350, "step": 2791, "time_per_iteration": 2.7995052337646484 }, { "auxiliary_loss_clip": 0.01188486, "auxiliary_loss_mlp": 0.01030366, "balance_loss_clip": 0.86556649, "balance_loss_mlp": 1.02198601, "epoch": 0.33571815066434196, "flos": 22235779825920.0, "grad_norm": 1.827826533877319, "language_loss": 0.89376062, "learning_rate": 3.0972577190990067e-06, "loss": 0.91594917, "num_input_tokens_seen": 60149830, "step": 2792, "time_per_iteration": 2.845081090927124 }, { "auxiliary_loss_clip": 0.0119709, "auxiliary_loss_mlp": 0.01031023, "balance_loss_clip": 0.94074583, "balance_loss_mlp": 1.02242172, "epoch": 0.3358383935549811, "flos": 23842279607040.0, "grad_norm": 1.8297785750632403, "language_loss": 0.79910648, "learning_rate": 3.096606364171196e-06, "loss": 0.82138759, "num_input_tokens_seen": 60169620, "step": 2793, "time_per_iteration": 2.7776038646698 }, { "auxiliary_loss_clip": 0.01171429, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 0.93677044, "balance_loss_mlp": 1.02360225, "epoch": 0.33595863644562013, "flos": 22267308988800.0, "grad_norm": 2.023604483769012, "language_loss": 0.85142469, "learning_rate": 3.0959548428890703e-06, "loss": 0.87346083, "num_input_tokens_seen": 60188490, "step": 2794, "time_per_iteration": 2.7661943435668945 }, { "auxiliary_loss_clip": 0.01190975, "auxiliary_loss_mlp": 0.01027779, "balance_loss_clip": 1.0205915, "balance_loss_mlp": 1.01947594, "epoch": 0.33607887933625924, "flos": 20119779578880.0, "grad_norm": 1.8774269638307994, "language_loss": 0.83893764, "learning_rate": 3.095303155351468e-06, "loss": 0.86112523, "num_input_tokens_seen": 60208695, "step": 2795, "time_per_iteration": 2.713223457336426 }, { "auxiliary_loss_clip": 0.01178743, "auxiliary_loss_mlp": 0.01039105, "balance_loss_clip": 0.90122402, "balance_loss_mlp": 1.03060532, "epoch": 0.33619912222689835, "flos": 19318109886720.0, "grad_norm": 2.1108103938371316, "language_loss": 0.79037535, "learning_rate": 3.0946513016572464e-06, "loss": 0.81255388, "num_input_tokens_seen": 60227600, "step": 2796, "time_per_iteration": 2.862790584564209 }, { "auxiliary_loss_clip": 0.01194059, "auxiliary_loss_mlp": 0.01033842, "balance_loss_clip": 1.01615119, "balance_loss_mlp": 1.0247587, "epoch": 0.3363193651175374, "flos": 16800664262400.0, "grad_norm": 2.077930071611614, "language_loss": 0.7696659, "learning_rate": 3.0939992819052938e-06, "loss": 0.79194486, "num_input_tokens_seen": 60245110, "step": 2797, "time_per_iteration": 2.7032461166381836 }, { "auxiliary_loss_clip": 0.01191065, "auxiliary_loss_mlp": 0.01031167, "balance_loss_clip": 0.97984254, "balance_loss_mlp": 1.02270865, "epoch": 0.3364396080081765, "flos": 23550289948800.0, "grad_norm": 1.9960415170083423, "language_loss": 0.809425, "learning_rate": 3.0933470961945193e-06, "loss": 0.8316474, "num_input_tokens_seen": 60263405, "step": 2798, "time_per_iteration": 3.726520538330078 }, { "auxiliary_loss_clip": 0.01186469, "auxiliary_loss_mlp": 0.01033285, "balance_loss_clip": 0.97955668, "balance_loss_mlp": 1.02525592, "epoch": 0.3365598508988156, "flos": 28037902602240.0, "grad_norm": 1.6470294684620872, "language_loss": 0.68214166, "learning_rate": 3.0926947446238597e-06, "loss": 0.70433915, "num_input_tokens_seen": 60282975, "step": 2799, "time_per_iteration": 2.728177309036255 }, { "auxiliary_loss_clip": 0.01198197, "auxiliary_loss_mlp": 0.01031541, "balance_loss_clip": 1.0158391, "balance_loss_mlp": 1.02291584, "epoch": 0.3366800937894547, "flos": 16982767238400.0, "grad_norm": 3.9700230289449236, "language_loss": 0.82578331, "learning_rate": 3.092042227292276e-06, "loss": 0.84808064, "num_input_tokens_seen": 60299810, "step": 2800, "time_per_iteration": 2.6585888862609863 }, { "auxiliary_loss_clip": 0.01190594, "auxiliary_loss_mlp": 0.01035382, "balance_loss_clip": 1.05559516, "balance_loss_mlp": 1.0277586, "epoch": 0.3368003366800938, "flos": 23915321913600.0, "grad_norm": 1.5917449757391176, "language_loss": 0.88260227, "learning_rate": 3.0913895442987557e-06, "loss": 0.90486205, "num_input_tokens_seen": 60320775, "step": 2801, "time_per_iteration": 3.647597551345825 }, { "auxiliary_loss_clip": 0.01191201, "auxiliary_loss_mlp": 0.01124933, "balance_loss_clip": 0.94216704, "balance_loss_mlp": 0.0, "epoch": 0.3369205795707329, "flos": 24791219061120.0, "grad_norm": 1.538527042566895, "language_loss": 0.85678542, "learning_rate": 3.090736695742308e-06, "loss": 0.87994671, "num_input_tokens_seen": 60341905, "step": 2802, "time_per_iteration": 2.8510684967041016 }, { "auxiliary_loss_clip": 0.01178668, "auxiliary_loss_mlp": 0.01027115, "balance_loss_clip": 0.89888078, "balance_loss_mlp": 1.0193603, "epoch": 0.33704082246137196, "flos": 17931096161280.0, "grad_norm": 2.5765620738440287, "language_loss": 0.52434999, "learning_rate": 3.0900836817219713e-06, "loss": 0.54640782, "num_input_tokens_seen": 60358335, "step": 2803, "time_per_iteration": 2.721595287322998 }, { "auxiliary_loss_clip": 0.01191261, "auxiliary_loss_mlp": 0.01036936, "balance_loss_clip": 1.05413318, "balance_loss_mlp": 1.0278641, "epoch": 0.33716106535201107, "flos": 21286517149440.0, "grad_norm": 2.5340285221495606, "language_loss": 0.83665502, "learning_rate": 3.089430502336807e-06, "loss": 0.85893703, "num_input_tokens_seen": 60378305, "step": 2804, "time_per_iteration": 3.618786334991455 }, { "auxiliary_loss_clip": 0.01195361, "auxiliary_loss_mlp": 0.01031971, "balance_loss_clip": 1.01779842, "balance_loss_mlp": 1.0225718, "epoch": 0.3372813082426502, "flos": 18402962152320.0, "grad_norm": 2.3689147618940076, "language_loss": 0.90417302, "learning_rate": 3.088777157685902e-06, "loss": 0.92644632, "num_input_tokens_seen": 60393895, "step": 2805, "time_per_iteration": 3.6073877811431885 }, { "auxiliary_loss_clip": 0.01187105, "auxiliary_loss_mlp": 0.01031526, "balance_loss_clip": 0.97773111, "balance_loss_mlp": 1.02262068, "epoch": 0.33740155113328923, "flos": 17201391367680.0, "grad_norm": 2.819840932391084, "language_loss": 0.85900962, "learning_rate": 3.088123647868367e-06, "loss": 0.88119596, "num_input_tokens_seen": 60410445, "step": 2806, "time_per_iteration": 2.5866641998291016 }, { "auxiliary_loss_clip": 0.01194095, "auxiliary_loss_mlp": 0.01030763, "balance_loss_clip": 1.01596606, "balance_loss_mlp": 1.02287161, "epoch": 0.33752179402392835, "flos": 29058950609280.0, "grad_norm": 1.7488363199120198, "language_loss": 0.81124473, "learning_rate": 3.0874699729833405e-06, "loss": 0.83349335, "num_input_tokens_seen": 60431815, "step": 2807, "time_per_iteration": 2.7663450241088867 }, { "auxiliary_loss_clip": 0.01188156, "auxiliary_loss_mlp": 0.01031207, "balance_loss_clip": 0.97930807, "balance_loss_mlp": 1.0226779, "epoch": 0.3376420369145674, "flos": 25080730680960.0, "grad_norm": 1.6396577563294197, "language_loss": 0.79864407, "learning_rate": 3.086816133129983e-06, "loss": 0.82083774, "num_input_tokens_seen": 60452075, "step": 2808, "time_per_iteration": 2.705282688140869 }, { "auxiliary_loss_clip": 0.01195749, "auxiliary_loss_mlp": 0.01035919, "balance_loss_clip": 1.05831456, "balance_loss_mlp": 1.02808666, "epoch": 0.3377622798052065, "flos": 27490624007040.0, "grad_norm": 1.7236361524728019, "language_loss": 0.75995266, "learning_rate": 3.0861621284074826e-06, "loss": 0.78226936, "num_input_tokens_seen": 60472600, "step": 2809, "time_per_iteration": 2.726907968521118 }, { "auxiliary_loss_clip": 0.01200348, "auxiliary_loss_mlp": 0.01030997, "balance_loss_clip": 0.98199749, "balance_loss_mlp": 1.02259862, "epoch": 0.3378825226958456, "flos": 21975211589760.0, "grad_norm": 1.5325765470721717, "language_loss": 0.73180532, "learning_rate": 3.085507958915051e-06, "loss": 0.7541188, "num_input_tokens_seen": 60491030, "step": 2810, "time_per_iteration": 2.680854558944702 }, { "auxiliary_loss_clip": 0.01189688, "auxiliary_loss_mlp": 0.01032548, "balance_loss_clip": 0.98011947, "balance_loss_mlp": 1.02383399, "epoch": 0.3380027655864847, "flos": 42523189200000.0, "grad_norm": 1.7559281770826638, "language_loss": 0.70707333, "learning_rate": 3.084853624751925e-06, "loss": 0.72929573, "num_input_tokens_seen": 60512615, "step": 2811, "time_per_iteration": 2.8185372352600098 }, { "auxiliary_loss_clip": 0.01193745, "auxiliary_loss_mlp": 0.01029708, "balance_loss_clip": 0.94227707, "balance_loss_mlp": 1.02108932, "epoch": 0.3381230084771238, "flos": 26725080418560.0, "grad_norm": 1.663979432041294, "language_loss": 0.85710007, "learning_rate": 3.0841991260173668e-06, "loss": 0.87933463, "num_input_tokens_seen": 60532520, "step": 2812, "time_per_iteration": 2.7267234325408936 }, { "auxiliary_loss_clip": 0.01195131, "auxiliary_loss_mlp": 0.01033257, "balance_loss_clip": 1.05649006, "balance_loss_mlp": 1.02498412, "epoch": 0.3382432513677629, "flos": 22710375250560.0, "grad_norm": 1.7560597835977487, "language_loss": 0.79976416, "learning_rate": 3.0835444628106634e-06, "loss": 0.82204801, "num_input_tokens_seen": 60551500, "step": 2813, "time_per_iteration": 2.638112783432007 }, { "auxiliary_loss_clip": 0.01193078, "auxiliary_loss_mlp": 0.01125129, "balance_loss_clip": 1.05549157, "balance_loss_mlp": 0.0, "epoch": 0.33836349425840195, "flos": 22122409524480.0, "grad_norm": 1.87020294789648, "language_loss": 0.83067703, "learning_rate": 3.082889635231126e-06, "loss": 0.85385907, "num_input_tokens_seen": 60570160, "step": 2814, "time_per_iteration": 2.6631596088409424 }, { "auxiliary_loss_clip": 0.01193007, "auxiliary_loss_mlp": 0.01033756, "balance_loss_clip": 0.97669202, "balance_loss_mlp": 1.02496409, "epoch": 0.33848373714904106, "flos": 27308090067840.0, "grad_norm": 2.038322720168869, "language_loss": 0.76410568, "learning_rate": 3.0822346433780925e-06, "loss": 0.78637326, "num_input_tokens_seen": 60590885, "step": 2815, "time_per_iteration": 2.730715274810791 }, { "auxiliary_loss_clip": 0.0119183, "auxiliary_loss_mlp": 0.01039523, "balance_loss_clip": 1.01487672, "balance_loss_mlp": 1.03117847, "epoch": 0.3386039800396802, "flos": 25848716394240.0, "grad_norm": 1.9075699912298736, "language_loss": 0.86934918, "learning_rate": 3.0815794873509237e-06, "loss": 0.89166272, "num_input_tokens_seen": 60609170, "step": 2816, "time_per_iteration": 2.69920015335083 }, { "auxiliary_loss_clip": 0.01193071, "auxiliary_loss_mlp": 0.01031929, "balance_loss_clip": 1.05481005, "balance_loss_mlp": 1.02310133, "epoch": 0.33872422293031923, "flos": 18880646146560.0, "grad_norm": 2.269733580136174, "language_loss": 0.72897989, "learning_rate": 3.0809241672490066e-06, "loss": 0.75122988, "num_input_tokens_seen": 60627340, "step": 2817, "time_per_iteration": 2.649411678314209 }, { "auxiliary_loss_clip": 0.01191377, "auxiliary_loss_mlp": 0.01025643, "balance_loss_clip": 0.97911012, "balance_loss_mlp": 1.0175488, "epoch": 0.33884446582095834, "flos": 23146977064320.0, "grad_norm": 1.7756371612407833, "language_loss": 0.8521204, "learning_rate": 3.080268683171753e-06, "loss": 0.87429059, "num_input_tokens_seen": 60647630, "step": 2818, "time_per_iteration": 2.7440600395202637 }, { "auxiliary_loss_clip": 0.01191574, "auxiliary_loss_mlp": 0.01030261, "balance_loss_clip": 1.01604724, "balance_loss_mlp": 1.02220845, "epoch": 0.33896470871159745, "flos": 15997342544640.0, "grad_norm": 2.2939707732302703, "language_loss": 0.89039636, "learning_rate": 3.0796130352185985e-06, "loss": 0.91261476, "num_input_tokens_seen": 60664485, "step": 2819, "time_per_iteration": 2.768230676651001 }, { "auxiliary_loss_clip": 0.01176899, "auxiliary_loss_mlp": 0.01125289, "balance_loss_clip": 0.97442669, "balance_loss_mlp": 0.0, "epoch": 0.3390849516022365, "flos": 34495754112000.0, "grad_norm": 1.9459178367553482, "language_loss": 0.66489363, "learning_rate": 3.0789572234890057e-06, "loss": 0.6879155, "num_input_tokens_seen": 60686125, "step": 2820, "time_per_iteration": 2.8422021865844727 }, { "auxiliary_loss_clip": 0.01193305, "auxiliary_loss_mlp": 0.01029913, "balance_loss_clip": 0.9821955, "balance_loss_mlp": 1.021837, "epoch": 0.3392051944928756, "flos": 16180307447040.0, "grad_norm": 1.4941700720378899, "language_loss": 0.77401519, "learning_rate": 3.0783012480824596e-06, "loss": 0.79624736, "num_input_tokens_seen": 60705270, "step": 2821, "time_per_iteration": 2.7280635833740234 }, { "auxiliary_loss_clip": 0.01192385, "auxiliary_loss_mlp": 0.01033797, "balance_loss_clip": 1.05451024, "balance_loss_mlp": 1.02564883, "epoch": 0.33932543738351467, "flos": 17086656349440.0, "grad_norm": 2.057621046082418, "language_loss": 0.74226373, "learning_rate": 3.077645109098471e-06, "loss": 0.76452553, "num_input_tokens_seen": 60721540, "step": 2822, "time_per_iteration": 2.6619057655334473 }, { "auxiliary_loss_clip": 0.01176747, "auxiliary_loss_mlp": 0.01025133, "balance_loss_clip": 0.93870854, "balance_loss_mlp": 1.0168128, "epoch": 0.3394456802741538, "flos": 22126970551680.0, "grad_norm": 1.7110278061997566, "language_loss": 0.72136939, "learning_rate": 3.076988806636577e-06, "loss": 0.74338824, "num_input_tokens_seen": 60739300, "step": 2823, "time_per_iteration": 2.7272422313690186 }, { "auxiliary_loss_clip": 0.01195978, "auxiliary_loss_mlp": 0.01125459, "balance_loss_clip": 0.98027188, "balance_loss_mlp": 0.0, "epoch": 0.3395659231647929, "flos": 25226887121280.0, "grad_norm": 1.8989437654829076, "language_loss": 0.89013541, "learning_rate": 3.0763323407963377e-06, "loss": 0.91334981, "num_input_tokens_seen": 60758910, "step": 2824, "time_per_iteration": 2.7409791946411133 }, { "auxiliary_loss_clip": 0.01192416, "auxiliary_loss_mlp": 0.01027898, "balance_loss_clip": 1.01662588, "balance_loss_mlp": 1.01907659, "epoch": 0.33968616605543195, "flos": 29096477343360.0, "grad_norm": 2.0280707489634797, "language_loss": 0.80188489, "learning_rate": 3.075675711677337e-06, "loss": 0.82408804, "num_input_tokens_seen": 60779005, "step": 2825, "time_per_iteration": 3.752699375152588 }, { "auxiliary_loss_clip": 0.01185442, "auxiliary_loss_mlp": 0.01036326, "balance_loss_clip": 0.98072362, "balance_loss_mlp": 1.02838051, "epoch": 0.33980640894607106, "flos": 21433966479360.0, "grad_norm": 1.880625789850816, "language_loss": 0.78445655, "learning_rate": 3.0750189193791865e-06, "loss": 0.80667424, "num_input_tokens_seen": 60798590, "step": 2826, "time_per_iteration": 2.7245404720306396 }, { "auxiliary_loss_clip": 0.01186612, "auxiliary_loss_mlp": 0.01027389, "balance_loss_clip": 1.01536727, "balance_loss_mlp": 1.01840067, "epoch": 0.33992665183671017, "flos": 32490035596800.0, "grad_norm": 2.0103783919400575, "language_loss": 0.70426786, "learning_rate": 3.0743619640015203e-06, "loss": 0.72640789, "num_input_tokens_seen": 60818840, "step": 2827, "time_per_iteration": 3.738264799118042 }, { "auxiliary_loss_clip": 0.01194599, "auxiliary_loss_mlp": 0.01030751, "balance_loss_clip": 0.97694916, "balance_loss_mlp": 1.02192342, "epoch": 0.3400468947273492, "flos": 17055414495360.0, "grad_norm": 1.8524105183878696, "language_loss": 0.92117739, "learning_rate": 3.073704845643999e-06, "loss": 0.9434309, "num_input_tokens_seen": 60835965, "step": 2828, "time_per_iteration": 2.7585699558258057 }, { "auxiliary_loss_clip": 0.01191332, "auxiliary_loss_mlp": 0.01036319, "balance_loss_clip": 1.01319528, "balance_loss_mlp": 1.02730072, "epoch": 0.34016713761798834, "flos": 16872988296960.0, "grad_norm": 2.8120645194629827, "language_loss": 0.77708733, "learning_rate": 3.0730475644063063e-06, "loss": 0.79936397, "num_input_tokens_seen": 60851065, "step": 2829, "time_per_iteration": 2.677809476852417 }, { "auxiliary_loss_clip": 0.01181721, "auxiliary_loss_mlp": 0.01124523, "balance_loss_clip": 0.97454709, "balance_loss_mlp": 0.0, "epoch": 0.34028738050862745, "flos": 21907161273600.0, "grad_norm": 1.7247683764387935, "language_loss": 0.64828324, "learning_rate": 3.072390120388151e-06, "loss": 0.67134571, "num_input_tokens_seen": 60869390, "step": 2830, "time_per_iteration": 3.6786882877349854 }, { "auxiliary_loss_clip": 0.01191656, "auxiliary_loss_mlp": 0.01033869, "balance_loss_clip": 1.01708317, "balance_loss_mlp": 1.02432013, "epoch": 0.3404076233992665, "flos": 22746034477440.0, "grad_norm": 2.207942554798071, "language_loss": 0.71000636, "learning_rate": 3.071732513689267e-06, "loss": 0.7322616, "num_input_tokens_seen": 60887925, "step": 2831, "time_per_iteration": 3.624160051345825 }, { "auxiliary_loss_clip": 0.01195868, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.01959419, "balance_loss_mlp": 1.02353644, "epoch": 0.3405278662899056, "flos": 17052361839360.0, "grad_norm": 3.5422217214493728, "language_loss": 0.67455065, "learning_rate": 3.0710747444094134e-06, "loss": 0.69683123, "num_input_tokens_seen": 60905955, "step": 2832, "time_per_iteration": 2.5932648181915283 }, { "auxiliary_loss_clip": 0.01191846, "auxiliary_loss_mlp": 0.0103467, "balance_loss_clip": 0.9796856, "balance_loss_mlp": 1.02599764, "epoch": 0.3406481091805447, "flos": 42813131783040.0, "grad_norm": 1.9979867187438807, "language_loss": 0.65443289, "learning_rate": 3.070416812648372e-06, "loss": 0.67669797, "num_input_tokens_seen": 60929405, "step": 2833, "time_per_iteration": 2.870617151260376 }, { "auxiliary_loss_clip": 0.01182728, "auxiliary_loss_mlp": 0.01031882, "balance_loss_clip": 0.93525732, "balance_loss_mlp": 1.02251232, "epoch": 0.3407683520711838, "flos": 26761457917440.0, "grad_norm": 2.343810485021572, "language_loss": 0.65404391, "learning_rate": 3.069758718505951e-06, "loss": 0.67619002, "num_input_tokens_seen": 60951145, "step": 2834, "time_per_iteration": 2.8361058235168457 }, { "auxiliary_loss_clip": 0.01191486, "auxiliary_loss_mlp": 0.0103435, "balance_loss_clip": 1.05494857, "balance_loss_mlp": 1.0254333, "epoch": 0.3408885949618229, "flos": 28767643309440.0, "grad_norm": 1.6085648227166107, "language_loss": 0.79952049, "learning_rate": 3.0691004620819836e-06, "loss": 0.82177877, "num_input_tokens_seen": 60971275, "step": 2835, "time_per_iteration": 2.71669602394104 }, { "auxiliary_loss_clip": 0.01094479, "auxiliary_loss_mlp": 0.01004615, "balance_loss_clip": 0.87264818, "balance_loss_mlp": 1.00231445, "epoch": 0.341008837852462, "flos": 63576252881280.0, "grad_norm": 0.808670683492874, "language_loss": 0.6024009, "learning_rate": 3.0684420434763254e-06, "loss": 0.62339181, "num_input_tokens_seen": 61037460, "step": 2836, "time_per_iteration": 3.34729266166687 }, { "auxiliary_loss_clip": 0.01182873, "auxiliary_loss_mlp": 0.01029149, "balance_loss_clip": 0.94134581, "balance_loss_mlp": 1.02156687, "epoch": 0.34112908074310105, "flos": 20812173120000.0, "grad_norm": 1.7389843374829865, "language_loss": 0.77215046, "learning_rate": 3.06778346278886e-06, "loss": 0.79427069, "num_input_tokens_seen": 61056295, "step": 2837, "time_per_iteration": 2.7815473079681396 }, { "auxiliary_loss_clip": 0.01195443, "auxiliary_loss_mlp": 0.01031682, "balance_loss_clip": 1.05717659, "balance_loss_mlp": 1.02311039, "epoch": 0.34124932363374016, "flos": 24976446520320.0, "grad_norm": 1.739472796573222, "language_loss": 0.78983349, "learning_rate": 3.0671247201194906e-06, "loss": 0.8121047, "num_input_tokens_seen": 61078430, "step": 2838, "time_per_iteration": 2.7012593746185303 }, { "auxiliary_loss_clip": 0.01189561, "auxiliary_loss_mlp": 0.01036331, "balance_loss_clip": 0.93800545, "balance_loss_mlp": 1.02753329, "epoch": 0.3413695665243792, "flos": 28402970480640.0, "grad_norm": 1.7425517061629576, "language_loss": 0.75554466, "learning_rate": 3.066465815568151e-06, "loss": 0.7778036, "num_input_tokens_seen": 61099260, "step": 2839, "time_per_iteration": 2.7752890586853027 }, { "auxiliary_loss_clip": 0.01190836, "auxiliary_loss_mlp": 0.0103119, "balance_loss_clip": 1.01385462, "balance_loss_mlp": 1.02255642, "epoch": 0.34148980941501833, "flos": 25302012416640.0, "grad_norm": 1.710771804121954, "language_loss": 0.68760514, "learning_rate": 3.0658067492347947e-06, "loss": 0.7098254, "num_input_tokens_seen": 61121900, "step": 2840, "time_per_iteration": 2.7427945137023926 }, { "auxiliary_loss_clip": 0.01173696, "auxiliary_loss_mlp": 0.0103467, "balance_loss_clip": 0.82297695, "balance_loss_mlp": 1.02568734, "epoch": 0.34161005230565744, "flos": 17530081747200.0, "grad_norm": 1.9699201402995807, "language_loss": 0.66949981, "learning_rate": 3.065147521219402e-06, "loss": 0.69158351, "num_input_tokens_seen": 61141155, "step": 2841, "time_per_iteration": 2.8205769062042236 }, { "auxiliary_loss_clip": 0.01182625, "auxiliary_loss_mlp": 0.01031486, "balance_loss_clip": 0.9794656, "balance_loss_mlp": 1.02277207, "epoch": 0.3417302951962965, "flos": 43650101566080.0, "grad_norm": 1.4419404186935787, "language_loss": 0.74394631, "learning_rate": 3.064488131621977e-06, "loss": 0.76608741, "num_input_tokens_seen": 61164480, "step": 2842, "time_per_iteration": 2.913639783859253 }, { "auxiliary_loss_clip": 0.01179187, "auxiliary_loss_mlp": 0.01025252, "balance_loss_clip": 1.0125159, "balance_loss_mlp": 1.01688385, "epoch": 0.3418505380869356, "flos": 30882207012480.0, "grad_norm": 1.73346942449734, "language_loss": 0.73910713, "learning_rate": 3.063828580542549e-06, "loss": 0.76115143, "num_input_tokens_seen": 61185675, "step": 2843, "time_per_iteration": 2.725569009780884 }, { "auxiliary_loss_clip": 0.01189355, "auxiliary_loss_mlp": 0.01030391, "balance_loss_clip": 0.97841042, "balance_loss_mlp": 1.02162886, "epoch": 0.3419707809775747, "flos": 19463871277440.0, "grad_norm": 1.8373097089084762, "language_loss": 0.7352888, "learning_rate": 3.0631688680811706e-06, "loss": 0.75748628, "num_input_tokens_seen": 61205300, "step": 2844, "time_per_iteration": 2.7055506706237793 }, { "auxiliary_loss_clip": 0.01190483, "auxiliary_loss_mlp": 0.0102981, "balance_loss_clip": 1.05339026, "balance_loss_mlp": 1.02147114, "epoch": 0.3420910238682138, "flos": 28727818104960.0, "grad_norm": 2.8329104727897065, "language_loss": 0.75899339, "learning_rate": 3.062508994337921e-06, "loss": 0.78119636, "num_input_tokens_seen": 61224905, "step": 2845, "time_per_iteration": 2.6683900356292725 }, { "auxiliary_loss_clip": 0.01190951, "auxiliary_loss_mlp": 0.01033136, "balance_loss_clip": 1.01523733, "balance_loss_mlp": 1.02437365, "epoch": 0.3422112667588529, "flos": 21397265758080.0, "grad_norm": 1.963013109894051, "language_loss": 0.7919395, "learning_rate": 3.0618489594129013e-06, "loss": 0.81418037, "num_input_tokens_seen": 61243045, "step": 2846, "time_per_iteration": 2.6125426292419434 }, { "auxiliary_loss_clip": 0.01192325, "auxiliary_loss_mlp": 0.01030017, "balance_loss_clip": 0.9402349, "balance_loss_mlp": 1.02139175, "epoch": 0.342331509649492, "flos": 13881450038400.0, "grad_norm": 2.15630861575301, "language_loss": 0.71296203, "learning_rate": 3.061188763406239e-06, "loss": 0.73518538, "num_input_tokens_seen": 61259190, "step": 2847, "time_per_iteration": 2.7245888710021973 }, { "auxiliary_loss_clip": 0.01182085, "auxiliary_loss_mlp": 0.01033988, "balance_loss_clip": 0.97604597, "balance_loss_mlp": 1.02537537, "epoch": 0.34245175254013105, "flos": 28621450955520.0, "grad_norm": 2.790019392231014, "language_loss": 0.8250283, "learning_rate": 3.060528406418085e-06, "loss": 0.84718901, "num_input_tokens_seen": 61279040, "step": 2848, "time_per_iteration": 2.7770674228668213 }, { "auxiliary_loss_clip": 0.01180929, "auxiliary_loss_mlp": 0.01032165, "balance_loss_clip": 0.97742522, "balance_loss_mlp": 1.02415419, "epoch": 0.34257199543077016, "flos": 34127058960000.0, "grad_norm": 1.645629043288728, "language_loss": 0.6164726, "learning_rate": 3.0598678885486145e-06, "loss": 0.63860357, "num_input_tokens_seen": 61301580, "step": 2849, "time_per_iteration": 2.7918365001678467 }, { "auxiliary_loss_clip": 0.01187311, "auxiliary_loss_mlp": 0.01124797, "balance_loss_clip": 0.93573964, "balance_loss_mlp": 0.0, "epoch": 0.34269223832140927, "flos": 19974018188160.0, "grad_norm": 1.6987572925884438, "language_loss": 0.74880242, "learning_rate": 3.0592072098980282e-06, "loss": 0.77192348, "num_input_tokens_seen": 61321240, "step": 2850, "time_per_iteration": 2.7908246517181396 }, { "auxiliary_loss_clip": 0.01185757, "auxiliary_loss_mlp": 0.01030181, "balance_loss_clip": 0.97827303, "balance_loss_mlp": 1.02216387, "epoch": 0.3428124812120483, "flos": 27235658292480.0, "grad_norm": 1.7947223030317419, "language_loss": 0.72783905, "learning_rate": 3.0585463705665514e-06, "loss": 0.74999845, "num_input_tokens_seen": 61341615, "step": 2851, "time_per_iteration": 3.7886574268341064 }, { "auxiliary_loss_clip": 0.011817, "auxiliary_loss_mlp": 0.01029374, "balance_loss_clip": 0.93525219, "balance_loss_mlp": 1.02057648, "epoch": 0.34293272410268744, "flos": 24570871079040.0, "grad_norm": 2.622305591067206, "language_loss": 0.70466375, "learning_rate": 3.0578853706544304e-06, "loss": 0.72677445, "num_input_tokens_seen": 61359005, "step": 2852, "time_per_iteration": 2.765206813812256 }, { "auxiliary_loss_clip": 0.01190275, "auxiliary_loss_mlp": 0.01124925, "balance_loss_clip": 0.94005102, "balance_loss_mlp": 0.0, "epoch": 0.34305296699332655, "flos": 21506865131520.0, "grad_norm": 1.9071829130202629, "language_loss": 0.65682602, "learning_rate": 3.0572242102619404e-06, "loss": 0.67997801, "num_input_tokens_seen": 61376160, "step": 2853, "time_per_iteration": 3.7343966960906982 }, { "auxiliary_loss_clip": 0.01188342, "auxiliary_loss_mlp": 0.01029559, "balance_loss_clip": 0.98014647, "balance_loss_mlp": 1.02124429, "epoch": 0.3431732098839656, "flos": 24056665931520.0, "grad_norm": 1.7592567709328246, "language_loss": 0.80754387, "learning_rate": 3.0565628894893784e-06, "loss": 0.829723, "num_input_tokens_seen": 61396795, "step": 2854, "time_per_iteration": 2.7536327838897705 }, { "auxiliary_loss_clip": 0.01182589, "auxiliary_loss_mlp": 0.01033334, "balance_loss_clip": 1.01605952, "balance_loss_mlp": 1.02532303, "epoch": 0.3432934527746047, "flos": 16800879744000.0, "grad_norm": 1.7336590891921337, "language_loss": 0.74647671, "learning_rate": 3.0559014084370655e-06, "loss": 0.76863587, "num_input_tokens_seen": 61415320, "step": 2855, "time_per_iteration": 2.6317756175994873 }, { "auxiliary_loss_clip": 0.01197428, "auxiliary_loss_mlp": 0.01029773, "balance_loss_clip": 0.97844619, "balance_loss_mlp": 1.02090406, "epoch": 0.34341369566524377, "flos": 23439720908160.0, "grad_norm": 1.609623635693904, "language_loss": 0.78798091, "learning_rate": 3.055239767205349e-06, "loss": 0.81025302, "num_input_tokens_seen": 61437070, "step": 2856, "time_per_iteration": 3.736506700515747 }, { "auxiliary_loss_clip": 0.01190651, "auxiliary_loss_mlp": 0.01032893, "balance_loss_clip": 1.01960039, "balance_loss_mlp": 1.02509654, "epoch": 0.3435339385558829, "flos": 17267466435840.0, "grad_norm": 1.885275476766569, "language_loss": 0.78469992, "learning_rate": 3.054577965894599e-06, "loss": 0.80693537, "num_input_tokens_seen": 61453215, "step": 2857, "time_per_iteration": 2.6204121112823486 }, { "auxiliary_loss_clip": 0.01200186, "auxiliary_loss_mlp": 0.01033533, "balance_loss_clip": 0.98210853, "balance_loss_mlp": 1.02525973, "epoch": 0.343654181446522, "flos": 22199366413440.0, "grad_norm": 1.4988087828745946, "language_loss": 0.70670664, "learning_rate": 3.0539160046052094e-06, "loss": 0.7290439, "num_input_tokens_seen": 61472915, "step": 2858, "time_per_iteration": 3.6098389625549316 }, { "auxiliary_loss_clip": 0.01180091, "auxiliary_loss_mlp": 0.01031183, "balance_loss_clip": 0.97553617, "balance_loss_mlp": 1.02202785, "epoch": 0.34377442433716104, "flos": 19901801894400.0, "grad_norm": 1.9098225356848268, "language_loss": 0.70486426, "learning_rate": 3.0532538834376003e-06, "loss": 0.72697699, "num_input_tokens_seen": 61492475, "step": 2859, "time_per_iteration": 2.6875314712524414 }, { "auxiliary_loss_clip": 0.01198368, "auxiliary_loss_mlp": 0.01033183, "balance_loss_clip": 1.01890707, "balance_loss_mlp": 1.02428997, "epoch": 0.34389466722780015, "flos": 22197678474240.0, "grad_norm": 1.7528416447675246, "language_loss": 0.78112686, "learning_rate": 3.0525916024922143e-06, "loss": 0.80344242, "num_input_tokens_seen": 61511660, "step": 2860, "time_per_iteration": 2.670952796936035 }, { "auxiliary_loss_clip": 0.0118638, "auxiliary_loss_mlp": 0.01035028, "balance_loss_clip": 0.97793138, "balance_loss_mlp": 1.025962, "epoch": 0.34401491011843927, "flos": 18624567110400.0, "grad_norm": 2.458798305504796, "language_loss": 0.84263229, "learning_rate": 3.0519291618695193e-06, "loss": 0.86484641, "num_input_tokens_seen": 61529060, "step": 2861, "time_per_iteration": 2.698072910308838 }, { "auxiliary_loss_clip": 0.01177127, "auxiliary_loss_mlp": 0.01031552, "balance_loss_clip": 0.93612158, "balance_loss_mlp": 1.02320755, "epoch": 0.3441351530090783, "flos": 17858197509120.0, "grad_norm": 1.5613773096458692, "language_loss": 0.75583607, "learning_rate": 3.0512665616700065e-06, "loss": 0.77792287, "num_input_tokens_seen": 61548125, "step": 2862, "time_per_iteration": 2.7644801139831543 }, { "auxiliary_loss_clip": 0.01177851, "auxiliary_loss_mlp": 0.01036292, "balance_loss_clip": 0.89846063, "balance_loss_mlp": 1.02773261, "epoch": 0.34425539589971743, "flos": 23112754381440.0, "grad_norm": 1.8146087874305605, "language_loss": 0.89185101, "learning_rate": 3.0506038019941933e-06, "loss": 0.9139924, "num_input_tokens_seen": 61568135, "step": 2863, "time_per_iteration": 2.84908127784729 }, { "auxiliary_loss_clip": 0.01188422, "auxiliary_loss_mlp": 0.0103719, "balance_loss_clip": 0.94136405, "balance_loss_mlp": 1.02822518, "epoch": 0.34437563879035654, "flos": 21907699977600.0, "grad_norm": 2.53256553292484, "language_loss": 0.6789788, "learning_rate": 3.049940882942617e-06, "loss": 0.70123482, "num_input_tokens_seen": 61586920, "step": 2864, "time_per_iteration": 2.7951271533966064 }, { "auxiliary_loss_clip": 0.01195991, "auxiliary_loss_mlp": 0.01036583, "balance_loss_clip": 1.05733252, "balance_loss_mlp": 1.02781534, "epoch": 0.3444958816809956, "flos": 23076915586560.0, "grad_norm": 1.7750053401691834, "language_loss": 0.80230319, "learning_rate": 3.0492778046158448e-06, "loss": 0.82462895, "num_input_tokens_seen": 61608340, "step": 2865, "time_per_iteration": 2.6320478916168213 }, { "auxiliary_loss_clip": 0.01188882, "auxiliary_loss_mlp": 0.01027032, "balance_loss_clip": 1.01875257, "balance_loss_mlp": 1.01873517, "epoch": 0.3446161245716347, "flos": 21908633731200.0, "grad_norm": 2.5593591436404313, "language_loss": 0.76711375, "learning_rate": 3.0486145671144633e-06, "loss": 0.7892729, "num_input_tokens_seen": 61628130, "step": 2866, "time_per_iteration": 2.648137092590332 }, { "auxiliary_loss_clip": 0.01169309, "auxiliary_loss_mlp": 0.01039505, "balance_loss_clip": 0.86140466, "balance_loss_mlp": 1.0302608, "epoch": 0.3447363674622738, "flos": 25112834461440.0, "grad_norm": 3.1064172387275146, "language_loss": 0.77136707, "learning_rate": 3.047951170539086e-06, "loss": 0.79345524, "num_input_tokens_seen": 61647755, "step": 2867, "time_per_iteration": 2.8084819316864014 }, { "auxiliary_loss_clip": 0.01189674, "auxiliary_loss_mlp": 0.01032075, "balance_loss_clip": 0.94367683, "balance_loss_mlp": 1.02449894, "epoch": 0.3448566103529129, "flos": 11984684451840.0, "grad_norm": 1.8153344990217986, "language_loss": 0.84210706, "learning_rate": 3.047287614990349e-06, "loss": 0.86432457, "num_input_tokens_seen": 61665675, "step": 2868, "time_per_iteration": 2.6852328777313232 }, { "auxiliary_loss_clip": 0.01180543, "auxiliary_loss_mlp": 0.01029512, "balance_loss_clip": 0.97666013, "balance_loss_mlp": 1.02077961, "epoch": 0.344976853243552, "flos": 40187882465280.0, "grad_norm": 2.399187604198925, "language_loss": 0.62257099, "learning_rate": 3.046623900568914e-06, "loss": 0.6446715, "num_input_tokens_seen": 61688240, "step": 2869, "time_per_iteration": 2.799711227416992 }, { "auxiliary_loss_clip": 0.0118949, "auxiliary_loss_mlp": 0.01035134, "balance_loss_clip": 0.97941434, "balance_loss_mlp": 1.02535248, "epoch": 0.34509709613419104, "flos": 28723652127360.0, "grad_norm": 2.4881347606537876, "language_loss": 0.70038855, "learning_rate": 3.045960027375465e-06, "loss": 0.72263479, "num_input_tokens_seen": 61706075, "step": 2870, "time_per_iteration": 2.7321197986602783 }, { "auxiliary_loss_clip": 0.01199442, "auxiliary_loss_mlp": 0.01034662, "balance_loss_clip": 1.0177114, "balance_loss_mlp": 1.02581096, "epoch": 0.34521733902483015, "flos": 29967597982080.0, "grad_norm": 2.4582376328478497, "language_loss": 0.83009481, "learning_rate": 3.045295995510711e-06, "loss": 0.85243589, "num_input_tokens_seen": 61723045, "step": 2871, "time_per_iteration": 2.7051820755004883 }, { "auxiliary_loss_clip": 0.01188561, "auxiliary_loss_mlp": 0.0102611, "balance_loss_clip": 0.97961253, "balance_loss_mlp": 1.01845646, "epoch": 0.34533758191546926, "flos": 27923059843200.0, "grad_norm": 1.6268562553010364, "language_loss": 0.73864496, "learning_rate": 3.0446318050753865e-06, "loss": 0.76079166, "num_input_tokens_seen": 61743525, "step": 2872, "time_per_iteration": 2.72733736038208 }, { "auxiliary_loss_clip": 0.01184163, "auxiliary_loss_mlp": 0.01040741, "balance_loss_clip": 1.01575959, "balance_loss_mlp": 1.03259289, "epoch": 0.3454578248061083, "flos": 27125879351040.0, "grad_norm": 2.000895975613671, "language_loss": 0.77639508, "learning_rate": 3.0439674561702474e-06, "loss": 0.79864419, "num_input_tokens_seen": 61763025, "step": 2873, "time_per_iteration": 2.676266670227051 }, { "auxiliary_loss_clip": 0.0118828, "auxiliary_loss_mlp": 0.01030312, "balance_loss_clip": 1.01850533, "balance_loss_mlp": 1.02224708, "epoch": 0.3455780676967474, "flos": 19024899166080.0, "grad_norm": 2.2163317122984294, "language_loss": 0.88165349, "learning_rate": 3.043302948896076e-06, "loss": 0.90383941, "num_input_tokens_seen": 61781630, "step": 2874, "time_per_iteration": 2.7197325229644775 }, { "auxiliary_loss_clip": 0.01176054, "auxiliary_loss_mlp": 0.01026441, "balance_loss_clip": 0.90081042, "balance_loss_mlp": 1.01751792, "epoch": 0.34569831058738654, "flos": 34496005507200.0, "grad_norm": 2.710243606612998, "language_loss": 0.6050328, "learning_rate": 3.0426382833536756e-06, "loss": 0.62705773, "num_input_tokens_seen": 61804985, "step": 2875, "time_per_iteration": 2.910104990005493 }, { "auxiliary_loss_clip": 0.01180053, "auxiliary_loss_mlp": 0.0103207, "balance_loss_clip": 0.93741071, "balance_loss_mlp": 1.02327871, "epoch": 0.3458185534780256, "flos": 31138681098240.0, "grad_norm": 1.7566477635141584, "language_loss": 0.77246332, "learning_rate": 3.041973459643877e-06, "loss": 0.79458463, "num_input_tokens_seen": 61824440, "step": 2876, "time_per_iteration": 2.7559571266174316 }, { "auxiliary_loss_clip": 0.01178025, "auxiliary_loss_mlp": 0.01032406, "balance_loss_clip": 0.8985433, "balance_loss_mlp": 1.02365029, "epoch": 0.3459387963686647, "flos": 32452508862720.0, "grad_norm": 1.8292363553982485, "language_loss": 0.67160439, "learning_rate": 3.0413084778675334e-06, "loss": 0.69370878, "num_input_tokens_seen": 61845690, "step": 2877, "time_per_iteration": 3.88474440574646 }, { "auxiliary_loss_clip": 0.01183285, "auxiliary_loss_mlp": 0.01124756, "balance_loss_clip": 0.97537625, "balance_loss_mlp": 0.0, "epoch": 0.3460590392593038, "flos": 24675658030080.0, "grad_norm": 1.8564303895772631, "language_loss": 0.84068954, "learning_rate": 3.0406433381255214e-06, "loss": 0.86376995, "num_input_tokens_seen": 61863725, "step": 2878, "time_per_iteration": 2.714338779449463 }, { "auxiliary_loss_clip": 0.0119232, "auxiliary_loss_mlp": 0.01032318, "balance_loss_clip": 1.02058232, "balance_loss_mlp": 1.02379417, "epoch": 0.34617928214994287, "flos": 18807316531200.0, "grad_norm": 2.146256465928803, "language_loss": 0.82451135, "learning_rate": 3.0399780405187425e-06, "loss": 0.84675771, "num_input_tokens_seen": 61882720, "step": 2879, "time_per_iteration": 3.7269952297210693 }, { "auxiliary_loss_clip": 0.0118771, "auxiliary_loss_mlp": 0.01031529, "balance_loss_clip": 1.01839578, "balance_loss_mlp": 1.02342844, "epoch": 0.346299525040582, "flos": 24857653265280.0, "grad_norm": 1.737051543442168, "language_loss": 0.78536654, "learning_rate": 3.0393125851481216e-06, "loss": 0.80755895, "num_input_tokens_seen": 61902595, "step": 2880, "time_per_iteration": 2.7283761501312256 }, { "auxiliary_loss_clip": 0.01180462, "auxiliary_loss_mlp": 0.01031825, "balance_loss_clip": 0.93997514, "balance_loss_mlp": 1.02367687, "epoch": 0.3464197679312211, "flos": 16434914025600.0, "grad_norm": 1.9334141523138857, "language_loss": 0.86595333, "learning_rate": 3.038646972114608e-06, "loss": 0.88807619, "num_input_tokens_seen": 61918920, "step": 2881, "time_per_iteration": 2.73494029045105 }, { "auxiliary_loss_clip": 0.01187633, "auxiliary_loss_mlp": 0.01032667, "balance_loss_clip": 0.9439733, "balance_loss_mlp": 1.02434063, "epoch": 0.34654001082186014, "flos": 22382474970240.0, "grad_norm": 1.8865114956218836, "language_loss": 0.67479724, "learning_rate": 3.037981201519174e-06, "loss": 0.69700027, "num_input_tokens_seen": 61939520, "step": 2882, "time_per_iteration": 3.763049840927124 }, { "auxiliary_loss_clip": 0.01192041, "auxiliary_loss_mlp": 0.01027733, "balance_loss_clip": 1.02069926, "balance_loss_mlp": 1.01948369, "epoch": 0.34666025371249926, "flos": 19573901614080.0, "grad_norm": 1.976670176631696, "language_loss": 0.71514535, "learning_rate": 3.0373152734628175e-06, "loss": 0.73734307, "num_input_tokens_seen": 61957800, "step": 2883, "time_per_iteration": 2.6552491188049316 }, { "auxiliary_loss_clip": 0.01185601, "auxiliary_loss_mlp": 0.01034639, "balance_loss_clip": 1.01735544, "balance_loss_mlp": 1.02607989, "epoch": 0.34678049660313837, "flos": 15267637751040.0, "grad_norm": 1.9821138704931023, "language_loss": 0.75940228, "learning_rate": 3.0366491880465584e-06, "loss": 0.78160477, "num_input_tokens_seen": 61975820, "step": 2884, "time_per_iteration": 3.600180149078369 }, { "auxiliary_loss_clip": 0.01199977, "auxiliary_loss_mlp": 0.01032051, "balance_loss_clip": 1.06111479, "balance_loss_mlp": 1.02331853, "epoch": 0.3469007394937774, "flos": 21181550630400.0, "grad_norm": 1.5748510836492629, "language_loss": 0.82145953, "learning_rate": 3.035982945371443e-06, "loss": 0.8437798, "num_input_tokens_seen": 61997515, "step": 2885, "time_per_iteration": 2.6471962928771973 }, { "auxiliary_loss_clip": 0.01200483, "auxiliary_loss_mlp": 0.01032319, "balance_loss_clip": 0.98191047, "balance_loss_mlp": 1.02372408, "epoch": 0.34702098238441653, "flos": 22375471818240.0, "grad_norm": 1.8571929035565276, "language_loss": 0.85180974, "learning_rate": 3.035316545538537e-06, "loss": 0.87413776, "num_input_tokens_seen": 62016310, "step": 2886, "time_per_iteration": 2.7474863529205322 }, { "auxiliary_loss_clip": 0.0118778, "auxiliary_loss_mlp": 0.01029997, "balance_loss_clip": 0.98191494, "balance_loss_mlp": 1.02207565, "epoch": 0.3471412252750556, "flos": 22929430343040.0, "grad_norm": 2.025393527342934, "language_loss": 0.7918424, "learning_rate": 3.034649988648935e-06, "loss": 0.81402016, "num_input_tokens_seen": 62036075, "step": 2887, "time_per_iteration": 2.7694313526153564 }, { "auxiliary_loss_clip": 0.01194147, "auxiliary_loss_mlp": 0.01031459, "balance_loss_clip": 0.98053336, "balance_loss_mlp": 1.02299511, "epoch": 0.3472614681656947, "flos": 21324259365120.0, "grad_norm": 1.7026154155836266, "language_loss": 0.80446655, "learning_rate": 3.033983274803752e-06, "loss": 0.82672262, "num_input_tokens_seen": 62055865, "step": 2888, "time_per_iteration": 2.6517586708068848 }, { "auxiliary_loss_clip": 0.01189503, "auxiliary_loss_mlp": 0.01027426, "balance_loss_clip": 0.98094308, "balance_loss_mlp": 1.01862884, "epoch": 0.3473817110563338, "flos": 23475739271040.0, "grad_norm": 2.058356437950254, "language_loss": 0.72505486, "learning_rate": 3.0333164041041283e-06, "loss": 0.74722409, "num_input_tokens_seen": 62072180, "step": 2889, "time_per_iteration": 2.7598323822021484 }, { "auxiliary_loss_clip": 0.01189421, "auxiliary_loss_mlp": 0.01033144, "balance_loss_clip": 0.86550403, "balance_loss_mlp": 1.02466798, "epoch": 0.34750195394697286, "flos": 22346025644160.0, "grad_norm": 2.0089877297984406, "language_loss": 0.72075987, "learning_rate": 3.032649376651228e-06, "loss": 0.74298555, "num_input_tokens_seen": 62091600, "step": 2890, "time_per_iteration": 2.8106751441955566 }, { "auxiliary_loss_clip": 0.01189889, "auxiliary_loss_mlp": 0.0102746, "balance_loss_clip": 0.94028205, "balance_loss_mlp": 1.01844823, "epoch": 0.347622196837612, "flos": 29095004885760.0, "grad_norm": 1.8744840760266814, "language_loss": 0.7567665, "learning_rate": 3.031982192546238e-06, "loss": 0.77893996, "num_input_tokens_seen": 62114695, "step": 2891, "time_per_iteration": 2.776237964630127 }, { "auxiliary_loss_clip": 0.01193739, "auxiliary_loss_mlp": 0.01033062, "balance_loss_clip": 1.01891398, "balance_loss_mlp": 1.02417529, "epoch": 0.3477424397282511, "flos": 22455732758400.0, "grad_norm": 2.1410577525452092, "language_loss": 0.94699192, "learning_rate": 3.0313148518903696e-06, "loss": 0.96925986, "num_input_tokens_seen": 62134520, "step": 2892, "time_per_iteration": 2.6915552616119385 }, { "auxiliary_loss_clip": 0.01194687, "auxiliary_loss_mlp": 0.01027791, "balance_loss_clip": 0.98098886, "balance_loss_mlp": 1.01907659, "epoch": 0.34786268261889014, "flos": 15778790242560.0, "grad_norm": 1.993277395295914, "language_loss": 0.81383705, "learning_rate": 3.030647354784859e-06, "loss": 0.83606178, "num_input_tokens_seen": 62151560, "step": 2893, "time_per_iteration": 2.606861114501953 }, { "auxiliary_loss_clip": 0.01184267, "auxiliary_loss_mlp": 0.01027217, "balance_loss_clip": 0.93936521, "balance_loss_mlp": 1.01871109, "epoch": 0.34798292550952925, "flos": 20777627214720.0, "grad_norm": 1.7511853478392008, "language_loss": 0.77260953, "learning_rate": 3.029979701330964e-06, "loss": 0.79472435, "num_input_tokens_seen": 62170985, "step": 2894, "time_per_iteration": 2.715524911880493 }, { "auxiliary_loss_clip": 0.01197784, "auxiliary_loss_mlp": 0.01024479, "balance_loss_clip": 0.98175597, "balance_loss_mlp": 1.01623547, "epoch": 0.34810316840016836, "flos": 19937820257280.0, "grad_norm": 2.3491136104010106, "language_loss": 0.79895616, "learning_rate": 3.029311891629966e-06, "loss": 0.82117873, "num_input_tokens_seen": 62189440, "step": 2895, "time_per_iteration": 2.7215676307678223 }, { "auxiliary_loss_clip": 0.01190331, "auxiliary_loss_mlp": 0.01027586, "balance_loss_clip": 0.98242694, "balance_loss_mlp": 1.01903248, "epoch": 0.3482234112908074, "flos": 23623296341760.0, "grad_norm": 1.7814952582754384, "language_loss": 0.74155647, "learning_rate": 3.0286439257831744e-06, "loss": 0.76373565, "num_input_tokens_seen": 62208910, "step": 2896, "time_per_iteration": 2.780669927597046 }, { "auxiliary_loss_clip": 0.01199811, "auxiliary_loss_mlp": 0.01031478, "balance_loss_clip": 1.05868077, "balance_loss_mlp": 1.02235842, "epoch": 0.3483436541814465, "flos": 23986712194560.0, "grad_norm": 1.9366773024145534, "language_loss": 0.71614468, "learning_rate": 3.0279758038919156e-06, "loss": 0.73845756, "num_input_tokens_seen": 62227135, "step": 2897, "time_per_iteration": 2.641798734664917 }, { "auxiliary_loss_clip": 0.01197414, "auxiliary_loss_mlp": 0.01034762, "balance_loss_clip": 1.0208571, "balance_loss_mlp": 1.02625036, "epoch": 0.34846389707208564, "flos": 22638338524800.0, "grad_norm": 2.397472983705064, "language_loss": 0.78634524, "learning_rate": 3.0273075260575455e-06, "loss": 0.80866694, "num_input_tokens_seen": 62246035, "step": 2898, "time_per_iteration": 2.6865434646606445 }, { "auxiliary_loss_clip": 0.01192277, "auxiliary_loss_mlp": 0.0103088, "balance_loss_clip": 0.97937083, "balance_loss_mlp": 1.02106273, "epoch": 0.3485841399627247, "flos": 21792857218560.0, "grad_norm": 1.9621970208713873, "language_loss": 0.81123698, "learning_rate": 3.0266390923814396e-06, "loss": 0.83346856, "num_input_tokens_seen": 62264095, "step": 2899, "time_per_iteration": 2.6833415031433105 }, { "auxiliary_loss_clip": 0.01199768, "auxiliary_loss_mlp": 0.01033672, "balance_loss_clip": 0.98445636, "balance_loss_mlp": 1.02420664, "epoch": 0.3487043828533638, "flos": 17019036996480.0, "grad_norm": 1.861680652617834, "language_loss": 0.8193481, "learning_rate": 3.0259705029650008e-06, "loss": 0.84168249, "num_input_tokens_seen": 62282025, "step": 2900, "time_per_iteration": 2.689523935317993 }, { "auxiliary_loss_clip": 0.01193428, "auxiliary_loss_mlp": 0.01029638, "balance_loss_clip": 1.01773119, "balance_loss_mlp": 1.02114391, "epoch": 0.34882462574400286, "flos": 22601135013120.0, "grad_norm": 1.8945358808106005, "language_loss": 0.72878218, "learning_rate": 3.025301757909652e-06, "loss": 0.7510128, "num_input_tokens_seen": 62302220, "step": 2901, "time_per_iteration": 2.7070865631103516 }, { "auxiliary_loss_clip": 0.01194093, "auxiliary_loss_mlp": 0.0112546, "balance_loss_clip": 0.94255847, "balance_loss_mlp": 0.0, "epoch": 0.34894486863464197, "flos": 29861518141440.0, "grad_norm": 1.5944273106649576, "language_loss": 0.80471665, "learning_rate": 3.024632857316842e-06, "loss": 0.82791221, "num_input_tokens_seen": 62323535, "step": 2902, "time_per_iteration": 2.7696070671081543 }, { "auxiliary_loss_clip": 0.01196985, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 1.02161992, "balance_loss_mlp": 1.02321696, "epoch": 0.3490651115252811, "flos": 22122265870080.0, "grad_norm": 5.903824093328677, "language_loss": 0.77199233, "learning_rate": 3.0239638012880412e-06, "loss": 0.79428178, "num_input_tokens_seen": 62343430, "step": 2903, "time_per_iteration": 3.6566646099090576 }, { "auxiliary_loss_clip": 0.01179965, "auxiliary_loss_mlp": 0.01034069, "balance_loss_clip": 0.8997696, "balance_loss_mlp": 1.02489555, "epoch": 0.34918535441592014, "flos": 12676682943360.0, "grad_norm": 2.5527953173348124, "language_loss": 0.81357813, "learning_rate": 3.0232945899247466e-06, "loss": 0.83571845, "num_input_tokens_seen": 62360365, "step": 2904, "time_per_iteration": 2.7071592807769775 }, { "auxiliary_loss_clip": 0.01195327, "auxiliary_loss_mlp": 0.01031454, "balance_loss_clip": 1.01922393, "balance_loss_mlp": 1.02238822, "epoch": 0.34930559730655925, "flos": 23185617120000.0, "grad_norm": 1.8346881476618184, "language_loss": 0.77600968, "learning_rate": 3.022625223328476e-06, "loss": 0.7982775, "num_input_tokens_seen": 62382105, "step": 2905, "time_per_iteration": 3.6917805671691895 }, { "auxiliary_loss_clip": 0.01203394, "auxiliary_loss_mlp": 0.01033528, "balance_loss_clip": 1.02179909, "balance_loss_mlp": 1.02518344, "epoch": 0.34942584019719836, "flos": 22855023319680.0, "grad_norm": 1.3993852475733077, "language_loss": 0.68899173, "learning_rate": 3.0219557016007723e-06, "loss": 0.71136093, "num_input_tokens_seen": 62402235, "step": 2906, "time_per_iteration": 2.7105045318603516 }, { "auxiliary_loss_clip": 0.01190759, "auxiliary_loss_mlp": 0.0103075, "balance_loss_clip": 1.01908267, "balance_loss_mlp": 1.0218327, "epoch": 0.3495460830878374, "flos": 24426043441920.0, "grad_norm": 1.8004693422804388, "language_loss": 0.69733763, "learning_rate": 3.021286024843202e-06, "loss": 0.7195527, "num_input_tokens_seen": 62420430, "step": 2907, "time_per_iteration": 2.763500690460205 }, { "auxiliary_loss_clip": 0.01104608, "auxiliary_loss_mlp": 0.01004171, "balance_loss_clip": 1.03546703, "balance_loss_mlp": 1.00172734, "epoch": 0.3496663259784765, "flos": 70008749389440.0, "grad_norm": 1.0658456632544175, "language_loss": 0.64772034, "learning_rate": 3.0206161931573526e-06, "loss": 0.66880816, "num_input_tokens_seen": 62472980, "step": 2908, "time_per_iteration": 4.023785591125488 }, { "auxiliary_loss_clip": 0.01186158, "auxiliary_loss_mlp": 0.01032141, "balance_loss_clip": 0.97598422, "balance_loss_mlp": 1.02399302, "epoch": 0.34978656886911563, "flos": 28692805322880.0, "grad_norm": 1.5077575481557814, "language_loss": 0.92768323, "learning_rate": 3.0199462066448388e-06, "loss": 0.94986629, "num_input_tokens_seen": 62495175, "step": 2909, "time_per_iteration": 2.795625686645508 }, { "auxiliary_loss_clip": 0.01198347, "auxiliary_loss_mlp": 0.01034204, "balance_loss_clip": 1.02088165, "balance_loss_mlp": 1.02545965, "epoch": 0.3499068117597547, "flos": 21142156389120.0, "grad_norm": 1.7161567350075209, "language_loss": 0.69267935, "learning_rate": 3.019276065407296e-06, "loss": 0.71500486, "num_input_tokens_seen": 62514295, "step": 2910, "time_per_iteration": 3.6577138900756836 }, { "auxiliary_loss_clip": 0.01188629, "auxiliary_loss_mlp": 0.01029551, "balance_loss_clip": 0.90156162, "balance_loss_mlp": 1.02094436, "epoch": 0.3500270546503938, "flos": 22782699285120.0, "grad_norm": 2.6839703681057556, "language_loss": 0.80813903, "learning_rate": 3.018605769546385e-06, "loss": 0.83032084, "num_input_tokens_seen": 62534850, "step": 2911, "time_per_iteration": 2.8190789222717285 }, { "auxiliary_loss_clip": 0.01190507, "auxiliary_loss_mlp": 0.0103394, "balance_loss_clip": 1.01673412, "balance_loss_mlp": 1.02451658, "epoch": 0.3501472975410329, "flos": 22894058424960.0, "grad_norm": 1.7523565011795224, "language_loss": 0.79844552, "learning_rate": 3.017935319163788e-06, "loss": 0.82069004, "num_input_tokens_seen": 62553810, "step": 2912, "time_per_iteration": 2.6695330142974854 }, { "auxiliary_loss_clip": 0.01196034, "auxiliary_loss_mlp": 0.01027251, "balance_loss_clip": 1.01939154, "balance_loss_mlp": 1.01839364, "epoch": 0.35026754043167196, "flos": 25446588658560.0, "grad_norm": 1.5967259542996386, "language_loss": 0.70825517, "learning_rate": 3.017264714361213e-06, "loss": 0.73048806, "num_input_tokens_seen": 62573460, "step": 2913, "time_per_iteration": 2.7632429599761963 }, { "auxiliary_loss_clip": 0.01192107, "auxiliary_loss_mlp": 0.01125811, "balance_loss_clip": 0.98067743, "balance_loss_mlp": 0.0, "epoch": 0.3503877833223111, "flos": 19573757959680.0, "grad_norm": 1.8712839548820945, "language_loss": 0.82268488, "learning_rate": 3.016593955240389e-06, "loss": 0.84586406, "num_input_tokens_seen": 62592150, "step": 2914, "time_per_iteration": 2.6838033199310303 }, { "auxiliary_loss_clip": 0.01105566, "auxiliary_loss_mlp": 0.01002874, "balance_loss_clip": 1.00068998, "balance_loss_mlp": 1.00037086, "epoch": 0.3505080262129502, "flos": 65072075880960.0, "grad_norm": 2.2499433406247924, "language_loss": 0.63681817, "learning_rate": 3.015923041903071e-06, "loss": 0.65790254, "num_input_tokens_seen": 62658275, "step": 2915, "time_per_iteration": 3.3208224773406982 }, { "auxiliary_loss_clip": 0.01197101, "auxiliary_loss_mlp": 0.01032996, "balance_loss_clip": 1.02360797, "balance_loss_mlp": 1.02476144, "epoch": 0.35062826910358924, "flos": 29314562768640.0, "grad_norm": 1.862883962031307, "language_loss": 0.83516216, "learning_rate": 3.0152519744510347e-06, "loss": 0.85746306, "num_input_tokens_seen": 62678075, "step": 2916, "time_per_iteration": 2.7494986057281494 }, { "auxiliary_loss_clip": 0.01195083, "auxiliary_loss_mlp": 0.01035397, "balance_loss_clip": 0.94231582, "balance_loss_mlp": 1.02576506, "epoch": 0.35074851199422835, "flos": 23987717775360.0, "grad_norm": 1.7431243809051458, "language_loss": 0.82883197, "learning_rate": 3.014580752986081e-06, "loss": 0.8511368, "num_input_tokens_seen": 62696950, "step": 2917, "time_per_iteration": 2.751988172531128 }, { "auxiliary_loss_clip": 0.01191583, "auxiliary_loss_mlp": 0.01039831, "balance_loss_clip": 0.90476632, "balance_loss_mlp": 1.03076482, "epoch": 0.3508687548848674, "flos": 15224436668160.0, "grad_norm": 2.2840275493393736, "language_loss": 0.7838369, "learning_rate": 3.0139093776100345e-06, "loss": 0.80615103, "num_input_tokens_seen": 62713540, "step": 2918, "time_per_iteration": 2.8275411128997803 }, { "auxiliary_loss_clip": 0.01192837, "auxiliary_loss_mlp": 0.0102856, "balance_loss_clip": 1.0565691, "balance_loss_mlp": 1.02027488, "epoch": 0.3509889977755065, "flos": 21361750185600.0, "grad_norm": 1.8055562380016188, "language_loss": 0.75472301, "learning_rate": 3.013237848424741e-06, "loss": 0.77693695, "num_input_tokens_seen": 62732925, "step": 2919, "time_per_iteration": 2.6319475173950195 }, { "auxiliary_loss_clip": 0.01197051, "auxiliary_loss_mlp": 0.01029697, "balance_loss_clip": 0.98244792, "balance_loss_mlp": 1.02082241, "epoch": 0.35110924066614563, "flos": 19135360465920.0, "grad_norm": 2.1349393552204265, "language_loss": 0.75203383, "learning_rate": 3.012566165532072e-06, "loss": 0.77430129, "num_input_tokens_seen": 62751715, "step": 2920, "time_per_iteration": 2.674933433532715 }, { "auxiliary_loss_clip": 0.01192188, "auxiliary_loss_mlp": 0.01028578, "balance_loss_clip": 0.86459386, "balance_loss_mlp": 1.01978636, "epoch": 0.3512294835567847, "flos": 21980885938560.0, "grad_norm": 1.892801017126357, "language_loss": 0.77127671, "learning_rate": 3.0118943290339207e-06, "loss": 0.79348439, "num_input_tokens_seen": 62771925, "step": 2921, "time_per_iteration": 2.795994520187378 }, { "auxiliary_loss_clip": 0.01175606, "auxiliary_loss_mlp": 0.01034833, "balance_loss_clip": 0.93803048, "balance_loss_mlp": 1.02653599, "epoch": 0.3513497264474238, "flos": 17817294896640.0, "grad_norm": 1.8305720080857248, "language_loss": 0.68329328, "learning_rate": 3.011222339032204e-06, "loss": 0.70539761, "num_input_tokens_seen": 62790075, "step": 2922, "time_per_iteration": 2.816646099090576 }, { "auxiliary_loss_clip": 0.01196315, "auxiliary_loss_mlp": 0.01029331, "balance_loss_clip": 1.06009221, "balance_loss_mlp": 1.02077723, "epoch": 0.3514699693380629, "flos": 26943417239040.0, "grad_norm": 1.6844116849903192, "language_loss": 0.69572794, "learning_rate": 3.0105501956288626e-06, "loss": 0.71798432, "num_input_tokens_seen": 62810545, "step": 2923, "time_per_iteration": 2.7026569843292236 }, { "auxiliary_loss_clip": 0.01198544, "auxiliary_loss_mlp": 0.01029197, "balance_loss_clip": 1.01944041, "balance_loss_mlp": 1.02012467, "epoch": 0.35159021222870196, "flos": 15267565923840.0, "grad_norm": 2.278845098824917, "language_loss": 0.72757155, "learning_rate": 3.0098778989258602e-06, "loss": 0.7498489, "num_input_tokens_seen": 62829155, "step": 2924, "time_per_iteration": 2.6340079307556152 }, { "auxiliary_loss_clip": 0.01184331, "auxiliary_loss_mlp": 0.01030949, "balance_loss_clip": 0.94151866, "balance_loss_mlp": 1.02176428, "epoch": 0.35171045511934107, "flos": 13984154000640.0, "grad_norm": 2.207724220000312, "language_loss": 0.88209403, "learning_rate": 3.009205449025183e-06, "loss": 0.90424687, "num_input_tokens_seen": 62845350, "step": 2925, "time_per_iteration": 2.754246234893799 }, { "auxiliary_loss_clip": 0.01185144, "auxiliary_loss_mlp": 0.01038706, "balance_loss_clip": 0.93931854, "balance_loss_mlp": 1.03002119, "epoch": 0.3518306980099802, "flos": 14283434119680.0, "grad_norm": 1.7399556403901721, "language_loss": 0.63066298, "learning_rate": 3.008532846028842e-06, "loss": 0.65290141, "num_input_tokens_seen": 62862110, "step": 2926, "time_per_iteration": 2.6704344749450684 }, { "auxiliary_loss_clip": 0.01200875, "auxiliary_loss_mlp": 0.01032782, "balance_loss_clip": 1.06174505, "balance_loss_mlp": 1.02417517, "epoch": 0.35195094090061924, "flos": 27052872958080.0, "grad_norm": 3.3891216048678587, "language_loss": 0.71935737, "learning_rate": 3.0078600900388694e-06, "loss": 0.74169385, "num_input_tokens_seen": 62882415, "step": 2927, "time_per_iteration": 2.696444034576416 }, { "auxiliary_loss_clip": 0.01175855, "auxiliary_loss_mlp": 0.01031657, "balance_loss_clip": 0.93679368, "balance_loss_mlp": 1.02278161, "epoch": 0.35207118379125835, "flos": 25629266252160.0, "grad_norm": 2.3117594474003047, "language_loss": 0.74259222, "learning_rate": 3.007187181157323e-06, "loss": 0.76466733, "num_input_tokens_seen": 62902425, "step": 2928, "time_per_iteration": 2.7571957111358643 }, { "auxiliary_loss_clip": 0.0117283, "auxiliary_loss_mlp": 0.0103763, "balance_loss_clip": 0.86048746, "balance_loss_mlp": 1.0289458, "epoch": 0.35219142668189746, "flos": 18004713085440.0, "grad_norm": 2.854338157690754, "language_loss": 0.6793232, "learning_rate": 3.006514119486282e-06, "loss": 0.70142782, "num_input_tokens_seen": 62919255, "step": 2929, "time_per_iteration": 2.8547556400299072 }, { "auxiliary_loss_clip": 0.0118277, "auxiliary_loss_mlp": 0.01028978, "balance_loss_clip": 0.94220161, "balance_loss_mlp": 1.02019775, "epoch": 0.3523116695725365, "flos": 14028109269120.0, "grad_norm": 1.7189962070515044, "language_loss": 0.69415009, "learning_rate": 3.005840905127849e-06, "loss": 0.71626747, "num_input_tokens_seen": 62936160, "step": 2930, "time_per_iteration": 4.885807752609253 }, { "auxiliary_loss_clip": 0.01201115, "auxiliary_loss_mlp": 0.01030152, "balance_loss_clip": 1.06350386, "balance_loss_mlp": 1.02174783, "epoch": 0.3524319124631756, "flos": 21433966479360.0, "grad_norm": 2.0994041779042814, "language_loss": 0.86873299, "learning_rate": 3.0051675381841516e-06, "loss": 0.89104569, "num_input_tokens_seen": 62953470, "step": 2931, "time_per_iteration": 2.667719841003418 }, { "auxiliary_loss_clip": 0.01182917, "auxiliary_loss_mlp": 0.01125315, "balance_loss_clip": 0.82367504, "balance_loss_mlp": 0.0, "epoch": 0.3525521553538147, "flos": 26322773114880.0, "grad_norm": 1.5174678303972629, "language_loss": 0.76617205, "learning_rate": 3.0044940187573363e-06, "loss": 0.78925437, "num_input_tokens_seen": 62974480, "step": 2932, "time_per_iteration": 2.883033514022827 }, { "auxiliary_loss_clip": 0.01196581, "auxiliary_loss_mlp": 0.01028649, "balance_loss_clip": 1.0198226, "balance_loss_mlp": 1.02005363, "epoch": 0.3526723982444538, "flos": 21543314457600.0, "grad_norm": 1.995838612143716, "language_loss": 0.65247703, "learning_rate": 3.003820346949578e-06, "loss": 0.67472935, "num_input_tokens_seen": 62992560, "step": 2933, "time_per_iteration": 2.664654493331909 }, { "auxiliary_loss_clip": 0.01196058, "auxiliary_loss_mlp": 0.01034413, "balance_loss_clip": 1.0580337, "balance_loss_mlp": 1.02571702, "epoch": 0.3527926411350929, "flos": 23733649900800.0, "grad_norm": 2.066953564507281, "language_loss": 0.7974593, "learning_rate": 3.003146522863071e-06, "loss": 0.81976396, "num_input_tokens_seen": 63013445, "step": 2934, "time_per_iteration": 3.6511664390563965 }, { "auxiliary_loss_clip": 0.01191562, "auxiliary_loss_mlp": 0.01032709, "balance_loss_clip": 0.9825803, "balance_loss_mlp": 1.02450109, "epoch": 0.35291288402573195, "flos": 30445461544320.0, "grad_norm": 1.9264603038486494, "language_loss": 0.85677844, "learning_rate": 3.0024725466000345e-06, "loss": 0.87902117, "num_input_tokens_seen": 63033400, "step": 2935, "time_per_iteration": 2.7857561111450195 }, { "auxiliary_loss_clip": 0.01196958, "auxiliary_loss_mlp": 0.0103306, "balance_loss_clip": 1.02207303, "balance_loss_mlp": 1.02481043, "epoch": 0.35303312691637107, "flos": 23112179763840.0, "grad_norm": 1.8312969596056625, "language_loss": 0.78813183, "learning_rate": 3.0017984182627087e-06, "loss": 0.81043202, "num_input_tokens_seen": 63052725, "step": 2936, "time_per_iteration": 3.626903533935547 }, { "auxiliary_loss_clip": 0.01190788, "auxiliary_loss_mlp": 0.01125004, "balance_loss_clip": 0.94260252, "balance_loss_mlp": 0.0, "epoch": 0.3531533698070102, "flos": 21835699165440.0, "grad_norm": 1.8961802022539043, "language_loss": 0.82039142, "learning_rate": 3.00112413795336e-06, "loss": 0.84354937, "num_input_tokens_seen": 63072560, "step": 2937, "time_per_iteration": 2.777923583984375 }, { "auxiliary_loss_clip": 0.01186596, "auxiliary_loss_mlp": 0.01031841, "balance_loss_clip": 0.97698152, "balance_loss_mlp": 1.0233655, "epoch": 0.35327361269764923, "flos": 15778969810560.0, "grad_norm": 1.9462093402528338, "language_loss": 0.8008669, "learning_rate": 3.000449705774275e-06, "loss": 0.82305127, "num_input_tokens_seen": 63090800, "step": 2938, "time_per_iteration": 2.677694320678711 }, { "auxiliary_loss_clip": 0.01197443, "auxiliary_loss_mlp": 0.0102968, "balance_loss_clip": 1.02202964, "balance_loss_mlp": 1.02108479, "epoch": 0.35339385558828834, "flos": 22090413484800.0, "grad_norm": 1.9637077771420248, "language_loss": 0.71623278, "learning_rate": 2.9997751218277654e-06, "loss": 0.73850393, "num_input_tokens_seen": 63108955, "step": 2939, "time_per_iteration": 2.748643159866333 }, { "auxiliary_loss_clip": 0.01197306, "auxiliary_loss_mlp": 0.01033793, "balance_loss_clip": 1.06016517, "balance_loss_mlp": 1.02534676, "epoch": 0.35351409847892745, "flos": 24165008328960.0, "grad_norm": 1.855211529343634, "language_loss": 0.77433467, "learning_rate": 2.999100386216166e-06, "loss": 0.79664564, "num_input_tokens_seen": 63127895, "step": 2940, "time_per_iteration": 2.671459913253784 }, { "auxiliary_loss_clip": 0.01196598, "auxiliary_loss_mlp": 0.01031352, "balance_loss_clip": 0.98269051, "balance_loss_mlp": 1.02322185, "epoch": 0.3536343413695665, "flos": 27052298340480.0, "grad_norm": 1.8202540649161492, "language_loss": 0.74310988, "learning_rate": 2.998425499041831e-06, "loss": 0.76538938, "num_input_tokens_seen": 63148410, "step": 2941, "time_per_iteration": 2.7253682613372803 }, { "auxiliary_loss_clip": 0.01107639, "auxiliary_loss_mlp": 0.01001828, "balance_loss_clip": 1.00109529, "balance_loss_mlp": 0.99951535, "epoch": 0.3537545842602056, "flos": 65991066370560.0, "grad_norm": 1.3300454944373532, "language_loss": 0.64486903, "learning_rate": 2.997750460407142e-06, "loss": 0.66596371, "num_input_tokens_seen": 63209765, "step": 2942, "time_per_iteration": 3.304776191711426 }, { "auxiliary_loss_clip": 0.01193089, "auxiliary_loss_mlp": 0.0102647, "balance_loss_clip": 0.93952578, "balance_loss_mlp": 1.01711178, "epoch": 0.35387482715084473, "flos": 18436897526400.0, "grad_norm": 2.6698339087976324, "language_loss": 0.70129335, "learning_rate": 2.997075270414501e-06, "loss": 0.72348893, "num_input_tokens_seen": 63226980, "step": 2943, "time_per_iteration": 2.6683058738708496 }, { "auxiliary_loss_clip": 0.01104878, "auxiliary_loss_mlp": 0.01001365, "balance_loss_clip": 0.96047437, "balance_loss_mlp": 0.99906462, "epoch": 0.3539950700414838, "flos": 65588579498880.0, "grad_norm": 0.8886636860302936, "language_loss": 0.57780945, "learning_rate": 2.9963999291663347e-06, "loss": 0.59887195, "num_input_tokens_seen": 63292760, "step": 2944, "time_per_iteration": 3.260206460952759 }, { "auxiliary_loss_clip": 0.01191713, "auxiliary_loss_mlp": 0.01029363, "balance_loss_clip": 0.90762037, "balance_loss_mlp": 1.02085137, "epoch": 0.3541153129321229, "flos": 20521655919360.0, "grad_norm": 2.131982579834035, "language_loss": 0.73965359, "learning_rate": 2.9957244367650915e-06, "loss": 0.76186436, "num_input_tokens_seen": 63309005, "step": 2945, "time_per_iteration": 2.7752041816711426 }, { "auxiliary_loss_clip": 0.01187967, "auxiliary_loss_mlp": 0.0103472, "balance_loss_clip": 0.9066807, "balance_loss_mlp": 1.02674198, "epoch": 0.354235555822762, "flos": 19573578391680.0, "grad_norm": 2.3086716943479666, "language_loss": 0.83717477, "learning_rate": 2.9950487933132425e-06, "loss": 0.85940164, "num_input_tokens_seen": 63326420, "step": 2946, "time_per_iteration": 2.67897629737854 }, { "auxiliary_loss_clip": 0.01197888, "auxiliary_loss_mlp": 0.0102939, "balance_loss_clip": 1.02000105, "balance_loss_mlp": 1.02096152, "epoch": 0.35435579871340106, "flos": 20777268078720.0, "grad_norm": 2.0079428812635323, "language_loss": 0.71586025, "learning_rate": 2.994372998913283e-06, "loss": 0.73813307, "num_input_tokens_seen": 63344925, "step": 2947, "time_per_iteration": 2.6530604362487793 }, { "auxiliary_loss_clip": 0.01198877, "auxiliary_loss_mlp": 0.01032355, "balance_loss_clip": 0.98692679, "balance_loss_mlp": 1.02398705, "epoch": 0.35447604160404017, "flos": 23951807153280.0, "grad_norm": 2.055457729806609, "language_loss": 0.62144756, "learning_rate": 2.99369705366773e-06, "loss": 0.64375997, "num_input_tokens_seen": 63365170, "step": 2948, "time_per_iteration": 2.696984052658081 }, { "auxiliary_loss_clip": 0.01186364, "auxiliary_loss_mlp": 0.01027193, "balance_loss_clip": 0.97960472, "balance_loss_mlp": 1.01879478, "epoch": 0.3545962844946792, "flos": 23435662671360.0, "grad_norm": 1.9085113797136288, "language_loss": 0.82103515, "learning_rate": 2.9930209576791244e-06, "loss": 0.84317064, "num_input_tokens_seen": 63383645, "step": 2949, "time_per_iteration": 2.6546437740325928 }, { "auxiliary_loss_clip": 0.01192044, "auxiliary_loss_mlp": 0.01028903, "balance_loss_clip": 1.02196014, "balance_loss_mlp": 1.02067745, "epoch": 0.35471652738531834, "flos": 22085134185600.0, "grad_norm": 1.8249982446349986, "language_loss": 0.63524079, "learning_rate": 2.9923447110500285e-06, "loss": 0.65745032, "num_input_tokens_seen": 63402390, "step": 2950, "time_per_iteration": 2.68039608001709 }, { "auxiliary_loss_clip": 0.01183609, "auxiliary_loss_mlp": 0.01032579, "balance_loss_clip": 1.01927805, "balance_loss_mlp": 1.02465165, "epoch": 0.35483677027595745, "flos": 27341881787520.0, "grad_norm": 1.8530952076897342, "language_loss": 0.75185513, "learning_rate": 2.9916683138830295e-06, "loss": 0.77401698, "num_input_tokens_seen": 63423055, "step": 2951, "time_per_iteration": 2.7489030361175537 }, { "auxiliary_loss_clip": 0.0119069, "auxiliary_loss_mlp": 0.0103414, "balance_loss_clip": 0.9826948, "balance_loss_mlp": 1.02589655, "epoch": 0.3549570131665965, "flos": 13516166678400.0, "grad_norm": 2.3883585115496846, "language_loss": 0.80479801, "learning_rate": 2.9909917662807353e-06, "loss": 0.82704628, "num_input_tokens_seen": 63440855, "step": 2952, "time_per_iteration": 2.6234164237976074 }, { "auxiliary_loss_clip": 0.01193383, "auxiliary_loss_mlp": 0.01028272, "balance_loss_clip": 1.02037621, "balance_loss_mlp": 1.02003431, "epoch": 0.3550772560572356, "flos": 20887549810560.0, "grad_norm": 2.6390006477508363, "language_loss": 0.69379467, "learning_rate": 2.9903150683457783e-06, "loss": 0.71601129, "num_input_tokens_seen": 63459400, "step": 2953, "time_per_iteration": 2.6942172050476074 }, { "auxiliary_loss_clip": 0.01193304, "auxiliary_loss_mlp": 0.01024497, "balance_loss_clip": 0.97969097, "balance_loss_mlp": 1.01647985, "epoch": 0.3551974989478747, "flos": 20194042947840.0, "grad_norm": 1.8103296190123779, "language_loss": 0.64737153, "learning_rate": 2.9896382201808126e-06, "loss": 0.66954958, "num_input_tokens_seen": 63476800, "step": 2954, "time_per_iteration": 2.6613540649414062 }, { "auxiliary_loss_clip": 0.01196405, "auxiliary_loss_mlp": 0.01029096, "balance_loss_clip": 1.05708718, "balance_loss_mlp": 1.02036095, "epoch": 0.3553177418385138, "flos": 19828831415040.0, "grad_norm": 2.1947113996421925, "language_loss": 0.81130469, "learning_rate": 2.988961221888516e-06, "loss": 0.83355963, "num_input_tokens_seen": 63493475, "step": 2955, "time_per_iteration": 3.5104362964630127 }, { "auxiliary_loss_clip": 0.01183513, "auxiliary_loss_mlp": 0.01033934, "balance_loss_clip": 0.9421283, "balance_loss_mlp": 1.02592278, "epoch": 0.3554379847291529, "flos": 14829132516480.0, "grad_norm": 2.7021204773831613, "language_loss": 0.7907415, "learning_rate": 2.988284073571589e-06, "loss": 0.81291598, "num_input_tokens_seen": 63509560, "step": 2956, "time_per_iteration": 2.721198320388794 }, { "auxiliary_loss_clip": 0.01195025, "auxiliary_loss_mlp": 0.0112458, "balance_loss_clip": 1.01949739, "balance_loss_mlp": 0.0, "epoch": 0.355558227619792, "flos": 20485350247680.0, "grad_norm": 2.0822901051176377, "language_loss": 0.73085678, "learning_rate": 2.9876067753327528e-06, "loss": 0.75405276, "num_input_tokens_seen": 63527290, "step": 2957, "time_per_iteration": 3.6494157314300537 }, { "auxiliary_loss_clip": 0.01196908, "auxiliary_loss_mlp": 0.01027352, "balance_loss_clip": 1.01832533, "balance_loss_mlp": 1.01971054, "epoch": 0.35567847051043106, "flos": 37663613256960.0, "grad_norm": 1.8110092808539255, "language_loss": 0.80725682, "learning_rate": 2.986929327274754e-06, "loss": 0.82949936, "num_input_tokens_seen": 63547870, "step": 2958, "time_per_iteration": 2.77630877494812 }, { "auxiliary_loss_clip": 0.01193984, "auxiliary_loss_mlp": 0.01028224, "balance_loss_clip": 1.02262187, "balance_loss_mlp": 1.02045774, "epoch": 0.35579871340107017, "flos": 26943058103040.0, "grad_norm": 1.6817880364481939, "language_loss": 0.78534162, "learning_rate": 2.9862517295003617e-06, "loss": 0.80756372, "num_input_tokens_seen": 63568285, "step": 2959, "time_per_iteration": 2.693290948867798 }, { "auxiliary_loss_clip": 0.01183781, "auxiliary_loss_mlp": 0.01028375, "balance_loss_clip": 0.937392, "balance_loss_mlp": 1.02035785, "epoch": 0.3559189562917093, "flos": 28293335193600.0, "grad_norm": 1.5349687184577516, "language_loss": 0.72449291, "learning_rate": 2.9855739821123654e-06, "loss": 0.74661446, "num_input_tokens_seen": 63589865, "step": 2960, "time_per_iteration": 3.6732029914855957 }, { "auxiliary_loss_clip": 0.01190386, "auxiliary_loss_mlp": 0.01029402, "balance_loss_clip": 1.02060938, "balance_loss_mlp": 1.02125418, "epoch": 0.35603919918234833, "flos": 25664063552640.0, "grad_norm": 1.7352783992254, "language_loss": 0.821684, "learning_rate": 2.98489608521358e-06, "loss": 0.84388185, "num_input_tokens_seen": 63609805, "step": 2961, "time_per_iteration": 2.7350072860717773 }, { "auxiliary_loss_clip": 0.01197179, "auxiliary_loss_mlp": 0.01123802, "balance_loss_clip": 1.01908612, "balance_loss_mlp": 0.0, "epoch": 0.35615944207298744, "flos": 23000856537600.0, "grad_norm": 2.710200936684309, "language_loss": 0.79479504, "learning_rate": 2.9842180389068425e-06, "loss": 0.81800485, "num_input_tokens_seen": 63627115, "step": 2962, "time_per_iteration": 2.7243638038635254 }, { "auxiliary_loss_clip": 0.01109127, "auxiliary_loss_mlp": 0.01004854, "balance_loss_clip": 0.92907071, "balance_loss_mlp": 1.00241041, "epoch": 0.35627968496362655, "flos": 68251283723520.0, "grad_norm": 3.7418263837041312, "language_loss": 0.59218752, "learning_rate": 2.98353984329501e-06, "loss": 0.61332738, "num_input_tokens_seen": 63691460, "step": 2963, "time_per_iteration": 4.205641508102417 }, { "auxiliary_loss_clip": 0.0118812, "auxiliary_loss_mlp": 0.01032696, "balance_loss_clip": 0.9788928, "balance_loss_mlp": 1.02370167, "epoch": 0.3563999278542656, "flos": 22641714403200.0, "grad_norm": 1.5908170462059326, "language_loss": 0.70298815, "learning_rate": 2.982861498480965e-06, "loss": 0.72519636, "num_input_tokens_seen": 63713840, "step": 2964, "time_per_iteration": 2.793691873550415 }, { "auxiliary_loss_clip": 0.01182189, "auxiliary_loss_mlp": 0.01033278, "balance_loss_clip": 0.93849343, "balance_loss_mlp": 1.02546394, "epoch": 0.3565201707449047, "flos": 25952533678080.0, "grad_norm": 1.6799863568827356, "language_loss": 0.82829511, "learning_rate": 2.9821830045676122e-06, "loss": 0.8504498, "num_input_tokens_seen": 63733540, "step": 2965, "time_per_iteration": 2.7942137718200684 }, { "auxiliary_loss_clip": 0.011951, "auxiliary_loss_mlp": 0.01030801, "balance_loss_clip": 1.05827999, "balance_loss_mlp": 1.02229571, "epoch": 0.3566404136355438, "flos": 28475725478400.0, "grad_norm": 1.6657592642816457, "language_loss": 0.73036027, "learning_rate": 2.9815043616578793e-06, "loss": 0.75261927, "num_input_tokens_seen": 63754335, "step": 2966, "time_per_iteration": 2.6844985485076904 }, { "auxiliary_loss_clip": 0.01183282, "auxiliary_loss_mlp": 0.01027854, "balance_loss_clip": 0.93968433, "balance_loss_mlp": 1.01992035, "epoch": 0.3567606565261829, "flos": 38363117690880.0, "grad_norm": 1.7460865328048656, "language_loss": 0.76758444, "learning_rate": 2.9808255698547145e-06, "loss": 0.78969574, "num_input_tokens_seen": 63777135, "step": 2967, "time_per_iteration": 2.8534352779388428 }, { "auxiliary_loss_clip": 0.01193128, "auxiliary_loss_mlp": 0.01030292, "balance_loss_clip": 1.02189755, "balance_loss_mlp": 1.022174, "epoch": 0.356880899416822, "flos": 21981029592960.0, "grad_norm": 2.021357223318659, "language_loss": 0.79315758, "learning_rate": 2.9801466292610913e-06, "loss": 0.81539178, "num_input_tokens_seen": 63797020, "step": 2968, "time_per_iteration": 2.663804769515991 }, { "auxiliary_loss_clip": 0.01191507, "auxiliary_loss_mlp": 0.01022202, "balance_loss_clip": 1.0191046, "balance_loss_mlp": 1.01484704, "epoch": 0.35700114230746105, "flos": 18989132198400.0, "grad_norm": 1.8393627795540473, "language_loss": 0.80766886, "learning_rate": 2.979467539980003e-06, "loss": 0.82980597, "num_input_tokens_seen": 63813810, "step": 2969, "time_per_iteration": 2.638836622238159 }, { "auxiliary_loss_clip": 0.01195793, "auxiliary_loss_mlp": 0.01026498, "balance_loss_clip": 1.02080834, "balance_loss_mlp": 1.01855898, "epoch": 0.35712138519810016, "flos": 19756112330880.0, "grad_norm": 2.289988571574782, "language_loss": 0.77194446, "learning_rate": 2.978788302114468e-06, "loss": 0.7941674, "num_input_tokens_seen": 63830925, "step": 2970, "time_per_iteration": 2.6504392623901367 }, { "auxiliary_loss_clip": 0.01191106, "auxiliary_loss_mlp": 0.01030307, "balance_loss_clip": 1.01923203, "balance_loss_mlp": 1.02211106, "epoch": 0.35724162808873927, "flos": 35183012008320.0, "grad_norm": 5.23599348514322, "language_loss": 0.83220208, "learning_rate": 2.9781089157675255e-06, "loss": 0.85441619, "num_input_tokens_seen": 63849385, "step": 2971, "time_per_iteration": 2.735863208770752 }, { "auxiliary_loss_clip": 0.01189523, "auxiliary_loss_mlp": 0.01024759, "balance_loss_clip": 1.02052987, "balance_loss_mlp": 1.01672459, "epoch": 0.3573618709793783, "flos": 25556726736000.0, "grad_norm": 1.4767136393933273, "language_loss": 0.88528252, "learning_rate": 2.977429381042238e-06, "loss": 0.90742534, "num_input_tokens_seen": 63870060, "step": 2972, "time_per_iteration": 2.7690858840942383 }, { "auxiliary_loss_clip": 0.01190844, "auxiliary_loss_mlp": 0.01025604, "balance_loss_clip": 0.98124111, "balance_loss_mlp": 1.01766789, "epoch": 0.35748211387001744, "flos": 29132352051840.0, "grad_norm": 2.1226095200347093, "language_loss": 0.88808686, "learning_rate": 2.9767496980416913e-06, "loss": 0.91025138, "num_input_tokens_seen": 63889355, "step": 2973, "time_per_iteration": 2.83487868309021 }, { "auxiliary_loss_clip": 0.01183136, "auxiliary_loss_mlp": 0.01032085, "balance_loss_clip": 0.97711229, "balance_loss_mlp": 1.02354944, "epoch": 0.35760235676065655, "flos": 13954169122560.0, "grad_norm": 2.614832318081303, "language_loss": 0.81304479, "learning_rate": 2.9760698668689914e-06, "loss": 0.83519697, "num_input_tokens_seen": 63905580, "step": 2974, "time_per_iteration": 2.698054552078247 }, { "auxiliary_loss_clip": 0.01192178, "auxiliary_loss_mlp": 0.01029161, "balance_loss_clip": 1.01891506, "balance_loss_mlp": 1.02117383, "epoch": 0.3577225996512956, "flos": 44018688977280.0, "grad_norm": 2.35938605351568, "language_loss": 0.71518612, "learning_rate": 2.975389887627269e-06, "loss": 0.73739952, "num_input_tokens_seen": 63928180, "step": 2975, "time_per_iteration": 2.863762140274048 }, { "auxiliary_loss_clip": 0.01194338, "auxiliary_loss_mlp": 0.01031931, "balance_loss_clip": 0.94209254, "balance_loss_mlp": 1.02349114, "epoch": 0.3578428425419347, "flos": 17055199013760.0, "grad_norm": 2.102132243826137, "language_loss": 0.89659864, "learning_rate": 2.9747097604196764e-06, "loss": 0.91886139, "num_input_tokens_seen": 63944825, "step": 2976, "time_per_iteration": 2.688711643218994 }, { "auxiliary_loss_clip": 0.01101038, "auxiliary_loss_mlp": 0.01006861, "balance_loss_clip": 0.87995303, "balance_loss_mlp": 1.00427425, "epoch": 0.3579630854325738, "flos": 71676550707840.0, "grad_norm": 0.6930086758900473, "language_loss": 0.56652182, "learning_rate": 2.9740294853493875e-06, "loss": 0.58760089, "num_input_tokens_seen": 64016385, "step": 2977, "time_per_iteration": 3.5504586696624756 }, { "auxiliary_loss_clip": 0.0119325, "auxiliary_loss_mlp": 0.01029215, "balance_loss_clip": 0.90268564, "balance_loss_mlp": 1.02151382, "epoch": 0.3580833283232129, "flos": 25046651652480.0, "grad_norm": 2.2226478044274627, "language_loss": 0.66988742, "learning_rate": 2.9733490625196008e-06, "loss": 0.69211209, "num_input_tokens_seen": 64036245, "step": 2978, "time_per_iteration": 2.807157516479492 }, { "auxiliary_loss_clip": 0.01181341, "auxiliary_loss_mlp": 0.0102953, "balance_loss_clip": 0.94120336, "balance_loss_mlp": 1.02173924, "epoch": 0.358203571213852, "flos": 13953127628160.0, "grad_norm": 25.099257230307206, "language_loss": 0.76060176, "learning_rate": 2.9726684920335353e-06, "loss": 0.78271049, "num_input_tokens_seen": 64054110, "step": 2979, "time_per_iteration": 2.729779005050659 }, { "auxiliary_loss_clip": 0.01194512, "auxiliary_loss_mlp": 0.01124376, "balance_loss_clip": 1.05585194, "balance_loss_mlp": 0.0, "epoch": 0.35832381410449105, "flos": 20302457172480.0, "grad_norm": 2.081740723899189, "language_loss": 0.82152665, "learning_rate": 2.971987773994432e-06, "loss": 0.8447156, "num_input_tokens_seen": 64070295, "step": 2980, "time_per_iteration": 2.650230884552002 }, { "auxiliary_loss_clip": 0.01178759, "auxiliary_loss_mlp": 0.01028344, "balance_loss_clip": 1.01416147, "balance_loss_mlp": 1.02010667, "epoch": 0.35844405699513016, "flos": 16983234115200.0, "grad_norm": 2.0277963456637735, "language_loss": 0.83046079, "learning_rate": 2.9713069085055566e-06, "loss": 0.85253179, "num_input_tokens_seen": 64088605, "step": 2981, "time_per_iteration": 3.627150535583496 }, { "auxiliary_loss_clip": 0.0119263, "auxiliary_loss_mlp": 0.01029411, "balance_loss_clip": 0.94250655, "balance_loss_mlp": 1.02049363, "epoch": 0.35856429988576927, "flos": 23216858974080.0, "grad_norm": 1.5546617388964474, "language_loss": 0.79030681, "learning_rate": 2.9706258956701958e-06, "loss": 0.81252718, "num_input_tokens_seen": 64108595, "step": 2982, "time_per_iteration": 2.75616455078125 }, { "auxiliary_loss_clip": 0.01193688, "auxiliary_loss_mlp": 0.01025034, "balance_loss_clip": 1.01885307, "balance_loss_mlp": 1.01603377, "epoch": 0.3586845427764083, "flos": 23034576430080.0, "grad_norm": 4.219168050334588, "language_loss": 0.77671981, "learning_rate": 2.9699447355916575e-06, "loss": 0.79890704, "num_input_tokens_seen": 64127405, "step": 2983, "time_per_iteration": 3.708709716796875 }, { "auxiliary_loss_clip": 0.01191882, "auxiliary_loss_mlp": 0.01124273, "balance_loss_clip": 1.05645347, "balance_loss_mlp": 0.0, "epoch": 0.35880478566704743, "flos": 20010682995840.0, "grad_norm": 1.838265565256358, "language_loss": 0.73963284, "learning_rate": 2.969263428373275e-06, "loss": 0.76279438, "num_input_tokens_seen": 64145755, "step": 2984, "time_per_iteration": 2.5988757610321045 }, { "auxiliary_loss_clip": 0.01189635, "auxiliary_loss_mlp": 0.01028463, "balance_loss_clip": 0.97799885, "balance_loss_mlp": 1.02038383, "epoch": 0.35892502855768654, "flos": 13699095667200.0, "grad_norm": 1.8529368352347277, "language_loss": 0.7895422, "learning_rate": 2.9685819741184007e-06, "loss": 0.81172311, "num_input_tokens_seen": 64164195, "step": 2985, "time_per_iteration": 2.7212188243865967 }, { "auxiliary_loss_clip": 0.01184165, "auxiliary_loss_mlp": 0.01024481, "balance_loss_clip": 0.94099796, "balance_loss_mlp": 1.01658297, "epoch": 0.3590452714483256, "flos": 18114096977280.0, "grad_norm": 2.4305182457952372, "language_loss": 0.68427372, "learning_rate": 2.967900372930411e-06, "loss": 0.70636022, "num_input_tokens_seen": 64182705, "step": 2986, "time_per_iteration": 2.6876907348632812 }, { "auxiliary_loss_clip": 0.01183292, "auxiliary_loss_mlp": 0.01031713, "balance_loss_clip": 0.97767985, "balance_loss_mlp": 1.02268267, "epoch": 0.3591655143389647, "flos": 17749352321280.0, "grad_norm": 7.860459760903073, "language_loss": 0.79046559, "learning_rate": 2.9672186249127046e-06, "loss": 0.81261563, "num_input_tokens_seen": 64202170, "step": 2987, "time_per_iteration": 3.65718412399292 }, { "auxiliary_loss_clip": 0.01188833, "auxiliary_loss_mlp": 0.01029313, "balance_loss_clip": 0.98003799, "balance_loss_mlp": 1.02203536, "epoch": 0.3592857572296038, "flos": 25224409082880.0, "grad_norm": 1.9808642026346157, "language_loss": 0.78810954, "learning_rate": 2.9665367301687014e-06, "loss": 0.81029099, "num_input_tokens_seen": 64220415, "step": 2988, "time_per_iteration": 2.693861484527588 }, { "auxiliary_loss_clip": 0.01183814, "auxiliary_loss_mlp": 0.01034314, "balance_loss_clip": 0.978688, "balance_loss_mlp": 1.02617252, "epoch": 0.3594060001202429, "flos": 29384408764800.0, "grad_norm": 1.791846488065643, "language_loss": 0.76448226, "learning_rate": 2.965854688801845e-06, "loss": 0.78666353, "num_input_tokens_seen": 64242475, "step": 2989, "time_per_iteration": 3.6374473571777344 }, { "auxiliary_loss_clip": 0.01188339, "auxiliary_loss_mlp": 0.01024229, "balance_loss_clip": 1.01419199, "balance_loss_mlp": 1.01633787, "epoch": 0.359526243010882, "flos": 17052900543360.0, "grad_norm": 2.3553095931192147, "language_loss": 0.76330835, "learning_rate": 2.9651725009156005e-06, "loss": 0.78543401, "num_input_tokens_seen": 64260220, "step": 2990, "time_per_iteration": 2.702526330947876 }, { "auxiliary_loss_clip": 0.01178935, "auxiliary_loss_mlp": 0.0103205, "balance_loss_clip": 0.97640383, "balance_loss_mlp": 1.02349043, "epoch": 0.3596464859015211, "flos": 22965089569920.0, "grad_norm": 3.256448430226864, "language_loss": 0.74202925, "learning_rate": 2.964490166613454e-06, "loss": 0.76413912, "num_input_tokens_seen": 64280145, "step": 2991, "time_per_iteration": 2.713094711303711 }, { "auxiliary_loss_clip": 0.01095616, "auxiliary_loss_mlp": 0.01001854, "balance_loss_clip": 1.02770913, "balance_loss_mlp": 0.99939823, "epoch": 0.35976672879216015, "flos": 54739462590720.0, "grad_norm": 0.7743467821642804, "language_loss": 0.57772487, "learning_rate": 2.963807685998917e-06, "loss": 0.59869957, "num_input_tokens_seen": 64336010, "step": 2992, "time_per_iteration": 3.0405852794647217 }, { "auxiliary_loss_clip": 0.01189738, "auxiliary_loss_mlp": 0.01025058, "balance_loss_clip": 0.90122432, "balance_loss_mlp": 1.01707399, "epoch": 0.35988697168279926, "flos": 43139020901760.0, "grad_norm": 1.5131106644951753, "language_loss": 0.78164721, "learning_rate": 2.9631250591755196e-06, "loss": 0.80379516, "num_input_tokens_seen": 64358725, "step": 2993, "time_per_iteration": 2.9558331966400146 }, { "auxiliary_loss_clip": 0.01190201, "auxiliary_loss_mlp": 0.01035199, "balance_loss_clip": 0.98254979, "balance_loss_mlp": 1.0269022, "epoch": 0.36000721457343837, "flos": 35845600239360.0, "grad_norm": 7.788929911000162, "language_loss": 0.57466304, "learning_rate": 2.962442286246817e-06, "loss": 0.59691703, "num_input_tokens_seen": 64381555, "step": 2994, "time_per_iteration": 2.9228222370147705 }, { "auxiliary_loss_clip": 0.01195487, "auxiliary_loss_mlp": 0.01032092, "balance_loss_clip": 0.98044968, "balance_loss_mlp": 1.02360392, "epoch": 0.3601274574640774, "flos": 18291100222080.0, "grad_norm": 1.8547205394389599, "language_loss": 0.69527566, "learning_rate": 2.9617593673163853e-06, "loss": 0.71755153, "num_input_tokens_seen": 64400375, "step": 2995, "time_per_iteration": 2.6913869380950928 }, { "auxiliary_loss_clip": 0.01190693, "auxiliary_loss_mlp": 0.01027958, "balance_loss_clip": 0.97611386, "balance_loss_mlp": 1.02014422, "epoch": 0.36024770035471654, "flos": 13333955961600.0, "grad_norm": 2.052696033791829, "language_loss": 0.77696824, "learning_rate": 2.9610763024878216e-06, "loss": 0.79915476, "num_input_tokens_seen": 64415880, "step": 2996, "time_per_iteration": 2.629202127456665 }, { "auxiliary_loss_clip": 0.01183688, "auxiliary_loss_mlp": 0.0102875, "balance_loss_clip": 0.97722059, "balance_loss_mlp": 1.02081096, "epoch": 0.3603679432453556, "flos": 20267013427200.0, "grad_norm": 1.709474040225576, "language_loss": 0.91485667, "learning_rate": 2.960393091864747e-06, "loss": 0.93698108, "num_input_tokens_seen": 64434260, "step": 2997, "time_per_iteration": 2.67492413520813 }, { "auxiliary_loss_clip": 0.01190166, "auxiliary_loss_mlp": 0.0103192, "balance_loss_clip": 0.97993922, "balance_loss_mlp": 1.02423143, "epoch": 0.3604881861359947, "flos": 22451135817600.0, "grad_norm": 1.7227008191750428, "language_loss": 0.74980533, "learning_rate": 2.959709735550804e-06, "loss": 0.77202618, "num_input_tokens_seen": 64453855, "step": 2998, "time_per_iteration": 2.6855404376983643 }, { "auxiliary_loss_clip": 0.01192353, "auxiliary_loss_mlp": 0.01025858, "balance_loss_clip": 0.90339935, "balance_loss_mlp": 1.01768601, "epoch": 0.3606084290266338, "flos": 22054251467520.0, "grad_norm": 2.136602216657118, "language_loss": 0.75453943, "learning_rate": 2.9590262336496575e-06, "loss": 0.77672148, "num_input_tokens_seen": 64473585, "step": 2999, "time_per_iteration": 2.756228446960449 }, { "auxiliary_loss_clip": 0.01186367, "auxiliary_loss_mlp": 0.01031344, "balance_loss_clip": 0.9431054, "balance_loss_mlp": 1.02286196, "epoch": 0.36072867191727287, "flos": 15632921111040.0, "grad_norm": 1.7840595787585867, "language_loss": 0.85161281, "learning_rate": 2.9583425862649936e-06, "loss": 0.87378991, "num_input_tokens_seen": 64491720, "step": 3000, "time_per_iteration": 2.760556697845459 }, { "auxiliary_loss_clip": 0.0119533, "auxiliary_loss_mlp": 0.01033648, "balance_loss_clip": 1.05699015, "balance_loss_mlp": 1.02476728, "epoch": 0.360848914807912, "flos": 19677000625920.0, "grad_norm": 2.237801340552917, "language_loss": 0.73831475, "learning_rate": 2.9576587935005215e-06, "loss": 0.76060456, "num_input_tokens_seen": 64509800, "step": 3001, "time_per_iteration": 2.6476681232452393 }, { "auxiliary_loss_clip": 0.01196373, "auxiliary_loss_mlp": 0.01023766, "balance_loss_clip": 1.02001667, "balance_loss_mlp": 1.01536226, "epoch": 0.3609691576985511, "flos": 18877808972160.0, "grad_norm": 2.4469893492524113, "language_loss": 0.72265035, "learning_rate": 2.9569748554599713e-06, "loss": 0.74485177, "num_input_tokens_seen": 64525410, "step": 3002, "time_per_iteration": 2.6212058067321777 }, { "auxiliary_loss_clip": 0.01187687, "auxiliary_loss_mlp": 0.01029529, "balance_loss_clip": 0.97880483, "balance_loss_mlp": 1.02185178, "epoch": 0.36108940058919015, "flos": 42224088648960.0, "grad_norm": 3.701941530428575, "language_loss": 0.73270524, "learning_rate": 2.956290772247097e-06, "loss": 0.75487733, "num_input_tokens_seen": 64544085, "step": 3003, "time_per_iteration": 2.8296058177948 }, { "auxiliary_loss_clip": 0.01178729, "auxiliary_loss_mlp": 0.01024084, "balance_loss_clip": 0.90221101, "balance_loss_mlp": 1.01665092, "epoch": 0.36120964347982926, "flos": 23185150243200.0, "grad_norm": 1.697771285944049, "language_loss": 0.72910082, "learning_rate": 2.9556065439656724e-06, "loss": 0.75112897, "num_input_tokens_seen": 64563135, "step": 3004, "time_per_iteration": 2.754863739013672 }, { "auxiliary_loss_clip": 0.01172585, "auxiliary_loss_mlp": 0.01026033, "balance_loss_clip": 0.85938954, "balance_loss_mlp": 1.01810563, "epoch": 0.36132988637046837, "flos": 18113055482880.0, "grad_norm": 1.6753703741753938, "language_loss": 0.8174603, "learning_rate": 2.9549221707194952e-06, "loss": 0.83944643, "num_input_tokens_seen": 64581985, "step": 3005, "time_per_iteration": 2.8170034885406494 }, { "auxiliary_loss_clip": 0.01195715, "auxiliary_loss_mlp": 0.01034476, "balance_loss_clip": 1.01863909, "balance_loss_mlp": 1.02636981, "epoch": 0.3614501292611074, "flos": 27813101333760.0, "grad_norm": 1.923162360048908, "language_loss": 0.72679085, "learning_rate": 2.954237652612384e-06, "loss": 0.74909276, "num_input_tokens_seen": 64601035, "step": 3006, "time_per_iteration": 3.6675608158111572 }, { "auxiliary_loss_clip": 0.01187672, "auxiliary_loss_mlp": 0.0102858, "balance_loss_clip": 0.97988796, "balance_loss_mlp": 1.0206883, "epoch": 0.36157037215174653, "flos": 22634926732800.0, "grad_norm": 1.899726743310442, "language_loss": 0.84637487, "learning_rate": 2.9535529897481796e-06, "loss": 0.86853743, "num_input_tokens_seen": 64618580, "step": 3007, "time_per_iteration": 2.7085087299346924 }, { "auxiliary_loss_clip": 0.01193979, "auxiliary_loss_mlp": 0.01029508, "balance_loss_clip": 1.05572784, "balance_loss_mlp": 1.02130079, "epoch": 0.36169061504238564, "flos": 12600839376000.0, "grad_norm": 2.343876697319984, "language_loss": 0.7679528, "learning_rate": 2.9528681822307446e-06, "loss": 0.79018772, "num_input_tokens_seen": 64635430, "step": 3008, "time_per_iteration": 2.6238784790039062 }, { "auxiliary_loss_clip": 0.01188889, "auxiliary_loss_mlp": 0.01124137, "balance_loss_clip": 1.01874363, "balance_loss_mlp": 0.0, "epoch": 0.3618108579330247, "flos": 26684644682880.0, "grad_norm": 2.010091874229172, "language_loss": 0.82634485, "learning_rate": 2.952183230163964e-06, "loss": 0.84947509, "num_input_tokens_seen": 64655005, "step": 3009, "time_per_iteration": 3.769545316696167 }, { "auxiliary_loss_clip": 0.01179815, "auxiliary_loss_mlp": 0.01026722, "balance_loss_clip": 0.9369117, "balance_loss_mlp": 1.0186547, "epoch": 0.3619311008236638, "flos": 22817029708800.0, "grad_norm": 1.9136842995045333, "language_loss": 0.73186839, "learning_rate": 2.9514981336517448e-06, "loss": 0.75393379, "num_input_tokens_seen": 64674775, "step": 3010, "time_per_iteration": 2.727431058883667 }, { "auxiliary_loss_clip": 0.01190827, "auxiliary_loss_mlp": 0.01031059, "balance_loss_clip": 1.01834321, "balance_loss_mlp": 1.02264261, "epoch": 0.36205134371430286, "flos": 25919603884800.0, "grad_norm": 1.7296369699521799, "language_loss": 0.81404638, "learning_rate": 2.950812892798015e-06, "loss": 0.83626521, "num_input_tokens_seen": 64695670, "step": 3011, "time_per_iteration": 2.7008419036865234 }, { "auxiliary_loss_clip": 0.01178838, "auxiliary_loss_mlp": 0.01124075, "balance_loss_clip": 0.90210688, "balance_loss_mlp": 0.0, "epoch": 0.362171586604942, "flos": 26139592730880.0, "grad_norm": 2.070760231022463, "language_loss": 0.8738873, "learning_rate": 2.9501275077067256e-06, "loss": 0.89691639, "num_input_tokens_seen": 64716290, "step": 3012, "time_per_iteration": 2.860365152359009 }, { "auxiliary_loss_clip": 0.01164705, "auxiliary_loss_mlp": 0.01034056, "balance_loss_clip": 0.85953271, "balance_loss_mlp": 1.02611935, "epoch": 0.3622918294955811, "flos": 28074208273920.0, "grad_norm": 1.4793249070920016, "language_loss": 0.88209385, "learning_rate": 2.949441978481848e-06, "loss": 0.90408146, "num_input_tokens_seen": 64737190, "step": 3013, "time_per_iteration": 3.8138105869293213 }, { "auxiliary_loss_clip": 0.01191159, "auxiliary_loss_mlp": 0.01035356, "balance_loss_clip": 0.93886697, "balance_loss_mlp": 1.02633774, "epoch": 0.36241207238622014, "flos": 19828005402240.0, "grad_norm": 1.96462051088435, "language_loss": 0.80083752, "learning_rate": 2.9487563052273778e-06, "loss": 0.82310265, "num_input_tokens_seen": 64753950, "step": 3014, "time_per_iteration": 2.728410005569458 }, { "auxiliary_loss_clip": 0.01191576, "auxiliary_loss_mlp": 0.0102968, "balance_loss_clip": 1.02121103, "balance_loss_mlp": 1.02194905, "epoch": 0.36253231527685925, "flos": 21397158017280.0, "grad_norm": 1.850305182533223, "language_loss": 0.85689044, "learning_rate": 2.94807048804733e-06, "loss": 0.879103, "num_input_tokens_seen": 64773570, "step": 3015, "time_per_iteration": 3.50809645652771 }, { "auxiliary_loss_clip": 0.0119096, "auxiliary_loss_mlp": 0.01030297, "balance_loss_clip": 0.93784374, "balance_loss_mlp": 1.02214956, "epoch": 0.36265255816749836, "flos": 18362885552640.0, "grad_norm": 1.670192548916342, "language_loss": 0.9034276, "learning_rate": 2.9473845270457434e-06, "loss": 0.92564017, "num_input_tokens_seen": 64790385, "step": 3016, "time_per_iteration": 2.734013557434082 }, { "auxiliary_loss_clip": 0.01184082, "auxiliary_loss_mlp": 0.01032533, "balance_loss_clip": 0.97671795, "balance_loss_mlp": 1.02474225, "epoch": 0.3627728010581374, "flos": 18660046769280.0, "grad_norm": 1.9589017271876947, "language_loss": 0.6971038, "learning_rate": 2.946698422326677e-06, "loss": 0.71926999, "num_input_tokens_seen": 64807845, "step": 3017, "time_per_iteration": 2.643296480178833 }, { "auxiliary_loss_clip": 0.01183088, "auxiliary_loss_mlp": 0.0102571, "balance_loss_clip": 0.89810765, "balance_loss_mlp": 1.01772261, "epoch": 0.36289304394877653, "flos": 27524272072320.0, "grad_norm": 1.952245294585522, "language_loss": 0.79727489, "learning_rate": 2.946012173994213e-06, "loss": 0.81936282, "num_input_tokens_seen": 64827630, "step": 3018, "time_per_iteration": 2.8337018489837646 }, { "auxiliary_loss_clip": 0.01187305, "auxiliary_loss_mlp": 0.01033649, "balance_loss_clip": 1.02025247, "balance_loss_mlp": 1.02557278, "epoch": 0.36301328683941564, "flos": 34533244932480.0, "grad_norm": 1.448764464263474, "language_loss": 0.67541671, "learning_rate": 2.945325782152454e-06, "loss": 0.69762623, "num_input_tokens_seen": 64850665, "step": 3019, "time_per_iteration": 2.845961570739746 }, { "auxiliary_loss_clip": 0.01189605, "auxiliary_loss_mlp": 0.01032822, "balance_loss_clip": 0.97646332, "balance_loss_mlp": 1.02477503, "epoch": 0.3631335297300547, "flos": 19025976574080.0, "grad_norm": 1.9383278294314412, "language_loss": 0.78724611, "learning_rate": 2.9446392469055257e-06, "loss": 0.80947036, "num_input_tokens_seen": 64868700, "step": 3020, "time_per_iteration": 2.735365629196167 }, { "auxiliary_loss_clip": 0.01185696, "auxiliary_loss_mlp": 0.01029082, "balance_loss_clip": 0.94484079, "balance_loss_mlp": 1.02112484, "epoch": 0.3632537726206938, "flos": 19536769929600.0, "grad_norm": 1.7764204972378257, "language_loss": 0.79711258, "learning_rate": 2.9439525683575745e-06, "loss": 0.81926042, "num_input_tokens_seen": 64887620, "step": 3021, "time_per_iteration": 2.7430901527404785 }, { "auxiliary_loss_clip": 0.0119802, "auxiliary_loss_mlp": 0.01033301, "balance_loss_clip": 1.0594877, "balance_loss_mlp": 1.02433658, "epoch": 0.3633740155113329, "flos": 21068611292160.0, "grad_norm": 1.888958604607165, "language_loss": 0.7506932, "learning_rate": 2.9432657466127694e-06, "loss": 0.77300644, "num_input_tokens_seen": 64907190, "step": 3022, "time_per_iteration": 2.635979175567627 }, { "auxiliary_loss_clip": 0.01187379, "auxiliary_loss_mlp": 0.01032593, "balance_loss_clip": 0.90578496, "balance_loss_mlp": 1.02417731, "epoch": 0.36349425840197197, "flos": 20298722158080.0, "grad_norm": 1.5987242994943682, "language_loss": 0.76654041, "learning_rate": 2.9425787817753007e-06, "loss": 0.78874016, "num_input_tokens_seen": 64925850, "step": 3023, "time_per_iteration": 2.8504843711853027 }, { "auxiliary_loss_clip": 0.01189145, "auxiliary_loss_mlp": 0.01029815, "balance_loss_clip": 0.93981671, "balance_loss_mlp": 1.02138638, "epoch": 0.3636145012926111, "flos": 29716762331520.0, "grad_norm": 1.8857098652301454, "language_loss": 0.71764082, "learning_rate": 2.94189167394938e-06, "loss": 0.73983043, "num_input_tokens_seen": 64948285, "step": 3024, "time_per_iteration": 2.7805371284484863 }, { "auxiliary_loss_clip": 0.01195851, "auxiliary_loss_mlp": 0.01034177, "balance_loss_clip": 1.05992544, "balance_loss_mlp": 1.02607703, "epoch": 0.3637347441832502, "flos": 21431847576960.0, "grad_norm": 1.645402040741893, "language_loss": 0.80905378, "learning_rate": 2.941204423239241e-06, "loss": 0.83135414, "num_input_tokens_seen": 64967160, "step": 3025, "time_per_iteration": 2.6881728172302246 }, { "auxiliary_loss_clip": 0.01188388, "auxiliary_loss_mlp": 0.01033094, "balance_loss_clip": 1.01792717, "balance_loss_mlp": 1.02486849, "epoch": 0.36385498707388925, "flos": 29533941083520.0, "grad_norm": 1.70997828162107, "language_loss": 0.75902271, "learning_rate": 2.9405170297491395e-06, "loss": 0.78123748, "num_input_tokens_seen": 64987155, "step": 3026, "time_per_iteration": 2.7326736450195312 }, { "auxiliary_loss_clip": 0.01178552, "auxiliary_loss_mlp": 0.0112482, "balance_loss_clip": 0.86527658, "balance_loss_mlp": 0.0, "epoch": 0.36397522996452836, "flos": 22236569925120.0, "grad_norm": 1.9650554342860795, "language_loss": 0.80167079, "learning_rate": 2.939829493583353e-06, "loss": 0.82470453, "num_input_tokens_seen": 65003800, "step": 3027, "time_per_iteration": 2.829901695251465 }, { "auxiliary_loss_clip": 0.01176503, "auxiliary_loss_mlp": 0.01027093, "balance_loss_clip": 0.93575048, "balance_loss_mlp": 1.01918936, "epoch": 0.3640954728551674, "flos": 21506505995520.0, "grad_norm": 2.758595191277045, "language_loss": 0.8291865, "learning_rate": 2.939141814846179e-06, "loss": 0.85122246, "num_input_tokens_seen": 65021215, "step": 3028, "time_per_iteration": 2.709298849105835 }, { "auxiliary_loss_clip": 0.0118814, "auxiliary_loss_mlp": 0.01024078, "balance_loss_clip": 0.97710276, "balance_loss_mlp": 1.01646352, "epoch": 0.3642157157458065, "flos": 17712867081600.0, "grad_norm": 1.5067953787719188, "language_loss": 0.825409, "learning_rate": 2.938453993641938e-06, "loss": 0.84753114, "num_input_tokens_seen": 65039590, "step": 3029, "time_per_iteration": 2.721916913986206 }, { "auxiliary_loss_clip": 0.01189799, "auxiliary_loss_mlp": 0.0103904, "balance_loss_clip": 0.98272252, "balance_loss_mlp": 1.03071964, "epoch": 0.36433595863644563, "flos": 17639537466240.0, "grad_norm": 2.552516989691585, "language_loss": 0.7105732, "learning_rate": 2.937766030074973e-06, "loss": 0.73286164, "num_input_tokens_seen": 65056845, "step": 3030, "time_per_iteration": 2.734168767929077 }, { "auxiliary_loss_clip": 0.01194134, "auxiliary_loss_mlp": 0.01025042, "balance_loss_clip": 0.94037277, "balance_loss_mlp": 1.01704276, "epoch": 0.3644562015270847, "flos": 26833279161600.0, "grad_norm": 1.7385025677942143, "language_loss": 0.82542276, "learning_rate": 2.937077924249646e-06, "loss": 0.84761453, "num_input_tokens_seen": 65079435, "step": 3031, "time_per_iteration": 2.783940076828003 }, { "auxiliary_loss_clip": 0.01197226, "auxiliary_loss_mlp": 0.01027576, "balance_loss_clip": 0.9793824, "balance_loss_mlp": 1.01897478, "epoch": 0.3645764444177238, "flos": 14282715847680.0, "grad_norm": 2.3631159422666785, "language_loss": 0.76004934, "learning_rate": 2.9363896762703443e-06, "loss": 0.78229737, "num_input_tokens_seen": 65096500, "step": 3032, "time_per_iteration": 3.6715846061706543 }, { "auxiliary_loss_clip": 0.01194478, "auxiliary_loss_mlp": 0.01033023, "balance_loss_clip": 1.05662966, "balance_loss_mlp": 1.02519071, "epoch": 0.3646966873083629, "flos": 20667489137280.0, "grad_norm": 1.7050211388742749, "language_loss": 0.84337187, "learning_rate": 2.9357012862414725e-06, "loss": 0.86564696, "num_input_tokens_seen": 65115860, "step": 3033, "time_per_iteration": 2.679966926574707 }, { "auxiliary_loss_clip": 0.01193023, "auxiliary_loss_mlp": 0.01023932, "balance_loss_clip": 1.01935458, "balance_loss_mlp": 1.01591563, "epoch": 0.36481693019900197, "flos": 27782613665280.0, "grad_norm": 1.849800002627672, "language_loss": 0.71421874, "learning_rate": 2.9350127542674593e-06, "loss": 0.73638833, "num_input_tokens_seen": 65138070, "step": 3034, "time_per_iteration": 2.710320472717285 }, { "auxiliary_loss_clip": 0.01197473, "auxiliary_loss_mlp": 0.01028877, "balance_loss_clip": 0.98021436, "balance_loss_mlp": 1.02004361, "epoch": 0.3649371730896411, "flos": 19712588025600.0, "grad_norm": 1.9094314962349255, "language_loss": 0.76568663, "learning_rate": 2.934324080452755e-06, "loss": 0.7879501, "num_input_tokens_seen": 65155860, "step": 3035, "time_per_iteration": 3.7176456451416016 }, { "auxiliary_loss_clip": 0.0117468, "auxiliary_loss_mlp": 0.01124812, "balance_loss_clip": 0.93704224, "balance_loss_mlp": 0.0, "epoch": 0.3650574159802802, "flos": 24750496016640.0, "grad_norm": 2.0152913427712202, "language_loss": 0.77853024, "learning_rate": 2.9336352649018307e-06, "loss": 0.80152524, "num_input_tokens_seen": 65175930, "step": 3036, "time_per_iteration": 2.7467124462127686 }, { "auxiliary_loss_clip": 0.01193187, "auxiliary_loss_mlp": 0.01031598, "balance_loss_clip": 0.98143804, "balance_loss_mlp": 1.02261579, "epoch": 0.36517765887091924, "flos": 32853487363200.0, "grad_norm": 1.6620633789930717, "language_loss": 0.69882727, "learning_rate": 2.9329463077191783e-06, "loss": 0.72107518, "num_input_tokens_seen": 65199305, "step": 3037, "time_per_iteration": 2.871314525604248 }, { "auxiliary_loss_clip": 0.01185749, "auxiliary_loss_mlp": 0.01028476, "balance_loss_clip": 0.9022187, "balance_loss_mlp": 1.02012503, "epoch": 0.36529790176155835, "flos": 20120318282880.0, "grad_norm": 2.577882912474391, "language_loss": 0.64141041, "learning_rate": 2.9322572090093135e-06, "loss": 0.66355264, "num_input_tokens_seen": 65218010, "step": 3038, "time_per_iteration": 2.7615203857421875 }, { "auxiliary_loss_clip": 0.01184468, "auxiliary_loss_mlp": 0.01031495, "balance_loss_clip": 0.90034896, "balance_loss_mlp": 1.02315593, "epoch": 0.36541814465219746, "flos": 17639573379840.0, "grad_norm": 2.6113059234226728, "language_loss": 0.76131588, "learning_rate": 2.9315679688767713e-06, "loss": 0.78347552, "num_input_tokens_seen": 65236020, "step": 3039, "time_per_iteration": 3.6787219047546387 }, { "auxiliary_loss_clip": 0.0118286, "auxiliary_loss_mlp": 0.01025459, "balance_loss_clip": 0.97684163, "balance_loss_mlp": 1.0173285, "epoch": 0.3655383875428365, "flos": 22674356887680.0, "grad_norm": 2.0243707910478292, "language_loss": 0.66393113, "learning_rate": 2.9308785874261085e-06, "loss": 0.68601435, "num_input_tokens_seen": 65256210, "step": 3040, "time_per_iteration": 3.6329102516174316 }, { "auxiliary_loss_clip": 0.01195558, "auxiliary_loss_mlp": 0.01026483, "balance_loss_clip": 1.05904555, "balance_loss_mlp": 1.01862693, "epoch": 0.36565863043347563, "flos": 21981173247360.0, "grad_norm": 1.6083889778760299, "language_loss": 0.81623644, "learning_rate": 2.9301890647619045e-06, "loss": 0.83845681, "num_input_tokens_seen": 65275505, "step": 3041, "time_per_iteration": 2.6362075805664062 }, { "auxiliary_loss_clip": 0.01199411, "auxiliary_loss_mlp": 0.01036368, "balance_loss_clip": 0.98118937, "balance_loss_mlp": 1.02796412, "epoch": 0.36577887332411474, "flos": 24827632473600.0, "grad_norm": 1.8587926015549934, "language_loss": 0.80039328, "learning_rate": 2.929499400988759e-06, "loss": 0.82275105, "num_input_tokens_seen": 65296665, "step": 3042, "time_per_iteration": 2.743699073791504 }, { "auxiliary_loss_clip": 0.01192585, "auxiliary_loss_mlp": 0.01029191, "balance_loss_clip": 1.01994681, "balance_loss_mlp": 1.02073336, "epoch": 0.3658991162147538, "flos": 28293191539200.0, "grad_norm": 1.7019225489927348, "language_loss": 0.65323436, "learning_rate": 2.9288095962112927e-06, "loss": 0.67545211, "num_input_tokens_seen": 65317370, "step": 3043, "time_per_iteration": 2.7106809616088867 }, { "auxiliary_loss_clip": 0.01189158, "auxiliary_loss_mlp": 0.01030039, "balance_loss_clip": 1.05389857, "balance_loss_mlp": 1.02199817, "epoch": 0.3660193591053929, "flos": 17785550252160.0, "grad_norm": 1.7938440885839082, "language_loss": 0.85372066, "learning_rate": 2.9281196505341503e-06, "loss": 0.87591267, "num_input_tokens_seen": 65334540, "step": 3044, "time_per_iteration": 2.6580538749694824 }, { "auxiliary_loss_clip": 0.0118241, "auxiliary_loss_mlp": 0.01124137, "balance_loss_clip": 0.90251315, "balance_loss_mlp": 0.0, "epoch": 0.36613960199603196, "flos": 10342776839040.0, "grad_norm": 1.8761100851303172, "language_loss": 0.78042543, "learning_rate": 2.9274295640619946e-06, "loss": 0.80349094, "num_input_tokens_seen": 65351670, "step": 3045, "time_per_iteration": 2.768232822418213 }, { "auxiliary_loss_clip": 0.01185247, "auxiliary_loss_mlp": 0.01029073, "balance_loss_clip": 0.93877035, "balance_loss_mlp": 1.02168226, "epoch": 0.36625984488667107, "flos": 19755609540480.0, "grad_norm": 1.7232678525412723, "language_loss": 0.78679931, "learning_rate": 2.9267393368995103e-06, "loss": 0.8089425, "num_input_tokens_seen": 65370900, "step": 3046, "time_per_iteration": 2.8014180660247803 }, { "auxiliary_loss_clip": 0.01195499, "auxiliary_loss_mlp": 0.01030959, "balance_loss_clip": 1.05848455, "balance_loss_mlp": 1.02301967, "epoch": 0.3663800877773102, "flos": 17674262939520.0, "grad_norm": 2.649058679694957, "language_loss": 0.74347788, "learning_rate": 2.926048969151407e-06, "loss": 0.76574248, "num_input_tokens_seen": 65388185, "step": 3047, "time_per_iteration": 2.6964752674102783 }, { "auxiliary_loss_clip": 0.01183426, "auxiliary_loss_mlp": 0.01026892, "balance_loss_clip": 0.90521836, "balance_loss_mlp": 1.01870275, "epoch": 0.36650033066794924, "flos": 20303606407680.0, "grad_norm": 1.665319673448977, "language_loss": 0.68611974, "learning_rate": 2.92535846092241e-06, "loss": 0.70822293, "num_input_tokens_seen": 65407200, "step": 3048, "time_per_iteration": 2.81390118598938 }, { "auxiliary_loss_clip": 0.01193753, "auxiliary_loss_mlp": 0.01032257, "balance_loss_clip": 0.98201984, "balance_loss_mlp": 1.02417469, "epoch": 0.36662057355858835, "flos": 24716237420160.0, "grad_norm": 1.6122119489155116, "language_loss": 0.82807118, "learning_rate": 2.9246678123172704e-06, "loss": 0.85033131, "num_input_tokens_seen": 65427290, "step": 3049, "time_per_iteration": 2.787149429321289 }, { "auxiliary_loss_clip": 0.011942, "auxiliary_loss_mlp": 0.01040181, "balance_loss_clip": 1.05611968, "balance_loss_mlp": 1.03141332, "epoch": 0.36674081644922746, "flos": 12385267902720.0, "grad_norm": 2.0859550370769266, "language_loss": 0.7377646, "learning_rate": 2.9239770234407596e-06, "loss": 0.76010847, "num_input_tokens_seen": 65445595, "step": 3050, "time_per_iteration": 2.550755023956299 }, { "auxiliary_loss_clip": 0.01194151, "auxiliary_loss_mlp": 0.01028475, "balance_loss_clip": 1.01926851, "balance_loss_mlp": 1.02023137, "epoch": 0.3668610593398665, "flos": 21105922544640.0, "grad_norm": 1.6861129873707221, "language_loss": 0.6826973, "learning_rate": 2.9232860943976686e-06, "loss": 0.70492363, "num_input_tokens_seen": 65466330, "step": 3051, "time_per_iteration": 2.716369390487671 }, { "auxiliary_loss_clip": 0.0119113, "auxiliary_loss_mlp": 0.010343, "balance_loss_clip": 0.98146921, "balance_loss_mlp": 1.02708781, "epoch": 0.3669813022305056, "flos": 26758082039040.0, "grad_norm": 1.578705819173538, "language_loss": 0.84134412, "learning_rate": 2.9225950252928115e-06, "loss": 0.86359841, "num_input_tokens_seen": 65487180, "step": 3052, "time_per_iteration": 2.7357680797576904 }, { "auxiliary_loss_clip": 0.01192027, "auxiliary_loss_mlp": 0.01032455, "balance_loss_clip": 1.0184983, "balance_loss_mlp": 1.02406287, "epoch": 0.36710154512114473, "flos": 19099521671040.0, "grad_norm": 2.1824457119791143, "language_loss": 0.82361567, "learning_rate": 2.9219038162310217e-06, "loss": 0.84586048, "num_input_tokens_seen": 65505380, "step": 3053, "time_per_iteration": 2.6025969982147217 }, { "auxiliary_loss_clip": 0.01186381, "auxiliary_loss_mlp": 0.01125049, "balance_loss_clip": 0.82514727, "balance_loss_mlp": 0.0, "epoch": 0.3672217880117838, "flos": 20812029465600.0, "grad_norm": 4.849006586109743, "language_loss": 0.82557213, "learning_rate": 2.921212467317157e-06, "loss": 0.8486864, "num_input_tokens_seen": 65524825, "step": 3054, "time_per_iteration": 2.877042770385742 }, { "auxiliary_loss_clip": 0.01172235, "auxiliary_loss_mlp": 0.01030187, "balance_loss_clip": 0.97409564, "balance_loss_mlp": 1.02179503, "epoch": 0.3673420309024229, "flos": 13590394133760.0, "grad_norm": 1.935420458156833, "language_loss": 0.80152279, "learning_rate": 2.920520978656093e-06, "loss": 0.82354707, "num_input_tokens_seen": 65541790, "step": 3055, "time_per_iteration": 2.8850274085998535 }, { "auxiliary_loss_clip": 0.01189729, "auxiliary_loss_mlp": 0.01124286, "balance_loss_clip": 1.05581141, "balance_loss_mlp": 0.0, "epoch": 0.367462273793062, "flos": 28986877969920.0, "grad_norm": 1.7618090365000294, "language_loss": 0.76958722, "learning_rate": 2.919829350352729e-06, "loss": 0.79272735, "num_input_tokens_seen": 65563395, "step": 3056, "time_per_iteration": 2.9655380249023438 }, { "auxiliary_loss_clip": 0.0109788, "auxiliary_loss_mlp": 0.010058, "balance_loss_clip": 1.02786827, "balance_loss_mlp": 1.00336838, "epoch": 0.36758251668370107, "flos": 62643148346880.0, "grad_norm": 0.7560404238355903, "language_loss": 0.60075045, "learning_rate": 2.919137582511983e-06, "loss": 0.62178725, "num_input_tokens_seen": 65619835, "step": 3057, "time_per_iteration": 3.2212774753570557 }, { "auxiliary_loss_clip": 0.01196638, "auxiliary_loss_mlp": 0.01034609, "balance_loss_clip": 0.94462848, "balance_loss_mlp": 1.02652061, "epoch": 0.3677027595743402, "flos": 12713886455040.0, "grad_norm": 1.8371523334849433, "language_loss": 0.63787502, "learning_rate": 2.918445675238797e-06, "loss": 0.66018748, "num_input_tokens_seen": 65636760, "step": 3058, "time_per_iteration": 3.8090691566467285 }, { "auxiliary_loss_clip": 0.01191825, "auxiliary_loss_mlp": 0.01026411, "balance_loss_clip": 1.05490601, "balance_loss_mlp": 1.01820374, "epoch": 0.36782300246497923, "flos": 25046579825280.0, "grad_norm": 2.077620140604334, "language_loss": 0.69639397, "learning_rate": 2.917753628638132e-06, "loss": 0.71857631, "num_input_tokens_seen": 65657065, "step": 3059, "time_per_iteration": 2.650858163833618 }, { "auxiliary_loss_clip": 0.01189319, "auxiliary_loss_mlp": 0.010348, "balance_loss_clip": 0.97950029, "balance_loss_mlp": 1.02673578, "epoch": 0.36794324535561834, "flos": 17419512706560.0, "grad_norm": 1.9398886906044952, "language_loss": 0.70671427, "learning_rate": 2.9170614428149716e-06, "loss": 0.72895545, "num_input_tokens_seen": 65675400, "step": 3060, "time_per_iteration": 2.757361888885498 }, { "auxiliary_loss_clip": 0.01181927, "auxiliary_loss_mlp": 0.01029397, "balance_loss_clip": 0.94181907, "balance_loss_mlp": 1.02082014, "epoch": 0.36806348824625745, "flos": 24089128848000.0, "grad_norm": 2.3672556654940418, "language_loss": 0.86693549, "learning_rate": 2.9163691178743195e-06, "loss": 0.8890487, "num_input_tokens_seen": 65694050, "step": 3061, "time_per_iteration": 3.6829121112823486 }, { "auxiliary_loss_clip": 0.01186897, "auxiliary_loss_mlp": 0.01032537, "balance_loss_clip": 1.01708531, "balance_loss_mlp": 1.024544, "epoch": 0.3681837311368965, "flos": 20521871400960.0, "grad_norm": 2.0560215393290293, "language_loss": 0.77492481, "learning_rate": 2.9156766539212006e-06, "loss": 0.79711914, "num_input_tokens_seen": 65711695, "step": 3062, "time_per_iteration": 2.650960922241211 }, { "auxiliary_loss_clip": 0.01193754, "auxiliary_loss_mlp": 0.01033757, "balance_loss_clip": 1.01784778, "balance_loss_mlp": 1.02579403, "epoch": 0.3683039740275356, "flos": 21466644877440.0, "grad_norm": 1.8342483216749559, "language_loss": 0.71741962, "learning_rate": 2.9149840510606614e-06, "loss": 0.73969471, "num_input_tokens_seen": 65730350, "step": 3063, "time_per_iteration": 2.6538379192352295 }, { "auxiliary_loss_clip": 0.01096375, "auxiliary_loss_mlp": 0.011197, "balance_loss_clip": 0.99052, "balance_loss_mlp": 0.0, "epoch": 0.36842421691817473, "flos": 70380999987840.0, "grad_norm": 1.0448026367546828, "language_loss": 0.64240777, "learning_rate": 2.914291309397769e-06, "loss": 0.66456854, "num_input_tokens_seen": 65787820, "step": 3064, "time_per_iteration": 3.340041160583496 }, { "auxiliary_loss_clip": 0.01168787, "auxiliary_loss_mlp": 0.01030129, "balance_loss_clip": 0.86080915, "balance_loss_mlp": 1.02164161, "epoch": 0.3685444598088138, "flos": 23331378510720.0, "grad_norm": 2.452229004537399, "language_loss": 0.78299546, "learning_rate": 2.9135984290376117e-06, "loss": 0.80498457, "num_input_tokens_seen": 65806685, "step": 3065, "time_per_iteration": 2.945021629333496 }, { "auxiliary_loss_clip": 0.01180194, "auxiliary_loss_mlp": 0.01025284, "balance_loss_clip": 0.86044312, "balance_loss_mlp": 1.01771402, "epoch": 0.3686647026994529, "flos": 23070271570560.0, "grad_norm": 1.6109705153635812, "language_loss": 0.82643461, "learning_rate": 2.9129054100853e-06, "loss": 0.8484894, "num_input_tokens_seen": 65825525, "step": 3066, "time_per_iteration": 4.71534276008606 }, { "auxiliary_loss_clip": 0.01190055, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 0.97845137, "balance_loss_mlp": 1.02544606, "epoch": 0.368784945590092, "flos": 25119909440640.0, "grad_norm": 1.6390779081320288, "language_loss": 0.76042479, "learning_rate": 2.912212252645963e-06, "loss": 0.7826606, "num_input_tokens_seen": 65848110, "step": 3067, "time_per_iteration": 2.745900869369507 }, { "auxiliary_loss_clip": 0.01199653, "auxiliary_loss_mlp": 0.01041145, "balance_loss_clip": 1.02024531, "balance_loss_mlp": 1.03225183, "epoch": 0.36890518848073106, "flos": 18442284566400.0, "grad_norm": 2.0309655431945584, "language_loss": 0.76527274, "learning_rate": 2.9115189568247523e-06, "loss": 0.78768075, "num_input_tokens_seen": 65865670, "step": 3068, "time_per_iteration": 2.633462429046631 }, { "auxiliary_loss_clip": 0.01180181, "auxiliary_loss_mlp": 0.01031202, "balance_loss_clip": 0.9071753, "balance_loss_mlp": 1.02372801, "epoch": 0.36902543137137017, "flos": 16362446336640.0, "grad_norm": 2.0579116577904446, "language_loss": 0.92465842, "learning_rate": 2.910825522726841e-06, "loss": 0.94677234, "num_input_tokens_seen": 65883195, "step": 3069, "time_per_iteration": 2.751129627227783 }, { "auxiliary_loss_clip": 0.01181337, "auxiliary_loss_mlp": 0.01027376, "balance_loss_clip": 0.90199167, "balance_loss_mlp": 1.01992536, "epoch": 0.3691456742620093, "flos": 12275596702080.0, "grad_norm": 1.8654249836776824, "language_loss": 0.77030361, "learning_rate": 2.9101319504574215e-06, "loss": 0.7923907, "num_input_tokens_seen": 65899635, "step": 3070, "time_per_iteration": 2.737323522567749 }, { "auxiliary_loss_clip": 0.01180907, "auxiliary_loss_mlp": 0.01034385, "balance_loss_clip": 1.01429987, "balance_loss_mlp": 1.02624846, "epoch": 0.36926591715264834, "flos": 17786412178560.0, "grad_norm": 1.74359312889328, "language_loss": 0.7635752, "learning_rate": 2.909438240121709e-06, "loss": 0.7857281, "num_input_tokens_seen": 65919910, "step": 3071, "time_per_iteration": 2.7106337547302246 }, { "auxiliary_loss_clip": 0.01183916, "auxiliary_loss_mlp": 0.01029791, "balance_loss_clip": 0.98166907, "balance_loss_mlp": 1.02153563, "epoch": 0.36938616004328745, "flos": 28948309741440.0, "grad_norm": 2.226696804875745, "language_loss": 0.70472145, "learning_rate": 2.908744391824939e-06, "loss": 0.72685856, "num_input_tokens_seen": 65940930, "step": 3072, "time_per_iteration": 2.733063220977783 }, { "auxiliary_loss_clip": 0.01185907, "auxiliary_loss_mlp": 0.01029439, "balance_loss_clip": 0.86119223, "balance_loss_mlp": 1.02064705, "epoch": 0.36950640293392656, "flos": 29205394358400.0, "grad_norm": 1.9735883433087484, "language_loss": 0.78979647, "learning_rate": 2.908050405672367e-06, "loss": 0.81194991, "num_input_tokens_seen": 65960475, "step": 3073, "time_per_iteration": 2.8347015380859375 }, { "auxiliary_loss_clip": 0.01193385, "auxiliary_loss_mlp": 0.01024058, "balance_loss_clip": 0.97585821, "balance_loss_mlp": 1.01631522, "epoch": 0.3696266458245656, "flos": 24827776128000.0, "grad_norm": 1.694639553824735, "language_loss": 0.79399848, "learning_rate": 2.9073562817692703e-06, "loss": 0.8161729, "num_input_tokens_seen": 65979160, "step": 3074, "time_per_iteration": 2.7115917205810547 }, { "auxiliary_loss_clip": 0.01112778, "auxiliary_loss_mlp": 0.01000484, "balance_loss_clip": 0.88292015, "balance_loss_mlp": 0.99783748, "epoch": 0.3697468887152047, "flos": 59887257264000.0, "grad_norm": 0.7169879062152581, "language_loss": 0.56537819, "learning_rate": 2.9066620202209468e-06, "loss": 0.58651078, "num_input_tokens_seen": 66041650, "step": 3075, "time_per_iteration": 3.306030750274658 }, { "auxiliary_loss_clip": 0.01177841, "auxiliary_loss_mlp": 0.0102959, "balance_loss_clip": 0.94059914, "balance_loss_mlp": 1.02187681, "epoch": 0.3698671316058438, "flos": 26137581569280.0, "grad_norm": 5.58630347167216, "language_loss": 0.77387512, "learning_rate": 2.905967621132716e-06, "loss": 0.79594946, "num_input_tokens_seen": 66059260, "step": 3076, "time_per_iteration": 2.797353982925415 }, { "auxiliary_loss_clip": 0.01192659, "auxiliary_loss_mlp": 0.01029801, "balance_loss_clip": 0.97678626, "balance_loss_mlp": 1.02093172, "epoch": 0.3699873744964829, "flos": 24607464059520.0, "grad_norm": 2.065921754587545, "language_loss": 0.74951494, "learning_rate": 2.9052730846099172e-06, "loss": 0.77173954, "num_input_tokens_seen": 66080605, "step": 3077, "time_per_iteration": 2.706639528274536 }, { "auxiliary_loss_clip": 0.01101889, "auxiliary_loss_mlp": 0.010023, "balance_loss_clip": 0.9545511, "balance_loss_mlp": 0.99974841, "epoch": 0.370107617387122, "flos": 64885340050560.0, "grad_norm": 0.8592957139551254, "language_loss": 0.60867155, "learning_rate": 2.9045784107579123e-06, "loss": 0.62971342, "num_input_tokens_seen": 66140710, "step": 3078, "time_per_iteration": 3.2901391983032227 }, { "auxiliary_loss_clip": 0.01192508, "auxiliary_loss_mlp": 0.0103111, "balance_loss_clip": 1.0561161, "balance_loss_mlp": 1.0230037, "epoch": 0.37022786027776106, "flos": 15961683317760.0, "grad_norm": 1.7176013596611521, "language_loss": 0.66771722, "learning_rate": 2.9038835996820807e-06, "loss": 0.68995339, "num_input_tokens_seen": 66158320, "step": 3079, "time_per_iteration": 2.5982024669647217 }, { "auxiliary_loss_clip": 0.01189181, "auxiliary_loss_mlp": 0.01024116, "balance_loss_clip": 0.93678594, "balance_loss_mlp": 1.01656413, "epoch": 0.37034810316840017, "flos": 18546927863040.0, "grad_norm": 2.0535514488872666, "language_loss": 0.79511297, "learning_rate": 2.903188651487826e-06, "loss": 0.81724596, "num_input_tokens_seen": 66176875, "step": 3080, "time_per_iteration": 2.7764198780059814 }, { "auxiliary_loss_clip": 0.01194825, "auxiliary_loss_mlp": 0.01032452, "balance_loss_clip": 1.01926017, "balance_loss_mlp": 1.02444744, "epoch": 0.3704683460590393, "flos": 17821927751040.0, "grad_norm": 1.9798521687825275, "language_loss": 0.86384416, "learning_rate": 2.902493566280571e-06, "loss": 0.88611692, "num_input_tokens_seen": 66194980, "step": 3081, "time_per_iteration": 2.727426052093506 }, { "auxiliary_loss_clip": 0.01188901, "auxiliary_loss_mlp": 0.01030184, "balance_loss_clip": 0.97983277, "balance_loss_mlp": 1.02182162, "epoch": 0.37058858894967833, "flos": 14134081368960.0, "grad_norm": 2.4576037905450088, "language_loss": 0.81213558, "learning_rate": 2.9017983441657595e-06, "loss": 0.83432639, "num_input_tokens_seen": 66212310, "step": 3082, "time_per_iteration": 2.6871182918548584 }, { "auxiliary_loss_clip": 0.01187521, "auxiliary_loss_mlp": 0.01030772, "balance_loss_clip": 0.90038848, "balance_loss_mlp": 1.0231725, "epoch": 0.37070883184031744, "flos": 13954492344960.0, "grad_norm": 2.166241864791019, "language_loss": 0.75459456, "learning_rate": 2.9011029852488564e-06, "loss": 0.77677751, "num_input_tokens_seen": 66229545, "step": 3083, "time_per_iteration": 2.761700391769409 }, { "auxiliary_loss_clip": 0.01097697, "auxiliary_loss_mlp": 0.01004312, "balance_loss_clip": 1.02822042, "balance_loss_mlp": 1.00189233, "epoch": 0.37082907473095655, "flos": 52315419306240.0, "grad_norm": 0.9766903489581491, "language_loss": 0.62455153, "learning_rate": 2.9004074896353465e-06, "loss": 0.64557171, "num_input_tokens_seen": 66283545, "step": 3084, "time_per_iteration": 4.084162712097168 }, { "auxiliary_loss_clip": 0.01192872, "auxiliary_loss_mlp": 0.01029018, "balance_loss_clip": 1.0591495, "balance_loss_mlp": 1.02122188, "epoch": 0.3709493176215956, "flos": 15998096730240.0, "grad_norm": 1.7867853700642637, "language_loss": 0.81652266, "learning_rate": 2.8997118574307362e-06, "loss": 0.83874154, "num_input_tokens_seen": 66300500, "step": 3085, "time_per_iteration": 2.6024351119995117 }, { "auxiliary_loss_clip": 0.01199513, "auxiliary_loss_mlp": 0.01027472, "balance_loss_clip": 0.9445461, "balance_loss_mlp": 1.01910341, "epoch": 0.3710695605122347, "flos": 20959837931520.0, "grad_norm": 2.009795531299865, "language_loss": 0.74475849, "learning_rate": 2.899016088740553e-06, "loss": 0.76702833, "num_input_tokens_seen": 66318610, "step": 3086, "time_per_iteration": 2.6956582069396973 }, { "auxiliary_loss_clip": 0.01187108, "auxiliary_loss_mlp": 0.01032726, "balance_loss_clip": 0.90179604, "balance_loss_mlp": 1.02507889, "epoch": 0.37118980340287383, "flos": 14355578586240.0, "grad_norm": 1.937398064011089, "language_loss": 0.79115695, "learning_rate": 2.898320183670344e-06, "loss": 0.81335521, "num_input_tokens_seen": 66336025, "step": 3087, "time_per_iteration": 3.614084005355835 }, { "auxiliary_loss_clip": 0.01184271, "auxiliary_loss_mlp": 0.01033896, "balance_loss_clip": 0.903965, "balance_loss_mlp": 1.02481246, "epoch": 0.3713100462935129, "flos": 25885381201920.0, "grad_norm": 1.664972890561432, "language_loss": 0.88514161, "learning_rate": 2.8976241423256767e-06, "loss": 0.90732336, "num_input_tokens_seen": 66356120, "step": 3088, "time_per_iteration": 2.784017324447632 }, { "auxiliary_loss_clip": 0.01184906, "auxiliary_loss_mlp": 0.01025247, "balance_loss_clip": 0.98095983, "balance_loss_mlp": 1.01777887, "epoch": 0.371430289184152, "flos": 30518934814080.0, "grad_norm": 2.1777162972554422, "language_loss": 0.68022245, "learning_rate": 2.896927964812142e-06, "loss": 0.70232391, "num_input_tokens_seen": 66376685, "step": 3089, "time_per_iteration": 2.74819278717041 }, { "auxiliary_loss_clip": 0.01193607, "auxiliary_loss_mlp": 0.01029756, "balance_loss_clip": 0.98518836, "balance_loss_mlp": 1.0216974, "epoch": 0.37155053207479105, "flos": 15742233175680.0, "grad_norm": 2.477537610499848, "language_loss": 0.74973971, "learning_rate": 2.8962316512353465e-06, "loss": 0.77197337, "num_input_tokens_seen": 66394230, "step": 3090, "time_per_iteration": 2.7221155166625977 }, { "auxiliary_loss_clip": 0.0118092, "auxiliary_loss_mlp": 0.01024681, "balance_loss_clip": 0.86294872, "balance_loss_mlp": 1.01672387, "epoch": 0.37167077496543016, "flos": 23404061681280.0, "grad_norm": 1.7378186008405037, "language_loss": 0.74846256, "learning_rate": 2.8955352017009233e-06, "loss": 0.7705186, "num_input_tokens_seen": 66413475, "step": 3091, "time_per_iteration": 2.8326406478881836 }, { "auxiliary_loss_clip": 0.01189225, "auxiliary_loss_mlp": 0.01030652, "balance_loss_clip": 0.98098314, "balance_loss_mlp": 1.02255201, "epoch": 0.3717910178560693, "flos": 22088653718400.0, "grad_norm": 1.941399732231767, "language_loss": 0.77328074, "learning_rate": 2.8948386163145212e-06, "loss": 0.79547954, "num_input_tokens_seen": 66432685, "step": 3092, "time_per_iteration": 3.679802894592285 }, { "auxiliary_loss_clip": 0.01196663, "auxiliary_loss_mlp": 0.01033576, "balance_loss_clip": 1.0193218, "balance_loss_mlp": 1.02557743, "epoch": 0.3719112607467083, "flos": 26939969533440.0, "grad_norm": 1.8731621170124404, "language_loss": 0.79218596, "learning_rate": 2.8941418951818135e-06, "loss": 0.81448829, "num_input_tokens_seen": 66452245, "step": 3093, "time_per_iteration": 3.640514850616455 }, { "auxiliary_loss_clip": 0.0118766, "auxiliary_loss_mlp": 0.01028015, "balance_loss_clip": 0.93981564, "balance_loss_mlp": 1.02029014, "epoch": 0.37203150363734744, "flos": 12166500119040.0, "grad_norm": 2.2032743205481435, "language_loss": 0.71262538, "learning_rate": 2.8934450384084903e-06, "loss": 0.7347821, "num_input_tokens_seen": 66469760, "step": 3094, "time_per_iteration": 2.690366268157959 }, { "auxiliary_loss_clip": 0.01180334, "auxiliary_loss_mlp": 0.0103198, "balance_loss_clip": 0.97691977, "balance_loss_mlp": 1.02350438, "epoch": 0.37215174652798655, "flos": 23697595624320.0, "grad_norm": 1.8787713255518135, "language_loss": 0.69533646, "learning_rate": 2.8927480461002653e-06, "loss": 0.71745962, "num_input_tokens_seen": 66489730, "step": 3095, "time_per_iteration": 2.7499961853027344 }, { "auxiliary_loss_clip": 0.01187088, "auxiliary_loss_mlp": 0.01033584, "balance_loss_clip": 0.97633457, "balance_loss_mlp": 1.0249474, "epoch": 0.3722719894186256, "flos": 17887751424000.0, "grad_norm": 2.266729178311097, "language_loss": 0.85609031, "learning_rate": 2.892050918362872e-06, "loss": 0.87829709, "num_input_tokens_seen": 66504785, "step": 3096, "time_per_iteration": 2.6432952880859375 }, { "auxiliary_loss_clip": 0.01104405, "auxiliary_loss_mlp": 0.01002051, "balance_loss_clip": 0.80172396, "balance_loss_mlp": 0.99950045, "epoch": 0.3723922323092647, "flos": 62419891363200.0, "grad_norm": 0.8799268369430315, "language_loss": 0.55917954, "learning_rate": 2.8913536553020626e-06, "loss": 0.58024406, "num_input_tokens_seen": 66558840, "step": 3097, "time_per_iteration": 3.562037467956543 }, { "auxiliary_loss_clip": 0.01174426, "auxiliary_loss_mlp": 0.0103151, "balance_loss_clip": 0.89788288, "balance_loss_mlp": 1.02362728, "epoch": 0.3725124751999038, "flos": 23039747988480.0, "grad_norm": 1.9399493502680265, "language_loss": 0.84644306, "learning_rate": 2.8906562570236137e-06, "loss": 0.86850244, "num_input_tokens_seen": 66576750, "step": 3098, "time_per_iteration": 3.207641363143921 }, { "auxiliary_loss_clip": 0.01178863, "auxiliary_loss_mlp": 0.01031607, "balance_loss_clip": 0.85904825, "balance_loss_mlp": 1.02397776, "epoch": 0.3726327180905429, "flos": 20920551431040.0, "grad_norm": 1.50652719425137, "language_loss": 0.76168287, "learning_rate": 2.889958723633318e-06, "loss": 0.78378755, "num_input_tokens_seen": 66595690, "step": 3099, "time_per_iteration": 2.817821502685547 }, { "auxiliary_loss_clip": 0.01190328, "auxiliary_loss_mlp": 0.01025951, "balance_loss_clip": 0.9423784, "balance_loss_mlp": 1.01851845, "epoch": 0.372752960981182, "flos": 30592156688640.0, "grad_norm": 1.5566932930519266, "language_loss": 0.73946583, "learning_rate": 2.889261055236992e-06, "loss": 0.76162863, "num_input_tokens_seen": 66617905, "step": 3100, "time_per_iteration": 2.782031536102295 }, { "auxiliary_loss_clip": 0.01189345, "auxiliary_loss_mlp": 0.01026246, "balance_loss_clip": 0.98240829, "balance_loss_mlp": 1.0183661, "epoch": 0.3728732038718211, "flos": 25116749043840.0, "grad_norm": 1.6803435942144778, "language_loss": 0.82753253, "learning_rate": 2.8885632519404704e-06, "loss": 0.84968841, "num_input_tokens_seen": 66638175, "step": 3101, "time_per_iteration": 3.4188671112060547 }, { "auxiliary_loss_clip": 0.01185698, "auxiliary_loss_mlp": 0.01028165, "balance_loss_clip": 0.97900349, "balance_loss_mlp": 1.02043986, "epoch": 0.37299344676246016, "flos": 25302048330240.0, "grad_norm": 2.025133334061084, "language_loss": 0.75833333, "learning_rate": 2.8878653138496107e-06, "loss": 0.78047192, "num_input_tokens_seen": 66658670, "step": 3102, "time_per_iteration": 2.8703441619873047 }, { "auxiliary_loss_clip": 0.01180068, "auxiliary_loss_mlp": 0.01029711, "balance_loss_clip": 0.86024058, "balance_loss_mlp": 1.02159238, "epoch": 0.37311368965309927, "flos": 23842531002240.0, "grad_norm": 2.1341510829655697, "language_loss": 0.76357436, "learning_rate": 2.8871672410702878e-06, "loss": 0.78567219, "num_input_tokens_seen": 66676030, "step": 3103, "time_per_iteration": 2.7995529174804688 }, { "auxiliary_loss_clip": 0.01195429, "auxiliary_loss_mlp": 0.01028639, "balance_loss_clip": 0.94002277, "balance_loss_mlp": 1.02016902, "epoch": 0.3732339325437384, "flos": 25811943845760.0, "grad_norm": 1.7049250614428764, "language_loss": 0.81948781, "learning_rate": 2.8864690337084008e-06, "loss": 0.84172857, "num_input_tokens_seen": 66695305, "step": 3104, "time_per_iteration": 2.673051118850708 }, { "auxiliary_loss_clip": 0.01187073, "auxiliary_loss_mlp": 0.01027834, "balance_loss_clip": 1.0189141, "balance_loss_mlp": 1.02028775, "epoch": 0.37335417543437743, "flos": 26208433146240.0, "grad_norm": 1.7175729699417823, "language_loss": 0.77944362, "learning_rate": 2.885770691869866e-06, "loss": 0.80159271, "num_input_tokens_seen": 66716185, "step": 3105, "time_per_iteration": 2.6233532428741455 }, { "auxiliary_loss_clip": 0.01186046, "auxiliary_loss_mlp": 0.01029901, "balance_loss_clip": 1.01888955, "balance_loss_mlp": 1.02240229, "epoch": 0.37347441832501654, "flos": 24023879792640.0, "grad_norm": 2.1040264530927035, "language_loss": 0.74149895, "learning_rate": 2.8850722156606207e-06, "loss": 0.7636584, "num_input_tokens_seen": 66734575, "step": 3106, "time_per_iteration": 2.5501067638397217 }, { "auxiliary_loss_clip": 0.0117952, "auxiliary_loss_mlp": 0.01023489, "balance_loss_clip": 1.01536298, "balance_loss_mlp": 1.01600909, "epoch": 0.3735946612156556, "flos": 19714922409600.0, "grad_norm": 1.8312228163131195, "language_loss": 0.66910601, "learning_rate": 2.8843736051866252e-06, "loss": 0.69113612, "num_input_tokens_seen": 66753500, "step": 3107, "time_per_iteration": 2.542752742767334 }, { "auxiliary_loss_clip": 0.01184349, "auxiliary_loss_mlp": 0.01123725, "balance_loss_clip": 0.90344644, "balance_loss_mlp": 0.0, "epoch": 0.3737149041062947, "flos": 23039604334080.0, "grad_norm": 1.628194336622218, "language_loss": 0.69371867, "learning_rate": 2.8836748605538557e-06, "loss": 0.71679938, "num_input_tokens_seen": 66775140, "step": 3108, "time_per_iteration": 2.688891649246216 }, { "auxiliary_loss_clip": 0.01190672, "auxiliary_loss_mlp": 0.01028727, "balance_loss_clip": 0.97814989, "balance_loss_mlp": 1.01975679, "epoch": 0.3738351469969338, "flos": 34678108483200.0, "grad_norm": 2.070116956671189, "language_loss": 0.63680857, "learning_rate": 2.882975981868313e-06, "loss": 0.65900254, "num_input_tokens_seen": 66795525, "step": 3109, "time_per_iteration": 2.794471502304077 }, { "auxiliary_loss_clip": 0.0119002, "auxiliary_loss_mlp": 0.0103285, "balance_loss_clip": 1.019333, "balance_loss_mlp": 1.02491665, "epoch": 0.3739553898875729, "flos": 43507967448960.0, "grad_norm": 4.21600486233936, "language_loss": 0.68888342, "learning_rate": 2.882276969236016e-06, "loss": 0.71111214, "num_input_tokens_seen": 66816885, "step": 3110, "time_per_iteration": 3.8192460536956787 }, { "auxiliary_loss_clip": 0.0118335, "auxiliary_loss_mlp": 0.01029964, "balance_loss_clip": 0.97930491, "balance_loss_mlp": 1.02189302, "epoch": 0.374075632778212, "flos": 12856487448960.0, "grad_norm": 1.973145540479078, "language_loss": 0.7655074, "learning_rate": 2.881577822763005e-06, "loss": 0.78764057, "num_input_tokens_seen": 66834835, "step": 3111, "time_per_iteration": 2.853579044342041 }, { "auxiliary_loss_clip": 0.01188861, "auxiliary_loss_mlp": 0.01023499, "balance_loss_clip": 1.0164659, "balance_loss_mlp": 1.01536846, "epoch": 0.3741958756688511, "flos": 26024031699840.0, "grad_norm": 1.7714235993408396, "language_loss": 0.87230432, "learning_rate": 2.880878542555338e-06, "loss": 0.89442796, "num_input_tokens_seen": 66852600, "step": 3112, "time_per_iteration": 3.720607280731201 }, { "auxiliary_loss_clip": 0.01193961, "auxiliary_loss_mlp": 0.0103278, "balance_loss_clip": 1.05726957, "balance_loss_mlp": 1.02434611, "epoch": 0.37431611855949015, "flos": 21433894652160.0, "grad_norm": 2.0868279408509482, "language_loss": 0.80209559, "learning_rate": 2.8801791287190976e-06, "loss": 0.82436299, "num_input_tokens_seen": 66870595, "step": 3113, "time_per_iteration": 2.5926430225372314 }, { "auxiliary_loss_clip": 0.01195346, "auxiliary_loss_mlp": 0.01028065, "balance_loss_clip": 1.02011204, "balance_loss_mlp": 1.0197506, "epoch": 0.37443636145012926, "flos": 24207096090240.0, "grad_norm": 2.5253648998636984, "language_loss": 0.85995209, "learning_rate": 2.8794795813603817e-06, "loss": 0.88218617, "num_input_tokens_seen": 66886060, "step": 3114, "time_per_iteration": 2.602541208267212 }, { "auxiliary_loss_clip": 0.01193892, "auxiliary_loss_mlp": 0.01027156, "balance_loss_clip": 1.01595569, "balance_loss_mlp": 1.01903224, "epoch": 0.3745566043407684, "flos": 15378601841280.0, "grad_norm": 1.7473309261174979, "language_loss": 0.81709868, "learning_rate": 2.878779900585314e-06, "loss": 0.83930916, "num_input_tokens_seen": 66903900, "step": 3115, "time_per_iteration": 2.534945011138916 }, { "auxiliary_loss_clip": 0.0119474, "auxiliary_loss_mlp": 0.01027925, "balance_loss_clip": 0.98107934, "balance_loss_mlp": 1.01955628, "epoch": 0.37467684723140743, "flos": 24608218245120.0, "grad_norm": 1.5407588991685244, "language_loss": 0.74947977, "learning_rate": 2.8780800865000336e-06, "loss": 0.7717064, "num_input_tokens_seen": 66925210, "step": 3116, "time_per_iteration": 2.731348752975464 }, { "auxiliary_loss_clip": 0.01096414, "auxiliary_loss_mlp": 0.01004006, "balance_loss_clip": 0.98845828, "balance_loss_mlp": 1.00165749, "epoch": 0.37479709012204654, "flos": 64377491610240.0, "grad_norm": 0.9792655390816026, "language_loss": 0.59241712, "learning_rate": 2.877380139210702e-06, "loss": 0.61342132, "num_input_tokens_seen": 66983880, "step": 3117, "time_per_iteration": 4.297847509384155 }, { "auxiliary_loss_clip": 0.01192746, "auxiliary_loss_mlp": 0.0103347, "balance_loss_clip": 0.9415307, "balance_loss_mlp": 1.02489936, "epoch": 0.37491733301268565, "flos": 23803962773760.0, "grad_norm": 1.7722824460549318, "language_loss": 0.76221216, "learning_rate": 2.876680058823501e-06, "loss": 0.78447437, "num_input_tokens_seen": 67004280, "step": 3118, "time_per_iteration": 2.8877084255218506 }, { "auxiliary_loss_clip": 0.01176763, "auxiliary_loss_mlp": 0.01027717, "balance_loss_clip": 0.97645378, "balance_loss_mlp": 1.01922345, "epoch": 0.3750375759033247, "flos": 32160950167680.0, "grad_norm": 1.777367658614394, "language_loss": 0.6593014, "learning_rate": 2.8759798454446314e-06, "loss": 0.68134618, "num_input_tokens_seen": 67027445, "step": 3119, "time_per_iteration": 3.6723198890686035 }, { "auxiliary_loss_clip": 0.01193885, "auxiliary_loss_mlp": 0.01029885, "balance_loss_clip": 1.01799297, "balance_loss_mlp": 1.02208614, "epoch": 0.3751578187939638, "flos": 23367791923200.0, "grad_norm": 1.9172375999342541, "language_loss": 0.81296366, "learning_rate": 2.8752794991803173e-06, "loss": 0.83520138, "num_input_tokens_seen": 67045130, "step": 3120, "time_per_iteration": 2.7343382835388184 }, { "auxiliary_loss_clip": 0.01184963, "auxiliary_loss_mlp": 0.01030792, "balance_loss_clip": 0.97947264, "balance_loss_mlp": 1.02294791, "epoch": 0.37527806168460287, "flos": 14605731878400.0, "grad_norm": 3.0292311751725447, "language_loss": 0.75014621, "learning_rate": 2.8745790201367976e-06, "loss": 0.77230376, "num_input_tokens_seen": 67060885, "step": 3121, "time_per_iteration": 2.6738827228546143 }, { "auxiliary_loss_clip": 0.01194666, "auxiliary_loss_mlp": 0.0103235, "balance_loss_clip": 1.05773306, "balance_loss_mlp": 1.02410054, "epoch": 0.375398304575242, "flos": 26390823431040.0, "grad_norm": 2.358696797707724, "language_loss": 0.84048349, "learning_rate": 2.8738784084203373e-06, "loss": 0.86275369, "num_input_tokens_seen": 67080960, "step": 3122, "time_per_iteration": 2.74772047996521 }, { "auxiliary_loss_clip": 0.01175854, "auxiliary_loss_mlp": 0.01030768, "balance_loss_clip": 0.97373199, "balance_loss_mlp": 1.02304292, "epoch": 0.3755185474658811, "flos": 22236605838720.0, "grad_norm": 1.7556697103035925, "language_loss": 0.78531939, "learning_rate": 2.873177664137216e-06, "loss": 0.80738562, "num_input_tokens_seen": 67101890, "step": 3123, "time_per_iteration": 2.6792619228363037 }, { "auxiliary_loss_clip": 0.01185679, "auxiliary_loss_mlp": 0.01029053, "balance_loss_clip": 0.94140387, "balance_loss_mlp": 1.02033901, "epoch": 0.37563879035652015, "flos": 30812935633920.0, "grad_norm": 1.560805948533513, "language_loss": 0.69438922, "learning_rate": 2.8724767873937384e-06, "loss": 0.71653658, "num_input_tokens_seen": 67126010, "step": 3124, "time_per_iteration": 2.807981252670288 }, { "auxiliary_loss_clip": 0.01188332, "auxiliary_loss_mlp": 0.01031078, "balance_loss_clip": 0.9780612, "balance_loss_mlp": 1.02341866, "epoch": 0.37575903324715926, "flos": 20773533064320.0, "grad_norm": 2.1697432177903524, "language_loss": 0.86828423, "learning_rate": 2.871775778296225e-06, "loss": 0.89047831, "num_input_tokens_seen": 67143100, "step": 3125, "time_per_iteration": 2.7067975997924805 }, { "auxiliary_loss_clip": 0.0119336, "auxiliary_loss_mlp": 0.01026214, "balance_loss_clip": 1.01900506, "balance_loss_mlp": 1.01743412, "epoch": 0.37587927613779837, "flos": 18697681244160.0, "grad_norm": 2.1474246505380137, "language_loss": 0.78309631, "learning_rate": 2.8710746369510196e-06, "loss": 0.80529201, "num_input_tokens_seen": 67161085, "step": 3126, "time_per_iteration": 2.6272664070129395 }, { "auxiliary_loss_clip": 0.01183863, "auxiliary_loss_mlp": 0.01029846, "balance_loss_clip": 0.97879839, "balance_loss_mlp": 1.0217036, "epoch": 0.3759995190284374, "flos": 13624796384640.0, "grad_norm": 3.183986919400606, "language_loss": 0.83903933, "learning_rate": 2.8703733634644846e-06, "loss": 0.86117643, "num_input_tokens_seen": 67175840, "step": 3127, "time_per_iteration": 2.6736302375793457 }, { "auxiliary_loss_clip": 0.01189156, "auxiliary_loss_mlp": 0.01027915, "balance_loss_clip": 1.05624282, "balance_loss_mlp": 1.02027392, "epoch": 0.37611976191907653, "flos": 20484847457280.0, "grad_norm": 1.775748456740706, "language_loss": 0.79267949, "learning_rate": 2.869671957943002e-06, "loss": 0.81485021, "num_input_tokens_seen": 67194995, "step": 3128, "time_per_iteration": 2.643807888031006 }, { "auxiliary_loss_clip": 0.01186289, "auxiliary_loss_mlp": 0.01023323, "balance_loss_clip": 0.98391163, "balance_loss_mlp": 1.01576221, "epoch": 0.37624000480971564, "flos": 21141797253120.0, "grad_norm": 2.5234743632683343, "language_loss": 0.74566078, "learning_rate": 2.8689704204929747e-06, "loss": 0.76775694, "num_input_tokens_seen": 67214175, "step": 3129, "time_per_iteration": 2.702609062194824 }, { "auxiliary_loss_clip": 0.01189396, "auxiliary_loss_mlp": 0.01032028, "balance_loss_clip": 1.05426335, "balance_loss_mlp": 1.02456534, "epoch": 0.3763602477003547, "flos": 22564470205440.0, "grad_norm": 2.090289000041398, "language_loss": 0.8138504, "learning_rate": 2.8682687512208253e-06, "loss": 0.83606458, "num_input_tokens_seen": 67233185, "step": 3130, "time_per_iteration": 2.6713809967041016 }, { "auxiliary_loss_clip": 0.01194628, "auxiliary_loss_mlp": 0.01024844, "balance_loss_clip": 1.01649165, "balance_loss_mlp": 1.01687491, "epoch": 0.3764804905909938, "flos": 27526857851520.0, "grad_norm": 1.8416017409570564, "language_loss": 0.80225921, "learning_rate": 2.8675669502329972e-06, "loss": 0.82445395, "num_input_tokens_seen": 67254715, "step": 3131, "time_per_iteration": 2.692624807357788 }, { "auxiliary_loss_clip": 0.01187618, "auxiliary_loss_mlp": 0.01124259, "balance_loss_clip": 1.01604605, "balance_loss_mlp": 0.0, "epoch": 0.3766007334816329, "flos": 22528092706560.0, "grad_norm": 2.4353344435228976, "language_loss": 0.85351175, "learning_rate": 2.866865017635952e-06, "loss": 0.87663043, "num_input_tokens_seen": 67272535, "step": 3132, "time_per_iteration": 2.709597110748291 }, { "auxiliary_loss_clip": 0.01188736, "auxiliary_loss_mlp": 0.01026857, "balance_loss_clip": 0.94537687, "balance_loss_mlp": 1.01869106, "epoch": 0.376720976372272, "flos": 25957166532480.0, "grad_norm": 1.5302207436259943, "language_loss": 0.79280806, "learning_rate": 2.866162953536174e-06, "loss": 0.81496394, "num_input_tokens_seen": 67293505, "step": 3133, "time_per_iteration": 2.84798002243042 }, { "auxiliary_loss_clip": 0.0118402, "auxiliary_loss_mlp": 0.01123966, "balance_loss_clip": 0.97560644, "balance_loss_mlp": 0.0, "epoch": 0.3768412192629111, "flos": 18041162411520.0, "grad_norm": 1.662444502935159, "language_loss": 0.75180715, "learning_rate": 2.8654607580401634e-06, "loss": 0.77488697, "num_input_tokens_seen": 67313240, "step": 3134, "time_per_iteration": 2.6979987621307373 }, { "auxiliary_loss_clip": 0.01094099, "auxiliary_loss_mlp": 0.01000507, "balance_loss_clip": 0.98900139, "balance_loss_mlp": 0.99819422, "epoch": 0.3769614621535502, "flos": 62989472304000.0, "grad_norm": 0.8876778193987833, "language_loss": 0.65269089, "learning_rate": 2.8647584312544446e-06, "loss": 0.67363691, "num_input_tokens_seen": 67378445, "step": 3135, "time_per_iteration": 3.3371849060058594 }, { "auxiliary_loss_clip": 0.01180254, "auxiliary_loss_mlp": 0.01123771, "balance_loss_clip": 0.93684602, "balance_loss_mlp": 0.0, "epoch": 0.37708170504418925, "flos": 23661685002240.0, "grad_norm": 1.4682663833820173, "language_loss": 0.85336655, "learning_rate": 2.864055973285559e-06, "loss": 0.87640679, "num_input_tokens_seen": 67400445, "step": 3136, "time_per_iteration": 3.720947742462158 }, { "auxiliary_loss_clip": 0.01173825, "auxiliary_loss_mlp": 0.01026621, "balance_loss_clip": 0.97506005, "balance_loss_mlp": 1.01877069, "epoch": 0.37720194793482836, "flos": 24423170353920.0, "grad_norm": 1.96968648194101, "language_loss": 0.86144328, "learning_rate": 2.8633533842400698e-06, "loss": 0.88344777, "num_input_tokens_seen": 67420645, "step": 3137, "time_per_iteration": 2.784085988998413 }, { "auxiliary_loss_clip": 0.01191156, "auxiliary_loss_mlp": 0.01124837, "balance_loss_clip": 1.01770473, "balance_loss_mlp": 0.0, "epoch": 0.3773221908254674, "flos": 20996502739200.0, "grad_norm": 1.8399448286858455, "language_loss": 0.77295601, "learning_rate": 2.862650664224558e-06, "loss": 0.79611599, "num_input_tokens_seen": 67439495, "step": 3138, "time_per_iteration": 3.6891732215881348 }, { "auxiliary_loss_clip": 0.01188153, "auxiliary_loss_mlp": 0.01028562, "balance_loss_clip": 1.01973665, "balance_loss_mlp": 1.02098584, "epoch": 0.37744243371610653, "flos": 37631724958080.0, "grad_norm": 1.3347536656200834, "language_loss": 0.6973936, "learning_rate": 2.861947813345627e-06, "loss": 0.71956074, "num_input_tokens_seen": 67462195, "step": 3139, "time_per_iteration": 2.8524723052978516 }, { "auxiliary_loss_clip": 0.01192729, "auxiliary_loss_mlp": 0.01124247, "balance_loss_clip": 1.05548644, "balance_loss_mlp": 0.0, "epoch": 0.37756267660674564, "flos": 26140526484480.0, "grad_norm": 4.256473890578764, "language_loss": 0.72349477, "learning_rate": 2.8612448317098974e-06, "loss": 0.74666452, "num_input_tokens_seen": 67482530, "step": 3140, "time_per_iteration": 2.6917896270751953 }, { "auxiliary_loss_clip": 0.01185236, "auxiliary_loss_mlp": 0.01124469, "balance_loss_clip": 0.93794, "balance_loss_mlp": 0.0, "epoch": 0.3776829194973847, "flos": 19427888828160.0, "grad_norm": 2.0811677005704694, "language_loss": 0.83330274, "learning_rate": 2.8605417194240114e-06, "loss": 0.85639977, "num_input_tokens_seen": 67500890, "step": 3141, "time_per_iteration": 2.751016616821289 }, { "auxiliary_loss_clip": 0.011828, "auxiliary_loss_mlp": 0.01025808, "balance_loss_clip": 1.01536822, "balance_loss_mlp": 1.01771998, "epoch": 0.3778031623880238, "flos": 17382309194880.0, "grad_norm": 1.9458192738194615, "language_loss": 0.78537881, "learning_rate": 2.8598384765946315e-06, "loss": 0.80746484, "num_input_tokens_seen": 67519545, "step": 3142, "time_per_iteration": 2.710099697113037 }, { "auxiliary_loss_clip": 0.01187187, "auxiliary_loss_mlp": 0.01025079, "balance_loss_clip": 1.05328608, "balance_loss_mlp": 1.01761067, "epoch": 0.3779234052786629, "flos": 27125843437440.0, "grad_norm": 1.7625726362548548, "language_loss": 0.71811271, "learning_rate": 2.8591351033284377e-06, "loss": 0.74023539, "num_input_tokens_seen": 67539275, "step": 3143, "time_per_iteration": 3.7620668411254883 }, { "auxiliary_loss_clip": 0.01190997, "auxiliary_loss_mlp": 0.01026012, "balance_loss_clip": 1.01542568, "balance_loss_mlp": 1.01837039, "epoch": 0.37804364816930197, "flos": 19682639061120.0, "grad_norm": 2.1345270531803044, "language_loss": 0.83946598, "learning_rate": 2.8584315997321325e-06, "loss": 0.86163604, "num_input_tokens_seen": 67558280, "step": 3144, "time_per_iteration": 2.67633056640625 }, { "auxiliary_loss_clip": 0.01189901, "auxiliary_loss_mlp": 0.01025816, "balance_loss_clip": 1.05497098, "balance_loss_mlp": 1.01820445, "epoch": 0.3781638910599411, "flos": 22702905221760.0, "grad_norm": 2.2561389250278716, "language_loss": 0.77795839, "learning_rate": 2.8577279659124356e-06, "loss": 0.80011559, "num_input_tokens_seen": 67575955, "step": 3145, "time_per_iteration": 3.5003535747528076 }, { "auxiliary_loss_clip": 0.01180355, "auxiliary_loss_mlp": 0.01024943, "balance_loss_clip": 1.01245546, "balance_loss_mlp": 1.01748061, "epoch": 0.3782841339505802, "flos": 14647604158080.0, "grad_norm": 1.8012731342457244, "language_loss": 0.83486992, "learning_rate": 2.857024201976089e-06, "loss": 0.85692292, "num_input_tokens_seen": 67593515, "step": 3146, "time_per_iteration": 2.7257301807403564 }, { "auxiliary_loss_clip": 0.0118341, "auxiliary_loss_mlp": 0.01026101, "balance_loss_clip": 0.97549611, "balance_loss_mlp": 1.0169456, "epoch": 0.37840437684121925, "flos": 32818223185920.0, "grad_norm": 2.161847366436668, "language_loss": 0.73651463, "learning_rate": 2.8563203080298516e-06, "loss": 0.75860977, "num_input_tokens_seen": 67614290, "step": 3147, "time_per_iteration": 2.7461721897125244 }, { "auxiliary_loss_clip": 0.01185041, "auxiliary_loss_mlp": 0.01124317, "balance_loss_clip": 0.9759506, "balance_loss_mlp": 0.0, "epoch": 0.37852461973185836, "flos": 18369206346240.0, "grad_norm": 2.060881518059898, "language_loss": 0.8921988, "learning_rate": 2.855616284180505e-06, "loss": 0.91529238, "num_input_tokens_seen": 67631340, "step": 3148, "time_per_iteration": 2.7260191440582275 }, { "auxiliary_loss_clip": 0.01096072, "auxiliary_loss_mlp": 0.01004389, "balance_loss_clip": 0.98797643, "balance_loss_mlp": 1.00199318, "epoch": 0.37864486262249747, "flos": 59500680117120.0, "grad_norm": 0.879883050359054, "language_loss": 0.66181648, "learning_rate": 2.8549121305348477e-06, "loss": 0.68282104, "num_input_tokens_seen": 67691125, "step": 3149, "time_per_iteration": 3.2258365154266357 }, { "auxiliary_loss_clip": 0.01186009, "auxiliary_loss_mlp": 0.010275, "balance_loss_clip": 1.01512122, "balance_loss_mlp": 1.02008796, "epoch": 0.3787651055131365, "flos": 23363015414400.0, "grad_norm": 2.5792220205130745, "language_loss": 0.83049846, "learning_rate": 2.8542078471997006e-06, "loss": 0.85263354, "num_input_tokens_seen": 67708740, "step": 3150, "time_per_iteration": 2.631721019744873 }, { "auxiliary_loss_clip": 0.01186915, "auxiliary_loss_mlp": 0.01033814, "balance_loss_clip": 1.01442695, "balance_loss_mlp": 1.02602386, "epoch": 0.37888534840377563, "flos": 24601394661120.0, "grad_norm": 1.6752734470488935, "language_loss": 0.75744092, "learning_rate": 2.8535034342819013e-06, "loss": 0.77964818, "num_input_tokens_seen": 67726150, "step": 3151, "time_per_iteration": 2.7412259578704834 }, { "auxiliary_loss_clip": 0.01185274, "auxiliary_loss_mlp": 0.01027559, "balance_loss_clip": 1.05290627, "balance_loss_mlp": 1.01970267, "epoch": 0.37900559129441475, "flos": 23986891762560.0, "grad_norm": 1.5055794001881408, "language_loss": 0.72414863, "learning_rate": 2.85279889188831e-06, "loss": 0.74627697, "num_input_tokens_seen": 67746525, "step": 3152, "time_per_iteration": 2.6237001419067383 }, { "auxiliary_loss_clip": 0.01179959, "auxiliary_loss_mlp": 0.01026726, "balance_loss_clip": 0.93542159, "balance_loss_mlp": 1.01840568, "epoch": 0.3791258341850538, "flos": 24644667571200.0, "grad_norm": 2.38885432817769, "language_loss": 0.81527102, "learning_rate": 2.852094220125805e-06, "loss": 0.83733785, "num_input_tokens_seen": 67766035, "step": 3153, "time_per_iteration": 2.8329670429229736 }, { "auxiliary_loss_clip": 0.0119118, "auxiliary_loss_mlp": 0.01026001, "balance_loss_clip": 1.01921916, "balance_loss_mlp": 1.01806188, "epoch": 0.3792460770756929, "flos": 17420841509760.0, "grad_norm": 2.2085910305285634, "language_loss": 0.70982331, "learning_rate": 2.8513894191012846e-06, "loss": 0.73199511, "num_input_tokens_seen": 67785015, "step": 3154, "time_per_iteration": 2.6462676525115967 }, { "auxiliary_loss_clip": 0.01190107, "auxiliary_loss_mlp": 0.01025072, "balance_loss_clip": 1.05485964, "balance_loss_mlp": 1.01660275, "epoch": 0.37936631996633197, "flos": 24206557386240.0, "grad_norm": 1.5650261831707777, "language_loss": 0.78984714, "learning_rate": 2.8506844889216664e-06, "loss": 0.81199896, "num_input_tokens_seen": 67804400, "step": 3155, "time_per_iteration": 2.6925888061523438 }, { "auxiliary_loss_clip": 0.01086699, "auxiliary_loss_mlp": 0.01002102, "balance_loss_clip": 0.98502946, "balance_loss_mlp": 0.9998371, "epoch": 0.3794865628569711, "flos": 70297114752000.0, "grad_norm": 0.8624332736926908, "language_loss": 0.62924218, "learning_rate": 2.849979429693887e-06, "loss": 0.65013021, "num_input_tokens_seen": 67865385, "step": 3156, "time_per_iteration": 3.286583662033081 }, { "auxiliary_loss_clip": 0.0118803, "auxiliary_loss_mlp": 0.01025615, "balance_loss_clip": 1.05453944, "balance_loss_mlp": 1.01731241, "epoch": 0.3796068057476102, "flos": 15779364860160.0, "grad_norm": 1.813709126475665, "language_loss": 0.73947012, "learning_rate": 2.8492742415249042e-06, "loss": 0.76160657, "num_input_tokens_seen": 67883030, "step": 3157, "time_per_iteration": 2.7270121574401855 }, { "auxiliary_loss_clip": 0.01188982, "auxiliary_loss_mlp": 0.01025056, "balance_loss_clip": 1.05488002, "balance_loss_mlp": 1.01731396, "epoch": 0.37972704863824924, "flos": 25191694771200.0, "grad_norm": 1.6532327782384963, "language_loss": 0.76109731, "learning_rate": 2.848568924521694e-06, "loss": 0.7832377, "num_input_tokens_seen": 67903810, "step": 3158, "time_per_iteration": 2.6522912979125977 }, { "auxiliary_loss_clip": 0.01177118, "auxiliary_loss_mlp": 0.01028676, "balance_loss_clip": 1.01219106, "balance_loss_mlp": 1.02039123, "epoch": 0.37984729152888835, "flos": 26210372480640.0, "grad_norm": 1.884634817819773, "language_loss": 0.73343128, "learning_rate": 2.8478634787912526e-06, "loss": 0.75548923, "num_input_tokens_seen": 67921865, "step": 3159, "time_per_iteration": 2.677262783050537 }, { "auxiliary_loss_clip": 0.01187223, "auxiliary_loss_mlp": 0.01031751, "balance_loss_clip": 1.0158354, "balance_loss_mlp": 1.02403784, "epoch": 0.37996753441952746, "flos": 25629302165760.0, "grad_norm": 2.0245396395211377, "language_loss": 0.76657599, "learning_rate": 2.847157904440596e-06, "loss": 0.78876567, "num_input_tokens_seen": 67941595, "step": 3160, "time_per_iteration": 2.708516836166382 }, { "auxiliary_loss_clip": 0.0118872, "auxiliary_loss_mlp": 0.01032215, "balance_loss_clip": 1.01705945, "balance_loss_mlp": 1.02484226, "epoch": 0.3800877773101665, "flos": 20118414862080.0, "grad_norm": 1.5203263357004682, "language_loss": 0.73828197, "learning_rate": 2.846452201576759e-06, "loss": 0.76049125, "num_input_tokens_seen": 67960970, "step": 3161, "time_per_iteration": 2.672978639602661 }, { "auxiliary_loss_clip": 0.01096336, "auxiliary_loss_mlp": 0.0100157, "balance_loss_clip": 0.94925946, "balance_loss_mlp": 0.99923342, "epoch": 0.38020802020080563, "flos": 63053608037760.0, "grad_norm": 0.9030612285150593, "language_loss": 0.62801313, "learning_rate": 2.845746370306795e-06, "loss": 0.64899224, "num_input_tokens_seen": 68026160, "step": 3162, "time_per_iteration": 4.320513963699341 }, { "auxiliary_loss_clip": 0.0118712, "auxiliary_loss_mlp": 0.01027264, "balance_loss_clip": 1.01528203, "balance_loss_mlp": 1.01926446, "epoch": 0.38032826309144474, "flos": 21288420570240.0, "grad_norm": 1.8355817044659142, "language_loss": 0.78499299, "learning_rate": 2.84504041073778e-06, "loss": 0.80713683, "num_input_tokens_seen": 68044575, "step": 3163, "time_per_iteration": 2.7179958820343018 }, { "auxiliary_loss_clip": 0.01178689, "auxiliary_loss_mlp": 0.01029086, "balance_loss_clip": 0.97814965, "balance_loss_mlp": 1.02050877, "epoch": 0.3804485059820838, "flos": 18954119416320.0, "grad_norm": 1.6879413097163805, "language_loss": 0.79148501, "learning_rate": 2.844334322976806e-06, "loss": 0.81356275, "num_input_tokens_seen": 68064790, "step": 3164, "time_per_iteration": 3.6406211853027344 }, { "auxiliary_loss_clip": 0.01187277, "auxiliary_loss_mlp": 0.01028874, "balance_loss_clip": 0.89940202, "balance_loss_mlp": 1.02076173, "epoch": 0.3805687488727229, "flos": 21833759831040.0, "grad_norm": 3.390824287117582, "language_loss": 0.83333576, "learning_rate": 2.8436281071309866e-06, "loss": 0.85549724, "num_input_tokens_seen": 68083330, "step": 3165, "time_per_iteration": 2.8253984451293945 }, { "auxiliary_loss_clip": 0.01101363, "auxiliary_loss_mlp": 0.00999924, "balance_loss_clip": 0.8735671, "balance_loss_mlp": 0.99730128, "epoch": 0.380688991763362, "flos": 58546209968640.0, "grad_norm": 0.7702844748202228, "language_loss": 0.53038818, "learning_rate": 2.842921763307455e-06, "loss": 0.55140102, "num_input_tokens_seen": 68146140, "step": 3166, "time_per_iteration": 3.4243438243865967 }, { "auxiliary_loss_clip": 0.01178353, "auxiliary_loss_mlp": 0.01025199, "balance_loss_clip": 0.97546995, "balance_loss_mlp": 1.01746869, "epoch": 0.38080923465400107, "flos": 23799509487360.0, "grad_norm": 1.7742768115252951, "language_loss": 0.82476592, "learning_rate": 2.842215291613361e-06, "loss": 0.8468014, "num_input_tokens_seen": 68164520, "step": 3167, "time_per_iteration": 2.987236261367798 }, { "auxiliary_loss_clip": 0.01086403, "auxiliary_loss_mlp": 0.01003731, "balance_loss_clip": 0.79469085, "balance_loss_mlp": 1.00115609, "epoch": 0.3809294775446402, "flos": 54969866380800.0, "grad_norm": 0.8263135531820458, "language_loss": 0.59334046, "learning_rate": 2.8415086921558774e-06, "loss": 0.61424178, "num_input_tokens_seen": 68227945, "step": 3168, "time_per_iteration": 3.4625842571258545 }, { "auxiliary_loss_clip": 0.01167516, "auxiliary_loss_mlp": 0.01024969, "balance_loss_clip": 0.97088999, "balance_loss_mlp": 1.01648724, "epoch": 0.38104972043527924, "flos": 24643697904000.0, "grad_norm": 1.5196174768477186, "language_loss": 0.78480196, "learning_rate": 2.840801965042194e-06, "loss": 0.80672681, "num_input_tokens_seen": 68247405, "step": 3169, "time_per_iteration": 4.605800628662109 }, { "auxiliary_loss_clip": 0.01171722, "auxiliary_loss_mlp": 0.01032376, "balance_loss_clip": 0.97302949, "balance_loss_mlp": 1.02466345, "epoch": 0.38116996332591835, "flos": 22856783086080.0, "grad_norm": 2.122882032124826, "language_loss": 0.83580601, "learning_rate": 2.840095110379521e-06, "loss": 0.85784698, "num_input_tokens_seen": 68266925, "step": 3170, "time_per_iteration": 2.72011399269104 }, { "auxiliary_loss_clip": 0.01095142, "auxiliary_loss_mlp": 0.00999938, "balance_loss_clip": 0.87541449, "balance_loss_mlp": 0.99749464, "epoch": 0.38129020621655746, "flos": 60836160804480.0, "grad_norm": 0.7241888066833814, "language_loss": 0.53950995, "learning_rate": 2.8393881282750884e-06, "loss": 0.56046075, "num_input_tokens_seen": 68329755, "step": 3171, "time_per_iteration": 4.195053815841675 }, { "auxiliary_loss_clip": 0.0118762, "auxiliary_loss_mlp": 0.01028797, "balance_loss_clip": 0.97872341, "balance_loss_mlp": 1.02061975, "epoch": 0.3814104491071965, "flos": 21648101408640.0, "grad_norm": 3.568520692230349, "language_loss": 0.7846365, "learning_rate": 2.838681018836144e-06, "loss": 0.8068006, "num_input_tokens_seen": 68347075, "step": 3172, "time_per_iteration": 2.747774600982666 }, { "auxiliary_loss_clip": 0.01183486, "auxiliary_loss_mlp": 0.01124272, "balance_loss_clip": 0.93711448, "balance_loss_mlp": 0.0, "epoch": 0.3815306919978356, "flos": 19099090707840.0, "grad_norm": 1.9698785361593676, "language_loss": 0.78281617, "learning_rate": 2.837973782169955e-06, "loss": 0.80589378, "num_input_tokens_seen": 68365450, "step": 3173, "time_per_iteration": 2.7431743144989014 }, { "auxiliary_loss_clip": 0.01088487, "auxiliary_loss_mlp": 0.01006355, "balance_loss_clip": 1.02266288, "balance_loss_mlp": 1.00394714, "epoch": 0.38165093488847474, "flos": 67067918156160.0, "grad_norm": 0.8136543830857919, "language_loss": 0.59237725, "learning_rate": 2.8372664183838096e-06, "loss": 0.61332566, "num_input_tokens_seen": 68428470, "step": 3174, "time_per_iteration": 3.2574870586395264 }, { "auxiliary_loss_clip": 0.01188112, "auxiliary_loss_mlp": 0.01024191, "balance_loss_clip": 1.0548923, "balance_loss_mlp": 1.01660919, "epoch": 0.3817711777791138, "flos": 22341105480960.0, "grad_norm": 2.0157502351508323, "language_loss": 0.68584716, "learning_rate": 2.836558927585015e-06, "loss": 0.7079702, "num_input_tokens_seen": 68445440, "step": 3175, "time_per_iteration": 2.5962326526641846 }, { "auxiliary_loss_clip": 0.01189898, "auxiliary_loss_mlp": 0.01035972, "balance_loss_clip": 1.01693726, "balance_loss_mlp": 1.02780032, "epoch": 0.3818914206697529, "flos": 22820621068800.0, "grad_norm": 1.7919233238813588, "language_loss": 0.8238135, "learning_rate": 2.8358513098808957e-06, "loss": 0.8460722, "num_input_tokens_seen": 68465755, "step": 3176, "time_per_iteration": 2.676645278930664 }, { "auxiliary_loss_clip": 0.01171663, "auxiliary_loss_mlp": 0.01030972, "balance_loss_clip": 0.89965457, "balance_loss_mlp": 1.02250814, "epoch": 0.382011663560392, "flos": 24386074583040.0, "grad_norm": 1.7769012318073396, "language_loss": 0.76876897, "learning_rate": 2.835143565378798e-06, "loss": 0.79079533, "num_input_tokens_seen": 68486220, "step": 3177, "time_per_iteration": 2.8385722637176514 }, { "auxiliary_loss_clip": 0.01174556, "auxiliary_loss_mlp": 0.01025898, "balance_loss_clip": 0.85998464, "balance_loss_mlp": 1.0167371, "epoch": 0.38213190645103107, "flos": 21981568296960.0, "grad_norm": 1.8043779827002198, "language_loss": 0.78375775, "learning_rate": 2.8344356941860847e-06, "loss": 0.80576229, "num_input_tokens_seen": 68505850, "step": 3178, "time_per_iteration": 2.813248872756958 }, { "auxiliary_loss_clip": 0.01178253, "auxiliary_loss_mlp": 0.01027387, "balance_loss_clip": 0.938398, "balance_loss_mlp": 1.01937056, "epoch": 0.3822521493416702, "flos": 35516945773440.0, "grad_norm": 2.293173589848934, "language_loss": 0.66559726, "learning_rate": 2.8337276964101403e-06, "loss": 0.68765366, "num_input_tokens_seen": 68526290, "step": 3179, "time_per_iteration": 2.8560667037963867 }, { "auxiliary_loss_clip": 0.01189023, "auxiliary_loss_mlp": 0.01026822, "balance_loss_clip": 1.0164938, "balance_loss_mlp": 1.01870942, "epoch": 0.3823723922323093, "flos": 21069904181760.0, "grad_norm": 3.504636592440565, "language_loss": 0.76272744, "learning_rate": 2.833019572158367e-06, "loss": 0.78488588, "num_input_tokens_seen": 68544725, "step": 3180, "time_per_iteration": 2.633114814758301 }, { "auxiliary_loss_clip": 0.01183118, "auxiliary_loss_mlp": 0.01034744, "balance_loss_clip": 0.97809273, "balance_loss_mlp": 1.02582669, "epoch": 0.38249263512294834, "flos": 19789149864960.0, "grad_norm": 2.643502136234976, "language_loss": 0.79824907, "learning_rate": 2.8323113215381872e-06, "loss": 0.82042766, "num_input_tokens_seen": 68563070, "step": 3181, "time_per_iteration": 2.9048571586608887 }, { "auxiliary_loss_clip": 0.01181992, "auxiliary_loss_mlp": 0.01027013, "balance_loss_clip": 0.93817794, "balance_loss_mlp": 1.0181675, "epoch": 0.38261287801358745, "flos": 21433930565760.0, "grad_norm": 1.8798652983849298, "language_loss": 0.76203549, "learning_rate": 2.831602944657042e-06, "loss": 0.78412551, "num_input_tokens_seen": 68581150, "step": 3182, "time_per_iteration": 2.702089309692383 }, { "auxiliary_loss_clip": 0.01193075, "auxiliary_loss_mlp": 0.01028283, "balance_loss_clip": 0.97770834, "balance_loss_mlp": 1.02014685, "epoch": 0.38273312090422656, "flos": 21981568296960.0, "grad_norm": 2.095289321818517, "language_loss": 0.74489582, "learning_rate": 2.830894441622391e-06, "loss": 0.76710939, "num_input_tokens_seen": 68597800, "step": 3183, "time_per_iteration": 2.6747171878814697 }, { "auxiliary_loss_clip": 0.01180794, "auxiliary_loss_mlp": 0.0112471, "balance_loss_clip": 0.93600428, "balance_loss_mlp": 0.0, "epoch": 0.3828533637948656, "flos": 24790895838720.0, "grad_norm": 1.7964861925520441, "language_loss": 0.80317056, "learning_rate": 2.8301858125417134e-06, "loss": 0.82622564, "num_input_tokens_seen": 68617640, "step": 3184, "time_per_iteration": 2.8168234825134277 }, { "auxiliary_loss_clip": 0.01189082, "auxiliary_loss_mlp": 0.01027221, "balance_loss_clip": 0.98094237, "balance_loss_mlp": 1.01939535, "epoch": 0.38297360668550473, "flos": 22455445449600.0, "grad_norm": 1.6451811290945086, "language_loss": 0.73855114, "learning_rate": 2.8294770575225082e-06, "loss": 0.76071417, "num_input_tokens_seen": 68637770, "step": 3185, "time_per_iteration": 2.687913179397583 }, { "auxiliary_loss_clip": 0.01188252, "auxiliary_loss_mlp": 0.01032016, "balance_loss_clip": 1.0192064, "balance_loss_mlp": 1.02395201, "epoch": 0.3830938495761438, "flos": 24896903852160.0, "grad_norm": 1.6907552328330453, "language_loss": 0.83803231, "learning_rate": 2.828768176672293e-06, "loss": 0.86023498, "num_input_tokens_seen": 68656885, "step": 3186, "time_per_iteration": 2.665295362472534 }, { "auxiliary_loss_clip": 0.01181118, "auxiliary_loss_mlp": 0.0102796, "balance_loss_clip": 0.93658459, "balance_loss_mlp": 1.01946616, "epoch": 0.3832140924667829, "flos": 33036236784000.0, "grad_norm": 1.6956745393389605, "language_loss": 0.71570605, "learning_rate": 2.8280591700986044e-06, "loss": 0.73779678, "num_input_tokens_seen": 68678750, "step": 3187, "time_per_iteration": 2.820096731185913 }, { "auxiliary_loss_clip": 0.01188604, "auxiliary_loss_mlp": 0.01029279, "balance_loss_clip": 0.97543383, "balance_loss_mlp": 1.02095783, "epoch": 0.383334335357422, "flos": 31903721896320.0, "grad_norm": 1.8864130174019806, "language_loss": 0.7464776, "learning_rate": 2.827350037908999e-06, "loss": 0.76865649, "num_input_tokens_seen": 68698190, "step": 3188, "time_per_iteration": 4.133085489273071 }, { "auxiliary_loss_clip": 0.01189992, "auxiliary_loss_mlp": 0.01029051, "balance_loss_clip": 0.93927586, "balance_loss_mlp": 1.02060485, "epoch": 0.38345457824806106, "flos": 19791915212160.0, "grad_norm": 2.84785174638537, "language_loss": 0.79029471, "learning_rate": 2.8266407802110496e-06, "loss": 0.81248516, "num_input_tokens_seen": 68716445, "step": 3189, "time_per_iteration": 2.8135123252868652 }, { "auxiliary_loss_clip": 0.01184148, "auxiliary_loss_mlp": 0.01036573, "balance_loss_clip": 0.82158923, "balance_loss_mlp": 1.02787066, "epoch": 0.3835748211387002, "flos": 22419391173120.0, "grad_norm": 2.965503672438945, "language_loss": 0.76041484, "learning_rate": 2.8259313971123515e-06, "loss": 0.78262198, "num_input_tokens_seen": 68737565, "step": 3190, "time_per_iteration": 3.886563301086426 }, { "auxiliary_loss_clip": 0.01186464, "auxiliary_loss_mlp": 0.01027315, "balance_loss_clip": 1.0184617, "balance_loss_mlp": 1.01928663, "epoch": 0.3836950640293393, "flos": 25118436983040.0, "grad_norm": 1.5152575835402569, "language_loss": 0.78245234, "learning_rate": 2.8252218887205166e-06, "loss": 0.80459011, "num_input_tokens_seen": 68758255, "step": 3191, "time_per_iteration": 2.989163637161255 }, { "auxiliary_loss_clip": 0.0118389, "auxiliary_loss_mlp": 0.01033602, "balance_loss_clip": 0.86271095, "balance_loss_mlp": 1.02509046, "epoch": 0.38381530691997834, "flos": 21799213925760.0, "grad_norm": 1.7555028459046753, "language_loss": 0.80944276, "learning_rate": 2.824512255143178e-06, "loss": 0.83161759, "num_input_tokens_seen": 68777490, "step": 3192, "time_per_iteration": 2.8305187225341797 }, { "auxiliary_loss_clip": 0.01188968, "auxiliary_loss_mlp": 0.01028876, "balance_loss_clip": 0.93968582, "balance_loss_mlp": 1.02097845, "epoch": 0.38393554981061745, "flos": 21252689516160.0, "grad_norm": 1.7600587274916155, "language_loss": 0.79195476, "learning_rate": 2.8238024964879855e-06, "loss": 0.81413317, "num_input_tokens_seen": 68798385, "step": 3193, "time_per_iteration": 2.8433892726898193 }, { "auxiliary_loss_clip": 0.01191782, "auxiliary_loss_mlp": 0.01027155, "balance_loss_clip": 1.05597901, "balance_loss_mlp": 1.01841152, "epoch": 0.38405579270125656, "flos": 17019360218880.0, "grad_norm": 2.241016450093393, "language_loss": 0.76801753, "learning_rate": 2.8230926128626095e-06, "loss": 0.79020691, "num_input_tokens_seen": 68816880, "step": 3194, "time_per_iteration": 2.6364293098449707 }, { "auxiliary_loss_clip": 0.01176681, "auxiliary_loss_mlp": 0.01023607, "balance_loss_clip": 0.97555476, "balance_loss_mlp": 1.01520276, "epoch": 0.3841760355918956, "flos": 21835375943040.0, "grad_norm": 3.004285210027862, "language_loss": 0.79128647, "learning_rate": 2.822382604374738e-06, "loss": 0.8132894, "num_input_tokens_seen": 68835805, "step": 3195, "time_per_iteration": 3.9481306076049805 }, { "auxiliary_loss_clip": 0.01185673, "auxiliary_loss_mlp": 0.01030701, "balance_loss_clip": 0.98013192, "balance_loss_mlp": 1.02260685, "epoch": 0.3842962784825347, "flos": 25915114684800.0, "grad_norm": 1.950897197756079, "language_loss": 0.65220881, "learning_rate": 2.8216724711320793e-06, "loss": 0.67437255, "num_input_tokens_seen": 68854930, "step": 3196, "time_per_iteration": 2.7649476528167725 }, { "auxiliary_loss_clip": 0.01184991, "auxiliary_loss_mlp": 0.01123756, "balance_loss_clip": 1.05311942, "balance_loss_mlp": 0.0, "epoch": 0.38441652137317384, "flos": 25337492075520.0, "grad_norm": 1.4715661953335895, "language_loss": 0.79867113, "learning_rate": 2.820962213242361e-06, "loss": 0.82175863, "num_input_tokens_seen": 68874260, "step": 3197, "time_per_iteration": 3.866159200668335 }, { "auxiliary_loss_clip": 0.01187212, "auxiliary_loss_mlp": 0.0103495, "balance_loss_clip": 1.01957154, "balance_loss_mlp": 1.02684951, "epoch": 0.3845367642638129, "flos": 18113486446080.0, "grad_norm": 2.0693629509356244, "language_loss": 0.8442874, "learning_rate": 2.8202518308133264e-06, "loss": 0.86650908, "num_input_tokens_seen": 68891535, "step": 3198, "time_per_iteration": 2.6857118606567383 }, { "auxiliary_loss_clip": 0.01189108, "auxiliary_loss_mlp": 0.01031571, "balance_loss_clip": 1.05343032, "balance_loss_mlp": 1.02317858, "epoch": 0.384657007154452, "flos": 25228395492480.0, "grad_norm": 1.6924394182770377, "language_loss": 0.73062563, "learning_rate": 2.8195413239527426e-06, "loss": 0.75283247, "num_input_tokens_seen": 68911275, "step": 3199, "time_per_iteration": 2.712351083755493 }, { "auxiliary_loss_clip": 0.01179974, "auxiliary_loss_mlp": 0.0103093, "balance_loss_clip": 1.01342916, "balance_loss_mlp": 1.02275848, "epoch": 0.38477725004509106, "flos": 19865855358720.0, "grad_norm": 2.3018997307637794, "language_loss": 0.80687571, "learning_rate": 2.8188306927683906e-06, "loss": 0.82898474, "num_input_tokens_seen": 68930745, "step": 3200, "time_per_iteration": 2.730107545852661 }, { "auxiliary_loss_clip": 0.01187111, "auxiliary_loss_mlp": 0.01029119, "balance_loss_clip": 0.97923869, "balance_loss_mlp": 1.02107882, "epoch": 0.38489749293573017, "flos": 18259391491200.0, "grad_norm": 2.32076706972064, "language_loss": 0.75132906, "learning_rate": 2.818119937368074e-06, "loss": 0.77349132, "num_input_tokens_seen": 68949380, "step": 3201, "time_per_iteration": 2.72652006149292 }, { "auxiliary_loss_clip": 0.01190842, "auxiliary_loss_mlp": 0.01030507, "balance_loss_clip": 1.0146879, "balance_loss_mlp": 1.02237654, "epoch": 0.3850177358263693, "flos": 24389163152640.0, "grad_norm": 1.8171953407504602, "language_loss": 0.65414405, "learning_rate": 2.817409057859613e-06, "loss": 0.67635751, "num_input_tokens_seen": 68968370, "step": 3202, "time_per_iteration": 2.6925618648529053 }, { "auxiliary_loss_clip": 0.0117796, "auxiliary_loss_mlp": 0.01035759, "balance_loss_clip": 0.90065885, "balance_loss_mlp": 1.02756369, "epoch": 0.38513797871700833, "flos": 17671533505920.0, "grad_norm": 1.9826081028998828, "language_loss": 0.79317641, "learning_rate": 2.8166980543508482e-06, "loss": 0.81531358, "num_input_tokens_seen": 68984260, "step": 3203, "time_per_iteration": 2.9608187675476074 }, { "auxiliary_loss_clip": 0.01191162, "auxiliary_loss_mlp": 0.01029323, "balance_loss_clip": 1.05709827, "balance_loss_mlp": 1.0205071, "epoch": 0.38525822160764744, "flos": 25739583897600.0, "grad_norm": 1.7911810541851854, "language_loss": 0.80201226, "learning_rate": 2.815986926949638e-06, "loss": 0.82421714, "num_input_tokens_seen": 69002760, "step": 3204, "time_per_iteration": 2.623842716217041 }, { "auxiliary_loss_clip": 0.01186024, "auxiliary_loss_mlp": 0.01023826, "balance_loss_clip": 1.01897812, "balance_loss_mlp": 1.01568449, "epoch": 0.38537846449828655, "flos": 20193647898240.0, "grad_norm": 1.89728769238286, "language_loss": 0.80101901, "learning_rate": 2.8152756757638597e-06, "loss": 0.82311749, "num_input_tokens_seen": 69021260, "step": 3205, "time_per_iteration": 2.760883331298828 }, { "auxiliary_loss_clip": 0.01185872, "auxiliary_loss_mlp": 0.01024942, "balance_loss_clip": 1.01641631, "balance_loss_mlp": 1.01730704, "epoch": 0.3854987073889256, "flos": 23039352938880.0, "grad_norm": 1.8881072203699492, "language_loss": 0.84210348, "learning_rate": 2.8145643009014093e-06, "loss": 0.86421162, "num_input_tokens_seen": 69039755, "step": 3206, "time_per_iteration": 2.7281532287597656 }, { "auxiliary_loss_clip": 0.0119055, "auxiliary_loss_mlp": 0.01033876, "balance_loss_clip": 1.01866519, "balance_loss_mlp": 1.02631259, "epoch": 0.3856189502795647, "flos": 20190631155840.0, "grad_norm": 1.838759244959022, "language_loss": 0.79217851, "learning_rate": 2.813852802470202e-06, "loss": 0.81442279, "num_input_tokens_seen": 69057650, "step": 3207, "time_per_iteration": 2.676762104034424 }, { "auxiliary_loss_clip": 0.0118022, "auxiliary_loss_mlp": 0.01027943, "balance_loss_clip": 0.97769916, "balance_loss_mlp": 1.01996779, "epoch": 0.38573919317020383, "flos": 25702631781120.0, "grad_norm": 1.9012797855220147, "language_loss": 0.72376966, "learning_rate": 2.8131411805781717e-06, "loss": 0.74585128, "num_input_tokens_seen": 69077775, "step": 3208, "time_per_iteration": 2.7475123405456543 }, { "auxiliary_loss_clip": 0.01189621, "auxiliary_loss_mlp": 0.01028729, "balance_loss_clip": 0.98082423, "balance_loss_mlp": 1.01994896, "epoch": 0.3858594360608429, "flos": 29821405628160.0, "grad_norm": 2.0572093988994924, "language_loss": 0.64013219, "learning_rate": 2.8124294353332707e-06, "loss": 0.66231567, "num_input_tokens_seen": 69096450, "step": 3209, "time_per_iteration": 2.730544090270996 }, { "auxiliary_loss_clip": 0.01195876, "auxiliary_loss_mlp": 0.0102647, "balance_loss_clip": 0.94288403, "balance_loss_mlp": 1.01840568, "epoch": 0.385979678951482, "flos": 24790428961920.0, "grad_norm": 2.6581600296417425, "language_loss": 0.77728462, "learning_rate": 2.8117175668434713e-06, "loss": 0.79950809, "num_input_tokens_seen": 69116110, "step": 3210, "time_per_iteration": 2.826087236404419 }, { "auxiliary_loss_clip": 0.01188762, "auxiliary_loss_mlp": 0.01029236, "balance_loss_clip": 1.05468249, "balance_loss_mlp": 1.02019417, "epoch": 0.3860999218421211, "flos": 21287881866240.0, "grad_norm": 2.481070133697372, "language_loss": 0.69927722, "learning_rate": 2.811005575216762e-06, "loss": 0.72145724, "num_input_tokens_seen": 69134825, "step": 3211, "time_per_iteration": 2.7416799068450928 }, { "auxiliary_loss_clip": 0.01180403, "auxiliary_loss_mlp": 0.01024141, "balance_loss_clip": 0.94065094, "balance_loss_mlp": 1.01595163, "epoch": 0.38622016473276016, "flos": 24536720223360.0, "grad_norm": 1.372435307937371, "language_loss": 0.78846323, "learning_rate": 2.8102934605611513e-06, "loss": 0.81050861, "num_input_tokens_seen": 69156460, "step": 3212, "time_per_iteration": 2.764953136444092 }, { "auxiliary_loss_clip": 0.01192271, "auxiliary_loss_mlp": 0.01034359, "balance_loss_clip": 0.9811334, "balance_loss_mlp": 1.02665198, "epoch": 0.3863404076233993, "flos": 20558212986240.0, "grad_norm": 2.717545717469259, "language_loss": 0.67337525, "learning_rate": 2.8095812229846665e-06, "loss": 0.69564158, "num_input_tokens_seen": 69176420, "step": 3213, "time_per_iteration": 2.722651481628418 }, { "auxiliary_loss_clip": 0.01186527, "auxiliary_loss_mlp": 0.01028096, "balance_loss_clip": 0.97668499, "balance_loss_mlp": 1.02024055, "epoch": 0.3864606505140384, "flos": 22346277039360.0, "grad_norm": 2.2060311210306605, "language_loss": 0.69255984, "learning_rate": 2.808868862595355e-06, "loss": 0.71470612, "num_input_tokens_seen": 69196665, "step": 3214, "time_per_iteration": 3.7683727741241455 }, { "auxiliary_loss_clip": 0.01189095, "auxiliary_loss_mlp": 0.01026381, "balance_loss_clip": 1.01575971, "balance_loss_mlp": 1.01801896, "epoch": 0.38658089340467744, "flos": 25703601448320.0, "grad_norm": 1.8135340345078441, "language_loss": 0.79541051, "learning_rate": 2.8081563795012795e-06, "loss": 0.81756532, "num_input_tokens_seen": 69216290, "step": 3215, "time_per_iteration": 2.8446602821350098 }, { "auxiliary_loss_clip": 0.01194415, "auxiliary_loss_mlp": 0.0103599, "balance_loss_clip": 0.97804368, "balance_loss_mlp": 1.02738261, "epoch": 0.38670113629531655, "flos": 33802534558080.0, "grad_norm": 1.599435743829535, "language_loss": 0.73704958, "learning_rate": 2.807443773810524e-06, "loss": 0.75935352, "num_input_tokens_seen": 69237550, "step": 3216, "time_per_iteration": 2.7623229026794434 }, { "auxiliary_loss_clip": 0.01182986, "auxiliary_loss_mlp": 0.01028756, "balance_loss_clip": 0.93973529, "balance_loss_mlp": 1.01990485, "epoch": 0.3868213791859556, "flos": 23331522165120.0, "grad_norm": 21.49601175696693, "language_loss": 0.8958075, "learning_rate": 2.80673104563119e-06, "loss": 0.91792488, "num_input_tokens_seen": 69258175, "step": 3217, "time_per_iteration": 3.711980104446411 }, { "auxiliary_loss_clip": 0.01186911, "auxiliary_loss_mlp": 0.01026794, "balance_loss_clip": 1.01932228, "balance_loss_mlp": 1.01898599, "epoch": 0.3869416220765947, "flos": 18441530380800.0, "grad_norm": 1.7105148005664623, "language_loss": 0.78968292, "learning_rate": 2.8060181950713976e-06, "loss": 0.81181991, "num_input_tokens_seen": 69274965, "step": 3218, "time_per_iteration": 2.632242202758789 }, { "auxiliary_loss_clip": 0.01179567, "auxiliary_loss_mlp": 0.01034249, "balance_loss_clip": 0.93670273, "balance_loss_mlp": 1.02558887, "epoch": 0.3870618649672338, "flos": 15632992938240.0, "grad_norm": 1.9707178286085192, "language_loss": 0.81306159, "learning_rate": 2.805305222239286e-06, "loss": 0.83519971, "num_input_tokens_seen": 69292220, "step": 3219, "time_per_iteration": 2.771085023880005 }, { "auxiliary_loss_clip": 0.01182574, "auxiliary_loss_mlp": 0.01031997, "balance_loss_clip": 0.97710103, "balance_loss_mlp": 1.02377748, "epoch": 0.3871821078578729, "flos": 23513804709120.0, "grad_norm": 1.7163946296361312, "language_loss": 0.73931181, "learning_rate": 2.8045921272430118e-06, "loss": 0.76145756, "num_input_tokens_seen": 69311900, "step": 3220, "time_per_iteration": 2.785839796066284 }, { "auxiliary_loss_clip": 0.01190809, "auxiliary_loss_mlp": 0.01029865, "balance_loss_clip": 1.01482582, "balance_loss_mlp": 1.02137721, "epoch": 0.387302350748512, "flos": 17778259791360.0, "grad_norm": 2.0910714843992038, "language_loss": 0.76920742, "learning_rate": 2.803878910190753e-06, "loss": 0.79141414, "num_input_tokens_seen": 69328820, "step": 3221, "time_per_iteration": 3.89057993888855 }, { "auxiliary_loss_clip": 0.01191974, "auxiliary_loss_mlp": 0.01031915, "balance_loss_clip": 1.01703393, "balance_loss_mlp": 1.02384424, "epoch": 0.3874225936391511, "flos": 11503409097600.0, "grad_norm": 2.4061407277844333, "language_loss": 0.82080781, "learning_rate": 2.8031655711907017e-06, "loss": 0.84304667, "num_input_tokens_seen": 69342525, "step": 3222, "time_per_iteration": 2.71869158744812 }, { "auxiliary_loss_clip": 0.0118726, "auxiliary_loss_mlp": 0.01032954, "balance_loss_clip": 1.01725805, "balance_loss_mlp": 1.02465105, "epoch": 0.38754283652979016, "flos": 21945154884480.0, "grad_norm": 2.184754659349082, "language_loss": 0.80208409, "learning_rate": 2.8024521103510723e-06, "loss": 0.82428622, "num_input_tokens_seen": 69359295, "step": 3223, "time_per_iteration": 3.6479368209838867 }, { "auxiliary_loss_clip": 0.01183427, "auxiliary_loss_mlp": 0.0103348, "balance_loss_clip": 1.01245904, "balance_loss_mlp": 1.02540052, "epoch": 0.38766307942042927, "flos": 21175984022400.0, "grad_norm": 1.6842368257772715, "language_loss": 0.7531842, "learning_rate": 2.8017385277800952e-06, "loss": 0.77535331, "num_input_tokens_seen": 69377650, "step": 3224, "time_per_iteration": 2.6602609157562256 }, { "auxiliary_loss_clip": 0.01188596, "auxiliary_loss_mlp": 0.01027235, "balance_loss_clip": 0.94113147, "balance_loss_mlp": 1.01871717, "epoch": 0.3877833223110684, "flos": 27417294391680.0, "grad_norm": 1.9307021704777538, "language_loss": 0.74826753, "learning_rate": 2.8010248235860213e-06, "loss": 0.7704258, "num_input_tokens_seen": 69397765, "step": 3225, "time_per_iteration": 2.7853682041168213 }, { "auxiliary_loss_clip": 0.01112611, "auxiliary_loss_mlp": 0.01119491, "balance_loss_clip": 0.96389067, "balance_loss_mlp": 0.0, "epoch": 0.38790356520170743, "flos": 64500019879680.0, "grad_norm": 0.832409822098683, "language_loss": 0.62812489, "learning_rate": 2.8003109978771192e-06, "loss": 0.65044588, "num_input_tokens_seen": 69458930, "step": 3226, "time_per_iteration": 3.36919903755188 }, { "auxiliary_loss_clip": 0.01170476, "auxiliary_loss_mlp": 0.01025695, "balance_loss_clip": 0.93309867, "balance_loss_mlp": 1.017344, "epoch": 0.38802380809234654, "flos": 22345415112960.0, "grad_norm": 1.9276678175016155, "language_loss": 0.79100871, "learning_rate": 2.799597050761674e-06, "loss": 0.81297046, "num_input_tokens_seen": 69475135, "step": 3227, "time_per_iteration": 2.740190267562866 }, { "auxiliary_loss_clip": 0.01189949, "auxiliary_loss_mlp": 0.01027076, "balance_loss_clip": 1.05562258, "balance_loss_mlp": 1.01843929, "epoch": 0.38814405098298566, "flos": 25261361199360.0, "grad_norm": 1.7256239283319057, "language_loss": 0.79201293, "learning_rate": 2.7988829823479924e-06, "loss": 0.81418318, "num_input_tokens_seen": 69493525, "step": 3228, "time_per_iteration": 2.701134204864502 }, { "auxiliary_loss_clip": 0.01174995, "auxiliary_loss_mlp": 0.01031744, "balance_loss_clip": 0.97363973, "balance_loss_mlp": 1.02267802, "epoch": 0.3882642938736247, "flos": 18841180078080.0, "grad_norm": 1.7642937027482246, "language_loss": 0.64012432, "learning_rate": 2.7981687927443976e-06, "loss": 0.66219169, "num_input_tokens_seen": 69510325, "step": 3229, "time_per_iteration": 2.701906681060791 }, { "auxiliary_loss_clip": 0.01183492, "auxiliary_loss_mlp": 0.010269, "balance_loss_clip": 1.01221752, "balance_loss_mlp": 1.01893687, "epoch": 0.3883845367642638, "flos": 21652806090240.0, "grad_norm": 1.722820097238357, "language_loss": 0.85631204, "learning_rate": 2.797454482059231e-06, "loss": 0.87841594, "num_input_tokens_seen": 69530480, "step": 3230, "time_per_iteration": 2.7006590366363525 }, { "auxiliary_loss_clip": 0.01189731, "auxiliary_loss_mlp": 0.01026154, "balance_loss_clip": 1.05524862, "balance_loss_mlp": 1.0178988, "epoch": 0.3885047796549029, "flos": 20557530627840.0, "grad_norm": 3.2756999073342348, "language_loss": 0.84076393, "learning_rate": 2.7967400504008537e-06, "loss": 0.86292279, "num_input_tokens_seen": 69549780, "step": 3231, "time_per_iteration": 2.6840436458587646 }, { "auxiliary_loss_clip": 0.01098221, "auxiliary_loss_mlp": 0.01013723, "balance_loss_clip": 0.8775574, "balance_loss_mlp": 1.01126754, "epoch": 0.388625022545542, "flos": 64325491695360.0, "grad_norm": 0.8302810064182353, "language_loss": 0.57401901, "learning_rate": 2.7960254978776456e-06, "loss": 0.59513849, "num_input_tokens_seen": 69611870, "step": 3232, "time_per_iteration": 3.358896255493164 }, { "auxiliary_loss_clip": 0.01192266, "auxiliary_loss_mlp": 0.01029782, "balance_loss_clip": 1.05656016, "balance_loss_mlp": 1.02185488, "epoch": 0.3887452654361811, "flos": 18113881495680.0, "grad_norm": 2.2404855980746126, "language_loss": 0.81974632, "learning_rate": 2.7953108245980006e-06, "loss": 0.84196675, "num_input_tokens_seen": 69630385, "step": 3233, "time_per_iteration": 2.830672264099121 }, { "auxiliary_loss_clip": 0.01181154, "auxiliary_loss_mlp": 0.01026586, "balance_loss_clip": 0.97864765, "balance_loss_mlp": 1.01870573, "epoch": 0.38886550832682015, "flos": 24975261371520.0, "grad_norm": 1.558478278309921, "language_loss": 0.7351563, "learning_rate": 2.7945960306703365e-06, "loss": 0.75723368, "num_input_tokens_seen": 69653370, "step": 3234, "time_per_iteration": 2.828996419906616 }, { "auxiliary_loss_clip": 0.01189634, "auxiliary_loss_mlp": 0.01031091, "balance_loss_clip": 1.01615191, "balance_loss_mlp": 1.02336621, "epoch": 0.38898575121745926, "flos": 27199496275200.0, "grad_norm": 1.6061375620619993, "language_loss": 0.65657854, "learning_rate": 2.7938811162030865e-06, "loss": 0.6787858, "num_input_tokens_seen": 69673635, "step": 3235, "time_per_iteration": 2.763765335083008 }, { "auxiliary_loss_clip": 0.01183719, "auxiliary_loss_mlp": 0.01026154, "balance_loss_clip": 1.01648545, "balance_loss_mlp": 1.0186975, "epoch": 0.3891059941080984, "flos": 28763728727040.0, "grad_norm": 1.8235303888072636, "language_loss": 0.82171297, "learning_rate": 2.793166081304702e-06, "loss": 0.84381169, "num_input_tokens_seen": 69694130, "step": 3236, "time_per_iteration": 2.767437696456909 }, { "auxiliary_loss_clip": 0.01187598, "auxiliary_loss_mlp": 0.01027435, "balance_loss_clip": 0.93818331, "balance_loss_mlp": 1.01943564, "epoch": 0.38922623699873743, "flos": 22893447893760.0, "grad_norm": 1.876179053758285, "language_loss": 0.82619524, "learning_rate": 2.7924509260836543e-06, "loss": 0.84834552, "num_input_tokens_seen": 69713255, "step": 3237, "time_per_iteration": 2.8168582916259766 }, { "auxiliary_loss_clip": 0.01183312, "auxiliary_loss_mlp": 0.01029974, "balance_loss_clip": 0.93853396, "balance_loss_mlp": 1.02225494, "epoch": 0.38934647988937654, "flos": 19792418002560.0, "grad_norm": 1.4091360017151375, "language_loss": 0.6830523, "learning_rate": 2.791735650648431e-06, "loss": 0.70518512, "num_input_tokens_seen": 69732375, "step": 3238, "time_per_iteration": 2.7555437088012695 }, { "auxiliary_loss_clip": 0.01183219, "auxiliary_loss_mlp": 0.01028181, "balance_loss_clip": 0.97626567, "balance_loss_mlp": 1.01978898, "epoch": 0.38946672278001565, "flos": 19202081978880.0, "grad_norm": 2.228077785058058, "language_loss": 0.74654245, "learning_rate": 2.791020255107538e-06, "loss": 0.76865649, "num_input_tokens_seen": 69749745, "step": 3239, "time_per_iteration": 2.7437186241149902 }, { "auxiliary_loss_clip": 0.0117652, "auxiliary_loss_mlp": 0.0102672, "balance_loss_clip": 0.9367854, "balance_loss_mlp": 1.0189116, "epoch": 0.3895869656706547, "flos": 24936477661440.0, "grad_norm": 1.5739851519273946, "language_loss": 0.80792826, "learning_rate": 2.7903047395695023e-06, "loss": 0.82996064, "num_input_tokens_seen": 69769645, "step": 3240, "time_per_iteration": 3.7577106952667236 }, { "auxiliary_loss_clip": 0.01183817, "auxiliary_loss_mlp": 0.01124003, "balance_loss_clip": 1.01790333, "balance_loss_mlp": 0.0, "epoch": 0.3897072085612938, "flos": 24133622820480.0, "grad_norm": 2.154868501866276, "language_loss": 0.90480053, "learning_rate": 2.789589104142865e-06, "loss": 0.92787874, "num_input_tokens_seen": 69787270, "step": 3241, "time_per_iteration": 2.7320518493652344 }, { "auxiliary_loss_clip": 0.01188508, "auxiliary_loss_mlp": 0.01026234, "balance_loss_clip": 0.94096375, "balance_loss_mlp": 1.0183183, "epoch": 0.3898274514519329, "flos": 17166342672000.0, "grad_norm": 1.607713957090204, "language_loss": 0.7666719, "learning_rate": 2.7888733489361895e-06, "loss": 0.78881925, "num_input_tokens_seen": 69805685, "step": 3242, "time_per_iteration": 3.6696553230285645 }, { "auxiliary_loss_clip": 0.01095566, "auxiliary_loss_mlp": 0.01005812, "balance_loss_clip": 1.02994466, "balance_loss_mlp": 1.00355935, "epoch": 0.389947694342572, "flos": 66074807952000.0, "grad_norm": 0.7308255094502573, "language_loss": 0.58800995, "learning_rate": 2.788157474058054e-06, "loss": 0.60902369, "num_input_tokens_seen": 69867960, "step": 3243, "time_per_iteration": 3.304527521133423 }, { "auxiliary_loss_clip": 0.01180977, "auxiliary_loss_mlp": 0.01024135, "balance_loss_clip": 1.0513165, "balance_loss_mlp": 1.01641583, "epoch": 0.3900679372332111, "flos": 25740912700800.0, "grad_norm": 1.4951291931924562, "language_loss": 0.69841146, "learning_rate": 2.7874414796170555e-06, "loss": 0.72046262, "num_input_tokens_seen": 69889450, "step": 3244, "time_per_iteration": 2.7032618522644043 }, { "auxiliary_loss_clip": 0.01178862, "auxiliary_loss_mlp": 0.01029865, "balance_loss_clip": 1.01380241, "balance_loss_mlp": 1.02163339, "epoch": 0.3901881801238502, "flos": 11801611808640.0, "grad_norm": 2.514918020559133, "language_loss": 0.83749849, "learning_rate": 2.7867253657218113e-06, "loss": 0.85958582, "num_input_tokens_seen": 69903340, "step": 3245, "time_per_iteration": 2.663862466812134 }, { "auxiliary_loss_clip": 0.01179977, "auxiliary_loss_mlp": 0.01124326, "balance_loss_clip": 0.9740752, "balance_loss_mlp": 0.0, "epoch": 0.39030842301448926, "flos": 27308951994240.0, "grad_norm": 1.6181211382678755, "language_loss": 0.73131341, "learning_rate": 2.7860091324809544e-06, "loss": 0.7543565, "num_input_tokens_seen": 69924400, "step": 3246, "time_per_iteration": 2.763366460800171 }, { "auxiliary_loss_clip": 0.01184387, "auxiliary_loss_mlp": 0.0102192, "balance_loss_clip": 1.01903224, "balance_loss_mlp": 1.01429021, "epoch": 0.39042866590512837, "flos": 27163334257920.0, "grad_norm": 1.8468322275829887, "language_loss": 0.8115325, "learning_rate": 2.7852927800031377e-06, "loss": 0.83359551, "num_input_tokens_seen": 69944565, "step": 3247, "time_per_iteration": 3.7535085678100586 }, { "auxiliary_loss_clip": 0.01184112, "auxiliary_loss_mlp": 0.01028585, "balance_loss_clip": 0.97737396, "balance_loss_mlp": 1.02092004, "epoch": 0.3905489087957674, "flos": 29716115886720.0, "grad_norm": 1.8253396109424747, "language_loss": 0.8269462, "learning_rate": 2.7845763083970298e-06, "loss": 0.84907317, "num_input_tokens_seen": 69964965, "step": 3248, "time_per_iteration": 2.78688645362854 }, { "auxiliary_loss_clip": 0.01175413, "auxiliary_loss_mlp": 0.01024873, "balance_loss_clip": 1.01377892, "balance_loss_mlp": 1.01674247, "epoch": 0.39066915168640653, "flos": 24498618871680.0, "grad_norm": 1.8445058750215084, "language_loss": 0.81759578, "learning_rate": 2.7838597177713205e-06, "loss": 0.83959866, "num_input_tokens_seen": 69986055, "step": 3249, "time_per_iteration": 3.675887107849121 }, { "auxiliary_loss_clip": 0.0117071, "auxiliary_loss_mlp": 0.01028049, "balance_loss_clip": 0.86239403, "balance_loss_mlp": 1.01995504, "epoch": 0.39078939457704565, "flos": 20558572122240.0, "grad_norm": 1.7046074720610764, "language_loss": 0.73609614, "learning_rate": 2.7831430082347143e-06, "loss": 0.75808376, "num_input_tokens_seen": 70005260, "step": 3250, "time_per_iteration": 2.763711929321289 }, { "auxiliary_loss_clip": 0.01186661, "auxiliary_loss_mlp": 0.01123552, "balance_loss_clip": 1.01763439, "balance_loss_mlp": 0.0, "epoch": 0.3909096374676847, "flos": 22783417557120.0, "grad_norm": 1.849461760474183, "language_loss": 0.82198608, "learning_rate": 2.7824261798959373e-06, "loss": 0.84508824, "num_input_tokens_seen": 70023440, "step": 3251, "time_per_iteration": 2.602813243865967 }, { "auxiliary_loss_clip": 0.01183251, "auxiliary_loss_mlp": 0.01029501, "balance_loss_clip": 0.97408801, "balance_loss_mlp": 1.02166867, "epoch": 0.3910298803583238, "flos": 23003119094400.0, "grad_norm": 1.7586196330549413, "language_loss": 0.79735816, "learning_rate": 2.78170923286373e-06, "loss": 0.81948566, "num_input_tokens_seen": 70043040, "step": 3252, "time_per_iteration": 2.7489404678344727 }, { "auxiliary_loss_clip": 0.01177837, "auxiliary_loss_mlp": 0.01029418, "balance_loss_clip": 0.82740116, "balance_loss_mlp": 1.02113271, "epoch": 0.3911501232489629, "flos": 24316264500480.0, "grad_norm": 2.423790069492652, "language_loss": 0.84044045, "learning_rate": 2.780992167246854e-06, "loss": 0.86251295, "num_input_tokens_seen": 70060565, "step": 3253, "time_per_iteration": 2.844003677368164 }, { "auxiliary_loss_clip": 0.01099907, "auxiliary_loss_mlp": 0.01000699, "balance_loss_clip": 0.95316505, "balance_loss_mlp": 0.99843448, "epoch": 0.391270366139602, "flos": 60869054684160.0, "grad_norm": 0.9679550067174092, "language_loss": 0.72184753, "learning_rate": 2.7802749831540883e-06, "loss": 0.74285364, "num_input_tokens_seen": 70119465, "step": 3254, "time_per_iteration": 3.2908663749694824 }, { "auxiliary_loss_clip": 0.01184312, "auxiliary_loss_mlp": 0.01026486, "balance_loss_clip": 0.89985716, "balance_loss_mlp": 1.01870775, "epoch": 0.3913906090302411, "flos": 21543494025600.0, "grad_norm": 1.8415505654874702, "language_loss": 0.81649423, "learning_rate": 2.7795576806942268e-06, "loss": 0.83860219, "num_input_tokens_seen": 70138270, "step": 3255, "time_per_iteration": 2.789083242416382 }, { "auxiliary_loss_clip": 0.01102892, "auxiliary_loss_mlp": 0.01011721, "balance_loss_clip": 0.96512967, "balance_loss_mlp": 1.00953913, "epoch": 0.3915108519208802, "flos": 49839953702400.0, "grad_norm": 0.7688424752823143, "language_loss": 0.54930007, "learning_rate": 2.778840259976085e-06, "loss": 0.57044619, "num_input_tokens_seen": 70193500, "step": 3256, "time_per_iteration": 3.225407361984253 }, { "auxiliary_loss_clip": 0.01186208, "auxiliary_loss_mlp": 0.01028269, "balance_loss_clip": 1.01494503, "balance_loss_mlp": 1.02028203, "epoch": 0.39163109481151925, "flos": 16506447960960.0, "grad_norm": 2.09009621630617, "language_loss": 0.77350473, "learning_rate": 2.778122721108495e-06, "loss": 0.79564953, "num_input_tokens_seen": 70211730, "step": 3257, "time_per_iteration": 2.7008984088897705 }, { "auxiliary_loss_clip": 0.01182854, "auxiliary_loss_mlp": 0.01031789, "balance_loss_clip": 1.01795423, "balance_loss_mlp": 1.02387381, "epoch": 0.39175133770215836, "flos": 26067484177920.0, "grad_norm": 1.8118415347102772, "language_loss": 0.88280869, "learning_rate": 2.7774050642003076e-06, "loss": 0.90495515, "num_input_tokens_seen": 70232540, "step": 3258, "time_per_iteration": 2.727943181991577 }, { "auxiliary_loss_clip": 0.01189489, "auxiliary_loss_mlp": 0.01030558, "balance_loss_clip": 1.05617642, "balance_loss_mlp": 1.02282715, "epoch": 0.3918715805927975, "flos": 21872076664320.0, "grad_norm": 2.855150416191539, "language_loss": 0.93541265, "learning_rate": 2.7766872893603896e-06, "loss": 0.95761311, "num_input_tokens_seen": 70252515, "step": 3259, "time_per_iteration": 2.739384889602661 }, { "auxiliary_loss_clip": 0.01186944, "auxiliary_loss_mlp": 0.01032206, "balance_loss_clip": 1.01746249, "balance_loss_mlp": 1.02495193, "epoch": 0.39199182348343653, "flos": 20376181837440.0, "grad_norm": 4.565914559556942, "language_loss": 0.7312144, "learning_rate": 2.7759693966976275e-06, "loss": 0.75340581, "num_input_tokens_seen": 70271020, "step": 3260, "time_per_iteration": 2.662269115447998 }, { "auxiliary_loss_clip": 0.01178843, "auxiliary_loss_mlp": 0.01028427, "balance_loss_clip": 0.93678606, "balance_loss_mlp": 1.02021933, "epoch": 0.39211206637407564, "flos": 21683545153920.0, "grad_norm": 2.8222482662010826, "language_loss": 0.85057944, "learning_rate": 2.7752513863209242e-06, "loss": 0.87265217, "num_input_tokens_seen": 70289600, "step": 3261, "time_per_iteration": 2.7752909660339355 }, { "auxiliary_loss_clip": 0.01179896, "auxiliary_loss_mlp": 0.01123483, "balance_loss_clip": 0.97969902, "balance_loss_mlp": 0.0, "epoch": 0.39223230926471475, "flos": 21066276908160.0, "grad_norm": 1.5964906008498059, "language_loss": 0.84654772, "learning_rate": 2.774533258339203e-06, "loss": 0.86958158, "num_input_tokens_seen": 70307060, "step": 3262, "time_per_iteration": 2.694809913635254 }, { "auxiliary_loss_clip": 0.01179834, "auxiliary_loss_mlp": 0.01027338, "balance_loss_clip": 0.89677572, "balance_loss_mlp": 1.01910639, "epoch": 0.3923525521553538, "flos": 17603016312960.0, "grad_norm": 3.556696744787129, "language_loss": 0.79624879, "learning_rate": 2.7738150128614014e-06, "loss": 0.81832045, "num_input_tokens_seen": 70324465, "step": 3263, "time_per_iteration": 2.9223415851593018 }, { "auxiliary_loss_clip": 0.0116983, "auxiliary_loss_mlp": 0.01024731, "balance_loss_clip": 0.93733102, "balance_loss_mlp": 1.01670241, "epoch": 0.3924727950459929, "flos": 20558284813440.0, "grad_norm": 1.6770855256298571, "language_loss": 0.89566708, "learning_rate": 2.7730966499964777e-06, "loss": 0.91761273, "num_input_tokens_seen": 70341415, "step": 3264, "time_per_iteration": 2.841647148132324 }, { "auxiliary_loss_clip": 0.01185359, "auxiliary_loss_mlp": 0.01033407, "balance_loss_clip": 1.05152941, "balance_loss_mlp": 1.02532434, "epoch": 0.39259303793663197, "flos": 16216110328320.0, "grad_norm": 7.248791152136381, "language_loss": 0.80841726, "learning_rate": 2.772378169853408e-06, "loss": 0.83060491, "num_input_tokens_seen": 70358985, "step": 3265, "time_per_iteration": 2.653513193130493 }, { "auxiliary_loss_clip": 0.01185456, "auxiliary_loss_mlp": 0.01029781, "balance_loss_clip": 0.94197345, "balance_loss_mlp": 1.02239871, "epoch": 0.3927132808272711, "flos": 16797001075200.0, "grad_norm": 1.9899274232279656, "language_loss": 0.74318731, "learning_rate": 2.771659572541183e-06, "loss": 0.76533967, "num_input_tokens_seen": 70376915, "step": 3266, "time_per_iteration": 3.6862852573394775 }, { "auxiliary_loss_clip": 0.01187796, "auxiliary_loss_mlp": 0.01030348, "balance_loss_clip": 1.01695263, "balance_loss_mlp": 1.02263486, "epoch": 0.3928335237179102, "flos": 20267228908800.0, "grad_norm": 1.890475176010539, "language_loss": 0.87297934, "learning_rate": 2.7709408581688143e-06, "loss": 0.89516079, "num_input_tokens_seen": 70396900, "step": 3267, "time_per_iteration": 2.6700541973114014 }, { "auxiliary_loss_clip": 0.01189112, "auxiliary_loss_mlp": 0.01029089, "balance_loss_clip": 0.94002223, "balance_loss_mlp": 1.02137041, "epoch": 0.39295376660854925, "flos": 24973250209920.0, "grad_norm": 1.5815003144807638, "language_loss": 0.88215208, "learning_rate": 2.7702220268453307e-06, "loss": 0.90433407, "num_input_tokens_seen": 70417260, "step": 3268, "time_per_iteration": 3.817883014678955 }, { "auxiliary_loss_clip": 0.01188208, "auxiliary_loss_mlp": 0.01030353, "balance_loss_clip": 0.97721976, "balance_loss_mlp": 1.02179432, "epoch": 0.39307400949918836, "flos": 18697788984960.0, "grad_norm": 2.520955338745875, "language_loss": 0.84757984, "learning_rate": 2.7695030786797785e-06, "loss": 0.86976552, "num_input_tokens_seen": 70433155, "step": 3269, "time_per_iteration": 2.7012035846710205 }, { "auxiliary_loss_clip": 0.01176391, "auxiliary_loss_mlp": 0.01026742, "balance_loss_clip": 0.9003576, "balance_loss_mlp": 1.01882041, "epoch": 0.39319425238982747, "flos": 22415476590720.0, "grad_norm": 2.0434440641596985, "language_loss": 0.74331057, "learning_rate": 2.7687840137812206e-06, "loss": 0.76534188, "num_input_tokens_seen": 70451240, "step": 3270, "time_per_iteration": 2.7494781017303467 }, { "auxiliary_loss_clip": 0.01085389, "auxiliary_loss_mlp": 0.01002625, "balance_loss_clip": 0.98464906, "balance_loss_mlp": 1.00043154, "epoch": 0.3933144952804665, "flos": 66192954762240.0, "grad_norm": 0.8010458440255932, "language_loss": 0.6213432, "learning_rate": 2.7680648322587395e-06, "loss": 0.64222336, "num_input_tokens_seen": 70516115, "step": 3271, "time_per_iteration": 3.232682228088379 }, { "auxiliary_loss_clip": 0.01186234, "auxiliary_loss_mlp": 0.0103056, "balance_loss_clip": 1.05441511, "balance_loss_mlp": 1.02280569, "epoch": 0.39343473817110564, "flos": 15487159720320.0, "grad_norm": 1.8957259989162598, "language_loss": 0.80950546, "learning_rate": 2.7673455342214334e-06, "loss": 0.83167344, "num_input_tokens_seen": 70533105, "step": 3272, "time_per_iteration": 2.606093406677246 }, { "auxiliary_loss_clip": 0.01186796, "auxiliary_loss_mlp": 0.01034767, "balance_loss_clip": 1.01685238, "balance_loss_mlp": 1.02646375, "epoch": 0.39355498106174475, "flos": 21324905809920.0, "grad_norm": 1.9104338663672158, "language_loss": 0.76251894, "learning_rate": 2.7666261197784198e-06, "loss": 0.78473461, "num_input_tokens_seen": 70551920, "step": 3273, "time_per_iteration": 3.5986874103546143 }, { "auxiliary_loss_clip": 0.01179995, "auxiliary_loss_mlp": 0.01030794, "balance_loss_clip": 0.97918785, "balance_loss_mlp": 1.02311957, "epoch": 0.3936752239523838, "flos": 13296357400320.0, "grad_norm": 2.077867789767854, "language_loss": 0.76575732, "learning_rate": 2.7659065890388336e-06, "loss": 0.78786516, "num_input_tokens_seen": 70567920, "step": 3274, "time_per_iteration": 2.715397834777832 }, { "auxiliary_loss_clip": 0.01183012, "auxiliary_loss_mlp": 0.01027018, "balance_loss_clip": 0.97642195, "balance_loss_mlp": 1.01906049, "epoch": 0.3937954668430229, "flos": 16800161472000.0, "grad_norm": 1.7133925269283303, "language_loss": 0.84379387, "learning_rate": 2.7651869421118266e-06, "loss": 0.8658942, "num_input_tokens_seen": 70584530, "step": 3275, "time_per_iteration": 3.6283011436462402 }, { "auxiliary_loss_clip": 0.01191267, "auxiliary_loss_mlp": 0.01035873, "balance_loss_clip": 1.0193975, "balance_loss_mlp": 1.02818406, "epoch": 0.393915709733662, "flos": 21064229832960.0, "grad_norm": 1.6953857053555113, "language_loss": 0.82638359, "learning_rate": 2.76446717910657e-06, "loss": 0.84865499, "num_input_tokens_seen": 70605235, "step": 3276, "time_per_iteration": 2.7742793560028076 }, { "auxiliary_loss_clip": 0.01180865, "auxiliary_loss_mlp": 0.01022246, "balance_loss_clip": 1.01498723, "balance_loss_mlp": 1.01443839, "epoch": 0.3940359526243011, "flos": 17165265264000.0, "grad_norm": 2.067874471391789, "language_loss": 0.76466227, "learning_rate": 2.763747300132249e-06, "loss": 0.78669339, "num_input_tokens_seen": 70622675, "step": 3277, "time_per_iteration": 2.6240346431732178 }, { "auxiliary_loss_clip": 0.01187909, "auxiliary_loss_mlp": 0.01030229, "balance_loss_clip": 1.05612886, "balance_loss_mlp": 1.02218819, "epoch": 0.3941561955149402, "flos": 20995856294400.0, "grad_norm": 1.5647005490883055, "language_loss": 0.86459512, "learning_rate": 2.7630273052980704e-06, "loss": 0.88677645, "num_input_tokens_seen": 70643265, "step": 3278, "time_per_iteration": 2.727639675140381 }, { "auxiliary_loss_clip": 0.01172829, "auxiliary_loss_mlp": 0.01030221, "balance_loss_clip": 0.97537398, "balance_loss_mlp": 1.02260375, "epoch": 0.39427643840557924, "flos": 18843406721280.0, "grad_norm": 1.923297317074496, "language_loss": 0.67267275, "learning_rate": 2.7623071947132554e-06, "loss": 0.69470322, "num_input_tokens_seen": 70660295, "step": 3279, "time_per_iteration": 2.65450382232666 }, { "auxiliary_loss_clip": 0.01187747, "auxiliary_loss_mlp": 0.0102996, "balance_loss_clip": 0.97581846, "balance_loss_mlp": 1.02144837, "epoch": 0.39439668129621835, "flos": 23258659426560.0, "grad_norm": 2.163981815277143, "language_loss": 0.78768539, "learning_rate": 2.7615869684870458e-06, "loss": 0.80986243, "num_input_tokens_seen": 70679605, "step": 3280, "time_per_iteration": 2.7329161167144775 }, { "auxiliary_loss_clip": 0.01184851, "auxiliary_loss_mlp": 0.01029623, "balance_loss_clip": 1.01627207, "balance_loss_mlp": 1.02148068, "epoch": 0.39451692418685746, "flos": 26652289507200.0, "grad_norm": 1.6048446249632977, "language_loss": 0.84426796, "learning_rate": 2.7608666267286986e-06, "loss": 0.8664127, "num_input_tokens_seen": 70699835, "step": 3281, "time_per_iteration": 2.6806411743164062 }, { "auxiliary_loss_clip": 0.0117464, "auxiliary_loss_mlp": 0.0102072, "balance_loss_clip": 0.85954243, "balance_loss_mlp": 1.01312661, "epoch": 0.3946371670774965, "flos": 18258709132800.0, "grad_norm": 2.1126115478268916, "language_loss": 0.86376095, "learning_rate": 2.760146169547489e-06, "loss": 0.88571447, "num_input_tokens_seen": 70716600, "step": 3282, "time_per_iteration": 2.7648098468780518 }, { "auxiliary_loss_clip": 0.01190677, "auxiliary_loss_mlp": 0.0103186, "balance_loss_clip": 0.98384887, "balance_loss_mlp": 1.02355742, "epoch": 0.39475740996813563, "flos": 24206126423040.0, "grad_norm": 1.7722126153426652, "language_loss": 0.7628153, "learning_rate": 2.75942559705271e-06, "loss": 0.78504062, "num_input_tokens_seen": 70736335, "step": 3283, "time_per_iteration": 2.8608100414276123 }, { "auxiliary_loss_clip": 0.01183015, "auxiliary_loss_mlp": 0.01023909, "balance_loss_clip": 1.01519775, "balance_loss_mlp": 1.01616693, "epoch": 0.39487765285877474, "flos": 19317858491520.0, "grad_norm": 1.8244626799189938, "language_loss": 0.8921442, "learning_rate": 2.7587049093536713e-06, "loss": 0.91421342, "num_input_tokens_seen": 70752665, "step": 3284, "time_per_iteration": 2.6132988929748535 }, { "auxiliary_loss_clip": 0.01193055, "auxiliary_loss_mlp": 0.01026822, "balance_loss_clip": 1.01854205, "balance_loss_mlp": 1.01931798, "epoch": 0.3949978957494138, "flos": 17311744926720.0, "grad_norm": 1.821493482454806, "language_loss": 0.80252576, "learning_rate": 2.757984106559701e-06, "loss": 0.82472456, "num_input_tokens_seen": 70771650, "step": 3285, "time_per_iteration": 2.672626495361328 }, { "auxiliary_loss_clip": 0.01173945, "auxiliary_loss_mlp": 0.01024388, "balance_loss_clip": 0.9763031, "balance_loss_mlp": 1.01688647, "epoch": 0.3951181386400529, "flos": 36317861280000.0, "grad_norm": 2.3740749040728995, "language_loss": 0.70934606, "learning_rate": 2.7572631887801446e-06, "loss": 0.73132938, "num_input_tokens_seen": 70793275, "step": 3286, "time_per_iteration": 3.006066083908081 }, { "auxiliary_loss_clip": 0.01184375, "auxiliary_loss_mlp": 0.01028531, "balance_loss_clip": 1.01403189, "balance_loss_mlp": 1.02081847, "epoch": 0.395238381530692, "flos": 23110348170240.0, "grad_norm": 1.5852340974778385, "language_loss": 0.7654615, "learning_rate": 2.7565421561243654e-06, "loss": 0.78759056, "num_input_tokens_seen": 70811440, "step": 3287, "time_per_iteration": 2.679288625717163 }, { "auxiliary_loss_clip": 0.01176835, "auxiliary_loss_mlp": 0.01029027, "balance_loss_clip": 0.9383328, "balance_loss_mlp": 1.02155304, "epoch": 0.3953586244213311, "flos": 24347614095360.0, "grad_norm": 1.9408948748850223, "language_loss": 0.82079422, "learning_rate": 2.7558210087017413e-06, "loss": 0.84285289, "num_input_tokens_seen": 70831375, "step": 3288, "time_per_iteration": 2.882251024246216 }, { "auxiliary_loss_clip": 0.01179125, "auxiliary_loss_mlp": 0.01031123, "balance_loss_clip": 0.94079316, "balance_loss_mlp": 1.02321386, "epoch": 0.3954788673119702, "flos": 23440080044160.0, "grad_norm": 1.836150722998474, "language_loss": 0.73208022, "learning_rate": 2.7550997466216724e-06, "loss": 0.7541827, "num_input_tokens_seen": 70849170, "step": 3289, "time_per_iteration": 2.7167747020721436 }, { "auxiliary_loss_clip": 0.01184541, "auxiliary_loss_mlp": 0.01033558, "balance_loss_clip": 0.98233682, "balance_loss_mlp": 1.02638793, "epoch": 0.3955991102026093, "flos": 17494063384320.0, "grad_norm": 1.8671328368092395, "language_loss": 0.8154484, "learning_rate": 2.7543783699935714e-06, "loss": 0.83762938, "num_input_tokens_seen": 70867200, "step": 3290, "time_per_iteration": 2.6630687713623047 }, { "auxiliary_loss_clip": 0.01183378, "auxiliary_loss_mlp": 0.01032793, "balance_loss_clip": 1.01757336, "balance_loss_mlp": 1.02530706, "epoch": 0.39571935309324835, "flos": 18221326053120.0, "grad_norm": 2.450352520470571, "language_loss": 0.86004281, "learning_rate": 2.753656878926872e-06, "loss": 0.88220453, "num_input_tokens_seen": 70883080, "step": 3291, "time_per_iteration": 2.675614595413208 }, { "auxiliary_loss_clip": 0.01168844, "auxiliary_loss_mlp": 0.01024205, "balance_loss_clip": 0.9720453, "balance_loss_mlp": 1.01626539, "epoch": 0.39583959598388746, "flos": 17748813617280.0, "grad_norm": 1.7565173812581307, "language_loss": 0.73981988, "learning_rate": 2.752935273531023e-06, "loss": 0.76175034, "num_input_tokens_seen": 70901230, "step": 3292, "time_per_iteration": 2.7373502254486084 }, { "auxiliary_loss_clip": 0.01186791, "auxiliary_loss_mlp": 0.01030989, "balance_loss_clip": 1.01635456, "balance_loss_mlp": 1.0226388, "epoch": 0.39595983887452657, "flos": 19352368483200.0, "grad_norm": 1.7039518399652107, "language_loss": 0.78179026, "learning_rate": 2.752213553915492e-06, "loss": 0.80396807, "num_input_tokens_seen": 70919585, "step": 3293, "time_per_iteration": 3.6607956886291504 }, { "auxiliary_loss_clip": 0.01091644, "auxiliary_loss_mlp": 0.01004333, "balance_loss_clip": 0.94939899, "balance_loss_mlp": 1.00211608, "epoch": 0.3960800817651656, "flos": 60682282940160.0, "grad_norm": 0.8131323278449963, "language_loss": 0.66064119, "learning_rate": 2.751491720189762e-06, "loss": 0.68160099, "num_input_tokens_seen": 70977695, "step": 3294, "time_per_iteration": 4.215333461761475 }, { "auxiliary_loss_clip": 0.01184657, "auxiliary_loss_mlp": 0.01123624, "balance_loss_clip": 0.979819, "balance_loss_mlp": 0.0, "epoch": 0.39620032465580474, "flos": 16836718538880.0, "grad_norm": 2.315492048399918, "language_loss": 0.91670603, "learning_rate": 2.7507697724633364e-06, "loss": 0.93978882, "num_input_tokens_seen": 70994455, "step": 3295, "time_per_iteration": 2.728485345840454 }, { "auxiliary_loss_clip": 0.01099436, "auxiliary_loss_mlp": 0.01007662, "balance_loss_clip": 0.92177761, "balance_loss_mlp": 1.00556397, "epoch": 0.3963205675464438, "flos": 69071445941760.0, "grad_norm": 0.7823033988041413, "language_loss": 0.54715824, "learning_rate": 2.7500477108457327e-06, "loss": 0.5682292, "num_input_tokens_seen": 71046465, "step": 3296, "time_per_iteration": 3.2144484519958496 }, { "auxiliary_loss_clip": 0.01181491, "auxiliary_loss_mlp": 0.0102927, "balance_loss_clip": 1.01419258, "balance_loss_mlp": 1.02151799, "epoch": 0.3964408104370829, "flos": 25667439431040.0, "grad_norm": 1.8987138879760401, "language_loss": 0.80704093, "learning_rate": 2.7493255354464877e-06, "loss": 0.82914853, "num_input_tokens_seen": 71064275, "step": 3297, "time_per_iteration": 2.7017805576324463 }, { "auxiliary_loss_clip": 0.01156512, "auxiliary_loss_mlp": 0.0102717, "balance_loss_clip": 0.74095607, "balance_loss_mlp": 1.01933765, "epoch": 0.396561053327722, "flos": 24277480790400.0, "grad_norm": 1.7936849302262776, "language_loss": 0.76166004, "learning_rate": 2.748603246375156e-06, "loss": 0.78349686, "num_input_tokens_seen": 71082290, "step": 3298, "time_per_iteration": 3.0109550952911377 }, { "auxiliary_loss_clip": 0.01187095, "auxiliary_loss_mlp": 0.01029093, "balance_loss_clip": 1.0561173, "balance_loss_mlp": 1.02147532, "epoch": 0.39668129621836107, "flos": 20522302364160.0, "grad_norm": 1.9925456358788394, "language_loss": 0.69653398, "learning_rate": 2.7478808437413055e-06, "loss": 0.71869588, "num_input_tokens_seen": 71101700, "step": 3299, "time_per_iteration": 3.69246768951416 }, { "auxiliary_loss_clip": 0.01182547, "auxiliary_loss_mlp": 0.01026629, "balance_loss_clip": 0.90441716, "balance_loss_mlp": 1.01886249, "epoch": 0.3968015391090002, "flos": 27052585649280.0, "grad_norm": 2.001745911521979, "language_loss": 0.6618095, "learning_rate": 2.7471583276545263e-06, "loss": 0.68390125, "num_input_tokens_seen": 71122360, "step": 3300, "time_per_iteration": 2.8162009716033936 }, { "auxiliary_loss_clip": 0.01183775, "auxiliary_loss_mlp": 0.01025709, "balance_loss_clip": 0.97635192, "balance_loss_mlp": 1.01773369, "epoch": 0.3969217819996393, "flos": 12531819392640.0, "grad_norm": 1.9044037699459564, "language_loss": 0.70404196, "learning_rate": 2.7464356982244224e-06, "loss": 0.7261368, "num_input_tokens_seen": 71140360, "step": 3301, "time_per_iteration": 4.168538331985474 }, { "auxiliary_loss_clip": 0.01097002, "auxiliary_loss_mlp": 0.01003731, "balance_loss_clip": 0.99714184, "balance_loss_mlp": 1.00165701, "epoch": 0.39704202489027834, "flos": 66241399230720.0, "grad_norm": 0.7916274747372686, "language_loss": 0.61746228, "learning_rate": 2.745712955560617e-06, "loss": 0.63846958, "num_input_tokens_seen": 71196565, "step": 3302, "time_per_iteration": 3.2351982593536377 }, { "auxiliary_loss_clip": 0.01180512, "auxiliary_loss_mlp": 0.01029503, "balance_loss_clip": 0.86430258, "balance_loss_mlp": 1.02105689, "epoch": 0.39716226778091746, "flos": 16982982720000.0, "grad_norm": 6.8371526554491195, "language_loss": 0.76665282, "learning_rate": 2.7449900997727496e-06, "loss": 0.78875291, "num_input_tokens_seen": 71214675, "step": 3303, "time_per_iteration": 2.808791160583496 }, { "auxiliary_loss_clip": 0.01184119, "auxiliary_loss_mlp": 0.01028196, "balance_loss_clip": 0.98203433, "balance_loss_mlp": 1.02067423, "epoch": 0.39728251067155657, "flos": 23477139901440.0, "grad_norm": 1.8527527376280946, "language_loss": 0.84004498, "learning_rate": 2.744267130970476e-06, "loss": 0.86216819, "num_input_tokens_seen": 71234400, "step": 3304, "time_per_iteration": 2.7307217121124268 }, { "auxiliary_loss_clip": 0.01175858, "auxiliary_loss_mlp": 0.01032856, "balance_loss_clip": 0.97717673, "balance_loss_mlp": 1.0253042, "epoch": 0.3974027535621956, "flos": 20704441253760.0, "grad_norm": 1.8163645812779594, "language_loss": 0.76793301, "learning_rate": 2.7435440492634697e-06, "loss": 0.79002017, "num_input_tokens_seen": 71253725, "step": 3305, "time_per_iteration": 2.8322153091430664 }, { "auxiliary_loss_clip": 0.0118373, "auxiliary_loss_mlp": 0.01032621, "balance_loss_clip": 0.97703969, "balance_loss_mlp": 1.02381098, "epoch": 0.39752299645283473, "flos": 21543278544000.0, "grad_norm": 2.301788835516328, "language_loss": 0.67397958, "learning_rate": 2.7428208547614228e-06, "loss": 0.69614309, "num_input_tokens_seen": 71273220, "step": 3306, "time_per_iteration": 2.782687187194824 }, { "auxiliary_loss_clip": 0.01182419, "auxiliary_loss_mlp": 0.01033194, "balance_loss_clip": 1.01540327, "balance_loss_mlp": 1.02521276, "epoch": 0.39764323934347384, "flos": 19208295031680.0, "grad_norm": 1.8814592219514306, "language_loss": 0.77398598, "learning_rate": 2.742097547574043e-06, "loss": 0.79614216, "num_input_tokens_seen": 71291445, "step": 3307, "time_per_iteration": 2.7122695446014404 }, { "auxiliary_loss_clip": 0.01189215, "auxiliary_loss_mlp": 0.01124063, "balance_loss_clip": 0.97650945, "balance_loss_mlp": 0.0, "epoch": 0.3977634822341129, "flos": 20850202644480.0, "grad_norm": 1.9696928540328376, "language_loss": 0.77866215, "learning_rate": 2.7413741278110544e-06, "loss": 0.80179489, "num_input_tokens_seen": 71310135, "step": 3308, "time_per_iteration": 2.7266156673431396 }, { "auxiliary_loss_clip": 0.01188362, "auxiliary_loss_mlp": 0.0102753, "balance_loss_clip": 0.98031008, "balance_loss_mlp": 1.01901221, "epoch": 0.397883725124752, "flos": 39786042038400.0, "grad_norm": 2.1291462511928394, "language_loss": 0.68475842, "learning_rate": 2.7406505955822016e-06, "loss": 0.70691729, "num_input_tokens_seen": 71331160, "step": 3309, "time_per_iteration": 2.8500425815582275 }, { "auxiliary_loss_clip": 0.01182633, "auxiliary_loss_mlp": 0.01032155, "balance_loss_clip": 0.97749293, "balance_loss_mlp": 1.02370274, "epoch": 0.39800396801539106, "flos": 17379507934080.0, "grad_norm": 2.07564995538058, "language_loss": 0.66569215, "learning_rate": 2.7399269509972415e-06, "loss": 0.68784004, "num_input_tokens_seen": 71345315, "step": 3310, "time_per_iteration": 2.703982353210449 }, { "auxiliary_loss_clip": 0.01177122, "auxiliary_loss_mlp": 0.01027331, "balance_loss_clip": 0.9743582, "balance_loss_mlp": 1.01854563, "epoch": 0.3981242109060302, "flos": 19202764337280.0, "grad_norm": 2.0302536057468683, "language_loss": 0.85331357, "learning_rate": 2.7392031941659514e-06, "loss": 0.8753581, "num_input_tokens_seen": 71363160, "step": 3311, "time_per_iteration": 2.6693837642669678 }, { "auxiliary_loss_clip": 0.01185714, "auxiliary_loss_mlp": 0.01029252, "balance_loss_clip": 0.98191249, "balance_loss_mlp": 1.0216409, "epoch": 0.3982444537966693, "flos": 24565124903040.0, "grad_norm": 1.909435098182018, "language_loss": 0.85756928, "learning_rate": 2.7384793251981244e-06, "loss": 0.8797189, "num_input_tokens_seen": 71382145, "step": 3312, "time_per_iteration": 2.7014448642730713 }, { "auxiliary_loss_clip": 0.01188943, "auxiliary_loss_mlp": 0.01028662, "balance_loss_clip": 1.0146277, "balance_loss_mlp": 1.02034163, "epoch": 0.39836469668730834, "flos": 26213856099840.0, "grad_norm": 1.9487922985733113, "language_loss": 0.80782104, "learning_rate": 2.737755344203571e-06, "loss": 0.82999706, "num_input_tokens_seen": 71402095, "step": 3313, "time_per_iteration": 2.6786723136901855 }, { "auxiliary_loss_clip": 0.01188277, "auxiliary_loss_mlp": 0.01028872, "balance_loss_clip": 1.01794791, "balance_loss_mlp": 1.02197862, "epoch": 0.39848493957794745, "flos": 27636134002560.0, "grad_norm": 1.5455275586273625, "language_loss": 0.79895985, "learning_rate": 2.7370312512921186e-06, "loss": 0.82113135, "num_input_tokens_seen": 71423875, "step": 3314, "time_per_iteration": 2.7227885723114014 }, { "auxiliary_loss_clip": 0.01186436, "auxiliary_loss_mlp": 0.01026913, "balance_loss_clip": 0.97622991, "balance_loss_mlp": 1.01886618, "epoch": 0.39860518246858656, "flos": 12239326944000.0, "grad_norm": 2.4727353398497742, "language_loss": 0.76383686, "learning_rate": 2.736307046573611e-06, "loss": 0.78597033, "num_input_tokens_seen": 71439745, "step": 3315, "time_per_iteration": 2.623366355895996 }, { "auxiliary_loss_clip": 0.01186494, "auxiliary_loss_mlp": 0.01023721, "balance_loss_clip": 1.05489421, "balance_loss_mlp": 1.01622868, "epoch": 0.3987254253592256, "flos": 22379135005440.0, "grad_norm": 1.8481753532563245, "language_loss": 0.8188284, "learning_rate": 2.73558273015791e-06, "loss": 0.84093058, "num_input_tokens_seen": 71459575, "step": 3316, "time_per_iteration": 2.6960949897766113 }, { "auxiliary_loss_clip": 0.01190203, "auxiliary_loss_mlp": 0.01030699, "balance_loss_clip": 1.05644739, "balance_loss_mlp": 1.02278948, "epoch": 0.3988456682498647, "flos": 23514020190720.0, "grad_norm": 1.9846689588040598, "language_loss": 0.70418847, "learning_rate": 2.734858302154894e-06, "loss": 0.72639751, "num_input_tokens_seen": 71481075, "step": 3317, "time_per_iteration": 2.617588758468628 }, { "auxiliary_loss_clip": 0.01179884, "auxiliary_loss_mlp": 0.01030659, "balance_loss_clip": 0.97684169, "balance_loss_mlp": 1.02299356, "epoch": 0.39896591114050384, "flos": 19208761908480.0, "grad_norm": 1.929115703112892, "language_loss": 0.76288879, "learning_rate": 2.734133762674457e-06, "loss": 0.78499413, "num_input_tokens_seen": 71500665, "step": 3318, "time_per_iteration": 2.763009548187256 }, { "auxiliary_loss_clip": 0.01185236, "auxiliary_loss_mlp": 0.01027131, "balance_loss_clip": 0.97714579, "balance_loss_mlp": 1.01911998, "epoch": 0.3990861540311429, "flos": 28401031146240.0, "grad_norm": 1.7033633730232445, "language_loss": 0.70610404, "learning_rate": 2.7334091118265124e-06, "loss": 0.72822773, "num_input_tokens_seen": 71522560, "step": 3319, "time_per_iteration": 3.7364094257354736 }, { "auxiliary_loss_clip": 0.01093513, "auxiliary_loss_mlp": 0.01012486, "balance_loss_clip": 0.98742586, "balance_loss_mlp": 1.01025665, "epoch": 0.399206396921782, "flos": 61758563086080.0, "grad_norm": 0.6824461424391219, "language_loss": 0.57864928, "learning_rate": 2.732684349720989e-06, "loss": 0.59970927, "num_input_tokens_seen": 71590520, "step": 3320, "time_per_iteration": 4.219793081283569 }, { "auxiliary_loss_clip": 0.01188774, "auxiliary_loss_mlp": 0.01030692, "balance_loss_clip": 0.94055843, "balance_loss_mlp": 1.02240062, "epoch": 0.3993266398124211, "flos": 28074567409920.0, "grad_norm": 1.6233755923837085, "language_loss": 0.75344193, "learning_rate": 2.7319594764678318e-06, "loss": 0.77563655, "num_input_tokens_seen": 71612620, "step": 3321, "time_per_iteration": 2.727275848388672 }, { "auxiliary_loss_clip": 0.01185631, "auxiliary_loss_mlp": 0.01025651, "balance_loss_clip": 0.90211415, "balance_loss_mlp": 1.01759815, "epoch": 0.39944688270306017, "flos": 23225083188480.0, "grad_norm": 1.790856091131052, "language_loss": 0.83502567, "learning_rate": 2.7312344921770044e-06, "loss": 0.85713851, "num_input_tokens_seen": 71634320, "step": 3322, "time_per_iteration": 2.749117851257324 }, { "auxiliary_loss_clip": 0.01183703, "auxiliary_loss_mlp": 0.01028037, "balance_loss_clip": 0.97581577, "balance_loss_mlp": 1.01979983, "epoch": 0.3995671255936993, "flos": 19390433921280.0, "grad_norm": 1.9212248487076617, "language_loss": 0.77923274, "learning_rate": 2.7305093969584857e-06, "loss": 0.80135012, "num_input_tokens_seen": 71653145, "step": 3323, "time_per_iteration": 2.6982810497283936 }, { "auxiliary_loss_clip": 0.01175047, "auxiliary_loss_mlp": 0.01029387, "balance_loss_clip": 1.01209819, "balance_loss_mlp": 1.02093542, "epoch": 0.3996873684843384, "flos": 23842638743040.0, "grad_norm": 1.875651311026208, "language_loss": 0.79956138, "learning_rate": 2.729784190922272e-06, "loss": 0.82160574, "num_input_tokens_seen": 71674580, "step": 3324, "time_per_iteration": 3.685328960418701 }, { "auxiliary_loss_clip": 0.0109293, "auxiliary_loss_mlp": 0.0100375, "balance_loss_clip": 0.9498629, "balance_loss_mlp": 1.00149655, "epoch": 0.39980761137497745, "flos": 66576877280640.0, "grad_norm": 1.7239493684657128, "language_loss": 0.57217205, "learning_rate": 2.729058874178378e-06, "loss": 0.59313881, "num_input_tokens_seen": 71745260, "step": 3325, "time_per_iteration": 3.3356106281280518 }, { "auxiliary_loss_clip": 0.01190168, "auxiliary_loss_mlp": 0.01027742, "balance_loss_clip": 0.97973335, "balance_loss_mlp": 1.01999307, "epoch": 0.39992785426561656, "flos": 28549162834560.0, "grad_norm": 1.6757215844193663, "language_loss": 0.69249773, "learning_rate": 2.7283334468368315e-06, "loss": 0.71467686, "num_input_tokens_seen": 71766540, "step": 3326, "time_per_iteration": 2.8164238929748535 }, { "auxiliary_loss_clip": 0.01170689, "auxiliary_loss_mlp": 0.01024093, "balance_loss_clip": 0.77932239, "balance_loss_mlp": 1.01595652, "epoch": 0.4000480971562556, "flos": 15049408671360.0, "grad_norm": 1.691955847218284, "language_loss": 0.72648859, "learning_rate": 2.72760790900768e-06, "loss": 0.74843645, "num_input_tokens_seen": 71783125, "step": 3327, "time_per_iteration": 3.916665554046631 }, { "auxiliary_loss_clip": 0.01188824, "auxiliary_loss_mlp": 0.01030708, "balance_loss_clip": 1.05626559, "balance_loss_mlp": 1.02255976, "epoch": 0.4001683400468947, "flos": 23915609222400.0, "grad_norm": 1.7128099080459087, "language_loss": 0.78149974, "learning_rate": 2.7268822608009875e-06, "loss": 0.80369508, "num_input_tokens_seen": 71802500, "step": 3328, "time_per_iteration": 2.9085946083068848 }, { "auxiliary_loss_clip": 0.0118767, "auxiliary_loss_mlp": 0.0103009, "balance_loss_clip": 0.93942446, "balance_loss_mlp": 1.02165008, "epoch": 0.40028858293753383, "flos": 24352677912960.0, "grad_norm": 1.8905714745855873, "language_loss": 0.78110933, "learning_rate": 2.726156502326834e-06, "loss": 0.80328691, "num_input_tokens_seen": 71823800, "step": 3329, "time_per_iteration": 288.1878650188446 }, { "auxiliary_loss_clip": 0.01116743, "auxiliary_loss_mlp": 0.01011984, "balance_loss_clip": 0.8513813, "balance_loss_mlp": 1.00976682, "epoch": 0.4004088258281729, "flos": 66787025800320.0, "grad_norm": 0.707736207170792, "language_loss": 0.60265124, "learning_rate": 2.725430633695316e-06, "loss": 0.6239385, "num_input_tokens_seen": 71886880, "step": 3330, "time_per_iteration": 3.408086061477661 }, { "auxiliary_loss_clip": 0.01089104, "auxiliary_loss_mlp": 0.01000762, "balance_loss_clip": 1.02377868, "balance_loss_mlp": 0.99867535, "epoch": 0.400529068718812, "flos": 58598386473600.0, "grad_norm": 0.9698842049116833, "language_loss": 0.57988375, "learning_rate": 2.7247046550165485e-06, "loss": 0.60078239, "num_input_tokens_seen": 71939005, "step": 3331, "time_per_iteration": 3.0478451251983643 }, { "auxiliary_loss_clip": 0.01189584, "auxiliary_loss_mlp": 0.01031394, "balance_loss_clip": 1.05756772, "balance_loss_mlp": 1.02359223, "epoch": 0.4006493116094511, "flos": 25377460934400.0, "grad_norm": 1.4351676395374597, "language_loss": 0.7585628, "learning_rate": 2.7239785664006606e-06, "loss": 0.78077257, "num_input_tokens_seen": 71962545, "step": 3332, "time_per_iteration": 2.73270583152771 }, { "auxiliary_loss_clip": 0.01090403, "auxiliary_loss_mlp": 0.01002565, "balance_loss_clip": 0.98504996, "balance_loss_mlp": 1.00035942, "epoch": 0.40076955450009016, "flos": 60280729822080.0, "grad_norm": 0.7735801553117854, "language_loss": 0.61848772, "learning_rate": 2.7232523679578002e-06, "loss": 0.63941729, "num_input_tokens_seen": 72025625, "step": 3333, "time_per_iteration": 3.2787435054779053 }, { "auxiliary_loss_clip": 0.0118787, "auxiliary_loss_mlp": 0.01032519, "balance_loss_clip": 1.01862514, "balance_loss_mlp": 1.02441239, "epoch": 0.4008897973907293, "flos": 16617268396800.0, "grad_norm": 2.1748364974884904, "language_loss": 0.79453313, "learning_rate": 2.7225260597981295e-06, "loss": 0.816737, "num_input_tokens_seen": 72043330, "step": 3334, "time_per_iteration": 2.7070703506469727 }, { "auxiliary_loss_clip": 0.01184501, "auxiliary_loss_mlp": 0.01124821, "balance_loss_clip": 0.94198716, "balance_loss_mlp": 0.0, "epoch": 0.4010100402813684, "flos": 15377344865280.0, "grad_norm": 3.384088974223337, "language_loss": 0.78830492, "learning_rate": 2.721799642031831e-06, "loss": 0.81139809, "num_input_tokens_seen": 72059500, "step": 3335, "time_per_iteration": 2.7297542095184326 }, { "auxiliary_loss_clip": 0.01188353, "auxiliary_loss_mlp": 0.01028135, "balance_loss_clip": 0.97621316, "balance_loss_mlp": 1.02036285, "epoch": 0.40113028317200744, "flos": 13298835438720.0, "grad_norm": 2.6412346413650845, "language_loss": 0.77658123, "learning_rate": 2.721073114769101e-06, "loss": 0.79874611, "num_input_tokens_seen": 72077175, "step": 3336, "time_per_iteration": 2.703052043914795 }, { "auxiliary_loss_clip": 0.01177775, "auxiliary_loss_mlp": 0.01027173, "balance_loss_clip": 0.93955976, "balance_loss_mlp": 1.01957917, "epoch": 0.40125052606264655, "flos": 20668027841280.0, "grad_norm": 2.8030552718505675, "language_loss": 0.74720204, "learning_rate": 2.7203464781201523e-06, "loss": 0.76925159, "num_input_tokens_seen": 72096490, "step": 3337, "time_per_iteration": 2.8079516887664795 }, { "auxiliary_loss_clip": 0.01188953, "auxiliary_loss_mlp": 0.01029716, "balance_loss_clip": 1.05529606, "balance_loss_mlp": 1.02217567, "epoch": 0.40137076895328566, "flos": 24607679541120.0, "grad_norm": 1.8981492556271202, "language_loss": 0.7819891, "learning_rate": 2.719619732195215e-06, "loss": 0.80417573, "num_input_tokens_seen": 72118130, "step": 3338, "time_per_iteration": 2.6754074096679688 }, { "auxiliary_loss_clip": 0.01184367, "auxiliary_loss_mlp": 0.01025041, "balance_loss_clip": 0.9392581, "balance_loss_mlp": 1.01700604, "epoch": 0.4014910118439247, "flos": 24206593299840.0, "grad_norm": 5.271192221627894, "language_loss": 0.72721952, "learning_rate": 2.7188928771045377e-06, "loss": 0.74931359, "num_input_tokens_seen": 72139450, "step": 3339, "time_per_iteration": 2.8276889324188232 }, { "auxiliary_loss_clip": 0.01172517, "auxiliary_loss_mlp": 0.01028918, "balance_loss_clip": 0.93543398, "balance_loss_mlp": 1.02086544, "epoch": 0.4016112547345638, "flos": 26725080418560.0, "grad_norm": 3.195867101334779, "language_loss": 0.79591048, "learning_rate": 2.7181659129583815e-06, "loss": 0.81792486, "num_input_tokens_seen": 72159040, "step": 3340, "time_per_iteration": 2.8384876251220703 }, { "auxiliary_loss_clip": 0.01172118, "auxiliary_loss_mlp": 0.01026929, "balance_loss_clip": 0.9719609, "balance_loss_mlp": 1.01884711, "epoch": 0.4017314976252029, "flos": 21288025520640.0, "grad_norm": 1.7692074349514215, "language_loss": 0.76285619, "learning_rate": 2.7174388398670276e-06, "loss": 0.78484666, "num_input_tokens_seen": 72178220, "step": 3341, "time_per_iteration": 2.7291858196258545 }, { "auxiliary_loss_clip": 0.01185667, "auxiliary_loss_mlp": 0.0102939, "balance_loss_clip": 1.0517323, "balance_loss_mlp": 1.02126038, "epoch": 0.401851740515842, "flos": 25484690010240.0, "grad_norm": 1.814025228792301, "language_loss": 0.91923463, "learning_rate": 2.716711657940773e-06, "loss": 0.94138515, "num_input_tokens_seen": 72199230, "step": 3342, "time_per_iteration": 2.7078304290771484 }, { "auxiliary_loss_clip": 0.0109298, "auxiliary_loss_mlp": 0.01004942, "balance_loss_clip": 0.90920377, "balance_loss_mlp": 1.00282049, "epoch": 0.4019719834064811, "flos": 55395334978560.0, "grad_norm": 0.8149073589752691, "language_loss": 0.56532431, "learning_rate": 2.7159843672899284e-06, "loss": 0.58630359, "num_input_tokens_seen": 72263430, "step": 3343, "time_per_iteration": 3.418544054031372 }, { "auxiliary_loss_clip": 0.01185542, "auxiliary_loss_mlp": 0.01022228, "balance_loss_clip": 1.01695311, "balance_loss_mlp": 1.01406252, "epoch": 0.40209222629712016, "flos": 18180100218240.0, "grad_norm": 1.8164118380719938, "language_loss": 0.81156385, "learning_rate": 2.715256968024825e-06, "loss": 0.83364159, "num_input_tokens_seen": 72280505, "step": 3344, "time_per_iteration": 2.684689998626709 }, { "auxiliary_loss_clip": 0.01192627, "auxiliary_loss_mlp": 0.01025521, "balance_loss_clip": 0.97907972, "balance_loss_mlp": 1.0178504, "epoch": 0.40221246918775927, "flos": 25961009287680.0, "grad_norm": 1.5590113589555634, "language_loss": 0.82616508, "learning_rate": 2.7145294602558083e-06, "loss": 0.84834659, "num_input_tokens_seen": 72301215, "step": 3345, "time_per_iteration": 5.306323766708374 }, { "auxiliary_loss_clip": 0.0118668, "auxiliary_loss_mlp": 0.0102389, "balance_loss_clip": 1.01754165, "balance_loss_mlp": 1.0157783, "epoch": 0.4023327120783984, "flos": 33838912056960.0, "grad_norm": 1.8049124279631181, "language_loss": 0.70292819, "learning_rate": 2.713801844093241e-06, "loss": 0.72503388, "num_input_tokens_seen": 72322365, "step": 3346, "time_per_iteration": 2.7273337841033936 }, { "auxiliary_loss_clip": 0.01188259, "auxiliary_loss_mlp": 0.01028021, "balance_loss_clip": 1.01820302, "balance_loss_mlp": 1.02067173, "epoch": 0.40245295496903744, "flos": 26900252069760.0, "grad_norm": 1.8546280636918815, "language_loss": 0.88345331, "learning_rate": 2.7130741196475014e-06, "loss": 0.90561604, "num_input_tokens_seen": 72340495, "step": 3347, "time_per_iteration": 2.7084267139434814 }, { "auxiliary_loss_clip": 0.01193702, "auxiliary_loss_mlp": 0.0102655, "balance_loss_clip": 0.9816829, "balance_loss_mlp": 1.0176506, "epoch": 0.40257319785967655, "flos": 36902738436480.0, "grad_norm": 7.082390454054902, "language_loss": 0.79615462, "learning_rate": 2.7123462870289848e-06, "loss": 0.81835711, "num_input_tokens_seen": 72360545, "step": 3348, "time_per_iteration": 2.832284450531006 }, { "auxiliary_loss_clip": 0.01185758, "auxiliary_loss_mlp": 0.01027093, "balance_loss_clip": 0.97691453, "balance_loss_mlp": 1.01899874, "epoch": 0.40269344075031566, "flos": 24353180703360.0, "grad_norm": 1.8352706188691845, "language_loss": 0.81128752, "learning_rate": 2.711618346348102e-06, "loss": 0.8334161, "num_input_tokens_seen": 72381070, "step": 3349, "time_per_iteration": 2.781388998031616 }, { "auxiliary_loss_clip": 0.01175502, "auxiliary_loss_mlp": 0.01033388, "balance_loss_clip": 0.97587192, "balance_loss_mlp": 1.02582979, "epoch": 0.4028136836409547, "flos": 14389657614720.0, "grad_norm": 2.078964331490312, "language_loss": 0.63538998, "learning_rate": 2.7108902977152825e-06, "loss": 0.65747887, "num_input_tokens_seen": 72398970, "step": 3350, "time_per_iteration": 3.678699016571045 }, { "auxiliary_loss_clip": 0.01181939, "auxiliary_loss_mlp": 0.01026759, "balance_loss_clip": 1.01480842, "balance_loss_mlp": 1.01820576, "epoch": 0.4029339265315938, "flos": 26136037284480.0, "grad_norm": 2.1707272362095913, "language_loss": 0.74826401, "learning_rate": 2.7101621412409704e-06, "loss": 0.77035099, "num_input_tokens_seen": 72418455, "step": 3351, "time_per_iteration": 2.7267258167266846 }, { "auxiliary_loss_clip": 0.01187052, "auxiliary_loss_mlp": 0.01032068, "balance_loss_clip": 1.05358481, "balance_loss_mlp": 1.02397323, "epoch": 0.40305416942223293, "flos": 23256325042560.0, "grad_norm": 1.9617314712952065, "language_loss": 0.85512829, "learning_rate": 2.7094338770356256e-06, "loss": 0.87731951, "num_input_tokens_seen": 72437540, "step": 3352, "time_per_iteration": 2.631277322769165 }, { "auxiliary_loss_clip": 0.01180195, "auxiliary_loss_mlp": 0.01030461, "balance_loss_clip": 0.9767431, "balance_loss_mlp": 1.02246261, "epoch": 0.403174412312872, "flos": 27089645506560.0, "grad_norm": 1.8159395913568555, "language_loss": 0.63810647, "learning_rate": 2.708705505209726e-06, "loss": 0.66021305, "num_input_tokens_seen": 72458315, "step": 3353, "time_per_iteration": 3.6995208263397217 }, { "auxiliary_loss_clip": 0.01171648, "auxiliary_loss_mlp": 0.01026793, "balance_loss_clip": 0.89844656, "balance_loss_mlp": 1.01826954, "epoch": 0.4032946552035111, "flos": 21756336065280.0, "grad_norm": 2.3367382362333102, "language_loss": 0.91762966, "learning_rate": 2.7079770258737646e-06, "loss": 0.93961406, "num_input_tokens_seen": 72476225, "step": 3354, "time_per_iteration": 2.8055901527404785 }, { "auxiliary_loss_clip": 0.01173613, "auxiliary_loss_mlp": 0.0102603, "balance_loss_clip": 0.93429935, "balance_loss_mlp": 1.01749504, "epoch": 0.4034148980941502, "flos": 17343956448000.0, "grad_norm": 2.0151458955266146, "language_loss": 0.75206506, "learning_rate": 2.707248439138251e-06, "loss": 0.77406144, "num_input_tokens_seen": 72492460, "step": 3355, "time_per_iteration": 2.6941096782684326 }, { "auxiliary_loss_clip": 0.01177401, "auxiliary_loss_mlp": 0.01030792, "balance_loss_clip": 0.97846162, "balance_loss_mlp": 1.02304959, "epoch": 0.40353514098478926, "flos": 22017838055040.0, "grad_norm": 1.7569700270707496, "language_loss": 0.65445364, "learning_rate": 2.7065197451137114e-06, "loss": 0.67653549, "num_input_tokens_seen": 72513840, "step": 3356, "time_per_iteration": 2.7095909118652344 }, { "auxiliary_loss_clip": 0.01180505, "auxiliary_loss_mlp": 0.0103096, "balance_loss_clip": 0.97676343, "balance_loss_mlp": 1.02293134, "epoch": 0.4036553838754284, "flos": 14246446089600.0, "grad_norm": 2.0251950471928906, "language_loss": 0.67451507, "learning_rate": 2.7057909439106894e-06, "loss": 0.6966297, "num_input_tokens_seen": 72531695, "step": 3357, "time_per_iteration": 2.679466724395752 }, { "auxiliary_loss_clip": 0.01175652, "auxiliary_loss_mlp": 0.01123823, "balance_loss_clip": 1.01390266, "balance_loss_mlp": 0.0, "epoch": 0.40377562676606743, "flos": 24790644443520.0, "grad_norm": 1.7641336699481711, "language_loss": 0.78587723, "learning_rate": 2.7050620356397417e-06, "loss": 0.80887198, "num_input_tokens_seen": 72550645, "step": 3358, "time_per_iteration": 2.6922221183776855 }, { "auxiliary_loss_clip": 0.011844, "auxiliary_loss_mlp": 0.01026104, "balance_loss_clip": 1.05556178, "balance_loss_mlp": 1.0183847, "epoch": 0.40389586965670654, "flos": 24061226958720.0, "grad_norm": 1.9578632888416811, "language_loss": 0.72214091, "learning_rate": 2.7043330204114437e-06, "loss": 0.74424589, "num_input_tokens_seen": 72569355, "step": 3359, "time_per_iteration": 2.6731162071228027 }, { "auxiliary_loss_clip": 0.01181534, "auxiliary_loss_mlp": 0.01025003, "balance_loss_clip": 1.05075383, "balance_loss_mlp": 1.01746297, "epoch": 0.40401611254734565, "flos": 16399613934720.0, "grad_norm": 1.7632947353711885, "language_loss": 0.85577339, "learning_rate": 2.7036038983363862e-06, "loss": 0.87783873, "num_input_tokens_seen": 72585960, "step": 3360, "time_per_iteration": 2.6353561878204346 }, { "auxiliary_loss_clip": 0.01178618, "auxiliary_loss_mlp": 0.01025205, "balance_loss_clip": 1.01402092, "balance_loss_mlp": 1.01801026, "epoch": 0.4041363554379847, "flos": 23988220565760.0, "grad_norm": 1.759910333752104, "language_loss": 0.84403813, "learning_rate": 2.702874669525177e-06, "loss": 0.86607635, "num_input_tokens_seen": 72604440, "step": 3361, "time_per_iteration": 2.7029380798339844 }, { "auxiliary_loss_clip": 0.01186915, "auxiliary_loss_mlp": 0.01030234, "balance_loss_clip": 0.9431507, "balance_loss_mlp": 1.02247298, "epoch": 0.4042565983286238, "flos": 28401964899840.0, "grad_norm": 2.320819590265577, "language_loss": 0.69929719, "learning_rate": 2.7021453340884394e-06, "loss": 0.72146869, "num_input_tokens_seen": 72622165, "step": 3362, "time_per_iteration": 2.849440097808838 }, { "auxiliary_loss_clip": 0.01170344, "auxiliary_loss_mlp": 0.01123772, "balance_loss_clip": 0.97553611, "balance_loss_mlp": 0.0, "epoch": 0.40437684121926293, "flos": 17710963660800.0, "grad_norm": 2.353318683896361, "language_loss": 0.73404789, "learning_rate": 2.7014158921368125e-06, "loss": 0.756989, "num_input_tokens_seen": 72640490, "step": 3363, "time_per_iteration": 2.6570537090301514 }, { "auxiliary_loss_clip": 0.01186003, "auxiliary_loss_mlp": 0.01029483, "balance_loss_clip": 1.05411196, "balance_loss_mlp": 1.02135909, "epoch": 0.404497084109902, "flos": 24018959629440.0, "grad_norm": 2.402018124852151, "language_loss": 0.85668725, "learning_rate": 2.700686343780953e-06, "loss": 0.87884206, "num_input_tokens_seen": 72660360, "step": 3364, "time_per_iteration": 2.651445150375366 }, { "auxiliary_loss_clip": 0.01181749, "auxiliary_loss_mlp": 0.0102911, "balance_loss_clip": 0.97582638, "balance_loss_mlp": 1.02139449, "epoch": 0.4046173270005411, "flos": 22929861306240.0, "grad_norm": 1.5855140250095867, "language_loss": 0.88169265, "learning_rate": 2.699956689131532e-06, "loss": 0.90380126, "num_input_tokens_seen": 72680345, "step": 3365, "time_per_iteration": 2.7516653537750244 }, { "auxiliary_loss_clip": 0.01184158, "auxiliary_loss_mlp": 0.01031229, "balance_loss_clip": 0.97488654, "balance_loss_mlp": 1.0235368, "epoch": 0.4047375698911802, "flos": 20668135582080.0, "grad_norm": 2.279238124474858, "language_loss": 0.85042691, "learning_rate": 2.699226928299238e-06, "loss": 0.87258077, "num_input_tokens_seen": 72698365, "step": 3366, "time_per_iteration": 2.721346139907837 }, { "auxiliary_loss_clip": 0.01185752, "auxiliary_loss_mlp": 0.01030643, "balance_loss_clip": 1.01633072, "balance_loss_mlp": 1.0224117, "epoch": 0.40485781278181926, "flos": 28912865996160.0, "grad_norm": 2.182295355999049, "language_loss": 0.78894842, "learning_rate": 2.698497061394774e-06, "loss": 0.8111124, "num_input_tokens_seen": 72716850, "step": 3367, "time_per_iteration": 2.7207090854644775 }, { "auxiliary_loss_clip": 0.01190485, "auxiliary_loss_mlp": 0.011242, "balance_loss_clip": 0.94105041, "balance_loss_mlp": 0.0, "epoch": 0.40497805567245837, "flos": 23148377694720.0, "grad_norm": 1.4616557850912124, "language_loss": 0.80659431, "learning_rate": 2.6977670885288627e-06, "loss": 0.82974112, "num_input_tokens_seen": 72738250, "step": 3368, "time_per_iteration": 2.753560781478882 }, { "auxiliary_loss_clip": 0.01170791, "auxiliary_loss_mlp": 0.01028157, "balance_loss_clip": 0.97455049, "balance_loss_mlp": 1.0203371, "epoch": 0.4050982985630975, "flos": 16289404030080.0, "grad_norm": 1.8371630484504444, "language_loss": 0.75198728, "learning_rate": 2.6970370098122378e-06, "loss": 0.77397674, "num_input_tokens_seen": 72755235, "step": 3369, "time_per_iteration": 2.664545774459839 }, { "auxiliary_loss_clip": 0.01187251, "auxiliary_loss_mlp": 0.01025577, "balance_loss_clip": 1.05399942, "balance_loss_mlp": 1.01720214, "epoch": 0.40521854145373654, "flos": 34459484353920.0, "grad_norm": 1.6730784534556373, "language_loss": 0.86530203, "learning_rate": 2.6963068253556535e-06, "loss": 0.88743025, "num_input_tokens_seen": 72776620, "step": 3370, "time_per_iteration": 2.777749538421631 }, { "auxiliary_loss_clip": 0.01190706, "auxiliary_loss_mlp": 0.01030822, "balance_loss_clip": 1.01365912, "balance_loss_mlp": 1.02220368, "epoch": 0.40533878434437565, "flos": 25331099454720.0, "grad_norm": 1.803984887207016, "language_loss": 0.85759825, "learning_rate": 2.6955765352698763e-06, "loss": 0.87981355, "num_input_tokens_seen": 72796765, "step": 3371, "time_per_iteration": 3.645355224609375 }, { "auxiliary_loss_clip": 0.01186607, "auxiliary_loss_mlp": 0.01026502, "balance_loss_clip": 1.05277419, "balance_loss_mlp": 1.01816308, "epoch": 0.40545902723501476, "flos": 15012061505280.0, "grad_norm": 1.8530926250628008, "language_loss": 0.73006988, "learning_rate": 2.6948461396656923e-06, "loss": 0.75220096, "num_input_tokens_seen": 72814175, "step": 3372, "time_per_iteration": 3.5870485305786133 }, { "auxiliary_loss_clip": 0.01190681, "auxiliary_loss_mlp": 0.0102593, "balance_loss_clip": 1.01616466, "balance_loss_mlp": 1.01780641, "epoch": 0.4055792701256538, "flos": 25521103422720.0, "grad_norm": 2.2845734085002736, "language_loss": 0.74908376, "learning_rate": 2.6941156386539013e-06, "loss": 0.77124989, "num_input_tokens_seen": 72834125, "step": 3373, "time_per_iteration": 2.6914467811584473 }, { "auxiliary_loss_clip": 0.01181643, "auxiliary_loss_mlp": 0.01036914, "balance_loss_clip": 0.98129439, "balance_loss_mlp": 1.02845621, "epoch": 0.4056995130162929, "flos": 19574583972480.0, "grad_norm": 2.1617022094538534, "language_loss": 0.80955249, "learning_rate": 2.6933850323453203e-06, "loss": 0.83173811, "num_input_tokens_seen": 72852570, "step": 3374, "time_per_iteration": 2.6334521770477295 }, { "auxiliary_loss_clip": 0.01184797, "auxiliary_loss_mlp": 0.01030613, "balance_loss_clip": 1.05365479, "balance_loss_mlp": 1.0231564, "epoch": 0.405819755906932, "flos": 15413794191360.0, "grad_norm": 1.8558091557026393, "language_loss": 0.74734795, "learning_rate": 2.6926543208507806e-06, "loss": 0.76950204, "num_input_tokens_seen": 72871250, "step": 3375, "time_per_iteration": 2.616793155670166 }, { "auxiliary_loss_clip": 0.01183951, "auxiliary_loss_mlp": 0.01027839, "balance_loss_clip": 1.01576519, "balance_loss_mlp": 1.01943529, "epoch": 0.4059399987975711, "flos": 21433930565760.0, "grad_norm": 2.745190519190429, "language_loss": 0.80511445, "learning_rate": 2.6919235042811316e-06, "loss": 0.82723236, "num_input_tokens_seen": 72890035, "step": 3376, "time_per_iteration": 2.653526782989502 }, { "auxiliary_loss_clip": 0.011804, "auxiliary_loss_mlp": 0.01029496, "balance_loss_clip": 0.93891954, "balance_loss_mlp": 1.02109766, "epoch": 0.4060602416882102, "flos": 25556942217600.0, "grad_norm": 1.971860594673484, "language_loss": 0.76396805, "learning_rate": 2.691192582747237e-06, "loss": 0.78606701, "num_input_tokens_seen": 72909665, "step": 3377, "time_per_iteration": 3.6608500480651855 }, { "auxiliary_loss_clip": 0.01187114, "auxiliary_loss_mlp": 0.01023054, "balance_loss_clip": 1.05491924, "balance_loss_mlp": 1.01559734, "epoch": 0.40618048457884925, "flos": 23766759262080.0, "grad_norm": 1.6914666523260038, "language_loss": 0.73901212, "learning_rate": 2.6904615563599765e-06, "loss": 0.76111388, "num_input_tokens_seen": 72929465, "step": 3378, "time_per_iteration": 2.7085824012756348 }, { "auxiliary_loss_clip": 0.01177114, "auxiliary_loss_mlp": 0.01028821, "balance_loss_clip": 0.93725359, "balance_loss_mlp": 1.02069712, "epoch": 0.40630072746948837, "flos": 17639681120640.0, "grad_norm": 1.7041071589832308, "language_loss": 0.82836014, "learning_rate": 2.6897304252302477e-06, "loss": 0.85041952, "num_input_tokens_seen": 72946785, "step": 3379, "time_per_iteration": 3.6300671100616455 }, { "auxiliary_loss_clip": 0.01092335, "auxiliary_loss_mlp": 0.01001209, "balance_loss_clip": 0.91213375, "balance_loss_mlp": 0.99906313, "epoch": 0.4064209703601275, "flos": 60836053063680.0, "grad_norm": 0.7890877020649616, "language_loss": 0.54852223, "learning_rate": 2.688999189468962e-06, "loss": 0.56945765, "num_input_tokens_seen": 73003215, "step": 3380, "time_per_iteration": 3.178077459335327 }, { "auxiliary_loss_clip": 0.01185183, "auxiliary_loss_mlp": 0.01034364, "balance_loss_clip": 1.01758635, "balance_loss_mlp": 1.02632344, "epoch": 0.40654121325076653, "flos": 24024346669440.0, "grad_norm": 2.2057168026948215, "language_loss": 0.75915337, "learning_rate": 2.6882678491870464e-06, "loss": 0.78134882, "num_input_tokens_seen": 73023650, "step": 3381, "time_per_iteration": 2.7398030757904053 }, { "auxiliary_loss_clip": 0.01191798, "auxiliary_loss_mlp": 0.01030653, "balance_loss_clip": 1.01866698, "balance_loss_mlp": 1.0222311, "epoch": 0.40666145614140564, "flos": 27344252085120.0, "grad_norm": 1.5685804197388038, "language_loss": 0.71786183, "learning_rate": 2.6875364044954453e-06, "loss": 0.74008638, "num_input_tokens_seen": 73043880, "step": 3382, "time_per_iteration": 2.708951234817505 }, { "auxiliary_loss_clip": 0.01177746, "auxiliary_loss_mlp": 0.01027862, "balance_loss_clip": 0.97279602, "balance_loss_mlp": 1.02050126, "epoch": 0.40678169903204475, "flos": 26176724415360.0, "grad_norm": 1.5544497012353553, "language_loss": 0.82604265, "learning_rate": 2.6868048555051185e-06, "loss": 0.8480987, "num_input_tokens_seen": 73065410, "step": 3383, "time_per_iteration": 2.7863216400146484 }, { "auxiliary_loss_clip": 0.01185008, "auxiliary_loss_mlp": 0.01037365, "balance_loss_clip": 0.97399235, "balance_loss_mlp": 1.02934813, "epoch": 0.4069019419226838, "flos": 28622420622720.0, "grad_norm": 2.800202896770222, "language_loss": 0.85938174, "learning_rate": 2.686073202327041e-06, "loss": 0.88160551, "num_input_tokens_seen": 73084410, "step": 3384, "time_per_iteration": 2.7578675746917725 }, { "auxiliary_loss_clip": 0.01169085, "auxiliary_loss_mlp": 0.01027332, "balance_loss_clip": 0.97267103, "balance_loss_mlp": 1.01988149, "epoch": 0.4070221848133229, "flos": 25229006023680.0, "grad_norm": 1.56503367038316, "language_loss": 0.73226249, "learning_rate": 2.6853414450722043e-06, "loss": 0.75422668, "num_input_tokens_seen": 73104075, "step": 3385, "time_per_iteration": 2.7720346450805664 }, { "auxiliary_loss_clip": 0.0118151, "auxiliary_loss_mlp": 0.01021405, "balance_loss_clip": 1.01505411, "balance_loss_mlp": 1.01398969, "epoch": 0.40714242770396203, "flos": 18405224709120.0, "grad_norm": 2.051782743736697, "language_loss": 0.85268736, "learning_rate": 2.684609583851616e-06, "loss": 0.87471652, "num_input_tokens_seen": 73122250, "step": 3386, "time_per_iteration": 2.608196973800659 }, { "auxiliary_loss_clip": 0.01180185, "auxiliary_loss_mlp": 0.01028015, "balance_loss_clip": 0.90014791, "balance_loss_mlp": 1.01984286, "epoch": 0.4072626705946011, "flos": 30228920403840.0, "grad_norm": 1.4723096379109974, "language_loss": 0.80697256, "learning_rate": 2.683877618776297e-06, "loss": 0.82905459, "num_input_tokens_seen": 73144505, "step": 3387, "time_per_iteration": 2.8514811992645264 }, { "auxiliary_loss_clip": 0.01174148, "auxiliary_loss_mlp": 0.01027682, "balance_loss_clip": 0.97217602, "balance_loss_mlp": 1.01943266, "epoch": 0.4073829134852402, "flos": 21834549930240.0, "grad_norm": 2.3982494499474267, "language_loss": 0.73969162, "learning_rate": 2.6831455499572876e-06, "loss": 0.76170993, "num_input_tokens_seen": 73162440, "step": 3388, "time_per_iteration": 2.6862990856170654 }, { "auxiliary_loss_clip": 0.01184644, "auxiliary_loss_mlp": 0.01030774, "balance_loss_clip": 1.0527333, "balance_loss_mlp": 1.0229423, "epoch": 0.40750315637587925, "flos": 25260211964160.0, "grad_norm": 1.8439749343611571, "language_loss": 0.77686077, "learning_rate": 2.682413377505641e-06, "loss": 0.79901499, "num_input_tokens_seen": 73181245, "step": 3389, "time_per_iteration": 2.688330888748169 }, { "auxiliary_loss_clip": 0.0118295, "auxiliary_loss_mlp": 0.01025572, "balance_loss_clip": 1.01296246, "balance_loss_mlp": 1.01757908, "epoch": 0.40762339926651836, "flos": 19712767593600.0, "grad_norm": 1.8245372352308444, "language_loss": 0.76619089, "learning_rate": 2.6816811015324284e-06, "loss": 0.78827608, "num_input_tokens_seen": 73199295, "step": 3390, "time_per_iteration": 2.6781582832336426 }, { "auxiliary_loss_clip": 0.01088695, "auxiliary_loss_mlp": 0.00999873, "balance_loss_clip": 1.02451849, "balance_loss_mlp": 0.99785846, "epoch": 0.40774364215715747, "flos": 71449307314560.0, "grad_norm": 0.7273286775114532, "language_loss": 0.56736892, "learning_rate": 2.6809487221487343e-06, "loss": 0.58825463, "num_input_tokens_seen": 73258780, "step": 3391, "time_per_iteration": 3.117621421813965 }, { "auxiliary_loss_clip": 0.01172863, "auxiliary_loss_mlp": 0.01029507, "balance_loss_clip": 1.01102901, "balance_loss_mlp": 1.02156162, "epoch": 0.4078638850477965, "flos": 15084134144640.0, "grad_norm": 2.7972439887811875, "language_loss": 0.81693125, "learning_rate": 2.6802162394656605e-06, "loss": 0.83895493, "num_input_tokens_seen": 73275490, "step": 3392, "time_per_iteration": 2.702841281890869 }, { "auxiliary_loss_clip": 0.011768, "auxiliary_loss_mlp": 0.01025686, "balance_loss_clip": 0.97470939, "balance_loss_mlp": 1.01826191, "epoch": 0.40798412793843564, "flos": 23842890138240.0, "grad_norm": 1.8031331682476954, "language_loss": 0.71912593, "learning_rate": 2.679483653594324e-06, "loss": 0.74115074, "num_input_tokens_seen": 73297260, "step": 3393, "time_per_iteration": 2.7359704971313477 }, { "auxiliary_loss_clip": 0.01184247, "auxiliary_loss_mlp": 0.01028825, "balance_loss_clip": 1.01425624, "balance_loss_mlp": 1.02132726, "epoch": 0.40810437082907475, "flos": 21065774117760.0, "grad_norm": 2.15384942071174, "language_loss": 0.75905383, "learning_rate": 2.678750964645857e-06, "loss": 0.78118455, "num_input_tokens_seen": 73316340, "step": 3394, "time_per_iteration": 2.7211570739746094 }, { "auxiliary_loss_clip": 0.01189751, "auxiliary_loss_mlp": 0.0102803, "balance_loss_clip": 1.02040577, "balance_loss_mlp": 1.01934004, "epoch": 0.4082246137197138, "flos": 11321377948800.0, "grad_norm": 2.434001929791834, "language_loss": 0.83053601, "learning_rate": 2.6780181727314094e-06, "loss": 0.85271382, "num_input_tokens_seen": 73331245, "step": 3395, "time_per_iteration": 2.6463730335235596 }, { "auxiliary_loss_clip": 0.01183116, "auxiliary_loss_mlp": 0.0112401, "balance_loss_clip": 0.93914258, "balance_loss_mlp": 0.0, "epoch": 0.4083448566103529, "flos": 19062569554560.0, "grad_norm": 1.6868832115648362, "language_loss": 0.77958047, "learning_rate": 2.6772852779621435e-06, "loss": 0.8026517, "num_input_tokens_seen": 73349105, "step": 3396, "time_per_iteration": 2.74125599861145 }, { "auxiliary_loss_clip": 0.01180571, "auxiliary_loss_mlp": 0.01123811, "balance_loss_clip": 1.01765966, "balance_loss_mlp": 0.0, "epoch": 0.408465099500992, "flos": 23550254035200.0, "grad_norm": 1.8848083245666567, "language_loss": 0.86366594, "learning_rate": 2.676552280449239e-06, "loss": 0.88670981, "num_input_tokens_seen": 73368990, "step": 3397, "time_per_iteration": 4.6604063510894775 }, { "auxiliary_loss_clip": 0.01173759, "auxiliary_loss_mlp": 0.01030134, "balance_loss_clip": 1.0128032, "balance_loss_mlp": 1.02173591, "epoch": 0.4085853423916311, "flos": 12750012558720.0, "grad_norm": 2.400074865155392, "language_loss": 0.75607824, "learning_rate": 2.6758191803038917e-06, "loss": 0.77811718, "num_input_tokens_seen": 73387485, "step": 3398, "time_per_iteration": 2.6175432205200195 }, { "auxiliary_loss_clip": 0.01171132, "auxiliary_loss_mlp": 0.01024752, "balance_loss_clip": 0.86282533, "balance_loss_mlp": 1.01659822, "epoch": 0.4087055852822702, "flos": 24353072962560.0, "grad_norm": 1.5387914619840795, "language_loss": 0.82608461, "learning_rate": 2.6750859776373125e-06, "loss": 0.84804344, "num_input_tokens_seen": 73406940, "step": 3399, "time_per_iteration": 2.908864736557007 }, { "auxiliary_loss_clip": 0.01099244, "auxiliary_loss_mlp": 0.00999253, "balance_loss_clip": 0.83513021, "balance_loss_mlp": 0.99703592, "epoch": 0.4088258281729093, "flos": 66387950720640.0, "grad_norm": 0.7771672907880406, "language_loss": 0.605299, "learning_rate": 2.674352672560727e-06, "loss": 0.626284, "num_input_tokens_seen": 73468385, "step": 3400, "time_per_iteration": 3.564884662628174 }, { "auxiliary_loss_clip": 0.01179304, "auxiliary_loss_mlp": 0.01031684, "balance_loss_clip": 0.93706888, "balance_loss_mlp": 1.02379835, "epoch": 0.40894607106354836, "flos": 20449260057600.0, "grad_norm": 1.56793700425882, "language_loss": 0.76921678, "learning_rate": 2.673619265185377e-06, "loss": 0.79132658, "num_input_tokens_seen": 73488225, "step": 3401, "time_per_iteration": 3.059183120727539 }, { "auxiliary_loss_clip": 0.01184562, "auxiliary_loss_mlp": 0.01028992, "balance_loss_clip": 1.01371574, "balance_loss_mlp": 1.02045107, "epoch": 0.40906631395418747, "flos": 27053627143680.0, "grad_norm": 1.5717750098360384, "language_loss": 0.77887732, "learning_rate": 2.672885755622521e-06, "loss": 0.80101287, "num_input_tokens_seen": 73510640, "step": 3402, "time_per_iteration": 2.7273755073547363 }, { "auxiliary_loss_clip": 0.01173765, "auxiliary_loss_mlp": 0.0102418, "balance_loss_clip": 0.89627004, "balance_loss_mlp": 1.01605558, "epoch": 0.4091865568448266, "flos": 25484151306240.0, "grad_norm": 4.948926218135286, "language_loss": 0.70060873, "learning_rate": 2.67215214398343e-06, "loss": 0.72258824, "num_input_tokens_seen": 73530655, "step": 3403, "time_per_iteration": 3.6305599212646484 }, { "auxiliary_loss_clip": 0.01182379, "auxiliary_loss_mlp": 0.01031398, "balance_loss_clip": 0.89765102, "balance_loss_mlp": 1.02279675, "epoch": 0.40930679973546563, "flos": 28657864368000.0, "grad_norm": 2.039472003565162, "language_loss": 0.78432244, "learning_rate": 2.671418430379393e-06, "loss": 0.80646014, "num_input_tokens_seen": 73549340, "step": 3404, "time_per_iteration": 2.768949270248413 }, { "auxiliary_loss_clip": 0.01183168, "auxiliary_loss_mlp": 0.01026137, "balance_loss_clip": 1.05126095, "balance_loss_mlp": 1.01815557, "epoch": 0.40942704262610474, "flos": 20886292834560.0, "grad_norm": 3.1441503687670087, "language_loss": 0.83138752, "learning_rate": 2.670684614921715e-06, "loss": 0.85348058, "num_input_tokens_seen": 73568315, "step": 3405, "time_per_iteration": 3.9625837802886963 }, { "auxiliary_loss_clip": 0.01183378, "auxiliary_loss_mlp": 0.01027889, "balance_loss_clip": 0.97455478, "balance_loss_mlp": 1.01998544, "epoch": 0.4095472855167438, "flos": 21618080616960.0, "grad_norm": 3.2665337005419897, "language_loss": 0.69420779, "learning_rate": 2.6699506977217128e-06, "loss": 0.7163204, "num_input_tokens_seen": 73588490, "step": 3406, "time_per_iteration": 2.726911783218384 }, { "auxiliary_loss_clip": 0.0118221, "auxiliary_loss_mlp": 0.01032755, "balance_loss_clip": 1.01515818, "balance_loss_mlp": 1.02495253, "epoch": 0.4096675284073829, "flos": 27926112499200.0, "grad_norm": 2.0689021114228163, "language_loss": 0.70168626, "learning_rate": 2.6692166788907233e-06, "loss": 0.72383589, "num_input_tokens_seen": 73608685, "step": 3407, "time_per_iteration": 2.703693389892578 }, { "auxiliary_loss_clip": 0.01183757, "auxiliary_loss_mlp": 0.01029915, "balance_loss_clip": 0.97541451, "balance_loss_mlp": 1.02171636, "epoch": 0.409787771298022, "flos": 19206607092480.0, "grad_norm": 1.9249181719942439, "language_loss": 0.77134931, "learning_rate": 2.6684825585400957e-06, "loss": 0.79348606, "num_input_tokens_seen": 73627630, "step": 3408, "time_per_iteration": 2.657454490661621 }, { "auxiliary_loss_clip": 0.01081162, "auxiliary_loss_mlp": 0.01003611, "balance_loss_clip": 0.94200146, "balance_loss_mlp": 1.00154901, "epoch": 0.4099080141886611, "flos": 59269234832640.0, "grad_norm": 0.8425180014080119, "language_loss": 0.6517182, "learning_rate": 2.6677483367811947e-06, "loss": 0.67256594, "num_input_tokens_seen": 73687670, "step": 3409, "time_per_iteration": 3.3524019718170166 }, { "auxiliary_loss_clip": 0.01185285, "auxiliary_loss_mlp": 0.01026547, "balance_loss_clip": 1.01450849, "balance_loss_mlp": 1.01844096, "epoch": 0.4100282570793002, "flos": 21906443001600.0, "grad_norm": 1.711521549076087, "language_loss": 0.7565425, "learning_rate": 2.6670140137254028e-06, "loss": 0.77866083, "num_input_tokens_seen": 73707145, "step": 3410, "time_per_iteration": 2.694714307785034 }, { "auxiliary_loss_clip": 0.01174638, "auxiliary_loss_mlp": 0.01034099, "balance_loss_clip": 0.89780807, "balance_loss_mlp": 1.02621961, "epoch": 0.4101484999699393, "flos": 18551596631040.0, "grad_norm": 2.1160929840079774, "language_loss": 0.89618117, "learning_rate": 2.666279589484115e-06, "loss": 0.9182685, "num_input_tokens_seen": 73725045, "step": 3411, "time_per_iteration": 2.744387149810791 }, { "auxiliary_loss_clip": 0.01176705, "auxiliary_loss_mlp": 0.0102939, "balance_loss_clip": 0.89765012, "balance_loss_mlp": 1.0215162, "epoch": 0.41026874286057835, "flos": 19094529680640.0, "grad_norm": 1.7672457987221066, "language_loss": 0.81047428, "learning_rate": 2.6655450641687435e-06, "loss": 0.83253521, "num_input_tokens_seen": 73742610, "step": 3412, "time_per_iteration": 2.8496129512786865 }, { "auxiliary_loss_clip": 0.01182672, "auxiliary_loss_mlp": 0.01033795, "balance_loss_clip": 1.053514, "balance_loss_mlp": 1.02615976, "epoch": 0.41038898575121746, "flos": 31209568588800.0, "grad_norm": 2.2500728537054706, "language_loss": 0.69225538, "learning_rate": 2.664810437890715e-06, "loss": 0.71441996, "num_input_tokens_seen": 73764280, "step": 3413, "time_per_iteration": 2.7674684524536133 }, { "auxiliary_loss_clip": 0.01170823, "auxiliary_loss_mlp": 0.01028571, "balance_loss_clip": 0.866009, "balance_loss_mlp": 1.02098966, "epoch": 0.41050922864185657, "flos": 14355865895040.0, "grad_norm": 1.8252423099620834, "language_loss": 0.79522693, "learning_rate": 2.6640757107614714e-06, "loss": 0.81722093, "num_input_tokens_seen": 73782375, "step": 3414, "time_per_iteration": 2.7828848361968994 }, { "auxiliary_loss_clip": 0.01176351, "auxiliary_loss_mlp": 0.01030469, "balance_loss_clip": 0.93847775, "balance_loss_mlp": 1.02166569, "epoch": 0.4106294715324956, "flos": 30956290813440.0, "grad_norm": 2.659966179302985, "language_loss": 0.68947512, "learning_rate": 2.6633408828924697e-06, "loss": 0.71154332, "num_input_tokens_seen": 73801240, "step": 3415, "time_per_iteration": 2.7649991512298584 }, { "auxiliary_loss_clip": 0.01186566, "auxiliary_loss_mlp": 0.01026807, "balance_loss_clip": 0.93746787, "balance_loss_mlp": 1.01852155, "epoch": 0.41074971442313474, "flos": 24457321209600.0, "grad_norm": 1.5134863658217417, "language_loss": 0.69978029, "learning_rate": 2.662605954395185e-06, "loss": 0.72191405, "num_input_tokens_seen": 73821200, "step": 3416, "time_per_iteration": 2.7923877239227295 }, { "auxiliary_loss_clip": 0.01184344, "auxiliary_loss_mlp": 0.01023311, "balance_loss_clip": 1.0130322, "balance_loss_mlp": 1.01537776, "epoch": 0.41086995731377385, "flos": 21542991235200.0, "grad_norm": 1.7127429586293936, "language_loss": 0.83368063, "learning_rate": 2.6618709253811027e-06, "loss": 0.85575724, "num_input_tokens_seen": 73840655, "step": 3417, "time_per_iteration": 2.735280752182007 }, { "auxiliary_loss_clip": 0.01179609, "auxiliary_loss_mlp": 0.01023862, "balance_loss_clip": 1.05272317, "balance_loss_mlp": 1.01698661, "epoch": 0.4109902002044129, "flos": 20702753314560.0, "grad_norm": 2.126247044513198, "language_loss": 0.87654018, "learning_rate": 2.6611357959617277e-06, "loss": 0.89857483, "num_input_tokens_seen": 73860275, "step": 3418, "time_per_iteration": 2.6166374683380127 }, { "auxiliary_loss_clip": 0.01171106, "auxiliary_loss_mlp": 0.01024664, "balance_loss_clip": 0.93633473, "balance_loss_mlp": 1.01653385, "epoch": 0.411110443095052, "flos": 18179992477440.0, "grad_norm": 1.8668011665501323, "language_loss": 0.91175067, "learning_rate": 2.660400566248578e-06, "loss": 0.93370837, "num_input_tokens_seen": 73878400, "step": 3419, "time_per_iteration": 2.743934392929077 }, { "auxiliary_loss_clip": 0.01178505, "auxiliary_loss_mlp": 0.01029885, "balance_loss_clip": 0.93590438, "balance_loss_mlp": 1.02168965, "epoch": 0.41123068598569107, "flos": 14575244209920.0, "grad_norm": 2.2502130815436017, "language_loss": 0.66578841, "learning_rate": 2.6596652363531876e-06, "loss": 0.68787241, "num_input_tokens_seen": 73894275, "step": 3420, "time_per_iteration": 2.6952548027038574 }, { "auxiliary_loss_clip": 0.01184142, "auxiliary_loss_mlp": 0.01031522, "balance_loss_clip": 1.05470657, "balance_loss_mlp": 1.02383268, "epoch": 0.4113509288763302, "flos": 21177995184000.0, "grad_norm": 1.5056104674050237, "language_loss": 0.78127778, "learning_rate": 2.6589298063871055e-06, "loss": 0.80343437, "num_input_tokens_seen": 73914450, "step": 3421, "time_per_iteration": 2.6423511505126953 }, { "auxiliary_loss_clip": 0.01183933, "auxiliary_loss_mlp": 0.0103126, "balance_loss_clip": 1.05443144, "balance_loss_mlp": 1.02298427, "epoch": 0.4114711717669693, "flos": 18442212739200.0, "grad_norm": 1.7639245942000745, "language_loss": 0.69821024, "learning_rate": 2.658194276461895e-06, "loss": 0.72036219, "num_input_tokens_seen": 73932375, "step": 3422, "time_per_iteration": 2.6186821460723877 }, { "auxiliary_loss_clip": 0.0117851, "auxiliary_loss_mlp": 0.01028921, "balance_loss_clip": 0.97250867, "balance_loss_mlp": 1.02031434, "epoch": 0.41159141465760835, "flos": 27233395735680.0, "grad_norm": 2.0174833704749853, "language_loss": 0.66787755, "learning_rate": 2.6574586466891368e-06, "loss": 0.6899519, "num_input_tokens_seen": 73952850, "step": 3423, "time_per_iteration": 3.7832915782928467 }, { "auxiliary_loss_clip": 0.01182254, "auxiliary_loss_mlp": 0.01123507, "balance_loss_clip": 0.97662181, "balance_loss_mlp": 0.0, "epoch": 0.41171165754824746, "flos": 20006876154240.0, "grad_norm": 2.5464336783145636, "language_loss": 0.64350247, "learning_rate": 2.6567229171804247e-06, "loss": 0.66656011, "num_input_tokens_seen": 73970735, "step": 3424, "time_per_iteration": 2.747412919998169 }, { "auxiliary_loss_clip": 0.01184624, "auxiliary_loss_mlp": 0.0103302, "balance_loss_clip": 0.93436849, "balance_loss_mlp": 1.02467561, "epoch": 0.41183190043888657, "flos": 18004318035840.0, "grad_norm": 2.1486091239690865, "language_loss": 0.87857699, "learning_rate": 2.655987088047368e-06, "loss": 0.90075344, "num_input_tokens_seen": 73989080, "step": 3425, "time_per_iteration": 2.7301039695739746 }, { "auxiliary_loss_clip": 0.01178038, "auxiliary_loss_mlp": 0.01032436, "balance_loss_clip": 0.97532666, "balance_loss_mlp": 1.02364397, "epoch": 0.4119521433295256, "flos": 27163370171520.0, "grad_norm": 1.8011583307791093, "language_loss": 0.78466201, "learning_rate": 2.6552511594015912e-06, "loss": 0.80676675, "num_input_tokens_seen": 74009470, "step": 3426, "time_per_iteration": 2.763554573059082 }, { "auxiliary_loss_clip": 0.01175769, "auxiliary_loss_mlp": 0.01027991, "balance_loss_clip": 0.97170925, "balance_loss_mlp": 1.01954472, "epoch": 0.41207238622016473, "flos": 15122020014720.0, "grad_norm": 2.552514996550449, "language_loss": 0.85193551, "learning_rate": 2.654515131354735e-06, "loss": 0.87397313, "num_input_tokens_seen": 74027735, "step": 3427, "time_per_iteration": 2.679291248321533 }, { "auxiliary_loss_clip": 0.01183711, "auxiliary_loss_mlp": 0.01028272, "balance_loss_clip": 0.94101179, "balance_loss_mlp": 1.02026689, "epoch": 0.41219262911080384, "flos": 27052872958080.0, "grad_norm": 2.0014833917837245, "language_loss": 0.85197973, "learning_rate": 2.653779004018453e-06, "loss": 0.87409961, "num_input_tokens_seen": 74048300, "step": 3428, "time_per_iteration": 3.728872776031494 }, { "auxiliary_loss_clip": 0.01173445, "auxiliary_loss_mlp": 0.0102924, "balance_loss_clip": 0.97508669, "balance_loss_mlp": 1.02118754, "epoch": 0.4123128720014429, "flos": 24686360282880.0, "grad_norm": 1.7413340400418342, "language_loss": 0.82276243, "learning_rate": 2.653042777504417e-06, "loss": 0.84478927, "num_input_tokens_seen": 74070890, "step": 3429, "time_per_iteration": 2.7063980102539062 }, { "auxiliary_loss_clip": 0.01188904, "auxiliary_loss_mlp": 0.01027751, "balance_loss_clip": 0.97551596, "balance_loss_mlp": 1.01958513, "epoch": 0.412433114892082, "flos": 26244774731520.0, "grad_norm": 2.1284224597187427, "language_loss": 0.79926133, "learning_rate": 2.6523064519243105e-06, "loss": 0.82142788, "num_input_tokens_seen": 74090460, "step": 3430, "time_per_iteration": 2.7632999420166016 }, { "auxiliary_loss_clip": 0.01183506, "auxiliary_loss_mlp": 0.01035699, "balance_loss_clip": 1.01531184, "balance_loss_mlp": 1.02744424, "epoch": 0.4125533577827211, "flos": 21361031913600.0, "grad_norm": 2.1218474333259048, "language_loss": 0.79339981, "learning_rate": 2.6515700273898333e-06, "loss": 0.81559181, "num_input_tokens_seen": 74108335, "step": 3431, "time_per_iteration": 3.5941896438598633 }, { "auxiliary_loss_clip": 0.0117184, "auxiliary_loss_mlp": 0.01024654, "balance_loss_clip": 0.97681487, "balance_loss_mlp": 1.01698923, "epoch": 0.4126736006733602, "flos": 26067556005120.0, "grad_norm": 2.176032021267838, "language_loss": 0.69059181, "learning_rate": 2.6508335040127018e-06, "loss": 0.71255672, "num_input_tokens_seen": 74128030, "step": 3432, "time_per_iteration": 2.718360662460327 }, { "auxiliary_loss_clip": 0.01189023, "auxiliary_loss_mlp": 0.0102881, "balance_loss_clip": 1.01716375, "balance_loss_mlp": 1.02106082, "epoch": 0.4127938435639993, "flos": 25666146541440.0, "grad_norm": 1.375063941035949, "language_loss": 0.76875591, "learning_rate": 2.6500968819046446e-06, "loss": 0.79093421, "num_input_tokens_seen": 74148330, "step": 3433, "time_per_iteration": 2.696289539337158 }, { "auxiliary_loss_clip": 0.01165463, "auxiliary_loss_mlp": 0.01030815, "balance_loss_clip": 0.93355691, "balance_loss_mlp": 1.02310181, "epoch": 0.4129140864546384, "flos": 17995914253440.0, "grad_norm": 2.2587815684218016, "language_loss": 0.59142768, "learning_rate": 2.649360161177408e-06, "loss": 0.61339045, "num_input_tokens_seen": 74163390, "step": 3434, "time_per_iteration": 2.6588878631591797 }, { "auxiliary_loss_clip": 0.01187719, "auxiliary_loss_mlp": 0.01025308, "balance_loss_clip": 1.0144068, "balance_loss_mlp": 1.01712465, "epoch": 0.41303432934527745, "flos": 23732895715200.0, "grad_norm": 1.6943363256158173, "language_loss": 0.73182106, "learning_rate": 2.6486233419427504e-06, "loss": 0.75395143, "num_input_tokens_seen": 74183205, "step": 3435, "time_per_iteration": 2.7496204376220703 }, { "auxiliary_loss_clip": 0.01171491, "auxiliary_loss_mlp": 0.0102993, "balance_loss_clip": 0.93829441, "balance_loss_mlp": 1.02126312, "epoch": 0.41315457223591656, "flos": 19755286318080.0, "grad_norm": 2.096196054495243, "language_loss": 0.74847937, "learning_rate": 2.6478864243124484e-06, "loss": 0.77049351, "num_input_tokens_seen": 74202870, "step": 3436, "time_per_iteration": 2.697836399078369 }, { "auxiliary_loss_clip": 0.01180786, "auxiliary_loss_mlp": 0.01024846, "balance_loss_clip": 1.01170301, "balance_loss_mlp": 1.01750863, "epoch": 0.4132748151265556, "flos": 20923316778240.0, "grad_norm": 1.6878006559054126, "language_loss": 0.85231209, "learning_rate": 2.6471494083982903e-06, "loss": 0.87436843, "num_input_tokens_seen": 74222255, "step": 3437, "time_per_iteration": 2.7630937099456787 }, { "auxiliary_loss_clip": 0.01182003, "auxiliary_loss_mlp": 0.01035549, "balance_loss_clip": 0.93696076, "balance_loss_mlp": 1.02752614, "epoch": 0.4133950580171947, "flos": 32232520016640.0, "grad_norm": 2.108063687135923, "language_loss": 0.74563026, "learning_rate": 2.6464122943120818e-06, "loss": 0.76780576, "num_input_tokens_seen": 74242480, "step": 3438, "time_per_iteration": 2.86586856842041 }, { "auxiliary_loss_clip": 0.01179838, "auxiliary_loss_mlp": 0.01030574, "balance_loss_clip": 0.93902886, "balance_loss_mlp": 1.02254558, "epoch": 0.41351530090783384, "flos": 23292487059840.0, "grad_norm": 2.5778392650318454, "language_loss": 0.82574129, "learning_rate": 2.645675082165642e-06, "loss": 0.84784532, "num_input_tokens_seen": 74258690, "step": 3439, "time_per_iteration": 2.7733356952667236 }, { "auxiliary_loss_clip": 0.01185929, "auxiliary_loss_mlp": 0.01028871, "balance_loss_clip": 0.98007399, "balance_loss_mlp": 1.02024615, "epoch": 0.4136355437984729, "flos": 25593571111680.0, "grad_norm": 2.250102537687514, "language_loss": 0.75266922, "learning_rate": 2.644937772070806e-06, "loss": 0.77481723, "num_input_tokens_seen": 74277135, "step": 3440, "time_per_iteration": 2.763821840286255 }, { "auxiliary_loss_clip": 0.0118678, "auxiliary_loss_mlp": 0.01024268, "balance_loss_clip": 1.05546165, "balance_loss_mlp": 1.01641822, "epoch": 0.413755786689112, "flos": 19828615933440.0, "grad_norm": 2.3090699830268733, "language_loss": 0.83651906, "learning_rate": 2.6442003641394225e-06, "loss": 0.85862958, "num_input_tokens_seen": 74294730, "step": 3441, "time_per_iteration": 2.589370012283325 }, { "auxiliary_loss_clip": 0.01177475, "auxiliary_loss_mlp": 0.01022075, "balance_loss_clip": 0.97427988, "balance_loss_mlp": 1.01447284, "epoch": 0.4138760295797511, "flos": 26870446759680.0, "grad_norm": 1.4439809363161333, "language_loss": 0.83893186, "learning_rate": 2.643462858483356e-06, "loss": 0.86092734, "num_input_tokens_seen": 74315015, "step": 3442, "time_per_iteration": 2.7728116512298584 }, { "auxiliary_loss_clip": 0.01176329, "auxiliary_loss_mlp": 0.01027214, "balance_loss_clip": 0.89890981, "balance_loss_mlp": 1.0190897, "epoch": 0.41399627247039017, "flos": 16399254798720.0, "grad_norm": 2.3085644408826727, "language_loss": 0.72567821, "learning_rate": 2.6427252552144856e-06, "loss": 0.74771369, "num_input_tokens_seen": 74333665, "step": 3443, "time_per_iteration": 2.7296056747436523 }, { "auxiliary_loss_clip": 0.01186874, "auxiliary_loss_mlp": 0.01029928, "balance_loss_clip": 1.05478287, "balance_loss_mlp": 1.02150548, "epoch": 0.4141165153610293, "flos": 22930220442240.0, "grad_norm": 2.5226314209194873, "language_loss": 0.74912602, "learning_rate": 2.6419875544447044e-06, "loss": 0.771294, "num_input_tokens_seen": 74355065, "step": 3444, "time_per_iteration": 2.800088405609131 }, { "auxiliary_loss_clip": 0.01184351, "auxiliary_loss_mlp": 0.01038881, "balance_loss_clip": 1.05254197, "balance_loss_mlp": 1.03078127, "epoch": 0.4142367582516684, "flos": 25192556697600.0, "grad_norm": 1.5549155968185346, "language_loss": 0.71732032, "learning_rate": 2.6412497562859218e-06, "loss": 0.73955262, "num_input_tokens_seen": 74376345, "step": 3445, "time_per_iteration": 2.6331863403320312 }, { "auxiliary_loss_clip": 0.01186978, "auxiliary_loss_mlp": 0.01026883, "balance_loss_clip": 1.01397824, "balance_loss_mlp": 1.01890826, "epoch": 0.41435700114230745, "flos": 21690476478720.0, "grad_norm": 2.051412427661836, "language_loss": 0.76035964, "learning_rate": 2.6405118608500617e-06, "loss": 0.78249824, "num_input_tokens_seen": 74395170, "step": 3446, "time_per_iteration": 2.704639196395874 }, { "auxiliary_loss_clip": 0.01176729, "auxiliary_loss_mlp": 0.01025158, "balance_loss_clip": 0.94105232, "balance_loss_mlp": 1.01783299, "epoch": 0.41447724403294656, "flos": 25995160143360.0, "grad_norm": 1.6467592433166236, "language_loss": 0.81631196, "learning_rate": 2.6397738682490613e-06, "loss": 0.83833086, "num_input_tokens_seen": 74416070, "step": 3447, "time_per_iteration": 2.7701478004455566 }, { "auxiliary_loss_clip": 0.01187367, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.05572724, "balance_loss_mlp": 1.02579331, "epoch": 0.41459748692358567, "flos": 18259678800000.0, "grad_norm": 2.1910146096019383, "language_loss": 0.75094092, "learning_rate": 2.6390357785948734e-06, "loss": 0.77314377, "num_input_tokens_seen": 74433185, "step": 3448, "time_per_iteration": 2.6890008449554443 }, { "auxiliary_loss_clip": 0.01185196, "auxiliary_loss_mlp": 0.01030216, "balance_loss_clip": 1.0155561, "balance_loss_mlp": 1.02180624, "epoch": 0.4147177298142247, "flos": 24168456034560.0, "grad_norm": 1.5950852612207989, "language_loss": 0.80150449, "learning_rate": 2.6382975919994667e-06, "loss": 0.82365865, "num_input_tokens_seen": 74453760, "step": 3449, "time_per_iteration": 3.58577561378479 }, { "auxiliary_loss_clip": 0.01183992, "auxiliary_loss_mlp": 0.01025008, "balance_loss_clip": 0.97682059, "balance_loss_mlp": 1.0174557, "epoch": 0.41483797270486383, "flos": 20084659056000.0, "grad_norm": 1.7914680161359797, "language_loss": 0.72970951, "learning_rate": 2.637559308574822e-06, "loss": 0.75179946, "num_input_tokens_seen": 74473505, "step": 3450, "time_per_iteration": 3.721907138824463 }, { "auxiliary_loss_clip": 0.01184097, "auxiliary_loss_mlp": 0.0102541, "balance_loss_clip": 1.05385292, "balance_loss_mlp": 1.01759028, "epoch": 0.4149582155955029, "flos": 30081040110720.0, "grad_norm": 1.7175818558574123, "language_loss": 0.70972228, "learning_rate": 2.6368209284329376e-06, "loss": 0.73181731, "num_input_tokens_seen": 74494135, "step": 3451, "time_per_iteration": 2.7651565074920654 }, { "auxiliary_loss_clip": 0.01181803, "auxiliary_loss_mlp": 0.01030191, "balance_loss_clip": 1.01354039, "balance_loss_mlp": 1.0224905, "epoch": 0.415078458486142, "flos": 16764394504320.0, "grad_norm": 2.58411783519183, "language_loss": 0.75685728, "learning_rate": 2.636082451685825e-06, "loss": 0.77897722, "num_input_tokens_seen": 74512335, "step": 3452, "time_per_iteration": 2.681755542755127 }, { "auxiliary_loss_clip": 0.01187435, "auxiliary_loss_mlp": 0.01029688, "balance_loss_clip": 0.97945929, "balance_loss_mlp": 1.02217245, "epoch": 0.4151987013767811, "flos": 26033692458240.0, "grad_norm": 1.7585968894645216, "language_loss": 0.86309844, "learning_rate": 2.6353438784455094e-06, "loss": 0.88526976, "num_input_tokens_seen": 74535620, "step": 3453, "time_per_iteration": 2.7314984798431396 }, { "auxiliary_loss_clip": 0.01180812, "auxiliary_loss_mlp": 0.01029253, "balance_loss_clip": 0.97783935, "balance_loss_mlp": 1.02026439, "epoch": 0.41531894426742016, "flos": 24608002763520.0, "grad_norm": 2.0713743474238084, "language_loss": 0.71176648, "learning_rate": 2.6346052088240326e-06, "loss": 0.73386717, "num_input_tokens_seen": 74555140, "step": 3454, "time_per_iteration": 2.7406585216522217 }, { "auxiliary_loss_clip": 0.0118503, "auxiliary_loss_mlp": 0.01033866, "balance_loss_clip": 0.97775888, "balance_loss_mlp": 1.02607524, "epoch": 0.4154391871580593, "flos": 14975791747200.0, "grad_norm": 1.8542563290761596, "language_loss": 0.77398521, "learning_rate": 2.63386644293345e-06, "loss": 0.79617417, "num_input_tokens_seen": 74571485, "step": 3455, "time_per_iteration": 3.548403024673462 }, { "auxiliary_loss_clip": 0.01175482, "auxiliary_loss_mlp": 0.01029, "balance_loss_clip": 0.93517041, "balance_loss_mlp": 1.02118599, "epoch": 0.4155594300486984, "flos": 14647173194880.0, "grad_norm": 2.201345169196974, "language_loss": 0.83041418, "learning_rate": 2.633127580885833e-06, "loss": 0.85245895, "num_input_tokens_seen": 74585985, "step": 3456, "time_per_iteration": 2.7635326385498047 }, { "auxiliary_loss_clip": 0.01186436, "auxiliary_loss_mlp": 0.01027628, "balance_loss_clip": 1.0575608, "balance_loss_mlp": 1.02001023, "epoch": 0.41567967293933744, "flos": 29497276275840.0, "grad_norm": 1.9553808626044562, "language_loss": 0.65124154, "learning_rate": 2.632388622793265e-06, "loss": 0.67338216, "num_input_tokens_seen": 74605140, "step": 3457, "time_per_iteration": 3.647773265838623 }, { "auxiliary_loss_clip": 0.01182465, "auxiliary_loss_mlp": 0.01030586, "balance_loss_clip": 1.01517034, "balance_loss_mlp": 1.02323127, "epoch": 0.41579991582997655, "flos": 19238387650560.0, "grad_norm": 1.6439877765467155, "language_loss": 0.67805564, "learning_rate": 2.6316495687678457e-06, "loss": 0.70018613, "num_input_tokens_seen": 74623790, "step": 3458, "time_per_iteration": 2.693096399307251 }, { "auxiliary_loss_clip": 0.01171316, "auxiliary_loss_mlp": 0.01024164, "balance_loss_clip": 0.89795536, "balance_loss_mlp": 1.01587939, "epoch": 0.41592015872061566, "flos": 24462061804800.0, "grad_norm": 2.2448103470622973, "language_loss": 0.76879984, "learning_rate": 2.6309104189216887e-06, "loss": 0.79075468, "num_input_tokens_seen": 74641355, "step": 3459, "time_per_iteration": 2.7907612323760986 }, { "auxiliary_loss_clip": 0.0116891, "auxiliary_loss_mlp": 0.01124271, "balance_loss_clip": 0.93550289, "balance_loss_mlp": 0.0, "epoch": 0.4160404016112547, "flos": 20775651966720.0, "grad_norm": 2.05163983420307, "language_loss": 0.74756902, "learning_rate": 2.630171173366923e-06, "loss": 0.77050084, "num_input_tokens_seen": 74657155, "step": 3460, "time_per_iteration": 2.715069055557251 }, { "auxiliary_loss_clip": 0.01178129, "auxiliary_loss_mlp": 0.01030061, "balance_loss_clip": 0.8993057, "balance_loss_mlp": 1.02182913, "epoch": 0.41616064450189383, "flos": 13916462820480.0, "grad_norm": 2.6343050376991646, "language_loss": 0.73852915, "learning_rate": 2.629431832215691e-06, "loss": 0.760611, "num_input_tokens_seen": 74671960, "step": 3461, "time_per_iteration": 2.7493739128112793 }, { "auxiliary_loss_clip": 0.01180706, "auxiliary_loss_mlp": 0.01027554, "balance_loss_clip": 0.97599787, "balance_loss_mlp": 1.01969171, "epoch": 0.41628088739253294, "flos": 20010826650240.0, "grad_norm": 1.9053996479775888, "language_loss": 0.87255692, "learning_rate": 2.628692395580151e-06, "loss": 0.89463949, "num_input_tokens_seen": 74692050, "step": 3462, "time_per_iteration": 2.7053873538970947 }, { "auxiliary_loss_clip": 0.01155389, "auxiliary_loss_mlp": 0.01034525, "balance_loss_clip": 0.85618281, "balance_loss_mlp": 1.02649069, "epoch": 0.416401130283172, "flos": 29168801377920.0, "grad_norm": 1.5955692931938328, "language_loss": 0.78970343, "learning_rate": 2.6279528635724747e-06, "loss": 0.81160259, "num_input_tokens_seen": 74712205, "step": 3463, "time_per_iteration": 2.8523082733154297 }, { "auxiliary_loss_clip": 0.01182779, "auxiliary_loss_mlp": 0.01027026, "balance_loss_clip": 1.01431274, "balance_loss_mlp": 1.01902676, "epoch": 0.4165213731738111, "flos": 16246813478400.0, "grad_norm": 2.6365375437842444, "language_loss": 0.7869004, "learning_rate": 2.627213236304848e-06, "loss": 0.80899835, "num_input_tokens_seen": 74729005, "step": 3464, "time_per_iteration": 2.6426756381988525 }, { "auxiliary_loss_clip": 0.01185937, "auxiliary_loss_mlp": 0.01033278, "balance_loss_clip": 1.01518512, "balance_loss_mlp": 1.02528536, "epoch": 0.4166416160644502, "flos": 33765438787200.0, "grad_norm": 2.243396896160623, "language_loss": 0.70947182, "learning_rate": 2.626473513889472e-06, "loss": 0.73166394, "num_input_tokens_seen": 74751385, "step": 3465, "time_per_iteration": 2.790417432785034 }, { "auxiliary_loss_clip": 0.0117618, "auxiliary_loss_mlp": 0.0103238, "balance_loss_clip": 1.01417327, "balance_loss_mlp": 1.02432156, "epoch": 0.41676185895508927, "flos": 20917498775040.0, "grad_norm": 1.912735639636027, "language_loss": 0.82863563, "learning_rate": 2.625733696438562e-06, "loss": 0.85072124, "num_input_tokens_seen": 74768890, "step": 3466, "time_per_iteration": 2.7321414947509766 }, { "auxiliary_loss_clip": 0.01175941, "auxiliary_loss_mlp": 0.01034511, "balance_loss_clip": 0.97485095, "balance_loss_mlp": 1.02618408, "epoch": 0.4168821018457284, "flos": 18406122549120.0, "grad_norm": 1.613615760964088, "language_loss": 0.7528314, "learning_rate": 2.6249937840643476e-06, "loss": 0.77493584, "num_input_tokens_seen": 74787195, "step": 3467, "time_per_iteration": 2.6975929737091064 }, { "auxiliary_loss_clip": 0.01188018, "auxiliary_loss_mlp": 0.01123977, "balance_loss_clip": 1.05724335, "balance_loss_mlp": 0.0, "epoch": 0.41700234473636744, "flos": 18698399516160.0, "grad_norm": 1.5863018950070351, "language_loss": 0.66523474, "learning_rate": 2.6242537768790733e-06, "loss": 0.68835473, "num_input_tokens_seen": 74806350, "step": 3468, "time_per_iteration": 2.7244601249694824 }, { "auxiliary_loss_clip": 0.01184226, "auxiliary_loss_mlp": 0.01029323, "balance_loss_clip": 1.01653469, "balance_loss_mlp": 1.02081704, "epoch": 0.41712258762700655, "flos": 31033283616000.0, "grad_norm": 1.9526978365070122, "language_loss": 0.68719625, "learning_rate": 2.6235136749949975e-06, "loss": 0.70933175, "num_input_tokens_seen": 74829800, "step": 3469, "time_per_iteration": 2.8193888664245605 }, { "auxiliary_loss_clip": 0.01185802, "auxiliary_loss_mlp": 0.01027215, "balance_loss_clip": 1.05403996, "balance_loss_mlp": 1.01957989, "epoch": 0.41724283051764566, "flos": 35914763877120.0, "grad_norm": 2.5566045033970926, "language_loss": 0.6154145, "learning_rate": 2.6227734785243924e-06, "loss": 0.63754469, "num_input_tokens_seen": 74849760, "step": 3470, "time_per_iteration": 2.780707836151123 }, { "auxiliary_loss_clip": 0.011692, "auxiliary_loss_mlp": 0.01027354, "balance_loss_clip": 0.85903454, "balance_loss_mlp": 1.01935482, "epoch": 0.4173630734082847, "flos": 25333649320320.0, "grad_norm": 1.7534449073021712, "language_loss": 0.79231644, "learning_rate": 2.6220331875795466e-06, "loss": 0.81428206, "num_input_tokens_seen": 74869110, "step": 3471, "time_per_iteration": 2.7944750785827637 }, { "auxiliary_loss_clip": 0.0117961, "auxiliary_loss_mlp": 0.01031849, "balance_loss_clip": 1.01532507, "balance_loss_mlp": 1.02359939, "epoch": 0.4174833162989238, "flos": 26685398868480.0, "grad_norm": 1.6231094612164216, "language_loss": 0.74799752, "learning_rate": 2.62129280227276e-06, "loss": 0.7701121, "num_input_tokens_seen": 74889110, "step": 3472, "time_per_iteration": 2.716191291809082 }, { "auxiliary_loss_clip": 0.01192279, "auxiliary_loss_mlp": 0.01027575, "balance_loss_clip": 1.0187782, "balance_loss_mlp": 1.01973069, "epoch": 0.41760355918956293, "flos": 74739584010240.0, "grad_norm": 2.2180529369216426, "language_loss": 0.68510842, "learning_rate": 2.62055232271635e-06, "loss": 0.70730698, "num_input_tokens_seen": 74916260, "step": 3473, "time_per_iteration": 3.0702743530273438 }, { "auxiliary_loss_clip": 0.01169793, "auxiliary_loss_mlp": 0.01031761, "balance_loss_clip": 0.93572646, "balance_loss_mlp": 1.02334511, "epoch": 0.417723802080202, "flos": 14317513148160.0, "grad_norm": 1.8898162333862962, "language_loss": 0.87658709, "learning_rate": 2.619811749022646e-06, "loss": 0.8986026, "num_input_tokens_seen": 74931570, "step": 3474, "time_per_iteration": 2.766219139099121 }, { "auxiliary_loss_clip": 0.01188211, "auxiliary_loss_mlp": 0.01032427, "balance_loss_clip": 1.0187273, "balance_loss_mlp": 1.02387393, "epoch": 0.4178440449708411, "flos": 14643797316480.0, "grad_norm": 2.081988231652057, "language_loss": 0.71309793, "learning_rate": 2.6190710813039917e-06, "loss": 0.7353043, "num_input_tokens_seen": 74944695, "step": 3475, "time_per_iteration": 3.518368721008301 }, { "auxiliary_loss_clip": 0.01172146, "auxiliary_loss_mlp": 0.01124678, "balance_loss_clip": 0.89579505, "balance_loss_mlp": 0.0, "epoch": 0.4179642878614802, "flos": 21507296094720.0, "grad_norm": 2.30655523152882, "language_loss": 0.83603007, "learning_rate": 2.618330319672747e-06, "loss": 0.8589983, "num_input_tokens_seen": 74964115, "step": 3476, "time_per_iteration": 2.782423734664917 }, { "auxiliary_loss_clip": 0.01186894, "auxiliary_loss_mlp": 0.0102474, "balance_loss_clip": 1.05534959, "balance_loss_mlp": 1.01733434, "epoch": 0.41808453075211927, "flos": 18441997257600.0, "grad_norm": 2.2310911579788733, "language_loss": 0.91828656, "learning_rate": 2.617589464241284e-06, "loss": 0.94040293, "num_input_tokens_seen": 74978515, "step": 3477, "time_per_iteration": 2.6704111099243164 }, { "auxiliary_loss_clip": 0.01185528, "auxiliary_loss_mlp": 0.01027857, "balance_loss_clip": 0.93924105, "balance_loss_mlp": 1.02029872, "epoch": 0.4182047736427584, "flos": 20301020628480.0, "grad_norm": 1.8586598616053127, "language_loss": 0.7465117, "learning_rate": 2.6168485151219914e-06, "loss": 0.76864552, "num_input_tokens_seen": 74998135, "step": 3478, "time_per_iteration": 2.7435412406921387 }, { "auxiliary_loss_clip": 0.0118864, "auxiliary_loss_mlp": 0.01021841, "balance_loss_clip": 1.01928341, "balance_loss_mlp": 1.01346636, "epoch": 0.4183250165333975, "flos": 18876623823360.0, "grad_norm": 2.0494332870072736, "language_loss": 0.70864582, "learning_rate": 2.616107472427269e-06, "loss": 0.73075062, "num_input_tokens_seen": 75012830, "step": 3479, "time_per_iteration": 2.7516915798187256 }, { "auxiliary_loss_clip": 0.01189323, "auxiliary_loss_mlp": 0.01026729, "balance_loss_clip": 1.01610899, "balance_loss_mlp": 1.01866484, "epoch": 0.41844525942403654, "flos": 17740050698880.0, "grad_norm": 2.498268287327362, "language_loss": 0.76386726, "learning_rate": 2.615366336269533e-06, "loss": 0.78602779, "num_input_tokens_seen": 75026495, "step": 3480, "time_per_iteration": 3.6576428413391113 }, { "auxiliary_loss_clip": 0.01190357, "auxiliary_loss_mlp": 0.01033582, "balance_loss_clip": 1.05516124, "balance_loss_mlp": 1.0249331, "epoch": 0.41856550231467565, "flos": 18361377181440.0, "grad_norm": 2.374519901444704, "language_loss": 0.8031137, "learning_rate": 2.6146251067612126e-06, "loss": 0.82535303, "num_input_tokens_seen": 75041970, "step": 3481, "time_per_iteration": 2.6042940616607666 }, { "auxiliary_loss_clip": 0.01191481, "auxiliary_loss_mlp": 0.01029664, "balance_loss_clip": 1.02264571, "balance_loss_mlp": 1.0216893, "epoch": 0.41868574520531476, "flos": 22781801445120.0, "grad_norm": 1.6532132809500395, "language_loss": 0.82392371, "learning_rate": 2.6138837840147525e-06, "loss": 0.84613508, "num_input_tokens_seen": 75061005, "step": 3482, "time_per_iteration": 2.6588799953460693 }, { "auxiliary_loss_clip": 0.01181036, "auxiliary_loss_mlp": 0.01026044, "balance_loss_clip": 0.93963552, "balance_loss_mlp": 1.01856947, "epoch": 0.4188059880959538, "flos": 13699167494400.0, "grad_norm": 1.9473933958747376, "language_loss": 0.76702547, "learning_rate": 2.6131423681426103e-06, "loss": 0.78909624, "num_input_tokens_seen": 75076920, "step": 3483, "time_per_iteration": 3.564392328262329 }, { "auxiliary_loss_clip": 0.01185967, "auxiliary_loss_mlp": 0.01025585, "balance_loss_clip": 1.05544412, "balance_loss_mlp": 1.0176878, "epoch": 0.41892623098659293, "flos": 37818281220480.0, "grad_norm": 1.4680304474591028, "language_loss": 0.72634679, "learning_rate": 2.6124008592572587e-06, "loss": 0.74846232, "num_input_tokens_seen": 75100905, "step": 3484, "time_per_iteration": 2.781047821044922 }, { "auxiliary_loss_clip": 0.01190185, "auxiliary_loss_mlp": 0.01030046, "balance_loss_clip": 1.05505335, "balance_loss_mlp": 1.02177906, "epoch": 0.419046473877232, "flos": 23258874908160.0, "grad_norm": 2.0737394428801568, "language_loss": 0.8160212, "learning_rate": 2.6116592574711835e-06, "loss": 0.83822346, "num_input_tokens_seen": 75119205, "step": 3485, "time_per_iteration": 2.661757707595825 }, { "auxiliary_loss_clip": 0.01193782, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.05909431, "balance_loss_mlp": 1.02855539, "epoch": 0.4191667167678711, "flos": 20741034234240.0, "grad_norm": 1.7101408291939928, "language_loss": 0.83942688, "learning_rate": 2.6109175628968853e-06, "loss": 0.86173451, "num_input_tokens_seen": 75138970, "step": 3486, "time_per_iteration": 2.629390239715576 }, { "auxiliary_loss_clip": 0.01171367, "auxiliary_loss_mlp": 0.01021986, "balance_loss_clip": 1.01191545, "balance_loss_mlp": 1.01448512, "epoch": 0.4192869596585102, "flos": 23586416052480.0, "grad_norm": 1.7876619961697222, "language_loss": 0.82806325, "learning_rate": 2.610175775646878e-06, "loss": 0.84999681, "num_input_tokens_seen": 75157550, "step": 3487, "time_per_iteration": 2.693220376968384 }, { "auxiliary_loss_clip": 0.01177923, "auxiliary_loss_mlp": 0.01027063, "balance_loss_clip": 0.97557545, "balance_loss_mlp": 1.01922464, "epoch": 0.41940720254914926, "flos": 25081269384960.0, "grad_norm": 1.7398962768705206, "language_loss": 0.72841907, "learning_rate": 2.6094338958336907e-06, "loss": 0.75046885, "num_input_tokens_seen": 75176220, "step": 3488, "time_per_iteration": 2.7097504138946533 }, { "auxiliary_loss_clip": 0.01183761, "auxiliary_loss_mlp": 0.0102787, "balance_loss_clip": 0.97812295, "balance_loss_mlp": 1.02021706, "epoch": 0.41952744543978837, "flos": 15554132628480.0, "grad_norm": 2.338815157078354, "language_loss": 0.82169974, "learning_rate": 2.608691923569867e-06, "loss": 0.84381604, "num_input_tokens_seen": 75193095, "step": 3489, "time_per_iteration": 2.6906912326812744 }, { "auxiliary_loss_clip": 0.01190583, "auxiliary_loss_mlp": 0.01035291, "balance_loss_clip": 1.01910734, "balance_loss_mlp": 1.02775145, "epoch": 0.4196476883304275, "flos": 24644775312000.0, "grad_norm": 1.6417650271897364, "language_loss": 0.75790256, "learning_rate": 2.6079498589679616e-06, "loss": 0.78016138, "num_input_tokens_seen": 75214185, "step": 3490, "time_per_iteration": 2.7309958934783936 }, { "auxiliary_loss_clip": 0.01171849, "auxiliary_loss_mlp": 0.01029883, "balance_loss_clip": 0.85861367, "balance_loss_mlp": 1.02094233, "epoch": 0.41976793122106654, "flos": 24531333183360.0, "grad_norm": 1.7360876832385352, "language_loss": 0.76503444, "learning_rate": 2.6072077021405465e-06, "loss": 0.7870518, "num_input_tokens_seen": 75233020, "step": 3491, "time_per_iteration": 2.8749074935913086 }, { "auxiliary_loss_clip": 0.01190768, "auxiliary_loss_mlp": 0.01034322, "balance_loss_clip": 0.9399066, "balance_loss_mlp": 1.02599478, "epoch": 0.41988817411170565, "flos": 21175301664000.0, "grad_norm": 1.6415459643180637, "language_loss": 0.69254279, "learning_rate": 2.6064654532002054e-06, "loss": 0.71479362, "num_input_tokens_seen": 75252030, "step": 3492, "time_per_iteration": 2.756105422973633 }, { "auxiliary_loss_clip": 0.01185259, "auxiliary_loss_mlp": 0.01032289, "balance_loss_clip": 1.0548737, "balance_loss_mlp": 1.02453399, "epoch": 0.42000841700234476, "flos": 31649402626560.0, "grad_norm": 1.4186719695794114, "language_loss": 0.75886011, "learning_rate": 2.6057231122595375e-06, "loss": 0.78103554, "num_input_tokens_seen": 75273340, "step": 3493, "time_per_iteration": 2.69443416595459 }, { "auxiliary_loss_clip": 0.01182194, "auxiliary_loss_mlp": 0.01027154, "balance_loss_clip": 0.97468269, "balance_loss_mlp": 1.01875556, "epoch": 0.4201286598929838, "flos": 21281525159040.0, "grad_norm": 1.5591993014777832, "language_loss": 0.72771931, "learning_rate": 2.604980679431154e-06, "loss": 0.74981272, "num_input_tokens_seen": 75291580, "step": 3494, "time_per_iteration": 2.7224104404449463 }, { "auxiliary_loss_clip": 0.01184877, "auxiliary_loss_mlp": 0.01028242, "balance_loss_clip": 1.01436079, "balance_loss_mlp": 1.02031469, "epoch": 0.4202489027836229, "flos": 18546532813440.0, "grad_norm": 1.9166332704529216, "language_loss": 0.74601597, "learning_rate": 2.604238154827684e-06, "loss": 0.76814711, "num_input_tokens_seen": 75308205, "step": 3495, "time_per_iteration": 2.675802707672119 }, { "auxiliary_loss_clip": 0.01188095, "auxiliary_loss_mlp": 0.01025361, "balance_loss_clip": 1.01719522, "balance_loss_mlp": 1.01793385, "epoch": 0.42036914567426203, "flos": 19317643009920.0, "grad_norm": 1.7217458496515814, "language_loss": 0.72444749, "learning_rate": 2.6034955385617656e-06, "loss": 0.74658203, "num_input_tokens_seen": 75326535, "step": 3496, "time_per_iteration": 2.684577465057373 }, { "auxiliary_loss_clip": 0.01102122, "auxiliary_loss_mlp": 0.01015687, "balance_loss_clip": 0.91713393, "balance_loss_mlp": 1.01360035, "epoch": 0.4204893885649011, "flos": 67842942935040.0, "grad_norm": 0.7205534073908856, "language_loss": 0.61703169, "learning_rate": 2.6027528307460544e-06, "loss": 0.63820982, "num_input_tokens_seen": 75390540, "step": 3497, "time_per_iteration": 3.391031503677368 }, { "auxiliary_loss_clip": 0.01187282, "auxiliary_loss_mlp": 0.01025099, "balance_loss_clip": 1.05584335, "balance_loss_mlp": 1.01728535, "epoch": 0.4206096314555402, "flos": 21908777385600.0, "grad_norm": 1.9821187246806822, "language_loss": 0.86634719, "learning_rate": 2.602010031493217e-06, "loss": 0.88847089, "num_input_tokens_seen": 75408770, "step": 3498, "time_per_iteration": 2.6970510482788086 }, { "auxiliary_loss_clip": 0.01181167, "auxiliary_loss_mlp": 0.01028319, "balance_loss_clip": 0.94018936, "balance_loss_mlp": 1.02026629, "epoch": 0.42072987434617926, "flos": 29278185269760.0, "grad_norm": 2.0405421854796093, "language_loss": 0.86951745, "learning_rate": 2.6012671409159367e-06, "loss": 0.89161229, "num_input_tokens_seen": 75430105, "step": 3499, "time_per_iteration": 2.7776873111724854 }, { "auxiliary_loss_clip": 0.01179175, "auxiliary_loss_mlp": 0.01028258, "balance_loss_clip": 0.97773898, "balance_loss_mlp": 1.01960301, "epoch": 0.42085011723681837, "flos": 27600726170880.0, "grad_norm": 1.8918254396483596, "language_loss": 0.8153187, "learning_rate": 2.6005241591269097e-06, "loss": 0.83739305, "num_input_tokens_seen": 75449475, "step": 3500, "time_per_iteration": 2.7494237422943115 }, { "auxiliary_loss_clip": 0.01179661, "auxiliary_loss_mlp": 0.0103508, "balance_loss_clip": 0.94099188, "balance_loss_mlp": 1.02761185, "epoch": 0.4209703601274575, "flos": 27818632028160.0, "grad_norm": 1.8842481387757757, "language_loss": 0.79920936, "learning_rate": 2.5997810862388454e-06, "loss": 0.82135677, "num_input_tokens_seen": 75469315, "step": 3501, "time_per_iteration": 3.7780332565307617 }, { "auxiliary_loss_clip": 0.01184761, "auxiliary_loss_mlp": 0.01030468, "balance_loss_clip": 0.97617149, "balance_loss_mlp": 1.02236211, "epoch": 0.42109060301809653, "flos": 27525529048320.0, "grad_norm": 2.0263907985920158, "language_loss": 0.759251, "learning_rate": 2.599037922364467e-06, "loss": 0.7814033, "num_input_tokens_seen": 75488215, "step": 3502, "time_per_iteration": 2.8263943195343018 }, { "auxiliary_loss_clip": 0.01178565, "auxiliary_loss_mlp": 0.01029116, "balance_loss_clip": 0.94134623, "balance_loss_mlp": 1.02078331, "epoch": 0.42121084590873564, "flos": 29314275459840.0, "grad_norm": 2.6255049498555465, "language_loss": 0.75611192, "learning_rate": 2.5982946676165112e-06, "loss": 0.77818871, "num_input_tokens_seen": 75507985, "step": 3503, "time_per_iteration": 2.8024768829345703 }, { "auxiliary_loss_clip": 0.01105061, "auxiliary_loss_mlp": 0.00999433, "balance_loss_clip": 0.92651051, "balance_loss_mlp": 0.99722749, "epoch": 0.42133108879937475, "flos": 67398835178880.0, "grad_norm": 0.7298756686079526, "language_loss": 0.57713044, "learning_rate": 2.5975513221077313e-06, "loss": 0.59817541, "num_input_tokens_seen": 75571955, "step": 3504, "time_per_iteration": 3.3488729000091553 }, { "auxiliary_loss_clip": 0.01171682, "auxiliary_loss_mlp": 0.01028209, "balance_loss_clip": 0.97468662, "balance_loss_mlp": 1.02067542, "epoch": 0.4214513316900138, "flos": 23106038538240.0, "grad_norm": 2.1570297162149688, "language_loss": 0.88353491, "learning_rate": 2.5968078859508897e-06, "loss": 0.90553385, "num_input_tokens_seen": 75589155, "step": 3505, "time_per_iteration": 2.755915880203247 }, { "auxiliary_loss_clip": 0.01181841, "auxiliary_loss_mlp": 0.01023431, "balance_loss_clip": 1.01324272, "balance_loss_mlp": 1.01573002, "epoch": 0.4215715745806529, "flos": 15336190857600.0, "grad_norm": 1.8409035065545512, "language_loss": 0.79440677, "learning_rate": 2.5960643592587673e-06, "loss": 0.81645948, "num_input_tokens_seen": 75606565, "step": 3506, "time_per_iteration": 3.6332826614379883 }, { "auxiliary_loss_clip": 0.01177015, "auxiliary_loss_mlp": 0.01032778, "balance_loss_clip": 0.93609905, "balance_loss_mlp": 1.02514815, "epoch": 0.42169181747129203, "flos": 22127257860480.0, "grad_norm": 1.801031336351951, "language_loss": 0.81486338, "learning_rate": 2.5953207421441553e-06, "loss": 0.83696133, "num_input_tokens_seen": 75625165, "step": 3507, "time_per_iteration": 2.7352051734924316 }, { "auxiliary_loss_clip": 0.01187562, "auxiliary_loss_mlp": 0.01022843, "balance_loss_clip": 0.94095773, "balance_loss_mlp": 1.01516604, "epoch": 0.4218120603619311, "flos": 22630724841600.0, "grad_norm": 2.0906236990056715, "language_loss": 0.75143284, "learning_rate": 2.5945770347198603e-06, "loss": 0.77353692, "num_input_tokens_seen": 75643320, "step": 3508, "time_per_iteration": 2.7994067668914795 }, { "auxiliary_loss_clip": 0.01176486, "auxiliary_loss_mlp": 0.0102314, "balance_loss_clip": 0.9732976, "balance_loss_mlp": 1.01576114, "epoch": 0.4219323032525702, "flos": 19682818629120.0, "grad_norm": 1.885172486557838, "language_loss": 0.81988895, "learning_rate": 2.593833237098701e-06, "loss": 0.84188521, "num_input_tokens_seen": 75660920, "step": 3509, "time_per_iteration": 3.5606443881988525 }, { "auxiliary_loss_clip": 0.01180444, "auxiliary_loss_mlp": 0.01026526, "balance_loss_clip": 1.01142359, "balance_loss_mlp": 1.01835394, "epoch": 0.4220525461432093, "flos": 30190747224960.0, "grad_norm": 1.7314074585587502, "language_loss": 0.62340665, "learning_rate": 2.593089349393512e-06, "loss": 0.64547628, "num_input_tokens_seen": 75681410, "step": 3510, "time_per_iteration": 2.7522900104522705 }, { "auxiliary_loss_clip": 0.0118682, "auxiliary_loss_mlp": 0.01028583, "balance_loss_clip": 1.01936519, "balance_loss_mlp": 1.02090049, "epoch": 0.42217278903384836, "flos": 24315941278080.0, "grad_norm": 2.4138911512301773, "language_loss": 0.83092946, "learning_rate": 2.592345371717141e-06, "loss": 0.85308349, "num_input_tokens_seen": 75700940, "step": 3511, "time_per_iteration": 2.6886637210845947 }, { "auxiliary_loss_clip": 0.01186457, "auxiliary_loss_mlp": 0.01028363, "balance_loss_clip": 1.01916564, "balance_loss_mlp": 1.02016735, "epoch": 0.42229303192448747, "flos": 17092474352640.0, "grad_norm": 2.209906670498662, "language_loss": 0.72069752, "learning_rate": 2.591601304182448e-06, "loss": 0.74284571, "num_input_tokens_seen": 75718910, "step": 3512, "time_per_iteration": 2.6710915565490723 }, { "auxiliary_loss_clip": 0.01182249, "auxiliary_loss_mlp": 0.01027953, "balance_loss_clip": 0.98029184, "balance_loss_mlp": 1.02021575, "epoch": 0.4224132748151266, "flos": 22784530878720.0, "grad_norm": 1.6858630549809825, "language_loss": 0.79081267, "learning_rate": 2.5908571469023067e-06, "loss": 0.81291467, "num_input_tokens_seen": 75738395, "step": 3513, "time_per_iteration": 2.708738327026367 }, { "auxiliary_loss_clip": 0.01181847, "auxiliary_loss_mlp": 0.01027011, "balance_loss_clip": 1.0523684, "balance_loss_mlp": 1.01970935, "epoch": 0.42253351770576564, "flos": 17819090576640.0, "grad_norm": 2.0881353794947954, "language_loss": 0.75247639, "learning_rate": 2.5901128999896067e-06, "loss": 0.77456498, "num_input_tokens_seen": 75753825, "step": 3514, "time_per_iteration": 2.545193672180176 }, { "auxiliary_loss_clip": 0.01181675, "auxiliary_loss_mlp": 0.01022587, "balance_loss_clip": 1.01533008, "balance_loss_mlp": 1.01530302, "epoch": 0.42265376059640475, "flos": 28512390286080.0, "grad_norm": 1.7914341396823, "language_loss": 0.67907536, "learning_rate": 2.5893685635572487e-06, "loss": 0.70111793, "num_input_tokens_seen": 75774675, "step": 3515, "time_per_iteration": 2.7367897033691406 }, { "auxiliary_loss_clip": 0.01177912, "auxiliary_loss_mlp": 0.01024803, "balance_loss_clip": 0.97620177, "balance_loss_mlp": 1.01680422, "epoch": 0.4227740034870438, "flos": 16253349753600.0, "grad_norm": 1.7957405479549722, "language_loss": 0.68621469, "learning_rate": 2.5886241377181483e-06, "loss": 0.70824188, "num_input_tokens_seen": 75793545, "step": 3516, "time_per_iteration": 2.6461522579193115 }, { "auxiliary_loss_clip": 0.01183474, "auxiliary_loss_mlp": 0.01031176, "balance_loss_clip": 1.01583552, "balance_loss_mlp": 1.02319467, "epoch": 0.4228942463776829, "flos": 25295691623040.0, "grad_norm": 1.6974181028665312, "language_loss": 0.8144871, "learning_rate": 2.587879622585234e-06, "loss": 0.83663356, "num_input_tokens_seen": 75812145, "step": 3517, "time_per_iteration": 2.6927297115325928 }, { "auxiliary_loss_clip": 0.0118343, "auxiliary_loss_mlp": 0.01027953, "balance_loss_clip": 1.01751328, "balance_loss_mlp": 1.02027607, "epoch": 0.423014489268322, "flos": 26395779507840.0, "grad_norm": 2.013028885925018, "language_loss": 0.7583732, "learning_rate": 2.5871350182714486e-06, "loss": 0.780487, "num_input_tokens_seen": 75833025, "step": 3518, "time_per_iteration": 2.670905590057373 }, { "auxiliary_loss_clip": 0.01179221, "auxiliary_loss_mlp": 0.01025446, "balance_loss_clip": 1.05068505, "balance_loss_mlp": 1.01818275, "epoch": 0.4231347321589611, "flos": 17274002711040.0, "grad_norm": 1.9331872620366677, "language_loss": 0.80371135, "learning_rate": 2.586390324889748e-06, "loss": 0.82575798, "num_input_tokens_seen": 75848925, "step": 3519, "time_per_iteration": 2.6210052967071533 }, { "auxiliary_loss_clip": 0.01180311, "auxiliary_loss_mlp": 0.0102579, "balance_loss_clip": 1.01416063, "balance_loss_mlp": 1.01841736, "epoch": 0.4232549750496002, "flos": 22999635475200.0, "grad_norm": 1.7408258944648816, "language_loss": 0.67516482, "learning_rate": 2.5856455425531003e-06, "loss": 0.69722581, "num_input_tokens_seen": 75870400, "step": 3520, "time_per_iteration": 2.7886176109313965 }, { "auxiliary_loss_clip": 0.01179079, "auxiliary_loss_mlp": 0.01028962, "balance_loss_clip": 1.01600671, "balance_loss_mlp": 1.02109468, "epoch": 0.4233752179402393, "flos": 21248343970560.0, "grad_norm": 1.7465405191546668, "language_loss": 0.80647218, "learning_rate": 2.5849006713744902e-06, "loss": 0.82855248, "num_input_tokens_seen": 75889195, "step": 3521, "time_per_iteration": 2.731779098510742 }, { "auxiliary_loss_clip": 0.01175917, "auxiliary_loss_mlp": 0.01023403, "balance_loss_clip": 0.97553778, "balance_loss_mlp": 1.01552892, "epoch": 0.42349546083087836, "flos": 20704297599360.0, "grad_norm": 2.474481652966291, "language_loss": 0.73457122, "learning_rate": 2.5841557114669135e-06, "loss": 0.75656444, "num_input_tokens_seen": 75906055, "step": 3522, "time_per_iteration": 2.7238004207611084 }, { "auxiliary_loss_clip": 0.01187349, "auxiliary_loss_mlp": 0.0102833, "balance_loss_clip": 1.05307317, "balance_loss_mlp": 1.01984835, "epoch": 0.42361570372151747, "flos": 18585065128320.0, "grad_norm": 2.5494333912374643, "language_loss": 0.6778096, "learning_rate": 2.58341066294338e-06, "loss": 0.69996643, "num_input_tokens_seen": 75922720, "step": 3523, "time_per_iteration": 2.637662887573242 }, { "auxiliary_loss_clip": 0.01182601, "auxiliary_loss_mlp": 0.01123586, "balance_loss_clip": 0.89914572, "balance_loss_mlp": 0.0, "epoch": 0.4237359466121566, "flos": 20959478795520.0, "grad_norm": 1.9338353109877333, "language_loss": 0.85130632, "learning_rate": 2.5826655259169124e-06, "loss": 0.87436819, "num_input_tokens_seen": 75941375, "step": 3524, "time_per_iteration": 2.8822524547576904 }, { "auxiliary_loss_clip": 0.01185171, "auxiliary_loss_mlp": 0.0102976, "balance_loss_clip": 1.05524135, "balance_loss_mlp": 1.02210069, "epoch": 0.42385618950279563, "flos": 18038181582720.0, "grad_norm": 1.6700602969881895, "language_loss": 0.90413254, "learning_rate": 2.5819203005005475e-06, "loss": 0.92628181, "num_input_tokens_seen": 75958710, "step": 3525, "time_per_iteration": 2.617565155029297 }, { "auxiliary_loss_clip": 0.01176147, "auxiliary_loss_mlp": 0.01025696, "balance_loss_clip": 0.97625577, "balance_loss_mlp": 1.01792121, "epoch": 0.42397643239343474, "flos": 23769129559680.0, "grad_norm": 1.4972282966197832, "language_loss": 0.78695357, "learning_rate": 2.581174986807336e-06, "loss": 0.808972, "num_input_tokens_seen": 75978945, "step": 3526, "time_per_iteration": 2.7416272163391113 }, { "auxiliary_loss_clip": 0.01171935, "auxiliary_loss_mlp": 0.01123648, "balance_loss_clip": 1.0130198, "balance_loss_mlp": 0.0, "epoch": 0.42409667528407385, "flos": 16545088016640.0, "grad_norm": 2.1500747761088173, "language_loss": 0.91356277, "learning_rate": 2.580429584950341e-06, "loss": 0.93651855, "num_input_tokens_seen": 75994695, "step": 3527, "time_per_iteration": 3.5729317665100098 }, { "auxiliary_loss_clip": 0.01190451, "auxiliary_loss_mlp": 0.0103041, "balance_loss_clip": 0.94123942, "balance_loss_mlp": 1.02261972, "epoch": 0.4242169181747129, "flos": 16034186920320.0, "grad_norm": 1.9037970757395017, "language_loss": 0.66399884, "learning_rate": 2.5796840950426397e-06, "loss": 0.68620753, "num_input_tokens_seen": 76011780, "step": 3528, "time_per_iteration": 3.6913881301879883 }, { "auxiliary_loss_clip": 0.01172571, "auxiliary_loss_mlp": 0.01023959, "balance_loss_clip": 1.01307893, "balance_loss_mlp": 1.01668751, "epoch": 0.424337161065352, "flos": 20084012611200.0, "grad_norm": 1.626014871802519, "language_loss": 0.65583676, "learning_rate": 2.578938517197322e-06, "loss": 0.67780209, "num_input_tokens_seen": 76029875, "step": 3529, "time_per_iteration": 2.679292678833008 }, { "auxiliary_loss_clip": 0.01172387, "auxiliary_loss_mlp": 0.01024698, "balance_loss_clip": 0.97576988, "balance_loss_mlp": 1.01682973, "epoch": 0.4244574039559911, "flos": 23878369797120.0, "grad_norm": 2.2181760865826208, "language_loss": 0.62420952, "learning_rate": 2.5781928515274916e-06, "loss": 0.64618039, "num_input_tokens_seen": 76048595, "step": 3530, "time_per_iteration": 2.6789796352386475 }, { "auxiliary_loss_clip": 0.01186894, "auxiliary_loss_mlp": 0.0103137, "balance_loss_clip": 1.01786053, "balance_loss_mlp": 1.02360034, "epoch": 0.4245776468466302, "flos": 17565920542080.0, "grad_norm": 1.853805409565906, "language_loss": 0.67917848, "learning_rate": 2.577447098146265e-06, "loss": 0.70136106, "num_input_tokens_seen": 76065770, "step": 3531, "time_per_iteration": 2.6861181259155273 }, { "auxiliary_loss_clip": 0.01181661, "auxiliary_loss_mlp": 0.01031749, "balance_loss_clip": 0.93873096, "balance_loss_mlp": 1.02368426, "epoch": 0.4246978897372693, "flos": 27776256958080.0, "grad_norm": 1.678432039846757, "language_loss": 0.79173154, "learning_rate": 2.5767012571667724e-06, "loss": 0.81386566, "num_input_tokens_seen": 76085250, "step": 3532, "time_per_iteration": 3.6959245204925537 }, { "auxiliary_loss_clip": 0.01182211, "auxiliary_loss_mlp": 0.01028228, "balance_loss_clip": 1.01286197, "balance_loss_mlp": 1.02041411, "epoch": 0.42481813262790835, "flos": 15596615439360.0, "grad_norm": 1.7381677229104024, "language_loss": 0.68479085, "learning_rate": 2.5759553287021587e-06, "loss": 0.70689523, "num_input_tokens_seen": 76103580, "step": 3533, "time_per_iteration": 2.6533095836639404 }, { "auxiliary_loss_clip": 0.01182959, "auxiliary_loss_mlp": 0.01030878, "balance_loss_clip": 0.97910643, "balance_loss_mlp": 1.02297997, "epoch": 0.42493837551854746, "flos": 23951088881280.0, "grad_norm": 1.6973311803780784, "language_loss": 0.77276015, "learning_rate": 2.5752093128655786e-06, "loss": 0.79489851, "num_input_tokens_seen": 76121825, "step": 3534, "time_per_iteration": 2.729245662689209 }, { "auxiliary_loss_clip": 0.01173751, "auxiliary_loss_mlp": 0.01029057, "balance_loss_clip": 0.97527158, "balance_loss_mlp": 1.02093852, "epoch": 0.4250586184091866, "flos": 20813466009600.0, "grad_norm": 1.9695889287800579, "language_loss": 0.74008518, "learning_rate": 2.574463209770204e-06, "loss": 0.76211327, "num_input_tokens_seen": 76141140, "step": 3535, "time_per_iteration": 3.6574535369873047 }, { "auxiliary_loss_clip": 0.01179342, "auxiliary_loss_mlp": 0.01025386, "balance_loss_clip": 0.93723893, "balance_loss_mlp": 1.01698136, "epoch": 0.42517886129982563, "flos": 30371018607360.0, "grad_norm": 1.8552474305021778, "language_loss": 0.79496706, "learning_rate": 2.5737170195292165e-06, "loss": 0.81701434, "num_input_tokens_seen": 76164475, "step": 3536, "time_per_iteration": 2.8359010219573975 }, { "auxiliary_loss_clip": 0.01181058, "auxiliary_loss_mlp": 0.01032956, "balance_loss_clip": 0.93637514, "balance_loss_mlp": 1.0252285, "epoch": 0.42529910419046474, "flos": 20080636732800.0, "grad_norm": 3.0987232662289625, "language_loss": 0.7792592, "learning_rate": 2.572970742255814e-06, "loss": 0.80139941, "num_input_tokens_seen": 76182965, "step": 3537, "time_per_iteration": 2.8177385330200195 }, { "auxiliary_loss_clip": 0.01179906, "auxiliary_loss_mlp": 0.01034229, "balance_loss_clip": 1.01477563, "balance_loss_mlp": 1.02661395, "epoch": 0.42541934708110385, "flos": 22632448694400.0, "grad_norm": 1.6726819906799482, "language_loss": 0.81270522, "learning_rate": 2.5722243780632046e-06, "loss": 0.83484662, "num_input_tokens_seen": 76201230, "step": 3538, "time_per_iteration": 2.717115879058838 }, { "auxiliary_loss_clip": 0.01098211, "auxiliary_loss_mlp": 0.01015145, "balance_loss_clip": 0.87407261, "balance_loss_mlp": 1.01315391, "epoch": 0.4255395899717429, "flos": 66200676186240.0, "grad_norm": 0.7558983276301114, "language_loss": 0.60465503, "learning_rate": 2.5714779270646125e-06, "loss": 0.62578857, "num_input_tokens_seen": 76262000, "step": 3539, "time_per_iteration": 3.4622533321380615 }, { "auxiliary_loss_clip": 0.01186395, "auxiliary_loss_mlp": 0.01124144, "balance_loss_clip": 0.97832501, "balance_loss_mlp": 0.0, "epoch": 0.425659832862382, "flos": 17931814433280.0, "grad_norm": 2.639077474985095, "language_loss": 0.77496946, "learning_rate": 2.5707313893732735e-06, "loss": 0.79807484, "num_input_tokens_seen": 76280540, "step": 3540, "time_per_iteration": 2.932847738265991 }, { "auxiliary_loss_clip": 0.01167588, "auxiliary_loss_mlp": 0.0102648, "balance_loss_clip": 0.81779414, "balance_loss_mlp": 1.01905584, "epoch": 0.4257800757530211, "flos": 24022550989440.0, "grad_norm": 1.9055796148469946, "language_loss": 0.7692821, "learning_rate": 2.5699847651024364e-06, "loss": 0.79122281, "num_input_tokens_seen": 76301180, "step": 3541, "time_per_iteration": 3.0103278160095215 }, { "auxiliary_loss_clip": 0.01181252, "auxiliary_loss_mlp": 0.01029856, "balance_loss_clip": 1.01688373, "balance_loss_mlp": 1.022686, "epoch": 0.4259003186436602, "flos": 23696015425920.0, "grad_norm": 2.233120930261395, "language_loss": 0.77069372, "learning_rate": 2.5692380543653627e-06, "loss": 0.79280484, "num_input_tokens_seen": 76319335, "step": 3542, "time_per_iteration": 2.942396879196167 }, { "auxiliary_loss_clip": 0.01189974, "auxiliary_loss_mlp": 0.01123918, "balance_loss_clip": 1.01734805, "balance_loss_mlp": 0.0, "epoch": 0.4260205615342993, "flos": 15259772672640.0, "grad_norm": 1.8451614060838075, "language_loss": 0.69796824, "learning_rate": 2.5684912572753293e-06, "loss": 0.72110713, "num_input_tokens_seen": 76335010, "step": 3543, "time_per_iteration": 2.641726493835449 }, { "auxiliary_loss_clip": 0.011814, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.05306792, "balance_loss_mlp": 1.02241611, "epoch": 0.4261408044249384, "flos": 30665306736000.0, "grad_norm": 1.6405192321112863, "language_loss": 0.8347221, "learning_rate": 2.5677443739456245e-06, "loss": 0.8568294, "num_input_tokens_seen": 76356670, "step": 3544, "time_per_iteration": 2.687281370162964 }, { "auxiliary_loss_clip": 0.01186109, "auxiliary_loss_mlp": 0.01030878, "balance_loss_clip": 0.97983432, "balance_loss_mlp": 1.02326608, "epoch": 0.42626104731557746, "flos": 23257905240960.0, "grad_norm": 2.58946689460216, "language_loss": 0.79656994, "learning_rate": 2.5669974044895495e-06, "loss": 0.81873977, "num_input_tokens_seen": 76373065, "step": 3545, "time_per_iteration": 2.8160016536712646 }, { "auxiliary_loss_clip": 0.0118687, "auxiliary_loss_mlp": 0.01028416, "balance_loss_clip": 0.93772054, "balance_loss_mlp": 1.02037501, "epoch": 0.42638129020621657, "flos": 25884770670720.0, "grad_norm": 2.0568851971076194, "language_loss": 0.79647416, "learning_rate": 2.5662503490204187e-06, "loss": 0.81862706, "num_input_tokens_seen": 76393230, "step": 3546, "time_per_iteration": 2.795614719390869 }, { "auxiliary_loss_clip": 0.01177876, "auxiliary_loss_mlp": 0.01027664, "balance_loss_clip": 0.97388828, "balance_loss_mlp": 1.0196712, "epoch": 0.4265015330968556, "flos": 26502362138880.0, "grad_norm": 1.7521937898700994, "language_loss": 0.76027626, "learning_rate": 2.5655032076515603e-06, "loss": 0.78233165, "num_input_tokens_seen": 76412555, "step": 3547, "time_per_iteration": 2.705929756164551 }, { "auxiliary_loss_clip": 0.01183612, "auxiliary_loss_mlp": 0.01029824, "balance_loss_clip": 0.97806638, "balance_loss_mlp": 1.0221591, "epoch": 0.42662177598749473, "flos": 24389522288640.0, "grad_norm": 1.8915702184828371, "language_loss": 0.8179794, "learning_rate": 2.5647559804963155e-06, "loss": 0.84011376, "num_input_tokens_seen": 76432485, "step": 3548, "time_per_iteration": 2.7552804946899414 }, { "auxiliary_loss_clip": 0.0118775, "auxiliary_loss_mlp": 0.01032811, "balance_loss_clip": 0.89978743, "balance_loss_mlp": 1.02459192, "epoch": 0.42674201887813384, "flos": 23148629089920.0, "grad_norm": 2.0872169749077556, "language_loss": 0.78883994, "learning_rate": 2.5640086676680364e-06, "loss": 0.81104553, "num_input_tokens_seen": 76453980, "step": 3549, "time_per_iteration": 2.8426802158355713 }, { "auxiliary_loss_clip": 0.01184982, "auxiliary_loss_mlp": 0.01029932, "balance_loss_clip": 1.01637411, "balance_loss_mlp": 1.0221653, "epoch": 0.4268622617687729, "flos": 21689614552320.0, "grad_norm": 2.623188437974138, "language_loss": 0.80460703, "learning_rate": 2.5632612692800923e-06, "loss": 0.82675624, "num_input_tokens_seen": 76473045, "step": 3550, "time_per_iteration": 2.702329397201538 }, { "auxiliary_loss_clip": 0.01181528, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 0.93877786, "balance_loss_mlp": 1.0272007, "epoch": 0.426982504659412, "flos": 23440151871360.0, "grad_norm": 2.002040147750967, "language_loss": 0.74993491, "learning_rate": 2.5625137854458603e-06, "loss": 0.77210772, "num_input_tokens_seen": 76492060, "step": 3551, "time_per_iteration": 2.7306578159332275 }, { "auxiliary_loss_clip": 0.01182256, "auxiliary_loss_mlp": 0.01025567, "balance_loss_clip": 0.97567594, "balance_loss_mlp": 1.01824427, "epoch": 0.4271027475500511, "flos": 18916556768640.0, "grad_norm": 1.7379835844331653, "language_loss": 0.79896235, "learning_rate": 2.561766216278735e-06, "loss": 0.82104057, "num_input_tokens_seen": 76509655, "step": 3552, "time_per_iteration": 2.739915609359741 }, { "auxiliary_loss_clip": 0.01181215, "auxiliary_loss_mlp": 0.01023365, "balance_loss_clip": 0.90079165, "balance_loss_mlp": 1.01581597, "epoch": 0.4272229904406902, "flos": 26870554500480.0, "grad_norm": 1.8438382723805489, "language_loss": 0.8168211, "learning_rate": 2.561018561892121e-06, "loss": 0.83886695, "num_input_tokens_seen": 76528795, "step": 3553, "time_per_iteration": 4.752245187759399 }, { "auxiliary_loss_clip": 0.01176857, "auxiliary_loss_mlp": 0.01027117, "balance_loss_clip": 0.97337878, "balance_loss_mlp": 1.01964903, "epoch": 0.4273432333313293, "flos": 23951376190080.0, "grad_norm": 1.625175873063264, "language_loss": 0.7661438, "learning_rate": 2.5602708223994363e-06, "loss": 0.78818345, "num_input_tokens_seen": 76550660, "step": 3554, "time_per_iteration": 2.825624465942383 }, { "auxiliary_loss_clip": 0.01181046, "auxiliary_loss_mlp": 0.01023229, "balance_loss_clip": 0.93535209, "balance_loss_mlp": 1.01516461, "epoch": 0.4274634762219684, "flos": 29570354496000.0, "grad_norm": 2.233836863702708, "language_loss": 0.68092692, "learning_rate": 2.559522997914115e-06, "loss": 0.70296973, "num_input_tokens_seen": 76570240, "step": 3555, "time_per_iteration": 2.7911128997802734 }, { "auxiliary_loss_clip": 0.01185877, "auxiliary_loss_mlp": 0.01025107, "balance_loss_clip": 1.05702925, "balance_loss_mlp": 1.01824915, "epoch": 0.42758371911260745, "flos": 21434146047360.0, "grad_norm": 1.9376829785432201, "language_loss": 0.84504938, "learning_rate": 2.558775088549599e-06, "loss": 0.86715925, "num_input_tokens_seen": 76589820, "step": 3556, "time_per_iteration": 2.696614980697632 }, { "auxiliary_loss_clip": 0.01189858, "auxiliary_loss_mlp": 0.01029144, "balance_loss_clip": 1.01627588, "balance_loss_mlp": 1.02067983, "epoch": 0.42770396200324656, "flos": 14752822072320.0, "grad_norm": 2.5479944808969237, "language_loss": 0.66570121, "learning_rate": 2.5580270944193467e-06, "loss": 0.68789124, "num_input_tokens_seen": 76606640, "step": 3557, "time_per_iteration": 2.654890298843384 }, { "auxiliary_loss_clip": 0.01089887, "auxiliary_loss_mlp": 0.0100325, "balance_loss_clip": 1.02408433, "balance_loss_mlp": 1.00127113, "epoch": 0.4278242048938857, "flos": 70654712601600.0, "grad_norm": 0.8293281087112214, "language_loss": 0.55560112, "learning_rate": 2.557279015636827e-06, "loss": 0.57653248, "num_input_tokens_seen": 76667050, "step": 3558, "time_per_iteration": 4.112024307250977 }, { "auxiliary_loss_clip": 0.01089664, "auxiliary_loss_mlp": 0.01003853, "balance_loss_clip": 0.98709649, "balance_loss_mlp": 1.00185013, "epoch": 0.42794444778452473, "flos": 69366165033600.0, "grad_norm": 0.7660591513436, "language_loss": 0.61232287, "learning_rate": 2.5565308523155245e-06, "loss": 0.63325804, "num_input_tokens_seen": 76726650, "step": 3559, "time_per_iteration": 3.1536922454833984 }, { "auxiliary_loss_clip": 0.01171264, "auxiliary_loss_mlp": 0.01029479, "balance_loss_clip": 0.90045649, "balance_loss_mlp": 1.02146173, "epoch": 0.42806469067516384, "flos": 18215328481920.0, "grad_norm": 2.302432576615666, "language_loss": 0.81917417, "learning_rate": 2.5557826045689336e-06, "loss": 0.84118158, "num_input_tokens_seen": 76742890, "step": 3560, "time_per_iteration": 2.7516438961029053 }, { "auxiliary_loss_clip": 0.01103094, "auxiliary_loss_mlp": 0.00999345, "balance_loss_clip": 0.92473114, "balance_loss_mlp": 0.99716395, "epoch": 0.4281849335658029, "flos": 54535814432640.0, "grad_norm": 0.8363095351618923, "language_loss": 0.58873427, "learning_rate": 2.5550342725105643e-06, "loss": 0.60975862, "num_input_tokens_seen": 76801055, "step": 3561, "time_per_iteration": 4.110677719116211 }, { "auxiliary_loss_clip": 0.01186937, "auxiliary_loss_mlp": 0.01034863, "balance_loss_clip": 1.02014267, "balance_loss_mlp": 1.0273056, "epoch": 0.428305176456442, "flos": 17274828723840.0, "grad_norm": 1.6066414906206221, "language_loss": 0.8100487, "learning_rate": 2.554285856253937e-06, "loss": 0.83226669, "num_input_tokens_seen": 76819890, "step": 3562, "time_per_iteration": 2.6920993328094482 }, { "auxiliary_loss_clip": 0.0118567, "auxiliary_loss_mlp": 0.01027852, "balance_loss_clip": 0.98184228, "balance_loss_mlp": 1.01978719, "epoch": 0.4284254193470811, "flos": 26359509749760.0, "grad_norm": 1.6581786891644894, "language_loss": 0.77361012, "learning_rate": 2.5535373559125855e-06, "loss": 0.79574537, "num_input_tokens_seen": 76840255, "step": 3563, "time_per_iteration": 2.7088704109191895 }, { "auxiliary_loss_clip": 0.01180162, "auxiliary_loss_mlp": 0.01029142, "balance_loss_clip": 0.82407844, "balance_loss_mlp": 1.02047586, "epoch": 0.42854566223772017, "flos": 29714248379520.0, "grad_norm": 1.5523560213627727, "language_loss": 0.81675291, "learning_rate": 2.552788771600057e-06, "loss": 0.83884597, "num_input_tokens_seen": 76860565, "step": 3564, "time_per_iteration": 2.9472196102142334 }, { "auxiliary_loss_clip": 0.01186738, "auxiliary_loss_mlp": 0.01030873, "balance_loss_clip": 0.94120413, "balance_loss_mlp": 1.02299297, "epoch": 0.4286659051283593, "flos": 22018161277440.0, "grad_norm": 1.710166150888007, "language_loss": 0.81898177, "learning_rate": 2.5520401034299118e-06, "loss": 0.84115779, "num_input_tokens_seen": 76878325, "step": 3565, "time_per_iteration": 3.101139783859253 }, { "auxiliary_loss_clip": 0.0118826, "auxiliary_loss_mlp": 0.01026954, "balance_loss_clip": 1.01773524, "balance_loss_mlp": 1.01809645, "epoch": 0.4287861480189984, "flos": 13334422838400.0, "grad_norm": 1.9132178074290467, "language_loss": 0.87828714, "learning_rate": 2.551291351515722e-06, "loss": 0.90043926, "num_input_tokens_seen": 76895340, "step": 3566, "time_per_iteration": 2.687917947769165 }, { "auxiliary_loss_clip": 0.01174848, "auxiliary_loss_mlp": 0.01124607, "balance_loss_clip": 0.93483424, "balance_loss_mlp": 0.0, "epoch": 0.42890639090963745, "flos": 26651535321600.0, "grad_norm": 1.5243591269866772, "language_loss": 0.85288048, "learning_rate": 2.5505425159710726e-06, "loss": 0.87587506, "num_input_tokens_seen": 76915150, "step": 3567, "time_per_iteration": 2.7880496978759766 }, { "auxiliary_loss_clip": 0.01189741, "auxiliary_loss_mlp": 0.01124144, "balance_loss_clip": 0.97643828, "balance_loss_mlp": 0.0, "epoch": 0.42902663380027656, "flos": 24055768091520.0, "grad_norm": 4.40086187876193, "language_loss": 0.82693613, "learning_rate": 2.549793596909561e-06, "loss": 0.85007501, "num_input_tokens_seen": 76933770, "step": 3568, "time_per_iteration": 2.7392804622650146 }, { "auxiliary_loss_clip": 0.01181868, "auxiliary_loss_mlp": 0.01027297, "balance_loss_clip": 0.97835726, "balance_loss_mlp": 1.01948881, "epoch": 0.42914687669091567, "flos": 15632561975040.0, "grad_norm": 3.8772601303098977, "language_loss": 0.65938187, "learning_rate": 2.5490445944447976e-06, "loss": 0.68147349, "num_input_tokens_seen": 76952265, "step": 3569, "time_per_iteration": 2.63464617729187 }, { "auxiliary_loss_clip": 0.01188332, "auxiliary_loss_mlp": 0.01028795, "balance_loss_clip": 1.01797295, "balance_loss_mlp": 1.02074218, "epoch": 0.4292671195815547, "flos": 31467802440960.0, "grad_norm": 1.8443292970383252, "language_loss": 0.64837712, "learning_rate": 2.548295508690406e-06, "loss": 0.67054838, "num_input_tokens_seen": 76973560, "step": 3570, "time_per_iteration": 2.741594076156616 }, { "auxiliary_loss_clip": 0.01185226, "auxiliary_loss_mlp": 0.01028094, "balance_loss_clip": 1.0146457, "balance_loss_mlp": 1.0204711, "epoch": 0.42938736247219383, "flos": 30257756046720.0, "grad_norm": 1.6631241047681982, "language_loss": 0.76677001, "learning_rate": 2.5475463397600217e-06, "loss": 0.78890318, "num_input_tokens_seen": 76993640, "step": 3571, "time_per_iteration": 2.7270405292510986 }, { "auxiliary_loss_clip": 0.0118689, "auxiliary_loss_mlp": 0.01038945, "balance_loss_clip": 1.05418897, "balance_loss_mlp": 1.03017688, "epoch": 0.42950760536283294, "flos": 29349683291520.0, "grad_norm": 1.8519298920933012, "language_loss": 0.77306545, "learning_rate": 2.546797087767293e-06, "loss": 0.79532373, "num_input_tokens_seen": 77013765, "step": 3572, "time_per_iteration": 2.6591060161590576 }, { "auxiliary_loss_clip": 0.01178613, "auxiliary_loss_mlp": 0.01026106, "balance_loss_clip": 0.90128392, "balance_loss_mlp": 1.01789784, "epoch": 0.429627848253472, "flos": 26869943969280.0, "grad_norm": 1.7902384330988332, "language_loss": 0.87308812, "learning_rate": 2.546047752825881e-06, "loss": 0.89513528, "num_input_tokens_seen": 77034370, "step": 3573, "time_per_iteration": 2.818284749984741 }, { "auxiliary_loss_clip": 0.01186189, "auxiliary_loss_mlp": 0.01025874, "balance_loss_clip": 0.90073776, "balance_loss_mlp": 1.0181253, "epoch": 0.4297480911441111, "flos": 13881270470400.0, "grad_norm": 2.1326133331514163, "language_loss": 0.93034434, "learning_rate": 2.5452983350494595e-06, "loss": 0.95246506, "num_input_tokens_seen": 77049925, "step": 3574, "time_per_iteration": 2.7369704246520996 }, { "auxiliary_loss_clip": 0.0118833, "auxiliary_loss_mlp": 0.01123725, "balance_loss_clip": 1.01788807, "balance_loss_mlp": 0.0, "epoch": 0.4298683340347502, "flos": 20741141975040.0, "grad_norm": 2.105046232245987, "language_loss": 0.65467417, "learning_rate": 2.544548834551713e-06, "loss": 0.67779469, "num_input_tokens_seen": 77068930, "step": 3575, "time_per_iteration": 2.7293717861175537 }, { "auxiliary_loss_clip": 0.01180138, "auxiliary_loss_mlp": 0.01124369, "balance_loss_clip": 0.94063354, "balance_loss_mlp": 0.0, "epoch": 0.4299885769253893, "flos": 20882126856960.0, "grad_norm": 2.3554236020413275, "language_loss": 0.94412887, "learning_rate": 2.5437992514463424e-06, "loss": 0.96717393, "num_input_tokens_seen": 77082255, "step": 3576, "time_per_iteration": 2.738337516784668 }, { "auxiliary_loss_clip": 0.01183365, "auxiliary_loss_mlp": 0.01028683, "balance_loss_clip": 1.01535296, "balance_loss_mlp": 1.02098465, "epoch": 0.4301088198160284, "flos": 25484618183040.0, "grad_norm": 2.2575908857764273, "language_loss": 0.88255417, "learning_rate": 2.5430495858470565e-06, "loss": 0.90467465, "num_input_tokens_seen": 77101725, "step": 3577, "time_per_iteration": 2.7711431980133057 }, { "auxiliary_loss_clip": 0.0118321, "auxiliary_loss_mlp": 0.0102528, "balance_loss_clip": 1.01758087, "balance_loss_mlp": 1.01761198, "epoch": 0.43022906270666744, "flos": 18259427404800.0, "grad_norm": 3.06544659674576, "language_loss": 0.77342898, "learning_rate": 2.54229983786758e-06, "loss": 0.79551387, "num_input_tokens_seen": 77119670, "step": 3578, "time_per_iteration": 2.704608201980591 }, { "auxiliary_loss_clip": 0.01181784, "auxiliary_loss_mlp": 0.01031474, "balance_loss_clip": 0.97506005, "balance_loss_mlp": 1.0236001, "epoch": 0.43034930559730655, "flos": 23399536567680.0, "grad_norm": 2.3647631327451313, "language_loss": 0.85033154, "learning_rate": 2.541550007621651e-06, "loss": 0.87246412, "num_input_tokens_seen": 77138160, "step": 3579, "time_per_iteration": 4.108766794204712 }, { "auxiliary_loss_clip": 0.01184435, "auxiliary_loss_mlp": 0.01025412, "balance_loss_clip": 1.01830196, "balance_loss_mlp": 1.01791334, "epoch": 0.43046954848794566, "flos": 28184382264960.0, "grad_norm": 1.7429632600966787, "language_loss": 0.80015266, "learning_rate": 2.5408000952230156e-06, "loss": 0.82225114, "num_input_tokens_seen": 77156950, "step": 3580, "time_per_iteration": 3.669327735900879 }, { "auxiliary_loss_clip": 0.0119348, "auxiliary_loss_mlp": 0.01026327, "balance_loss_clip": 0.93947053, "balance_loss_mlp": 1.0179168, "epoch": 0.4305897913785847, "flos": 28580476515840.0, "grad_norm": 1.9512174181672175, "language_loss": 0.90149701, "learning_rate": 2.5400501007854357e-06, "loss": 0.92369509, "num_input_tokens_seen": 77176395, "step": 3581, "time_per_iteration": 2.824557065963745 }, { "auxiliary_loss_clip": 0.01181184, "auxiliary_loss_mlp": 0.01026078, "balance_loss_clip": 0.89823067, "balance_loss_mlp": 1.01835299, "epoch": 0.43071003426922383, "flos": 20448721353600.0, "grad_norm": 3.4642596981334375, "language_loss": 0.75328833, "learning_rate": 2.539300024422685e-06, "loss": 0.77536094, "num_input_tokens_seen": 77194340, "step": 3582, "time_per_iteration": 2.8606865406036377 }, { "auxiliary_loss_clip": 0.01094187, "auxiliary_loss_mlp": 0.01004222, "balance_loss_clip": 0.91073513, "balance_loss_mlp": 1.0022788, "epoch": 0.43083027715986294, "flos": 51997969883520.0, "grad_norm": 1.1187131942463926, "language_loss": 0.60919964, "learning_rate": 2.538549866248549e-06, "loss": 0.6301837, "num_input_tokens_seen": 77249320, "step": 3583, "time_per_iteration": 3.149294137954712 }, { "auxiliary_loss_clip": 0.01186209, "auxiliary_loss_mlp": 0.01024835, "balance_loss_clip": 1.01648545, "balance_loss_mlp": 1.01665163, "epoch": 0.430950520050502, "flos": 16690885320960.0, "grad_norm": 2.0741073503794074, "language_loss": 0.81171656, "learning_rate": 2.5377996263768274e-06, "loss": 0.83382702, "num_input_tokens_seen": 77267400, "step": 3584, "time_per_iteration": 2.6656668186187744 }, { "auxiliary_loss_clip": 0.01177577, "auxiliary_loss_mlp": 0.01029119, "balance_loss_clip": 1.01368785, "balance_loss_mlp": 1.02066112, "epoch": 0.4310707629411411, "flos": 24608433726720.0, "grad_norm": 1.7644712520038264, "language_loss": 0.68118042, "learning_rate": 2.5370493049213293e-06, "loss": 0.70324737, "num_input_tokens_seen": 77287045, "step": 3585, "time_per_iteration": 3.627284049987793 }, { "auxiliary_loss_clip": 0.0117347, "auxiliary_loss_mlp": 0.01030487, "balance_loss_clip": 0.7862305, "balance_loss_mlp": 1.02211881, "epoch": 0.4311910058317802, "flos": 26432983019520.0, "grad_norm": 2.0494299085361916, "language_loss": 0.80073869, "learning_rate": 2.536298901995878e-06, "loss": 0.82277822, "num_input_tokens_seen": 77306255, "step": 3586, "time_per_iteration": 2.9785990715026855 }, { "auxiliary_loss_clip": 0.01185303, "auxiliary_loss_mlp": 0.010281, "balance_loss_clip": 0.97804934, "balance_loss_mlp": 1.02018487, "epoch": 0.43131124872241927, "flos": 25155891889920.0, "grad_norm": 1.6522923519936938, "language_loss": 0.80082905, "learning_rate": 2.535548417714311e-06, "loss": 0.82296312, "num_input_tokens_seen": 77325555, "step": 3587, "time_per_iteration": 4.191251277923584 }, { "auxiliary_loss_clip": 0.01186452, "auxiliary_loss_mlp": 0.01028314, "balance_loss_clip": 1.01315284, "balance_loss_mlp": 1.02024961, "epoch": 0.4314314916130584, "flos": 21614812479360.0, "grad_norm": 1.4663146345769913, "language_loss": 0.87210977, "learning_rate": 2.534797852190474e-06, "loss": 0.89425743, "num_input_tokens_seen": 77345735, "step": 3588, "time_per_iteration": 2.7000906467437744 }, { "auxiliary_loss_clip": 0.01179865, "auxiliary_loss_mlp": 0.0103012, "balance_loss_clip": 1.01388788, "balance_loss_mlp": 1.02275252, "epoch": 0.4315517345036975, "flos": 19275016544640.0, "grad_norm": 1.7916700857020165, "language_loss": 0.81307459, "learning_rate": 2.5340472055382283e-06, "loss": 0.8351745, "num_input_tokens_seen": 77361765, "step": 3589, "time_per_iteration": 2.696242332458496 }, { "auxiliary_loss_clip": 0.01178381, "auxiliary_loss_mlp": 0.01025783, "balance_loss_clip": 0.93531954, "balance_loss_mlp": 1.01835048, "epoch": 0.43167197739433655, "flos": 24273853516800.0, "grad_norm": 1.8826273944498801, "language_loss": 0.81102872, "learning_rate": 2.5332964778714468e-06, "loss": 0.83307028, "num_input_tokens_seen": 77378950, "step": 3590, "time_per_iteration": 2.697474718093872 }, { "auxiliary_loss_clip": 0.01181909, "auxiliary_loss_mlp": 0.01032183, "balance_loss_clip": 0.94211638, "balance_loss_mlp": 1.02453542, "epoch": 0.43179222028497566, "flos": 16867816738560.0, "grad_norm": 1.575704158628241, "language_loss": 0.6620512, "learning_rate": 2.5325456693040123e-06, "loss": 0.68419218, "num_input_tokens_seen": 77396145, "step": 3591, "time_per_iteration": 2.695726156234741 }, { "auxiliary_loss_clip": 0.0119156, "auxiliary_loss_mlp": 0.01028509, "balance_loss_clip": 1.01556897, "balance_loss_mlp": 1.02013469, "epoch": 0.43191246317561477, "flos": 17639214243840.0, "grad_norm": 2.0051801769662605, "language_loss": 0.74818623, "learning_rate": 2.531794779949824e-06, "loss": 0.77038687, "num_input_tokens_seen": 77414045, "step": 3592, "time_per_iteration": 2.612287998199463 }, { "auxiliary_loss_clip": 0.01171865, "auxiliary_loss_mlp": 0.01029717, "balance_loss_clip": 0.93719065, "balance_loss_mlp": 1.02162242, "epoch": 0.4320327060662538, "flos": 23878800760320.0, "grad_norm": 1.8196519979761998, "language_loss": 0.87794018, "learning_rate": 2.5310438099227903e-06, "loss": 0.89995599, "num_input_tokens_seen": 77431310, "step": 3593, "time_per_iteration": 2.673452615737915 }, { "auxiliary_loss_clip": 0.01089191, "auxiliary_loss_mlp": 0.01002785, "balance_loss_clip": 0.98527336, "balance_loss_mlp": 1.00081778, "epoch": 0.43215294895689293, "flos": 66394917959040.0, "grad_norm": 0.8166284628052796, "language_loss": 0.53416699, "learning_rate": 2.530292759336833e-06, "loss": 0.55508673, "num_input_tokens_seen": 77492045, "step": 3594, "time_per_iteration": 3.289127826690674 }, { "auxiliary_loss_clip": 0.01181735, "auxiliary_loss_mlp": 0.01031119, "balance_loss_clip": 0.97832859, "balance_loss_mlp": 1.02243471, "epoch": 0.432273191847532, "flos": 20594267262720.0, "grad_norm": 2.606292905015003, "language_loss": 0.69627738, "learning_rate": 2.5295416283058855e-06, "loss": 0.71840596, "num_input_tokens_seen": 77510910, "step": 3595, "time_per_iteration": 2.623948097229004 }, { "auxiliary_loss_clip": 0.01178416, "auxiliary_loss_mlp": 0.01124193, "balance_loss_clip": 0.97829288, "balance_loss_mlp": 0.0, "epoch": 0.4323934347381711, "flos": 19282127437440.0, "grad_norm": 1.7125916560748982, "language_loss": 0.65962172, "learning_rate": 2.5287904169438943e-06, "loss": 0.68264782, "num_input_tokens_seen": 77530115, "step": 3596, "time_per_iteration": 2.671678304672241 }, { "auxiliary_loss_clip": 0.01186241, "auxiliary_loss_mlp": 0.01023481, "balance_loss_clip": 0.8262971, "balance_loss_mlp": 1.01506519, "epoch": 0.4325136776288102, "flos": 21726315273600.0, "grad_norm": 2.404922049567735, "language_loss": 0.63657314, "learning_rate": 2.528039125364817e-06, "loss": 0.65867043, "num_input_tokens_seen": 77548920, "step": 3597, "time_per_iteration": 2.909273862838745 }, { "auxiliary_loss_clip": 0.0118355, "auxiliary_loss_mlp": 0.010265, "balance_loss_clip": 0.93957335, "balance_loss_mlp": 1.01794672, "epoch": 0.43263392051944927, "flos": 22340746344960.0, "grad_norm": 3.1295667410986847, "language_loss": 0.75546128, "learning_rate": 2.5272877536826246e-06, "loss": 0.77756178, "num_input_tokens_seen": 77567715, "step": 3598, "time_per_iteration": 3.2706127166748047 }, { "auxiliary_loss_clip": 0.01181355, "auxiliary_loss_mlp": 0.01023846, "balance_loss_clip": 0.89760202, "balance_loss_mlp": 1.01608002, "epoch": 0.4327541634100884, "flos": 29168406328320.0, "grad_norm": 2.9042307159306824, "language_loss": 0.70045257, "learning_rate": 2.5265363020112986e-06, "loss": 0.72250462, "num_input_tokens_seen": 77588035, "step": 3599, "time_per_iteration": 2.8505144119262695 }, { "auxiliary_loss_clip": 0.01184262, "auxiliary_loss_mlp": 0.01027486, "balance_loss_clip": 1.01729989, "balance_loss_mlp": 1.01962399, "epoch": 0.4328744063007275, "flos": 26067448264320.0, "grad_norm": 2.0298283432202644, "language_loss": 0.83795977, "learning_rate": 2.5257847704648344e-06, "loss": 0.86007726, "num_input_tokens_seen": 77609265, "step": 3600, "time_per_iteration": 2.668567657470703 }, { "auxiliary_loss_clip": 0.01184924, "auxiliary_loss_mlp": 0.01033055, "balance_loss_clip": 1.05487633, "balance_loss_mlp": 1.02535105, "epoch": 0.43299464919136654, "flos": 16581357774720.0, "grad_norm": 1.6617780828272366, "language_loss": 0.75283313, "learning_rate": 2.525033159157239e-06, "loss": 0.77501297, "num_input_tokens_seen": 77625580, "step": 3601, "time_per_iteration": 2.6219189167022705 }, { "auxiliary_loss_clip": 0.01180254, "auxiliary_loss_mlp": 0.01033081, "balance_loss_clip": 1.01485014, "balance_loss_mlp": 1.0244565, "epoch": 0.43311489208200565, "flos": 16107265140480.0, "grad_norm": 1.71189641964567, "language_loss": 0.77188396, "learning_rate": 2.52428146820253e-06, "loss": 0.79401726, "num_input_tokens_seen": 77643835, "step": 3602, "time_per_iteration": 2.685880422592163 }, { "auxiliary_loss_clip": 0.01183996, "auxiliary_loss_mlp": 0.01027317, "balance_loss_clip": 0.9389298, "balance_loss_mlp": 1.01857877, "epoch": 0.43323513497264476, "flos": 22930220442240.0, "grad_norm": 1.8671233898094108, "language_loss": 0.81647098, "learning_rate": 2.52352969771474e-06, "loss": 0.83858413, "num_input_tokens_seen": 77663060, "step": 3603, "time_per_iteration": 2.7238776683807373 }, { "auxiliary_loss_clip": 0.01181683, "auxiliary_loss_mlp": 0.01028667, "balance_loss_clip": 0.9756465, "balance_loss_mlp": 1.02117455, "epoch": 0.4333553778632838, "flos": 25299031587840.0, "grad_norm": 2.4289679500521926, "language_loss": 0.8841393, "learning_rate": 2.5227778478079106e-06, "loss": 0.90624279, "num_input_tokens_seen": 77682470, "step": 3604, "time_per_iteration": 2.7915492057800293 }, { "auxiliary_loss_clip": 0.01178865, "auxiliary_loss_mlp": 0.01026215, "balance_loss_clip": 1.01396728, "balance_loss_mlp": 1.01852655, "epoch": 0.43347562075392293, "flos": 19387165783680.0, "grad_norm": 1.6214306322162644, "language_loss": 0.76729697, "learning_rate": 2.522025918596098e-06, "loss": 0.78934777, "num_input_tokens_seen": 77700770, "step": 3605, "time_per_iteration": 4.853153944015503 }, { "auxiliary_loss_clip": 0.01184662, "auxiliary_loss_mlp": 0.01024697, "balance_loss_clip": 1.01559472, "balance_loss_mlp": 1.01681423, "epoch": 0.43359586364456204, "flos": 26325969425280.0, "grad_norm": 1.4891972521630263, "language_loss": 0.65513104, "learning_rate": 2.521273910193368e-06, "loss": 0.67722464, "num_input_tokens_seen": 77723950, "step": 3606, "time_per_iteration": 2.720045566558838 }, { "auxiliary_loss_clip": 0.01189075, "auxiliary_loss_mlp": 0.01030517, "balance_loss_clip": 1.01534462, "balance_loss_mlp": 1.02247643, "epoch": 0.4337161065352011, "flos": 15989261984640.0, "grad_norm": 2.1704771106231067, "language_loss": 0.87107944, "learning_rate": 2.5205218227138006e-06, "loss": 0.89327538, "num_input_tokens_seen": 77736905, "step": 3607, "time_per_iteration": 2.6306116580963135 }, { "auxiliary_loss_clip": 0.01186324, "auxiliary_loss_mlp": 0.01029703, "balance_loss_clip": 1.05410361, "balance_loss_mlp": 1.02154899, "epoch": 0.4338363494258402, "flos": 20224710184320.0, "grad_norm": 1.910914770297639, "language_loss": 0.79187739, "learning_rate": 2.519769656271486e-06, "loss": 0.81403768, "num_input_tokens_seen": 77754325, "step": 3608, "time_per_iteration": 2.6045942306518555 }, { "auxiliary_loss_clip": 0.01171675, "auxiliary_loss_mlp": 0.01034771, "balance_loss_clip": 0.90095812, "balance_loss_mlp": 1.02665925, "epoch": 0.43395659231647926, "flos": 20083904870400.0, "grad_norm": 1.9442464803437143, "language_loss": 0.67842376, "learning_rate": 2.5190174109805285e-06, "loss": 0.70048821, "num_input_tokens_seen": 77774150, "step": 3609, "time_per_iteration": 2.8039252758026123 }, { "auxiliary_loss_clip": 0.01172138, "auxiliary_loss_mlp": 0.01030311, "balance_loss_clip": 0.97378743, "balance_loss_mlp": 1.0222944, "epoch": 0.43407683520711837, "flos": 19901801894400.0, "grad_norm": 2.061429184227631, "language_loss": 0.64376402, "learning_rate": 2.518265086955042e-06, "loss": 0.66578853, "num_input_tokens_seen": 77791870, "step": 3610, "time_per_iteration": 2.608623504638672 }, { "auxiliary_loss_clip": 0.01185162, "auxiliary_loss_mlp": 0.01030976, "balance_loss_clip": 1.05400479, "balance_loss_mlp": 1.02272058, "epoch": 0.4341970780977575, "flos": 23108732058240.0, "grad_norm": 2.2773894973666127, "language_loss": 0.83811617, "learning_rate": 2.5175126843091534e-06, "loss": 0.86027753, "num_input_tokens_seen": 77811240, "step": 3611, "time_per_iteration": 3.5107758045196533 }, { "auxiliary_loss_clip": 0.01189531, "auxiliary_loss_mlp": 0.0103065, "balance_loss_clip": 0.97881168, "balance_loss_mlp": 1.02247262, "epoch": 0.43431732098839654, "flos": 37408288406400.0, "grad_norm": 2.043705757102316, "language_loss": 0.75280637, "learning_rate": 2.5167602031570034e-06, "loss": 0.7750082, "num_input_tokens_seen": 77831425, "step": 3612, "time_per_iteration": 2.795139789581299 }, { "auxiliary_loss_clip": 0.01185008, "auxiliary_loss_mlp": 0.01027157, "balance_loss_clip": 1.0542767, "balance_loss_mlp": 1.01928377, "epoch": 0.43443756387903565, "flos": 31868206323840.0, "grad_norm": 1.758639624824342, "language_loss": 0.73319292, "learning_rate": 2.51600764361274e-06, "loss": 0.75531453, "num_input_tokens_seen": 77852950, "step": 3613, "time_per_iteration": 3.645063638687134 }, { "auxiliary_loss_clip": 0.01187672, "auxiliary_loss_mlp": 0.01030279, "balance_loss_clip": 1.05575466, "balance_loss_mlp": 1.02169013, "epoch": 0.43455780676967476, "flos": 23477139901440.0, "grad_norm": 2.499656825063409, "language_loss": 0.78889835, "learning_rate": 2.5152550057905283e-06, "loss": 0.81107783, "num_input_tokens_seen": 77872840, "step": 3614, "time_per_iteration": 2.6065306663513184 }, { "auxiliary_loss_clip": 0.0118648, "auxiliary_loss_mlp": 0.01124192, "balance_loss_clip": 1.01703942, "balance_loss_mlp": 0.0, "epoch": 0.4346780496603138, "flos": 24207060176640.0, "grad_norm": 2.3376905694241974, "language_loss": 0.77219176, "learning_rate": 2.5145022898045415e-06, "loss": 0.79529846, "num_input_tokens_seen": 77892025, "step": 3615, "time_per_iteration": 2.6345746517181396 }, { "auxiliary_loss_clip": 0.0118525, "auxiliary_loss_mlp": 0.01026038, "balance_loss_clip": 0.975999, "balance_loss_mlp": 1.01759183, "epoch": 0.4347982925509529, "flos": 17092366611840.0, "grad_norm": 1.9891909229242413, "language_loss": 0.89716196, "learning_rate": 2.5137494957689664e-06, "loss": 0.91927481, "num_input_tokens_seen": 77907635, "step": 3616, "time_per_iteration": 2.6725754737854004 }, { "auxiliary_loss_clip": 0.01087608, "auxiliary_loss_mlp": 0.01003386, "balance_loss_clip": 0.94587958, "balance_loss_mlp": 1.00135922, "epoch": 0.43491853544159204, "flos": 60945544696320.0, "grad_norm": 0.7874515006927079, "language_loss": 0.57374156, "learning_rate": 2.5129966237980016e-06, "loss": 0.59465146, "num_input_tokens_seen": 77970630, "step": 3617, "time_per_iteration": 3.280794143676758 }, { "auxiliary_loss_clip": 0.01181828, "auxiliary_loss_mlp": 0.01024118, "balance_loss_clip": 0.93829083, "balance_loss_mlp": 1.01647687, "epoch": 0.4350387783322311, "flos": 21944652094080.0, "grad_norm": 2.121473599068382, "language_loss": 0.78524023, "learning_rate": 2.512243674005857e-06, "loss": 0.80729967, "num_input_tokens_seen": 77989995, "step": 3618, "time_per_iteration": 2.728649616241455 }, { "auxiliary_loss_clip": 0.01174491, "auxiliary_loss_mlp": 0.01030891, "balance_loss_clip": 0.85998952, "balance_loss_mlp": 1.02313638, "epoch": 0.4351590212228702, "flos": 25082705928960.0, "grad_norm": 1.7139077657812427, "language_loss": 0.86380041, "learning_rate": 2.5114906465067537e-06, "loss": 0.88585418, "num_input_tokens_seen": 78010980, "step": 3619, "time_per_iteration": 2.884364366531372 }, { "auxiliary_loss_clip": 0.01184187, "auxiliary_loss_mlp": 0.01029883, "balance_loss_clip": 1.01335418, "balance_loss_mlp": 1.02247453, "epoch": 0.4352792641135093, "flos": 21506541909120.0, "grad_norm": 2.153724679409345, "language_loss": 0.74703264, "learning_rate": 2.5107375414149264e-06, "loss": 0.76917338, "num_input_tokens_seen": 78030225, "step": 3620, "time_per_iteration": 2.760944128036499 }, { "auxiliary_loss_clip": 0.0117194, "auxiliary_loss_mlp": 0.0102643, "balance_loss_clip": 0.89572573, "balance_loss_mlp": 1.01836598, "epoch": 0.43539950700414837, "flos": 16253457494400.0, "grad_norm": 2.182209717711598, "language_loss": 0.71447432, "learning_rate": 2.5099843588446197e-06, "loss": 0.73645806, "num_input_tokens_seen": 78048545, "step": 3621, "time_per_iteration": 2.7151060104370117 }, { "auxiliary_loss_clip": 0.01186665, "auxiliary_loss_mlp": 0.01027738, "balance_loss_clip": 0.90224552, "balance_loss_mlp": 1.01932192, "epoch": 0.4355197498947875, "flos": 16691819074560.0, "grad_norm": 2.3792055862777586, "language_loss": 0.61381292, "learning_rate": 2.509231098910091e-06, "loss": 0.635957, "num_input_tokens_seen": 78068415, "step": 3622, "time_per_iteration": 2.7860217094421387 }, { "auxiliary_loss_clip": 0.01180943, "auxiliary_loss_mlp": 0.01026179, "balance_loss_clip": 0.97785628, "balance_loss_mlp": 1.01822746, "epoch": 0.4356399927854266, "flos": 16362733645440.0, "grad_norm": 2.0764419508380034, "language_loss": 0.7485503, "learning_rate": 2.508477761725611e-06, "loss": 0.77062154, "num_input_tokens_seen": 78086690, "step": 3623, "time_per_iteration": 2.6534831523895264 }, { "auxiliary_loss_clip": 0.01188004, "auxiliary_loss_mlp": 0.01030727, "balance_loss_clip": 1.01549864, "balance_loss_mlp": 1.02228701, "epoch": 0.43576023567606564, "flos": 17202037812480.0, "grad_norm": 1.7361766271921892, "language_loss": 0.80767912, "learning_rate": 2.507724347405458e-06, "loss": 0.82986647, "num_input_tokens_seen": 78104640, "step": 3624, "time_per_iteration": 2.61525821685791 }, { "auxiliary_loss_clip": 0.01171493, "auxiliary_loss_mlp": 0.01022909, "balance_loss_clip": 0.89570236, "balance_loss_mlp": 1.01499391, "epoch": 0.43588047856670475, "flos": 15917656222080.0, "grad_norm": 2.29217240152575, "language_loss": 0.81876236, "learning_rate": 2.5069708560639243e-06, "loss": 0.84070647, "num_input_tokens_seen": 78122550, "step": 3625, "time_per_iteration": 2.7404251098632812 }, { "auxiliary_loss_clip": 0.01180468, "auxiliary_loss_mlp": 0.0102636, "balance_loss_clip": 0.93664432, "balance_loss_mlp": 1.01834297, "epoch": 0.4360007214573438, "flos": 23659566099840.0, "grad_norm": 1.922622799526427, "language_loss": 0.61299288, "learning_rate": 2.5062172878153158e-06, "loss": 0.63506114, "num_input_tokens_seen": 78141825, "step": 3626, "time_per_iteration": 2.6807310581207275 }, { "auxiliary_loss_clip": 0.01185955, "auxiliary_loss_mlp": 0.01023327, "balance_loss_clip": 0.862809, "balance_loss_mlp": 1.01537013, "epoch": 0.4361209643479829, "flos": 21978767036160.0, "grad_norm": 2.0477310473106187, "language_loss": 0.872648, "learning_rate": 2.505463642773947e-06, "loss": 0.89474082, "num_input_tokens_seen": 78161790, "step": 3627, "time_per_iteration": 2.8751537799835205 }, { "auxiliary_loss_clip": 0.01182132, "auxiliary_loss_mlp": 0.01124319, "balance_loss_clip": 0.93716055, "balance_loss_mlp": 0.0, "epoch": 0.43624120723862203, "flos": 17420159151360.0, "grad_norm": 2.2959048862900397, "language_loss": 0.74958515, "learning_rate": 2.504709921054146e-06, "loss": 0.77264965, "num_input_tokens_seen": 78178605, "step": 3628, "time_per_iteration": 2.7062220573425293 }, { "auxiliary_loss_clip": 0.01171894, "auxiliary_loss_mlp": 0.01030331, "balance_loss_clip": 0.93342876, "balance_loss_mlp": 1.02208734, "epoch": 0.4363614501292611, "flos": 17895293280000.0, "grad_norm": 2.3322563720244265, "language_loss": 0.83932018, "learning_rate": 2.50395612277025e-06, "loss": 0.86134243, "num_input_tokens_seen": 78194460, "step": 3629, "time_per_iteration": 2.6975765228271484 }, { "auxiliary_loss_clip": 0.01184999, "auxiliary_loss_mlp": 0.01030191, "balance_loss_clip": 0.97373605, "balance_loss_mlp": 1.02225173, "epoch": 0.4364816930199002, "flos": 20302888135680.0, "grad_norm": 1.8854594526950803, "language_loss": 0.72887683, "learning_rate": 2.503202248036612e-06, "loss": 0.75102878, "num_input_tokens_seen": 78213315, "step": 3630, "time_per_iteration": 3.6851584911346436 }, { "auxiliary_loss_clip": 0.01185952, "auxiliary_loss_mlp": 0.01034514, "balance_loss_clip": 1.05424452, "balance_loss_mlp": 1.02667022, "epoch": 0.4366019359105393, "flos": 24061334699520.0, "grad_norm": 1.6704429363393547, "language_loss": 0.73400015, "learning_rate": 2.5024482969675927e-06, "loss": 0.75620484, "num_input_tokens_seen": 78233270, "step": 3631, "time_per_iteration": 2.725320816040039 }, { "auxiliary_loss_clip": 0.01184643, "auxiliary_loss_mlp": 0.01024439, "balance_loss_clip": 0.90035075, "balance_loss_mlp": 1.01643968, "epoch": 0.43672217880117836, "flos": 21754109422080.0, "grad_norm": 2.029772920237369, "language_loss": 0.84336865, "learning_rate": 2.501694269677566e-06, "loss": 0.8654595, "num_input_tokens_seen": 78251040, "step": 3632, "time_per_iteration": 3.7611265182495117 }, { "auxiliary_loss_clip": 0.01186006, "auxiliary_loss_mlp": 0.01027937, "balance_loss_clip": 1.01343942, "balance_loss_mlp": 1.01971745, "epoch": 0.4368424216918175, "flos": 18035200753920.0, "grad_norm": 1.8220413709400145, "language_loss": 0.80993122, "learning_rate": 2.500940166280918e-06, "loss": 0.83207065, "num_input_tokens_seen": 78269470, "step": 3633, "time_per_iteration": 2.6626741886138916 }, { "auxiliary_loss_clip": 0.01177191, "auxiliary_loss_mlp": 0.0102886, "balance_loss_clip": 1.01284432, "balance_loss_mlp": 1.02084935, "epoch": 0.4369626645824566, "flos": 25447127362560.0, "grad_norm": 1.775714839284276, "language_loss": 0.78932452, "learning_rate": 2.500185986892045e-06, "loss": 0.81138504, "num_input_tokens_seen": 78288955, "step": 3634, "time_per_iteration": 2.725003242492676 }, { "auxiliary_loss_clip": 0.01177333, "auxiliary_loss_mlp": 0.01026657, "balance_loss_clip": 1.01313698, "balance_loss_mlp": 1.0184797, "epoch": 0.43708290747309564, "flos": 25302694775040.0, "grad_norm": 2.7080719784799645, "language_loss": 0.77376318, "learning_rate": 2.499431731625355e-06, "loss": 0.79580307, "num_input_tokens_seen": 78307980, "step": 3635, "time_per_iteration": 2.7051844596862793 }, { "auxiliary_loss_clip": 0.01187774, "auxiliary_loss_mlp": 0.01029082, "balance_loss_clip": 1.05347204, "balance_loss_mlp": 1.02066016, "epoch": 0.43720315036373475, "flos": 31575103344000.0, "grad_norm": 1.95800759404508, "language_loss": 0.79090226, "learning_rate": 2.4986774005952686e-06, "loss": 0.81307083, "num_input_tokens_seen": 78330355, "step": 3636, "time_per_iteration": 2.6864259243011475 }, { "auxiliary_loss_clip": 0.01181592, "auxiliary_loss_mlp": 0.01028887, "balance_loss_clip": 1.01510012, "balance_loss_mlp": 1.02092361, "epoch": 0.43732339325437386, "flos": 23112000195840.0, "grad_norm": 1.9800928605224248, "language_loss": 0.84788501, "learning_rate": 2.4979229939162166e-06, "loss": 0.86998975, "num_input_tokens_seen": 78349135, "step": 3637, "time_per_iteration": 4.022350549697876 }, { "auxiliary_loss_clip": 0.01181158, "auxiliary_loss_mlp": 0.01029577, "balance_loss_clip": 1.01592588, "balance_loss_mlp": 1.02191222, "epoch": 0.4374436361450129, "flos": 27746272080000.0, "grad_norm": 1.598718348933168, "language_loss": 0.80549085, "learning_rate": 2.4971685117026433e-06, "loss": 0.82759821, "num_input_tokens_seen": 78368900, "step": 3638, "time_per_iteration": 2.668365716934204 }, { "auxiliary_loss_clip": 0.01185859, "auxiliary_loss_mlp": 0.0102516, "balance_loss_clip": 1.01518321, "balance_loss_mlp": 1.01748872, "epoch": 0.437563879035652, "flos": 24172370616960.0, "grad_norm": 1.3807477129000632, "language_loss": 0.7707634, "learning_rate": 2.4964139540690018e-06, "loss": 0.79287356, "num_input_tokens_seen": 78392235, "step": 3639, "time_per_iteration": 3.63617205619812 }, { "auxiliary_loss_clip": 0.01187896, "auxiliary_loss_mlp": 0.01027424, "balance_loss_clip": 0.94045061, "balance_loss_mlp": 1.01918626, "epoch": 0.4376841219262911, "flos": 23477211728640.0, "grad_norm": 2.093601471342815, "language_loss": 0.72527158, "learning_rate": 2.495659321129758e-06, "loss": 0.74742472, "num_input_tokens_seen": 78409980, "step": 3640, "time_per_iteration": 2.6949081420898438 }, { "auxiliary_loss_clip": 0.0118245, "auxiliary_loss_mlp": 0.01032898, "balance_loss_clip": 1.0163486, "balance_loss_mlp": 1.02514958, "epoch": 0.4378043648169302, "flos": 25447809720960.0, "grad_norm": 1.7401254879264967, "language_loss": 0.75295824, "learning_rate": 2.494904612999389e-06, "loss": 0.77511168, "num_input_tokens_seen": 78428690, "step": 3641, "time_per_iteration": 2.657477617263794 }, { "auxiliary_loss_clip": 0.01082253, "auxiliary_loss_mlp": 0.01002734, "balance_loss_clip": 0.98251355, "balance_loss_mlp": 1.00068367, "epoch": 0.4379246077075693, "flos": 53914056986880.0, "grad_norm": 0.7599659521166064, "language_loss": 0.56558371, "learning_rate": 2.4941498297923843e-06, "loss": 0.58643365, "num_input_tokens_seen": 78489260, "step": 3642, "time_per_iteration": 3.206023693084717 }, { "auxiliary_loss_clip": 0.01181876, "auxiliary_loss_mlp": 0.01026997, "balance_loss_clip": 1.01666832, "balance_loss_mlp": 1.01929045, "epoch": 0.43804485059820836, "flos": 20588305605120.0, "grad_norm": 1.742424110912525, "language_loss": 0.69828135, "learning_rate": 2.4933949716232424e-06, "loss": 0.72037005, "num_input_tokens_seen": 78506785, "step": 3643, "time_per_iteration": 2.683504343032837 }, { "auxiliary_loss_clip": 0.01183319, "auxiliary_loss_mlp": 0.01032045, "balance_loss_clip": 0.94020641, "balance_loss_mlp": 1.02336097, "epoch": 0.43816509348884747, "flos": 23876214981120.0, "grad_norm": 2.0801434522058693, "language_loss": 0.73600543, "learning_rate": 2.492640038606476e-06, "loss": 0.75815916, "num_input_tokens_seen": 78525150, "step": 3644, "time_per_iteration": 2.7357757091522217 }, { "auxiliary_loss_clip": 0.01183519, "auxiliary_loss_mlp": 0.01029011, "balance_loss_clip": 1.01473832, "balance_loss_mlp": 1.02048159, "epoch": 0.4382853363794866, "flos": 14684448533760.0, "grad_norm": 2.0986580382834013, "language_loss": 0.78658175, "learning_rate": 2.491885030856608e-06, "loss": 0.808707, "num_input_tokens_seen": 78543245, "step": 3645, "time_per_iteration": 2.665936231613159 }, { "auxiliary_loss_clip": 0.01183622, "auxiliary_loss_mlp": 0.01027817, "balance_loss_clip": 0.9767468, "balance_loss_mlp": 1.02009797, "epoch": 0.43840557927012563, "flos": 17165301177600.0, "grad_norm": 2.3120767892585667, "language_loss": 0.8287909, "learning_rate": 2.4911299484881713e-06, "loss": 0.8509053, "num_input_tokens_seen": 78560775, "step": 3646, "time_per_iteration": 2.666853666305542 }, { "auxiliary_loss_clip": 0.01173199, "auxiliary_loss_mlp": 0.01027294, "balance_loss_clip": 0.97326565, "balance_loss_mlp": 1.01909244, "epoch": 0.43852582216076474, "flos": 19390685316480.0, "grad_norm": 2.185875402908665, "language_loss": 0.81037009, "learning_rate": 2.490374791615712e-06, "loss": 0.83237505, "num_input_tokens_seen": 78580800, "step": 3647, "time_per_iteration": 2.6983795166015625 }, { "auxiliary_loss_clip": 0.0119034, "auxiliary_loss_mlp": 0.01124492, "balance_loss_clip": 1.0552218, "balance_loss_mlp": 0.0, "epoch": 0.43864606505140386, "flos": 18075133699200.0, "grad_norm": 2.7107593309170306, "language_loss": 0.76747298, "learning_rate": 2.4896195603537867e-06, "loss": 0.79062128, "num_input_tokens_seen": 78595410, "step": 3648, "time_per_iteration": 2.5598413944244385 }, { "auxiliary_loss_clip": 0.01176818, "auxiliary_loss_mlp": 0.01031346, "balance_loss_clip": 0.90380889, "balance_loss_mlp": 1.02297735, "epoch": 0.4387663079420429, "flos": 19644896845440.0, "grad_norm": 2.2019372221947964, "language_loss": 0.74067378, "learning_rate": 2.488864254816964e-06, "loss": 0.76275539, "num_input_tokens_seen": 78614100, "step": 3649, "time_per_iteration": 2.733046770095825 }, { "auxiliary_loss_clip": 0.01187837, "auxiliary_loss_mlp": 0.01032956, "balance_loss_clip": 1.01689637, "balance_loss_mlp": 1.02414608, "epoch": 0.438886550832682, "flos": 19719339782400.0, "grad_norm": 2.9161713640824747, "language_loss": 0.68505812, "learning_rate": 2.4881088751198218e-06, "loss": 0.70726609, "num_input_tokens_seen": 78632260, "step": 3650, "time_per_iteration": 2.6342098712921143 }, { "auxiliary_loss_clip": 0.01183832, "auxiliary_loss_mlp": 0.01027096, "balance_loss_clip": 0.97574341, "balance_loss_mlp": 1.0191747, "epoch": 0.43900679372332113, "flos": 14536675981440.0, "grad_norm": 2.736698109184975, "language_loss": 0.63885629, "learning_rate": 2.4873534213769517e-06, "loss": 0.6609655, "num_input_tokens_seen": 78647490, "step": 3651, "time_per_iteration": 2.7033612728118896 }, { "auxiliary_loss_clip": 0.01177918, "auxiliary_loss_mlp": 0.01031717, "balance_loss_clip": 0.93936265, "balance_loss_mlp": 1.02378941, "epoch": 0.4391270366139602, "flos": 24056234968320.0, "grad_norm": 1.630308015307728, "language_loss": 0.71977055, "learning_rate": 2.4865978937029547e-06, "loss": 0.74186683, "num_input_tokens_seen": 78666470, "step": 3652, "time_per_iteration": 2.674546718597412 }, { "auxiliary_loss_clip": 0.01169631, "auxiliary_loss_mlp": 0.01029964, "balance_loss_clip": 0.8985377, "balance_loss_mlp": 1.02211368, "epoch": 0.4392472795045993, "flos": 31538510363520.0, "grad_norm": 1.6338743998001783, "language_loss": 0.66352236, "learning_rate": 2.485842292212445e-06, "loss": 0.68551838, "num_input_tokens_seen": 78687685, "step": 3653, "time_per_iteration": 2.85756778717041 }, { "auxiliary_loss_clip": 0.01189154, "auxiliary_loss_mlp": 0.01031574, "balance_loss_clip": 1.05607486, "balance_loss_mlp": 1.02318752, "epoch": 0.4393675223952384, "flos": 14866300114560.0, "grad_norm": 1.8728189938297224, "language_loss": 0.80266321, "learning_rate": 2.485086617020045e-06, "loss": 0.82487047, "num_input_tokens_seen": 78706180, "step": 3654, "time_per_iteration": 2.6038951873779297 }, { "auxiliary_loss_clip": 0.01172912, "auxiliary_loss_mlp": 0.01029202, "balance_loss_clip": 0.97294188, "balance_loss_mlp": 1.02089322, "epoch": 0.43948776528587746, "flos": 14825900292480.0, "grad_norm": 1.8967904167965146, "language_loss": 0.81934106, "learning_rate": 2.4843308682403903e-06, "loss": 0.84136224, "num_input_tokens_seen": 78723095, "step": 3655, "time_per_iteration": 2.7099010944366455 }, { "auxiliary_loss_clip": 0.01184402, "auxiliary_loss_mlp": 0.01025813, "balance_loss_clip": 1.0527029, "balance_loss_mlp": 1.01811159, "epoch": 0.4396080081765166, "flos": 13914523486080.0, "grad_norm": 1.604825398059585, "language_loss": 0.82452309, "learning_rate": 2.4835750459881294e-06, "loss": 0.84662521, "num_input_tokens_seen": 78739720, "step": 3656, "time_per_iteration": 3.6980745792388916 }, { "auxiliary_loss_clip": 0.01171922, "auxiliary_loss_mlp": 0.01034045, "balance_loss_clip": 0.97352666, "balance_loss_mlp": 1.02593899, "epoch": 0.43972825106715563, "flos": 18222978078720.0, "grad_norm": 2.8639397735687173, "language_loss": 0.82087505, "learning_rate": 2.4828191503779177e-06, "loss": 0.84293473, "num_input_tokens_seen": 78757820, "step": 3657, "time_per_iteration": 2.65509033203125 }, { "auxiliary_loss_clip": 0.01182281, "auxiliary_loss_mlp": 0.01026911, "balance_loss_clip": 0.93727344, "balance_loss_mlp": 1.01867402, "epoch": 0.43984849395779474, "flos": 16873239692160.0, "grad_norm": 2.096468028140605, "language_loss": 0.90096587, "learning_rate": 2.482063181524425e-06, "loss": 0.92305779, "num_input_tokens_seen": 78773720, "step": 3658, "time_per_iteration": 3.646451950073242 }, { "auxiliary_loss_clip": 0.01186903, "auxiliary_loss_mlp": 0.01031709, "balance_loss_clip": 1.0533936, "balance_loss_mlp": 1.02332854, "epoch": 0.43996873684843385, "flos": 18691504104960.0, "grad_norm": 2.1924866330246613, "language_loss": 0.81287098, "learning_rate": 2.4813071395423307e-06, "loss": 0.83505708, "num_input_tokens_seen": 78791285, "step": 3659, "time_per_iteration": 2.731510877609253 }, { "auxiliary_loss_clip": 0.0118621, "auxiliary_loss_mlp": 0.01027489, "balance_loss_clip": 1.01604533, "balance_loss_mlp": 1.01935339, "epoch": 0.4400889797390729, "flos": 23653460787840.0, "grad_norm": 2.4450000178683453, "language_loss": 0.64261198, "learning_rate": 2.4805510245463263e-06, "loss": 0.66474903, "num_input_tokens_seen": 78811440, "step": 3660, "time_per_iteration": 2.7443079948425293 }, { "auxiliary_loss_clip": 0.01181522, "auxiliary_loss_mlp": 0.01030405, "balance_loss_clip": 1.01226115, "balance_loss_mlp": 1.02254915, "epoch": 0.440209222629712, "flos": 23149203707520.0, "grad_norm": 1.8190209877442687, "language_loss": 0.6031642, "learning_rate": 2.4797948366511137e-06, "loss": 0.62528348, "num_input_tokens_seen": 78831150, "step": 3661, "time_per_iteration": 2.7650699615478516 }, { "auxiliary_loss_clip": 0.01174816, "auxiliary_loss_mlp": 0.0103265, "balance_loss_clip": 0.93373203, "balance_loss_mlp": 1.02407885, "epoch": 0.4403294655203511, "flos": 24823394668800.0, "grad_norm": 1.7581176877423315, "language_loss": 0.76048255, "learning_rate": 2.4790385759714055e-06, "loss": 0.78255725, "num_input_tokens_seen": 78850215, "step": 3662, "time_per_iteration": 2.7725958824157715 }, { "auxiliary_loss_clip": 0.01185268, "auxiliary_loss_mlp": 0.01030009, "balance_loss_clip": 1.01642704, "balance_loss_mlp": 1.02209949, "epoch": 0.4404497084109902, "flos": 22565080736640.0, "grad_norm": 1.6366598316776273, "language_loss": 0.70920825, "learning_rate": 2.478282242621926e-06, "loss": 0.73136103, "num_input_tokens_seen": 78870675, "step": 3663, "time_per_iteration": 3.5526251792907715 }, { "auxiliary_loss_clip": 0.01090252, "auxiliary_loss_mlp": 0.01004046, "balance_loss_clip": 0.9088217, "balance_loss_mlp": 1.00181723, "epoch": 0.4405699513016293, "flos": 64967073448320.0, "grad_norm": 0.8409357208019408, "language_loss": 0.59541738, "learning_rate": 2.477525836717411e-06, "loss": 0.61636043, "num_input_tokens_seen": 78938440, "step": 3664, "time_per_iteration": 3.4727022647857666 }, { "auxiliary_loss_clip": 0.0118206, "auxiliary_loss_mlp": 0.01035033, "balance_loss_clip": 1.0119704, "balance_loss_mlp": 1.02716517, "epoch": 0.4406901941922684, "flos": 35661952978560.0, "grad_norm": 2.081497345402461, "language_loss": 0.79841208, "learning_rate": 2.476769358372606e-06, "loss": 0.82058299, "num_input_tokens_seen": 78960090, "step": 3665, "time_per_iteration": 2.848552703857422 }, { "auxiliary_loss_clip": 0.01181138, "auxiliary_loss_mlp": 0.01033957, "balance_loss_clip": 0.94115716, "balance_loss_mlp": 1.02642894, "epoch": 0.44081043708290746, "flos": 18040767361920.0, "grad_norm": 2.1052318539259014, "language_loss": 0.74542338, "learning_rate": 2.4760128077022683e-06, "loss": 0.76757431, "num_input_tokens_seen": 78978225, "step": 3666, "time_per_iteration": 3.5338706970214844 }, { "auxiliary_loss_clip": 0.01170893, "auxiliary_loss_mlp": 0.01025209, "balance_loss_clip": 0.89860392, "balance_loss_mlp": 1.01784766, "epoch": 0.44093067997354657, "flos": 30153507799680.0, "grad_norm": 1.4810956732422744, "language_loss": 0.68378627, "learning_rate": 2.4752561848211672e-06, "loss": 0.70574731, "num_input_tokens_seen": 79000625, "step": 3667, "time_per_iteration": 2.7484281063079834 }, { "auxiliary_loss_clip": 0.0118335, "auxiliary_loss_mlp": 0.01024485, "balance_loss_clip": 1.01807189, "balance_loss_mlp": 1.01722848, "epoch": 0.4410509228641857, "flos": 23255068066560.0, "grad_norm": 1.7467121686506693, "language_loss": 0.71058673, "learning_rate": 2.4744994898440797e-06, "loss": 0.73266506, "num_input_tokens_seen": 79019415, "step": 3668, "time_per_iteration": 2.5602097511291504 }, { "auxiliary_loss_clip": 0.01184225, "auxiliary_loss_mlp": 0.01034819, "balance_loss_clip": 0.93635857, "balance_loss_mlp": 1.02594972, "epoch": 0.44117116575482473, "flos": 19500571998720.0, "grad_norm": 2.299199851944006, "language_loss": 0.83448946, "learning_rate": 2.473742722885797e-06, "loss": 0.85667986, "num_input_tokens_seen": 79038435, "step": 3669, "time_per_iteration": 2.5673296451568604 }, { "auxiliary_loss_clip": 0.01190586, "auxiliary_loss_mlp": 0.01124484, "balance_loss_clip": 1.02029085, "balance_loss_mlp": 0.0, "epoch": 0.44129140864546385, "flos": 27053124353280.0, "grad_norm": 2.2266061132696553, "language_loss": 0.64628965, "learning_rate": 2.4729858840611197e-06, "loss": 0.66944033, "num_input_tokens_seen": 79057345, "step": 3670, "time_per_iteration": 2.5867016315460205 }, { "auxiliary_loss_clip": 0.01184149, "auxiliary_loss_mlp": 0.01032435, "balance_loss_clip": 1.05385518, "balance_loss_mlp": 1.02434087, "epoch": 0.4414116515361029, "flos": 26102101910400.0, "grad_norm": 1.9431797368239792, "language_loss": 0.72700131, "learning_rate": 2.4722289734848605e-06, "loss": 0.74916714, "num_input_tokens_seen": 79077810, "step": 3671, "time_per_iteration": 2.7353382110595703 }, { "auxiliary_loss_clip": 0.01183145, "auxiliary_loss_mlp": 0.01029762, "balance_loss_clip": 0.94132084, "balance_loss_mlp": 1.02217412, "epoch": 0.441531894426742, "flos": 21906083865600.0, "grad_norm": 2.420702232201793, "language_loss": 0.77540088, "learning_rate": 2.471471991271841e-06, "loss": 0.79752994, "num_input_tokens_seen": 79094935, "step": 3672, "time_per_iteration": 2.6192286014556885 }, { "auxiliary_loss_clip": 0.01176392, "auxiliary_loss_mlp": 0.01024807, "balance_loss_clip": 1.01347852, "balance_loss_mlp": 1.01621175, "epoch": 0.4416521373173811, "flos": 23437099215360.0, "grad_norm": 1.778526670227745, "language_loss": 0.79086423, "learning_rate": 2.470714937536896e-06, "loss": 0.81287616, "num_input_tokens_seen": 79113660, "step": 3673, "time_per_iteration": 2.750223159790039 }, { "auxiliary_loss_clip": 0.01179326, "auxiliary_loss_mlp": 0.01031943, "balance_loss_clip": 0.90099061, "balance_loss_mlp": 1.02379823, "epoch": 0.4417723802080202, "flos": 20334345471360.0, "grad_norm": 1.6816348748816121, "language_loss": 0.70369691, "learning_rate": 2.469957812394868e-06, "loss": 0.72580963, "num_input_tokens_seen": 79132470, "step": 3674, "time_per_iteration": 2.741786479949951 }, { "auxiliary_loss_clip": 0.01185127, "auxiliary_loss_mlp": 0.01032618, "balance_loss_clip": 1.05432498, "balance_loss_mlp": 1.02500057, "epoch": 0.4418926230986593, "flos": 18880682060160.0, "grad_norm": 2.0999580816504495, "language_loss": 0.76263797, "learning_rate": 2.4692006159606148e-06, "loss": 0.78481537, "num_input_tokens_seen": 79150000, "step": 3675, "time_per_iteration": 2.646040678024292 }, { "auxiliary_loss_clip": 0.01186081, "auxiliary_loss_mlp": 0.01032848, "balance_loss_clip": 1.05451143, "balance_loss_mlp": 1.02473009, "epoch": 0.4420128659892984, "flos": 19464409981440.0, "grad_norm": 1.7235008665520934, "language_loss": 0.78457916, "learning_rate": 2.468443348349e-06, "loss": 0.80676842, "num_input_tokens_seen": 79167875, "step": 3676, "time_per_iteration": 2.6106810569763184 }, { "auxiliary_loss_clip": 0.01172654, "auxiliary_loss_mlp": 0.01030357, "balance_loss_clip": 0.89549804, "balance_loss_mlp": 1.0215292, "epoch": 0.44213310887993745, "flos": 17894359526400.0, "grad_norm": 2.3099794341627637, "language_loss": 0.82846749, "learning_rate": 2.467686009674902e-06, "loss": 0.8504976, "num_input_tokens_seen": 79182325, "step": 3677, "time_per_iteration": 2.6980061531066895 }, { "auxiliary_loss_clip": 0.01175641, "auxiliary_loss_mlp": 0.01029982, "balance_loss_clip": 1.01153064, "balance_loss_mlp": 1.02187562, "epoch": 0.44225335177057656, "flos": 19204667758080.0, "grad_norm": 1.8981760245023633, "language_loss": 0.85354257, "learning_rate": 2.466928600053209e-06, "loss": 0.87559891, "num_input_tokens_seen": 79197630, "step": 3678, "time_per_iteration": 2.6573944091796875 }, { "auxiliary_loss_clip": 0.01182085, "auxiliary_loss_mlp": 0.01028825, "balance_loss_clip": 0.97608936, "balance_loss_mlp": 1.02116275, "epoch": 0.4423735946612157, "flos": 23471321898240.0, "grad_norm": 1.8274751943503678, "language_loss": 0.7110545, "learning_rate": 2.466171119598818e-06, "loss": 0.7331636, "num_input_tokens_seen": 79217600, "step": 3679, "time_per_iteration": 2.791857957839966 }, { "auxiliary_loss_clip": 0.01185978, "auxiliary_loss_mlp": 0.01029157, "balance_loss_clip": 1.01130342, "balance_loss_mlp": 1.02066946, "epoch": 0.44249383755185473, "flos": 26685398868480.0, "grad_norm": 1.6862101041437152, "language_loss": 0.76842684, "learning_rate": 2.465413568426639e-06, "loss": 0.79057819, "num_input_tokens_seen": 79238550, "step": 3680, "time_per_iteration": 2.832670211791992 }, { "auxiliary_loss_clip": 0.0117852, "auxiliary_loss_mlp": 0.01020961, "balance_loss_clip": 1.01437044, "balance_loss_mlp": 1.01365948, "epoch": 0.44261408044249384, "flos": 23147659422720.0, "grad_norm": 1.5929471238328088, "language_loss": 0.81149465, "learning_rate": 2.464655946651591e-06, "loss": 0.83348948, "num_input_tokens_seen": 79257555, "step": 3681, "time_per_iteration": 2.7578775882720947 }, { "auxiliary_loss_clip": 0.01185438, "auxiliary_loss_mlp": 0.01036274, "balance_loss_clip": 1.01536977, "balance_loss_mlp": 1.02805424, "epoch": 0.44273432333313295, "flos": 24462564595200.0, "grad_norm": 1.7455175036804593, "language_loss": 0.81033683, "learning_rate": 2.4638982543886065e-06, "loss": 0.83255392, "num_input_tokens_seen": 79277595, "step": 3682, "time_per_iteration": 2.679290771484375 }, { "auxiliary_loss_clip": 0.01190471, "auxiliary_loss_mlp": 0.01032692, "balance_loss_clip": 1.01804852, "balance_loss_mlp": 1.02474713, "epoch": 0.442854566223772, "flos": 17528932512000.0, "grad_norm": 2.143308680898854, "language_loss": 0.87359387, "learning_rate": 2.4631404917526254e-06, "loss": 0.89582551, "num_input_tokens_seen": 79294550, "step": 3683, "time_per_iteration": 3.610063314437866 }, { "auxiliary_loss_clip": 0.0117443, "auxiliary_loss_mlp": 0.01032148, "balance_loss_clip": 1.01248443, "balance_loss_mlp": 1.02430987, "epoch": 0.4429748091144111, "flos": 24896293320960.0, "grad_norm": 1.5908534833493024, "language_loss": 0.79073751, "learning_rate": 2.4623826588586e-06, "loss": 0.81280327, "num_input_tokens_seen": 79314820, "step": 3684, "time_per_iteration": 3.6309571266174316 }, { "auxiliary_loss_clip": 0.01175741, "auxiliary_loss_mlp": 0.01031192, "balance_loss_clip": 0.97341001, "balance_loss_mlp": 1.0234375, "epoch": 0.4430950520050502, "flos": 21614704738560.0, "grad_norm": 1.4883849080597344, "language_loss": 0.8245821, "learning_rate": 2.461624755821492e-06, "loss": 0.84665143, "num_input_tokens_seen": 79334300, "step": 3685, "time_per_iteration": 2.776477575302124 }, { "auxiliary_loss_clip": 0.01186597, "auxiliary_loss_mlp": 0.01029348, "balance_loss_clip": 0.94105685, "balance_loss_mlp": 1.02138543, "epoch": 0.4432152948956893, "flos": 24572271709440.0, "grad_norm": 1.6214684815419076, "language_loss": 0.7646848, "learning_rate": 2.4608667827562763e-06, "loss": 0.78684425, "num_input_tokens_seen": 79353630, "step": 3686, "time_per_iteration": 2.8066282272338867 }, { "auxiliary_loss_clip": 0.01190108, "auxiliary_loss_mlp": 0.01033168, "balance_loss_clip": 1.01551771, "balance_loss_mlp": 1.02492428, "epoch": 0.4433355377863284, "flos": 21762261809280.0, "grad_norm": 2.0711892419832925, "language_loss": 0.89779115, "learning_rate": 2.460108739777936e-06, "loss": 0.92002392, "num_input_tokens_seen": 79372765, "step": 3687, "time_per_iteration": 2.7094619274139404 }, { "auxiliary_loss_clip": 0.01184051, "auxiliary_loss_mlp": 0.01027207, "balance_loss_clip": 0.97785324, "balance_loss_mlp": 1.01953042, "epoch": 0.44345578067696745, "flos": 20084479488000.0, "grad_norm": 1.4981002109707955, "language_loss": 0.76823837, "learning_rate": 2.4593506270014656e-06, "loss": 0.79035097, "num_input_tokens_seen": 79391735, "step": 3688, "time_per_iteration": 2.7265350818634033 }, { "auxiliary_loss_clip": 0.01183981, "auxiliary_loss_mlp": 0.01033466, "balance_loss_clip": 0.9739567, "balance_loss_mlp": 1.02503181, "epoch": 0.44357602356760656, "flos": 24169497528960.0, "grad_norm": 1.891617404389135, "language_loss": 0.81730866, "learning_rate": 2.45859244454187e-06, "loss": 0.83948314, "num_input_tokens_seen": 79411525, "step": 3689, "time_per_iteration": 3.7521355152130127 }, { "auxiliary_loss_clip": 0.01182112, "auxiliary_loss_mlp": 0.01030483, "balance_loss_clip": 1.01679468, "balance_loss_mlp": 1.02283299, "epoch": 0.44369626645824567, "flos": 22707717644160.0, "grad_norm": 1.924627931081606, "language_loss": 0.66628385, "learning_rate": 2.4578341925141655e-06, "loss": 0.68840981, "num_input_tokens_seen": 79430740, "step": 3690, "time_per_iteration": 2.692183494567871 }, { "auxiliary_loss_clip": 0.01188353, "auxiliary_loss_mlp": 0.01035502, "balance_loss_clip": 1.01416993, "balance_loss_mlp": 1.0258882, "epoch": 0.4438165093488847, "flos": 38030225420160.0, "grad_norm": 2.059131853907219, "language_loss": 0.71814942, "learning_rate": 2.457075871033378e-06, "loss": 0.74038798, "num_input_tokens_seen": 79452615, "step": 3691, "time_per_iteration": 2.84641432762146 }, { "auxiliary_loss_clip": 0.01181534, "auxiliary_loss_mlp": 0.01027805, "balance_loss_clip": 0.93837863, "balance_loss_mlp": 1.02014589, "epoch": 0.44393675223952384, "flos": 15523213996800.0, "grad_norm": 2.16578060458909, "language_loss": 0.88584656, "learning_rate": 2.4563174802145445e-06, "loss": 0.90793997, "num_input_tokens_seen": 79469865, "step": 3692, "time_per_iteration": 3.583651304244995 }, { "auxiliary_loss_clip": 0.01083591, "auxiliary_loss_mlp": 0.0100216, "balance_loss_clip": 0.94369394, "balance_loss_mlp": 0.99978751, "epoch": 0.44405699513016295, "flos": 64574893779840.0, "grad_norm": 0.7592595534061417, "language_loss": 0.4862465, "learning_rate": 2.455559020172712e-06, "loss": 0.50710398, "num_input_tokens_seen": 79537220, "step": 3693, "time_per_iteration": 3.359290838241577 }, { "auxiliary_loss_clip": 0.01184722, "auxiliary_loss_mlp": 0.01027722, "balance_loss_clip": 0.90419596, "balance_loss_mlp": 1.01944304, "epoch": 0.444177238020802, "flos": 23987394552960.0, "grad_norm": 1.840865773063439, "language_loss": 0.89709109, "learning_rate": 2.4548004910229385e-06, "loss": 0.91921556, "num_input_tokens_seen": 79554795, "step": 3694, "time_per_iteration": 2.7780370712280273 }, { "auxiliary_loss_clip": 0.01187284, "auxiliary_loss_mlp": 0.01123803, "balance_loss_clip": 1.01638138, "balance_loss_mlp": 0.0, "epoch": 0.4442974809114411, "flos": 22563069575040.0, "grad_norm": 1.6650443890055666, "language_loss": 0.86924887, "learning_rate": 2.4540418928802913e-06, "loss": 0.89235973, "num_input_tokens_seen": 79573530, "step": 3695, "time_per_iteration": 2.76521635055542 }, { "auxiliary_loss_clip": 0.01180922, "auxiliary_loss_mlp": 0.01030079, "balance_loss_clip": 0.9772315, "balance_loss_mlp": 1.02196074, "epoch": 0.4444177238020802, "flos": 17675699483520.0, "grad_norm": 2.1049631309111794, "language_loss": 0.65917826, "learning_rate": 2.4532832258598506e-06, "loss": 0.6812883, "num_input_tokens_seen": 79591360, "step": 3696, "time_per_iteration": 2.785198450088501 }, { "auxiliary_loss_clip": 0.01182827, "auxiliary_loss_mlp": 0.01031164, "balance_loss_clip": 1.05348384, "balance_loss_mlp": 1.02336216, "epoch": 0.4445379666927193, "flos": 28621594609920.0, "grad_norm": 3.018501101515092, "language_loss": 0.80846536, "learning_rate": 2.4525244900767047e-06, "loss": 0.83060527, "num_input_tokens_seen": 79612175, "step": 3697, "time_per_iteration": 2.7450766563415527 }, { "auxiliary_loss_clip": 0.01078465, "auxiliary_loss_mlp": 0.00999253, "balance_loss_clip": 0.98113751, "balance_loss_mlp": 0.99714327, "epoch": 0.4446582095833584, "flos": 70487370115200.0, "grad_norm": 0.7725313996892227, "language_loss": 0.60539126, "learning_rate": 2.4517656856459536e-06, "loss": 0.62616849, "num_input_tokens_seen": 79678020, "step": 3698, "time_per_iteration": 3.335956573486328 }, { "auxiliary_loss_clip": 0.01181728, "auxiliary_loss_mlp": 0.01029077, "balance_loss_clip": 1.01350427, "balance_loss_mlp": 1.0207324, "epoch": 0.4447784524739975, "flos": 26505199313280.0, "grad_norm": 1.7119847576433478, "language_loss": 0.68023109, "learning_rate": 2.4510068126827073e-06, "loss": 0.70233917, "num_input_tokens_seen": 79699020, "step": 3699, "time_per_iteration": 2.7528903484344482 }, { "auxiliary_loss_clip": 0.01179412, "auxiliary_loss_mlp": 0.0102481, "balance_loss_clip": 0.97629803, "balance_loss_mlp": 1.01680541, "epoch": 0.44489869536463655, "flos": 11656209553920.0, "grad_norm": 2.041522861496513, "language_loss": 0.81424022, "learning_rate": 2.450247871302086e-06, "loss": 0.83628243, "num_input_tokens_seen": 79716795, "step": 3700, "time_per_iteration": 2.8873562812805176 }, { "auxiliary_loss_clip": 0.01189417, "auxiliary_loss_mlp": 0.01025293, "balance_loss_clip": 1.01605356, "balance_loss_mlp": 1.01712155, "epoch": 0.44501893825527566, "flos": 20448469958400.0, "grad_norm": 1.9518455918513153, "language_loss": 0.83295786, "learning_rate": 2.44948886161922e-06, "loss": 0.85510498, "num_input_tokens_seen": 79735810, "step": 3701, "time_per_iteration": 2.692791700363159 }, { "auxiliary_loss_clip": 0.01189123, "auxiliary_loss_mlp": 0.01028573, "balance_loss_clip": 1.01686573, "balance_loss_mlp": 1.02044845, "epoch": 0.4451391811459148, "flos": 18261079430400.0, "grad_norm": 1.4772941939043687, "language_loss": 0.84631026, "learning_rate": 2.4487297837492524e-06, "loss": 0.86848724, "num_input_tokens_seen": 79754975, "step": 3702, "time_per_iteration": 2.7546722888946533 }, { "auxiliary_loss_clip": 0.01183982, "auxiliary_loss_mlp": 0.01030612, "balance_loss_clip": 0.9403975, "balance_loss_mlp": 1.02236915, "epoch": 0.44525942403655383, "flos": 16910155895040.0, "grad_norm": 1.885813981351007, "language_loss": 0.62118673, "learning_rate": 2.4479706378073323e-06, "loss": 0.64333266, "num_input_tokens_seen": 79773515, "step": 3703, "time_per_iteration": 2.761237621307373 }, { "auxiliary_loss_clip": 0.01168213, "auxiliary_loss_mlp": 0.01027032, "balance_loss_clip": 0.93504393, "balance_loss_mlp": 1.01930761, "epoch": 0.44537966692719294, "flos": 23258838994560.0, "grad_norm": 1.4902215024663361, "language_loss": 0.83900076, "learning_rate": 2.447211423908623e-06, "loss": 0.86095321, "num_input_tokens_seen": 79793560, "step": 3704, "time_per_iteration": 2.745850086212158 }, { "auxiliary_loss_clip": 0.01185038, "auxiliary_loss_mlp": 0.01027641, "balance_loss_clip": 1.01401794, "balance_loss_mlp": 1.01983225, "epoch": 0.445499909817832, "flos": 21724160457600.0, "grad_norm": 4.860791257553615, "language_loss": 0.75045037, "learning_rate": 2.4464521421682966e-06, "loss": 0.77257717, "num_input_tokens_seen": 79811150, "step": 3705, "time_per_iteration": 2.6638104915618896 }, { "auxiliary_loss_clip": 0.0117983, "auxiliary_loss_mlp": 0.01032343, "balance_loss_clip": 1.01535428, "balance_loss_mlp": 1.0246954, "epoch": 0.4456201527084711, "flos": 23987969170560.0, "grad_norm": 1.331093894591875, "language_loss": 0.875691, "learning_rate": 2.4456927927015345e-06, "loss": 0.89781272, "num_input_tokens_seen": 79832190, "step": 3706, "time_per_iteration": 2.803112030029297 }, { "auxiliary_loss_clip": 0.01188863, "auxiliary_loss_mlp": 0.01033428, "balance_loss_clip": 0.97802782, "balance_loss_mlp": 1.02492273, "epoch": 0.4457403955991102, "flos": 18807065136000.0, "grad_norm": 1.9544506582213945, "language_loss": 0.76206207, "learning_rate": 2.4449333756235307e-06, "loss": 0.78428501, "num_input_tokens_seen": 79848905, "step": 3707, "time_per_iteration": 2.701558828353882 }, { "auxiliary_loss_clip": 0.01189789, "auxiliary_loss_mlp": 0.01030119, "balance_loss_clip": 1.01676464, "balance_loss_mlp": 1.02176249, "epoch": 0.4458606384897493, "flos": 19207756327680.0, "grad_norm": 2.3030150733336914, "language_loss": 0.79269201, "learning_rate": 2.4441738910494876e-06, "loss": 0.81489116, "num_input_tokens_seen": 79863640, "step": 3708, "time_per_iteration": 2.6888787746429443 }, { "auxiliary_loss_clip": 0.01190714, "auxiliary_loss_mlp": 0.01031216, "balance_loss_clip": 0.97738528, "balance_loss_mlp": 1.02258539, "epoch": 0.4459808813803884, "flos": 21361283308800.0, "grad_norm": 1.983527064487312, "language_loss": 0.82306749, "learning_rate": 2.4434143390946176e-06, "loss": 0.84528685, "num_input_tokens_seen": 79882450, "step": 3709, "time_per_iteration": 3.6531152725219727 }, { "auxiliary_loss_clip": 0.01179157, "auxiliary_loss_mlp": 0.01029011, "balance_loss_clip": 0.93752337, "balance_loss_mlp": 1.02135801, "epoch": 0.4461011242710275, "flos": 23288967527040.0, "grad_norm": 1.9444293587599286, "language_loss": 0.84869313, "learning_rate": 2.4426547198741457e-06, "loss": 0.87077487, "num_input_tokens_seen": 79900655, "step": 3710, "time_per_iteration": 3.638740301132202 }, { "auxiliary_loss_clip": 0.01184604, "auxiliary_loss_mlp": 0.01037562, "balance_loss_clip": 0.90564835, "balance_loss_mlp": 1.0292654, "epoch": 0.44622136716166655, "flos": 20193001453440.0, "grad_norm": 2.064398562076332, "language_loss": 0.74479401, "learning_rate": 2.441895033503305e-06, "loss": 0.7670157, "num_input_tokens_seen": 79918575, "step": 3711, "time_per_iteration": 2.820843458175659 }, { "auxiliary_loss_clip": 0.01184445, "auxiliary_loss_mlp": 0.01029878, "balance_loss_clip": 1.01512885, "balance_loss_mlp": 1.02093124, "epoch": 0.44634161005230566, "flos": 21283033530240.0, "grad_norm": 2.0360174892908804, "language_loss": 0.8202424, "learning_rate": 2.4411352800973375e-06, "loss": 0.84238565, "num_input_tokens_seen": 79937010, "step": 3712, "time_per_iteration": 2.646605968475342 }, { "auxiliary_loss_clip": 0.01172378, "auxiliary_loss_mlp": 0.01026438, "balance_loss_clip": 0.93408358, "balance_loss_mlp": 1.01775348, "epoch": 0.44646185294294477, "flos": 22929358515840.0, "grad_norm": 6.890380291113251, "language_loss": 0.75592583, "learning_rate": 2.4403754597715005e-06, "loss": 0.77791393, "num_input_tokens_seen": 79956455, "step": 3713, "time_per_iteration": 2.782953977584839 }, { "auxiliary_loss_clip": 0.01182261, "auxiliary_loss_mlp": 0.01034995, "balance_loss_clip": 0.97326452, "balance_loss_mlp": 1.02649879, "epoch": 0.4465820958335838, "flos": 22637692080000.0, "grad_norm": 1.9304803207013959, "language_loss": 0.92538971, "learning_rate": 2.4396155726410553e-06, "loss": 0.94756234, "num_input_tokens_seen": 79975065, "step": 3714, "time_per_iteration": 2.747084856033325 }, { "auxiliary_loss_clip": 0.01189979, "auxiliary_loss_mlp": 0.01030871, "balance_loss_clip": 1.01388884, "balance_loss_mlp": 1.02287221, "epoch": 0.44670233872422294, "flos": 22672525294080.0, "grad_norm": 2.778310525542392, "language_loss": 0.90938562, "learning_rate": 2.438855618821278e-06, "loss": 0.93159413, "num_input_tokens_seen": 79990865, "step": 3715, "time_per_iteration": 3.8118298053741455 }, { "auxiliary_loss_clip": 0.01173591, "auxiliary_loss_mlp": 0.01032475, "balance_loss_clip": 1.01086009, "balance_loss_mlp": 1.02432704, "epoch": 0.44682258161486205, "flos": 23582178247680.0, "grad_norm": 1.5772704615264874, "language_loss": 0.67661303, "learning_rate": 2.4380955984274517e-06, "loss": 0.69867373, "num_input_tokens_seen": 80009520, "step": 3716, "time_per_iteration": 2.7366650104522705 }, { "auxiliary_loss_clip": 0.01184648, "auxiliary_loss_mlp": 0.01031553, "balance_loss_clip": 1.01457489, "balance_loss_mlp": 1.0232743, "epoch": 0.4469428245055011, "flos": 26501356558080.0, "grad_norm": 2.207988280705494, "language_loss": 0.77201593, "learning_rate": 2.4373355115748716e-06, "loss": 0.79417801, "num_input_tokens_seen": 80030350, "step": 3717, "time_per_iteration": 3.6512107849121094 }, { "auxiliary_loss_clip": 0.01173243, "auxiliary_loss_mlp": 0.01028789, "balance_loss_clip": 0.97550392, "balance_loss_mlp": 1.02064681, "epoch": 0.4470630673961402, "flos": 21504925797120.0, "grad_norm": 1.7156138059156698, "language_loss": 0.72203344, "learning_rate": 2.436575358378842e-06, "loss": 0.74405378, "num_input_tokens_seen": 80049840, "step": 3718, "time_per_iteration": 2.805002212524414 }, { "auxiliary_loss_clip": 0.01190611, "auxiliary_loss_mlp": 0.0103164, "balance_loss_clip": 0.97856975, "balance_loss_mlp": 1.02352822, "epoch": 0.44718331028677927, "flos": 16173986653440.0, "grad_norm": 2.555219731340438, "language_loss": 0.8331607, "learning_rate": 2.4358151389546782e-06, "loss": 0.85538322, "num_input_tokens_seen": 80066525, "step": 3719, "time_per_iteration": 2.714872360229492 }, { "auxiliary_loss_clip": 0.01185535, "auxiliary_loss_mlp": 0.01029199, "balance_loss_clip": 1.05336809, "balance_loss_mlp": 1.02161765, "epoch": 0.4473035531774184, "flos": 19681238430720.0, "grad_norm": 2.2672093910541657, "language_loss": 0.76353168, "learning_rate": 2.4350548534177035e-06, "loss": 0.78567898, "num_input_tokens_seen": 80083355, "step": 3720, "time_per_iteration": 2.5987908840179443 }, { "auxiliary_loss_clip": 0.01181584, "auxiliary_loss_mlp": 0.01030685, "balance_loss_clip": 0.9380272, "balance_loss_mlp": 1.02315974, "epoch": 0.4474237960680575, "flos": 41427590515200.0, "grad_norm": 1.741333351568676, "language_loss": 0.66618681, "learning_rate": 2.434294501883254e-06, "loss": 0.68830949, "num_input_tokens_seen": 80106450, "step": 3721, "time_per_iteration": 2.8513870239257812 }, { "auxiliary_loss_clip": 0.01169792, "auxiliary_loss_mlp": 0.01027038, "balance_loss_clip": 0.97211981, "balance_loss_mlp": 1.01891398, "epoch": 0.44754403895869654, "flos": 22891328991360.0, "grad_norm": 1.6146545494157338, "language_loss": 0.65801257, "learning_rate": 2.433534084466674e-06, "loss": 0.67998087, "num_input_tokens_seen": 80125670, "step": 3722, "time_per_iteration": 2.7905654907226562 }, { "auxiliary_loss_clip": 0.01183624, "auxiliary_loss_mlp": 0.01030529, "balance_loss_clip": 1.0539844, "balance_loss_mlp": 1.02259612, "epoch": 0.44766428184933565, "flos": 25630271832960.0, "grad_norm": 1.4263117820541922, "language_loss": 0.70432627, "learning_rate": 2.4327736012833178e-06, "loss": 0.72646785, "num_input_tokens_seen": 80147390, "step": 3723, "time_per_iteration": 2.702061891555786 }, { "auxiliary_loss_clip": 0.01181245, "auxiliary_loss_mlp": 0.01028733, "balance_loss_clip": 1.01353121, "balance_loss_mlp": 1.02075195, "epoch": 0.44778452473997477, "flos": 20448972748800.0, "grad_norm": 2.2358879872527804, "language_loss": 0.76342523, "learning_rate": 2.4320130524485506e-06, "loss": 0.78552508, "num_input_tokens_seen": 80166185, "step": 3724, "time_per_iteration": 2.646394729614258 }, { "auxiliary_loss_clip": 0.01178664, "auxiliary_loss_mlp": 0.01031607, "balance_loss_clip": 0.98040438, "balance_loss_mlp": 1.02394795, "epoch": 0.4479047676306138, "flos": 21975462984960.0, "grad_norm": 2.010147805463774, "language_loss": 0.79514122, "learning_rate": 2.431252438077746e-06, "loss": 0.81724393, "num_input_tokens_seen": 80185685, "step": 3725, "time_per_iteration": 2.7372541427612305 }, { "auxiliary_loss_clip": 0.01186535, "auxiliary_loss_mlp": 0.01124232, "balance_loss_clip": 1.01417351, "balance_loss_mlp": 0.0, "epoch": 0.44802501052125293, "flos": 21467219495040.0, "grad_norm": 2.388234275116468, "language_loss": 0.7720539, "learning_rate": 2.4304917582862906e-06, "loss": 0.79516155, "num_input_tokens_seen": 80204865, "step": 3726, "time_per_iteration": 2.698164463043213 }, { "auxiliary_loss_clip": 0.01187175, "auxiliary_loss_mlp": 0.0103286, "balance_loss_clip": 1.0562892, "balance_loss_mlp": 1.02492642, "epoch": 0.44814525341189204, "flos": 22126970551680.0, "grad_norm": 2.2311707881388383, "language_loss": 0.87815326, "learning_rate": 2.4297310131895774e-06, "loss": 0.90035355, "num_input_tokens_seen": 80223410, "step": 3727, "time_per_iteration": 2.625833749771118 }, { "auxiliary_loss_clip": 0.01184585, "auxiliary_loss_mlp": 0.0103295, "balance_loss_clip": 1.01605439, "balance_loss_mlp": 1.02520752, "epoch": 0.4482654963025311, "flos": 16653933204480.0, "grad_norm": 2.3333621862068865, "language_loss": 0.74483716, "learning_rate": 2.4289702029030113e-06, "loss": 0.76701248, "num_input_tokens_seen": 80240880, "step": 3728, "time_per_iteration": 2.7949509620666504 }, { "auxiliary_loss_clip": 0.0118663, "auxiliary_loss_mlp": 0.01028665, "balance_loss_clip": 1.01781416, "balance_loss_mlp": 1.02085042, "epoch": 0.4483857391931702, "flos": 18841251905280.0, "grad_norm": 1.7554700761502762, "language_loss": 0.82800448, "learning_rate": 2.4282093275420057e-06, "loss": 0.8501575, "num_input_tokens_seen": 80259910, "step": 3729, "time_per_iteration": 2.857043743133545 }, { "auxiliary_loss_clip": 0.01188637, "auxiliary_loss_mlp": 0.01027332, "balance_loss_clip": 1.01600671, "balance_loss_mlp": 1.01930308, "epoch": 0.4485059820838093, "flos": 20372590477440.0, "grad_norm": 2.1553183481869707, "language_loss": 0.70802808, "learning_rate": 2.4274483872219863e-06, "loss": 0.73018777, "num_input_tokens_seen": 80277270, "step": 3730, "time_per_iteration": 2.7343413829803467 }, { "auxiliary_loss_clip": 0.01178985, "auxiliary_loss_mlp": 0.0103367, "balance_loss_clip": 1.01339912, "balance_loss_mlp": 1.02553988, "epoch": 0.4486262249744484, "flos": 20047742853120.0, "grad_norm": 1.7095404372921565, "language_loss": 0.93562919, "learning_rate": 2.426687382058386e-06, "loss": 0.95775568, "num_input_tokens_seen": 80295550, "step": 3731, "time_per_iteration": 2.654963493347168 }, { "auxiliary_loss_clip": 0.01077604, "auxiliary_loss_mlp": 0.01003372, "balance_loss_clip": 0.98064816, "balance_loss_mlp": 1.00132179, "epoch": 0.4487464678650875, "flos": 64595684776320.0, "grad_norm": 0.864835512376622, "language_loss": 0.59848249, "learning_rate": 2.425926312166649e-06, "loss": 0.61929226, "num_input_tokens_seen": 80348425, "step": 3732, "time_per_iteration": 3.0938093662261963 }, { "auxiliary_loss_clip": 0.01185555, "auxiliary_loss_mlp": 0.01033481, "balance_loss_clip": 0.97796524, "balance_loss_mlp": 1.024791, "epoch": 0.4488667107557266, "flos": 20769798049920.0, "grad_norm": 3.4231892871103793, "language_loss": 0.72819149, "learning_rate": 2.42516517766223e-06, "loss": 0.75038189, "num_input_tokens_seen": 80366505, "step": 3733, "time_per_iteration": 2.667785167694092 }, { "auxiliary_loss_clip": 0.01186046, "auxiliary_loss_mlp": 0.01029518, "balance_loss_clip": 1.0553093, "balance_loss_mlp": 1.02186728, "epoch": 0.44898695364636565, "flos": 23951735326080.0, "grad_norm": 1.7053465460060788, "language_loss": 0.67799968, "learning_rate": 2.4244039786605907e-06, "loss": 0.70015538, "num_input_tokens_seen": 80387510, "step": 3734, "time_per_iteration": 3.63136887550354 }, { "auxiliary_loss_clip": 0.01177291, "auxiliary_loss_mlp": 0.01027151, "balance_loss_clip": 0.89710116, "balance_loss_mlp": 1.01882422, "epoch": 0.44910719653700476, "flos": 18624351628800.0, "grad_norm": 3.4259126118155656, "language_loss": 0.81991696, "learning_rate": 2.4236427152772055e-06, "loss": 0.84196138, "num_input_tokens_seen": 80405915, "step": 3735, "time_per_iteration": 2.748756170272827 }, { "auxiliary_loss_clip": 0.01073143, "auxiliary_loss_mlp": 0.01001389, "balance_loss_clip": 0.90516001, "balance_loss_mlp": 0.9992556, "epoch": 0.4492274394276438, "flos": 57033435749760.0, "grad_norm": 0.8410169601019017, "language_loss": 0.57375276, "learning_rate": 2.422881387627557e-06, "loss": 0.59449816, "num_input_tokens_seen": 80458365, "step": 3736, "time_per_iteration": 3.9894216060638428 }, { "auxiliary_loss_clip": 0.01185898, "auxiliary_loss_mlp": 0.01027707, "balance_loss_clip": 0.97917175, "balance_loss_mlp": 1.01988959, "epoch": 0.4493476823182829, "flos": 23254888498560.0, "grad_norm": 1.7183353726087245, "language_loss": 0.77256596, "learning_rate": 2.422119995827139e-06, "loss": 0.79470199, "num_input_tokens_seen": 80478490, "step": 3737, "time_per_iteration": 2.674785852432251 }, { "auxiliary_loss_clip": 0.01189989, "auxiliary_loss_mlp": 0.01031832, "balance_loss_clip": 1.01813996, "balance_loss_mlp": 1.02385664, "epoch": 0.44946792520892204, "flos": 15815131827840.0, "grad_norm": 15.901095416584202, "language_loss": 0.74034154, "learning_rate": 2.4213585399914528e-06, "loss": 0.76255977, "num_input_tokens_seen": 80495695, "step": 3738, "time_per_iteration": 2.547853469848633 }, { "auxiliary_loss_clip": 0.01185683, "auxiliary_loss_mlp": 0.01030101, "balance_loss_clip": 1.01864636, "balance_loss_mlp": 1.02196455, "epoch": 0.4495881680995611, "flos": 19610063631360.0, "grad_norm": 1.8013195837929044, "language_loss": 0.85287666, "learning_rate": 2.4205970202360113e-06, "loss": 0.87503451, "num_input_tokens_seen": 80515260, "step": 3739, "time_per_iteration": 2.6352100372314453 }, { "auxiliary_loss_clip": 0.01170442, "auxiliary_loss_mlp": 0.01028543, "balance_loss_clip": 0.89945006, "balance_loss_mlp": 1.01992416, "epoch": 0.4497084109902002, "flos": 26031465815040.0, "grad_norm": 2.1090235126065693, "language_loss": 0.77693933, "learning_rate": 2.4198354366763354e-06, "loss": 0.79892921, "num_input_tokens_seen": 80533900, "step": 3740, "time_per_iteration": 2.8280868530273438 }, { "auxiliary_loss_clip": 0.01183655, "auxiliary_loss_mlp": 0.01033927, "balance_loss_clip": 0.97690713, "balance_loss_mlp": 1.02595472, "epoch": 0.4498286538808393, "flos": 14793688771200.0, "grad_norm": 2.328682146898751, "language_loss": 0.78519917, "learning_rate": 2.4190737894279587e-06, "loss": 0.80737495, "num_input_tokens_seen": 80551270, "step": 3741, "time_per_iteration": 3.7552907466888428 }, { "auxiliary_loss_clip": 0.0116417, "auxiliary_loss_mlp": 0.01033061, "balance_loss_clip": 0.9336791, "balance_loss_mlp": 1.02486491, "epoch": 0.44994889677147837, "flos": 15450171690240.0, "grad_norm": 2.0365625849491504, "language_loss": 0.80864441, "learning_rate": 2.4183120786064203e-06, "loss": 0.83061671, "num_input_tokens_seen": 80568145, "step": 3742, "time_per_iteration": 2.721510171890259 }, { "auxiliary_loss_clip": 0.01184599, "auxiliary_loss_mlp": 0.01123267, "balance_loss_clip": 1.01798809, "balance_loss_mlp": 0.0, "epoch": 0.4500691396621175, "flos": 21798316085760.0, "grad_norm": 2.5396730148816795, "language_loss": 0.85482216, "learning_rate": 2.417550304327273e-06, "loss": 0.87790084, "num_input_tokens_seen": 80586185, "step": 3743, "time_per_iteration": 3.6302592754364014 }, { "auxiliary_loss_clip": 0.01188095, "auxiliary_loss_mlp": 0.01028607, "balance_loss_clip": 1.05470908, "balance_loss_mlp": 1.01955354, "epoch": 0.4501893825527566, "flos": 32382016421760.0, "grad_norm": 1.5312920615128378, "language_loss": 0.75772119, "learning_rate": 2.4167884667060763e-06, "loss": 0.77988827, "num_input_tokens_seen": 80608895, "step": 3744, "time_per_iteration": 2.7237744331359863 }, { "auxiliary_loss_clip": 0.01182743, "auxiliary_loss_mlp": 0.01031648, "balance_loss_clip": 0.9767822, "balance_loss_mlp": 1.02282679, "epoch": 0.45030962544339564, "flos": 16544944362240.0, "grad_norm": 2.1384718138942604, "language_loss": 0.87376595, "learning_rate": 2.4160265658584e-06, "loss": 0.89590979, "num_input_tokens_seen": 80623785, "step": 3745, "time_per_iteration": 2.6532421112060547 }, { "auxiliary_loss_clip": 0.01186862, "auxiliary_loss_mlp": 0.01026385, "balance_loss_clip": 1.01672196, "balance_loss_mlp": 1.01812327, "epoch": 0.45042986833403476, "flos": 19573039687680.0, "grad_norm": 4.177629031255262, "language_loss": 0.68538666, "learning_rate": 2.4152646018998253e-06, "loss": 0.70751905, "num_input_tokens_seen": 80642735, "step": 3746, "time_per_iteration": 2.63486647605896 }, { "auxiliary_loss_clip": 0.01181111, "auxiliary_loss_mlp": 0.01030065, "balance_loss_clip": 1.01684678, "balance_loss_mlp": 1.02228689, "epoch": 0.45055011122467387, "flos": 23112467072640.0, "grad_norm": 1.756850079218706, "language_loss": 0.71510178, "learning_rate": 2.4145025749459403e-06, "loss": 0.73721361, "num_input_tokens_seen": 80663760, "step": 3747, "time_per_iteration": 2.6636526584625244 }, { "auxiliary_loss_clip": 0.01176271, "auxiliary_loss_mlp": 0.01035867, "balance_loss_clip": 0.82345021, "balance_loss_mlp": 1.02699733, "epoch": 0.4506703541153129, "flos": 19934623946880.0, "grad_norm": 1.9663959702263367, "language_loss": 0.69974673, "learning_rate": 2.413740485112344e-06, "loss": 0.7218681, "num_input_tokens_seen": 80682100, "step": 3748, "time_per_iteration": 2.8546142578125 }, { "auxiliary_loss_clip": 0.01180427, "auxiliary_loss_mlp": 0.01027958, "balance_loss_clip": 0.98047304, "balance_loss_mlp": 1.02045345, "epoch": 0.45079059700595203, "flos": 19499530504320.0, "grad_norm": 1.5042982834302254, "language_loss": 0.82486373, "learning_rate": 2.412978332514646e-06, "loss": 0.84694761, "num_input_tokens_seen": 80700880, "step": 3749, "time_per_iteration": 2.866251230239868 }, { "auxiliary_loss_clip": 0.01181765, "auxiliary_loss_mlp": 0.0103409, "balance_loss_clip": 0.97677493, "balance_loss_mlp": 1.02587044, "epoch": 0.4509108398965911, "flos": 27636313570560.0, "grad_norm": 1.8764145419878338, "language_loss": 0.72370481, "learning_rate": 2.4122161172684623e-06, "loss": 0.74586332, "num_input_tokens_seen": 80721675, "step": 3750, "time_per_iteration": 2.726776599884033 }, { "auxiliary_loss_clip": 0.01187799, "auxiliary_loss_mlp": 0.01034847, "balance_loss_clip": 0.97969306, "balance_loss_mlp": 1.02662122, "epoch": 0.4510310827872302, "flos": 20995712640000.0, "grad_norm": 2.599894586333351, "language_loss": 0.84317571, "learning_rate": 2.4114538394894216e-06, "loss": 0.86540216, "num_input_tokens_seen": 80739315, "step": 3751, "time_per_iteration": 2.7294960021972656 }, { "auxiliary_loss_clip": 0.01174074, "auxiliary_loss_mlp": 0.0102635, "balance_loss_clip": 0.97012758, "balance_loss_mlp": 1.01916778, "epoch": 0.4511513256778693, "flos": 16216684945920.0, "grad_norm": 1.7730565647167738, "language_loss": 0.8292948, "learning_rate": 2.410691499293161e-06, "loss": 0.85129905, "num_input_tokens_seen": 80757470, "step": 3752, "time_per_iteration": 2.6541099548339844 }, { "auxiliary_loss_clip": 0.01181918, "auxiliary_loss_mlp": 0.01028109, "balance_loss_clip": 1.01397359, "balance_loss_mlp": 1.02021098, "epoch": 0.45127156856850836, "flos": 25186702780800.0, "grad_norm": 1.8841829443617668, "language_loss": 0.7387076, "learning_rate": 2.409929096795326e-06, "loss": 0.76080787, "num_input_tokens_seen": 80777840, "step": 3753, "time_per_iteration": 2.664508581161499 }, { "auxiliary_loss_clip": 0.01183634, "auxiliary_loss_mlp": 0.01031829, "balance_loss_clip": 1.01429534, "balance_loss_mlp": 1.02353823, "epoch": 0.4513918114591475, "flos": 20412523422720.0, "grad_norm": 1.7906105046929097, "language_loss": 0.7905857, "learning_rate": 2.409166632111573e-06, "loss": 0.81274033, "num_input_tokens_seen": 80795975, "step": 3754, "time_per_iteration": 2.6346659660339355 }, { "auxiliary_loss_clip": 0.01192919, "auxiliary_loss_mlp": 0.01028047, "balance_loss_clip": 1.01657701, "balance_loss_mlp": 1.01924372, "epoch": 0.4515120543497866, "flos": 26648482665600.0, "grad_norm": 2.481663643288022, "language_loss": 0.80168283, "learning_rate": 2.4084041053575674e-06, "loss": 0.82389247, "num_input_tokens_seen": 80815395, "step": 3755, "time_per_iteration": 2.688317060470581 }, { "auxiliary_loss_clip": 0.01186833, "auxiliary_loss_mlp": 0.01027264, "balance_loss_clip": 0.9785198, "balance_loss_mlp": 1.01921153, "epoch": 0.45163229724042564, "flos": 20595093275520.0, "grad_norm": 2.264699992972252, "language_loss": 0.72369158, "learning_rate": 2.4076415166489834e-06, "loss": 0.74583256, "num_input_tokens_seen": 80834805, "step": 3756, "time_per_iteration": 2.7390100955963135 }, { "auxiliary_loss_clip": 0.01183631, "auxiliary_loss_mlp": 0.01028957, "balance_loss_clip": 0.89932102, "balance_loss_mlp": 1.02109492, "epoch": 0.45175254013106475, "flos": 21689004021120.0, "grad_norm": 1.490910499937179, "language_loss": 0.78925937, "learning_rate": 2.406878866101506e-06, "loss": 0.81138527, "num_input_tokens_seen": 80853770, "step": 3757, "time_per_iteration": 2.8695313930511475 }, { "auxiliary_loss_clip": 0.0118466, "auxiliary_loss_mlp": 0.01029275, "balance_loss_clip": 1.0547241, "balance_loss_mlp": 1.02145147, "epoch": 0.45187278302170386, "flos": 18878850466560.0, "grad_norm": 1.8840471790597813, "language_loss": 0.78525078, "learning_rate": 2.4061161538308273e-06, "loss": 0.80739009, "num_input_tokens_seen": 80870615, "step": 3758, "time_per_iteration": 2.6245949268341064 }, { "auxiliary_loss_clip": 0.01185185, "auxiliary_loss_mlp": 0.01026267, "balance_loss_clip": 1.01621735, "balance_loss_mlp": 1.01798236, "epoch": 0.4519930259123429, "flos": 18582479349120.0, "grad_norm": 1.7262719566726883, "language_loss": 0.88804317, "learning_rate": 2.4053533799526523e-06, "loss": 0.91015768, "num_input_tokens_seen": 80886335, "step": 3759, "time_per_iteration": 2.6534059047698975 }, { "auxiliary_loss_clip": 0.01177247, "auxiliary_loss_mlp": 0.01023365, "balance_loss_clip": 0.97790134, "balance_loss_mlp": 1.01559234, "epoch": 0.452113268802982, "flos": 25192377129600.0, "grad_norm": 1.8203920743258069, "language_loss": 0.86267853, "learning_rate": 2.404590544582691e-06, "loss": 0.88468468, "num_input_tokens_seen": 80904570, "step": 3760, "time_per_iteration": 2.705723762512207 }, { "auxiliary_loss_clip": 0.01177376, "auxiliary_loss_mlp": 0.01027098, "balance_loss_clip": 0.89473766, "balance_loss_mlp": 1.01890814, "epoch": 0.45223351169362114, "flos": 39378922312320.0, "grad_norm": 1.6661504436160866, "language_loss": 0.8111366, "learning_rate": 2.403827647836666e-06, "loss": 0.83318138, "num_input_tokens_seen": 80925125, "step": 3761, "time_per_iteration": 4.021667003631592 }, { "auxiliary_loss_clip": 0.01185143, "auxiliary_loss_mlp": 0.0103456, "balance_loss_clip": 1.0530107, "balance_loss_mlp": 1.02627516, "epoch": 0.4523537545842602, "flos": 21582169994880.0, "grad_norm": 1.7405422447748349, "language_loss": 0.69377917, "learning_rate": 2.4030646898303075e-06, "loss": 0.71597624, "num_input_tokens_seen": 80946615, "step": 3762, "time_per_iteration": 3.6665024757385254 }, { "auxiliary_loss_clip": 0.01186912, "auxiliary_loss_mlp": 0.01025693, "balance_loss_clip": 0.97630316, "balance_loss_mlp": 1.01802826, "epoch": 0.4524739974748993, "flos": 28439527547520.0, "grad_norm": 2.0238108103088988, "language_loss": 0.8155483, "learning_rate": 2.4023016706793566e-06, "loss": 0.83767432, "num_input_tokens_seen": 80966410, "step": 3763, "time_per_iteration": 2.7223217487335205 }, { "auxiliary_loss_clip": 0.01090144, "auxiliary_loss_mlp": 0.01007737, "balance_loss_clip": 0.90540528, "balance_loss_mlp": 1.00569844, "epoch": 0.4525942403655384, "flos": 61556492148480.0, "grad_norm": 0.7768083028587515, "language_loss": 0.56884819, "learning_rate": 2.401538590499561e-06, "loss": 0.589827, "num_input_tokens_seen": 81026865, "step": 3764, "time_per_iteration": 3.340073347091675 }, { "auxiliary_loss_clip": 0.01188664, "auxiliary_loss_mlp": 0.0112381, "balance_loss_clip": 1.01739526, "balance_loss_mlp": 0.0, "epoch": 0.45271448325617747, "flos": 27529838680320.0, "grad_norm": 1.7909302879774556, "language_loss": 0.7173506, "learning_rate": 2.400775449406682e-06, "loss": 0.7404753, "num_input_tokens_seen": 81050060, "step": 3765, "time_per_iteration": 2.714311122894287 }, { "auxiliary_loss_clip": 0.01179261, "auxiliary_loss_mlp": 0.01027523, "balance_loss_clip": 1.01195478, "balance_loss_mlp": 1.01989377, "epoch": 0.4528347261468166, "flos": 22452608275200.0, "grad_norm": 1.6709588102739086, "language_loss": 0.7284168, "learning_rate": 2.400012247516485e-06, "loss": 0.75048459, "num_input_tokens_seen": 81070625, "step": 3766, "time_per_iteration": 2.6547048091888428 }, { "auxiliary_loss_clip": 0.0118167, "auxiliary_loss_mlp": 0.01028159, "balance_loss_clip": 0.93730116, "balance_loss_mlp": 1.02017784, "epoch": 0.45295496903745563, "flos": 21103875469440.0, "grad_norm": 1.5982720706408504, "language_loss": 0.89910173, "learning_rate": 2.3992489849447484e-06, "loss": 0.92120004, "num_input_tokens_seen": 81089080, "step": 3767, "time_per_iteration": 3.7038562297821045 }, { "auxiliary_loss_clip": 0.01188944, "auxiliary_loss_mlp": 0.01027215, "balance_loss_clip": 0.93869668, "balance_loss_mlp": 1.01956177, "epoch": 0.45307521192809475, "flos": 23221168606080.0, "grad_norm": 2.3588163445025634, "language_loss": 0.78642607, "learning_rate": 2.3984856618072584e-06, "loss": 0.80858767, "num_input_tokens_seen": 81109115, "step": 3768, "time_per_iteration": 2.6946094036102295 }, { "auxiliary_loss_clip": 0.01184895, "auxiliary_loss_mlp": 0.01032481, "balance_loss_clip": 0.93720877, "balance_loss_mlp": 1.02475619, "epoch": 0.45319545481873386, "flos": 15560094286080.0, "grad_norm": 1.7071027898128648, "language_loss": 0.73694044, "learning_rate": 2.3977222782198098e-06, "loss": 0.75911427, "num_input_tokens_seen": 81127750, "step": 3769, "time_per_iteration": 2.6801202297210693 }, { "auxiliary_loss_clip": 0.01172762, "auxiliary_loss_mlp": 0.01027699, "balance_loss_clip": 0.93727338, "balance_loss_mlp": 1.01931858, "epoch": 0.4533156977093729, "flos": 21944759834880.0, "grad_norm": 1.58524868461853, "language_loss": 0.75147271, "learning_rate": 2.3969588342982077e-06, "loss": 0.77347738, "num_input_tokens_seen": 81147125, "step": 3770, "time_per_iteration": 3.6729164123535156 }, { "auxiliary_loss_clip": 0.01182714, "auxiliary_loss_mlp": 0.01022518, "balance_loss_clip": 1.01708722, "balance_loss_mlp": 1.01471007, "epoch": 0.453435940600012, "flos": 24242180699520.0, "grad_norm": 1.6614025902319216, "language_loss": 0.72669643, "learning_rate": 2.396195330158267e-06, "loss": 0.74874878, "num_input_tokens_seen": 81167015, "step": 3771, "time_per_iteration": 2.7098143100738525 }, { "auxiliary_loss_clip": 0.01183224, "auxiliary_loss_mlp": 0.01027232, "balance_loss_clip": 1.05178308, "balance_loss_mlp": 1.01907182, "epoch": 0.45355618349065113, "flos": 23440367352960.0, "grad_norm": 1.771713530558185, "language_loss": 0.79574078, "learning_rate": 2.3954317659158094e-06, "loss": 0.81784534, "num_input_tokens_seen": 81187350, "step": 3772, "time_per_iteration": 2.66764235496521 }, { "auxiliary_loss_clip": 0.01079442, "auxiliary_loss_mlp": 0.01007052, "balance_loss_clip": 1.01767957, "balance_loss_mlp": 1.00500154, "epoch": 0.4536764263812902, "flos": 66903161448960.0, "grad_norm": 0.8903898146061476, "language_loss": 0.57002759, "learning_rate": 2.394668141686667e-06, "loss": 0.59089249, "num_input_tokens_seen": 81249315, "step": 3773, "time_per_iteration": 3.2156879901885986 }, { "auxiliary_loss_clip": 0.01177251, "auxiliary_loss_mlp": 0.01030459, "balance_loss_clip": 1.01317191, "balance_loss_mlp": 1.02287471, "epoch": 0.4537966692719293, "flos": 42739766254080.0, "grad_norm": 1.9086926314623047, "language_loss": 0.69528961, "learning_rate": 2.3939044575866813e-06, "loss": 0.7173667, "num_input_tokens_seen": 81272065, "step": 3774, "time_per_iteration": 2.798804759979248 }, { "auxiliary_loss_clip": 0.01174681, "auxiliary_loss_mlp": 0.01123876, "balance_loss_clip": 0.97432959, "balance_loss_mlp": 0.0, "epoch": 0.4539169121625684, "flos": 35549480517120.0, "grad_norm": 1.9951859303044275, "language_loss": 0.75332314, "learning_rate": 2.3931407137317024e-06, "loss": 0.77630872, "num_input_tokens_seen": 81292220, "step": 3775, "time_per_iteration": 2.805312156677246 }, { "auxiliary_loss_clip": 0.01175328, "auxiliary_loss_mlp": 0.01026083, "balance_loss_clip": 0.93513429, "balance_loss_mlp": 1.01800692, "epoch": 0.45403715505320746, "flos": 18514716341760.0, "grad_norm": 1.7101309530706774, "language_loss": 0.85073757, "learning_rate": 2.3923769102375907e-06, "loss": 0.87275171, "num_input_tokens_seen": 81311085, "step": 3776, "time_per_iteration": 2.725778818130493 }, { "auxiliary_loss_clip": 0.01182048, "auxiliary_loss_mlp": 0.01026553, "balance_loss_clip": 0.93711126, "balance_loss_mlp": 1.01889932, "epoch": 0.4541573979438466, "flos": 25045825639680.0, "grad_norm": 1.9381004556094092, "language_loss": 0.78743261, "learning_rate": 2.391613047220213e-06, "loss": 0.80951864, "num_input_tokens_seen": 81330985, "step": 3777, "time_per_iteration": 2.7486889362335205 }, { "auxiliary_loss_clip": 0.01187826, "auxiliary_loss_mlp": 0.01026475, "balance_loss_clip": 0.89895982, "balance_loss_mlp": 1.01867843, "epoch": 0.4542776408344857, "flos": 18332397884160.0, "grad_norm": 1.945686416946306, "language_loss": 0.78775311, "learning_rate": 2.390849124795447e-06, "loss": 0.80989611, "num_input_tokens_seen": 81346985, "step": 3778, "time_per_iteration": 2.719949245452881 }, { "auxiliary_loss_clip": 0.01185452, "auxiliary_loss_mlp": 0.0102859, "balance_loss_clip": 1.05383217, "balance_loss_mlp": 1.02114487, "epoch": 0.45439788372512474, "flos": 20701173116160.0, "grad_norm": 1.8839705079653524, "language_loss": 0.84218788, "learning_rate": 2.3900851430791804e-06, "loss": 0.86432832, "num_input_tokens_seen": 81365005, "step": 3779, "time_per_iteration": 2.6288106441497803 }, { "auxiliary_loss_clip": 0.011869, "auxiliary_loss_mlp": 0.01034079, "balance_loss_clip": 1.0527482, "balance_loss_mlp": 1.02580643, "epoch": 0.45451812661576385, "flos": 22309432663680.0, "grad_norm": 1.8915413824717922, "language_loss": 0.84491003, "learning_rate": 2.389321102187307e-06, "loss": 0.86711979, "num_input_tokens_seen": 81383785, "step": 3780, "time_per_iteration": 2.646559000015259 }, { "auxiliary_loss_clip": 0.01180858, "auxiliary_loss_mlp": 0.01124612, "balance_loss_clip": 0.97721398, "balance_loss_mlp": 0.0, "epoch": 0.4546383695064029, "flos": 21763303303680.0, "grad_norm": 1.699573364033841, "language_loss": 0.81295574, "learning_rate": 2.3885570022357326e-06, "loss": 0.83601046, "num_input_tokens_seen": 81402915, "step": 3781, "time_per_iteration": 2.7384495735168457 }, { "auxiliary_loss_clip": 0.01091856, "auxiliary_loss_mlp": 0.01004832, "balance_loss_clip": 0.90705454, "balance_loss_mlp": 1.00267398, "epoch": 0.454758612397042, "flos": 64242755694720.0, "grad_norm": 0.8580401482032136, "language_loss": 0.60918498, "learning_rate": 2.38779284334037e-06, "loss": 0.63015187, "num_input_tokens_seen": 81467890, "step": 3782, "time_per_iteration": 3.4149351119995117 }, { "auxiliary_loss_clip": 0.01160312, "auxiliary_loss_mlp": 0.01031387, "balance_loss_clip": 0.89549148, "balance_loss_mlp": 1.02345347, "epoch": 0.4548788552876811, "flos": 27304175485440.0, "grad_norm": 1.8308780932224726, "language_loss": 0.79184508, "learning_rate": 2.387028625617141e-06, "loss": 0.81376207, "num_input_tokens_seen": 81487105, "step": 3783, "time_per_iteration": 3.0254228115081787 }, { "auxiliary_loss_clip": 0.0116705, "auxiliary_loss_mlp": 0.01028361, "balance_loss_clip": 0.97230744, "balance_loss_mlp": 1.02069557, "epoch": 0.4549990981783202, "flos": 22857142222080.0, "grad_norm": 1.7984585529942891, "language_loss": 0.84878957, "learning_rate": 2.3862643491819766e-06, "loss": 0.87074363, "num_input_tokens_seen": 81505670, "step": 3784, "time_per_iteration": 2.7845592498779297 }, { "auxiliary_loss_clip": 0.01175904, "auxiliary_loss_mlp": 0.01029619, "balance_loss_clip": 1.01008272, "balance_loss_mlp": 1.02204275, "epoch": 0.4551193410689593, "flos": 23258587599360.0, "grad_norm": 1.814998442435533, "language_loss": 0.84617609, "learning_rate": 2.3855000141508186e-06, "loss": 0.8682313, "num_input_tokens_seen": 81525825, "step": 3785, "time_per_iteration": 2.6845178604125977 }, { "auxiliary_loss_clip": 0.01186695, "auxiliary_loss_mlp": 0.01031402, "balance_loss_clip": 0.97966409, "balance_loss_mlp": 1.02266359, "epoch": 0.4552395839595984, "flos": 20777519473920.0, "grad_norm": 2.0303442890777306, "language_loss": 0.83937621, "learning_rate": 2.3847356206396143e-06, "loss": 0.86155713, "num_input_tokens_seen": 81543135, "step": 3786, "time_per_iteration": 2.652322292327881 }, { "auxiliary_loss_clip": 0.01184474, "auxiliary_loss_mlp": 0.01024609, "balance_loss_clip": 1.05368078, "balance_loss_mlp": 1.01643097, "epoch": 0.45535982685023746, "flos": 23257510191360.0, "grad_norm": 1.443513959197461, "language_loss": 0.78599334, "learning_rate": 2.3839711687643227e-06, "loss": 0.80808425, "num_input_tokens_seen": 81564360, "step": 3787, "time_per_iteration": 3.6564278602600098 }, { "auxiliary_loss_clip": 0.01183337, "auxiliary_loss_mlp": 0.01030108, "balance_loss_clip": 1.01559067, "balance_loss_mlp": 1.02164376, "epoch": 0.45548006974087657, "flos": 19646117907840.0, "grad_norm": 2.3694012889912357, "language_loss": 0.73836988, "learning_rate": 2.38320665864091e-06, "loss": 0.76050436, "num_input_tokens_seen": 81583710, "step": 3788, "time_per_iteration": 3.588388681411743 }, { "auxiliary_loss_clip": 0.01175429, "auxiliary_loss_mlp": 0.01024607, "balance_loss_clip": 0.85782599, "balance_loss_mlp": 1.01634622, "epoch": 0.4556003126315157, "flos": 20047778766720.0, "grad_norm": 1.6385584403278506, "language_loss": 0.81732279, "learning_rate": 2.3824420903853516e-06, "loss": 0.83932316, "num_input_tokens_seen": 81602175, "step": 3789, "time_per_iteration": 2.7425825595855713 }, { "auxiliary_loss_clip": 0.01181095, "auxiliary_loss_mlp": 0.01028328, "balance_loss_clip": 1.01502526, "balance_loss_mlp": 1.02059722, "epoch": 0.45572055552215474, "flos": 22959738443520.0, "grad_norm": 1.9969742454265693, "language_loss": 0.81992316, "learning_rate": 2.3816774641136324e-06, "loss": 0.84201735, "num_input_tokens_seen": 81619430, "step": 3790, "time_per_iteration": 2.64505934715271 }, { "auxiliary_loss_clip": 0.01182066, "auxiliary_loss_mlp": 0.01123904, "balance_loss_clip": 1.01661694, "balance_loss_mlp": 0.0, "epoch": 0.45584079841279385, "flos": 33109925535360.0, "grad_norm": 2.005719871857909, "language_loss": 0.71505052, "learning_rate": 2.380912779941745e-06, "loss": 0.73811018, "num_input_tokens_seen": 81642550, "step": 3791, "time_per_iteration": 2.7250747680664062 }, { "auxiliary_loss_clip": 0.01184446, "auxiliary_loss_mlp": 0.01029546, "balance_loss_clip": 1.01316094, "balance_loss_mlp": 1.02085543, "epoch": 0.45596104130343296, "flos": 27272179445760.0, "grad_norm": 3.1929227480002527, "language_loss": 0.83087301, "learning_rate": 2.3801480379856918e-06, "loss": 0.85301286, "num_input_tokens_seen": 81664260, "step": 3792, "time_per_iteration": 2.7316672801971436 }, { "auxiliary_loss_clip": 0.01185183, "auxiliary_loss_mlp": 0.01032922, "balance_loss_clip": 0.97806108, "balance_loss_mlp": 1.02410698, "epoch": 0.456081284194072, "flos": 21579799697280.0, "grad_norm": 1.5513704456898192, "language_loss": 0.83416927, "learning_rate": 2.379383238361484e-06, "loss": 0.8563503, "num_input_tokens_seen": 81683620, "step": 3793, "time_per_iteration": 3.5576796531677246 }, { "auxiliary_loss_clip": 0.01178483, "auxiliary_loss_mlp": 0.01030418, "balance_loss_clip": 1.01284575, "balance_loss_mlp": 1.02235365, "epoch": 0.4562015270847111, "flos": 35918822113920.0, "grad_norm": 2.1964254473253764, "language_loss": 0.79674876, "learning_rate": 2.3786183811851407e-06, "loss": 0.81883776, "num_input_tokens_seen": 81704325, "step": 3794, "time_per_iteration": 2.731013774871826 }, { "auxiliary_loss_clip": 0.01187262, "auxiliary_loss_mlp": 0.01033206, "balance_loss_clip": 1.05587041, "balance_loss_mlp": 1.0244863, "epoch": 0.45632176997535023, "flos": 13589783602560.0, "grad_norm": 1.8289714963148282, "language_loss": 0.8022666, "learning_rate": 2.3778534665726892e-06, "loss": 0.82447135, "num_input_tokens_seen": 81721155, "step": 3795, "time_per_iteration": 3.477999210357666 }, { "auxiliary_loss_clip": 0.0117062, "auxiliary_loss_mlp": 0.0103104, "balance_loss_clip": 1.01270151, "balance_loss_mlp": 1.02310705, "epoch": 0.4564420128659893, "flos": 32635401937920.0, "grad_norm": 1.7204685990257371, "language_loss": 0.72053087, "learning_rate": 2.377088494640168e-06, "loss": 0.74254751, "num_input_tokens_seen": 81742905, "step": 3796, "time_per_iteration": 2.7295708656311035 }, { "auxiliary_loss_clip": 0.01179704, "auxiliary_loss_mlp": 0.01028407, "balance_loss_clip": 1.01626229, "balance_loss_mlp": 1.02042639, "epoch": 0.4565622557566284, "flos": 20377690208640.0, "grad_norm": 2.1699027249472898, "language_loss": 0.78114593, "learning_rate": 2.3763234655036216e-06, "loss": 0.80322701, "num_input_tokens_seen": 81762105, "step": 3797, "time_per_iteration": 2.6873841285705566 }, { "auxiliary_loss_clip": 0.01176276, "auxiliary_loss_mlp": 0.0103029, "balance_loss_clip": 0.93396628, "balance_loss_mlp": 1.02210033, "epoch": 0.45668249864726745, "flos": 25374372364800.0, "grad_norm": 1.830403477819858, "language_loss": 0.87218171, "learning_rate": 2.3755583792791046e-06, "loss": 0.89424735, "num_input_tokens_seen": 81781975, "step": 3798, "time_per_iteration": 2.8111095428466797 }, { "auxiliary_loss_clip": 0.01185333, "auxiliary_loss_mlp": 0.0103067, "balance_loss_clip": 1.01517105, "balance_loss_mlp": 1.02259934, "epoch": 0.45680274153790656, "flos": 15559806977280.0, "grad_norm": 1.786084467119313, "language_loss": 0.74274522, "learning_rate": 2.3747932360826803e-06, "loss": 0.76490521, "num_input_tokens_seen": 81798905, "step": 3799, "time_per_iteration": 2.6749398708343506 }, { "auxiliary_loss_clip": 0.01182348, "auxiliary_loss_mlp": 0.01028553, "balance_loss_clip": 1.01570618, "balance_loss_mlp": 1.02013135, "epoch": 0.4569229844285457, "flos": 19792884879360.0, "grad_norm": 2.2422164691391617, "language_loss": 0.82500923, "learning_rate": 2.3740280360304205e-06, "loss": 0.84711826, "num_input_tokens_seen": 81816630, "step": 3800, "time_per_iteration": 2.6122946739196777 }, { "auxiliary_loss_clip": 0.01179625, "auxiliary_loss_mlp": 0.01024473, "balance_loss_clip": 0.93930167, "balance_loss_mlp": 1.01623547, "epoch": 0.45704322731918473, "flos": 24093941270400.0, "grad_norm": 1.597851110680576, "language_loss": 0.68066919, "learning_rate": 2.3732627792384038e-06, "loss": 0.70271021, "num_input_tokens_seen": 81837700, "step": 3801, "time_per_iteration": 2.752634286880493 }, { "auxiliary_loss_clip": 0.01183637, "auxiliary_loss_mlp": 0.01023891, "balance_loss_clip": 1.0521946, "balance_loss_mlp": 1.01600814, "epoch": 0.45716347020982384, "flos": 31317803245440.0, "grad_norm": 1.8455698670441048, "language_loss": 0.74904609, "learning_rate": 2.3724974658227207e-06, "loss": 0.77112138, "num_input_tokens_seen": 81858490, "step": 3802, "time_per_iteration": 2.6603589057922363 }, { "auxiliary_loss_clip": 0.01181426, "auxiliary_loss_mlp": 0.01124456, "balance_loss_clip": 0.97787392, "balance_loss_mlp": 0.0, "epoch": 0.45728371310046295, "flos": 26501392471680.0, "grad_norm": 1.7255970585397673, "language_loss": 0.71198219, "learning_rate": 2.3717320958994687e-06, "loss": 0.73504108, "num_input_tokens_seen": 81876050, "step": 3803, "time_per_iteration": 2.7176735401153564 }, { "auxiliary_loss_clip": 0.01174907, "auxiliary_loss_mlp": 0.01022348, "balance_loss_clip": 0.93141836, "balance_loss_mlp": 1.01426005, "epoch": 0.457403955991102, "flos": 17929408222080.0, "grad_norm": 2.9753636691499414, "language_loss": 0.70342004, "learning_rate": 2.3709666695847534e-06, "loss": 0.72539258, "num_input_tokens_seen": 81894230, "step": 3804, "time_per_iteration": 2.702040433883667 }, { "auxiliary_loss_clip": 0.01167772, "auxiliary_loss_mlp": 0.01031356, "balance_loss_clip": 0.89621824, "balance_loss_mlp": 1.02321398, "epoch": 0.4575241988817411, "flos": 42230660837760.0, "grad_norm": 1.6090865191546002, "language_loss": 0.70097685, "learning_rate": 2.370201186994689e-06, "loss": 0.72296804, "num_input_tokens_seen": 81917915, "step": 3805, "time_per_iteration": 2.9132587909698486 }, { "auxiliary_loss_clip": 0.01172854, "auxiliary_loss_mlp": 0.01033511, "balance_loss_clip": 0.97712159, "balance_loss_mlp": 1.02562571, "epoch": 0.45764444177238023, "flos": 30117309868800.0, "grad_norm": 1.8018461350623733, "language_loss": 0.69399405, "learning_rate": 2.369435648245399e-06, "loss": 0.71605772, "num_input_tokens_seen": 81938130, "step": 3806, "time_per_iteration": 2.857778787612915 }, { "auxiliary_loss_clip": 0.01178568, "auxiliary_loss_mlp": 0.01033128, "balance_loss_clip": 0.97540635, "balance_loss_mlp": 1.02491474, "epoch": 0.4577646846630193, "flos": 24060293205120.0, "grad_norm": 1.6844729684531932, "language_loss": 0.85209835, "learning_rate": 2.368670053453015e-06, "loss": 0.8742153, "num_input_tokens_seen": 81959820, "step": 3807, "time_per_iteration": 2.7183432579040527 }, { "auxiliary_loss_clip": 0.01190525, "auxiliary_loss_mlp": 0.01033637, "balance_loss_clip": 1.01793647, "balance_loss_mlp": 1.02560198, "epoch": 0.4578849275536584, "flos": 17418578952960.0, "grad_norm": 2.196063519058592, "language_loss": 0.74680692, "learning_rate": 2.3679044027336757e-06, "loss": 0.76904845, "num_input_tokens_seen": 81975710, "step": 3808, "time_per_iteration": 2.6249353885650635 }, { "auxiliary_loss_clip": 0.01187008, "auxiliary_loss_mlp": 0.0103419, "balance_loss_clip": 1.05441213, "balance_loss_mlp": 1.02577376, "epoch": 0.4580051704442975, "flos": 13510169107200.0, "grad_norm": 4.3623021598367195, "language_loss": 0.68965626, "learning_rate": 2.3671386962035326e-06, "loss": 0.71186829, "num_input_tokens_seen": 81993180, "step": 3809, "time_per_iteration": 2.5672624111175537 }, { "auxiliary_loss_clip": 0.0118695, "auxiliary_loss_mlp": 0.01030028, "balance_loss_clip": 1.01610804, "balance_loss_mlp": 1.02186227, "epoch": 0.45812541333493656, "flos": 18037606965120.0, "grad_norm": 2.1095494289547103, "language_loss": 0.68593454, "learning_rate": 2.3663729339787405e-06, "loss": 0.70810437, "num_input_tokens_seen": 82010115, "step": 3810, "time_per_iteration": 2.6907007694244385 }, { "auxiliary_loss_clip": 0.01186829, "auxiliary_loss_mlp": 0.0102782, "balance_loss_clip": 1.05495405, "balance_loss_mlp": 1.01930833, "epoch": 0.45824565622557567, "flos": 20222196232320.0, "grad_norm": 2.542910176283857, "language_loss": 0.73616183, "learning_rate": 2.365607116175466e-06, "loss": 0.75830841, "num_input_tokens_seen": 82025540, "step": 3811, "time_per_iteration": 2.6223223209381104 }, { "auxiliary_loss_clip": 0.01185081, "auxiliary_loss_mlp": 0.01036824, "balance_loss_clip": 1.05439019, "balance_loss_mlp": 1.02784133, "epoch": 0.4583658991162148, "flos": 19864885691520.0, "grad_norm": 2.8702922796605024, "language_loss": 0.66638899, "learning_rate": 2.3648412429098825e-06, "loss": 0.68860805, "num_input_tokens_seen": 82043890, "step": 3812, "time_per_iteration": 2.6961464881896973 }, { "auxiliary_loss_clip": 0.01176663, "auxiliary_loss_mlp": 0.01027986, "balance_loss_clip": 0.93841553, "balance_loss_mlp": 1.01928353, "epoch": 0.45848614200685384, "flos": 21029935322880.0, "grad_norm": 1.7281813184100023, "language_loss": 0.81993091, "learning_rate": 2.364075314298172e-06, "loss": 0.84197736, "num_input_tokens_seen": 82061345, "step": 3813, "time_per_iteration": 3.67948842048645 }, { "auxiliary_loss_clip": 0.01185343, "auxiliary_loss_mlp": 0.01124209, "balance_loss_clip": 1.01491022, "balance_loss_mlp": 0.0, "epoch": 0.45860638489749295, "flos": 21069293650560.0, "grad_norm": 2.0256014079332534, "language_loss": 0.70432442, "learning_rate": 2.3633093304565267e-06, "loss": 0.72741997, "num_input_tokens_seen": 82080400, "step": 3814, "time_per_iteration": 3.5173418521881104 }, { "auxiliary_loss_clip": 0.01189077, "auxiliary_loss_mlp": 0.01028027, "balance_loss_clip": 1.05438042, "balance_loss_mlp": 1.0201056, "epoch": 0.458726627788132, "flos": 26833889692800.0, "grad_norm": 1.8114806138524737, "language_loss": 0.6346159, "learning_rate": 2.3625432915011443e-06, "loss": 0.65678698, "num_input_tokens_seen": 82102310, "step": 3815, "time_per_iteration": 2.6497881412506104 }, { "auxiliary_loss_clip": 0.01173319, "auxiliary_loss_mlp": 0.01033452, "balance_loss_clip": 0.97572273, "balance_loss_mlp": 1.02520275, "epoch": 0.4588468706787711, "flos": 24097927680000.0, "grad_norm": 1.5943132555184534, "language_loss": 0.65272492, "learning_rate": 2.3617771975482334e-06, "loss": 0.67479265, "num_input_tokens_seen": 82121140, "step": 3816, "time_per_iteration": 2.645253896713257 }, { "auxiliary_loss_clip": 0.01175382, "auxiliary_loss_mlp": 0.01033814, "balance_loss_clip": 0.89938545, "balance_loss_mlp": 1.02585101, "epoch": 0.4589671135694102, "flos": 17889331622400.0, "grad_norm": 1.5682447196612825, "language_loss": 0.7472744, "learning_rate": 2.3610110487140083e-06, "loss": 0.76936638, "num_input_tokens_seen": 82139575, "step": 3817, "time_per_iteration": 2.759247303009033 }, { "auxiliary_loss_clip": 0.01186453, "auxiliary_loss_mlp": 0.01027181, "balance_loss_clip": 0.98014164, "balance_loss_mlp": 1.01906824, "epoch": 0.4590873564600493, "flos": 25626967781760.0, "grad_norm": 1.6830542629018956, "language_loss": 0.80357945, "learning_rate": 2.360244845114695e-06, "loss": 0.82571578, "num_input_tokens_seen": 82159195, "step": 3818, "time_per_iteration": 2.7439253330230713 }, { "auxiliary_loss_clip": 0.01180211, "auxiliary_loss_mlp": 0.01028246, "balance_loss_clip": 0.98011583, "balance_loss_mlp": 1.0201335, "epoch": 0.4592075993506884, "flos": 18514788168960.0, "grad_norm": 2.0760030119771744, "language_loss": 0.68106341, "learning_rate": 2.3594785868665245e-06, "loss": 0.70314795, "num_input_tokens_seen": 82175500, "step": 3819, "time_per_iteration": 3.5506248474121094 }, { "auxiliary_loss_clip": 0.01182124, "auxiliary_loss_mlp": 0.01124138, "balance_loss_clip": 0.93856978, "balance_loss_mlp": 0.0, "epoch": 0.4593278422413275, "flos": 20631111638400.0, "grad_norm": 1.951497147305843, "language_loss": 0.80660689, "learning_rate": 2.3587122740857386e-06, "loss": 0.82966954, "num_input_tokens_seen": 82192600, "step": 3820, "time_per_iteration": 2.7286558151245117 }, { "auxiliary_loss_clip": 0.01178396, "auxiliary_loss_mlp": 0.01029449, "balance_loss_clip": 1.01265621, "balance_loss_mlp": 1.02126515, "epoch": 0.45944808513196655, "flos": 21358517961600.0, "grad_norm": 1.5408923779362895, "language_loss": 0.77671009, "learning_rate": 2.357945906888586e-06, "loss": 0.79878855, "num_input_tokens_seen": 82212040, "step": 3821, "time_per_iteration": 3.5359303951263428 }, { "auxiliary_loss_clip": 0.01184829, "auxiliary_loss_mlp": 0.01026538, "balance_loss_clip": 1.01585925, "balance_loss_mlp": 1.01822925, "epoch": 0.45956832802260567, "flos": 21427789340160.0, "grad_norm": 2.1294778126100176, "language_loss": 0.79579729, "learning_rate": 2.357179485391324e-06, "loss": 0.81791091, "num_input_tokens_seen": 82229895, "step": 3822, "time_per_iteration": 2.66460919380188 }, { "auxiliary_loss_clip": 0.01179925, "auxiliary_loss_mlp": 0.01026933, "balance_loss_clip": 1.05235553, "balance_loss_mlp": 1.01904106, "epoch": 0.4596885709132448, "flos": 22382654538240.0, "grad_norm": 2.0842829868041184, "language_loss": 0.86367381, "learning_rate": 2.3564130097102173e-06, "loss": 0.88574237, "num_input_tokens_seen": 82249550, "step": 3823, "time_per_iteration": 2.6353225708007812 }, { "auxiliary_loss_clip": 0.01179354, "auxiliary_loss_mlp": 0.01032258, "balance_loss_clip": 0.98216528, "balance_loss_mlp": 1.02427673, "epoch": 0.45980881380388383, "flos": 28981957806720.0, "grad_norm": 1.9651772574657023, "language_loss": 0.75103402, "learning_rate": 2.355646479961541e-06, "loss": 0.77315015, "num_input_tokens_seen": 82268860, "step": 3824, "time_per_iteration": 2.7355360984802246 }, { "auxiliary_loss_clip": 0.01183618, "auxiliary_loss_mlp": 0.01026593, "balance_loss_clip": 1.05333591, "balance_loss_mlp": 1.01827252, "epoch": 0.45992905669452294, "flos": 33396599980800.0, "grad_norm": 1.8922469269289623, "language_loss": 0.71600753, "learning_rate": 2.354879896261576e-06, "loss": 0.73810971, "num_input_tokens_seen": 82289070, "step": 3825, "time_per_iteration": 2.7962467670440674 }, { "auxiliary_loss_clip": 0.01171709, "auxiliary_loss_mlp": 0.01025821, "balance_loss_clip": 0.93803847, "balance_loss_mlp": 1.0182755, "epoch": 0.46004929958516205, "flos": 36318184502400.0, "grad_norm": 2.0029293233666645, "language_loss": 0.56310928, "learning_rate": 2.3541132587266133e-06, "loss": 0.58508456, "num_input_tokens_seen": 82311790, "step": 3826, "time_per_iteration": 2.8156814575195312 }, { "auxiliary_loss_clip": 0.01185668, "auxiliary_loss_mlp": 0.01026084, "balance_loss_clip": 0.93871009, "balance_loss_mlp": 1.01804888, "epoch": 0.4601695424758011, "flos": 17238451224960.0, "grad_norm": 1.6493037201061789, "language_loss": 0.69072419, "learning_rate": 2.3533465674729515e-06, "loss": 0.71284175, "num_input_tokens_seen": 82329020, "step": 3827, "time_per_iteration": 2.7423095703125 }, { "auxiliary_loss_clip": 0.01187074, "auxiliary_loss_mlp": 0.01028374, "balance_loss_clip": 1.05606222, "balance_loss_mlp": 1.02065825, "epoch": 0.4602897853664402, "flos": 15888425529600.0, "grad_norm": 1.8553320949488334, "language_loss": 0.73208141, "learning_rate": 2.352579822616895e-06, "loss": 0.75423586, "num_input_tokens_seen": 82346455, "step": 3828, "time_per_iteration": 2.600799322128296 }, { "auxiliary_loss_clip": 0.01182628, "auxiliary_loss_mlp": 0.01026349, "balance_loss_clip": 0.97658014, "balance_loss_mlp": 1.01774812, "epoch": 0.4604100282570793, "flos": 25412617370880.0, "grad_norm": 2.0052431814584204, "language_loss": 0.77775145, "learning_rate": 2.351813024274761e-06, "loss": 0.79984128, "num_input_tokens_seen": 82367810, "step": 3829, "time_per_iteration": 2.7686009407043457 }, { "auxiliary_loss_clip": 0.01187007, "auxiliary_loss_mlp": 0.01027327, "balance_loss_clip": 0.94031775, "balance_loss_mlp": 1.01960194, "epoch": 0.4605302711477184, "flos": 27630711048960.0, "grad_norm": 1.6790099460518757, "language_loss": 0.73325312, "learning_rate": 2.3510461725628693e-06, "loss": 0.75539649, "num_input_tokens_seen": 82388275, "step": 3830, "time_per_iteration": 2.7566823959350586 }, { "auxiliary_loss_clip": 0.01181036, "auxiliary_loss_mlp": 0.01027925, "balance_loss_clip": 0.93727344, "balance_loss_mlp": 1.02043271, "epoch": 0.4606505140383575, "flos": 23839657914240.0, "grad_norm": 1.7243639091410434, "language_loss": 0.70945883, "learning_rate": 2.350279267597554e-06, "loss": 0.73154843, "num_input_tokens_seen": 82408915, "step": 3831, "time_per_iteration": 2.719383955001831 }, { "auxiliary_loss_clip": 0.01181918, "auxiliary_loss_mlp": 0.01032003, "balance_loss_clip": 1.01486206, "balance_loss_mlp": 1.02300286, "epoch": 0.46077075692899655, "flos": 16107013745280.0, "grad_norm": 1.9548565196943377, "language_loss": 0.82646012, "learning_rate": 2.3495123094951515e-06, "loss": 0.84859931, "num_input_tokens_seen": 82427260, "step": 3832, "time_per_iteration": 2.611849784851074 }, { "auxiliary_loss_clip": 0.01171589, "auxiliary_loss_mlp": 0.01031332, "balance_loss_clip": 0.97465456, "balance_loss_mlp": 1.02307129, "epoch": 0.46089099981963566, "flos": 48798147634560.0, "grad_norm": 1.9650518780893562, "language_loss": 0.7558074, "learning_rate": 2.34874529837201e-06, "loss": 0.77783662, "num_input_tokens_seen": 82450805, "step": 3833, "time_per_iteration": 2.9439823627471924 }, { "auxiliary_loss_clip": 0.01169569, "auxiliary_loss_mlp": 0.010305, "balance_loss_clip": 0.85887623, "balance_loss_mlp": 1.02216148, "epoch": 0.46101124271027477, "flos": 19099234362240.0, "grad_norm": 2.739115032846531, "language_loss": 0.7897929, "learning_rate": 2.347978234344483e-06, "loss": 0.81179357, "num_input_tokens_seen": 82467010, "step": 3834, "time_per_iteration": 2.7714545726776123 }, { "auxiliary_loss_clip": 0.01187711, "auxiliary_loss_mlp": 0.01027004, "balance_loss_clip": 1.01648498, "balance_loss_mlp": 1.01884985, "epoch": 0.4611314856009138, "flos": 39347931853440.0, "grad_norm": 1.647436510825817, "language_loss": 0.68770397, "learning_rate": 2.347211117528935e-06, "loss": 0.70985115, "num_input_tokens_seen": 82489310, "step": 3835, "time_per_iteration": 2.8416521549224854 }, { "auxiliary_loss_clip": 0.01191763, "auxiliary_loss_mlp": 0.01031373, "balance_loss_clip": 0.94380689, "balance_loss_mlp": 1.02329087, "epoch": 0.46125172849155294, "flos": 20810772489600.0, "grad_norm": 1.4683842584578253, "language_loss": 0.71696329, "learning_rate": 2.3464439480417374e-06, "loss": 0.73919463, "num_input_tokens_seen": 82508830, "step": 3836, "time_per_iteration": 2.72100567817688 }, { "auxiliary_loss_clip": 0.01188003, "auxiliary_loss_mlp": 0.0102894, "balance_loss_clip": 1.01698697, "balance_loss_mlp": 1.02035117, "epoch": 0.46137197138219205, "flos": 17930808852480.0, "grad_norm": 2.5258588759431664, "language_loss": 0.77162814, "learning_rate": 2.3456767259992676e-06, "loss": 0.79379749, "num_input_tokens_seen": 82526475, "step": 3837, "time_per_iteration": 2.656412124633789 }, { "auxiliary_loss_clip": 0.0118454, "auxiliary_loss_mlp": 0.01124044, "balance_loss_clip": 1.05306292, "balance_loss_mlp": 0.0, "epoch": 0.4614922142728311, "flos": 16836610798080.0, "grad_norm": 2.0194465605870513, "language_loss": 0.88844383, "learning_rate": 2.3449094515179135e-06, "loss": 0.91152966, "num_input_tokens_seen": 82543935, "step": 3838, "time_per_iteration": 3.5131497383117676 }, { "auxiliary_loss_clip": 0.01184051, "auxiliary_loss_mlp": 0.01038526, "balance_loss_clip": 0.97525936, "balance_loss_mlp": 1.03032494, "epoch": 0.4616124571634702, "flos": 26614906427520.0, "grad_norm": 1.5308363610817732, "language_loss": 0.8192879, "learning_rate": 2.34414212471407e-06, "loss": 0.84151363, "num_input_tokens_seen": 82563730, "step": 3839, "time_per_iteration": 2.707132339477539 }, { "auxiliary_loss_clip": 0.0118804, "auxiliary_loss_mlp": 0.01027339, "balance_loss_clip": 1.01581883, "balance_loss_mlp": 1.01907802, "epoch": 0.4617327000541093, "flos": 20340127560960.0, "grad_norm": 1.8369661257382301, "language_loss": 0.72774088, "learning_rate": 2.3433747457041394e-06, "loss": 0.74989468, "num_input_tokens_seen": 82582435, "step": 3840, "time_per_iteration": 3.815237283706665 }, { "auxiliary_loss_clip": 0.01180986, "auxiliary_loss_mlp": 0.01027129, "balance_loss_clip": 0.94150841, "balance_loss_mlp": 1.01843286, "epoch": 0.4618529429447484, "flos": 29570749545600.0, "grad_norm": 1.8770248795738647, "language_loss": 0.84990788, "learning_rate": 2.342607314604533e-06, "loss": 0.87198901, "num_input_tokens_seen": 82602185, "step": 3841, "time_per_iteration": 2.858339786529541 }, { "auxiliary_loss_clip": 0.01182152, "auxiliary_loss_mlp": 0.01029727, "balance_loss_clip": 1.01651549, "balance_loss_mlp": 1.02138853, "epoch": 0.4619731858353875, "flos": 19787030962560.0, "grad_norm": 2.53678503985766, "language_loss": 0.8394866, "learning_rate": 2.3418398315316694e-06, "loss": 0.86160541, "num_input_tokens_seen": 82620005, "step": 3842, "time_per_iteration": 2.650202989578247 }, { "auxiliary_loss_clip": 0.01183819, "auxiliary_loss_mlp": 0.01029347, "balance_loss_clip": 1.05494881, "balance_loss_mlp": 1.02156818, "epoch": 0.4620934287260266, "flos": 18951138587520.0, "grad_norm": 3.1518938239844556, "language_loss": 0.78237927, "learning_rate": 2.3410722966019755e-06, "loss": 0.80451095, "num_input_tokens_seen": 82635120, "step": 3843, "time_per_iteration": 2.639774799346924 }, { "auxiliary_loss_clip": 0.01181046, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.01508665, "balance_loss_mlp": 1.02192092, "epoch": 0.46221367161666566, "flos": 37341674634240.0, "grad_norm": 1.6563904534479927, "language_loss": 0.65728843, "learning_rate": 2.3403047099318848e-06, "loss": 0.67939472, "num_input_tokens_seen": 82659190, "step": 3844, "time_per_iteration": 2.741058349609375 }, { "auxiliary_loss_clip": 0.01167483, "auxiliary_loss_mlp": 0.01027912, "balance_loss_clip": 0.8950069, "balance_loss_mlp": 1.0199548, "epoch": 0.46233391450730477, "flos": 14428549065600.0, "grad_norm": 2.0946380154339046, "language_loss": 0.75053227, "learning_rate": 2.3395370716378405e-06, "loss": 0.77248621, "num_input_tokens_seen": 82676635, "step": 3845, "time_per_iteration": 3.645231246948242 }, { "auxiliary_loss_clip": 0.01184452, "auxiliary_loss_mlp": 0.01030277, "balance_loss_clip": 1.01390886, "balance_loss_mlp": 1.0223496, "epoch": 0.4624541573979438, "flos": 22493044010880.0, "grad_norm": 1.9920172206086932, "language_loss": 0.72616172, "learning_rate": 2.338769381836292e-06, "loss": 0.74830902, "num_input_tokens_seen": 82696245, "step": 3846, "time_per_iteration": 2.7148256301879883 }, { "auxiliary_loss_clip": 0.01178245, "auxiliary_loss_mlp": 0.0102816, "balance_loss_clip": 0.94020748, "balance_loss_mlp": 1.02008986, "epoch": 0.46257440028858293, "flos": 14465070218880.0, "grad_norm": 1.8730180806672967, "language_loss": 0.729195, "learning_rate": 2.3380016406436984e-06, "loss": 0.75125909, "num_input_tokens_seen": 82713725, "step": 3847, "time_per_iteration": 2.6698362827301025 }, { "auxiliary_loss_clip": 0.01176354, "auxiliary_loss_mlp": 0.0103228, "balance_loss_clip": 0.90202999, "balance_loss_mlp": 1.02378654, "epoch": 0.46269464317922204, "flos": 23332204523520.0, "grad_norm": 1.8741162703428256, "language_loss": 0.81016481, "learning_rate": 2.337233848176524e-06, "loss": 0.83225119, "num_input_tokens_seen": 82731495, "step": 3848, "time_per_iteration": 3.674501895904541 }, { "auxiliary_loss_clip": 0.01165927, "auxiliary_loss_mlp": 0.01026515, "balance_loss_clip": 0.89600819, "balance_loss_mlp": 1.01738322, "epoch": 0.4628148860698611, "flos": 18552027594240.0, "grad_norm": 1.8361997484755892, "language_loss": 0.83239329, "learning_rate": 2.3364660045512435e-06, "loss": 0.85431767, "num_input_tokens_seen": 82750255, "step": 3849, "time_per_iteration": 2.680731773376465 }, { "auxiliary_loss_clip": 0.01082765, "auxiliary_loss_mlp": 0.0100051, "balance_loss_clip": 0.94568151, "balance_loss_mlp": 0.99847144, "epoch": 0.4629351289605002, "flos": 70667569670400.0, "grad_norm": 0.757556139633834, "language_loss": 0.58219266, "learning_rate": 2.335698109884337e-06, "loss": 0.60302538, "num_input_tokens_seen": 82815460, "step": 3850, "time_per_iteration": 3.4448745250701904 }, { "auxiliary_loss_clip": 0.01112141, "auxiliary_loss_mlp": 0.01001818, "balance_loss_clip": 0.89425069, "balance_loss_mlp": 0.99974364, "epoch": 0.4630553718511393, "flos": 59687200465920.0, "grad_norm": 0.7942360508901716, "language_loss": 0.59893233, "learning_rate": 2.334930164292294e-06, "loss": 0.62007189, "num_input_tokens_seen": 82878010, "step": 3851, "time_per_iteration": 3.482140302658081 }, { "auxiliary_loss_clip": 0.01166655, "auxiliary_loss_mlp": 0.01029409, "balance_loss_clip": 0.8954528, "balance_loss_mlp": 1.02153504, "epoch": 0.4631756147417784, "flos": 15960605909760.0, "grad_norm": 1.9357242428676125, "language_loss": 0.80460685, "learning_rate": 2.334162167891612e-06, "loss": 0.82656753, "num_input_tokens_seen": 82895275, "step": 3852, "time_per_iteration": 2.7333786487579346 }, { "auxiliary_loss_clip": 0.01180732, "auxiliary_loss_mlp": 0.0103301, "balance_loss_clip": 0.9746629, "balance_loss_mlp": 1.02541971, "epoch": 0.4632958576324175, "flos": 16472907636480.0, "grad_norm": 2.2110523263100594, "language_loss": 0.75163972, "learning_rate": 2.333394120798795e-06, "loss": 0.77377707, "num_input_tokens_seen": 82914010, "step": 3853, "time_per_iteration": 2.6947226524353027 }, { "auxiliary_loss_clip": 0.01180269, "auxiliary_loss_mlp": 0.01026578, "balance_loss_clip": 0.9750998, "balance_loss_mlp": 1.01921117, "epoch": 0.4634161005230566, "flos": 22346492520960.0, "grad_norm": 1.954353430300777, "language_loss": 0.72096181, "learning_rate": 2.3326260231303545e-06, "loss": 0.74303031, "num_input_tokens_seen": 82932610, "step": 3854, "time_per_iteration": 2.7049570083618164 }, { "auxiliary_loss_clip": 0.01179877, "auxiliary_loss_mlp": 0.01027144, "balance_loss_clip": 1.05338931, "balance_loss_mlp": 1.01955009, "epoch": 0.46353634341369565, "flos": 15742233175680.0, "grad_norm": 1.5136994704187066, "language_loss": 0.86507386, "learning_rate": 2.331857875002811e-06, "loss": 0.88714409, "num_input_tokens_seen": 82951210, "step": 3855, "time_per_iteration": 2.6964190006256104 }, { "auxiliary_loss_clip": 0.01180178, "auxiliary_loss_mlp": 0.01030462, "balance_loss_clip": 0.97794962, "balance_loss_mlp": 1.02250528, "epoch": 0.46365658630433476, "flos": 28329820433280.0, "grad_norm": 2.022613735261913, "language_loss": 0.7654987, "learning_rate": 2.3310896765326916e-06, "loss": 0.78760505, "num_input_tokens_seen": 82972210, "step": 3856, "time_per_iteration": 2.8018226623535156 }, { "auxiliary_loss_clip": 0.01175027, "auxiliary_loss_mlp": 0.01028066, "balance_loss_clip": 0.93763667, "balance_loss_mlp": 1.02019858, "epoch": 0.46377682919497387, "flos": 24608074590720.0, "grad_norm": 2.20575724883469, "language_loss": 0.83869159, "learning_rate": 2.330321427836531e-06, "loss": 0.86072254, "num_input_tokens_seen": 82994080, "step": 3857, "time_per_iteration": 2.739224672317505 }, { "auxiliary_loss_clip": 0.01177744, "auxiliary_loss_mlp": 0.01025, "balance_loss_clip": 1.01466286, "balance_loss_mlp": 1.01724529, "epoch": 0.4638970720856129, "flos": 19060953442560.0, "grad_norm": 1.684736097743485, "language_loss": 0.82616723, "learning_rate": 2.3295531290308733e-06, "loss": 0.84819466, "num_input_tokens_seen": 83012230, "step": 3858, "time_per_iteration": 2.6818301677703857 }, { "auxiliary_loss_clip": 0.01186793, "auxiliary_loss_mlp": 0.01123951, "balance_loss_clip": 1.05566192, "balance_loss_mlp": 0.0, "epoch": 0.46401731497625204, "flos": 18471012468480.0, "grad_norm": 2.5182625719853053, "language_loss": 0.74947381, "learning_rate": 2.3287847802322678e-06, "loss": 0.77258134, "num_input_tokens_seen": 83027800, "step": 3859, "time_per_iteration": 2.6191656589508057 }, { "auxiliary_loss_clip": 0.01190076, "auxiliary_loss_mlp": 0.01029009, "balance_loss_clip": 0.97947669, "balance_loss_mlp": 1.02065253, "epoch": 0.4641375578668911, "flos": 26067053214720.0, "grad_norm": 1.8322138081494195, "language_loss": 0.83946031, "learning_rate": 2.3280163815572723e-06, "loss": 0.86165118, "num_input_tokens_seen": 83048395, "step": 3860, "time_per_iteration": 2.718862771987915 }, { "auxiliary_loss_clip": 0.0117111, "auxiliary_loss_mlp": 0.0102998, "balance_loss_clip": 0.97470117, "balance_loss_mlp": 1.02242839, "epoch": 0.4642578007575302, "flos": 19570382081280.0, "grad_norm": 1.797255797172715, "language_loss": 0.76901805, "learning_rate": 2.3272479331224522e-06, "loss": 0.79102898, "num_input_tokens_seen": 83065825, "step": 3861, "time_per_iteration": 2.6821842193603516 }, { "auxiliary_loss_clip": 0.01182542, "auxiliary_loss_mlp": 0.01028975, "balance_loss_clip": 1.05219114, "balance_loss_mlp": 1.02095771, "epoch": 0.4643780436481693, "flos": 28186249772160.0, "grad_norm": 2.727246998783956, "language_loss": 0.77932179, "learning_rate": 2.3264794350443817e-06, "loss": 0.80143696, "num_input_tokens_seen": 83087920, "step": 3862, "time_per_iteration": 2.717923879623413 }, { "auxiliary_loss_clip": 0.0118116, "auxiliary_loss_mlp": 0.01028517, "balance_loss_clip": 1.01225626, "balance_loss_mlp": 1.02048826, "epoch": 0.46449828653880837, "flos": 25375270204800.0, "grad_norm": 1.9076046784734355, "language_loss": 0.7885679, "learning_rate": 2.3257108874396396e-06, "loss": 0.81066465, "num_input_tokens_seen": 83109015, "step": 3863, "time_per_iteration": 2.696209192276001 }, { "auxiliary_loss_clip": 0.01178048, "auxiliary_loss_mlp": 0.01036474, "balance_loss_clip": 0.97500682, "balance_loss_mlp": 1.02826691, "epoch": 0.4646185294294475, "flos": 16034330574720.0, "grad_norm": 1.7675181886146214, "language_loss": 0.73284364, "learning_rate": 2.3249422904248152e-06, "loss": 0.75498879, "num_input_tokens_seen": 83127450, "step": 3864, "time_per_iteration": 2.7591705322265625 }, { "auxiliary_loss_clip": 0.01185218, "auxiliary_loss_mlp": 0.0102692, "balance_loss_clip": 1.01498759, "balance_loss_mlp": 1.01874804, "epoch": 0.4647387723200866, "flos": 26363101109760.0, "grad_norm": 1.4179567765607057, "language_loss": 0.87104797, "learning_rate": 2.324173644116504e-06, "loss": 0.8931694, "num_input_tokens_seen": 83150300, "step": 3865, "time_per_iteration": 3.8302388191223145 }, { "auxiliary_loss_clip": 0.01176192, "auxiliary_loss_mlp": 0.01031574, "balance_loss_clip": 1.01444161, "balance_loss_mlp": 1.02324688, "epoch": 0.46485901521072565, "flos": 27160209774720.0, "grad_norm": 1.6964378421711737, "language_loss": 0.81725961, "learning_rate": 2.3234049486313087e-06, "loss": 0.83933723, "num_input_tokens_seen": 83171750, "step": 3866, "time_per_iteration": 3.6259963512420654 }, { "auxiliary_loss_clip": 0.01181276, "auxiliary_loss_mlp": 0.01021519, "balance_loss_clip": 1.01629567, "balance_loss_mlp": 1.01436329, "epoch": 0.46497925810136476, "flos": 24279851088000.0, "grad_norm": 1.7071266815464243, "language_loss": 0.76083744, "learning_rate": 2.322636204085839e-06, "loss": 0.78286535, "num_input_tokens_seen": 83191820, "step": 3867, "time_per_iteration": 2.7170488834381104 }, { "auxiliary_loss_clip": 0.01167757, "auxiliary_loss_mlp": 0.01027546, "balance_loss_clip": 0.97111928, "balance_loss_mlp": 1.01981246, "epoch": 0.46509950099200387, "flos": 16253134272000.0, "grad_norm": 2.092930645881514, "language_loss": 0.78775007, "learning_rate": 2.3218674105967143e-06, "loss": 0.80970305, "num_input_tokens_seen": 83210085, "step": 3868, "time_per_iteration": 2.71307373046875 }, { "auxiliary_loss_clip": 0.01170068, "auxiliary_loss_mlp": 0.01029498, "balance_loss_clip": 0.97289622, "balance_loss_mlp": 1.02210057, "epoch": 0.4652197438826429, "flos": 23442270773760.0, "grad_norm": 1.5266628215455897, "language_loss": 0.83469534, "learning_rate": 2.3210985682805593e-06, "loss": 0.856691, "num_input_tokens_seen": 83231865, "step": 3869, "time_per_iteration": 2.7430338859558105 }, { "auxiliary_loss_clip": 0.01184002, "auxiliary_loss_mlp": 0.01027673, "balance_loss_clip": 1.05453515, "balance_loss_mlp": 1.01964974, "epoch": 0.46533998677328203, "flos": 16216397637120.0, "grad_norm": 3.4101130198121528, "language_loss": 0.67654562, "learning_rate": 2.320329677254007e-06, "loss": 0.6986624, "num_input_tokens_seen": 83249195, "step": 3870, "time_per_iteration": 2.57377552986145 }, { "auxiliary_loss_clip": 0.01181438, "auxiliary_loss_mlp": 0.01028077, "balance_loss_clip": 1.05253458, "balance_loss_mlp": 1.02010775, "epoch": 0.46546022966392114, "flos": 21141869080320.0, "grad_norm": 5.2528813312950575, "language_loss": 0.72397166, "learning_rate": 2.319560737633697e-06, "loss": 0.74606681, "num_input_tokens_seen": 83267915, "step": 3871, "time_per_iteration": 3.518630266189575 }, { "auxiliary_loss_clip": 0.01183592, "auxiliary_loss_mlp": 0.01027382, "balance_loss_clip": 0.9358784, "balance_loss_mlp": 1.01931739, "epoch": 0.4655804725545602, "flos": 41171942442240.0, "grad_norm": 1.5235642596089882, "language_loss": 0.68074322, "learning_rate": 2.3187917495362775e-06, "loss": 0.70285296, "num_input_tokens_seen": 83292325, "step": 3872, "time_per_iteration": 2.9102931022644043 }, { "auxiliary_loss_clip": 0.01172048, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 0.89670515, "balance_loss_mlp": 1.01784337, "epoch": 0.4657007154451993, "flos": 19570956698880.0, "grad_norm": 2.7033393566748707, "language_loss": 0.76895791, "learning_rate": 2.318022713078403e-06, "loss": 0.79093635, "num_input_tokens_seen": 83306905, "step": 3873, "time_per_iteration": 2.8125078678131104 }, { "auxiliary_loss_clip": 0.01179091, "auxiliary_loss_mlp": 0.01024007, "balance_loss_clip": 0.97507083, "balance_loss_mlp": 1.01612687, "epoch": 0.4658209583358384, "flos": 15517826956800.0, "grad_norm": 3.05690023807575, "language_loss": 0.85200089, "learning_rate": 2.3172536283767354e-06, "loss": 0.8740319, "num_input_tokens_seen": 83320665, "step": 3874, "time_per_iteration": 3.5374696254730225 }, { "auxiliary_loss_clip": 0.01172419, "auxiliary_loss_mlp": 0.01034101, "balance_loss_clip": 0.93813765, "balance_loss_mlp": 1.02575064, "epoch": 0.4659412012264775, "flos": 14903180403840.0, "grad_norm": 1.9855345328704697, "language_loss": 0.80818081, "learning_rate": 2.3164844955479447e-06, "loss": 0.83024597, "num_input_tokens_seen": 83336475, "step": 3875, "time_per_iteration": 2.707895278930664 }, { "auxiliary_loss_clip": 0.0118442, "auxiliary_loss_mlp": 0.01029659, "balance_loss_clip": 0.89904362, "balance_loss_mlp": 1.02141011, "epoch": 0.4660614441171166, "flos": 24425612478720.0, "grad_norm": 1.634839588448174, "language_loss": 0.70763707, "learning_rate": 2.3157153147087082e-06, "loss": 0.72977787, "num_input_tokens_seen": 83358365, "step": 3876, "time_per_iteration": 2.763254404067993 }, { "auxiliary_loss_clip": 0.01185747, "auxiliary_loss_mlp": 0.01027218, "balance_loss_clip": 0.90239143, "balance_loss_mlp": 1.0194664, "epoch": 0.46618168700775564, "flos": 22091095843200.0, "grad_norm": 1.7280738588484823, "language_loss": 0.82647538, "learning_rate": 2.314946085975709e-06, "loss": 0.84860492, "num_input_tokens_seen": 83377345, "step": 3877, "time_per_iteration": 2.7682788372039795 }, { "auxiliary_loss_clip": 0.01164189, "auxiliary_loss_mlp": 0.0103013, "balance_loss_clip": 0.93572003, "balance_loss_mlp": 1.02255726, "epoch": 0.46630192989839475, "flos": 26176975810560.0, "grad_norm": 2.645491829639233, "language_loss": 0.82470715, "learning_rate": 2.3141768094656393e-06, "loss": 0.84665036, "num_input_tokens_seen": 83395920, "step": 3878, "time_per_iteration": 2.7120211124420166 }, { "auxiliary_loss_clip": 0.01179009, "auxiliary_loss_mlp": 0.01029518, "balance_loss_clip": 0.81815588, "balance_loss_mlp": 1.02199602, "epoch": 0.46642217278903386, "flos": 11509622150400.0, "grad_norm": 2.3229659988369473, "language_loss": 0.83398157, "learning_rate": 2.3134074852951966e-06, "loss": 0.85606682, "num_input_tokens_seen": 83412510, "step": 3879, "time_per_iteration": 3.026613473892212 }, { "auxiliary_loss_clip": 0.01166353, "auxiliary_loss_mlp": 0.01027716, "balance_loss_clip": 0.89475721, "balance_loss_mlp": 1.01991928, "epoch": 0.4665424156796729, "flos": 32306819299200.0, "grad_norm": 1.6123315591103666, "language_loss": 0.77631915, "learning_rate": 2.312638113581088e-06, "loss": 0.79825985, "num_input_tokens_seen": 83432995, "step": 3880, "time_per_iteration": 3.0080325603485107 }, { "auxiliary_loss_clip": 0.01176581, "auxiliary_loss_mlp": 0.01028581, "balance_loss_clip": 1.01088023, "balance_loss_mlp": 1.02000427, "epoch": 0.46666265857031203, "flos": 18436179254400.0, "grad_norm": 2.428444235513101, "language_loss": 0.78428912, "learning_rate": 2.311868694440027e-06, "loss": 0.80634069, "num_input_tokens_seen": 83447415, "step": 3881, "time_per_iteration": 2.643538475036621 }, { "auxiliary_loss_clip": 0.01080021, "auxiliary_loss_mlp": 0.01003499, "balance_loss_clip": 1.01912808, "balance_loss_mlp": 1.00155592, "epoch": 0.46678290146095114, "flos": 68438989221120.0, "grad_norm": 0.771385286715895, "language_loss": 0.62519717, "learning_rate": 2.3110992279887323e-06, "loss": 0.64603233, "num_input_tokens_seen": 83519340, "step": 3882, "time_per_iteration": 3.3185129165649414 }, { "auxiliary_loss_clip": 0.01183299, "auxiliary_loss_mlp": 0.01030194, "balance_loss_clip": 0.93861496, "balance_loss_mlp": 1.02199888, "epoch": 0.4669031443515902, "flos": 17712507945600.0, "grad_norm": 2.625981328626022, "language_loss": 0.84483886, "learning_rate": 2.310329714343932e-06, "loss": 0.86697376, "num_input_tokens_seen": 83535490, "step": 3883, "time_per_iteration": 2.724113702774048 }, { "auxiliary_loss_clip": 0.01173034, "auxiliary_loss_mlp": 0.010292, "balance_loss_clip": 0.97436631, "balance_loss_mlp": 1.02087927, "epoch": 0.4670233872422293, "flos": 23947748916480.0, "grad_norm": 1.8887400787345745, "language_loss": 0.81854576, "learning_rate": 2.309560153622361e-06, "loss": 0.84056813, "num_input_tokens_seen": 83552400, "step": 3884, "time_per_iteration": 2.7610223293304443 }, { "auxiliary_loss_clip": 0.01176414, "auxiliary_loss_mlp": 0.01028999, "balance_loss_clip": 0.93741524, "balance_loss_mlp": 1.02054095, "epoch": 0.4671436301328684, "flos": 28111268131200.0, "grad_norm": 1.963045125444464, "language_loss": 0.74219, "learning_rate": 2.3087905459407602e-06, "loss": 0.7642442, "num_input_tokens_seen": 83571340, "step": 3885, "time_per_iteration": 2.761396884918213 }, { "auxiliary_loss_clip": 0.01084652, "auxiliary_loss_mlp": 0.01002734, "balance_loss_clip": 0.981924, "balance_loss_mlp": 1.00071895, "epoch": 0.46726387302350747, "flos": 69369684566400.0, "grad_norm": 0.814259033285699, "language_loss": 0.62914717, "learning_rate": 2.3080208914158795e-06, "loss": 0.65002102, "num_input_tokens_seen": 83634340, "step": 3886, "time_per_iteration": 3.23954439163208 }, { "auxiliary_loss_clip": 0.01179392, "auxiliary_loss_mlp": 0.01033082, "balance_loss_clip": 0.97793651, "balance_loss_mlp": 1.0252471, "epoch": 0.4673841159141466, "flos": 25519666878720.0, "grad_norm": 2.1129719867907237, "language_loss": 0.71841812, "learning_rate": 2.3072511901644753e-06, "loss": 0.74054289, "num_input_tokens_seen": 83653410, "step": 3887, "time_per_iteration": 2.7471721172332764 }, { "auxiliary_loss_clip": 0.01180734, "auxiliary_loss_mlp": 0.01025514, "balance_loss_clip": 1.05368423, "balance_loss_mlp": 1.01863837, "epoch": 0.4675043588047857, "flos": 24499265316480.0, "grad_norm": 1.9348189428637648, "language_loss": 0.80670774, "learning_rate": 2.306481442303309e-06, "loss": 0.82877022, "num_input_tokens_seen": 83672985, "step": 3888, "time_per_iteration": 2.6620969772338867 }, { "auxiliary_loss_clip": 0.01180401, "auxiliary_loss_mlp": 0.01028986, "balance_loss_clip": 1.01281738, "balance_loss_mlp": 1.02130318, "epoch": 0.46762460169542475, "flos": 20960771685120.0, "grad_norm": 1.79765597249183, "language_loss": 0.7324068, "learning_rate": 2.3057116479491515e-06, "loss": 0.75450069, "num_input_tokens_seen": 83692395, "step": 3889, "time_per_iteration": 2.681292772293091 }, { "auxiliary_loss_clip": 0.01174206, "auxiliary_loss_mlp": 0.01033084, "balance_loss_clip": 1.01055384, "balance_loss_mlp": 1.02560937, "epoch": 0.46774484458606386, "flos": 19171666137600.0, "grad_norm": 1.9924639994011841, "language_loss": 0.76079977, "learning_rate": 2.30494180721878e-06, "loss": 0.78287268, "num_input_tokens_seen": 83709735, "step": 3890, "time_per_iteration": 2.6154708862304688 }, { "auxiliary_loss_clip": 0.01174908, "auxiliary_loss_mlp": 0.01025009, "balance_loss_clip": 1.01126766, "balance_loss_mlp": 1.01776671, "epoch": 0.4678650874767029, "flos": 17967689141760.0, "grad_norm": 1.8370350408690994, "language_loss": 0.89448392, "learning_rate": 2.3041719202289794e-06, "loss": 0.9164831, "num_input_tokens_seen": 83725910, "step": 3891, "time_per_iteration": 3.601923942565918 }, { "auxiliary_loss_clip": 0.0118258, "auxiliary_loss_mlp": 0.01024672, "balance_loss_clip": 1.01598489, "balance_loss_mlp": 1.01743555, "epoch": 0.467985330367342, "flos": 21360816432000.0, "grad_norm": 1.6012605759970084, "language_loss": 0.80153668, "learning_rate": 2.30340198709654e-06, "loss": 0.82360917, "num_input_tokens_seen": 83745745, "step": 3892, "time_per_iteration": 2.6866469383239746 }, { "auxiliary_loss_clip": 0.0118103, "auxiliary_loss_mlp": 0.01030822, "balance_loss_clip": 0.9738512, "balance_loss_mlp": 1.02328777, "epoch": 0.46810557325798113, "flos": 20521835487360.0, "grad_norm": 2.623813919362318, "language_loss": 0.7425971, "learning_rate": 2.3026320079382605e-06, "loss": 0.76471567, "num_input_tokens_seen": 83762680, "step": 3893, "time_per_iteration": 3.6192190647125244 }, { "auxiliary_loss_clip": 0.01178476, "auxiliary_loss_mlp": 0.0102758, "balance_loss_clip": 1.05216217, "balance_loss_mlp": 1.01984358, "epoch": 0.4682258161486202, "flos": 30117848572800.0, "grad_norm": 1.6721595430171536, "language_loss": 0.76478821, "learning_rate": 2.3018619828709454e-06, "loss": 0.78684878, "num_input_tokens_seen": 83784220, "step": 3894, "time_per_iteration": 2.755356550216675 }, { "auxiliary_loss_clip": 0.01177901, "auxiliary_loss_mlp": 0.01123434, "balance_loss_clip": 1.01581383, "balance_loss_mlp": 0.0, "epoch": 0.4683460590392593, "flos": 25293357239040.0, "grad_norm": 1.977362836268463, "language_loss": 0.8189851, "learning_rate": 2.3010919120114084e-06, "loss": 0.84199846, "num_input_tokens_seen": 83800750, "step": 3895, "time_per_iteration": 2.732628107070923 }, { "auxiliary_loss_clip": 0.01173409, "auxiliary_loss_mlp": 0.01033474, "balance_loss_clip": 1.00949621, "balance_loss_mlp": 1.02536178, "epoch": 0.4684663019298984, "flos": 15368330551680.0, "grad_norm": 2.104302853838631, "language_loss": 0.6585114, "learning_rate": 2.3003217954764672e-06, "loss": 0.68058026, "num_input_tokens_seen": 83815455, "step": 3896, "time_per_iteration": 2.6326348781585693 }, { "auxiliary_loss_clip": 0.01179298, "auxiliary_loss_mlp": 0.0102785, "balance_loss_clip": 1.01121867, "balance_loss_mlp": 1.02005649, "epoch": 0.46858654482053747, "flos": 27778842737280.0, "grad_norm": 2.0416618418645833, "language_loss": 0.7940591, "learning_rate": 2.299551633382949e-06, "loss": 0.81613058, "num_input_tokens_seen": 83835765, "step": 3897, "time_per_iteration": 3.6234633922576904 }, { "auxiliary_loss_clip": 0.01168149, "auxiliary_loss_mlp": 0.01029293, "balance_loss_clip": 0.97250295, "balance_loss_mlp": 1.02144909, "epoch": 0.4687067877111766, "flos": 18040623707520.0, "grad_norm": 1.8114792469600445, "language_loss": 0.85836446, "learning_rate": 2.2987814258476854e-06, "loss": 0.88033885, "num_input_tokens_seen": 83853565, "step": 3898, "time_per_iteration": 2.6263558864593506 }, { "auxiliary_loss_clip": 0.01175759, "auxiliary_loss_mlp": 0.01027465, "balance_loss_clip": 0.89407915, "balance_loss_mlp": 1.01943326, "epoch": 0.4688270306018157, "flos": 16977380198400.0, "grad_norm": 3.349273103305113, "language_loss": 0.67924607, "learning_rate": 2.2980111729875177e-06, "loss": 0.70127827, "num_input_tokens_seen": 83869815, "step": 3899, "time_per_iteration": 2.73378849029541 }, { "auxiliary_loss_clip": 0.0117448, "auxiliary_loss_mlp": 0.01031975, "balance_loss_clip": 0.97673541, "balance_loss_mlp": 1.02510285, "epoch": 0.46894727349245474, "flos": 17821640442240.0, "grad_norm": 1.7509166245772794, "language_loss": 0.82141221, "learning_rate": 2.2972408749192917e-06, "loss": 0.84347671, "num_input_tokens_seen": 83887545, "step": 3900, "time_per_iteration": 3.525686502456665 }, { "auxiliary_loss_clip": 0.01176091, "auxiliary_loss_mlp": 0.01122935, "balance_loss_clip": 1.01410842, "balance_loss_mlp": 0.0, "epoch": 0.46906751638309385, "flos": 21471349559040.0, "grad_norm": 1.8978204377093464, "language_loss": 0.67086655, "learning_rate": 2.296470531759861e-06, "loss": 0.69385684, "num_input_tokens_seen": 83905645, "step": 3901, "time_per_iteration": 2.636038064956665 }, { "auxiliary_loss_clip": 0.01169864, "auxiliary_loss_mlp": 0.01029451, "balance_loss_clip": 0.93456835, "balance_loss_mlp": 1.02153516, "epoch": 0.46918775927373296, "flos": 20337829090560.0, "grad_norm": 1.9965396493051664, "language_loss": 0.7936511, "learning_rate": 2.2957001436260866e-06, "loss": 0.81564426, "num_input_tokens_seen": 83922705, "step": 3902, "time_per_iteration": 2.6387083530426025 }, { "auxiliary_loss_clip": 0.01173334, "auxiliary_loss_mlp": 0.0102941, "balance_loss_clip": 0.97460777, "balance_loss_mlp": 1.02092862, "epoch": 0.469308002164372, "flos": 18403249461120.0, "grad_norm": 1.8386522465780135, "language_loss": 0.72925639, "learning_rate": 2.294929710634836e-06, "loss": 0.75128382, "num_input_tokens_seen": 83940795, "step": 3903, "time_per_iteration": 2.6676864624023438 }, { "auxiliary_loss_clip": 0.01174323, "auxiliary_loss_mlp": 0.0102316, "balance_loss_clip": 1.00951171, "balance_loss_mlp": 1.0158819, "epoch": 0.46942824505501113, "flos": 37962067363200.0, "grad_norm": 1.9309957082488196, "language_loss": 0.61393774, "learning_rate": 2.2941592329029823e-06, "loss": 0.63591254, "num_input_tokens_seen": 83961900, "step": 3904, "time_per_iteration": 2.778479814529419 }, { "auxiliary_loss_clip": 0.01173362, "auxiliary_loss_mlp": 0.01030919, "balance_loss_clip": 1.01381052, "balance_loss_mlp": 1.02340257, "epoch": 0.46954848794565024, "flos": 21872507627520.0, "grad_norm": 1.7989216225019589, "language_loss": 0.78841925, "learning_rate": 2.2933887105474067e-06, "loss": 0.810462, "num_input_tokens_seen": 83980075, "step": 3905, "time_per_iteration": 2.663454532623291 }, { "auxiliary_loss_clip": 0.01173835, "auxiliary_loss_mlp": 0.01029077, "balance_loss_clip": 1.01421821, "balance_loss_mlp": 1.02181995, "epoch": 0.4696687308362893, "flos": 22016545165440.0, "grad_norm": 1.6447452567881582, "language_loss": 0.81297576, "learning_rate": 2.2926181436849974e-06, "loss": 0.83500487, "num_input_tokens_seen": 83999430, "step": 3906, "time_per_iteration": 2.699350595474243 }, { "auxiliary_loss_clip": 0.01179904, "auxiliary_loss_mlp": 0.01028717, "balance_loss_clip": 1.0154109, "balance_loss_mlp": 1.02112579, "epoch": 0.4697889737269284, "flos": 21613663244160.0, "grad_norm": 1.5536617547583322, "language_loss": 0.72573066, "learning_rate": 2.2918475324326478e-06, "loss": 0.74781686, "num_input_tokens_seen": 84019150, "step": 3907, "time_per_iteration": 2.6544077396392822 }, { "auxiliary_loss_clip": 0.0118337, "auxiliary_loss_mlp": 0.01123649, "balance_loss_clip": 1.01613283, "balance_loss_mlp": 0.0, "epoch": 0.46990921661756746, "flos": 25228323665280.0, "grad_norm": 1.9336113757379765, "language_loss": 0.91217017, "learning_rate": 2.2910768769072603e-06, "loss": 0.93524033, "num_input_tokens_seen": 84037930, "step": 3908, "time_per_iteration": 2.6972672939300537 }, { "auxiliary_loss_clip": 0.01170425, "auxiliary_loss_mlp": 0.01030452, "balance_loss_clip": 1.01189065, "balance_loss_mlp": 1.02292991, "epoch": 0.47002945950820657, "flos": 13844031045120.0, "grad_norm": 1.85566492689334, "language_loss": 0.76011354, "learning_rate": 2.2903061772257417e-06, "loss": 0.78212237, "num_input_tokens_seen": 84055915, "step": 3909, "time_per_iteration": 2.6926541328430176 }, { "auxiliary_loss_clip": 0.01177091, "auxiliary_loss_mlp": 0.01029435, "balance_loss_clip": 1.01385999, "balance_loss_mlp": 1.02162671, "epoch": 0.4701497023988457, "flos": 26247001374720.0, "grad_norm": 1.4483816368283435, "language_loss": 0.78491867, "learning_rate": 2.289535433505007e-06, "loss": 0.80698383, "num_input_tokens_seen": 84077270, "step": 3910, "time_per_iteration": 2.679877519607544 }, { "auxiliary_loss_clip": 0.0117866, "auxiliary_loss_mlp": 0.01025085, "balance_loss_clip": 0.9735986, "balance_loss_mlp": 1.01814151, "epoch": 0.47026994528948474, "flos": 25629517647360.0, "grad_norm": 1.8471937957093116, "language_loss": 0.63736141, "learning_rate": 2.2887646458619767e-06, "loss": 0.65939885, "num_input_tokens_seen": 84098635, "step": 3911, "time_per_iteration": 2.7589542865753174 }, { "auxiliary_loss_clip": 0.01180021, "auxiliary_loss_mlp": 0.01033823, "balance_loss_clip": 0.93713498, "balance_loss_mlp": 1.02541327, "epoch": 0.47039018818012385, "flos": 20554406144640.0, "grad_norm": 1.8846299051227082, "language_loss": 0.76873159, "learning_rate": 2.2879938144135797e-06, "loss": 0.79087007, "num_input_tokens_seen": 84114740, "step": 3912, "time_per_iteration": 2.7265403270721436 }, { "auxiliary_loss_clip": 0.01173019, "auxiliary_loss_mlp": 0.01123239, "balance_loss_clip": 0.93507624, "balance_loss_mlp": 0.0, "epoch": 0.47051043107076296, "flos": 21577249831680.0, "grad_norm": 1.534455471003871, "language_loss": 0.75304163, "learning_rate": 2.2872229392767496e-06, "loss": 0.7760042, "num_input_tokens_seen": 84134845, "step": 3913, "time_per_iteration": 2.739321708679199 }, { "auxiliary_loss_clip": 0.01183399, "auxiliary_loss_mlp": 0.01032331, "balance_loss_clip": 1.01522005, "balance_loss_mlp": 1.024737, "epoch": 0.470630673961402, "flos": 18953185662720.0, "grad_norm": 1.4560709931654807, "language_loss": 0.74774039, "learning_rate": 2.286452020568428e-06, "loss": 0.76989764, "num_input_tokens_seen": 84152920, "step": 3914, "time_per_iteration": 2.6811909675598145 }, { "auxiliary_loss_clip": 0.01184181, "auxiliary_loss_mlp": 0.01028283, "balance_loss_clip": 1.05314219, "balance_loss_mlp": 1.02024221, "epoch": 0.4707509168520411, "flos": 19938969492480.0, "grad_norm": 1.643555635614546, "language_loss": 0.72730941, "learning_rate": 2.2856810584055637e-06, "loss": 0.74943405, "num_input_tokens_seen": 84170455, "step": 3915, "time_per_iteration": 2.5957493782043457 }, { "auxiliary_loss_clip": 0.01178312, "auxiliary_loss_mlp": 0.01027561, "balance_loss_clip": 1.01247334, "balance_loss_mlp": 1.01917481, "epoch": 0.47087115974268023, "flos": 40118754741120.0, "grad_norm": 1.44264830705297, "language_loss": 0.67709732, "learning_rate": 2.2849100529051085e-06, "loss": 0.69915599, "num_input_tokens_seen": 84197390, "step": 3916, "time_per_iteration": 2.858718156814575 }, { "auxiliary_loss_clip": 0.01179842, "auxiliary_loss_mlp": 0.01023653, "balance_loss_clip": 1.05314279, "balance_loss_mlp": 1.01626837, "epoch": 0.4709914026333193, "flos": 13552723745280.0, "grad_norm": 2.4040279755884395, "language_loss": 0.7984345, "learning_rate": 2.284139004184026e-06, "loss": 0.8204695, "num_input_tokens_seen": 84214620, "step": 3917, "time_per_iteration": 3.533043146133423 }, { "auxiliary_loss_clip": 0.01181608, "auxiliary_loss_mlp": 0.01036479, "balance_loss_clip": 1.05275369, "balance_loss_mlp": 1.02847743, "epoch": 0.4711116455239584, "flos": 19974628719360.0, "grad_norm": 1.8901311153155127, "language_loss": 0.74194312, "learning_rate": 2.2833679123592814e-06, "loss": 0.76412392, "num_input_tokens_seen": 84231880, "step": 3918, "time_per_iteration": 3.6352312564849854 }, { "auxiliary_loss_clip": 0.01176706, "auxiliary_loss_mlp": 0.01026199, "balance_loss_clip": 0.97600454, "balance_loss_mlp": 1.01873064, "epoch": 0.4712318884145975, "flos": 32124824064000.0, "grad_norm": 1.612416944800895, "language_loss": 0.63433367, "learning_rate": 2.2825967775478508e-06, "loss": 0.65636271, "num_input_tokens_seen": 84252980, "step": 3919, "time_per_iteration": 2.7769525051116943 }, { "auxiliary_loss_clip": 0.0117975, "auxiliary_loss_mlp": 0.01024046, "balance_loss_clip": 1.05262589, "balance_loss_mlp": 1.01617265, "epoch": 0.47135213130523657, "flos": 20047850593920.0, "grad_norm": 2.0637409973088148, "language_loss": 0.83121943, "learning_rate": 2.2818255998667135e-06, "loss": 0.85325742, "num_input_tokens_seen": 84271490, "step": 3920, "time_per_iteration": 2.5894248485565186 }, { "auxiliary_loss_clip": 0.0117857, "auxiliary_loss_mlp": 0.01028352, "balance_loss_clip": 1.01517141, "balance_loss_mlp": 1.0208776, "epoch": 0.4714723741958757, "flos": 19426990988160.0, "grad_norm": 1.5269116758956032, "language_loss": 0.78631234, "learning_rate": 2.2810543794328566e-06, "loss": 0.80838156, "num_input_tokens_seen": 84290525, "step": 3921, "time_per_iteration": 2.6445746421813965 }, { "auxiliary_loss_clip": 0.01183132, "auxiliary_loss_mlp": 0.01025348, "balance_loss_clip": 1.01485276, "balance_loss_mlp": 1.01784039, "epoch": 0.4715926170865148, "flos": 20373883367040.0, "grad_norm": 1.6575589868465423, "language_loss": 0.82083011, "learning_rate": 2.2802831163632735e-06, "loss": 0.84291488, "num_input_tokens_seen": 84309245, "step": 3922, "time_per_iteration": 2.720424175262451 }, { "auxiliary_loss_clip": 0.01175413, "auxiliary_loss_mlp": 0.01031005, "balance_loss_clip": 0.85972106, "balance_loss_mlp": 1.02223134, "epoch": 0.47171285997715384, "flos": 22672884430080.0, "grad_norm": 2.035475552656223, "language_loss": 0.74224776, "learning_rate": 2.279511810774965e-06, "loss": 0.76431191, "num_input_tokens_seen": 84330775, "step": 3923, "time_per_iteration": 2.82926869392395 }, { "auxiliary_loss_clip": 0.01179536, "auxiliary_loss_mlp": 0.01026358, "balance_loss_clip": 1.05099726, "balance_loss_mlp": 1.01868689, "epoch": 0.47183310286779295, "flos": 21105419754240.0, "grad_norm": 1.940931909905118, "language_loss": 0.71555674, "learning_rate": 2.2787404627849364e-06, "loss": 0.7376157, "num_input_tokens_seen": 84349985, "step": 3924, "time_per_iteration": 3.525672674179077 }, { "auxiliary_loss_clip": 0.01174013, "auxiliary_loss_mlp": 0.01023784, "balance_loss_clip": 0.97324026, "balance_loss_mlp": 1.01591671, "epoch": 0.471953345758432, "flos": 21726566668800.0, "grad_norm": 1.7172367691297787, "language_loss": 0.79030645, "learning_rate": 2.277969072510202e-06, "loss": 0.81228435, "num_input_tokens_seen": 84368965, "step": 3925, "time_per_iteration": 2.6665539741516113 }, { "auxiliary_loss_clip": 0.01177491, "auxiliary_loss_mlp": 0.01025579, "balance_loss_clip": 0.97490633, "balance_loss_mlp": 1.01836085, "epoch": 0.4720735886490711, "flos": 19861078849920.0, "grad_norm": 1.520723755385773, "language_loss": 0.81422281, "learning_rate": 2.2771976400677803e-06, "loss": 0.83625352, "num_input_tokens_seen": 84387795, "step": 3926, "time_per_iteration": 2.7157576084136963 }, { "auxiliary_loss_clip": 0.01160229, "auxiliary_loss_mlp": 0.01024303, "balance_loss_clip": 0.89472032, "balance_loss_mlp": 1.01642609, "epoch": 0.47219383153971023, "flos": 19171809792000.0, "grad_norm": 1.6505001281074074, "language_loss": 0.7886101, "learning_rate": 2.2764261655746965e-06, "loss": 0.81045538, "num_input_tokens_seen": 84405290, "step": 3927, "time_per_iteration": 3.7002596855163574 }, { "auxiliary_loss_clip": 0.01174467, "auxiliary_loss_mlp": 0.01033203, "balance_loss_clip": 0.93767905, "balance_loss_mlp": 1.02518618, "epoch": 0.4723140744303493, "flos": 23224005780480.0, "grad_norm": 1.6086064732699694, "language_loss": 0.75803936, "learning_rate": 2.2756546491479832e-06, "loss": 0.78011608, "num_input_tokens_seen": 84426205, "step": 3928, "time_per_iteration": 2.801769256591797 }, { "auxiliary_loss_clip": 0.01182144, "auxiliary_loss_mlp": 0.0112343, "balance_loss_clip": 1.05134797, "balance_loss_mlp": 0.0, "epoch": 0.4724343173209884, "flos": 18223265387520.0, "grad_norm": 2.0412208981252786, "language_loss": 0.80335838, "learning_rate": 2.274883090904679e-06, "loss": 0.82641417, "num_input_tokens_seen": 84443970, "step": 3929, "time_per_iteration": 2.6210222244262695 }, { "auxiliary_loss_clip": 0.0118292, "auxiliary_loss_mlp": 0.01027274, "balance_loss_clip": 1.0541656, "balance_loss_mlp": 1.0193522, "epoch": 0.4725545602116275, "flos": 21251037490560.0, "grad_norm": 3.6249815121461313, "language_loss": 0.68082917, "learning_rate": 2.2741114909618283e-06, "loss": 0.70293111, "num_input_tokens_seen": 84459865, "step": 3930, "time_per_iteration": 2.5650651454925537 }, { "auxiliary_loss_clip": 0.01177962, "auxiliary_loss_mlp": 0.01027516, "balance_loss_clip": 0.93824345, "balance_loss_mlp": 1.01967752, "epoch": 0.47267480310226656, "flos": 21434002392960.0, "grad_norm": 1.6019845872073553, "language_loss": 0.72181153, "learning_rate": 2.2733398494364828e-06, "loss": 0.74386632, "num_input_tokens_seen": 84479110, "step": 3931, "time_per_iteration": 2.700230360031128 }, { "auxiliary_loss_clip": 0.01178726, "auxiliary_loss_mlp": 0.01031097, "balance_loss_clip": 0.9798007, "balance_loss_mlp": 1.02342618, "epoch": 0.47279504599290567, "flos": 18770508069120.0, "grad_norm": 2.107848890856697, "language_loss": 0.84395373, "learning_rate": 2.272568166445699e-06, "loss": 0.86605197, "num_input_tokens_seen": 84497675, "step": 3932, "time_per_iteration": 2.6494975090026855 }, { "auxiliary_loss_clip": 0.0117987, "auxiliary_loss_mlp": 0.0102756, "balance_loss_clip": 1.01301479, "balance_loss_mlp": 1.01954317, "epoch": 0.4729152888835448, "flos": 21105742976640.0, "grad_norm": 1.8450470312974536, "language_loss": 0.64230347, "learning_rate": 2.271796442106541e-06, "loss": 0.66437769, "num_input_tokens_seen": 84517030, "step": 3933, "time_per_iteration": 2.6700918674468994 }, { "auxiliary_loss_clip": 0.01086002, "auxiliary_loss_mlp": 0.0100038, "balance_loss_clip": 0.90923887, "balance_loss_mlp": 0.9983061, "epoch": 0.47303553177418384, "flos": 70201877840640.0, "grad_norm": 0.8089513541824346, "language_loss": 0.56540263, "learning_rate": 2.271024676536079e-06, "loss": 0.58626646, "num_input_tokens_seen": 84577290, "step": 3934, "time_per_iteration": 3.273853063583374 }, { "auxiliary_loss_clip": 0.01186058, "auxiliary_loss_mlp": 0.01035753, "balance_loss_clip": 0.97981137, "balance_loss_mlp": 1.02698541, "epoch": 0.47315577466482295, "flos": 22455122227200.0, "grad_norm": 1.9805480198256198, "language_loss": 0.73365009, "learning_rate": 2.2702528698513894e-06, "loss": 0.7558682, "num_input_tokens_seen": 84598415, "step": 3935, "time_per_iteration": 2.707585334777832 }, { "auxiliary_loss_clip": 0.01177888, "auxiliary_loss_mlp": 0.01027248, "balance_loss_clip": 0.97458982, "balance_loss_mlp": 1.01960373, "epoch": 0.47327601755546206, "flos": 24352857480960.0, "grad_norm": 1.617074456626834, "language_loss": 0.78637004, "learning_rate": 2.269481022169554e-06, "loss": 0.80842143, "num_input_tokens_seen": 84617010, "step": 3936, "time_per_iteration": 2.7523419857025146 }, { "auxiliary_loss_clip": 0.01186918, "auxiliary_loss_mlp": 0.01029636, "balance_loss_clip": 0.97672123, "balance_loss_mlp": 1.02129722, "epoch": 0.4733962604461011, "flos": 22926772736640.0, "grad_norm": 2.824599554424758, "language_loss": 0.80031216, "learning_rate": 2.2687091336076614e-06, "loss": 0.82247764, "num_input_tokens_seen": 84636350, "step": 3937, "time_per_iteration": 2.751363754272461 }, { "auxiliary_loss_clip": 0.01178052, "auxiliary_loss_mlp": 0.01028657, "balance_loss_clip": 1.01390576, "balance_loss_mlp": 1.02043128, "epoch": 0.4735165033367402, "flos": 18327369980160.0, "grad_norm": 1.7447117437975892, "language_loss": 0.80029643, "learning_rate": 2.267937204282807e-06, "loss": 0.82236356, "num_input_tokens_seen": 84653490, "step": 3938, "time_per_iteration": 2.67191219329834 }, { "auxiliary_loss_clip": 0.01187012, "auxiliary_loss_mlp": 0.01027955, "balance_loss_clip": 1.01628947, "balance_loss_mlp": 1.01931262, "epoch": 0.4736367462273793, "flos": 23037018554880.0, "grad_norm": 2.6015197654744537, "language_loss": 0.78868127, "learning_rate": 2.2671652343120926e-06, "loss": 0.81083089, "num_input_tokens_seen": 84673965, "step": 3939, "time_per_iteration": 2.739126682281494 }, { "auxiliary_loss_clip": 0.01182461, "auxiliary_loss_mlp": 0.0103312, "balance_loss_clip": 1.05574846, "balance_loss_mlp": 1.0249902, "epoch": 0.4737569891180184, "flos": 25374336451200.0, "grad_norm": 1.7058062382925838, "language_loss": 0.80353338, "learning_rate": 2.2663932238126236e-06, "loss": 0.8256892, "num_input_tokens_seen": 84692525, "step": 3940, "time_per_iteration": 2.72830867767334 }, { "auxiliary_loss_clip": 0.01177779, "auxiliary_loss_mlp": 0.0102183, "balance_loss_clip": 1.01285326, "balance_loss_mlp": 1.01415324, "epoch": 0.4738772320086575, "flos": 25849326925440.0, "grad_norm": 1.3405803556231821, "language_loss": 0.80055326, "learning_rate": 2.265621172901515e-06, "loss": 0.82254934, "num_input_tokens_seen": 84715640, "step": 3941, "time_per_iteration": 2.75921630859375 }, { "auxiliary_loss_clip": 0.01186266, "auxiliary_loss_mlp": 0.01035082, "balance_loss_clip": 1.05601215, "balance_loss_mlp": 1.02724051, "epoch": 0.47399747489929656, "flos": 27564420499200.0, "grad_norm": 2.2181030830447375, "language_loss": 0.71184921, "learning_rate": 2.2648490816958854e-06, "loss": 0.73406267, "num_input_tokens_seen": 84736635, "step": 3942, "time_per_iteration": 2.664879083633423 }, { "auxiliary_loss_clip": 0.01179148, "auxiliary_loss_mlp": 0.01025459, "balance_loss_clip": 1.01410675, "balance_loss_mlp": 1.01695359, "epoch": 0.47411771778993567, "flos": 24863650836480.0, "grad_norm": 2.104669264617669, "language_loss": 0.73298478, "learning_rate": 2.264076950312861e-06, "loss": 0.75503087, "num_input_tokens_seen": 84755445, "step": 3943, "time_per_iteration": 3.5666141510009766 }, { "auxiliary_loss_clip": 0.01182795, "auxiliary_loss_mlp": 0.01025736, "balance_loss_clip": 0.97615159, "balance_loss_mlp": 1.01779652, "epoch": 0.4742379606805748, "flos": 22748009725440.0, "grad_norm": 1.9340150829431766, "language_loss": 0.82367969, "learning_rate": 2.2633047788695727e-06, "loss": 0.84576499, "num_input_tokens_seen": 84775750, "step": 3944, "time_per_iteration": 3.6382944583892822 }, { "auxiliary_loss_clip": 0.01180713, "auxiliary_loss_mlp": 0.01025551, "balance_loss_clip": 0.979114, "balance_loss_mlp": 1.01803446, "epoch": 0.47435820357121383, "flos": 19681130689920.0, "grad_norm": 1.7157077288028184, "language_loss": 0.63841796, "learning_rate": 2.262532567483159e-06, "loss": 0.66048062, "num_input_tokens_seen": 84794310, "step": 3945, "time_per_iteration": 2.7329397201538086 }, { "auxiliary_loss_clip": 0.01184238, "auxiliary_loss_mlp": 0.01123051, "balance_loss_clip": 1.05419731, "balance_loss_mlp": 0.0, "epoch": 0.47447844646185294, "flos": 25228718714880.0, "grad_norm": 1.9566870451750777, "language_loss": 0.79908001, "learning_rate": 2.2617603162707635e-06, "loss": 0.82215297, "num_input_tokens_seen": 84814720, "step": 3946, "time_per_iteration": 2.692387580871582 }, { "auxiliary_loss_clip": 0.01181712, "auxiliary_loss_mlp": 0.01027651, "balance_loss_clip": 1.0540005, "balance_loss_mlp": 1.0200007, "epoch": 0.47459868935249205, "flos": 24570619683840.0, "grad_norm": 2.772793531920568, "language_loss": 0.82634419, "learning_rate": 2.2609880253495363e-06, "loss": 0.84843779, "num_input_tokens_seen": 84834355, "step": 3947, "time_per_iteration": 2.6504337787628174 }, { "auxiliary_loss_clip": 0.01189172, "auxiliary_loss_mlp": 0.01027363, "balance_loss_clip": 0.93817371, "balance_loss_mlp": 1.01986468, "epoch": 0.4747189322431311, "flos": 20558500295040.0, "grad_norm": 1.8057579586768553, "language_loss": 0.86091578, "learning_rate": 2.260215694836633e-06, "loss": 0.8830812, "num_input_tokens_seen": 84853530, "step": 3948, "time_per_iteration": 2.7247140407562256 }, { "auxiliary_loss_clip": 0.01175279, "auxiliary_loss_mlp": 0.01123291, "balance_loss_clip": 0.8962661, "balance_loss_mlp": 0.0, "epoch": 0.4748391751337702, "flos": 25995231970560.0, "grad_norm": 1.7103313949492143, "language_loss": 0.65183961, "learning_rate": 2.2594433248492157e-06, "loss": 0.67482531, "num_input_tokens_seen": 84872505, "step": 3949, "time_per_iteration": 2.8262157440185547 }, { "auxiliary_loss_clip": 0.01183663, "auxiliary_loss_mlp": 0.01033628, "balance_loss_clip": 1.01352334, "balance_loss_mlp": 1.02589142, "epoch": 0.47495941802440933, "flos": 22821052032000.0, "grad_norm": 1.6336300995570627, "language_loss": 0.79917753, "learning_rate": 2.2586709155044527e-06, "loss": 0.82135046, "num_input_tokens_seen": 84893105, "step": 3950, "time_per_iteration": 4.222455739974976 }, { "auxiliary_loss_clip": 0.01183808, "auxiliary_loss_mlp": 0.01028442, "balance_loss_clip": 1.0547719, "balance_loss_mlp": 1.02032351, "epoch": 0.4750796609150484, "flos": 27891782075520.0, "grad_norm": 1.5639861707146598, "language_loss": 0.76008672, "learning_rate": 2.2578984669195167e-06, "loss": 0.78220928, "num_input_tokens_seen": 84914070, "step": 3951, "time_per_iteration": 2.7918498516082764 }, { "auxiliary_loss_clip": 0.01175423, "auxiliary_loss_mlp": 0.01027504, "balance_loss_clip": 1.01088238, "balance_loss_mlp": 1.02054787, "epoch": 0.4751999038056875, "flos": 35660085471360.0, "grad_norm": 1.8634826245142948, "language_loss": 0.68056327, "learning_rate": 2.2571259792115887e-06, "loss": 0.70259249, "num_input_tokens_seen": 84935290, "step": 3952, "time_per_iteration": 2.7514243125915527 }, { "auxiliary_loss_clip": 0.01174054, "auxiliary_loss_mlp": 0.01027076, "balance_loss_clip": 1.0134536, "balance_loss_mlp": 1.01938701, "epoch": 0.4753201466963266, "flos": 22090880361600.0, "grad_norm": 1.6621334024143253, "language_loss": 0.79213738, "learning_rate": 2.2563534524978544e-06, "loss": 0.81414866, "num_input_tokens_seen": 84952760, "step": 3953, "time_per_iteration": 3.5696890354156494 }, { "auxiliary_loss_clip": 0.01178379, "auxiliary_loss_mlp": 0.01023343, "balance_loss_clip": 0.94169497, "balance_loss_mlp": 1.01561821, "epoch": 0.47544038958696566, "flos": 30190854965760.0, "grad_norm": 1.5162524952443193, "language_loss": 0.70729095, "learning_rate": 2.2555808868955052e-06, "loss": 0.72930825, "num_input_tokens_seen": 84974890, "step": 3954, "time_per_iteration": 2.8019964694976807 }, { "auxiliary_loss_clip": 0.01179738, "auxiliary_loss_mlp": 0.01030361, "balance_loss_clip": 0.90005791, "balance_loss_mlp": 1.02225471, "epoch": 0.47556063247760477, "flos": 23472219738240.0, "grad_norm": 2.9576822817120982, "language_loss": 0.74090487, "learning_rate": 2.254808282521738e-06, "loss": 0.76300585, "num_input_tokens_seen": 84993640, "step": 3955, "time_per_iteration": 2.79168438911438 }, { "auxiliary_loss_clip": 0.01182814, "auxiliary_loss_mlp": 0.01123158, "balance_loss_clip": 0.93746746, "balance_loss_mlp": 0.0, "epoch": 0.4756808753682438, "flos": 25155209531520.0, "grad_norm": 1.8759965885794865, "language_loss": 0.81357157, "learning_rate": 2.2540356394937573e-06, "loss": 0.8366313, "num_input_tokens_seen": 85012340, "step": 3956, "time_per_iteration": 2.7377727031707764 }, { "auxiliary_loss_clip": 0.01181889, "auxiliary_loss_mlp": 0.010268, "balance_loss_clip": 0.93760145, "balance_loss_mlp": 1.01867568, "epoch": 0.47580111825888294, "flos": 15669729573120.0, "grad_norm": 1.9949998302790641, "language_loss": 0.83807933, "learning_rate": 2.253262957928772e-06, "loss": 0.86016631, "num_input_tokens_seen": 85029225, "step": 3957, "time_per_iteration": 2.7102699279785156 }, { "auxiliary_loss_clip": 0.01170522, "auxiliary_loss_mlp": 0.01025505, "balance_loss_clip": 0.97149736, "balance_loss_mlp": 1.01745212, "epoch": 0.47592136114952205, "flos": 17636556637440.0, "grad_norm": 1.8185153801245773, "language_loss": 0.71959275, "learning_rate": 2.2524902379439976e-06, "loss": 0.74155301, "num_input_tokens_seen": 85047895, "step": 3958, "time_per_iteration": 2.667665719985962 }, { "auxiliary_loss_clip": 0.01099413, "auxiliary_loss_mlp": 0.01003249, "balance_loss_clip": 0.85013974, "balance_loss_mlp": 1.00118625, "epoch": 0.4760416040401611, "flos": 61417159292160.0, "grad_norm": 0.7495430508026236, "language_loss": 0.6374656, "learning_rate": 2.251717479656655e-06, "loss": 0.65849221, "num_input_tokens_seen": 85112690, "step": 3959, "time_per_iteration": 3.416773557662964 }, { "auxiliary_loss_clip": 0.01184482, "auxiliary_loss_mlp": 0.01032202, "balance_loss_clip": 1.05430508, "balance_loss_mlp": 1.02435255, "epoch": 0.4761618469308002, "flos": 18405871153920.0, "grad_norm": 1.8718279799183892, "language_loss": 0.76082563, "learning_rate": 2.2509446831839704e-06, "loss": 0.78299248, "num_input_tokens_seen": 85132130, "step": 3960, "time_per_iteration": 2.642162561416626 }, { "auxiliary_loss_clip": 0.01178949, "auxiliary_loss_mlp": 0.01029591, "balance_loss_clip": 0.97393447, "balance_loss_mlp": 1.02185464, "epoch": 0.4762820898214393, "flos": 18040911016320.0, "grad_norm": 2.337249096009304, "language_loss": 0.81730533, "learning_rate": 2.250171848643177e-06, "loss": 0.83939075, "num_input_tokens_seen": 85149420, "step": 3961, "time_per_iteration": 2.6722962856292725 }, { "auxiliary_loss_clip": 0.01176936, "auxiliary_loss_mlp": 0.01033699, "balance_loss_clip": 0.97658896, "balance_loss_mlp": 1.0262661, "epoch": 0.4764023327120784, "flos": 19318253541120.0, "grad_norm": 1.7114521730332501, "language_loss": 0.85614586, "learning_rate": 2.249398976151513e-06, "loss": 0.87825227, "num_input_tokens_seen": 85166970, "step": 3962, "time_per_iteration": 2.658738851547241 }, { "auxiliary_loss_clip": 0.01179037, "auxiliary_loss_mlp": 0.01027107, "balance_loss_clip": 1.05221808, "balance_loss_mlp": 1.01932847, "epoch": 0.4765225756027175, "flos": 22747255539840.0, "grad_norm": 1.9748417614280838, "language_loss": 0.78385568, "learning_rate": 2.248626065826223e-06, "loss": 0.80591714, "num_input_tokens_seen": 85185175, "step": 3963, "time_per_iteration": 2.6171867847442627 }, { "auxiliary_loss_clip": 0.01080856, "auxiliary_loss_mlp": 0.01000652, "balance_loss_clip": 1.02072859, "balance_loss_mlp": 0.99868548, "epoch": 0.4766428184933566, "flos": 65933392106880.0, "grad_norm": 0.7945541631189439, "language_loss": 0.62641823, "learning_rate": 2.2478531177845564e-06, "loss": 0.64723337, "num_input_tokens_seen": 85246170, "step": 3964, "time_per_iteration": 3.167912483215332 }, { "auxiliary_loss_clip": 0.01185556, "auxiliary_loss_mlp": 0.01032709, "balance_loss_clip": 0.97905451, "balance_loss_mlp": 1.02484739, "epoch": 0.47676306138399566, "flos": 24136495908480.0, "grad_norm": 1.696415590811519, "language_loss": 0.84958357, "learning_rate": 2.247080132143769e-06, "loss": 0.87176621, "num_input_tokens_seen": 85268525, "step": 3965, "time_per_iteration": 2.7269718647003174 }, { "auxiliary_loss_clip": 0.01171392, "auxiliary_loss_mlp": 0.01025827, "balance_loss_clip": 0.93133169, "balance_loss_mlp": 1.01754785, "epoch": 0.47688330427463477, "flos": 12604322995200.0, "grad_norm": 1.9009701192365227, "language_loss": 0.69104612, "learning_rate": 2.246307109021121e-06, "loss": 0.71301824, "num_input_tokens_seen": 85285930, "step": 3966, "time_per_iteration": 2.71795916557312 }, { "auxiliary_loss_clip": 0.01171923, "auxiliary_loss_mlp": 0.01028437, "balance_loss_clip": 0.97360671, "balance_loss_mlp": 1.02130222, "epoch": 0.4770035471652739, "flos": 21390585828480.0, "grad_norm": 1.8801183003646522, "language_loss": 0.82323337, "learning_rate": 2.2455340485338817e-06, "loss": 0.84523696, "num_input_tokens_seen": 85303565, "step": 3967, "time_per_iteration": 2.6356818675994873 }, { "auxiliary_loss_clip": 0.01179983, "auxiliary_loss_mlp": 0.01026679, "balance_loss_clip": 1.01258671, "balance_loss_mlp": 1.01902008, "epoch": 0.47712379005591293, "flos": 25156251025920.0, "grad_norm": 2.062357111743258, "language_loss": 0.67628932, "learning_rate": 2.244760950799322e-06, "loss": 0.69835603, "num_input_tokens_seen": 85321835, "step": 3968, "time_per_iteration": 3.745187997817993 }, { "auxiliary_loss_clip": 0.01161428, "auxiliary_loss_mlp": 0.01026109, "balance_loss_clip": 0.93746346, "balance_loss_mlp": 1.01910532, "epoch": 0.47724403294655204, "flos": 22054323294720.0, "grad_norm": 1.7930922023183729, "language_loss": 0.72551656, "learning_rate": 2.2439878159347203e-06, "loss": 0.74739194, "num_input_tokens_seen": 85341260, "step": 3969, "time_per_iteration": 2.7598681449890137 }, { "auxiliary_loss_clip": 0.01081199, "auxiliary_loss_mlp": 0.01005474, "balance_loss_clip": 1.02110338, "balance_loss_mlp": 1.00350749, "epoch": 0.4773642758371911, "flos": 70229387658240.0, "grad_norm": 0.8461277284827895, "language_loss": 0.55274415, "learning_rate": 2.2432146440573616e-06, "loss": 0.5736109, "num_input_tokens_seen": 85407220, "step": 3970, "time_per_iteration": 4.316458463668823 }, { "auxiliary_loss_clip": 0.01177689, "auxiliary_loss_mlp": 0.0102866, "balance_loss_clip": 0.97579414, "balance_loss_mlp": 1.02047026, "epoch": 0.4774845187278302, "flos": 23548602009600.0, "grad_norm": 2.75859660414755, "language_loss": 0.66309249, "learning_rate": 2.242441435284534e-06, "loss": 0.68515599, "num_input_tokens_seen": 85426095, "step": 3971, "time_per_iteration": 2.6829447746276855 }, { "auxiliary_loss_clip": 0.01179999, "auxiliary_loss_mlp": 0.01032858, "balance_loss_clip": 1.01536798, "balance_loss_mlp": 1.02488542, "epoch": 0.4776047616184693, "flos": 23075371301760.0, "grad_norm": 2.0734740365897126, "language_loss": 0.85426891, "learning_rate": 2.2416681897335337e-06, "loss": 0.87639749, "num_input_tokens_seen": 85444245, "step": 3972, "time_per_iteration": 2.688155174255371 }, { "auxiliary_loss_clip": 0.01175516, "auxiliary_loss_mlp": 0.01031227, "balance_loss_clip": 0.89951462, "balance_loss_mlp": 1.02346075, "epoch": 0.4777250045091084, "flos": 31898119374720.0, "grad_norm": 2.090673568173791, "language_loss": 0.66926968, "learning_rate": 2.240894907521661e-06, "loss": 0.69133711, "num_input_tokens_seen": 85463325, "step": 3973, "time_per_iteration": 2.836580991744995 }, { "auxiliary_loss_clip": 0.01177257, "auxiliary_loss_mlp": 0.01027284, "balance_loss_clip": 0.97475123, "balance_loss_mlp": 1.01939797, "epoch": 0.4778452473997475, "flos": 24278163148800.0, "grad_norm": 1.9761821539613176, "language_loss": 0.6364429, "learning_rate": 2.240121588766223e-06, "loss": 0.65848827, "num_input_tokens_seen": 85483375, "step": 3974, "time_per_iteration": 2.766446828842163 }, { "auxiliary_loss_clip": 0.01169738, "auxiliary_loss_mlp": 0.01034475, "balance_loss_clip": 0.97432017, "balance_loss_mlp": 1.02740288, "epoch": 0.4779654902903866, "flos": 31575031516800.0, "grad_norm": 1.766693225748727, "language_loss": 0.71425879, "learning_rate": 2.239348233584531e-06, "loss": 0.73630095, "num_input_tokens_seen": 85504230, "step": 3975, "time_per_iteration": 3.6623754501342773 }, { "auxiliary_loss_clip": 0.01181733, "auxiliary_loss_mlp": 0.01023871, "balance_loss_clip": 1.0146687, "balance_loss_mlp": 1.01661062, "epoch": 0.47808573318102565, "flos": 19500428344320.0, "grad_norm": 1.7534684789792143, "language_loss": 0.80641788, "learning_rate": 2.2385748420939013e-06, "loss": 0.82847393, "num_input_tokens_seen": 85523425, "step": 3976, "time_per_iteration": 2.7025582790374756 }, { "auxiliary_loss_clip": 0.01178688, "auxiliary_loss_mlp": 0.01024837, "balance_loss_clip": 1.05443573, "balance_loss_mlp": 1.01717162, "epoch": 0.47820597607166476, "flos": 22601135013120.0, "grad_norm": 1.6141070808389437, "language_loss": 0.72103786, "learning_rate": 2.2378014144116583e-06, "loss": 0.74307311, "num_input_tokens_seen": 85542235, "step": 3977, "time_per_iteration": 2.6742091178894043 }, { "auxiliary_loss_clip": 0.01183259, "auxiliary_loss_mlp": 0.01024596, "balance_loss_clip": 1.05374944, "balance_loss_mlp": 1.01707423, "epoch": 0.4783262189623039, "flos": 23003011353600.0, "grad_norm": 3.7545619142271014, "language_loss": 0.79356098, "learning_rate": 2.23702795065513e-06, "loss": 0.8156395, "num_input_tokens_seen": 85561815, "step": 3978, "time_per_iteration": 2.646331548690796 }, { "auxiliary_loss_clip": 0.01082452, "auxiliary_loss_mlp": 0.01002705, "balance_loss_clip": 0.98199815, "balance_loss_mlp": 1.00060654, "epoch": 0.47844646185294293, "flos": 49772801226240.0, "grad_norm": 0.9973391865312368, "language_loss": 0.67553055, "learning_rate": 2.2362544509416493e-06, "loss": 0.69638211, "num_input_tokens_seen": 85613930, "step": 3979, "time_per_iteration": 4.024907112121582 }, { "auxiliary_loss_clip": 0.01169211, "auxiliary_loss_mlp": 0.01024436, "balance_loss_clip": 0.97229671, "balance_loss_mlp": 1.01708031, "epoch": 0.47856670474358204, "flos": 20229558520320.0, "grad_norm": 2.217414866191193, "language_loss": 0.82758129, "learning_rate": 2.2354809153885572e-06, "loss": 0.8495177, "num_input_tokens_seen": 85631000, "step": 3980, "time_per_iteration": 2.627129077911377 }, { "auxiliary_loss_clip": 0.01177238, "auxiliary_loss_mlp": 0.0102641, "balance_loss_clip": 1.01263785, "balance_loss_mlp": 1.01876259, "epoch": 0.47868694763422115, "flos": 20990936131200.0, "grad_norm": 1.8081379191263647, "language_loss": 0.82967263, "learning_rate": 2.234707344113197e-06, "loss": 0.85170913, "num_input_tokens_seen": 85649095, "step": 3981, "time_per_iteration": 2.620471477508545 }, { "auxiliary_loss_clip": 0.0117754, "auxiliary_loss_mlp": 0.01027763, "balance_loss_clip": 1.05287051, "balance_loss_mlp": 1.0204556, "epoch": 0.4788071905248602, "flos": 19026551191680.0, "grad_norm": 1.9395327499697337, "language_loss": 0.77433288, "learning_rate": 2.233933737232919e-06, "loss": 0.796386, "num_input_tokens_seen": 85666875, "step": 3982, "time_per_iteration": 2.6357529163360596 }, { "auxiliary_loss_clip": 0.01165607, "auxiliary_loss_mlp": 0.0112323, "balance_loss_clip": 0.89660603, "balance_loss_mlp": 0.0, "epoch": 0.4789274334154993, "flos": 23002221254400.0, "grad_norm": 2.1611270135219094, "language_loss": 0.78136921, "learning_rate": 2.2331600948650793e-06, "loss": 0.80425763, "num_input_tokens_seen": 85687020, "step": 3983, "time_per_iteration": 2.7675275802612305 }, { "auxiliary_loss_clip": 0.01167901, "auxiliary_loss_mlp": 0.01124087, "balance_loss_clip": 0.93728745, "balance_loss_mlp": 0.0, "epoch": 0.4790476763061384, "flos": 23075586783360.0, "grad_norm": 1.3786218390657268, "language_loss": 0.79799509, "learning_rate": 2.2323864171270386e-06, "loss": 0.82091498, "num_input_tokens_seen": 85708290, "step": 3984, "time_per_iteration": 2.7854630947113037 }, { "auxiliary_loss_clip": 0.01178502, "auxiliary_loss_mlp": 0.01027273, "balance_loss_clip": 0.93507564, "balance_loss_mlp": 1.01879692, "epoch": 0.4791679191967775, "flos": 21179288073600.0, "grad_norm": 1.8150697633336277, "language_loss": 0.72034764, "learning_rate": 2.231612704136164e-06, "loss": 0.74240541, "num_input_tokens_seen": 85728660, "step": 3985, "time_per_iteration": 2.7113404273986816 }, { "auxiliary_loss_clip": 0.01170953, "auxiliary_loss_mlp": 0.01033016, "balance_loss_clip": 1.01091254, "balance_loss_mlp": 1.02512479, "epoch": 0.4792881620874166, "flos": 22301495758080.0, "grad_norm": 3.3626337589589674, "language_loss": 0.74650824, "learning_rate": 2.2308389560098253e-06, "loss": 0.76854789, "num_input_tokens_seen": 85745035, "step": 3986, "time_per_iteration": 2.6774332523345947 }, { "auxiliary_loss_clip": 0.01181988, "auxiliary_loss_mlp": 0.01029154, "balance_loss_clip": 0.93911731, "balance_loss_mlp": 1.02152205, "epoch": 0.47940840497805565, "flos": 17420877423360.0, "grad_norm": 2.1446998255948144, "language_loss": 0.77173758, "learning_rate": 2.2300651728654008e-06, "loss": 0.79384905, "num_input_tokens_seen": 85760295, "step": 3987, "time_per_iteration": 2.6814935207366943 }, { "auxiliary_loss_clip": 0.0107546, "auxiliary_loss_mlp": 0.0111682, "balance_loss_clip": 0.98059368, "balance_loss_mlp": 0.0, "epoch": 0.47952864786869476, "flos": 65358175708800.0, "grad_norm": 0.7626167703485714, "language_loss": 0.60248744, "learning_rate": 2.229291354820272e-06, "loss": 0.62441027, "num_input_tokens_seen": 85821305, "step": 3988, "time_per_iteration": 3.256737232208252 }, { "auxiliary_loss_clip": 0.01174942, "auxiliary_loss_mlp": 0.01027833, "balance_loss_clip": 1.01111031, "balance_loss_mlp": 1.02028155, "epoch": 0.47964889075933387, "flos": 16799802336000.0, "grad_norm": 1.9532909016461715, "language_loss": 0.76057088, "learning_rate": 2.228517501991828e-06, "loss": 0.78259867, "num_input_tokens_seen": 85840105, "step": 3989, "time_per_iteration": 2.6199166774749756 }, { "auxiliary_loss_clip": 0.01078915, "auxiliary_loss_mlp": 0.01000991, "balance_loss_clip": 0.94377756, "balance_loss_mlp": 0.99890518, "epoch": 0.4797691336499729, "flos": 70079244808320.0, "grad_norm": 0.8278212904239187, "language_loss": 0.61109257, "learning_rate": 2.22774361449746e-06, "loss": 0.63189167, "num_input_tokens_seen": 85896585, "step": 3990, "time_per_iteration": 3.261406660079956 }, { "auxiliary_loss_clip": 0.01176033, "auxiliary_loss_mlp": 0.01024158, "balance_loss_clip": 0.86207432, "balance_loss_mlp": 1.01656461, "epoch": 0.47988937654061203, "flos": 18953329317120.0, "grad_norm": 2.4759798310822956, "language_loss": 0.70261288, "learning_rate": 2.2269696924545668e-06, "loss": 0.72461474, "num_input_tokens_seen": 85914415, "step": 3991, "time_per_iteration": 2.806281566619873 }, { "auxiliary_loss_clip": 0.01176205, "auxiliary_loss_mlp": 0.01028703, "balance_loss_clip": 0.93906009, "balance_loss_mlp": 1.02116275, "epoch": 0.48000961943125114, "flos": 14461981649280.0, "grad_norm": 2.0585959840140475, "language_loss": 0.77484155, "learning_rate": 2.2261957359805523e-06, "loss": 0.79689068, "num_input_tokens_seen": 85931650, "step": 3992, "time_per_iteration": 2.7230756282806396 }, { "auxiliary_loss_clip": 0.01180615, "auxiliary_loss_mlp": 0.01025625, "balance_loss_clip": 1.05273175, "balance_loss_mlp": 1.01778102, "epoch": 0.4801298623218902, "flos": 27051149105280.0, "grad_norm": 1.8325477288491352, "language_loss": 0.73516572, "learning_rate": 2.225421745192823e-06, "loss": 0.75722808, "num_input_tokens_seen": 85951805, "step": 3993, "time_per_iteration": 2.6593899726867676 }, { "auxiliary_loss_clip": 0.01179002, "auxiliary_loss_mlp": 0.01031556, "balance_loss_clip": 1.01664996, "balance_loss_mlp": 1.02426934, "epoch": 0.4802501052125293, "flos": 26355236031360.0, "grad_norm": 2.288061185572494, "language_loss": 0.78168964, "learning_rate": 2.2246477202087955e-06, "loss": 0.80379528, "num_input_tokens_seen": 85972485, "step": 3994, "time_per_iteration": 3.6701674461364746 }, { "auxiliary_loss_clip": 0.01178996, "auxiliary_loss_mlp": 0.01029431, "balance_loss_clip": 0.97427016, "balance_loss_mlp": 1.02214777, "epoch": 0.4803703481031684, "flos": 20993916960000.0, "grad_norm": 10.561255078027752, "language_loss": 0.82815325, "learning_rate": 2.223873661145887e-06, "loss": 0.85023761, "num_input_tokens_seen": 85992540, "step": 3995, "time_per_iteration": 2.717434883117676 }, { "auxiliary_loss_clip": 0.01178157, "auxiliary_loss_mlp": 0.01122875, "balance_loss_clip": 0.97942179, "balance_loss_mlp": 0.0, "epoch": 0.4804905909938075, "flos": 20703722981760.0, "grad_norm": 1.608437983281736, "language_loss": 0.71001679, "learning_rate": 2.2230995681215226e-06, "loss": 0.7330271, "num_input_tokens_seen": 86012065, "step": 3996, "time_per_iteration": 3.6881227493286133 }, { "auxiliary_loss_clip": 0.01171415, "auxiliary_loss_mlp": 0.01024229, "balance_loss_clip": 0.93597639, "balance_loss_mlp": 1.01636124, "epoch": 0.4806108338844466, "flos": 16654831044480.0, "grad_norm": 1.866714850150994, "language_loss": 0.78062475, "learning_rate": 2.2223254412531305e-06, "loss": 0.80258119, "num_input_tokens_seen": 86029435, "step": 3997, "time_per_iteration": 2.6905627250671387 }, { "auxiliary_loss_clip": 0.01159969, "auxiliary_loss_mlp": 0.0102586, "balance_loss_clip": 0.96977568, "balance_loss_mlp": 1.01895237, "epoch": 0.4807310767750857, "flos": 20011329440640.0, "grad_norm": 2.055680500562341, "language_loss": 0.82856715, "learning_rate": 2.221551280658146e-06, "loss": 0.85042548, "num_input_tokens_seen": 86048495, "step": 3998, "time_per_iteration": 2.6393752098083496 }, { "auxiliary_loss_clip": 0.01169978, "auxiliary_loss_mlp": 0.01026211, "balance_loss_clip": 0.89830971, "balance_loss_mlp": 1.01872444, "epoch": 0.48085131966572475, "flos": 23185257984000.0, "grad_norm": 1.695819323071918, "language_loss": 0.74132991, "learning_rate": 2.2207770864540085e-06, "loss": 0.76329184, "num_input_tokens_seen": 86067470, "step": 3999, "time_per_iteration": 2.7243921756744385 }, { "auxiliary_loss_clip": 0.01170449, "auxiliary_loss_mlp": 0.01026041, "balance_loss_clip": 0.97507203, "balance_loss_mlp": 1.0182451, "epoch": 0.48097156255636386, "flos": 20558643949440.0, "grad_norm": 1.9591459172573358, "language_loss": 0.72664642, "learning_rate": 2.220002858758162e-06, "loss": 0.74861133, "num_input_tokens_seen": 86085460, "step": 4000, "time_per_iteration": 2.687453031539917 }, { "auxiliary_loss_clip": 0.01083595, "auxiliary_loss_mlp": 0.01004006, "balance_loss_clip": 0.98247862, "balance_loss_mlp": 1.00201547, "epoch": 0.481091805447003, "flos": 70511608817280.0, "grad_norm": 0.8803125212786385, "language_loss": 0.60860884, "learning_rate": 2.2192285976880573e-06, "loss": 0.62948483, "num_input_tokens_seen": 86149715, "step": 4001, "time_per_iteration": 3.199812889099121 }, { "auxiliary_loss_clip": 0.01173261, "auxiliary_loss_mlp": 0.01122057, "balance_loss_clip": 0.93342417, "balance_loss_mlp": 0.0, "epoch": 0.48121204833764203, "flos": 36428214839040.0, "grad_norm": 1.5197649567192624, "language_loss": 0.80577302, "learning_rate": 2.2184543033611485e-06, "loss": 0.82872623, "num_input_tokens_seen": 86170795, "step": 4002, "time_per_iteration": 3.751117467880249 }, { "auxiliary_loss_clip": 0.01182421, "auxiliary_loss_mlp": 0.01026059, "balance_loss_clip": 1.01434338, "balance_loss_mlp": 1.01893604, "epoch": 0.48133229122828114, "flos": 27490264871040.0, "grad_norm": 2.056468586707116, "language_loss": 0.81560725, "learning_rate": 2.2176799758948957e-06, "loss": 0.83769202, "num_input_tokens_seen": 86190955, "step": 4003, "time_per_iteration": 2.7709903717041016 }, { "auxiliary_loss_clip": 0.01170503, "auxiliary_loss_mlp": 0.01021499, "balance_loss_clip": 0.97452164, "balance_loss_mlp": 1.01410794, "epoch": 0.4814525341189202, "flos": 43072802179200.0, "grad_norm": 1.6250689160100267, "language_loss": 0.73318201, "learning_rate": 2.2169056154067635e-06, "loss": 0.75510204, "num_input_tokens_seen": 86214875, "step": 4004, "time_per_iteration": 2.945591688156128 }, { "auxiliary_loss_clip": 0.01177611, "auxiliary_loss_mlp": 0.01123099, "balance_loss_clip": 1.01427078, "balance_loss_mlp": 0.0, "epoch": 0.4815727770095593, "flos": 24236901400320.0, "grad_norm": 1.8208350959062822, "language_loss": 0.82549411, "learning_rate": 2.216131222014222e-06, "loss": 0.84850121, "num_input_tokens_seen": 86232950, "step": 4005, "time_per_iteration": 3.6596832275390625 }, { "auxiliary_loss_clip": 0.01167425, "auxiliary_loss_mlp": 0.01022005, "balance_loss_clip": 0.93561602, "balance_loss_mlp": 1.01460767, "epoch": 0.4816930199001984, "flos": 18113630100480.0, "grad_norm": 2.7400016645109693, "language_loss": 0.8025009, "learning_rate": 2.2153567958347455e-06, "loss": 0.82439518, "num_input_tokens_seen": 86249160, "step": 4006, "time_per_iteration": 2.7231571674346924 }, { "auxiliary_loss_clip": 0.01179782, "auxiliary_loss_mlp": 0.0103534, "balance_loss_clip": 0.97952199, "balance_loss_mlp": 1.02808642, "epoch": 0.48181326279083747, "flos": 17274720983040.0, "grad_norm": 1.9880608698593258, "language_loss": 0.80141687, "learning_rate": 2.214582336985815e-06, "loss": 0.82356805, "num_input_tokens_seen": 86267060, "step": 4007, "time_per_iteration": 2.705195188522339 }, { "auxiliary_loss_clip": 0.01169548, "auxiliary_loss_mlp": 0.01031824, "balance_loss_clip": 0.97471237, "balance_loss_mlp": 1.02394998, "epoch": 0.4819335056814766, "flos": 14903252231040.0, "grad_norm": 2.133217309600125, "language_loss": 0.66350913, "learning_rate": 2.2138078455849142e-06, "loss": 0.68552279, "num_input_tokens_seen": 86285055, "step": 4008, "time_per_iteration": 2.6863112449645996 }, { "auxiliary_loss_clip": 0.01183265, "auxiliary_loss_mlp": 0.01024123, "balance_loss_clip": 1.01424694, "balance_loss_mlp": 1.01707196, "epoch": 0.4820537485721157, "flos": 19244888012160.0, "grad_norm": 1.7439656242403034, "language_loss": 0.7859894, "learning_rate": 2.2130333217495334e-06, "loss": 0.80806327, "num_input_tokens_seen": 86304225, "step": 4009, "time_per_iteration": 2.661059617996216 }, { "auxiliary_loss_clip": 0.01172278, "auxiliary_loss_mlp": 0.01027569, "balance_loss_clip": 0.97451741, "balance_loss_mlp": 1.01973081, "epoch": 0.48217399146275475, "flos": 16033791870720.0, "grad_norm": 2.465164586967833, "language_loss": 0.67257142, "learning_rate": 2.2122587655971665e-06, "loss": 0.69456995, "num_input_tokens_seen": 86319170, "step": 4010, "time_per_iteration": 2.6825106143951416 }, { "auxiliary_loss_clip": 0.01176644, "auxiliary_loss_mlp": 0.01028709, "balance_loss_clip": 0.97485167, "balance_loss_mlp": 1.02074337, "epoch": 0.48229423435339386, "flos": 24134197438080.0, "grad_norm": 1.8303689111761716, "language_loss": 0.63614428, "learning_rate": 2.211484177245314e-06, "loss": 0.65819776, "num_input_tokens_seen": 86338760, "step": 4011, "time_per_iteration": 2.697883367538452 }, { "auxiliary_loss_clip": 0.01180575, "auxiliary_loss_mlp": 0.01024471, "balance_loss_clip": 1.0533911, "balance_loss_mlp": 1.01673388, "epoch": 0.48241447724403297, "flos": 23805435231360.0, "grad_norm": 1.8814606006180679, "language_loss": 0.72102082, "learning_rate": 2.21070955681148e-06, "loss": 0.74307132, "num_input_tokens_seen": 86357865, "step": 4012, "time_per_iteration": 2.6364872455596924 }, { "auxiliary_loss_clip": 0.01166977, "auxiliary_loss_mlp": 0.01023576, "balance_loss_clip": 0.93766433, "balance_loss_mlp": 1.01618171, "epoch": 0.482534720134672, "flos": 23110312256640.0, "grad_norm": 1.535921755359754, "language_loss": 0.78114569, "learning_rate": 2.209934904413174e-06, "loss": 0.80305123, "num_input_tokens_seen": 86379470, "step": 4013, "time_per_iteration": 2.720353603363037 }, { "auxiliary_loss_clip": 0.01165394, "auxiliary_loss_mlp": 0.01023794, "balance_loss_clip": 0.85252005, "balance_loss_mlp": 1.01591992, "epoch": 0.48265496302531113, "flos": 20923819568640.0, "grad_norm": 2.4190460762000674, "language_loss": 0.71754372, "learning_rate": 2.2091602201679095e-06, "loss": 0.73943561, "num_input_tokens_seen": 86399080, "step": 4014, "time_per_iteration": 2.813106060028076 }, { "auxiliary_loss_clip": 0.01178317, "auxiliary_loss_mlp": 0.01032019, "balance_loss_clip": 0.93600821, "balance_loss_mlp": 1.02432752, "epoch": 0.48277520591595025, "flos": 15231152511360.0, "grad_norm": 2.2627551687327196, "language_loss": 0.83008909, "learning_rate": 2.208385504193206e-06, "loss": 0.85219252, "num_input_tokens_seen": 86416580, "step": 4015, "time_per_iteration": 2.6845619678497314 }, { "auxiliary_loss_clip": 0.01178017, "auxiliary_loss_mlp": 0.0102541, "balance_loss_clip": 1.04959822, "balance_loss_mlp": 1.01781094, "epoch": 0.4828954488065893, "flos": 17858664385920.0, "grad_norm": 2.3235905034182767, "language_loss": 0.81539559, "learning_rate": 2.2076107566065873e-06, "loss": 0.83742994, "num_input_tokens_seen": 86434365, "step": 4016, "time_per_iteration": 2.629756212234497 }, { "auxiliary_loss_clip": 0.01184147, "auxiliary_loss_mlp": 0.01028452, "balance_loss_clip": 1.01595306, "balance_loss_mlp": 1.02158213, "epoch": 0.4830156916972284, "flos": 32087405070720.0, "grad_norm": 2.2179310529822347, "language_loss": 0.75710618, "learning_rate": 2.2068359775255816e-06, "loss": 0.77923214, "num_input_tokens_seen": 86452675, "step": 4017, "time_per_iteration": 2.716343879699707 }, { "auxiliary_loss_clip": 0.01168248, "auxiliary_loss_mlp": 0.01023563, "balance_loss_clip": 0.8961091, "balance_loss_mlp": 1.0162499, "epoch": 0.48313593458786747, "flos": 21871717528320.0, "grad_norm": 2.4677711213905855, "language_loss": 0.78325349, "learning_rate": 2.206061167067723e-06, "loss": 0.80517161, "num_input_tokens_seen": 86470785, "step": 4018, "time_per_iteration": 2.6764845848083496 }, { "auxiliary_loss_clip": 0.01170053, "auxiliary_loss_mlp": 0.0102736, "balance_loss_clip": 0.93409497, "balance_loss_mlp": 1.01943874, "epoch": 0.4832561774785066, "flos": 22601206840320.0, "grad_norm": 2.211025706794542, "language_loss": 0.79403526, "learning_rate": 2.205286325350549e-06, "loss": 0.8160094, "num_input_tokens_seen": 86489850, "step": 4019, "time_per_iteration": 2.7432777881622314 }, { "auxiliary_loss_clip": 0.01175166, "auxiliary_loss_mlp": 0.01031847, "balance_loss_clip": 0.89851093, "balance_loss_mlp": 1.02453995, "epoch": 0.4833764203691457, "flos": 13437342282240.0, "grad_norm": 2.0602174655568244, "language_loss": 0.72602206, "learning_rate": 2.204511452491603e-06, "loss": 0.74809217, "num_input_tokens_seen": 86506475, "step": 4020, "time_per_iteration": 3.5961263179779053 }, { "auxiliary_loss_clip": 0.01177287, "auxiliary_loss_mlp": 0.01024503, "balance_loss_clip": 1.05309772, "balance_loss_mlp": 1.01783013, "epoch": 0.48349666325978474, "flos": 44128036955520.0, "grad_norm": 1.5761524060851784, "language_loss": 0.75276625, "learning_rate": 2.2037365486084316e-06, "loss": 0.77478415, "num_input_tokens_seen": 86529715, "step": 4021, "time_per_iteration": 2.7959485054016113 }, { "auxiliary_loss_clip": 0.01184275, "auxiliary_loss_mlp": 0.01024539, "balance_loss_clip": 0.93500876, "balance_loss_mlp": 1.01736879, "epoch": 0.48361690615042385, "flos": 26028377245440.0, "grad_norm": 1.8160179121030966, "language_loss": 0.78136063, "learning_rate": 2.2029616138185886e-06, "loss": 0.8034488, "num_input_tokens_seen": 86548715, "step": 4022, "time_per_iteration": 3.691298007965088 }, { "auxiliary_loss_clip": 0.01173747, "auxiliary_loss_mlp": 0.01029134, "balance_loss_clip": 0.93965954, "balance_loss_mlp": 1.02201724, "epoch": 0.48373714904106296, "flos": 22273306560000.0, "grad_norm": 1.6977247451686124, "language_loss": 0.8289364, "learning_rate": 2.202186648239629e-06, "loss": 0.8509652, "num_input_tokens_seen": 86568650, "step": 4023, "time_per_iteration": 2.7014856338500977 }, { "auxiliary_loss_clip": 0.01172637, "auxiliary_loss_mlp": 0.01020766, "balance_loss_clip": 1.01316905, "balance_loss_mlp": 1.01363182, "epoch": 0.483857391931702, "flos": 28292293699200.0, "grad_norm": 1.6578756705229662, "language_loss": 0.71789753, "learning_rate": 2.201411651989117e-06, "loss": 0.73983157, "num_input_tokens_seen": 86590630, "step": 4024, "time_per_iteration": 2.7153923511505127 }, { "auxiliary_loss_clip": 0.01174154, "auxiliary_loss_mlp": 0.01122534, "balance_loss_clip": 0.97732353, "balance_loss_mlp": 0.0, "epoch": 0.48397763482234113, "flos": 27418048577280.0, "grad_norm": 2.1116124302985497, "language_loss": 0.77902985, "learning_rate": 2.2006366251846167e-06, "loss": 0.80199671, "num_input_tokens_seen": 86611270, "step": 4025, "time_per_iteration": 2.737135887145996 }, { "auxiliary_loss_clip": 0.01177993, "auxiliary_loss_mlp": 0.01030126, "balance_loss_clip": 0.9768551, "balance_loss_mlp": 1.02312875, "epoch": 0.48409787771298024, "flos": 16797252470400.0, "grad_norm": 1.7345832957250824, "language_loss": 0.75704098, "learning_rate": 2.1998615679436997e-06, "loss": 0.77912223, "num_input_tokens_seen": 86628810, "step": 4026, "time_per_iteration": 2.6541507244110107 }, { "auxiliary_loss_clip": 0.01184204, "auxiliary_loss_mlp": 0.01030086, "balance_loss_clip": 0.97530526, "balance_loss_mlp": 1.022403, "epoch": 0.4842181206036193, "flos": 25083496028160.0, "grad_norm": 2.188588170586712, "language_loss": 0.77275717, "learning_rate": 2.199086480383942e-06, "loss": 0.79490012, "num_input_tokens_seen": 86648185, "step": 4027, "time_per_iteration": 2.790452480316162 }, { "auxiliary_loss_clip": 0.01189251, "auxiliary_loss_mlp": 0.01028448, "balance_loss_clip": 0.97654134, "balance_loss_mlp": 1.02033591, "epoch": 0.4843383634942584, "flos": 30372311496960.0, "grad_norm": 2.598407428554456, "language_loss": 0.6771276, "learning_rate": 2.1983113626229234e-06, "loss": 0.69930452, "num_input_tokens_seen": 86667435, "step": 4028, "time_per_iteration": 3.6389999389648438 }, { "auxiliary_loss_clip": 0.01165732, "auxiliary_loss_mlp": 0.01122786, "balance_loss_clip": 0.93289781, "balance_loss_mlp": 0.0, "epoch": 0.4844586063848975, "flos": 20413564917120.0, "grad_norm": 3.0894267235476978, "language_loss": 0.78692985, "learning_rate": 2.1975362147782293e-06, "loss": 0.80981499, "num_input_tokens_seen": 86686630, "step": 4029, "time_per_iteration": 2.7622578144073486 }, { "auxiliary_loss_clip": 0.01099124, "auxiliary_loss_mlp": 0.01006266, "balance_loss_clip": 0.96511149, "balance_loss_mlp": 1.00432265, "epoch": 0.48457884927553657, "flos": 70303722854400.0, "grad_norm": 0.6948341948732175, "language_loss": 0.54187119, "learning_rate": 2.196761036967448e-06, "loss": 0.56292498, "num_input_tokens_seen": 86754595, "step": 4030, "time_per_iteration": 3.367311954498291 }, { "auxiliary_loss_clip": 0.01171772, "auxiliary_loss_mlp": 0.01023286, "balance_loss_clip": 1.01281607, "balance_loss_mlp": 1.01667273, "epoch": 0.4846990921661757, "flos": 19934516206080.0, "grad_norm": 1.6741964411632941, "language_loss": 0.77637756, "learning_rate": 2.1959858293081743e-06, "loss": 0.79832816, "num_input_tokens_seen": 86773730, "step": 4031, "time_per_iteration": 3.5254809856414795 }, { "auxiliary_loss_clip": 0.01169362, "auxiliary_loss_mlp": 0.01028449, "balance_loss_clip": 0.9348076, "balance_loss_mlp": 1.02110577, "epoch": 0.4848193350568148, "flos": 23075945919360.0, "grad_norm": 2.5051023205555665, "language_loss": 0.76198971, "learning_rate": 2.1952105919180056e-06, "loss": 0.78396785, "num_input_tokens_seen": 86792985, "step": 4032, "time_per_iteration": 2.7636117935180664 }, { "auxiliary_loss_clip": 0.01177846, "auxiliary_loss_mlp": 0.01031099, "balance_loss_clip": 0.97662103, "balance_loss_mlp": 1.02305269, "epoch": 0.48493957794745385, "flos": 22455481363200.0, "grad_norm": 2.293068687379989, "language_loss": 0.67708445, "learning_rate": 2.1944353249145456e-06, "loss": 0.69917393, "num_input_tokens_seen": 86812095, "step": 4033, "time_per_iteration": 2.7035205364227295 }, { "auxiliary_loss_clip": 0.01180616, "auxiliary_loss_mlp": 0.01030733, "balance_loss_clip": 1.05453265, "balance_loss_mlp": 1.02409875, "epoch": 0.48505982083809296, "flos": 25046112948480.0, "grad_norm": 1.7357478232558492, "language_loss": 0.74671477, "learning_rate": 2.193660028415401e-06, "loss": 0.76882827, "num_input_tokens_seen": 86832875, "step": 4034, "time_per_iteration": 2.7118144035339355 }, { "auxiliary_loss_clip": 0.01165549, "auxiliary_loss_mlp": 0.01024643, "balance_loss_clip": 0.97278237, "balance_loss_mlp": 1.01764584, "epoch": 0.485180063728732, "flos": 26761386090240.0, "grad_norm": 1.6409087897339267, "language_loss": 0.81914282, "learning_rate": 2.1928847025381852e-06, "loss": 0.84104472, "num_input_tokens_seen": 86853480, "step": 4035, "time_per_iteration": 2.7602739334106445 }, { "auxiliary_loss_clip": 0.0117407, "auxiliary_loss_mlp": 0.01030077, "balance_loss_clip": 1.01030779, "balance_loss_mlp": 1.02260232, "epoch": 0.4853003066193711, "flos": 24059143969920.0, "grad_norm": 1.6366350873230442, "language_loss": 0.83919722, "learning_rate": 2.192109347400512e-06, "loss": 0.86123872, "num_input_tokens_seen": 86873695, "step": 4036, "time_per_iteration": 2.6844820976257324 }, { "auxiliary_loss_clip": 0.0117574, "auxiliary_loss_mlp": 0.0102899, "balance_loss_clip": 0.97218776, "balance_loss_mlp": 1.02095497, "epoch": 0.48542054951001024, "flos": 23076376882560.0, "grad_norm": 1.669388118659948, "language_loss": 0.79047465, "learning_rate": 2.191333963120004e-06, "loss": 0.81252193, "num_input_tokens_seen": 86892675, "step": 4037, "time_per_iteration": 2.7018470764160156 }, { "auxiliary_loss_clip": 0.01175523, "auxiliary_loss_mlp": 0.01026799, "balance_loss_clip": 0.97452915, "balance_loss_mlp": 1.01964378, "epoch": 0.4855407924006493, "flos": 25664889565440.0, "grad_norm": 2.53788125988837, "language_loss": 0.69605047, "learning_rate": 2.190558549814286e-06, "loss": 0.71807373, "num_input_tokens_seen": 86912835, "step": 4038, "time_per_iteration": 2.7441184520721436 }, { "auxiliary_loss_clip": 0.01173391, "auxiliary_loss_mlp": 0.01024947, "balance_loss_clip": 0.97455889, "balance_loss_mlp": 1.01778543, "epoch": 0.4856610352912884, "flos": 23987933256960.0, "grad_norm": 1.704185440983328, "language_loss": 0.7948457, "learning_rate": 2.1897831076009872e-06, "loss": 0.81682903, "num_input_tokens_seen": 86932475, "step": 4039, "time_per_iteration": 2.7231593132019043 }, { "auxiliary_loss_clip": 0.01178587, "auxiliary_loss_mlp": 0.01024745, "balance_loss_clip": 1.0133357, "balance_loss_mlp": 1.01681161, "epoch": 0.4857812781819275, "flos": 24096814358400.0, "grad_norm": 1.7020689835463125, "language_loss": 0.79802322, "learning_rate": 2.1890076365977426e-06, "loss": 0.8200565, "num_input_tokens_seen": 86952300, "step": 4040, "time_per_iteration": 2.702937126159668 }, { "auxiliary_loss_clip": 0.0108504, "auxiliary_loss_mlp": 0.01004888, "balance_loss_clip": 0.94860893, "balance_loss_mlp": 1.00298107, "epoch": 0.48590152107256657, "flos": 56266635185280.0, "grad_norm": 0.8578407566624827, "language_loss": 0.52876908, "learning_rate": 2.188232136922189e-06, "loss": 0.54966837, "num_input_tokens_seen": 87010420, "step": 4041, "time_per_iteration": 3.1705968379974365 }, { "auxiliary_loss_clip": 0.01163062, "auxiliary_loss_mlp": 0.01029463, "balance_loss_clip": 0.85618615, "balance_loss_mlp": 1.02151215, "epoch": 0.4860217639632057, "flos": 20046988667520.0, "grad_norm": 1.9664460370500743, "language_loss": 0.75632131, "learning_rate": 2.187456608691971e-06, "loss": 0.77824652, "num_input_tokens_seen": 87029295, "step": 4042, "time_per_iteration": 2.780057430267334 }, { "auxiliary_loss_clip": 0.01180401, "auxiliary_loss_mlp": 0.01027094, "balance_loss_clip": 0.93943202, "balance_loss_mlp": 1.01981068, "epoch": 0.4861420068538448, "flos": 17822143232640.0, "grad_norm": 1.7568787549697686, "language_loss": 0.87735403, "learning_rate": 2.1866810520247334e-06, "loss": 0.89942896, "num_input_tokens_seen": 87048165, "step": 4043, "time_per_iteration": 2.7103142738342285 }, { "auxiliary_loss_clip": 0.01179076, "auxiliary_loss_mlp": 0.01027098, "balance_loss_clip": 1.01227117, "balance_loss_mlp": 1.01886702, "epoch": 0.48626224974448384, "flos": 26250125857920.0, "grad_norm": 1.7698151706199332, "language_loss": 0.64878428, "learning_rate": 2.185905467038129e-06, "loss": 0.67084599, "num_input_tokens_seen": 87067070, "step": 4044, "time_per_iteration": 2.671438694000244 }, { "auxiliary_loss_clip": 0.01176848, "auxiliary_loss_mlp": 0.01025202, "balance_loss_clip": 1.05374169, "balance_loss_mlp": 1.01864839, "epoch": 0.48638249263512295, "flos": 22054502862720.0, "grad_norm": 1.7685387770330065, "language_loss": 0.77724177, "learning_rate": 2.1851298538498127e-06, "loss": 0.79926229, "num_input_tokens_seen": 87086785, "step": 4045, "time_per_iteration": 2.66357421875 }, { "auxiliary_loss_clip": 0.01186329, "auxiliary_loss_mlp": 0.01123231, "balance_loss_clip": 1.0165689, "balance_loss_mlp": 0.0, "epoch": 0.48650273552576206, "flos": 25119945354240.0, "grad_norm": 2.1020164130370596, "language_loss": 0.80192745, "learning_rate": 2.184354212577446e-06, "loss": 0.82502306, "num_input_tokens_seen": 87107090, "step": 4046, "time_per_iteration": 3.681401014328003 }, { "auxiliary_loss_clip": 0.01184528, "auxiliary_loss_mlp": 0.01034433, "balance_loss_clip": 1.05338025, "balance_loss_mlp": 1.02640438, "epoch": 0.4866229784164011, "flos": 17456931699840.0, "grad_norm": 2.8174572028640466, "language_loss": 0.62257099, "learning_rate": 2.1835785433386907e-06, "loss": 0.64476061, "num_input_tokens_seen": 87125905, "step": 4047, "time_per_iteration": 2.6149163246154785 }, { "auxiliary_loss_clip": 0.01167352, "auxiliary_loss_mlp": 0.01029563, "balance_loss_clip": 0.93807679, "balance_loss_mlp": 1.02175522, "epoch": 0.48674322130704023, "flos": 23331127115520.0, "grad_norm": 1.6799859359212117, "language_loss": 0.64945942, "learning_rate": 2.182802846251216e-06, "loss": 0.67142856, "num_input_tokens_seen": 87146175, "step": 4048, "time_per_iteration": 3.75087571144104 }, { "auxiliary_loss_clip": 0.01177203, "auxiliary_loss_mlp": 0.0102436, "balance_loss_clip": 0.93501878, "balance_loss_mlp": 1.01735663, "epoch": 0.4868634641976793, "flos": 28804344030720.0, "grad_norm": 1.7382807567523708, "language_loss": 0.72118533, "learning_rate": 2.182027121432696e-06, "loss": 0.7432009, "num_input_tokens_seen": 87166800, "step": 4049, "time_per_iteration": 2.8333818912506104 }, { "auxiliary_loss_clip": 0.0118087, "auxiliary_loss_mlp": 0.01029454, "balance_loss_clip": 1.05163312, "balance_loss_mlp": 1.02138638, "epoch": 0.4869837070883184, "flos": 19025976574080.0, "grad_norm": 1.833084383224113, "language_loss": 0.8195194, "learning_rate": 2.1812513690008054e-06, "loss": 0.84162271, "num_input_tokens_seen": 87185920, "step": 4050, "time_per_iteration": 2.688155174255371 }, { "auxiliary_loss_clip": 0.01185381, "auxiliary_loss_mlp": 0.01025521, "balance_loss_clip": 1.0150367, "balance_loss_mlp": 1.01769781, "epoch": 0.4871039499789575, "flos": 15121409483520.0, "grad_norm": 2.0915574712156695, "language_loss": 0.79695129, "learning_rate": 2.180475589073227e-06, "loss": 0.81906033, "num_input_tokens_seen": 87203620, "step": 4051, "time_per_iteration": 2.5875747203826904 }, { "auxiliary_loss_clip": 0.01165082, "auxiliary_loss_mlp": 0.01025052, "balance_loss_clip": 1.00968742, "balance_loss_mlp": 1.01779532, "epoch": 0.48722419286959656, "flos": 26174066808960.0, "grad_norm": 1.7923986823201021, "language_loss": 0.73428202, "learning_rate": 2.1796997817676456e-06, "loss": 0.75618333, "num_input_tokens_seen": 87224630, "step": 4052, "time_per_iteration": 2.746382236480713 }, { "auxiliary_loss_clip": 0.0118036, "auxiliary_loss_mlp": 0.01122085, "balance_loss_clip": 1.01511717, "balance_loss_mlp": 0.0, "epoch": 0.4873444357602357, "flos": 24026142349440.0, "grad_norm": 1.8053113492790507, "language_loss": 0.67302895, "learning_rate": 2.1789239472017494e-06, "loss": 0.69605339, "num_input_tokens_seen": 87246280, "step": 4053, "time_per_iteration": 2.687880277633667 }, { "auxiliary_loss_clip": 0.01172986, "auxiliary_loss_mlp": 0.01026318, "balance_loss_clip": 0.93659455, "balance_loss_mlp": 1.01927304, "epoch": 0.4874646786508748, "flos": 22820441500800.0, "grad_norm": 2.1227403509933644, "language_loss": 0.72857684, "learning_rate": 2.1781480854932326e-06, "loss": 0.75056982, "num_input_tokens_seen": 87266045, "step": 4054, "time_per_iteration": 3.6499698162078857 }, { "auxiliary_loss_clip": 0.01175164, "auxiliary_loss_mlp": 0.01022276, "balance_loss_clip": 0.90012026, "balance_loss_mlp": 1.0147717, "epoch": 0.48758492154151384, "flos": 21287594557440.0, "grad_norm": 1.835100584348888, "language_loss": 0.79209381, "learning_rate": 2.1773721967597933e-06, "loss": 0.8140682, "num_input_tokens_seen": 87284495, "step": 4055, "time_per_iteration": 2.7676455974578857 }, { "auxiliary_loss_clip": 0.01076154, "auxiliary_loss_mlp": 0.01003147, "balance_loss_clip": 0.94464302, "balance_loss_mlp": 1.0011797, "epoch": 0.48770516443215295, "flos": 62244109180800.0, "grad_norm": 2.9758619597275904, "language_loss": 0.57388514, "learning_rate": 2.1765962811191322e-06, "loss": 0.59467816, "num_input_tokens_seen": 87338960, "step": 4056, "time_per_iteration": 3.1479737758636475 }, { "auxiliary_loss_clip": 0.01077697, "auxiliary_loss_mlp": 0.01004048, "balance_loss_clip": 0.86754, "balance_loss_mlp": 1.00200927, "epoch": 0.48782540732279206, "flos": 66133451882880.0, "grad_norm": 0.8713234286235637, "language_loss": 0.62115085, "learning_rate": 2.1758203386889566e-06, "loss": 0.64196825, "num_input_tokens_seen": 87401730, "step": 4057, "time_per_iteration": 4.407903671264648 }, { "auxiliary_loss_clip": 0.01173621, "auxiliary_loss_mlp": 0.01122903, "balance_loss_clip": 0.93605119, "balance_loss_mlp": 0.0, "epoch": 0.4879456502134311, "flos": 14607922608000.0, "grad_norm": 2.9950573693115863, "language_loss": 0.84375143, "learning_rate": 2.1750443695869746e-06, "loss": 0.86671662, "num_input_tokens_seen": 87417300, "step": 4058, "time_per_iteration": 2.7348744869232178 }, { "auxiliary_loss_clip": 0.01181556, "auxiliary_loss_mlp": 0.01028286, "balance_loss_clip": 1.01414275, "balance_loss_mlp": 1.0211215, "epoch": 0.4880658931040702, "flos": 19500464257920.0, "grad_norm": 1.7944185554622516, "language_loss": 0.85923249, "learning_rate": 2.174268373930901e-06, "loss": 0.88133091, "num_input_tokens_seen": 87434815, "step": 4059, "time_per_iteration": 2.7251136302948 }, { "auxiliary_loss_clip": 0.0117054, "auxiliary_loss_mlp": 0.0112263, "balance_loss_clip": 0.93724895, "balance_loss_mlp": 0.0, "epoch": 0.48818613599470934, "flos": 16723060928640.0, "grad_norm": 1.7870995412912334, "language_loss": 0.79303503, "learning_rate": 2.1734923518384537e-06, "loss": 0.81596678, "num_input_tokens_seen": 87451420, "step": 4060, "time_per_iteration": 2.695770502090454 }, { "auxiliary_loss_clip": 0.01160474, "auxiliary_loss_mlp": 0.01029649, "balance_loss_clip": 0.93736291, "balance_loss_mlp": 1.02272928, "epoch": 0.4883063788853484, "flos": 26756932803840.0, "grad_norm": 1.6769631765513042, "language_loss": 0.82412612, "learning_rate": 2.1727163034273547e-06, "loss": 0.84602737, "num_input_tokens_seen": 87469585, "step": 4061, "time_per_iteration": 2.8294429779052734 }, { "auxiliary_loss_clip": 0.01176896, "auxiliary_loss_mlp": 0.01028121, "balance_loss_clip": 1.01135325, "balance_loss_mlp": 1.02092385, "epoch": 0.4884266217759875, "flos": 16763388923520.0, "grad_norm": 4.169659251276357, "language_loss": 0.78796774, "learning_rate": 2.17194022881533e-06, "loss": 0.81001788, "num_input_tokens_seen": 87485675, "step": 4062, "time_per_iteration": 2.711298942565918 }, { "auxiliary_loss_clip": 0.01177403, "auxiliary_loss_mlp": 0.01029486, "balance_loss_clip": 0.97414935, "balance_loss_mlp": 1.02146864, "epoch": 0.4885468646666266, "flos": 24207132003840.0, "grad_norm": 3.921976603758382, "language_loss": 0.67566353, "learning_rate": 2.1711641281201092e-06, "loss": 0.69773233, "num_input_tokens_seen": 87505605, "step": 4063, "time_per_iteration": 2.695185661315918 }, { "auxiliary_loss_clip": 0.01178722, "auxiliary_loss_mlp": 0.01032838, "balance_loss_clip": 1.01578367, "balance_loss_mlp": 1.02544975, "epoch": 0.48866710755726567, "flos": 14610795696000.0, "grad_norm": 2.1490412206190346, "language_loss": 0.79189432, "learning_rate": 2.1703880014594264e-06, "loss": 0.8140099, "num_input_tokens_seen": 87523195, "step": 4064, "time_per_iteration": 2.62819504737854 }, { "auxiliary_loss_clip": 0.01170814, "auxiliary_loss_mlp": 0.010281, "balance_loss_clip": 0.90070623, "balance_loss_mlp": 1.02054834, "epoch": 0.4887873504479048, "flos": 28804451771520.0, "grad_norm": 1.8170428841689426, "language_loss": 0.73313105, "learning_rate": 2.1696118489510182e-06, "loss": 0.75512022, "num_input_tokens_seen": 87544125, "step": 4065, "time_per_iteration": 2.7863519191741943 }, { "auxiliary_loss_clip": 0.01183216, "auxiliary_loss_mlp": 0.01123272, "balance_loss_clip": 0.93846136, "balance_loss_mlp": 0.0, "epoch": 0.48890759333854383, "flos": 22784387224320.0, "grad_norm": 2.024084044596419, "language_loss": 0.72151846, "learning_rate": 2.1688356707126286e-06, "loss": 0.74458325, "num_input_tokens_seen": 87563745, "step": 4066, "time_per_iteration": 2.692044973373413 }, { "auxiliary_loss_clip": 0.01172673, "auxiliary_loss_mlp": 0.01030279, "balance_loss_clip": 0.93797731, "balance_loss_mlp": 1.02192807, "epoch": 0.48902783622918294, "flos": 17786088956160.0, "grad_norm": 1.8111693572151453, "language_loss": 0.69989121, "learning_rate": 2.168059466862001e-06, "loss": 0.72192073, "num_input_tokens_seen": 87581895, "step": 4067, "time_per_iteration": 2.681426763534546 }, { "auxiliary_loss_clip": 0.01175247, "auxiliary_loss_mlp": 0.0102885, "balance_loss_clip": 0.97119975, "balance_loss_mlp": 1.02122653, "epoch": 0.48914807911982205, "flos": 22310294590080.0, "grad_norm": 1.9500244876283281, "language_loss": 0.81498688, "learning_rate": 2.167283237516887e-06, "loss": 0.83702779, "num_input_tokens_seen": 87600170, "step": 4068, "time_per_iteration": 2.6638495922088623 }, { "auxiliary_loss_clip": 0.01178351, "auxiliary_loss_mlp": 0.01026546, "balance_loss_clip": 0.97453475, "balance_loss_mlp": 1.01919699, "epoch": 0.4892683220104611, "flos": 16363020954240.0, "grad_norm": 1.706102932916757, "language_loss": 0.74479645, "learning_rate": 2.1665069827950383e-06, "loss": 0.76684546, "num_input_tokens_seen": 87617455, "step": 4069, "time_per_iteration": 2.6504688262939453 }, { "auxiliary_loss_clip": 0.01176276, "auxiliary_loss_mlp": 0.01031185, "balance_loss_clip": 0.97515357, "balance_loss_mlp": 1.02366614, "epoch": 0.4893885649011002, "flos": 15739144606080.0, "grad_norm": 1.810650029354026, "language_loss": 0.86628914, "learning_rate": 2.1657307028142126e-06, "loss": 0.88836372, "num_input_tokens_seen": 87634995, "step": 4070, "time_per_iteration": 2.6394169330596924 }, { "auxiliary_loss_clip": 0.01177982, "auxiliary_loss_mlp": 0.01030013, "balance_loss_clip": 0.9756906, "balance_loss_mlp": 1.02186775, "epoch": 0.48950880779173933, "flos": 28581984887040.0, "grad_norm": 1.7939991747894781, "language_loss": 0.66983622, "learning_rate": 2.164954397692171e-06, "loss": 0.69191617, "num_input_tokens_seen": 87654420, "step": 4071, "time_per_iteration": 3.6817121505737305 }, { "auxiliary_loss_clip": 0.01086387, "auxiliary_loss_mlp": 0.0100856, "balance_loss_clip": 0.94613194, "balance_loss_mlp": 1.00653315, "epoch": 0.4896290506823784, "flos": 66186310746240.0, "grad_norm": 1.0788034113967944, "language_loss": 0.77384454, "learning_rate": 2.164178067546678e-06, "loss": 0.79479402, "num_input_tokens_seen": 87713585, "step": 4072, "time_per_iteration": 3.322200298309326 }, { "auxiliary_loss_clip": 0.01183744, "auxiliary_loss_mlp": 0.01025311, "balance_loss_clip": 0.97458893, "balance_loss_mlp": 1.01789045, "epoch": 0.4897492935730175, "flos": 12531065207040.0, "grad_norm": 1.8284727290343255, "language_loss": 0.90627992, "learning_rate": 2.163401712495504e-06, "loss": 0.92837048, "num_input_tokens_seen": 87731280, "step": 4073, "time_per_iteration": 2.7034919261932373 }, { "auxiliary_loss_clip": 0.01180644, "auxiliary_loss_mlp": 0.01023054, "balance_loss_clip": 0.90005165, "balance_loss_mlp": 1.01532364, "epoch": 0.4898695364636566, "flos": 23476816679040.0, "grad_norm": 1.5443378297739403, "language_loss": 0.79323387, "learning_rate": 2.1626253326564194e-06, "loss": 0.8152709, "num_input_tokens_seen": 87750230, "step": 4074, "time_per_iteration": 3.6880617141723633 }, { "auxiliary_loss_clip": 0.01173616, "auxiliary_loss_mlp": 0.01029096, "balance_loss_clip": 0.97220755, "balance_loss_mlp": 1.02103186, "epoch": 0.48998977935429566, "flos": 27160209774720.0, "grad_norm": 1.6433135580675922, "language_loss": 0.76872802, "learning_rate": 2.161848928147201e-06, "loss": 0.79075515, "num_input_tokens_seen": 87770500, "step": 4075, "time_per_iteration": 2.725044012069702 }, { "auxiliary_loss_clip": 0.01180051, "auxiliary_loss_mlp": 0.01027366, "balance_loss_clip": 1.01669729, "balance_loss_mlp": 1.02020502, "epoch": 0.4901100222449348, "flos": 20339588856960.0, "grad_norm": 2.0091504664354694, "language_loss": 0.80660498, "learning_rate": 2.161072499085629e-06, "loss": 0.8286792, "num_input_tokens_seen": 87789495, "step": 4076, "time_per_iteration": 2.702565908432007 }, { "auxiliary_loss_clip": 0.01180742, "auxiliary_loss_mlp": 0.01031357, "balance_loss_clip": 0.93803841, "balance_loss_mlp": 1.0238167, "epoch": 0.4902302651355739, "flos": 30446359384320.0, "grad_norm": 1.550561242067185, "language_loss": 0.82838243, "learning_rate": 2.160296045589487e-06, "loss": 0.85050344, "num_input_tokens_seen": 87812955, "step": 4077, "time_per_iteration": 2.838564395904541 }, { "auxiliary_loss_clip": 0.01179086, "auxiliary_loss_mlp": 0.01029155, "balance_loss_clip": 1.01483011, "balance_loss_mlp": 1.02160311, "epoch": 0.49035050802621294, "flos": 19174180089600.0, "grad_norm": 1.6599918066635189, "language_loss": 0.69606459, "learning_rate": 2.159519567776562e-06, "loss": 0.71814698, "num_input_tokens_seen": 87832605, "step": 4078, "time_per_iteration": 2.5829339027404785 }, { "auxiliary_loss_clip": 0.01177406, "auxiliary_loss_mlp": 0.01035604, "balance_loss_clip": 0.89540678, "balance_loss_mlp": 1.02754557, "epoch": 0.49047075091685205, "flos": 22228489365120.0, "grad_norm": 2.4637420817238307, "language_loss": 0.70736933, "learning_rate": 2.1587430657646463e-06, "loss": 0.72949946, "num_input_tokens_seen": 87846040, "step": 4079, "time_per_iteration": 2.7323105335235596 }, { "auxiliary_loss_clip": 0.01175661, "auxiliary_loss_mlp": 0.01028535, "balance_loss_clip": 0.97579348, "balance_loss_mlp": 1.02088165, "epoch": 0.4905909938074911, "flos": 20156516213760.0, "grad_norm": 2.2619532689119826, "language_loss": 0.78068161, "learning_rate": 2.157966539671533e-06, "loss": 0.80272359, "num_input_tokens_seen": 87865680, "step": 4080, "time_per_iteration": 3.523442506790161 }, { "auxiliary_loss_clip": 0.01176148, "auxiliary_loss_mlp": 0.01025474, "balance_loss_clip": 0.93523955, "balance_loss_mlp": 1.01788378, "epoch": 0.4907112366981302, "flos": 17202217380480.0, "grad_norm": 1.8489084781298366, "language_loss": 0.67840952, "learning_rate": 2.157189989615021e-06, "loss": 0.70042574, "num_input_tokens_seen": 87884270, "step": 4081, "time_per_iteration": 2.6992571353912354 }, { "auxiliary_loss_clip": 0.01180058, "auxiliary_loss_mlp": 0.01123685, "balance_loss_clip": 1.01292539, "balance_loss_mlp": 0.0, "epoch": 0.4908314795887693, "flos": 21688968107520.0, "grad_norm": 1.7994251593074184, "language_loss": 0.7566669, "learning_rate": 2.156413415712913e-06, "loss": 0.77970433, "num_input_tokens_seen": 87906320, "step": 4082, "time_per_iteration": 3.718390464782715 }, { "auxiliary_loss_clip": 0.01181461, "auxiliary_loss_mlp": 0.01123295, "balance_loss_clip": 0.97536004, "balance_loss_mlp": 0.0, "epoch": 0.4909517224794084, "flos": 26213676531840.0, "grad_norm": 1.654216566440082, "language_loss": 0.78771937, "learning_rate": 2.155636818083014e-06, "loss": 0.81076688, "num_input_tokens_seen": 87927690, "step": 4083, "time_per_iteration": 2.71895170211792 }, { "auxiliary_loss_clip": 0.01172188, "auxiliary_loss_mlp": 0.01025646, "balance_loss_clip": 0.97494078, "balance_loss_mlp": 1.01846993, "epoch": 0.4910719653700475, "flos": 23148377694720.0, "grad_norm": 1.7416115428455472, "language_loss": 0.84166002, "learning_rate": 2.154860196843134e-06, "loss": 0.8636384, "num_input_tokens_seen": 87946885, "step": 4084, "time_per_iteration": 2.7351791858673096 }, { "auxiliary_loss_clip": 0.01179563, "auxiliary_loss_mlp": 0.01023212, "balance_loss_clip": 1.04993367, "balance_loss_mlp": 1.01576138, "epoch": 0.4911922082606866, "flos": 23331845387520.0, "grad_norm": 1.6117591840773973, "language_loss": 0.76710868, "learning_rate": 2.154083552111085e-06, "loss": 0.78913641, "num_input_tokens_seen": 87966055, "step": 4085, "time_per_iteration": 2.715363025665283 }, { "auxiliary_loss_clip": 0.01180134, "auxiliary_loss_mlp": 0.01024684, "balance_loss_clip": 1.0500952, "balance_loss_mlp": 1.01655996, "epoch": 0.49131245115132566, "flos": 29203239542400.0, "grad_norm": 1.6633563710835801, "language_loss": 0.81680763, "learning_rate": 2.1533068840046834e-06, "loss": 0.83885574, "num_input_tokens_seen": 87986320, "step": 4086, "time_per_iteration": 2.7305288314819336 }, { "auxiliary_loss_clip": 0.0116861, "auxiliary_loss_mlp": 0.01123516, "balance_loss_clip": 0.97380221, "balance_loss_mlp": 0.0, "epoch": 0.49143269404196477, "flos": 20147465986560.0, "grad_norm": 2.1960005595245313, "language_loss": 0.61744988, "learning_rate": 2.152530192641749e-06, "loss": 0.64037114, "num_input_tokens_seen": 88001230, "step": 4087, "time_per_iteration": 2.6856791973114014 }, { "auxiliary_loss_clip": 0.01183446, "auxiliary_loss_mlp": 0.01022976, "balance_loss_clip": 1.01437974, "balance_loss_mlp": 1.01594234, "epoch": 0.4915529369326039, "flos": 24389809597440.0, "grad_norm": 1.7387357104759882, "language_loss": 0.72327417, "learning_rate": 2.1517534781401068e-06, "loss": 0.74533838, "num_input_tokens_seen": 88019110, "step": 4088, "time_per_iteration": 2.725222587585449 }, { "auxiliary_loss_clip": 0.01179468, "auxiliary_loss_mlp": 0.01031245, "balance_loss_clip": 1.01427579, "balance_loss_mlp": 1.0239495, "epoch": 0.49167317982324293, "flos": 10524305197440.0, "grad_norm": 1.967269786994216, "language_loss": 0.69241154, "learning_rate": 2.150976740617581e-06, "loss": 0.71451867, "num_input_tokens_seen": 88035670, "step": 4089, "time_per_iteration": 2.6666338443756104 }, { "auxiliary_loss_clip": 0.0118294, "auxiliary_loss_mlp": 0.01032672, "balance_loss_clip": 0.97780788, "balance_loss_mlp": 1.0248816, "epoch": 0.49179342271388204, "flos": 25593427457280.0, "grad_norm": 1.8038495720885104, "language_loss": 0.70874161, "learning_rate": 2.150199980192006e-06, "loss": 0.73089772, "num_input_tokens_seen": 88054790, "step": 4090, "time_per_iteration": 2.700735569000244 }, { "auxiliary_loss_clip": 0.01170314, "auxiliary_loss_mlp": 0.01026779, "balance_loss_clip": 0.97271514, "balance_loss_mlp": 1.01901865, "epoch": 0.49191366560452116, "flos": 21102043875840.0, "grad_norm": 1.5652639495022476, "language_loss": 0.80982691, "learning_rate": 2.1494231969812114e-06, "loss": 0.83179784, "num_input_tokens_seen": 88073780, "step": 4091, "time_per_iteration": 2.706477642059326 }, { "auxiliary_loss_clip": 0.01182568, "auxiliary_loss_mlp": 0.01023646, "balance_loss_clip": 0.93982464, "balance_loss_mlp": 1.01669645, "epoch": 0.4920339084951602, "flos": 26067520091520.0, "grad_norm": 2.414597760079881, "language_loss": 0.81054604, "learning_rate": 2.1486463911030372e-06, "loss": 0.83260822, "num_input_tokens_seen": 88094430, "step": 4092, "time_per_iteration": 2.744919538497925 }, { "auxiliary_loss_clip": 0.01172857, "auxiliary_loss_mlp": 0.01025277, "balance_loss_clip": 0.97301501, "balance_loss_mlp": 1.01793706, "epoch": 0.4921541513857993, "flos": 25081269384960.0, "grad_norm": 1.7569558062216275, "language_loss": 0.74622887, "learning_rate": 2.147869562675324e-06, "loss": 0.76821017, "num_input_tokens_seen": 88113400, "step": 4093, "time_per_iteration": 2.74008846282959 }, { "auxiliary_loss_clip": 0.0117964, "auxiliary_loss_mlp": 0.0102926, "balance_loss_clip": 1.01466656, "balance_loss_mlp": 1.02099323, "epoch": 0.49227439427643843, "flos": 24389809597440.0, "grad_norm": 2.058855951545856, "language_loss": 0.72413415, "learning_rate": 2.147092711815915e-06, "loss": 0.74622309, "num_input_tokens_seen": 88132750, "step": 4094, "time_per_iteration": 2.646038055419922 }, { "auxiliary_loss_clip": 0.01175595, "auxiliary_loss_mlp": 0.01027213, "balance_loss_clip": 0.93768156, "balance_loss_mlp": 1.01975632, "epoch": 0.4923946371670775, "flos": 11363753018880.0, "grad_norm": 2.3003359298080093, "language_loss": 0.86167061, "learning_rate": 2.1463158386426593e-06, "loss": 0.8836987, "num_input_tokens_seen": 88150560, "step": 4095, "time_per_iteration": 2.6809465885162354 }, { "auxiliary_loss_clip": 0.01181572, "auxiliary_loss_mlp": 0.0102402, "balance_loss_clip": 0.97589767, "balance_loss_mlp": 1.01604486, "epoch": 0.4925148800577166, "flos": 30445964334720.0, "grad_norm": 1.9264163014797409, "language_loss": 0.77269387, "learning_rate": 2.145538943273407e-06, "loss": 0.79474986, "num_input_tokens_seen": 88170835, "step": 4096, "time_per_iteration": 2.7422497272491455 }, { "auxiliary_loss_clip": 0.01183391, "auxiliary_loss_mlp": 0.01031902, "balance_loss_clip": 1.05336642, "balance_loss_mlp": 1.02418351, "epoch": 0.49263512294835565, "flos": 20850454039680.0, "grad_norm": 1.797263128441114, "language_loss": 0.71853733, "learning_rate": 2.144762025826013e-06, "loss": 0.74069029, "num_input_tokens_seen": 88189925, "step": 4097, "time_per_iteration": 3.6253254413604736 }, { "auxiliary_loss_clip": 0.0118199, "auxiliary_loss_mlp": 0.01027491, "balance_loss_clip": 1.01305604, "balance_loss_mlp": 1.01960576, "epoch": 0.49275536583899476, "flos": 23767477534080.0, "grad_norm": 3.9782929331321064, "language_loss": 0.87285912, "learning_rate": 2.143985086418334e-06, "loss": 0.89495397, "num_input_tokens_seen": 88205105, "step": 4098, "time_per_iteration": 2.63743257522583 }, { "auxiliary_loss_clip": 0.01180642, "auxiliary_loss_mlp": 0.01025713, "balance_loss_clip": 0.97608185, "balance_loss_mlp": 1.01862025, "epoch": 0.4928756087296339, "flos": 22273522041600.0, "grad_norm": 1.3714335163068392, "language_loss": 0.76507777, "learning_rate": 2.1432081251682324e-06, "loss": 0.78714132, "num_input_tokens_seen": 88225475, "step": 4099, "time_per_iteration": 2.6972169876098633 }, { "auxiliary_loss_clip": 0.01182398, "auxiliary_loss_mlp": 0.01031253, "balance_loss_clip": 1.01774943, "balance_loss_mlp": 1.02384472, "epoch": 0.49299585162027293, "flos": 19645471463040.0, "grad_norm": 1.6834618663330323, "language_loss": 0.87006915, "learning_rate": 2.142431142193572e-06, "loss": 0.89220566, "num_input_tokens_seen": 88243255, "step": 4100, "time_per_iteration": 3.531752109527588 }, { "auxiliary_loss_clip": 0.01177849, "auxiliary_loss_mlp": 0.01029415, "balance_loss_clip": 1.05236065, "balance_loss_mlp": 1.02232814, "epoch": 0.49311609451091204, "flos": 38837138497920.0, "grad_norm": 2.170169084522847, "language_loss": 0.71705788, "learning_rate": 2.1416541376122207e-06, "loss": 0.7391305, "num_input_tokens_seen": 88263435, "step": 4101, "time_per_iteration": 2.7342898845672607 }, { "auxiliary_loss_clip": 0.01179287, "auxiliary_loss_mlp": 0.01024261, "balance_loss_clip": 1.05204582, "balance_loss_mlp": 1.0165658, "epoch": 0.49323633740155115, "flos": 28329102161280.0, "grad_norm": 2.0954758986827606, "language_loss": 0.72839618, "learning_rate": 2.1408771115420496e-06, "loss": 0.75043166, "num_input_tokens_seen": 88283295, "step": 4102, "time_per_iteration": 2.6962249279022217 }, { "auxiliary_loss_clip": 0.01187584, "auxiliary_loss_mlp": 0.01027783, "balance_loss_clip": 0.90743238, "balance_loss_mlp": 1.0204041, "epoch": 0.4933565802921902, "flos": 21135584200320.0, "grad_norm": 1.7558206412402686, "language_loss": 0.64885092, "learning_rate": 2.140100064100932e-06, "loss": 0.67100459, "num_input_tokens_seen": 88299270, "step": 4103, "time_per_iteration": 2.7259726524353027 }, { "auxiliary_loss_clip": 0.01175512, "auxiliary_loss_mlp": 0.01031418, "balance_loss_clip": 1.01399577, "balance_loss_mlp": 1.02364552, "epoch": 0.4934768231828293, "flos": 18039007595520.0, "grad_norm": 1.7281584559455256, "language_loss": 0.75729942, "learning_rate": 2.139322995406746e-06, "loss": 0.7793687, "num_input_tokens_seen": 88316905, "step": 4104, "time_per_iteration": 2.705813407897949 }, { "auxiliary_loss_clip": 0.01180394, "auxiliary_loss_mlp": 0.01036075, "balance_loss_clip": 1.05308938, "balance_loss_mlp": 1.02853489, "epoch": 0.4935970660734684, "flos": 23469957181440.0, "grad_norm": 2.325343181957132, "language_loss": 0.79863012, "learning_rate": 2.1385459055773727e-06, "loss": 0.82079482, "num_input_tokens_seen": 88335095, "step": 4105, "time_per_iteration": 2.6147818565368652 }, { "auxiliary_loss_clip": 0.01156758, "auxiliary_loss_mlp": 0.01121864, "balance_loss_clip": 0.89529216, "balance_loss_mlp": 0.0, "epoch": 0.4937173089641075, "flos": 64479258840960.0, "grad_norm": 3.102778973575557, "language_loss": 0.73973465, "learning_rate": 2.137768794730696e-06, "loss": 0.76252091, "num_input_tokens_seen": 88358545, "step": 4106, "time_per_iteration": 3.9596378803253174 }, { "auxiliary_loss_clip": 0.01181521, "auxiliary_loss_mlp": 0.01031945, "balance_loss_clip": 0.97765517, "balance_loss_mlp": 1.02380347, "epoch": 0.4938375518547466, "flos": 22346025644160.0, "grad_norm": 2.477954275315522, "language_loss": 0.80404711, "learning_rate": 2.1369916629846026e-06, "loss": 0.82618171, "num_input_tokens_seen": 88378295, "step": 4107, "time_per_iteration": 2.675793409347534 }, { "auxiliary_loss_clip": 0.01177086, "auxiliary_loss_mlp": 0.01025058, "balance_loss_clip": 0.97423786, "balance_loss_mlp": 1.01695228, "epoch": 0.4939577947453857, "flos": 17858700299520.0, "grad_norm": 1.8575167206275922, "language_loss": 0.75098264, "learning_rate": 2.136214510456983e-06, "loss": 0.77300406, "num_input_tokens_seen": 88396750, "step": 4108, "time_per_iteration": 2.6192283630371094 }, { "auxiliary_loss_clip": 0.01092657, "auxiliary_loss_mlp": 0.01116997, "balance_loss_clip": 0.87042785, "balance_loss_mlp": 0.0, "epoch": 0.49407803763602476, "flos": 70066746875520.0, "grad_norm": 0.8912672425675533, "language_loss": 0.63150692, "learning_rate": 2.1354373372657296e-06, "loss": 0.65360343, "num_input_tokens_seen": 88455190, "step": 4109, "time_per_iteration": 4.307246923446655 }, { "auxiliary_loss_clip": 0.01177445, "auxiliary_loss_mlp": 0.01033076, "balance_loss_clip": 1.05164099, "balance_loss_mlp": 1.02566099, "epoch": 0.49419828052666387, "flos": 24317485562880.0, "grad_norm": 1.5711564710819106, "language_loss": 0.71032643, "learning_rate": 2.1346601435287404e-06, "loss": 0.73243165, "num_input_tokens_seen": 88477460, "step": 4110, "time_per_iteration": 2.8226418495178223 }, { "auxiliary_loss_clip": 0.01176544, "auxiliary_loss_mlp": 0.01027581, "balance_loss_clip": 0.97497636, "balance_loss_mlp": 1.02011538, "epoch": 0.494318523417303, "flos": 29386060790400.0, "grad_norm": 1.7912982564051783, "language_loss": 0.80378288, "learning_rate": 2.1338829293639144e-06, "loss": 0.82582414, "num_input_tokens_seen": 88497820, "step": 4111, "time_per_iteration": 2.718397855758667 }, { "auxiliary_loss_clip": 0.0117505, "auxiliary_loss_mlp": 0.0103264, "balance_loss_clip": 0.89813274, "balance_loss_mlp": 1.02508783, "epoch": 0.49443876630794203, "flos": 15268284195840.0, "grad_norm": 2.489597605711961, "language_loss": 0.82763702, "learning_rate": 2.1331056948891547e-06, "loss": 0.84971392, "num_input_tokens_seen": 88514920, "step": 4112, "time_per_iteration": 2.7061989307403564 }, { "auxiliary_loss_clip": 0.01173582, "auxiliary_loss_mlp": 0.01028555, "balance_loss_clip": 0.97436982, "balance_loss_mlp": 1.02078283, "epoch": 0.49455900919858115, "flos": 12347453859840.0, "grad_norm": 2.160508054653871, "language_loss": 0.76316309, "learning_rate": 2.1323284402223666e-06, "loss": 0.78518444, "num_input_tokens_seen": 88530910, "step": 4113, "time_per_iteration": 2.624565362930298 }, { "auxiliary_loss_clip": 0.01178881, "auxiliary_loss_mlp": 0.01121831, "balance_loss_clip": 1.05534708, "balance_loss_mlp": 0.0, "epoch": 0.4946792520892202, "flos": 22779610715520.0, "grad_norm": 1.8352243677991764, "language_loss": 0.88424087, "learning_rate": 2.1315511654814597e-06, "loss": 0.90724802, "num_input_tokens_seen": 88549320, "step": 4114, "time_per_iteration": 2.724445343017578 }, { "auxiliary_loss_clip": 0.01166195, "auxiliary_loss_mlp": 0.01025258, "balance_loss_clip": 0.97352552, "balance_loss_mlp": 1.01844764, "epoch": 0.4947994949798593, "flos": 23148126299520.0, "grad_norm": 1.7693289045704352, "language_loss": 0.78110313, "learning_rate": 2.1307738707843456e-06, "loss": 0.80301762, "num_input_tokens_seen": 88568985, "step": 4115, "time_per_iteration": 2.7594215869903564 }, { "auxiliary_loss_clip": 0.01185789, "auxiliary_loss_mlp": 0.01029244, "balance_loss_clip": 1.01631117, "balance_loss_mlp": 1.02094698, "epoch": 0.4949197378704984, "flos": 23659997063040.0, "grad_norm": 3.320263752628049, "language_loss": 0.68776071, "learning_rate": 2.1299965562489385e-06, "loss": 0.70991099, "num_input_tokens_seen": 88588790, "step": 4116, "time_per_iteration": 2.72031831741333 }, { "auxiliary_loss_clip": 0.01173664, "auxiliary_loss_mlp": 0.01028329, "balance_loss_clip": 1.01037049, "balance_loss_mlp": 1.02119994, "epoch": 0.4950399807611375, "flos": 26911493026560.0, "grad_norm": 1.3508810162305884, "language_loss": 0.79053283, "learning_rate": 2.129219221993158e-06, "loss": 0.81255275, "num_input_tokens_seen": 88613575, "step": 4117, "time_per_iteration": 2.763679265975952 }, { "auxiliary_loss_clip": 0.01086835, "auxiliary_loss_mlp": 0.01005708, "balance_loss_clip": 0.90932262, "balance_loss_mlp": 1.00368118, "epoch": 0.4951602236517766, "flos": 67315270187520.0, "grad_norm": 0.866631903298788, "language_loss": 0.59953338, "learning_rate": 2.128441868134924e-06, "loss": 0.62045884, "num_input_tokens_seen": 88675510, "step": 4118, "time_per_iteration": 3.306649684906006 }, { "auxiliary_loss_clip": 0.01176606, "auxiliary_loss_mlp": 0.01024851, "balance_loss_clip": 0.93433213, "balance_loss_mlp": 1.01761472, "epoch": 0.4952804665424157, "flos": 19901442758400.0, "grad_norm": 2.090862946220142, "language_loss": 0.8245014, "learning_rate": 2.1276644947921606e-06, "loss": 0.84651601, "num_input_tokens_seen": 88694425, "step": 4119, "time_per_iteration": 2.727210760116577 }, { "auxiliary_loss_clip": 0.01175998, "auxiliary_loss_mlp": 0.01030008, "balance_loss_clip": 1.0124203, "balance_loss_mlp": 1.02196097, "epoch": 0.49540070943305475, "flos": 18806813740800.0, "grad_norm": 1.6905505991557361, "language_loss": 0.82440054, "learning_rate": 2.126887102082795e-06, "loss": 0.84646058, "num_input_tokens_seen": 88714450, "step": 4120, "time_per_iteration": 2.64514422416687 }, { "auxiliary_loss_clip": 0.01172818, "auxiliary_loss_mlp": 0.01026844, "balance_loss_clip": 0.93471885, "balance_loss_mlp": 1.01910734, "epoch": 0.49552095232369386, "flos": 24934179191040.0, "grad_norm": 1.658544030487855, "language_loss": 0.70528156, "learning_rate": 2.126109690124757e-06, "loss": 0.72727823, "num_input_tokens_seen": 88735265, "step": 4121, "time_per_iteration": 2.769702434539795 }, { "auxiliary_loss_clip": 0.01174291, "auxiliary_loss_mlp": 0.01023652, "balance_loss_clip": 0.89810348, "balance_loss_mlp": 1.01594198, "epoch": 0.495641195214333, "flos": 22857249962880.0, "grad_norm": 1.5003297946590433, "language_loss": 0.710199, "learning_rate": 2.1253322590359786e-06, "loss": 0.73217845, "num_input_tokens_seen": 88754600, "step": 4122, "time_per_iteration": 2.7332167625427246 }, { "auxiliary_loss_clip": 0.01174098, "auxiliary_loss_mlp": 0.01025569, "balance_loss_clip": 1.01177526, "balance_loss_mlp": 1.01790369, "epoch": 0.49576143810497203, "flos": 25769748343680.0, "grad_norm": 1.591231679391238, "language_loss": 0.74111497, "learning_rate": 2.124554808934397e-06, "loss": 0.76311159, "num_input_tokens_seen": 88775180, "step": 4123, "time_per_iteration": 3.550297975540161 }, { "auxiliary_loss_clip": 0.01163969, "auxiliary_loss_mlp": 0.01029615, "balance_loss_clip": 0.89402604, "balance_loss_mlp": 1.02145505, "epoch": 0.49588168099561114, "flos": 22128838058880.0, "grad_norm": 1.6368246149786154, "language_loss": 0.72897792, "learning_rate": 2.1237773399379496e-06, "loss": 0.75091374, "num_input_tokens_seen": 88796145, "step": 4124, "time_per_iteration": 2.7844996452331543 }, { "auxiliary_loss_clip": 0.01179179, "auxiliary_loss_mlp": 0.01032146, "balance_loss_clip": 0.9712503, "balance_loss_mlp": 1.02437973, "epoch": 0.49600192388625025, "flos": 24387331559040.0, "grad_norm": 1.6302873124588853, "language_loss": 0.87081981, "learning_rate": 2.122999852164578e-06, "loss": 0.89293301, "num_input_tokens_seen": 88816765, "step": 4125, "time_per_iteration": 2.6788792610168457 }, { "auxiliary_loss_clip": 0.01169873, "auxiliary_loss_mlp": 0.01028043, "balance_loss_clip": 0.89771974, "balance_loss_mlp": 1.02003241, "epoch": 0.4961221667768893, "flos": 22857429530880.0, "grad_norm": 1.964139384514205, "language_loss": 0.58118278, "learning_rate": 2.122222345732227e-06, "loss": 0.60316193, "num_input_tokens_seen": 88836680, "step": 4126, "time_per_iteration": 3.7047109603881836 }, { "auxiliary_loss_clip": 0.01175458, "auxiliary_loss_mlp": 0.01021983, "balance_loss_clip": 0.93446672, "balance_loss_mlp": 1.01435935, "epoch": 0.4962424096675284, "flos": 17858089768320.0, "grad_norm": 1.9984236521948644, "language_loss": 0.82596302, "learning_rate": 2.121444820758843e-06, "loss": 0.84793741, "num_input_tokens_seen": 88855320, "step": 4127, "time_per_iteration": 2.6650378704071045 }, { "auxiliary_loss_clip": 0.01178628, "auxiliary_loss_mlp": 0.01027957, "balance_loss_clip": 0.90091145, "balance_loss_mlp": 1.01968384, "epoch": 0.49636265255816747, "flos": 21793611404160.0, "grad_norm": 2.3617349655748785, "language_loss": 0.79222119, "learning_rate": 2.120667277362376e-06, "loss": 0.81428707, "num_input_tokens_seen": 88874035, "step": 4128, "time_per_iteration": 2.737642526626587 }, { "auxiliary_loss_clip": 0.0118641, "auxiliary_loss_mlp": 0.01034924, "balance_loss_clip": 1.05686617, "balance_loss_mlp": 1.02680027, "epoch": 0.4964828954488066, "flos": 16358603581440.0, "grad_norm": 3.153422930863971, "language_loss": 0.84756196, "learning_rate": 2.1198897156607796e-06, "loss": 0.8697753, "num_input_tokens_seen": 88891390, "step": 4129, "time_per_iteration": 2.6156067848205566 }, { "auxiliary_loss_clip": 0.01185189, "auxiliary_loss_mlp": 0.0102654, "balance_loss_clip": 1.01386976, "balance_loss_mlp": 1.01825511, "epoch": 0.4966031383394457, "flos": 24711101775360.0, "grad_norm": 2.366339799250279, "language_loss": 0.73675108, "learning_rate": 2.1191121357720085e-06, "loss": 0.75886834, "num_input_tokens_seen": 88909450, "step": 4130, "time_per_iteration": 2.734835386276245 }, { "auxiliary_loss_clip": 0.01167131, "auxiliary_loss_mlp": 0.0102764, "balance_loss_clip": 0.89684379, "balance_loss_mlp": 1.01983738, "epoch": 0.49672338123008475, "flos": 22930615491840.0, "grad_norm": 1.8270820326590975, "language_loss": 0.74702102, "learning_rate": 2.1183345378140206e-06, "loss": 0.7689687, "num_input_tokens_seen": 88929195, "step": 4131, "time_per_iteration": 3.6757805347442627 }, { "auxiliary_loss_clip": 0.01084972, "auxiliary_loss_mlp": 0.01002203, "balance_loss_clip": 0.98467147, "balance_loss_mlp": 1.00014043, "epoch": 0.49684362412072386, "flos": 65976736844160.0, "grad_norm": 0.8544144578041398, "language_loss": 0.6198045, "learning_rate": 2.1175569219047783e-06, "loss": 0.64067626, "num_input_tokens_seen": 88990635, "step": 4132, "time_per_iteration": 3.356351375579834 }, { "auxiliary_loss_clip": 0.01180314, "auxiliary_loss_mlp": 0.01027125, "balance_loss_clip": 1.05203652, "balance_loss_mlp": 1.01966274, "epoch": 0.49696386701136297, "flos": 19971288754560.0, "grad_norm": 2.0620733305414958, "language_loss": 0.73813343, "learning_rate": 2.1167792881622437e-06, "loss": 0.76020789, "num_input_tokens_seen": 89009655, "step": 4133, "time_per_iteration": 2.6012325286865234 }, { "auxiliary_loss_clip": 0.01173633, "auxiliary_loss_mlp": 0.01032219, "balance_loss_clip": 0.97658122, "balance_loss_mlp": 1.02427673, "epoch": 0.497084109902002, "flos": 24750819239040.0, "grad_norm": 1.4903868010492622, "language_loss": 0.80808365, "learning_rate": 2.116001636704384e-06, "loss": 0.83014214, "num_input_tokens_seen": 89030040, "step": 4134, "time_per_iteration": 2.708038091659546 }, { "auxiliary_loss_clip": 0.01185521, "auxiliary_loss_mlp": 0.01029935, "balance_loss_clip": 0.89929938, "balance_loss_mlp": 1.02156115, "epoch": 0.49720435279264114, "flos": 21871825269120.0, "grad_norm": 1.7278296760608853, "language_loss": 0.80106473, "learning_rate": 2.1152239676491685e-06, "loss": 0.8232193, "num_input_tokens_seen": 89048145, "step": 4135, "time_per_iteration": 3.6717405319213867 }, { "auxiliary_loss_clip": 0.0118099, "auxiliary_loss_mlp": 0.0103419, "balance_loss_clip": 0.97386467, "balance_loss_mlp": 1.0264647, "epoch": 0.49732459568328025, "flos": 23805794367360.0, "grad_norm": 1.6727427629864602, "language_loss": 0.73610693, "learning_rate": 2.114446281114569e-06, "loss": 0.7582587, "num_input_tokens_seen": 89067165, "step": 4136, "time_per_iteration": 2.7594754695892334 }, { "auxiliary_loss_clip": 0.01166935, "auxiliary_loss_mlp": 0.01028886, "balance_loss_clip": 0.97514224, "balance_loss_mlp": 1.02157545, "epoch": 0.4974448385739193, "flos": 20047742853120.0, "grad_norm": 2.736630483390725, "language_loss": 0.76094216, "learning_rate": 2.1136685772185587e-06, "loss": 0.78290033, "num_input_tokens_seen": 89086190, "step": 4137, "time_per_iteration": 2.6470091342926025 }, { "auxiliary_loss_clip": 0.01174218, "auxiliary_loss_mlp": 0.01123436, "balance_loss_clip": 0.97035813, "balance_loss_mlp": 0.0, "epoch": 0.4975650814645584, "flos": 24821347593600.0, "grad_norm": 1.5936278619507065, "language_loss": 0.77904129, "learning_rate": 2.1128908560791163e-06, "loss": 0.80201781, "num_input_tokens_seen": 89106020, "step": 4138, "time_per_iteration": 2.71950626373291 }, { "auxiliary_loss_clip": 0.01181453, "auxiliary_loss_mlp": 0.01025847, "balance_loss_clip": 1.05300319, "balance_loss_mlp": 1.01785362, "epoch": 0.4976853243551975, "flos": 19829477859840.0, "grad_norm": 1.8714139577679978, "language_loss": 0.78035223, "learning_rate": 2.1121131178142203e-06, "loss": 0.80242527, "num_input_tokens_seen": 89125385, "step": 4139, "time_per_iteration": 2.6059887409210205 }, { "auxiliary_loss_clip": 0.01174191, "auxiliary_loss_mlp": 0.01030091, "balance_loss_clip": 0.97248936, "balance_loss_mlp": 1.02270555, "epoch": 0.4978055672458366, "flos": 23142990654720.0, "grad_norm": 1.4994643496197477, "language_loss": 0.82391608, "learning_rate": 2.1113353625418544e-06, "loss": 0.84595883, "num_input_tokens_seen": 89143935, "step": 4140, "time_per_iteration": 2.681539535522461 }, { "auxiliary_loss_clip": 0.01174093, "auxiliary_loss_mlp": 0.01026945, "balance_loss_clip": 1.01733804, "balance_loss_mlp": 1.02029967, "epoch": 0.4979258101364757, "flos": 15559914718080.0, "grad_norm": 1.590520664190833, "language_loss": 0.78786302, "learning_rate": 2.1105575903800017e-06, "loss": 0.8098734, "num_input_tokens_seen": 89162655, "step": 4141, "time_per_iteration": 2.6236910820007324 }, { "auxiliary_loss_clip": 0.01183345, "auxiliary_loss_mlp": 0.0102889, "balance_loss_clip": 1.01325536, "balance_loss_mlp": 1.02143943, "epoch": 0.4980460530271148, "flos": 26356169784960.0, "grad_norm": 2.946087895218686, "language_loss": 0.84859371, "learning_rate": 2.1097798014466502e-06, "loss": 0.87071598, "num_input_tokens_seen": 89182255, "step": 4142, "time_per_iteration": 2.7480289936065674 }, { "auxiliary_loss_clip": 0.01184676, "auxiliary_loss_mlp": 0.01027206, "balance_loss_clip": 1.0141207, "balance_loss_mlp": 1.01918888, "epoch": 0.49816629591775385, "flos": 17274541415040.0, "grad_norm": 2.162986611575255, "language_loss": 0.58927363, "learning_rate": 2.109001995859791e-06, "loss": 0.61139244, "num_input_tokens_seen": 89201155, "step": 4143, "time_per_iteration": 2.8296260833740234 }, { "auxiliary_loss_clip": 0.0108814, "auxiliary_loss_mlp": 0.01000261, "balance_loss_clip": 0.95115596, "balance_loss_mlp": 0.99831784, "epoch": 0.49828653880839296, "flos": 64930947344640.0, "grad_norm": 0.8027607154715206, "language_loss": 0.60099459, "learning_rate": 2.108224173737415e-06, "loss": 0.62187862, "num_input_tokens_seen": 89264455, "step": 4144, "time_per_iteration": 3.3551766872406006 }, { "auxiliary_loss_clip": 0.01175773, "auxiliary_loss_mlp": 0.01026292, "balance_loss_clip": 0.9731102, "balance_loss_mlp": 1.01815617, "epoch": 0.498406781699032, "flos": 27484806003840.0, "grad_norm": 1.777114925943344, "language_loss": 0.76027095, "learning_rate": 2.1074463351975183e-06, "loss": 0.78229165, "num_input_tokens_seen": 89283340, "step": 4145, "time_per_iteration": 2.8067715167999268 }, { "auxiliary_loss_clip": 0.01182128, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 0.93776023, "balance_loss_mlp": 1.02623892, "epoch": 0.49852702458967113, "flos": 31499870307840.0, "grad_norm": 1.7249287264655002, "language_loss": 0.71657312, "learning_rate": 2.106668480358098e-06, "loss": 0.73873043, "num_input_tokens_seen": 89303565, "step": 4146, "time_per_iteration": 2.877586603164673 }, { "auxiliary_loss_clip": 0.01184876, "auxiliary_loss_mlp": 0.01027016, "balance_loss_clip": 0.93450773, "balance_loss_mlp": 1.01869535, "epoch": 0.49864726748031024, "flos": 22852868503680.0, "grad_norm": 1.659136541719576, "language_loss": 0.7065028, "learning_rate": 2.105890609337154e-06, "loss": 0.72862172, "num_input_tokens_seen": 89322080, "step": 4147, "time_per_iteration": 2.9336891174316406 }, { "auxiliary_loss_clip": 0.01085343, "auxiliary_loss_mlp": 0.01001204, "balance_loss_clip": 1.02570462, "balance_loss_mlp": 0.99927324, "epoch": 0.4987675103709493, "flos": 70405708544640.0, "grad_norm": 0.6895151292941333, "language_loss": 0.63869095, "learning_rate": 2.1051127222526883e-06, "loss": 0.65955639, "num_input_tokens_seen": 89394195, "step": 4148, "time_per_iteration": 3.3457283973693848 }, { "auxiliary_loss_clip": 0.01177636, "auxiliary_loss_mlp": 0.01031345, "balance_loss_clip": 1.01595902, "balance_loss_mlp": 1.02410352, "epoch": 0.4988877532615884, "flos": 28767571482240.0, "grad_norm": 1.6108447533493309, "language_loss": 0.80732244, "learning_rate": 2.1043348192227067e-06, "loss": 0.82941222, "num_input_tokens_seen": 89414565, "step": 4149, "time_per_iteration": 3.640289306640625 }, { "auxiliary_loss_clip": 0.01168373, "auxiliary_loss_mlp": 0.01029084, "balance_loss_clip": 0.93879485, "balance_loss_mlp": 1.02144277, "epoch": 0.4990079961522275, "flos": 16872700988160.0, "grad_norm": 1.8034165653291814, "language_loss": 0.61425757, "learning_rate": 2.1035569003652156e-06, "loss": 0.63623214, "num_input_tokens_seen": 89433195, "step": 4150, "time_per_iteration": 2.684704065322876 }, { "auxiliary_loss_clip": 0.01173987, "auxiliary_loss_mlp": 0.0103605, "balance_loss_clip": 0.8994481, "balance_loss_mlp": 1.02725804, "epoch": 0.4991282390428666, "flos": 13291042187520.0, "grad_norm": 2.052183297157016, "language_loss": 0.81708521, "learning_rate": 2.1027789657982255e-06, "loss": 0.8391856, "num_input_tokens_seen": 89447410, "step": 4151, "time_per_iteration": 2.675917387008667 }, { "auxiliary_loss_clip": 0.01177809, "auxiliary_loss_mlp": 0.01029345, "balance_loss_clip": 0.89987946, "balance_loss_mlp": 1.02129829, "epoch": 0.4992484819335057, "flos": 21537496454400.0, "grad_norm": 1.8245104425070762, "language_loss": 0.76967496, "learning_rate": 2.1020010156397482e-06, "loss": 0.7917465, "num_input_tokens_seen": 89464630, "step": 4152, "time_per_iteration": 3.599838972091675 }, { "auxiliary_loss_clip": 0.01182984, "auxiliary_loss_mlp": 0.01022374, "balance_loss_clip": 1.01556373, "balance_loss_mlp": 1.01480389, "epoch": 0.4993687248241448, "flos": 24860095390080.0, "grad_norm": 1.4835468685335813, "language_loss": 0.77452266, "learning_rate": 2.101223050007797e-06, "loss": 0.79657626, "num_input_tokens_seen": 89483180, "step": 4153, "time_per_iteration": 2.7072298526763916 }, { "auxiliary_loss_clip": 0.01083764, "auxiliary_loss_mlp": 0.01001145, "balance_loss_clip": 1.02441645, "balance_loss_mlp": 0.99915427, "epoch": 0.49948896771478385, "flos": 62941602453120.0, "grad_norm": 0.8231680661438822, "language_loss": 0.53888184, "learning_rate": 2.1004450690203904e-06, "loss": 0.55973095, "num_input_tokens_seen": 89539260, "step": 4154, "time_per_iteration": 3.253556966781616 }, { "auxiliary_loss_clip": 0.01083612, "auxiliary_loss_mlp": 0.01002592, "balance_loss_clip": 1.02435064, "balance_loss_mlp": 1.00058961, "epoch": 0.49960921060542296, "flos": 68284213516800.0, "grad_norm": 0.857341537884657, "language_loss": 0.63326102, "learning_rate": 2.099667072795546e-06, "loss": 0.65412307, "num_input_tokens_seen": 89601380, "step": 4155, "time_per_iteration": 3.2035772800445557 }, { "auxiliary_loss_clip": 0.01178943, "auxiliary_loss_mlp": 0.01029732, "balance_loss_clip": 1.01463509, "balance_loss_mlp": 1.02163732, "epoch": 0.49972945349606207, "flos": 23659350618240.0, "grad_norm": 2.409042995934821, "language_loss": 0.79913974, "learning_rate": 2.0988890614512864e-06, "loss": 0.82122648, "num_input_tokens_seen": 89621270, "step": 4156, "time_per_iteration": 2.686555862426758 }, { "auxiliary_loss_clip": 0.01179369, "auxiliary_loss_mlp": 0.01028191, "balance_loss_clip": 0.97653866, "balance_loss_mlp": 1.02073479, "epoch": 0.4998496963867011, "flos": 19755825022080.0, "grad_norm": 1.603114029757432, "language_loss": 0.84259087, "learning_rate": 2.098111035105635e-06, "loss": 0.86466646, "num_input_tokens_seen": 89639695, "step": 4157, "time_per_iteration": 2.707064151763916 }, { "auxiliary_loss_clip": 0.01178133, "auxiliary_loss_mlp": 0.01036279, "balance_loss_clip": 0.90315354, "balance_loss_mlp": 1.02807105, "epoch": 0.49996993927734024, "flos": 22265728790400.0, "grad_norm": 1.6197803460535811, "language_loss": 0.73035789, "learning_rate": 2.0973329938766176e-06, "loss": 0.75250196, "num_input_tokens_seen": 89657125, "step": 4158, "time_per_iteration": 3.7285141944885254 }, { "auxiliary_loss_clip": 0.01185597, "auxiliary_loss_mlp": 0.0102783, "balance_loss_clip": 1.01465821, "balance_loss_mlp": 1.01916325, "epoch": 0.5000901821679793, "flos": 23327212533120.0, "grad_norm": 1.9052707850249286, "language_loss": 0.78645551, "learning_rate": 2.0965549378822618e-06, "loss": 0.80858976, "num_input_tokens_seen": 89678415, "step": 4159, "time_per_iteration": 2.679306745529175 }, { "auxiliary_loss_clip": 0.01169141, "auxiliary_loss_mlp": 0.01028138, "balance_loss_clip": 0.78345686, "balance_loss_mlp": 1.01969218, "epoch": 0.5002104250586185, "flos": 20339014239360.0, "grad_norm": 2.0054229204532077, "language_loss": 0.8395108, "learning_rate": 2.095776867240599e-06, "loss": 0.86148357, "num_input_tokens_seen": 89695405, "step": 4160, "time_per_iteration": 3.764838457107544 }, { "auxiliary_loss_clip": 0.01171683, "auxiliary_loss_mlp": 0.0102358, "balance_loss_clip": 0.93722844, "balance_loss_mlp": 1.01586151, "epoch": 0.5003306679492575, "flos": 13991372634240.0, "grad_norm": 1.8717355574804089, "language_loss": 0.82461154, "learning_rate": 2.094998782069661e-06, "loss": 0.84656417, "num_input_tokens_seen": 89713110, "step": 4161, "time_per_iteration": 2.6967508792877197 }, { "auxiliary_loss_clip": 0.01180188, "auxiliary_loss_mlp": 0.01030592, "balance_loss_clip": 1.05259633, "balance_loss_mlp": 1.02279532, "epoch": 0.5004509108398966, "flos": 27672762896640.0, "grad_norm": 1.5889690120306155, "language_loss": 0.75460327, "learning_rate": 2.0942206824874845e-06, "loss": 0.77671105, "num_input_tokens_seen": 89735885, "step": 4162, "time_per_iteration": 2.657923936843872 }, { "auxiliary_loss_clip": 0.01182196, "auxiliary_loss_mlp": 0.01031941, "balance_loss_clip": 1.01696825, "balance_loss_mlp": 1.02367413, "epoch": 0.5005711537305357, "flos": 14976186796800.0, "grad_norm": 2.179337297228833, "language_loss": 0.79195422, "learning_rate": 2.093442568612105e-06, "loss": 0.81409562, "num_input_tokens_seen": 89753690, "step": 4163, "time_per_iteration": 2.6673030853271484 }, { "auxiliary_loss_clip": 0.01179894, "auxiliary_loss_mlp": 0.01028328, "balance_loss_clip": 1.05079842, "balance_loss_mlp": 1.02023983, "epoch": 0.5006913966211748, "flos": 26503259978880.0, "grad_norm": 1.4773446357934266, "language_loss": 0.8483063, "learning_rate": 2.0926644405615613e-06, "loss": 0.87038845, "num_input_tokens_seen": 89774590, "step": 4164, "time_per_iteration": 2.659348249435425 }, { "auxiliary_loss_clip": 0.01173213, "auxiliary_loss_mlp": 0.01028272, "balance_loss_clip": 0.93713903, "balance_loss_mlp": 1.02079773, "epoch": 0.5008116395118138, "flos": 20449295971200.0, "grad_norm": 2.0588243830436492, "language_loss": 0.80823225, "learning_rate": 2.091886298453897e-06, "loss": 0.8302471, "num_input_tokens_seen": 89792775, "step": 4165, "time_per_iteration": 2.7093605995178223 }, { "auxiliary_loss_clip": 0.01176224, "auxiliary_loss_mlp": 0.01027012, "balance_loss_clip": 1.01244164, "balance_loss_mlp": 1.01973403, "epoch": 0.500931882402453, "flos": 21579871524480.0, "grad_norm": 1.7675224187009757, "language_loss": 0.72834283, "learning_rate": 2.091108142407153e-06, "loss": 0.75037515, "num_input_tokens_seen": 89811515, "step": 4166, "time_per_iteration": 2.6836512088775635 }, { "auxiliary_loss_clip": 0.01095361, "auxiliary_loss_mlp": 0.01002612, "balance_loss_clip": 0.96010602, "balance_loss_mlp": 1.0006808, "epoch": 0.5010521252930921, "flos": 57785011925760.0, "grad_norm": 0.8360404100140526, "language_loss": 0.62440765, "learning_rate": 2.090329972539377e-06, "loss": 0.64538729, "num_input_tokens_seen": 89870080, "step": 4167, "time_per_iteration": 3.2881152629852295 }, { "auxiliary_loss_clip": 0.01164403, "auxiliary_loss_mlp": 0.0103252, "balance_loss_clip": 0.78423238, "balance_loss_mlp": 1.0243721, "epoch": 0.5011723681837311, "flos": 18625500864000.0, "grad_norm": 2.0653932254706326, "language_loss": 0.68314403, "learning_rate": 2.089551788968616e-06, "loss": 0.70511323, "num_input_tokens_seen": 89888045, "step": 4168, "time_per_iteration": 3.1311938762664795 }, { "auxiliary_loss_clip": 0.010827, "auxiliary_loss_mlp": 0.01002833, "balance_loss_clip": 1.0236938, "balance_loss_mlp": 1.00097311, "epoch": 0.5012926110743702, "flos": 55883146608000.0, "grad_norm": 0.8374719568843323, "language_loss": 0.60846627, "learning_rate": 2.08877359181292e-06, "loss": 0.62932158, "num_input_tokens_seen": 89944610, "step": 4169, "time_per_iteration": 3.3895885944366455 }, { "auxiliary_loss_clip": 0.01177969, "auxiliary_loss_mlp": 0.01026893, "balance_loss_clip": 0.93364954, "balance_loss_mlp": 1.01957917, "epoch": 0.5014128539650093, "flos": 24238266117120.0, "grad_norm": 2.098296424743388, "language_loss": 0.85448253, "learning_rate": 2.0879953811903396e-06, "loss": 0.87653112, "num_input_tokens_seen": 89959495, "step": 4170, "time_per_iteration": 2.7412009239196777 }, { "auxiliary_loss_clip": 0.01178664, "auxiliary_loss_mlp": 0.01030619, "balance_loss_clip": 1.01502967, "balance_loss_mlp": 1.023067, "epoch": 0.5015330968556484, "flos": 27527468382720.0, "grad_norm": 1.6898086758072657, "language_loss": 0.78668922, "learning_rate": 2.08721715721893e-06, "loss": 0.8087821, "num_input_tokens_seen": 89978820, "step": 4171, "time_per_iteration": 2.7184863090515137 }, { "auxiliary_loss_clip": 0.01177301, "auxiliary_loss_mlp": 0.01029538, "balance_loss_clip": 1.01385522, "balance_loss_mlp": 1.02231944, "epoch": 0.5016533397462875, "flos": 23800802376960.0, "grad_norm": 1.7705394357459847, "language_loss": 0.76861084, "learning_rate": 2.0864389200167477e-06, "loss": 0.79067928, "num_input_tokens_seen": 89997075, "step": 4172, "time_per_iteration": 2.6860299110412598 }, { "auxiliary_loss_clip": 0.01181287, "auxiliary_loss_mlp": 0.01122611, "balance_loss_clip": 1.01413131, "balance_loss_mlp": 0.0, "epoch": 0.5017735826369266, "flos": 25295009264640.0, "grad_norm": 1.6715264868848558, "language_loss": 0.78977191, "learning_rate": 2.0856606697018504e-06, "loss": 0.8128109, "num_input_tokens_seen": 90015085, "step": 4173, "time_per_iteration": 2.6752169132232666 }, { "auxiliary_loss_clip": 0.01172089, "auxiliary_loss_mlp": 0.01032844, "balance_loss_clip": 0.97245711, "balance_loss_mlp": 1.02420759, "epoch": 0.5018938255275657, "flos": 16873203778560.0, "grad_norm": 2.374910899513716, "language_loss": 0.73421526, "learning_rate": 2.084882406392297e-06, "loss": 0.75626457, "num_input_tokens_seen": 90033045, "step": 4174, "time_per_iteration": 3.049746513366699 }, { "auxiliary_loss_clip": 0.01179963, "auxiliary_loss_mlp": 0.01024309, "balance_loss_clip": 1.01475883, "balance_loss_mlp": 1.01701355, "epoch": 0.5020140684182047, "flos": 25515429073920.0, "grad_norm": 2.8360263220445856, "language_loss": 0.7138775, "learning_rate": 2.0841041302061496e-06, "loss": 0.73592025, "num_input_tokens_seen": 90052505, "step": 4175, "time_per_iteration": 3.9099459648132324 }, { "auxiliary_loss_clip": 0.01163584, "auxiliary_loss_mlp": 0.01025646, "balance_loss_clip": 0.97099233, "balance_loss_mlp": 1.01793635, "epoch": 0.5021343113088439, "flos": 23659278791040.0, "grad_norm": 1.9423001157915287, "language_loss": 0.75616181, "learning_rate": 2.083325841261473e-06, "loss": 0.77805412, "num_input_tokens_seen": 90071565, "step": 4176, "time_per_iteration": 2.683242082595825 }, { "auxiliary_loss_clip": 0.01169499, "auxiliary_loss_mlp": 0.01024527, "balance_loss_clip": 0.9727962, "balance_loss_mlp": 1.01718915, "epoch": 0.502254554199483, "flos": 24534673148160.0, "grad_norm": 1.761747108545789, "language_loss": 0.66134608, "learning_rate": 2.0825475396763322e-06, "loss": 0.68328637, "num_input_tokens_seen": 90092215, "step": 4177, "time_per_iteration": 2.715430974960327 }, { "auxiliary_loss_clip": 0.01159821, "auxiliary_loss_mlp": 0.01030417, "balance_loss_clip": 0.77882022, "balance_loss_mlp": 1.02283275, "epoch": 0.502374797090122, "flos": 34240285607040.0, "grad_norm": 1.3125368560704926, "language_loss": 0.65507579, "learning_rate": 2.081769225568796e-06, "loss": 0.67697811, "num_input_tokens_seen": 90114665, "step": 4178, "time_per_iteration": 4.00312876701355 }, { "auxiliary_loss_clip": 0.01177886, "auxiliary_loss_mlp": 0.01027657, "balance_loss_clip": 1.01166165, "balance_loss_mlp": 1.02000415, "epoch": 0.5024950399807612, "flos": 26031106679040.0, "grad_norm": 1.5080421535909176, "language_loss": 0.75878388, "learning_rate": 2.0809908990569327e-06, "loss": 0.78083932, "num_input_tokens_seen": 90136445, "step": 4179, "time_per_iteration": 2.867166042327881 }, { "auxiliary_loss_clip": 0.01176992, "auxiliary_loss_mlp": 0.01026001, "balance_loss_clip": 0.97621781, "balance_loss_mlp": 1.0188123, "epoch": 0.5026152828714002, "flos": 21252438120960.0, "grad_norm": 1.8205431688874485, "language_loss": 0.7913574, "learning_rate": 2.0802125602588146e-06, "loss": 0.81338727, "num_input_tokens_seen": 90155710, "step": 4180, "time_per_iteration": 2.7435343265533447 }, { "auxiliary_loss_clip": 0.01179607, "auxiliary_loss_mlp": 0.01026271, "balance_loss_clip": 1.05073988, "balance_loss_mlp": 1.01861811, "epoch": 0.5027355257620393, "flos": 30956111245440.0, "grad_norm": 1.8586306881911783, "language_loss": 0.66765082, "learning_rate": 2.0794342092925146e-06, "loss": 0.68970954, "num_input_tokens_seen": 90176845, "step": 4181, "time_per_iteration": 2.6922383308410645 }, { "auxiliary_loss_clip": 0.01182519, "auxiliary_loss_mlp": 0.01032162, "balance_loss_clip": 1.01652288, "balance_loss_mlp": 1.02543592, "epoch": 0.5028557686526784, "flos": 24791147233920.0, "grad_norm": 1.8648414369170772, "language_loss": 0.68153703, "learning_rate": 2.078655846276108e-06, "loss": 0.70368385, "num_input_tokens_seen": 90197175, "step": 4182, "time_per_iteration": 2.6905221939086914 }, { "auxiliary_loss_clip": 0.01172043, "auxiliary_loss_mlp": 0.01034825, "balance_loss_clip": 0.97479582, "balance_loss_mlp": 1.02652764, "epoch": 0.5029760115433175, "flos": 22966992990720.0, "grad_norm": 1.8078607702813247, "language_loss": 0.68296421, "learning_rate": 2.0778774713276727e-06, "loss": 0.70503289, "num_input_tokens_seen": 90216650, "step": 4183, "time_per_iteration": 2.7009339332580566 }, { "auxiliary_loss_clip": 0.01177003, "auxiliary_loss_mlp": 0.01026982, "balance_loss_clip": 1.01222849, "balance_loss_mlp": 1.01932311, "epoch": 0.5030962544339566, "flos": 15305164485120.0, "grad_norm": 2.3231938282104507, "language_loss": 0.67267108, "learning_rate": 2.077099084565287e-06, "loss": 0.69471091, "num_input_tokens_seen": 90234055, "step": 4184, "time_per_iteration": 3.9156887531280518 }, { "auxiliary_loss_clip": 0.01170734, "auxiliary_loss_mlp": 0.01022966, "balance_loss_clip": 0.97211099, "balance_loss_mlp": 1.01488936, "epoch": 0.5032164973245957, "flos": 24494847943680.0, "grad_norm": 2.1966714550577158, "language_loss": 0.65744889, "learning_rate": 2.0763206861070313e-06, "loss": 0.6793859, "num_input_tokens_seen": 90253115, "step": 4185, "time_per_iteration": 2.8155155181884766 }, { "auxiliary_loss_clip": 0.0117968, "auxiliary_loss_mlp": 0.01027631, "balance_loss_clip": 1.05188155, "balance_loss_mlp": 1.01994181, "epoch": 0.5033367402152348, "flos": 16213452721920.0, "grad_norm": 1.9289562906507063, "language_loss": 0.75209606, "learning_rate": 2.0755422760709876e-06, "loss": 0.77416921, "num_input_tokens_seen": 90270515, "step": 4186, "time_per_iteration": 3.549952268600464 }, { "auxiliary_loss_clip": 0.01166716, "auxiliary_loss_mlp": 0.0103136, "balance_loss_clip": 0.89604318, "balance_loss_mlp": 1.0232414, "epoch": 0.5034569831058738, "flos": 21391375927680.0, "grad_norm": 1.9334940521132746, "language_loss": 0.77344185, "learning_rate": 2.0747638545752417e-06, "loss": 0.79542267, "num_input_tokens_seen": 90289075, "step": 4187, "time_per_iteration": 2.7433714866638184 }, { "auxiliary_loss_clip": 0.01172786, "auxiliary_loss_mlp": 0.01036238, "balance_loss_clip": 0.97449839, "balance_loss_mlp": 1.0289005, "epoch": 0.503577225996513, "flos": 20558751690240.0, "grad_norm": 1.9017644740098345, "language_loss": 0.83367163, "learning_rate": 2.073985421737878e-06, "loss": 0.85576189, "num_input_tokens_seen": 90306385, "step": 4188, "time_per_iteration": 2.7334232330322266 }, { "auxiliary_loss_clip": 0.01180628, "auxiliary_loss_mlp": 0.01032525, "balance_loss_clip": 1.01410556, "balance_loss_mlp": 1.02429318, "epoch": 0.5036974688871521, "flos": 27229157930880.0, "grad_norm": 2.2851095292906334, "language_loss": 0.73605567, "learning_rate": 2.0732069776769844e-06, "loss": 0.75818723, "num_input_tokens_seen": 90323795, "step": 4189, "time_per_iteration": 2.6750152111053467 }, { "auxiliary_loss_clip": 0.01180473, "auxiliary_loss_mlp": 0.01027654, "balance_loss_clip": 1.05304384, "balance_loss_mlp": 1.01947069, "epoch": 0.5038177117777911, "flos": 20412164286720.0, "grad_norm": 2.022984212717629, "language_loss": 0.73327315, "learning_rate": 2.072428522510651e-06, "loss": 0.7553544, "num_input_tokens_seen": 90340360, "step": 4190, "time_per_iteration": 2.630311965942383 }, { "auxiliary_loss_clip": 0.01171516, "auxiliary_loss_mlp": 0.01028597, "balance_loss_clip": 0.93698871, "balance_loss_mlp": 1.02134895, "epoch": 0.5039379546684303, "flos": 21907987286400.0, "grad_norm": 2.2378223865920344, "language_loss": 0.76168674, "learning_rate": 2.071650056356968e-06, "loss": 0.78368783, "num_input_tokens_seen": 90357900, "step": 4191, "time_per_iteration": 2.720451593399048 }, { "auxiliary_loss_clip": 0.01178433, "auxiliary_loss_mlp": 0.01029818, "balance_loss_clip": 1.05209088, "balance_loss_mlp": 1.02255237, "epoch": 0.5040581975590693, "flos": 20010718909440.0, "grad_norm": 1.8694560186245304, "language_loss": 0.79955256, "learning_rate": 2.070871579334028e-06, "loss": 0.82163501, "num_input_tokens_seen": 90377010, "step": 4192, "time_per_iteration": 2.5952701568603516 }, { "auxiliary_loss_clip": 0.01178264, "auxiliary_loss_mlp": 0.01025912, "balance_loss_clip": 1.05077457, "balance_loss_mlp": 1.01804423, "epoch": 0.5041784404497084, "flos": 20959837931520.0, "grad_norm": 1.741932616671515, "language_loss": 0.71695673, "learning_rate": 2.0700930915599264e-06, "loss": 0.73899847, "num_input_tokens_seen": 90396740, "step": 4193, "time_per_iteration": 2.5855839252471924 }, { "auxiliary_loss_clip": 0.01178735, "auxiliary_loss_mlp": 0.01026281, "balance_loss_clip": 1.05234432, "balance_loss_mlp": 1.01879501, "epoch": 0.5042986833403476, "flos": 12495082757760.0, "grad_norm": 2.0696338484682224, "language_loss": 0.78513116, "learning_rate": 2.0693145931527583e-06, "loss": 0.80718136, "num_input_tokens_seen": 90413220, "step": 4194, "time_per_iteration": 2.6333892345428467 }, { "auxiliary_loss_clip": 0.01172995, "auxiliary_loss_mlp": 0.01028637, "balance_loss_clip": 0.97381413, "balance_loss_mlp": 1.0210073, "epoch": 0.5044189262309866, "flos": 29202305788800.0, "grad_norm": 1.474940125748584, "language_loss": 0.7800231, "learning_rate": 2.068536084230622e-06, "loss": 0.80203944, "num_input_tokens_seen": 90435085, "step": 4195, "time_per_iteration": 2.7074854373931885 }, { "auxiliary_loss_clip": 0.01179584, "auxiliary_loss_mlp": 0.01030598, "balance_loss_clip": 1.01449609, "balance_loss_mlp": 1.02231848, "epoch": 0.5045391691216257, "flos": 23873198238720.0, "grad_norm": 2.3726436118800383, "language_loss": 0.88793147, "learning_rate": 2.067757564911616e-06, "loss": 0.91003329, "num_input_tokens_seen": 90453660, "step": 4196, "time_per_iteration": 2.6841492652893066 }, { "auxiliary_loss_clip": 0.01185685, "auxiliary_loss_mlp": 0.01122915, "balance_loss_clip": 0.97612113, "balance_loss_mlp": 0.0, "epoch": 0.5046594120122648, "flos": 24644990793600.0, "grad_norm": 1.9176359553213722, "language_loss": 0.92569733, "learning_rate": 2.0669790353138407e-06, "loss": 0.94878328, "num_input_tokens_seen": 90472625, "step": 4197, "time_per_iteration": 2.7545149326324463 }, { "auxiliary_loss_clip": 0.01171172, "auxiliary_loss_mlp": 0.01123065, "balance_loss_clip": 0.93862993, "balance_loss_mlp": 0.0, "epoch": 0.5047796549029039, "flos": 23362835846400.0, "grad_norm": 2.1248968261290697, "language_loss": 0.72905093, "learning_rate": 2.0662004955553995e-06, "loss": 0.7519933, "num_input_tokens_seen": 90492325, "step": 4198, "time_per_iteration": 2.731682777404785 }, { "auxiliary_loss_clip": 0.01172709, "auxiliary_loss_mlp": 0.0102898, "balance_loss_clip": 0.97331351, "balance_loss_mlp": 1.02154732, "epoch": 0.5048998977935429, "flos": 17304095329920.0, "grad_norm": 1.8999935153328973, "language_loss": 0.76972675, "learning_rate": 2.065421945754395e-06, "loss": 0.79174358, "num_input_tokens_seen": 90510055, "step": 4199, "time_per_iteration": 2.6728670597076416 }, { "auxiliary_loss_clip": 0.01178593, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 0.90268248, "balance_loss_mlp": 1.02145994, "epoch": 0.505020140684182, "flos": 34856979235200.0, "grad_norm": 1.576679524474797, "language_loss": 0.78073895, "learning_rate": 2.0646433860289344e-06, "loss": 0.80281675, "num_input_tokens_seen": 90528980, "step": 4200, "time_per_iteration": 2.895808219909668 }, { "auxiliary_loss_clip": 0.01183013, "auxiliary_loss_mlp": 0.01123134, "balance_loss_clip": 1.0141753, "balance_loss_mlp": 0.0, "epoch": 0.5051403835748212, "flos": 24863974058880.0, "grad_norm": 2.049127204466992, "language_loss": 0.82974696, "learning_rate": 2.0638648164971233e-06, "loss": 0.85280842, "num_input_tokens_seen": 90547445, "step": 4201, "time_per_iteration": 3.813405752182007 }, { "auxiliary_loss_clip": 0.01177269, "auxiliary_loss_mlp": 0.01027912, "balance_loss_clip": 0.97592896, "balance_loss_mlp": 1.020908, "epoch": 0.5052606264654602, "flos": 20959694277120.0, "grad_norm": 2.1569500690350085, "language_loss": 0.88711727, "learning_rate": 2.06308623727707e-06, "loss": 0.90916908, "num_input_tokens_seen": 90567545, "step": 4202, "time_per_iteration": 2.7839815616607666 }, { "auxiliary_loss_clip": 0.01177857, "auxiliary_loss_mlp": 0.01027115, "balance_loss_clip": 1.01449239, "balance_loss_mlp": 1.01897264, "epoch": 0.5053808693560993, "flos": 19642382893440.0, "grad_norm": 2.3677188487423804, "language_loss": 0.76409596, "learning_rate": 2.0623076484868846e-06, "loss": 0.78614569, "num_input_tokens_seen": 90585000, "step": 4203, "time_per_iteration": 2.6011080741882324 }, { "auxiliary_loss_clip": 0.01086139, "auxiliary_loss_mlp": 0.01000395, "balance_loss_clip": 0.95172143, "balance_loss_mlp": 0.99845171, "epoch": 0.5055011122467384, "flos": 67504915019520.0, "grad_norm": 0.8368251101518631, "language_loss": 0.6076355, "learning_rate": 2.061529050244679e-06, "loss": 0.62850082, "num_input_tokens_seen": 90644745, "step": 4204, "time_per_iteration": 4.076733589172363 }, { "auxiliary_loss_clip": 0.01182881, "auxiliary_loss_mlp": 0.01023484, "balance_loss_clip": 0.93705302, "balance_loss_mlp": 1.01627517, "epoch": 0.5056213551373775, "flos": 16872952383360.0, "grad_norm": 1.8098982164853743, "language_loss": 0.7416901, "learning_rate": 2.060750442668565e-06, "loss": 0.76375371, "num_input_tokens_seen": 90662500, "step": 4205, "time_per_iteration": 2.6503756046295166 }, { "auxiliary_loss_clip": 0.01180024, "auxiliary_loss_mlp": 0.01028257, "balance_loss_clip": 1.01603878, "balance_loss_mlp": 1.02074349, "epoch": 0.5057415980280165, "flos": 15334179696000.0, "grad_norm": 2.0790670382211185, "language_loss": 0.64219826, "learning_rate": 2.059971825876657e-06, "loss": 0.66428107, "num_input_tokens_seen": 90677010, "step": 4206, "time_per_iteration": 2.60996413230896 }, { "auxiliary_loss_clip": 0.01183082, "auxiliary_loss_mlp": 0.0102243, "balance_loss_clip": 1.01584196, "balance_loss_mlp": 1.01473486, "epoch": 0.5058618409186557, "flos": 19025976574080.0, "grad_norm": 1.7971696628897893, "language_loss": 0.76468855, "learning_rate": 2.0591931999870713e-06, "loss": 0.7867437, "num_input_tokens_seen": 90695935, "step": 4207, "time_per_iteration": 2.6286261081695557 }, { "auxiliary_loss_clip": 0.01082588, "auxiliary_loss_mlp": 0.00999784, "balance_loss_clip": 0.98792613, "balance_loss_mlp": 0.99792403, "epoch": 0.5059820838092948, "flos": 63453114080640.0, "grad_norm": 0.8339636326111658, "language_loss": 0.57637185, "learning_rate": 2.0584145651179234e-06, "loss": 0.59719557, "num_input_tokens_seen": 90751645, "step": 4208, "time_per_iteration": 3.202253818511963 }, { "auxiliary_loss_clip": 0.01178555, "auxiliary_loss_mlp": 0.01122426, "balance_loss_clip": 0.97924042, "balance_loss_mlp": 0.0, "epoch": 0.5061023266999338, "flos": 15441803821440.0, "grad_norm": 2.2977027989890213, "language_loss": 0.80088323, "learning_rate": 2.0576359213873327e-06, "loss": 0.82389307, "num_input_tokens_seen": 90766795, "step": 4209, "time_per_iteration": 2.6478488445281982 }, { "auxiliary_loss_clip": 0.0118338, "auxiliary_loss_mlp": 0.01031603, "balance_loss_clip": 0.97335267, "balance_loss_mlp": 1.0239799, "epoch": 0.506222569590573, "flos": 22451063990400.0, "grad_norm": 2.5958403079984036, "language_loss": 0.70735663, "learning_rate": 2.056857268913419e-06, "loss": 0.72950649, "num_input_tokens_seen": 90786845, "step": 4210, "time_per_iteration": 3.5354270935058594 }, { "auxiliary_loss_clip": 0.01176776, "auxiliary_loss_mlp": 0.01033227, "balance_loss_clip": 1.01329267, "balance_loss_mlp": 1.02597308, "epoch": 0.506342812481212, "flos": 17558665994880.0, "grad_norm": 2.323039433326054, "language_loss": 0.83985078, "learning_rate": 2.056078607814303e-06, "loss": 0.86195076, "num_input_tokens_seen": 90802630, "step": 4211, "time_per_iteration": 2.64373517036438 }, { "auxiliary_loss_clip": 0.01174217, "auxiliary_loss_mlp": 0.01025761, "balance_loss_clip": 1.01342165, "balance_loss_mlp": 1.01826859, "epoch": 0.5064630553718511, "flos": 23402050519680.0, "grad_norm": 1.8704216059035683, "language_loss": 0.78563392, "learning_rate": 2.055299938208106e-06, "loss": 0.8076337, "num_input_tokens_seen": 90823620, "step": 4212, "time_per_iteration": 3.5923702716827393 }, { "auxiliary_loss_clip": 0.01181443, "auxiliary_loss_mlp": 0.01031664, "balance_loss_clip": 1.01339912, "balance_loss_mlp": 1.02409768, "epoch": 0.5065832982624903, "flos": 23987035416960.0, "grad_norm": 1.6680309506424476, "language_loss": 0.8633216, "learning_rate": 2.0545212602129526e-06, "loss": 0.88545263, "num_input_tokens_seen": 90843475, "step": 4213, "time_per_iteration": 2.7869973182678223 }, { "auxiliary_loss_clip": 0.01171096, "auxiliary_loss_mlp": 0.01030237, "balance_loss_clip": 0.97443837, "balance_loss_mlp": 1.0221132, "epoch": 0.5067035411531293, "flos": 21503058289920.0, "grad_norm": 1.915399665637628, "language_loss": 0.66341102, "learning_rate": 2.0537425739469673e-06, "loss": 0.68542433, "num_input_tokens_seen": 90862410, "step": 4214, "time_per_iteration": 2.7864463329315186 }, { "auxiliary_loss_clip": 0.01088742, "auxiliary_loss_mlp": 0.01003705, "balance_loss_clip": 0.9888767, "balance_loss_mlp": 1.00180936, "epoch": 0.5068237840437684, "flos": 65934397687680.0, "grad_norm": 0.8445768424505303, "language_loss": 0.59568948, "learning_rate": 2.052963879528276e-06, "loss": 0.61661398, "num_input_tokens_seen": 90922280, "step": 4215, "time_per_iteration": 3.1675055027008057 }, { "auxiliary_loss_clip": 0.01178472, "auxiliary_loss_mlp": 0.01027643, "balance_loss_clip": 1.01471782, "balance_loss_mlp": 1.01981056, "epoch": 0.5069440269344075, "flos": 27264206626560.0, "grad_norm": 2.0173410510006, "language_loss": 0.76813245, "learning_rate": 2.052185177075007e-06, "loss": 0.79019362, "num_input_tokens_seen": 90941850, "step": 4216, "time_per_iteration": 2.721036911010742 }, { "auxiliary_loss_clip": 0.01178902, "auxiliary_loss_mlp": 0.01024267, "balance_loss_clip": 1.01333761, "balance_loss_mlp": 1.01670957, "epoch": 0.5070642698250466, "flos": 23366319465600.0, "grad_norm": 1.8055564090813778, "language_loss": 0.83015895, "learning_rate": 2.051406466705288e-06, "loss": 0.85219073, "num_input_tokens_seen": 90961390, "step": 4217, "time_per_iteration": 2.661827802658081 }, { "auxiliary_loss_clip": 0.01180234, "auxiliary_loss_mlp": 0.01031526, "balance_loss_clip": 1.0516355, "balance_loss_mlp": 1.02460623, "epoch": 0.5071845127156857, "flos": 20340127560960.0, "grad_norm": 2.0040788347323004, "language_loss": 0.80833769, "learning_rate": 2.0506277485372486e-06, "loss": 0.8304553, "num_input_tokens_seen": 90980215, "step": 4218, "time_per_iteration": 2.6305954456329346 }, { "auxiliary_loss_clip": 0.01170922, "auxiliary_loss_mlp": 0.01026385, "balance_loss_clip": 1.01258481, "balance_loss_mlp": 1.01850569, "epoch": 0.5073047556063248, "flos": 12092955022080.0, "grad_norm": 1.7659028303097246, "language_loss": 0.67149824, "learning_rate": 2.04984902268902e-06, "loss": 0.69347125, "num_input_tokens_seen": 90997415, "step": 4219, "time_per_iteration": 2.6175570487976074 }, { "auxiliary_loss_clip": 0.0118569, "auxiliary_loss_mlp": 0.01025502, "balance_loss_clip": 1.01313114, "balance_loss_mlp": 1.01724064, "epoch": 0.5074249984969639, "flos": 19682854542720.0, "grad_norm": 2.5482121544366723, "language_loss": 0.75751704, "learning_rate": 2.0490702892787345e-06, "loss": 0.77962899, "num_input_tokens_seen": 91016475, "step": 4220, "time_per_iteration": 2.7296864986419678 }, { "auxiliary_loss_clip": 0.01166513, "auxiliary_loss_mlp": 0.010256, "balance_loss_clip": 1.01000059, "balance_loss_mlp": 1.0187242, "epoch": 0.5075452413876029, "flos": 28765703975040.0, "grad_norm": 1.5317066585117391, "language_loss": 0.62243283, "learning_rate": 2.0482915484245246e-06, "loss": 0.64435393, "num_input_tokens_seen": 91038095, "step": 4221, "time_per_iteration": 2.7079062461853027 }, { "auxiliary_loss_clip": 0.01169266, "auxiliary_loss_mlp": 0.01029684, "balance_loss_clip": 0.89944434, "balance_loss_mlp": 1.02176905, "epoch": 0.5076654842782421, "flos": 20339445202560.0, "grad_norm": 2.274220176960059, "language_loss": 0.84018338, "learning_rate": 2.047512800244526e-06, "loss": 0.8621729, "num_input_tokens_seen": 91053360, "step": 4222, "time_per_iteration": 2.730999231338501 }, { "auxiliary_loss_clip": 0.01177278, "auxiliary_loss_mlp": 0.01025622, "balance_loss_clip": 1.01404667, "balance_loss_mlp": 1.01845169, "epoch": 0.5077857271688812, "flos": 26359653404160.0, "grad_norm": 1.8628885528550252, "language_loss": 0.78758144, "learning_rate": 2.046734044856873e-06, "loss": 0.80961049, "num_input_tokens_seen": 91072770, "step": 4223, "time_per_iteration": 2.6440250873565674 }, { "auxiliary_loss_clip": 0.01173654, "auxiliary_loss_mlp": 0.01024487, "balance_loss_clip": 1.01229, "balance_loss_mlp": 1.01717925, "epoch": 0.5079059700595202, "flos": 21798962530560.0, "grad_norm": 1.8552068204636436, "language_loss": 0.81089169, "learning_rate": 2.045955282379702e-06, "loss": 0.83287311, "num_input_tokens_seen": 91091430, "step": 4224, "time_per_iteration": 2.6920578479766846 }, { "auxiliary_loss_clip": 0.01173808, "auxiliary_loss_mlp": 0.010267, "balance_loss_clip": 1.0103997, "balance_loss_mlp": 1.01882625, "epoch": 0.5080262129501594, "flos": 13187943175680.0, "grad_norm": 5.269708855888101, "language_loss": 0.75704652, "learning_rate": 2.045176512931152e-06, "loss": 0.7790516, "num_input_tokens_seen": 91106060, "step": 4225, "time_per_iteration": 2.635894775390625 }, { "auxiliary_loss_clip": 0.01178801, "auxiliary_loss_mlp": 0.01028745, "balance_loss_clip": 0.93808031, "balance_loss_mlp": 1.02145243, "epoch": 0.5081464558407984, "flos": 25301473712640.0, "grad_norm": 1.8043793025899886, "language_loss": 0.7615605, "learning_rate": 2.0443977366293604e-06, "loss": 0.78363597, "num_input_tokens_seen": 91124100, "step": 4226, "time_per_iteration": 2.729919910430908 }, { "auxiliary_loss_clip": 0.01171026, "auxiliary_loss_mlp": 0.0102855, "balance_loss_clip": 0.85788864, "balance_loss_mlp": 1.02036595, "epoch": 0.5082666987314375, "flos": 30951226995840.0, "grad_norm": 2.081512821353048, "language_loss": 0.76808429, "learning_rate": 2.043618953592468e-06, "loss": 0.79008007, "num_input_tokens_seen": 91146555, "step": 4227, "time_per_iteration": 3.93312406539917 }, { "auxiliary_loss_clip": 0.01176326, "auxiliary_loss_mlp": 0.01026762, "balance_loss_clip": 0.97554231, "balance_loss_mlp": 1.01844096, "epoch": 0.5083869416220766, "flos": 19682495406720.0, "grad_norm": 1.4737605152804958, "language_loss": 0.81370008, "learning_rate": 2.0428401639386144e-06, "loss": 0.83573091, "num_input_tokens_seen": 91167120, "step": 4228, "time_per_iteration": 2.929112434387207 }, { "auxiliary_loss_clip": 0.01082715, "auxiliary_loss_mlp": 0.01003048, "balance_loss_clip": 0.95048159, "balance_loss_mlp": 1.00117671, "epoch": 0.5085071845127157, "flos": 71817535589760.0, "grad_norm": 0.8281621713429569, "language_loss": 0.5826053, "learning_rate": 2.042061367785943e-06, "loss": 0.60346287, "num_input_tokens_seen": 91220260, "step": 4229, "time_per_iteration": 3.2506000995635986 }, { "auxiliary_loss_clip": 0.01174606, "auxiliary_loss_mlp": 0.01025504, "balance_loss_clip": 0.93441665, "balance_loss_mlp": 1.01783895, "epoch": 0.5086274274033548, "flos": 35951608252800.0, "grad_norm": 2.3513164551559766, "language_loss": 0.7539714, "learning_rate": 2.041282565252594e-06, "loss": 0.77597249, "num_input_tokens_seen": 91240425, "step": 4230, "time_per_iteration": 3.7685980796813965 }, { "auxiliary_loss_clip": 0.01170916, "auxiliary_loss_mlp": 0.01028162, "balance_loss_clip": 0.93361646, "balance_loss_mlp": 1.02053261, "epoch": 0.5087476702939938, "flos": 23513732881920.0, "grad_norm": 1.5768730397576103, "language_loss": 0.77216959, "learning_rate": 2.040503756456714e-06, "loss": 0.79416037, "num_input_tokens_seen": 91259635, "step": 4231, "time_per_iteration": 2.7548093795776367 }, { "auxiliary_loss_clip": 0.0117083, "auxiliary_loss_mlp": 0.0103033, "balance_loss_clip": 1.01046944, "balance_loss_mlp": 1.02262354, "epoch": 0.508867913184633, "flos": 15122091841920.0, "grad_norm": 2.4420038439330916, "language_loss": 0.78696465, "learning_rate": 2.0397249415164456e-06, "loss": 0.80897623, "num_input_tokens_seen": 91276990, "step": 4232, "time_per_iteration": 2.654402494430542 }, { "auxiliary_loss_clip": 0.01165535, "auxiliary_loss_mlp": 0.01022459, "balance_loss_clip": 0.97059703, "balance_loss_mlp": 1.01525021, "epoch": 0.508988156075272, "flos": 25885309374720.0, "grad_norm": 1.5426630677368576, "language_loss": 0.79935175, "learning_rate": 2.0389461205499354e-06, "loss": 0.82123172, "num_input_tokens_seen": 91296125, "step": 4233, "time_per_iteration": 2.654046058654785 }, { "auxiliary_loss_clip": 0.0117535, "auxiliary_loss_mlp": 0.01027123, "balance_loss_clip": 0.93554533, "balance_loss_mlp": 1.02009821, "epoch": 0.5091083989659111, "flos": 13844857057920.0, "grad_norm": 1.8647457459779042, "language_loss": 0.73547131, "learning_rate": 2.03816729367533e-06, "loss": 0.757496, "num_input_tokens_seen": 91314280, "step": 4234, "time_per_iteration": 2.7211925983428955 }, { "auxiliary_loss_clip": 0.01184054, "auxiliary_loss_mlp": 0.01034225, "balance_loss_clip": 0.97756302, "balance_loss_mlp": 1.02620268, "epoch": 0.5092286418565503, "flos": 21104881050240.0, "grad_norm": 1.9807819943303764, "language_loss": 0.7163533, "learning_rate": 2.0373884610107765e-06, "loss": 0.73853612, "num_input_tokens_seen": 91334595, "step": 4235, "time_per_iteration": 3.6406452655792236 }, { "auxiliary_loss_clip": 0.01179546, "auxiliary_loss_mlp": 0.01026446, "balance_loss_clip": 1.01144087, "balance_loss_mlp": 1.01892662, "epoch": 0.5093488847471893, "flos": 18621298972800.0, "grad_norm": 2.5078438563994525, "language_loss": 0.69583064, "learning_rate": 2.0366096226744225e-06, "loss": 0.71789056, "num_input_tokens_seen": 91349790, "step": 4236, "time_per_iteration": 2.5896196365356445 }, { "auxiliary_loss_clip": 0.01167828, "auxiliary_loss_mlp": 0.01032804, "balance_loss_clip": 1.01169229, "balance_loss_mlp": 1.02560973, "epoch": 0.5094691276378284, "flos": 23803783205760.0, "grad_norm": 2.8185280347683226, "language_loss": 0.76487267, "learning_rate": 2.035830778784418e-06, "loss": 0.786879, "num_input_tokens_seen": 91370465, "step": 4237, "time_per_iteration": 3.5796875953674316 }, { "auxiliary_loss_clip": 0.01180237, "auxiliary_loss_mlp": 0.01032719, "balance_loss_clip": 0.97740608, "balance_loss_mlp": 1.02541769, "epoch": 0.5095893705284675, "flos": 17420410546560.0, "grad_norm": 1.7626853923622143, "language_loss": 0.80092341, "learning_rate": 2.0350519294589134e-06, "loss": 0.823053, "num_input_tokens_seen": 91388505, "step": 4238, "time_per_iteration": 2.621946096420288 }, { "auxiliary_loss_clip": 0.01169223, "auxiliary_loss_mlp": 0.01028648, "balance_loss_clip": 0.89459842, "balance_loss_mlp": 1.02063143, "epoch": 0.5097096134191066, "flos": 25849362839040.0, "grad_norm": 1.7950195195035714, "language_loss": 0.82854915, "learning_rate": 2.0342730748160588e-06, "loss": 0.85052776, "num_input_tokens_seen": 91408970, "step": 4239, "time_per_iteration": 2.7966928482055664 }, { "auxiliary_loss_clip": 0.01172486, "auxiliary_loss_mlp": 0.01029107, "balance_loss_clip": 0.97158223, "balance_loss_mlp": 1.0209825, "epoch": 0.5098298563097456, "flos": 27745122844800.0, "grad_norm": 1.825879298399853, "language_loss": 0.71106422, "learning_rate": 2.033494214974006e-06, "loss": 0.73308015, "num_input_tokens_seen": 91430115, "step": 4240, "time_per_iteration": 2.762299060821533 }, { "auxiliary_loss_clip": 0.01166147, "auxiliary_loss_mlp": 0.01027361, "balance_loss_clip": 0.9743073, "balance_loss_mlp": 1.02037823, "epoch": 0.5099500992003848, "flos": 21358913011200.0, "grad_norm": 1.8418302815786745, "language_loss": 0.83677322, "learning_rate": 2.0327153500509067e-06, "loss": 0.85870826, "num_input_tokens_seen": 91449140, "step": 4241, "time_per_iteration": 2.6896393299102783 }, { "auxiliary_loss_clip": 0.01177268, "auxiliary_loss_mlp": 0.0102849, "balance_loss_clip": 0.97446805, "balance_loss_mlp": 1.02079177, "epoch": 0.5100703420910239, "flos": 19865999013120.0, "grad_norm": 1.8932330464808924, "language_loss": 0.84787202, "learning_rate": 2.031936480164916e-06, "loss": 0.86992967, "num_input_tokens_seen": 91466880, "step": 4242, "time_per_iteration": 2.687657117843628 }, { "auxiliary_loss_clip": 0.01173793, "auxiliary_loss_mlp": 0.01031772, "balance_loss_clip": 0.97575492, "balance_loss_mlp": 1.02461362, "epoch": 0.5101905849816629, "flos": 24648797635200.0, "grad_norm": 1.8265315138822675, "language_loss": 0.80418515, "learning_rate": 2.0311576054341857e-06, "loss": 0.82624084, "num_input_tokens_seen": 91487495, "step": 4243, "time_per_iteration": 2.695629835128784 }, { "auxiliary_loss_clip": 0.01179102, "auxiliary_loss_mlp": 0.01027173, "balance_loss_clip": 1.05183005, "balance_loss_mlp": 1.0193764, "epoch": 0.5103108278723021, "flos": 22930076787840.0, "grad_norm": 1.5552975844561523, "language_loss": 0.62776554, "learning_rate": 2.0303787259768715e-06, "loss": 0.64982826, "num_input_tokens_seen": 91508395, "step": 4244, "time_per_iteration": 2.677269697189331 }, { "auxiliary_loss_clip": 0.0117785, "auxiliary_loss_mlp": 0.01028418, "balance_loss_clip": 0.97720081, "balance_loss_mlp": 1.02066374, "epoch": 0.5104310707629411, "flos": 21506613736320.0, "grad_norm": 3.036822237607028, "language_loss": 0.69270647, "learning_rate": 2.0295998419111294e-06, "loss": 0.71476912, "num_input_tokens_seen": 91525685, "step": 4245, "time_per_iteration": 2.7271313667297363 }, { "auxiliary_loss_clip": 0.01173057, "auxiliary_loss_mlp": 0.0102796, "balance_loss_clip": 0.85792196, "balance_loss_mlp": 1.01940084, "epoch": 0.5105513136535802, "flos": 14903180403840.0, "grad_norm": 2.225124051974836, "language_loss": 0.73787111, "learning_rate": 2.028820953355115e-06, "loss": 0.75988132, "num_input_tokens_seen": 91543785, "step": 4246, "time_per_iteration": 2.8412020206451416 }, { "auxiliary_loss_clip": 0.01182932, "auxiliary_loss_mlp": 0.01031034, "balance_loss_clip": 0.97348106, "balance_loss_mlp": 1.02273142, "epoch": 0.5106715565442194, "flos": 22602212421120.0, "grad_norm": 6.549200929290353, "language_loss": 0.78373486, "learning_rate": 2.0280420604269834e-06, "loss": 0.80587453, "num_input_tokens_seen": 91563325, "step": 4247, "time_per_iteration": 2.8347532749176025 }, { "auxiliary_loss_clip": 0.0108362, "auxiliary_loss_mlp": 0.01007287, "balance_loss_clip": 0.98523021, "balance_loss_mlp": 1.00546288, "epoch": 0.5107917994348584, "flos": 71027645558400.0, "grad_norm": 0.7168382879251968, "language_loss": 0.58994943, "learning_rate": 2.027263163244895e-06, "loss": 0.6108585, "num_input_tokens_seen": 91632450, "step": 4248, "time_per_iteration": 3.342674732208252 }, { "auxiliary_loss_clip": 0.01175107, "auxiliary_loss_mlp": 0.01025495, "balance_loss_clip": 1.01422811, "balance_loss_mlp": 1.01778173, "epoch": 0.5109120423254975, "flos": 24827416992000.0, "grad_norm": 1.6129533903907167, "language_loss": 0.74516106, "learning_rate": 2.026484261927005e-06, "loss": 0.76716715, "num_input_tokens_seen": 91651945, "step": 4249, "time_per_iteration": 2.712252140045166 }, { "auxiliary_loss_clip": 0.01185025, "auxiliary_loss_mlp": 0.01027387, "balance_loss_clip": 1.01623893, "balance_loss_mlp": 1.01965666, "epoch": 0.5110322852161366, "flos": 21247661612160.0, "grad_norm": 2.123438624107982, "language_loss": 0.74336815, "learning_rate": 2.025705356591475e-06, "loss": 0.76549226, "num_input_tokens_seen": 91669635, "step": 4250, "time_per_iteration": 2.6301887035369873 }, { "auxiliary_loss_clip": 0.01085942, "auxiliary_loss_mlp": 0.01116403, "balance_loss_clip": 0.91267687, "balance_loss_mlp": 0.0, "epoch": 0.5111525281067757, "flos": 66457114358400.0, "grad_norm": 0.7642682826861105, "language_loss": 0.58003616, "learning_rate": 2.024926447356462e-06, "loss": 0.6020596, "num_input_tokens_seen": 91731920, "step": 4251, "time_per_iteration": 3.2508060932159424 }, { "auxiliary_loss_clip": 0.01178648, "auxiliary_loss_mlp": 0.0102722, "balance_loss_clip": 1.01360774, "balance_loss_mlp": 1.01926899, "epoch": 0.5112727709974147, "flos": 14866731077760.0, "grad_norm": 1.742924871037084, "language_loss": 0.78548455, "learning_rate": 2.024147534340127e-06, "loss": 0.80754322, "num_input_tokens_seen": 91749780, "step": 4252, "time_per_iteration": 2.5660741329193115 }, { "auxiliary_loss_clip": 0.01170138, "auxiliary_loss_mlp": 0.01027371, "balance_loss_clip": 0.97101426, "balance_loss_mlp": 1.0195632, "epoch": 0.5113930138880539, "flos": 21177600134400.0, "grad_norm": 1.4842416972279542, "language_loss": 0.79598016, "learning_rate": 2.02336861766063e-06, "loss": 0.81795526, "num_input_tokens_seen": 91768840, "step": 4253, "time_per_iteration": 3.6996726989746094 }, { "auxiliary_loss_clip": 0.01186398, "auxiliary_loss_mlp": 0.0102564, "balance_loss_clip": 1.0151279, "balance_loss_mlp": 1.0178318, "epoch": 0.511513256778693, "flos": 20409111630720.0, "grad_norm": 1.8131908238161927, "language_loss": 0.78741401, "learning_rate": 2.0225896974361327e-06, "loss": 0.80953443, "num_input_tokens_seen": 91788945, "step": 4254, "time_per_iteration": 2.6979641914367676 }, { "auxiliary_loss_clip": 0.01083497, "auxiliary_loss_mlp": 0.01004624, "balance_loss_clip": 0.90866041, "balance_loss_mlp": 1.00274014, "epoch": 0.511633499669332, "flos": 69879975131520.0, "grad_norm": 1.0092126819043963, "language_loss": 0.59999585, "learning_rate": 2.0218107737847962e-06, "loss": 0.62087703, "num_input_tokens_seen": 91850990, "step": 4255, "time_per_iteration": 3.3164443969726562 }, { "auxiliary_loss_clip": 0.01180178, "auxiliary_loss_mlp": 0.010237, "balance_loss_clip": 1.05261445, "balance_loss_mlp": 1.01590395, "epoch": 0.5117537425599712, "flos": 24097855852800.0, "grad_norm": 2.3892092638927256, "language_loss": 0.7512219, "learning_rate": 2.0210318468247826e-06, "loss": 0.77326065, "num_input_tokens_seen": 91869960, "step": 4256, "time_per_iteration": 3.658742666244507 }, { "auxiliary_loss_clip": 0.01173604, "auxiliary_loss_mlp": 0.01029151, "balance_loss_clip": 0.97266752, "balance_loss_mlp": 1.02190888, "epoch": 0.5118739854506102, "flos": 20959550622720.0, "grad_norm": 1.9042885127774343, "language_loss": 0.81961513, "learning_rate": 2.020252916674255e-06, "loss": 0.84164274, "num_input_tokens_seen": 91889075, "step": 4257, "time_per_iteration": 2.684547185897827 }, { "auxiliary_loss_clip": 0.01178579, "auxiliary_loss_mlp": 0.01025461, "balance_loss_clip": 1.01258314, "balance_loss_mlp": 1.01749802, "epoch": 0.5119942283412493, "flos": 17457326749440.0, "grad_norm": 1.6747749343251748, "language_loss": 0.81451154, "learning_rate": 2.019473983451375e-06, "loss": 0.83655202, "num_input_tokens_seen": 91907495, "step": 4258, "time_per_iteration": 2.649697780609131 }, { "auxiliary_loss_clip": 0.01182898, "auxiliary_loss_mlp": 0.01030628, "balance_loss_clip": 0.93658966, "balance_loss_mlp": 1.02255189, "epoch": 0.5121144712318885, "flos": 21066743784960.0, "grad_norm": 1.726425132415522, "language_loss": 0.71831226, "learning_rate": 2.0186950472743076e-06, "loss": 0.74044752, "num_input_tokens_seen": 91927400, "step": 4259, "time_per_iteration": 2.7100472450256348 }, { "auxiliary_loss_clip": 0.01178306, "auxiliary_loss_mlp": 0.01032334, "balance_loss_clip": 1.05023885, "balance_loss_mlp": 1.02457619, "epoch": 0.5122347141225275, "flos": 19860791541120.0, "grad_norm": 1.5862006545456424, "language_loss": 0.73775983, "learning_rate": 2.0179161082612162e-06, "loss": 0.75986618, "num_input_tokens_seen": 91946790, "step": 4260, "time_per_iteration": 2.552375555038452 }, { "auxiliary_loss_clip": 0.01170002, "auxiliary_loss_mlp": 0.01025927, "balance_loss_clip": 0.97193933, "balance_loss_mlp": 1.01796341, "epoch": 0.5123549570131666, "flos": 22528487756160.0, "grad_norm": 1.7626861393265931, "language_loss": 0.72490537, "learning_rate": 2.017137166530266e-06, "loss": 0.74686468, "num_input_tokens_seen": 91966325, "step": 4261, "time_per_iteration": 3.5390467643737793 }, { "auxiliary_loss_clip": 0.01182274, "auxiliary_loss_mlp": 0.01027799, "balance_loss_clip": 0.97514653, "balance_loss_mlp": 1.02010727, "epoch": 0.5124751999038056, "flos": 20333375804160.0, "grad_norm": 2.1140228330961266, "language_loss": 0.79945409, "learning_rate": 2.0163582221996213e-06, "loss": 0.82155484, "num_input_tokens_seen": 91984700, "step": 4262, "time_per_iteration": 2.6528232097625732 }, { "auxiliary_loss_clip": 0.01177189, "auxiliary_loss_mlp": 0.01027141, "balance_loss_clip": 0.97419244, "balance_loss_mlp": 1.01935661, "epoch": 0.5125954427944448, "flos": 39785970211200.0, "grad_norm": 1.688842309737281, "language_loss": 0.67779171, "learning_rate": 2.015579275387446e-06, "loss": 0.69983506, "num_input_tokens_seen": 92010020, "step": 4263, "time_per_iteration": 3.7060768604278564 }, { "auxiliary_loss_clip": 0.01165748, "auxiliary_loss_mlp": 0.01026486, "balance_loss_clip": 0.97449285, "balance_loss_mlp": 1.01861286, "epoch": 0.5127156856850839, "flos": 29205394358400.0, "grad_norm": 2.7528317202091306, "language_loss": 0.6875596, "learning_rate": 2.0148003262119085e-06, "loss": 0.70948195, "num_input_tokens_seen": 92030990, "step": 4264, "time_per_iteration": 2.768073320388794 }, { "auxiliary_loss_clip": 0.01175578, "auxiliary_loss_mlp": 0.01027434, "balance_loss_clip": 0.93694711, "balance_loss_mlp": 1.0190891, "epoch": 0.5128359285757229, "flos": 13553693412480.0, "grad_norm": 1.7317922988567316, "language_loss": 0.76441544, "learning_rate": 2.0140213747911728e-06, "loss": 0.78644556, "num_input_tokens_seen": 92049525, "step": 4265, "time_per_iteration": 2.684333086013794 }, { "auxiliary_loss_clip": 0.01175419, "auxiliary_loss_mlp": 0.01030571, "balance_loss_clip": 0.93791473, "balance_loss_mlp": 1.02216709, "epoch": 0.5129561714663621, "flos": 25192089820800.0, "grad_norm": 2.179863196788815, "language_loss": 0.80715382, "learning_rate": 2.013242421243406e-06, "loss": 0.82921374, "num_input_tokens_seen": 92068430, "step": 4266, "time_per_iteration": 2.760343313217163 }, { "auxiliary_loss_clip": 0.01175235, "auxiliary_loss_mlp": 0.01031115, "balance_loss_clip": 0.89977568, "balance_loss_mlp": 1.02327681, "epoch": 0.5130764143570011, "flos": 18150223080960.0, "grad_norm": 1.6247674571985953, "language_loss": 0.78977823, "learning_rate": 2.012463465686774e-06, "loss": 0.81184173, "num_input_tokens_seen": 92088180, "step": 4267, "time_per_iteration": 2.811763048171997 }, { "auxiliary_loss_clip": 0.01103354, "auxiliary_loss_mlp": 0.01003548, "balance_loss_clip": 0.88687116, "balance_loss_mlp": 1.00172448, "epoch": 0.5131966572476402, "flos": 59794896418560.0, "grad_norm": 0.7743373287857619, "language_loss": 0.5483886, "learning_rate": 2.0116845082394446e-06, "loss": 0.56945765, "num_input_tokens_seen": 92153015, "step": 4268, "time_per_iteration": 3.3414509296417236 }, { "auxiliary_loss_clip": 0.01180126, "auxiliary_loss_mlp": 0.01024542, "balance_loss_clip": 1.01253438, "balance_loss_mlp": 1.01672816, "epoch": 0.5133169001382794, "flos": 18515219132160.0, "grad_norm": 1.8066097013422124, "language_loss": 0.78790832, "learning_rate": 2.0109055490195836e-06, "loss": 0.809955, "num_input_tokens_seen": 92171470, "step": 4269, "time_per_iteration": 2.634348154067993 }, { "auxiliary_loss_clip": 0.01170923, "auxiliary_loss_mlp": 0.0102751, "balance_loss_clip": 0.85263389, "balance_loss_mlp": 1.01937962, "epoch": 0.5134371430289184, "flos": 15523537219200.0, "grad_norm": 1.9012827104769112, "language_loss": 0.63863307, "learning_rate": 2.0101265881453605e-06, "loss": 0.66061735, "num_input_tokens_seen": 92189945, "step": 4270, "time_per_iteration": 2.800853729248047 }, { "auxiliary_loss_clip": 0.01168416, "auxiliary_loss_mlp": 0.01026322, "balance_loss_clip": 0.97464263, "balance_loss_mlp": 1.01925921, "epoch": 0.5135573859195575, "flos": 21433786911360.0, "grad_norm": 2.184572312175384, "language_loss": 0.78057748, "learning_rate": 2.009347625734941e-06, "loss": 0.80252486, "num_input_tokens_seen": 92209855, "step": 4271, "time_per_iteration": 2.6830201148986816 }, { "auxiliary_loss_clip": 0.01186007, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 1.0566082, "balance_loss_mlp": 1.02427828, "epoch": 0.5136776288101966, "flos": 17712651600000.0, "grad_norm": 2.272007949250053, "language_loss": 0.75362831, "learning_rate": 2.0085686619064954e-06, "loss": 0.77580798, "num_input_tokens_seen": 92226295, "step": 4272, "time_per_iteration": 2.5979490280151367 }, { "auxiliary_loss_clip": 0.01186083, "auxiliary_loss_mlp": 0.01027686, "balance_loss_clip": 1.01681721, "balance_loss_mlp": 1.01960993, "epoch": 0.5137978717008357, "flos": 16581680997120.0, "grad_norm": 2.207917266449237, "language_loss": 0.82600129, "learning_rate": 2.00778969677819e-06, "loss": 0.84813893, "num_input_tokens_seen": 92243330, "step": 4273, "time_per_iteration": 2.6260414123535156 }, { "auxiliary_loss_clip": 0.01173821, "auxiliary_loss_mlp": 0.01028733, "balance_loss_clip": 0.97314262, "balance_loss_mlp": 1.02118731, "epoch": 0.5139181145914747, "flos": 20668243322880.0, "grad_norm": 1.6457028969998406, "language_loss": 0.64063716, "learning_rate": 2.0070107304681934e-06, "loss": 0.66266268, "num_input_tokens_seen": 92262285, "step": 4274, "time_per_iteration": 2.6899003982543945 }, { "auxiliary_loss_clip": 0.01175204, "auxiliary_loss_mlp": 0.01030263, "balance_loss_clip": 0.93880475, "balance_loss_mlp": 1.02251399, "epoch": 0.5140383574821139, "flos": 32926996546560.0, "grad_norm": 3.67224633657248, "language_loss": 0.78294075, "learning_rate": 2.006231763094675e-06, "loss": 0.80499542, "num_input_tokens_seen": 92283305, "step": 4275, "time_per_iteration": 2.80299973487854 }, { "auxiliary_loss_clip": 0.01173742, "auxiliary_loss_mlp": 0.01030677, "balance_loss_clip": 0.97815943, "balance_loss_mlp": 1.02320838, "epoch": 0.514158600372753, "flos": 19537093152000.0, "grad_norm": 2.228999410625891, "language_loss": 0.87521654, "learning_rate": 2.0054527947758027e-06, "loss": 0.89726073, "num_input_tokens_seen": 92302105, "step": 4276, "time_per_iteration": 2.662658214569092 }, { "auxiliary_loss_clip": 0.01084458, "auxiliary_loss_mlp": 0.0099975, "balance_loss_clip": 0.98673087, "balance_loss_mlp": 0.99809277, "epoch": 0.514278843263392, "flos": 62523855279360.0, "grad_norm": 0.8651603449748272, "language_loss": 0.55971205, "learning_rate": 2.004673825629746e-06, "loss": 0.58055413, "num_input_tokens_seen": 92362885, "step": 4277, "time_per_iteration": 3.227248191833496 }, { "auxiliary_loss_clip": 0.01169794, "auxiliary_loss_mlp": 0.01026206, "balance_loss_clip": 0.97293717, "balance_loss_mlp": 1.01867151, "epoch": 0.5143990861540312, "flos": 25882328545920.0, "grad_norm": 2.2018236728839877, "language_loss": 0.72398162, "learning_rate": 2.0038948557746744e-06, "loss": 0.74594164, "num_input_tokens_seen": 92384740, "step": 4278, "time_per_iteration": 2.6913743019104004 }, { "auxiliary_loss_clip": 0.01174063, "auxiliary_loss_mlp": 0.01026308, "balance_loss_clip": 1.01334643, "balance_loss_mlp": 1.01889944, "epoch": 0.5145193290446702, "flos": 23330660238720.0, "grad_norm": 1.6273348067854687, "language_loss": 0.75168282, "learning_rate": 2.0031158853287558e-06, "loss": 0.77368653, "num_input_tokens_seen": 92405175, "step": 4279, "time_per_iteration": 3.671506643295288 }, { "auxiliary_loss_clip": 0.01178709, "auxiliary_loss_mlp": 0.01027238, "balance_loss_clip": 0.97738147, "balance_loss_mlp": 1.01995468, "epoch": 0.5146395719353093, "flos": 22856603518080.0, "grad_norm": 1.9174899422228897, "language_loss": 0.70648229, "learning_rate": 2.0023369144101593e-06, "loss": 0.72854173, "num_input_tokens_seen": 92423345, "step": 4280, "time_per_iteration": 2.683335304260254 }, { "auxiliary_loss_clip": 0.01165996, "auxiliary_loss_mlp": 0.01027297, "balance_loss_clip": 0.97265685, "balance_loss_mlp": 1.01995337, "epoch": 0.5147598148259485, "flos": 26391577616640.0, "grad_norm": 1.697058197190965, "language_loss": 0.77130777, "learning_rate": 2.0015579431370555e-06, "loss": 0.79324067, "num_input_tokens_seen": 92445025, "step": 4281, "time_per_iteration": 2.72682785987854 }, { "auxiliary_loss_clip": 0.01176787, "auxiliary_loss_mlp": 0.01025247, "balance_loss_clip": 1.0148741, "balance_loss_mlp": 1.01821947, "epoch": 0.5148800577165875, "flos": 29965694561280.0, "grad_norm": 1.9421942923411086, "language_loss": 0.69840968, "learning_rate": 2.000778971627612e-06, "loss": 0.72043002, "num_input_tokens_seen": 92464490, "step": 4282, "time_per_iteration": 3.684110641479492 }, { "auxiliary_loss_clip": 0.01166343, "auxiliary_loss_mlp": 0.01031477, "balance_loss_clip": 0.97165382, "balance_loss_mlp": 1.02422285, "epoch": 0.5150003006072266, "flos": 17931383470080.0, "grad_norm": 1.7862306386939601, "language_loss": 0.90436196, "learning_rate": 2e-06, "loss": 0.92634022, "num_input_tokens_seen": 92482085, "step": 4283, "time_per_iteration": 2.7229526042938232 }, { "auxiliary_loss_clip": 0.01178718, "auxiliary_loss_mlp": 0.01029636, "balance_loss_clip": 1.0538702, "balance_loss_mlp": 1.02217305, "epoch": 0.5151205434978657, "flos": 18478733892480.0, "grad_norm": 1.6610035961897442, "language_loss": 0.85774225, "learning_rate": 1.9992210283723878e-06, "loss": 0.87982577, "num_input_tokens_seen": 92499325, "step": 4284, "time_per_iteration": 2.636569023132324 }, { "auxiliary_loss_clip": 0.01178301, "auxiliary_loss_mlp": 0.01031817, "balance_loss_clip": 1.05365705, "balance_loss_mlp": 1.02470016, "epoch": 0.5152407863885048, "flos": 25341263003520.0, "grad_norm": 1.445576059830181, "language_loss": 0.7916826, "learning_rate": 1.9984420568629448e-06, "loss": 0.81378382, "num_input_tokens_seen": 92522090, "step": 4285, "time_per_iteration": 2.6551122665405273 }, { "auxiliary_loss_clip": 0.01179781, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 1.01461792, "balance_loss_mlp": 1.02153909, "epoch": 0.5153610292791438, "flos": 18329740277760.0, "grad_norm": 1.8488870237807173, "language_loss": 0.78474957, "learning_rate": 1.9976630855898405e-06, "loss": 0.80683655, "num_input_tokens_seen": 92539845, "step": 4286, "time_per_iteration": 2.601257562637329 }, { "auxiliary_loss_clip": 0.01168785, "auxiliary_loss_mlp": 0.01022997, "balance_loss_clip": 0.97039264, "balance_loss_mlp": 1.01597595, "epoch": 0.515481272169783, "flos": 30409945971840.0, "grad_norm": 1.9384856461165818, "language_loss": 0.74248642, "learning_rate": 1.9968841146712445e-06, "loss": 0.7644043, "num_input_tokens_seen": 92559460, "step": 4287, "time_per_iteration": 3.6052112579345703 }, { "auxiliary_loss_clip": 0.01169793, "auxiliary_loss_mlp": 0.01122991, "balance_loss_clip": 0.85937643, "balance_loss_mlp": 0.0, "epoch": 0.5156015150604221, "flos": 23037305863680.0, "grad_norm": 1.535040050006439, "language_loss": 0.71395516, "learning_rate": 1.996105144225326e-06, "loss": 0.73688304, "num_input_tokens_seen": 92579695, "step": 4288, "time_per_iteration": 2.7742114067077637 }, { "auxiliary_loss_clip": 0.01178202, "auxiliary_loss_mlp": 0.01026115, "balance_loss_clip": 1.01512802, "balance_loss_mlp": 1.01867676, "epoch": 0.5157217579510611, "flos": 17858556645120.0, "grad_norm": 1.8319600203841768, "language_loss": 0.78579277, "learning_rate": 1.995326174370254e-06, "loss": 0.80783594, "num_input_tokens_seen": 92598795, "step": 4289, "time_per_iteration": 3.5202107429504395 }, { "auxiliary_loss_clip": 0.01174917, "auxiliary_loss_mlp": 0.01122122, "balance_loss_clip": 1.0118556, "balance_loss_mlp": 0.0, "epoch": 0.5158420008417003, "flos": 19171486569600.0, "grad_norm": 1.9803702270840382, "language_loss": 0.72819269, "learning_rate": 1.994547205224197e-06, "loss": 0.75116313, "num_input_tokens_seen": 92617700, "step": 4290, "time_per_iteration": 2.6698014736175537 }, { "auxiliary_loss_clip": 0.01173679, "auxiliary_loss_mlp": 0.01024816, "balance_loss_clip": 0.97524524, "balance_loss_mlp": 1.01721668, "epoch": 0.5159622437323393, "flos": 22419534827520.0, "grad_norm": 1.9802525086978704, "language_loss": 0.6752255, "learning_rate": 1.993768236905325e-06, "loss": 0.69721043, "num_input_tokens_seen": 92638370, "step": 4291, "time_per_iteration": 2.6725354194641113 }, { "auxiliary_loss_clip": 0.01170656, "auxiliary_loss_mlp": 0.01028947, "balance_loss_clip": 0.97296959, "balance_loss_mlp": 1.02149022, "epoch": 0.5160824866229784, "flos": 24603010773120.0, "grad_norm": 2.555539599623418, "language_loss": 0.67067623, "learning_rate": 1.992989269531807e-06, "loss": 0.69267231, "num_input_tokens_seen": 92657180, "step": 4292, "time_per_iteration": 2.7176592350006104 }, { "auxiliary_loss_clip": 0.01172213, "auxiliary_loss_mlp": 0.01026406, "balance_loss_clip": 0.97189796, "balance_loss_mlp": 1.01876426, "epoch": 0.5162027295136175, "flos": 18002737837440.0, "grad_norm": 2.6984228881415673, "language_loss": 0.68003476, "learning_rate": 1.99221030322181e-06, "loss": 0.702021, "num_input_tokens_seen": 92673985, "step": 4293, "time_per_iteration": 2.6122829914093018 }, { "auxiliary_loss_clip": 0.01178886, "auxiliary_loss_mlp": 0.01029431, "balance_loss_clip": 0.97399884, "balance_loss_mlp": 1.02224886, "epoch": 0.5163229724042566, "flos": 27344611221120.0, "grad_norm": 1.4897291739503407, "language_loss": 0.8089906, "learning_rate": 1.991431338093505e-06, "loss": 0.83107382, "num_input_tokens_seen": 92696340, "step": 4294, "time_per_iteration": 2.763117790222168 }, { "auxiliary_loss_clip": 0.01175621, "auxiliary_loss_mlp": 0.01023368, "balance_loss_clip": 0.9762398, "balance_loss_mlp": 1.01626635, "epoch": 0.5164432152948957, "flos": 21762764599680.0, "grad_norm": 1.6535070299152925, "language_loss": 0.79044789, "learning_rate": 1.9906523742650587e-06, "loss": 0.81243777, "num_input_tokens_seen": 92715200, "step": 4295, "time_per_iteration": 2.743835210800171 }, { "auxiliary_loss_clip": 0.01179703, "auxiliary_loss_mlp": 0.01029656, "balance_loss_clip": 1.05120063, "balance_loss_mlp": 1.02130508, "epoch": 0.5165634581855347, "flos": 25550334115200.0, "grad_norm": 2.2995513560834997, "language_loss": 0.7732209, "learning_rate": 1.9898734118546397e-06, "loss": 0.79531455, "num_input_tokens_seen": 92735150, "step": 4296, "time_per_iteration": 2.729722499847412 }, { "auxiliary_loss_clip": 0.01173439, "auxiliary_loss_mlp": 0.01025853, "balance_loss_clip": 0.82013178, "balance_loss_mlp": 1.01805663, "epoch": 0.5166837010761739, "flos": 19901191363200.0, "grad_norm": 1.4444002969499907, "language_loss": 0.80289131, "learning_rate": 1.989094450980416e-06, "loss": 0.82488424, "num_input_tokens_seen": 92755250, "step": 4297, "time_per_iteration": 2.932920217514038 }, { "auxiliary_loss_clip": 0.01173491, "auxiliary_loss_mlp": 0.01028485, "balance_loss_clip": 1.01293957, "balance_loss_mlp": 1.0212338, "epoch": 0.516803943966813, "flos": 26646076454400.0, "grad_norm": 1.9109804843503875, "language_loss": 0.76519096, "learning_rate": 1.9883154917605556e-06, "loss": 0.7872107, "num_input_tokens_seen": 92774460, "step": 4298, "time_per_iteration": 2.7472758293151855 }, { "auxiliary_loss_clip": 0.01174939, "auxiliary_loss_mlp": 0.01023906, "balance_loss_clip": 1.04925799, "balance_loss_mlp": 1.01627648, "epoch": 0.516924186857452, "flos": 19682854542720.0, "grad_norm": 4.353622204573165, "language_loss": 0.83347309, "learning_rate": 1.9875365343132262e-06, "loss": 0.85546148, "num_input_tokens_seen": 92791580, "step": 4299, "time_per_iteration": 2.75736403465271 }, { "auxiliary_loss_clip": 0.01176175, "auxiliary_loss_mlp": 0.01122435, "balance_loss_clip": 1.01441908, "balance_loss_mlp": 0.0, "epoch": 0.5170444297480912, "flos": 15956583586560.0, "grad_norm": 2.1331192355035378, "language_loss": 0.84770143, "learning_rate": 1.9867575787565946e-06, "loss": 0.8706876, "num_input_tokens_seen": 92806240, "step": 4300, "time_per_iteration": 2.647461414337158 }, { "auxiliary_loss_clip": 0.01179988, "auxiliary_loss_mlp": 0.01034727, "balance_loss_clip": 1.01427197, "balance_loss_mlp": 1.02701163, "epoch": 0.5171646726387302, "flos": 14174157968640.0, "grad_norm": 2.70834214414083, "language_loss": 0.85720068, "learning_rate": 1.9859786252088275e-06, "loss": 0.8793478, "num_input_tokens_seen": 92823420, "step": 4301, "time_per_iteration": 2.6905131340026855 }, { "auxiliary_loss_clip": 0.01177313, "auxiliary_loss_mlp": 0.01030796, "balance_loss_clip": 0.93637347, "balance_loss_mlp": 1.0229702, "epoch": 0.5172849155293693, "flos": 23578550974080.0, "grad_norm": 3.290877381282161, "language_loss": 0.66673172, "learning_rate": 1.9851996737880914e-06, "loss": 0.68881279, "num_input_tokens_seen": 92838605, "step": 4302, "time_per_iteration": 2.723557710647583 }, { "auxiliary_loss_clip": 0.01183262, "auxiliary_loss_mlp": 0.01034758, "balance_loss_clip": 1.01480794, "balance_loss_mlp": 1.02721238, "epoch": 0.5174051584200084, "flos": 14283541860480.0, "grad_norm": 2.0278714867106222, "language_loss": 0.7438699, "learning_rate": 1.9844207246125537e-06, "loss": 0.7660501, "num_input_tokens_seen": 92855185, "step": 4303, "time_per_iteration": 2.604175329208374 }, { "auxiliary_loss_clip": 0.01172445, "auxiliary_loss_mlp": 0.01030983, "balance_loss_clip": 0.97394109, "balance_loss_mlp": 1.02371693, "epoch": 0.5175254013106475, "flos": 37889384192640.0, "grad_norm": 1.653951880844074, "language_loss": 0.68622768, "learning_rate": 1.983641777800379e-06, "loss": 0.70826197, "num_input_tokens_seen": 92877830, "step": 4304, "time_per_iteration": 2.814754009246826 }, { "auxiliary_loss_clip": 0.01090016, "auxiliary_loss_mlp": 0.00999815, "balance_loss_clip": 0.95070779, "balance_loss_mlp": 0.99801457, "epoch": 0.5176456442012866, "flos": 68549737829760.0, "grad_norm": 0.7484159707873345, "language_loss": 0.58757734, "learning_rate": 1.9828628334697343e-06, "loss": 0.60847563, "num_input_tokens_seen": 92945040, "step": 4305, "time_per_iteration": 4.397078514099121 }, { "auxiliary_loss_clip": 0.01091933, "auxiliary_loss_mlp": 0.01001577, "balance_loss_clip": 0.95117408, "balance_loss_mlp": 0.99969333, "epoch": 0.5177658870919257, "flos": 64084137235200.0, "grad_norm": 0.769065538021447, "language_loss": 0.54725111, "learning_rate": 1.982083891738784e-06, "loss": 0.56818616, "num_input_tokens_seen": 93005910, "step": 4306, "time_per_iteration": 3.2814226150512695 }, { "auxiliary_loss_clip": 0.01171589, "auxiliary_loss_mlp": 0.01028167, "balance_loss_clip": 0.9768306, "balance_loss_mlp": 1.02096415, "epoch": 0.5178861299825648, "flos": 26651248012800.0, "grad_norm": 1.5666143895383697, "language_loss": 0.82597184, "learning_rate": 1.9813049527256923e-06, "loss": 0.84796941, "num_input_tokens_seen": 93026305, "step": 4307, "time_per_iteration": 2.758751392364502 }, { "auxiliary_loss_clip": 0.01168189, "auxiliary_loss_mlp": 0.01034653, "balance_loss_clip": 0.9321906, "balance_loss_mlp": 1.02709532, "epoch": 0.5180063728732038, "flos": 17931886260480.0, "grad_norm": 2.0736288974757437, "language_loss": 0.8195858, "learning_rate": 1.9805260165486252e-06, "loss": 0.84161425, "num_input_tokens_seen": 93045675, "step": 4308, "time_per_iteration": 3.6291232109069824 }, { "auxiliary_loss_clip": 0.01177246, "auxiliary_loss_mlp": 0.0102787, "balance_loss_clip": 1.0140121, "balance_loss_mlp": 1.02056563, "epoch": 0.518126615763843, "flos": 19500895221120.0, "grad_norm": 1.9558940119797772, "language_loss": 0.86238205, "learning_rate": 1.9797470833257457e-06, "loss": 0.88443315, "num_input_tokens_seen": 93065375, "step": 4309, "time_per_iteration": 2.6591126918792725 }, { "auxiliary_loss_clip": 0.01178595, "auxiliary_loss_mlp": 0.0103025, "balance_loss_clip": 1.01629508, "balance_loss_mlp": 1.02219748, "epoch": 0.5182468586544821, "flos": 20704082117760.0, "grad_norm": 3.9317112575735242, "language_loss": 0.77405655, "learning_rate": 1.9789681531752177e-06, "loss": 0.79614496, "num_input_tokens_seen": 93085595, "step": 4310, "time_per_iteration": 2.747605800628662 }, { "auxiliary_loss_clip": 0.0117017, "auxiliary_loss_mlp": 0.01031653, "balance_loss_clip": 0.89828956, "balance_loss_mlp": 1.02431524, "epoch": 0.5183671015451211, "flos": 23112107936640.0, "grad_norm": 1.48962459162743, "language_loss": 0.72425586, "learning_rate": 1.978189226215204e-06, "loss": 0.74627405, "num_input_tokens_seen": 93106140, "step": 4311, "time_per_iteration": 2.7882142066955566 }, { "auxiliary_loss_clip": 0.01178931, "auxiliary_loss_mlp": 0.01029118, "balance_loss_clip": 1.05298448, "balance_loss_mlp": 1.02159607, "epoch": 0.5184873444357603, "flos": 17597090568960.0, "grad_norm": 2.19900396706331, "language_loss": 0.76929373, "learning_rate": 1.9774103025638675e-06, "loss": 0.79137421, "num_input_tokens_seen": 93124265, "step": 4312, "time_per_iteration": 2.6668152809143066 }, { "auxiliary_loss_clip": 0.01178333, "auxiliary_loss_mlp": 0.01028606, "balance_loss_clip": 0.90363872, "balance_loss_mlp": 1.02017176, "epoch": 0.5186075873263993, "flos": 24936800883840.0, "grad_norm": 1.4648744228335915, "language_loss": 0.76308441, "learning_rate": 1.9766313823393696e-06, "loss": 0.78515381, "num_input_tokens_seen": 93145130, "step": 4313, "time_per_iteration": 3.682837724685669 }, { "auxiliary_loss_clip": 0.01163656, "auxiliary_loss_mlp": 0.01027673, "balance_loss_clip": 0.89436734, "balance_loss_mlp": 1.02025819, "epoch": 0.5187278302170384, "flos": 15190106244480.0, "grad_norm": 3.629076597650621, "language_loss": 0.6962921, "learning_rate": 1.975852465659873e-06, "loss": 0.71820545, "num_input_tokens_seen": 93161110, "step": 4314, "time_per_iteration": 2.6838808059692383 }, { "auxiliary_loss_clip": 0.01178774, "auxiliary_loss_mlp": 0.01026223, "balance_loss_clip": 1.01472199, "balance_loss_mlp": 1.01782489, "epoch": 0.5188480731076776, "flos": 25009412227200.0, "grad_norm": 2.311756960680654, "language_loss": 0.70590925, "learning_rate": 1.9750735526435377e-06, "loss": 0.72795916, "num_input_tokens_seen": 93178055, "step": 4315, "time_per_iteration": 3.5766732692718506 }, { "auxiliary_loss_clip": 0.01179019, "auxiliary_loss_mlp": 0.01028102, "balance_loss_clip": 0.97780573, "balance_loss_mlp": 1.02063906, "epoch": 0.5189683159983166, "flos": 24790141653120.0, "grad_norm": 2.3403136391261072, "language_loss": 0.79057944, "learning_rate": 1.974294643408525e-06, "loss": 0.81265068, "num_input_tokens_seen": 93195850, "step": 4316, "time_per_iteration": 2.6998822689056396 }, { "auxiliary_loss_clip": 0.01181803, "auxiliary_loss_mlp": 0.01031798, "balance_loss_clip": 1.01372361, "balance_loss_mlp": 1.02378154, "epoch": 0.5190885588889557, "flos": 24754266944640.0, "grad_norm": 1.7480174116692258, "language_loss": 0.67125463, "learning_rate": 1.9735157380729947e-06, "loss": 0.69339061, "num_input_tokens_seen": 93216260, "step": 4317, "time_per_iteration": 2.6633734703063965 }, { "auxiliary_loss_clip": 0.01179383, "auxiliary_loss_mlp": 0.01030257, "balance_loss_clip": 0.97412264, "balance_loss_mlp": 1.02251124, "epoch": 0.5192088017795948, "flos": 24712646060160.0, "grad_norm": 1.7177461205122788, "language_loss": 0.841268, "learning_rate": 1.9727368367551053e-06, "loss": 0.8633644, "num_input_tokens_seen": 93234810, "step": 4318, "time_per_iteration": 2.7499139308929443 }, { "auxiliary_loss_clip": 0.01163588, "auxiliary_loss_mlp": 0.01022552, "balance_loss_clip": 0.97220999, "balance_loss_mlp": 1.01494074, "epoch": 0.5193290446702339, "flos": 27229588894080.0, "grad_norm": 1.900224284220436, "language_loss": 0.68360907, "learning_rate": 1.9719579395730164e-06, "loss": 0.70547044, "num_input_tokens_seen": 93254185, "step": 4319, "time_per_iteration": 2.675508499145508 }, { "auxiliary_loss_clip": 0.01180397, "auxiliary_loss_mlp": 0.0102694, "balance_loss_clip": 1.05421185, "balance_loss_mlp": 1.01904273, "epoch": 0.5194492875608729, "flos": 11473352392320.0, "grad_norm": 1.9945132451199694, "language_loss": 0.93588167, "learning_rate": 1.9711790466448854e-06, "loss": 0.95795512, "num_input_tokens_seen": 93268205, "step": 4320, "time_per_iteration": 2.6070525646209717 }, { "auxiliary_loss_clip": 0.01176379, "auxiliary_loss_mlp": 0.01025479, "balance_loss_clip": 0.90027976, "balance_loss_mlp": 1.0180105, "epoch": 0.5195695304515121, "flos": 20338906498560.0, "grad_norm": 2.5787110082102727, "language_loss": 0.71316063, "learning_rate": 1.9704001580888704e-06, "loss": 0.73517919, "num_input_tokens_seen": 93286945, "step": 4321, "time_per_iteration": 2.7461137771606445 }, { "auxiliary_loss_clip": 0.01168789, "auxiliary_loss_mlp": 0.01122826, "balance_loss_clip": 0.97282833, "balance_loss_mlp": 0.0, "epoch": 0.5196897733421512, "flos": 20048317470720.0, "grad_norm": 1.8865955182023915, "language_loss": 0.86971796, "learning_rate": 1.9696212740231283e-06, "loss": 0.89263415, "num_input_tokens_seen": 93305595, "step": 4322, "time_per_iteration": 2.7411842346191406 }, { "auxiliary_loss_clip": 0.01184071, "auxiliary_loss_mlp": 0.01029889, "balance_loss_clip": 1.01391768, "balance_loss_mlp": 1.02141333, "epoch": 0.5198100162327902, "flos": 23805507058560.0, "grad_norm": 2.2033987745399233, "language_loss": 0.82105446, "learning_rate": 1.9688423945658146e-06, "loss": 0.84319401, "num_input_tokens_seen": 93326460, "step": 4323, "time_per_iteration": 2.772435188293457 }, { "auxiliary_loss_clip": 0.01158958, "auxiliary_loss_mlp": 0.01024925, "balance_loss_clip": 0.89286077, "balance_loss_mlp": 1.01652706, "epoch": 0.5199302591234293, "flos": 24023951619840.0, "grad_norm": 2.0438417962990036, "language_loss": 0.72242415, "learning_rate": 1.9680635198350845e-06, "loss": 0.74426299, "num_input_tokens_seen": 93346170, "step": 4324, "time_per_iteration": 2.804072856903076 }, { "auxiliary_loss_clip": 0.01175776, "auxiliary_loss_mlp": 0.01032033, "balance_loss_clip": 1.0124011, "balance_loss_mlp": 1.02389669, "epoch": 0.5200505020140684, "flos": 26359366095360.0, "grad_norm": 2.333424879491109, "language_loss": 0.72590202, "learning_rate": 1.967284649949093e-06, "loss": 0.74798012, "num_input_tokens_seen": 93365380, "step": 4325, "time_per_iteration": 2.713942527770996 }, { "auxiliary_loss_clip": 0.01172238, "auxiliary_loss_mlp": 0.01026092, "balance_loss_clip": 0.93433338, "balance_loss_mlp": 1.01834941, "epoch": 0.5201707449047075, "flos": 39604262284800.0, "grad_norm": 1.7501963605301936, "language_loss": 0.72490686, "learning_rate": 1.966505785025994e-06, "loss": 0.74689019, "num_input_tokens_seen": 93387285, "step": 4326, "time_per_iteration": 2.8717200756073 }, { "auxiliary_loss_clip": 0.01178218, "auxiliary_loss_mlp": 0.01025885, "balance_loss_clip": 0.93953723, "balance_loss_mlp": 1.01850033, "epoch": 0.5202909877953465, "flos": 53682788292480.0, "grad_norm": 1.6691924197788552, "language_loss": 0.76169789, "learning_rate": 1.965726925183941e-06, "loss": 0.78373897, "num_input_tokens_seen": 93410390, "step": 4327, "time_per_iteration": 3.0347158908843994 }, { "auxiliary_loss_clip": 0.01180704, "auxiliary_loss_mlp": 0.01028684, "balance_loss_clip": 1.05438709, "balance_loss_mlp": 1.02076221, "epoch": 0.5204112306859857, "flos": 19537021324800.0, "grad_norm": 2.1019234710913643, "language_loss": 0.8516618, "learning_rate": 1.964948070541087e-06, "loss": 0.87375569, "num_input_tokens_seen": 93429050, "step": 4328, "time_per_iteration": 2.563976764678955 }, { "auxiliary_loss_clip": 0.01166117, "auxiliary_loss_mlp": 0.01027917, "balance_loss_clip": 1.01042032, "balance_loss_mlp": 1.02069306, "epoch": 0.5205314735766248, "flos": 15304697608320.0, "grad_norm": 2.4933251732386688, "language_loss": 0.69759655, "learning_rate": 1.9641692212155816e-06, "loss": 0.7195369, "num_input_tokens_seen": 93446815, "step": 4329, "time_per_iteration": 2.543768882751465 }, { "auxiliary_loss_clip": 0.01172739, "auxiliary_loss_mlp": 0.01028901, "balance_loss_clip": 0.90010333, "balance_loss_mlp": 1.02145648, "epoch": 0.5206517164672638, "flos": 59263701160320.0, "grad_norm": 1.8704270337626112, "language_loss": 0.72457075, "learning_rate": 1.9633903773255777e-06, "loss": 0.74658716, "num_input_tokens_seen": 93469130, "step": 4330, "time_per_iteration": 2.957411050796509 }, { "auxiliary_loss_clip": 0.01175257, "auxiliary_loss_mlp": 0.01023226, "balance_loss_clip": 1.05007708, "balance_loss_mlp": 1.01584959, "epoch": 0.520771959357903, "flos": 26871129118080.0, "grad_norm": 2.789276558065326, "language_loss": 0.74772382, "learning_rate": 1.9626115389892237e-06, "loss": 0.76970863, "num_input_tokens_seen": 93489920, "step": 4331, "time_per_iteration": 3.6269278526306152 }, { "auxiliary_loss_clip": 0.01183136, "auxiliary_loss_mlp": 0.01027799, "balance_loss_clip": 0.93784517, "balance_loss_mlp": 1.02074456, "epoch": 0.520892202248542, "flos": 26907075653760.0, "grad_norm": 2.3886525553182194, "language_loss": 0.85217047, "learning_rate": 1.96183270632467e-06, "loss": 0.87427986, "num_input_tokens_seen": 93509770, "step": 4332, "time_per_iteration": 2.7383079528808594 }, { "auxiliary_loss_clip": 0.01168711, "auxiliary_loss_mlp": 0.01123518, "balance_loss_clip": 0.93566012, "balance_loss_mlp": 0.0, "epoch": 0.5210124451391811, "flos": 25849434666240.0, "grad_norm": 1.5347495296629199, "language_loss": 0.78474623, "learning_rate": 1.9610538794500644e-06, "loss": 0.80766845, "num_input_tokens_seen": 93529320, "step": 4333, "time_per_iteration": 2.680722951889038 }, { "auxiliary_loss_clip": 0.01091371, "auxiliary_loss_mlp": 0.00999692, "balance_loss_clip": 0.91310334, "balance_loss_mlp": 0.99796355, "epoch": 0.5211326880298203, "flos": 70553804319360.0, "grad_norm": 0.7816928599084583, "language_loss": 0.59465575, "learning_rate": 1.9602750584835542e-06, "loss": 0.61556637, "num_input_tokens_seen": 93595255, "step": 4334, "time_per_iteration": 4.355034589767456 }, { "auxiliary_loss_clip": 0.01170178, "auxiliary_loss_mlp": 0.01022988, "balance_loss_clip": 0.97304869, "balance_loss_mlp": 1.015293, "epoch": 0.5212529309204593, "flos": 15628898787840.0, "grad_norm": 2.0110455018985482, "language_loss": 0.82437193, "learning_rate": 1.959496243543286e-06, "loss": 0.84630358, "num_input_tokens_seen": 93613135, "step": 4335, "time_per_iteration": 2.69643235206604 }, { "auxiliary_loss_clip": 0.0118456, "auxiliary_loss_mlp": 0.01033014, "balance_loss_clip": 1.01962459, "balance_loss_mlp": 1.02493453, "epoch": 0.5213731738110984, "flos": 26242655829120.0, "grad_norm": 3.092673441619775, "language_loss": 0.79454607, "learning_rate": 1.9587174347474057e-06, "loss": 0.8167218, "num_input_tokens_seen": 93629645, "step": 4336, "time_per_iteration": 2.624083995819092 }, { "auxiliary_loss_clip": 0.01158446, "auxiliary_loss_mlp": 0.01027608, "balance_loss_clip": 0.89643639, "balance_loss_mlp": 1.01973474, "epoch": 0.5214934167017375, "flos": 19418407637760.0, "grad_norm": 2.2372829952472872, "language_loss": 0.81484044, "learning_rate": 1.9579386322140574e-06, "loss": 0.83670104, "num_input_tokens_seen": 93645325, "step": 4337, "time_per_iteration": 2.6822011470794678 }, { "auxiliary_loss_clip": 0.01181033, "auxiliary_loss_mlp": 0.01123364, "balance_loss_clip": 1.05366194, "balance_loss_mlp": 0.0, "epoch": 0.5216136595923766, "flos": 30955788023040.0, "grad_norm": 1.6317698267914824, "language_loss": 0.80802232, "learning_rate": 1.9571598360613854e-06, "loss": 0.83106625, "num_input_tokens_seen": 93668200, "step": 4338, "time_per_iteration": 3.5818612575531006 }, { "auxiliary_loss_clip": 0.01158855, "auxiliary_loss_mlp": 0.01030218, "balance_loss_clip": 0.97084397, "balance_loss_mlp": 1.02240336, "epoch": 0.5217339024830157, "flos": 21945047143680.0, "grad_norm": 2.434741172168486, "language_loss": 0.69512427, "learning_rate": 1.956381046407532e-06, "loss": 0.71701509, "num_input_tokens_seen": 93688495, "step": 4339, "time_per_iteration": 2.7170064449310303 }, { "auxiliary_loss_clip": 0.01173163, "auxiliary_loss_mlp": 0.01033077, "balance_loss_clip": 0.93801016, "balance_loss_mlp": 1.02511966, "epoch": 0.5218541453736548, "flos": 20923209037440.0, "grad_norm": 1.720546853573058, "language_loss": 0.86185175, "learning_rate": 1.9556022633706394e-06, "loss": 0.88391411, "num_input_tokens_seen": 93707285, "step": 4340, "time_per_iteration": 2.7344837188720703 }, { "auxiliary_loss_clip": 0.01167584, "auxiliary_loss_mlp": 0.01026193, "balance_loss_clip": 0.97415829, "balance_loss_mlp": 1.01827168, "epoch": 0.5219743882642939, "flos": 23951663498880.0, "grad_norm": 1.662730923965574, "language_loss": 0.79936087, "learning_rate": 1.954823487068848e-06, "loss": 0.8212986, "num_input_tokens_seen": 93727495, "step": 4341, "time_per_iteration": 2.8685672283172607 }, { "auxiliary_loss_clip": 0.01181577, "auxiliary_loss_mlp": 0.01024929, "balance_loss_clip": 1.01782131, "balance_loss_mlp": 1.0173738, "epoch": 0.5220946311549329, "flos": 28799280213120.0, "grad_norm": 1.6249218739591074, "language_loss": 0.8109321, "learning_rate": 1.9540447176202976e-06, "loss": 0.8329972, "num_input_tokens_seen": 93748740, "step": 4342, "time_per_iteration": 3.64652419090271 }, { "auxiliary_loss_clip": 0.01087262, "auxiliary_loss_mlp": 0.01006132, "balance_loss_clip": 0.99100769, "balance_loss_mlp": 1.00427198, "epoch": 0.5222148740455721, "flos": 67189369017600.0, "grad_norm": 0.8652006369067357, "language_loss": 0.60650182, "learning_rate": 1.9532659551431272e-06, "loss": 0.6274358, "num_input_tokens_seen": 93815770, "step": 4343, "time_per_iteration": 3.3811287879943848 }, { "auxiliary_loss_clip": 0.01180529, "auxiliary_loss_mlp": 0.01026385, "balance_loss_clip": 1.01410294, "balance_loss_mlp": 1.01922691, "epoch": 0.5223351169362112, "flos": 61856164339200.0, "grad_norm": 1.4998929289921235, "language_loss": 0.67456067, "learning_rate": 1.9524871997554744e-06, "loss": 0.69662976, "num_input_tokens_seen": 93843530, "step": 4344, "time_per_iteration": 3.0321006774902344 }, { "auxiliary_loss_clip": 0.01181006, "auxiliary_loss_mlp": 0.01022241, "balance_loss_clip": 1.01569271, "balance_loss_mlp": 1.01508224, "epoch": 0.5224553598268502, "flos": 14647388676480.0, "grad_norm": 2.492310964445858, "language_loss": 0.80399632, "learning_rate": 1.951708451575475e-06, "loss": 0.82602882, "num_input_tokens_seen": 93860595, "step": 4345, "time_per_iteration": 2.6933398246765137 }, { "auxiliary_loss_clip": 0.0118338, "auxiliary_loss_mlp": 0.01027206, "balance_loss_clip": 0.93629849, "balance_loss_mlp": 1.019207, "epoch": 0.5225756027174894, "flos": 14826043946880.0, "grad_norm": 3.8707993624890986, "language_loss": 0.81701618, "learning_rate": 1.9509297107212657e-06, "loss": 0.83912206, "num_input_tokens_seen": 93877365, "step": 4346, "time_per_iteration": 2.724191904067993 }, { "auxiliary_loss_clip": 0.01179054, "auxiliary_loss_mlp": 0.01030136, "balance_loss_clip": 1.05302596, "balance_loss_mlp": 1.02246523, "epoch": 0.5226958456081284, "flos": 23512009029120.0, "grad_norm": 1.5941892535041438, "language_loss": 0.79091895, "learning_rate": 1.95015097731098e-06, "loss": 0.81301081, "num_input_tokens_seen": 93896855, "step": 4347, "time_per_iteration": 2.660287857055664 }, { "auxiliary_loss_clip": 0.01180407, "auxiliary_loss_mlp": 0.01026168, "balance_loss_clip": 1.05348158, "balance_loss_mlp": 1.01826429, "epoch": 0.5228160884987675, "flos": 19062928690560.0, "grad_norm": 2.133662448534502, "language_loss": 0.82076371, "learning_rate": 1.949372251462751e-06, "loss": 0.84282947, "num_input_tokens_seen": 93914270, "step": 4348, "time_per_iteration": 2.5753071308135986 }, { "auxiliary_loss_clip": 0.0117696, "auxiliary_loss_mlp": 0.01122569, "balance_loss_clip": 0.93994159, "balance_loss_mlp": 0.0, "epoch": 0.5229363313894067, "flos": 21063224252160.0, "grad_norm": 1.9113156457226155, "language_loss": 0.82838142, "learning_rate": 1.9485935332947124e-06, "loss": 0.85137677, "num_input_tokens_seen": 93932180, "step": 4349, "time_per_iteration": 2.732783079147339 }, { "auxiliary_loss_clip": 0.01171996, "auxiliary_loss_mlp": 0.01025927, "balance_loss_clip": 0.97592509, "balance_loss_mlp": 1.01897085, "epoch": 0.5230565742800457, "flos": 14830389492480.0, "grad_norm": 2.5406718035790457, "language_loss": 0.83556551, "learning_rate": 1.947814822924993e-06, "loss": 0.85754472, "num_input_tokens_seen": 93949690, "step": 4350, "time_per_iteration": 2.7248618602752686 }, { "auxiliary_loss_clip": 0.01177062, "auxiliary_loss_mlp": 0.01030309, "balance_loss_clip": 1.05264175, "balance_loss_mlp": 1.02271521, "epoch": 0.5231768171706848, "flos": 25813021253760.0, "grad_norm": 2.501870196584648, "language_loss": 0.82864761, "learning_rate": 1.9470361204717236e-06, "loss": 0.85072136, "num_input_tokens_seen": 93968830, "step": 4351, "time_per_iteration": 2.674142599105835 }, { "auxiliary_loss_clip": 0.01175015, "auxiliary_loss_mlp": 0.01122993, "balance_loss_clip": 0.93492991, "balance_loss_mlp": 0.0, "epoch": 0.5232970600613239, "flos": 22743807834240.0, "grad_norm": 1.477472397513293, "language_loss": 0.80619895, "learning_rate": 1.9462574260530326e-06, "loss": 0.82917905, "num_input_tokens_seen": 93989110, "step": 4352, "time_per_iteration": 2.7383675575256348 }, { "auxiliary_loss_clip": 0.01166566, "auxiliary_loss_mlp": 0.01031211, "balance_loss_clip": 1.01114297, "balance_loss_mlp": 1.02322364, "epoch": 0.523417302951963, "flos": 17310703432320.0, "grad_norm": 1.6576095478244668, "language_loss": 0.81021047, "learning_rate": 1.9454787397870472e-06, "loss": 0.83218819, "num_input_tokens_seen": 94006430, "step": 4353, "time_per_iteration": 2.699814558029175 }, { "auxiliary_loss_clip": 0.01165578, "auxiliary_loss_mlp": 0.01030231, "balance_loss_clip": 0.86271572, "balance_loss_mlp": 1.02269697, "epoch": 0.523537545842602, "flos": 18551740285440.0, "grad_norm": 1.890209048792157, "language_loss": 0.7199707, "learning_rate": 1.944700061791894e-06, "loss": 0.74192882, "num_input_tokens_seen": 94024825, "step": 4354, "time_per_iteration": 2.7492213249206543 }, { "auxiliary_loss_clip": 0.01176858, "auxiliary_loss_mlp": 0.01024826, "balance_loss_clip": 1.01526785, "balance_loss_mlp": 1.01762581, "epoch": 0.5236577887332411, "flos": 19719267955200.0, "grad_norm": 3.545823873757292, "language_loss": 0.65523815, "learning_rate": 1.943921392185698e-06, "loss": 0.67725492, "num_input_tokens_seen": 94043450, "step": 4355, "time_per_iteration": 2.7256462574005127 }, { "auxiliary_loss_clip": 0.01177279, "auxiliary_loss_mlp": 0.01030243, "balance_loss_clip": 0.97266304, "balance_loss_mlp": 1.02239299, "epoch": 0.5237780316238803, "flos": 23550218121600.0, "grad_norm": 2.24940457745536, "language_loss": 0.77112913, "learning_rate": 1.9431427310865814e-06, "loss": 0.79320443, "num_input_tokens_seen": 94063055, "step": 4356, "time_per_iteration": 2.733116865158081 }, { "auxiliary_loss_clip": 0.011553, "auxiliary_loss_mlp": 0.01027508, "balance_loss_clip": 0.93475264, "balance_loss_mlp": 1.0202837, "epoch": 0.5238982745145193, "flos": 22491894775680.0, "grad_norm": 1.5757638370734182, "language_loss": 0.78539431, "learning_rate": 1.942364078612667e-06, "loss": 0.80722243, "num_input_tokens_seen": 94081785, "step": 4357, "time_per_iteration": 3.7279486656188965 }, { "auxiliary_loss_clip": 0.0118048, "auxiliary_loss_mlp": 0.01029619, "balance_loss_clip": 0.93636203, "balance_loss_mlp": 1.02204943, "epoch": 0.5240185174051584, "flos": 27088927234560.0, "grad_norm": 4.577215436667188, "language_loss": 0.75568867, "learning_rate": 1.9415854348820765e-06, "loss": 0.77778965, "num_input_tokens_seen": 94101635, "step": 4358, "time_per_iteration": 2.7688474655151367 }, { "auxiliary_loss_clip": 0.01182614, "auxiliary_loss_mlp": 0.01031648, "balance_loss_clip": 1.01402807, "balance_loss_mlp": 1.02367306, "epoch": 0.5241387602957975, "flos": 22674680110080.0, "grad_norm": 2.2375645668596227, "language_loss": 0.68396622, "learning_rate": 1.940806800012929e-06, "loss": 0.70610881, "num_input_tokens_seen": 94121705, "step": 4359, "time_per_iteration": 3.6187164783477783 }, { "auxiliary_loss_clip": 0.01167441, "auxiliary_loss_mlp": 0.0112308, "balance_loss_clip": 0.89845902, "balance_loss_mlp": 0.0, "epoch": 0.5242590031864366, "flos": 40553453134080.0, "grad_norm": 1.6165067496927286, "language_loss": 0.63653052, "learning_rate": 1.9400281741233432e-06, "loss": 0.65943575, "num_input_tokens_seen": 94146595, "step": 4360, "time_per_iteration": 2.872763156890869 }, { "auxiliary_loss_clip": 0.01085876, "auxiliary_loss_mlp": 0.01007317, "balance_loss_clip": 0.9149453, "balance_loss_mlp": 1.00537348, "epoch": 0.5243792460770756, "flos": 66676313105280.0, "grad_norm": 0.6966657677299528, "language_loss": 0.52519292, "learning_rate": 1.939249557331435e-06, "loss": 0.54612482, "num_input_tokens_seen": 94212410, "step": 4361, "time_per_iteration": 3.3709123134613037 }, { "auxiliary_loss_clip": 0.01182499, "auxiliary_loss_mlp": 0.01029683, "balance_loss_clip": 0.93718481, "balance_loss_mlp": 1.02216101, "epoch": 0.5244994889677148, "flos": 28183663992960.0, "grad_norm": 1.7502323169315994, "language_loss": 0.72812986, "learning_rate": 1.938470949755321e-06, "loss": 0.75025165, "num_input_tokens_seen": 94232290, "step": 4362, "time_per_iteration": 2.8081305027008057 }, { "auxiliary_loss_clip": 0.01085924, "auxiliary_loss_mlp": 0.01005035, "balance_loss_clip": 0.90960675, "balance_loss_mlp": 1.0032109, "epoch": 0.5246197318583539, "flos": 65950379239680.0, "grad_norm": 0.8126602832668435, "language_loss": 0.55722177, "learning_rate": 1.937692351513115e-06, "loss": 0.57813138, "num_input_tokens_seen": 94291285, "step": 4363, "time_per_iteration": 3.2566795349121094 }, { "auxiliary_loss_clip": 0.0118049, "auxiliary_loss_mlp": 0.01023735, "balance_loss_clip": 1.01332235, "balance_loss_mlp": 1.01630282, "epoch": 0.5247399747489929, "flos": 21033490769280.0, "grad_norm": 1.5984622930130652, "language_loss": 0.80709809, "learning_rate": 1.9369137627229297e-06, "loss": 0.82914037, "num_input_tokens_seen": 94309685, "step": 4364, "time_per_iteration": 3.431626558303833 }, { "auxiliary_loss_clip": 0.01174751, "auxiliary_loss_mlp": 0.01033009, "balance_loss_clip": 1.01470923, "balance_loss_mlp": 1.02593958, "epoch": 0.5248602176396321, "flos": 19025940660480.0, "grad_norm": 1.9077496651667032, "language_loss": 0.88154721, "learning_rate": 1.936135183502877e-06, "loss": 0.90362489, "num_input_tokens_seen": 94326985, "step": 4365, "time_per_iteration": 2.6721458435058594 }, { "auxiliary_loss_clip": 0.01177872, "auxiliary_loss_mlp": 0.0103175, "balance_loss_clip": 0.93672401, "balance_loss_mlp": 1.0230062, "epoch": 0.5249804605302711, "flos": 22200084685440.0, "grad_norm": 2.148995573669631, "language_loss": 0.80592799, "learning_rate": 1.935356613971066e-06, "loss": 0.82802415, "num_input_tokens_seen": 94347645, "step": 4366, "time_per_iteration": 2.700730085372925 }, { "auxiliary_loss_clip": 0.01173514, "auxiliary_loss_mlp": 0.01123161, "balance_loss_clip": 0.97444946, "balance_loss_mlp": 0.0, "epoch": 0.5251007034209102, "flos": 23805686626560.0, "grad_norm": 1.9964905930856405, "language_loss": 0.7704711, "learning_rate": 1.9345780542456047e-06, "loss": 0.79343784, "num_input_tokens_seen": 94367020, "step": 4367, "time_per_iteration": 2.7576687335968018 }, { "auxiliary_loss_clip": 0.01163644, "auxiliary_loss_mlp": 0.01023556, "balance_loss_clip": 1.0105269, "balance_loss_mlp": 1.01557541, "epoch": 0.5252209463115494, "flos": 23294605962240.0, "grad_norm": 1.8668590325051946, "language_loss": 0.72093177, "learning_rate": 1.9337995044446007e-06, "loss": 0.74280381, "num_input_tokens_seen": 94385860, "step": 4368, "time_per_iteration": 3.5544803142547607 }, { "auxiliary_loss_clip": 0.01181347, "auxiliary_loss_mlp": 0.01027073, "balance_loss_clip": 1.01424801, "balance_loss_mlp": 1.01975906, "epoch": 0.5253411892021884, "flos": 19828687760640.0, "grad_norm": 1.8109847690591856, "language_loss": 0.79553604, "learning_rate": 1.9330209646861596e-06, "loss": 0.81762016, "num_input_tokens_seen": 94405010, "step": 4369, "time_per_iteration": 2.6171348094940186 }, { "auxiliary_loss_clip": 0.01174184, "auxiliary_loss_mlp": 0.01026323, "balance_loss_clip": 0.97550207, "balance_loss_mlp": 1.0189383, "epoch": 0.5254614320928275, "flos": 24133730561280.0, "grad_norm": 1.6473751831898273, "language_loss": 0.7765801, "learning_rate": 1.9322424350883843e-06, "loss": 0.79858518, "num_input_tokens_seen": 94426845, "step": 4370, "time_per_iteration": 2.708477735519409 }, { "auxiliary_loss_clip": 0.01175221, "auxiliary_loss_mlp": 0.01028974, "balance_loss_clip": 0.97358888, "balance_loss_mlp": 1.02132106, "epoch": 0.5255816749834666, "flos": 24644954880000.0, "grad_norm": 1.6977638307471823, "language_loss": 0.78564858, "learning_rate": 1.931463915769379e-06, "loss": 0.80769056, "num_input_tokens_seen": 94446960, "step": 4371, "time_per_iteration": 2.7168350219726562 }, { "auxiliary_loss_clip": 0.0116953, "auxiliary_loss_mlp": 0.01029852, "balance_loss_clip": 0.89644337, "balance_loss_mlp": 1.02187657, "epoch": 0.5257019178741057, "flos": 14136595320960.0, "grad_norm": 2.1503138058911753, "language_loss": 0.73989093, "learning_rate": 1.930685406847242e-06, "loss": 0.76188475, "num_input_tokens_seen": 94461535, "step": 4372, "time_per_iteration": 2.678866386413574 }, { "auxiliary_loss_clip": 0.01175568, "auxiliary_loss_mlp": 0.01024404, "balance_loss_clip": 0.97569513, "balance_loss_mlp": 1.01723051, "epoch": 0.5258221607647448, "flos": 23548961145600.0, "grad_norm": 1.4332653609072659, "language_loss": 0.81614363, "learning_rate": 1.9299069084400734e-06, "loss": 0.83814335, "num_input_tokens_seen": 94482395, "step": 4373, "time_per_iteration": 2.724531650543213 }, { "auxiliary_loss_clip": 0.0117017, "auxiliary_loss_mlp": 0.01026667, "balance_loss_clip": 0.93819243, "balance_loss_mlp": 1.0186857, "epoch": 0.5259424036553839, "flos": 24966103403520.0, "grad_norm": 1.9879200875964933, "language_loss": 0.6982283, "learning_rate": 1.9291284206659717e-06, "loss": 0.72019666, "num_input_tokens_seen": 94500580, "step": 4374, "time_per_iteration": 2.7473948001861572 }, { "auxiliary_loss_clip": 0.01178108, "auxiliary_loss_mlp": 0.01029398, "balance_loss_clip": 1.05282736, "balance_loss_mlp": 1.02147079, "epoch": 0.526062646546023, "flos": 28763908295040.0, "grad_norm": 1.9823999554207854, "language_loss": 0.71487194, "learning_rate": 1.928349943643032e-06, "loss": 0.736947, "num_input_tokens_seen": 94519680, "step": 4375, "time_per_iteration": 2.6388959884643555 }, { "auxiliary_loss_clip": 0.01175116, "auxiliary_loss_mlp": 0.01026575, "balance_loss_clip": 1.01671648, "balance_loss_mlp": 1.01905298, "epoch": 0.526182889436662, "flos": 22821375254400.0, "grad_norm": 1.851173965644763, "language_loss": 0.81941247, "learning_rate": 1.9275714774893493e-06, "loss": 0.84142935, "num_input_tokens_seen": 94539135, "step": 4376, "time_per_iteration": 2.693971633911133 }, { "auxiliary_loss_clip": 0.01160612, "auxiliary_loss_mlp": 0.0102971, "balance_loss_clip": 0.93297637, "balance_loss_mlp": 1.02171659, "epoch": 0.5263031323273012, "flos": 22929466256640.0, "grad_norm": 2.162418973242262, "language_loss": 0.72587931, "learning_rate": 1.9267930223230154e-06, "loss": 0.74778253, "num_input_tokens_seen": 94557610, "step": 4377, "time_per_iteration": 2.7221286296844482 }, { "auxiliary_loss_clip": 0.0117766, "auxiliary_loss_mlp": 0.0103041, "balance_loss_clip": 0.97780704, "balance_loss_mlp": 1.02258396, "epoch": 0.5264233752179402, "flos": 17748634049280.0, "grad_norm": 1.9445508975196728, "language_loss": 0.78273696, "learning_rate": 1.9260145782621224e-06, "loss": 0.80481762, "num_input_tokens_seen": 94575390, "step": 4378, "time_per_iteration": 2.6464879512786865 }, { "auxiliary_loss_clip": 0.01174026, "auxiliary_loss_mlp": 0.01032913, "balance_loss_clip": 0.9779619, "balance_loss_mlp": 1.02566564, "epoch": 0.5265436181085793, "flos": 24421626069120.0, "grad_norm": 1.7422697013122663, "language_loss": 0.88014793, "learning_rate": 1.925236145424758e-06, "loss": 0.90221727, "num_input_tokens_seen": 94594210, "step": 4379, "time_per_iteration": 2.729095697402954 }, { "auxiliary_loss_clip": 0.01083957, "auxiliary_loss_mlp": 0.00999438, "balance_loss_clip": 0.98546898, "balance_loss_mlp": 0.99761409, "epoch": 0.5266638609992185, "flos": 69207298156800.0, "grad_norm": 0.6992301399397682, "language_loss": 0.57602614, "learning_rate": 1.924457723929012e-06, "loss": 0.59686011, "num_input_tokens_seen": 94665020, "step": 4380, "time_per_iteration": 3.3852412700653076 }, { "auxiliary_loss_clip": 0.01174652, "auxiliary_loss_mlp": 0.01030234, "balance_loss_clip": 1.01199579, "balance_loss_mlp": 1.0227356, "epoch": 0.5267841038898575, "flos": 20738699850240.0, "grad_norm": 1.4866674331594936, "language_loss": 0.83050108, "learning_rate": 1.9236793138929685e-06, "loss": 0.85254991, "num_input_tokens_seen": 94684290, "step": 4381, "time_per_iteration": 2.691603183746338 }, { "auxiliary_loss_clip": 0.01177613, "auxiliary_loss_mlp": 0.01031039, "balance_loss_clip": 1.01294231, "balance_loss_mlp": 1.02310562, "epoch": 0.5269043467804966, "flos": 17234392988160.0, "grad_norm": 2.624477682109535, "language_loss": 0.81035036, "learning_rate": 1.9229009154347133e-06, "loss": 0.83243686, "num_input_tokens_seen": 94701880, "step": 4382, "time_per_iteration": 2.6597719192504883 }, { "auxiliary_loss_clip": 0.01151732, "auxiliary_loss_mlp": 0.01122363, "balance_loss_clip": 0.89388585, "balance_loss_mlp": 0.0, "epoch": 0.5270245896711357, "flos": 18223157646720.0, "grad_norm": 1.908344528532374, "language_loss": 0.80180949, "learning_rate": 1.922122528672327e-06, "loss": 0.82455039, "num_input_tokens_seen": 94720545, "step": 4383, "time_per_iteration": 3.7005317211151123 }, { "auxiliary_loss_clip": 0.01175346, "auxiliary_loss_mlp": 0.01024749, "balance_loss_clip": 1.05165637, "balance_loss_mlp": 1.01742983, "epoch": 0.5271448325617748, "flos": 21287558643840.0, "grad_norm": 3.148780622872174, "language_loss": 0.78241491, "learning_rate": 1.9213441537238914e-06, "loss": 0.80441594, "num_input_tokens_seen": 94737420, "step": 4384, "time_per_iteration": 2.6173248291015625 }, { "auxiliary_loss_clip": 0.01079905, "auxiliary_loss_mlp": 0.01000428, "balance_loss_clip": 0.87094933, "balance_loss_mlp": 0.99858016, "epoch": 0.5272650754524139, "flos": 65495497403520.0, "grad_norm": 0.8363103168153869, "language_loss": 0.57395673, "learning_rate": 1.920565790707485e-06, "loss": 0.59476006, "num_input_tokens_seen": 94802810, "step": 4385, "time_per_iteration": 4.42167329788208 }, { "auxiliary_loss_clip": 0.0117875, "auxiliary_loss_mlp": 0.01028906, "balance_loss_clip": 0.89700484, "balance_loss_mlp": 1.02038872, "epoch": 0.527385318343053, "flos": 19676426008320.0, "grad_norm": 1.9270239576933657, "language_loss": 0.66326261, "learning_rate": 1.9197874397411853e-06, "loss": 0.68533915, "num_input_tokens_seen": 94819440, "step": 4386, "time_per_iteration": 2.6644034385681152 }, { "auxiliary_loss_clip": 0.0116905, "auxiliary_loss_mlp": 0.01031268, "balance_loss_clip": 0.93138134, "balance_loss_mlp": 1.0233047, "epoch": 0.5275055612336921, "flos": 12712018947840.0, "grad_norm": 4.166307305918473, "language_loss": 0.6621474, "learning_rate": 1.919009100943067e-06, "loss": 0.68415058, "num_input_tokens_seen": 94835130, "step": 4387, "time_per_iteration": 2.7103097438812256 }, { "auxiliary_loss_clip": 0.01184942, "auxiliary_loss_mlp": 0.01032922, "balance_loss_clip": 0.8998462, "balance_loss_mlp": 1.02479196, "epoch": 0.5276258041243311, "flos": 17749029098880.0, "grad_norm": 2.3961046165310926, "language_loss": 0.65536958, "learning_rate": 1.9182307744312043e-06, "loss": 0.67754817, "num_input_tokens_seen": 94852235, "step": 4388, "time_per_iteration": 2.7786288261413574 }, { "auxiliary_loss_clip": 0.01175347, "auxiliary_loss_mlp": 0.01028054, "balance_loss_clip": 0.97272414, "balance_loss_mlp": 1.0203923, "epoch": 0.5277460470149702, "flos": 22710447077760.0, "grad_norm": 2.1655280716905008, "language_loss": 0.76341796, "learning_rate": 1.9174524603236676e-06, "loss": 0.78545201, "num_input_tokens_seen": 94871185, "step": 4389, "time_per_iteration": 2.6828012466430664 }, { "auxiliary_loss_clip": 0.01173408, "auxiliary_loss_mlp": 0.01028481, "balance_loss_clip": 0.97599816, "balance_loss_mlp": 1.0209446, "epoch": 0.5278662899056094, "flos": 19902699734400.0, "grad_norm": 1.8178484624294768, "language_loss": 0.76446998, "learning_rate": 1.916674158738527e-06, "loss": 0.78648877, "num_input_tokens_seen": 94890090, "step": 4390, "time_per_iteration": 3.5592596530914307 }, { "auxiliary_loss_clip": 0.01169368, "auxiliary_loss_mlp": 0.01123265, "balance_loss_clip": 0.93817407, "balance_loss_mlp": 0.0, "epoch": 0.5279865327962484, "flos": 18005215875840.0, "grad_norm": 2.2445874040393567, "language_loss": 0.60154045, "learning_rate": 1.9158958697938506e-06, "loss": 0.62446678, "num_input_tokens_seen": 94908470, "step": 4391, "time_per_iteration": 2.685035467147827 }, { "auxiliary_loss_clip": 0.0116898, "auxiliary_loss_mlp": 0.01030532, "balance_loss_clip": 0.97389507, "balance_loss_mlp": 1.02268481, "epoch": 0.5281067756868875, "flos": 15924443892480.0, "grad_norm": 2.1831836503261637, "language_loss": 0.85677719, "learning_rate": 1.9151175936077032e-06, "loss": 0.87877232, "num_input_tokens_seen": 94923440, "step": 4392, "time_per_iteration": 2.668843984603882 }, { "auxiliary_loss_clip": 0.01172275, "auxiliary_loss_mlp": 0.01026848, "balance_loss_clip": 1.01353455, "balance_loss_mlp": 1.01976132, "epoch": 0.5282270185775266, "flos": 19426488197760.0, "grad_norm": 1.6428684917644099, "language_loss": 0.79186594, "learning_rate": 1.9143393302981507e-06, "loss": 0.8138572, "num_input_tokens_seen": 94941125, "step": 4393, "time_per_iteration": 2.614903450012207 }, { "auxiliary_loss_clip": 0.01175102, "auxiliary_loss_mlp": 0.01033396, "balance_loss_clip": 0.97281533, "balance_loss_mlp": 1.02567744, "epoch": 0.5283472614681657, "flos": 16399613934720.0, "grad_norm": 1.8118306321324298, "language_loss": 0.82981205, "learning_rate": 1.913561079983252e-06, "loss": 0.85189706, "num_input_tokens_seen": 94959950, "step": 4394, "time_per_iteration": 3.592318534851074 }, { "auxiliary_loss_clip": 0.01180852, "auxiliary_loss_mlp": 0.01038353, "balance_loss_clip": 0.97555566, "balance_loss_mlp": 1.03020513, "epoch": 0.5284675043588047, "flos": 26760524163840.0, "grad_norm": 2.0408262513622804, "language_loss": 0.74435914, "learning_rate": 1.9127828427810693e-06, "loss": 0.76655126, "num_input_tokens_seen": 94980515, "step": 4395, "time_per_iteration": 2.7152206897735596 }, { "auxiliary_loss_clip": 0.01181081, "auxiliary_loss_mlp": 0.01027596, "balance_loss_clip": 0.93635517, "balance_loss_mlp": 1.01980615, "epoch": 0.5285877472494439, "flos": 19899898473600.0, "grad_norm": 2.558769104060844, "language_loss": 0.81105322, "learning_rate": 1.9120046188096607e-06, "loss": 0.83314002, "num_input_tokens_seen": 94998560, "step": 4396, "time_per_iteration": 2.6952834129333496 }, { "auxiliary_loss_clip": 0.01176372, "auxiliary_loss_mlp": 0.0102704, "balance_loss_clip": 0.97704661, "balance_loss_mlp": 1.01914847, "epoch": 0.528707990140083, "flos": 20011257613440.0, "grad_norm": 1.7900384000776806, "language_loss": 0.73799312, "learning_rate": 1.9112264081870804e-06, "loss": 0.76002729, "num_input_tokens_seen": 95016950, "step": 4397, "time_per_iteration": 2.648247718811035 }, { "auxiliary_loss_clip": 0.01175215, "auxiliary_loss_mlp": 0.01028133, "balance_loss_clip": 0.94011486, "balance_loss_mlp": 1.01997316, "epoch": 0.528828233030722, "flos": 20667956014080.0, "grad_norm": 1.8395969246548842, "language_loss": 0.75548065, "learning_rate": 1.9104482110313843e-06, "loss": 0.77751422, "num_input_tokens_seen": 95036540, "step": 4398, "time_per_iteration": 2.7433745861053467 }, { "auxiliary_loss_clip": 0.01176657, "auxiliary_loss_mlp": 0.01024326, "balance_loss_clip": 1.01463425, "balance_loss_mlp": 1.01695883, "epoch": 0.5289484759213612, "flos": 25192448956800.0, "grad_norm": 1.7414685360590882, "language_loss": 0.74299169, "learning_rate": 1.909670027460623e-06, "loss": 0.76500154, "num_input_tokens_seen": 95053840, "step": 4399, "time_per_iteration": 2.711043357849121 }, { "auxiliary_loss_clip": 0.01179146, "auxiliary_loss_mlp": 0.01033545, "balance_loss_clip": 1.01602113, "balance_loss_mlp": 1.02555192, "epoch": 0.5290687188120002, "flos": 31139255715840.0, "grad_norm": 1.9645027710471392, "language_loss": 0.71751535, "learning_rate": 1.908891857592847e-06, "loss": 0.73964226, "num_input_tokens_seen": 95074910, "step": 4400, "time_per_iteration": 2.715564727783203 }, { "auxiliary_loss_clip": 0.01169461, "auxiliary_loss_mlp": 0.01027632, "balance_loss_clip": 0.93785256, "balance_loss_mlp": 1.01944828, "epoch": 0.5291889617026393, "flos": 20119851406080.0, "grad_norm": 2.054212691711671, "language_loss": 0.89672887, "learning_rate": 1.9081137015461034e-06, "loss": 0.91869974, "num_input_tokens_seen": 95090985, "step": 4401, "time_per_iteration": 2.7609055042266846 }, { "auxiliary_loss_clip": 0.01163148, "auxiliary_loss_mlp": 0.01027088, "balance_loss_clip": 0.90012777, "balance_loss_mlp": 1.01967287, "epoch": 0.5293092045932785, "flos": 19643747610240.0, "grad_norm": 1.8432894787470195, "language_loss": 0.9031117, "learning_rate": 1.9073355594384383e-06, "loss": 0.92501402, "num_input_tokens_seen": 95109225, "step": 4402, "time_per_iteration": 2.7155137062072754 }, { "auxiliary_loss_clip": 0.01168141, "auxiliary_loss_mlp": 0.01030351, "balance_loss_clip": 0.93777359, "balance_loss_mlp": 1.02298427, "epoch": 0.5294294474839175, "flos": 24317736958080.0, "grad_norm": 1.828544831038968, "language_loss": 0.80319619, "learning_rate": 1.906557431387895e-06, "loss": 0.82518113, "num_input_tokens_seen": 95128215, "step": 4403, "time_per_iteration": 2.7884879112243652 }, { "auxiliary_loss_clip": 0.01173341, "auxiliary_loss_mlp": 0.0103037, "balance_loss_clip": 0.94103408, "balance_loss_mlp": 1.02235937, "epoch": 0.5295496903745566, "flos": 18875941464960.0, "grad_norm": 2.48591924067585, "language_loss": 0.78615814, "learning_rate": 1.905779317512516e-06, "loss": 0.80819523, "num_input_tokens_seen": 95145760, "step": 4404, "time_per_iteration": 2.687934398651123 }, { "auxiliary_loss_clip": 0.01173784, "auxiliary_loss_mlp": 0.01024595, "balance_loss_clip": 1.01390529, "balance_loss_mlp": 1.01735866, "epoch": 0.5296699332651957, "flos": 20923101296640.0, "grad_norm": 2.013711349618487, "language_loss": 0.80738819, "learning_rate": 1.9050012179303385e-06, "loss": 0.82937193, "num_input_tokens_seen": 95164270, "step": 4405, "time_per_iteration": 2.6196625232696533 }, { "auxiliary_loss_clip": 0.01176629, "auxiliary_loss_mlp": 0.01028827, "balance_loss_clip": 1.01137245, "balance_loss_mlp": 1.02091169, "epoch": 0.5297901761558348, "flos": 22046745525120.0, "grad_norm": 2.4099116289140015, "language_loss": 0.69336867, "learning_rate": 1.904223132759401e-06, "loss": 0.71542317, "num_input_tokens_seen": 95182870, "step": 4406, "time_per_iteration": 2.6424007415771484 }, { "auxiliary_loss_clip": 0.0117667, "auxiliary_loss_mlp": 0.01026393, "balance_loss_clip": 1.01434445, "balance_loss_mlp": 1.0191803, "epoch": 0.5299104190464738, "flos": 21798495653760.0, "grad_norm": 2.2069108478402426, "language_loss": 0.68999779, "learning_rate": 1.9034450621177383e-06, "loss": 0.71202838, "num_input_tokens_seen": 95201190, "step": 4407, "time_per_iteration": 2.6471378803253174 }, { "auxiliary_loss_clip": 0.01177895, "auxiliary_loss_mlp": 0.01033638, "balance_loss_clip": 1.01658249, "balance_loss_mlp": 1.0257287, "epoch": 0.530030661937113, "flos": 14720790119040.0, "grad_norm": 1.7201117217599398, "language_loss": 0.69907081, "learning_rate": 1.9026670061233824e-06, "loss": 0.72118616, "num_input_tokens_seen": 95218625, "step": 4408, "time_per_iteration": 2.6249139308929443 }, { "auxiliary_loss_clip": 0.01172474, "auxiliary_loss_mlp": 0.01022311, "balance_loss_clip": 0.97744048, "balance_loss_mlp": 1.01516449, "epoch": 0.5301509048277521, "flos": 21251504367360.0, "grad_norm": 1.9315112342228633, "language_loss": 0.80622214, "learning_rate": 1.901888964894365e-06, "loss": 0.82817, "num_input_tokens_seen": 95237665, "step": 4409, "time_per_iteration": 3.716747283935547 }, { "auxiliary_loss_clip": 0.01177834, "auxiliary_loss_mlp": 0.01027123, "balance_loss_clip": 1.05092669, "balance_loss_mlp": 1.01918399, "epoch": 0.5302711477183911, "flos": 25957058791680.0, "grad_norm": 1.826688187660483, "language_loss": 0.6808064, "learning_rate": 1.9011109385487134e-06, "loss": 0.70285594, "num_input_tokens_seen": 95258915, "step": 4410, "time_per_iteration": 2.7014689445495605 }, { "auxiliary_loss_clip": 0.01180919, "auxiliary_loss_mlp": 0.01029651, "balance_loss_clip": 1.05380106, "balance_loss_mlp": 1.02178359, "epoch": 0.5303913906090303, "flos": 22273126992000.0, "grad_norm": 2.353755405447872, "language_loss": 0.66470891, "learning_rate": 1.900332927204454e-06, "loss": 0.68681461, "num_input_tokens_seen": 95277365, "step": 4411, "time_per_iteration": 3.5954954624176025 }, { "auxiliary_loss_clip": 0.01181455, "auxiliary_loss_mlp": 0.0103032, "balance_loss_clip": 0.97483146, "balance_loss_mlp": 1.02176929, "epoch": 0.5305116334996693, "flos": 24936010784640.0, "grad_norm": 1.6524117745300972, "language_loss": 0.76792443, "learning_rate": 1.8995549309796097e-06, "loss": 0.79004222, "num_input_tokens_seen": 95296670, "step": 4412, "time_per_iteration": 2.691923141479492 }, { "auxiliary_loss_clip": 0.01184866, "auxiliary_loss_mlp": 0.01032949, "balance_loss_clip": 1.01643026, "balance_loss_mlp": 1.02528119, "epoch": 0.5306318763903084, "flos": 20189338266240.0, "grad_norm": 1.6712849345441911, "language_loss": 0.76369488, "learning_rate": 1.8987769499922028e-06, "loss": 0.785873, "num_input_tokens_seen": 95315640, "step": 4413, "time_per_iteration": 2.6356945037841797 }, { "auxiliary_loss_clip": 0.01173018, "auxiliary_loss_mlp": 0.01122987, "balance_loss_clip": 1.01332927, "balance_loss_mlp": 0.0, "epoch": 0.5307521192809476, "flos": 20266366982400.0, "grad_norm": 2.2538434120017263, "language_loss": 0.70821404, "learning_rate": 1.897998984360252e-06, "loss": 0.73117411, "num_input_tokens_seen": 95334610, "step": 4414, "time_per_iteration": 2.636291980743408 }, { "auxiliary_loss_clip": 0.01169948, "auxiliary_loss_mlp": 0.01029697, "balance_loss_clip": 0.97410244, "balance_loss_mlp": 1.02202582, "epoch": 0.5308723621715866, "flos": 28844276976000.0, "grad_norm": 1.394966342243936, "language_loss": 0.78382587, "learning_rate": 1.897221034201775e-06, "loss": 0.80582237, "num_input_tokens_seen": 95358350, "step": 4415, "time_per_iteration": 2.7753961086273193 }, { "auxiliary_loss_clip": 0.01172836, "auxiliary_loss_mlp": 0.01022046, "balance_loss_clip": 0.93450326, "balance_loss_mlp": 1.01503026, "epoch": 0.5309926050622257, "flos": 27457766040960.0, "grad_norm": 1.5099486811330385, "language_loss": 0.66507679, "learning_rate": 1.8964430996347842e-06, "loss": 0.68702561, "num_input_tokens_seen": 95379900, "step": 4416, "time_per_iteration": 3.692641496658325 }, { "auxiliary_loss_clip": 0.0117131, "auxiliary_loss_mlp": 0.01033562, "balance_loss_clip": 0.97443563, "balance_loss_mlp": 1.02544963, "epoch": 0.5311128479528648, "flos": 20514545026560.0, "grad_norm": 1.6168730527496507, "language_loss": 0.82477069, "learning_rate": 1.8956651807772931e-06, "loss": 0.8468194, "num_input_tokens_seen": 95397935, "step": 4417, "time_per_iteration": 2.6586952209472656 }, { "auxiliary_loss_clip": 0.01171429, "auxiliary_loss_mlp": 0.01025314, "balance_loss_clip": 1.01325905, "balance_loss_mlp": 1.01800668, "epoch": 0.5312330908435039, "flos": 21397660807680.0, "grad_norm": 1.6161051086602554, "language_loss": 0.83852983, "learning_rate": 1.8948872777473115e-06, "loss": 0.86049724, "num_input_tokens_seen": 95415890, "step": 4418, "time_per_iteration": 2.6863973140716553 }, { "auxiliary_loss_clip": 0.0117627, "auxiliary_loss_mlp": 0.01025349, "balance_loss_clip": 0.97648561, "balance_loss_mlp": 1.01776147, "epoch": 0.531353333734143, "flos": 24717350741760.0, "grad_norm": 1.665409622005013, "language_loss": 0.62859511, "learning_rate": 1.8941093906628458e-06, "loss": 0.65061128, "num_input_tokens_seen": 95433675, "step": 4419, "time_per_iteration": 2.701368570327759 }, { "auxiliary_loss_clip": 0.01168718, "auxiliary_loss_mlp": 0.01027107, "balance_loss_clip": 0.9734621, "balance_loss_mlp": 1.0196979, "epoch": 0.531473576624782, "flos": 30480689808000.0, "grad_norm": 1.8842277146101687, "language_loss": 0.71217883, "learning_rate": 1.893331519641902e-06, "loss": 0.73413706, "num_input_tokens_seen": 95455820, "step": 4420, "time_per_iteration": 3.6980140209198 }, { "auxiliary_loss_clip": 0.01160495, "auxiliary_loss_mlp": 0.01029184, "balance_loss_clip": 0.93358171, "balance_loss_mlp": 1.02164412, "epoch": 0.5315938195154212, "flos": 23002975440000.0, "grad_norm": 2.220741914965985, "language_loss": 0.74000823, "learning_rate": 1.8925536648024815e-06, "loss": 0.76190507, "num_input_tokens_seen": 95473240, "step": 4421, "time_per_iteration": 2.751681327819824 }, { "auxiliary_loss_clip": 0.01179552, "auxiliary_loss_mlp": 0.01025641, "balance_loss_clip": 1.0524348, "balance_loss_mlp": 1.01818132, "epoch": 0.5317140624060602, "flos": 22748584343040.0, "grad_norm": 1.7912562162717853, "language_loss": 0.75893962, "learning_rate": 1.8917758262625849e-06, "loss": 0.78099149, "num_input_tokens_seen": 95493480, "step": 4422, "time_per_iteration": 2.647508144378662 }, { "auxiliary_loss_clip": 0.01167516, "auxiliary_loss_mlp": 0.01026611, "balance_loss_clip": 0.97563338, "balance_loss_mlp": 1.01940489, "epoch": 0.5318343052966993, "flos": 22821087945600.0, "grad_norm": 1.579641845253382, "language_loss": 0.8064574, "learning_rate": 1.8909980041402089e-06, "loss": 0.8283987, "num_input_tokens_seen": 95512075, "step": 4423, "time_per_iteration": 2.679215908050537 }, { "auxiliary_loss_clip": 0.011681, "auxiliary_loss_mlp": 0.01032599, "balance_loss_clip": 1.01067889, "balance_loss_mlp": 1.02520227, "epoch": 0.5319545481873384, "flos": 13626089274240.0, "grad_norm": 2.0824439032533473, "language_loss": 0.65839958, "learning_rate": 1.8902201985533494e-06, "loss": 0.68040657, "num_input_tokens_seen": 95529340, "step": 4424, "time_per_iteration": 2.608609676361084 }, { "auxiliary_loss_clip": 0.01178347, "auxiliary_loss_mlp": 0.01021289, "balance_loss_clip": 0.97782391, "balance_loss_mlp": 1.01357317, "epoch": 0.5320747910779775, "flos": 22162522037760.0, "grad_norm": 1.789181470778224, "language_loss": 0.74841893, "learning_rate": 1.8894424096199983e-06, "loss": 0.77041531, "num_input_tokens_seen": 95548545, "step": 4425, "time_per_iteration": 2.7071080207824707 }, { "auxiliary_loss_clip": 0.01177948, "auxiliary_loss_mlp": 0.01030732, "balance_loss_clip": 1.0160445, "balance_loss_mlp": 1.02305496, "epoch": 0.5321950339686166, "flos": 18588081870720.0, "grad_norm": 1.9685151854281044, "language_loss": 0.86092621, "learning_rate": 1.8886646374581463e-06, "loss": 0.88301301, "num_input_tokens_seen": 95567770, "step": 4426, "time_per_iteration": 2.6317763328552246 }, { "auxiliary_loss_clip": 0.01170556, "auxiliary_loss_mlp": 0.01029303, "balance_loss_clip": 1.01021385, "balance_loss_mlp": 1.02140522, "epoch": 0.5323152768592557, "flos": 22856818999680.0, "grad_norm": 1.6263211670249968, "language_loss": 0.71124458, "learning_rate": 1.8878868821857795e-06, "loss": 0.73324317, "num_input_tokens_seen": 95587420, "step": 4427, "time_per_iteration": 2.663574457168579 }, { "auxiliary_loss_clip": 0.01168395, "auxiliary_loss_mlp": 0.01027859, "balance_loss_clip": 0.89572686, "balance_loss_mlp": 1.01853085, "epoch": 0.5324355197498948, "flos": 33948690998400.0, "grad_norm": 3.036208784246846, "language_loss": 0.75096673, "learning_rate": 1.8871091439208838e-06, "loss": 0.77292925, "num_input_tokens_seen": 95609030, "step": 4428, "time_per_iteration": 2.8307552337646484 }, { "auxiliary_loss_clip": 0.01168005, "auxiliary_loss_mlp": 0.01037544, "balance_loss_clip": 0.89885312, "balance_loss_mlp": 1.02982545, "epoch": 0.5325557626405338, "flos": 23256720092160.0, "grad_norm": 2.0062662756743466, "language_loss": 0.77238393, "learning_rate": 1.8863314227814414e-06, "loss": 0.79443949, "num_input_tokens_seen": 95627340, "step": 4429, "time_per_iteration": 2.746704578399658 }, { "auxiliary_loss_clip": 0.01183457, "auxiliary_loss_mlp": 0.01029814, "balance_loss_clip": 1.01641083, "balance_loss_mlp": 1.02183306, "epoch": 0.532676005531173, "flos": 26718687797760.0, "grad_norm": 3.3368385333289727, "language_loss": 0.49104625, "learning_rate": 1.8855537188854313e-06, "loss": 0.51317894, "num_input_tokens_seen": 95646315, "step": 4430, "time_per_iteration": 2.722280502319336 }, { "auxiliary_loss_clip": 0.01175792, "auxiliary_loss_mlp": 0.01028198, "balance_loss_clip": 1.01068091, "balance_loss_mlp": 1.02017558, "epoch": 0.5327962484218121, "flos": 17894610921600.0, "grad_norm": 1.8414940589769178, "language_loss": 0.77876973, "learning_rate": 1.8847760323508315e-06, "loss": 0.80080962, "num_input_tokens_seen": 95665220, "step": 4431, "time_per_iteration": 2.6505913734436035 }, { "auxiliary_loss_clip": 0.01167764, "auxiliary_loss_mlp": 0.01028918, "balance_loss_clip": 0.97588217, "balance_loss_mlp": 1.02156329, "epoch": 0.5329164913124511, "flos": 17925385898880.0, "grad_norm": 1.62617777540037, "language_loss": 0.752783, "learning_rate": 1.883998363295616e-06, "loss": 0.77474982, "num_input_tokens_seen": 95682700, "step": 4432, "time_per_iteration": 2.6851227283477783 }, { "auxiliary_loss_clip": 0.01081192, "auxiliary_loss_mlp": 0.01005515, "balance_loss_clip": 0.94600058, "balance_loss_mlp": 1.00352383, "epoch": 0.5330367342030903, "flos": 57254178781440.0, "grad_norm": 0.8787276460709612, "language_loss": 0.62696636, "learning_rate": 1.8832207118377565e-06, "loss": 0.64783335, "num_input_tokens_seen": 95738070, "step": 4433, "time_per_iteration": 3.149780511856079 }, { "auxiliary_loss_clip": 0.01175781, "auxiliary_loss_mlp": 0.01028647, "balance_loss_clip": 1.05294263, "balance_loss_mlp": 1.02155077, "epoch": 0.5331569770937293, "flos": 17420518287360.0, "grad_norm": 1.8970123561321226, "language_loss": 0.69514215, "learning_rate": 1.882443078095222e-06, "loss": 0.71718633, "num_input_tokens_seen": 95756950, "step": 4434, "time_per_iteration": 2.599177122116089 }, { "auxiliary_loss_clip": 0.01086933, "auxiliary_loss_mlp": 0.01001722, "balance_loss_clip": 0.87223935, "balance_loss_mlp": 0.99989861, "epoch": 0.5332772199843684, "flos": 56750783627520.0, "grad_norm": 0.8845015032371883, "language_loss": 0.66868472, "learning_rate": 1.8816654621859794e-06, "loss": 0.68957126, "num_input_tokens_seen": 95816615, "step": 4435, "time_per_iteration": 4.186995267868042 }, { "auxiliary_loss_clip": 0.01173311, "auxiliary_loss_mlp": 0.01024425, "balance_loss_clip": 1.05117226, "balance_loss_mlp": 1.01698935, "epoch": 0.5333974628750076, "flos": 18697753071360.0, "grad_norm": 2.7163393903090887, "language_loss": 0.72297108, "learning_rate": 1.8808878642279915e-06, "loss": 0.74494845, "num_input_tokens_seen": 95832020, "step": 4436, "time_per_iteration": 2.7198355197906494 }, { "auxiliary_loss_clip": 0.01169368, "auxiliary_loss_mlp": 0.0102481, "balance_loss_clip": 0.92979395, "balance_loss_mlp": 1.01645398, "epoch": 0.5335177057656466, "flos": 23805507058560.0, "grad_norm": 2.0165397903499116, "language_loss": 0.64819694, "learning_rate": 1.8801102843392209e-06, "loss": 0.67013872, "num_input_tokens_seen": 95851425, "step": 4437, "time_per_iteration": 3.8033974170684814 }, { "auxiliary_loss_clip": 0.01171206, "auxiliary_loss_mlp": 0.01026577, "balance_loss_clip": 0.93522382, "balance_loss_mlp": 1.01897156, "epoch": 0.5336379486562857, "flos": 25078683605760.0, "grad_norm": 1.598229979483385, "language_loss": 0.85120016, "learning_rate": 1.8793327226376238e-06, "loss": 0.87317801, "num_input_tokens_seen": 95870745, "step": 4438, "time_per_iteration": 2.7158267498016357 }, { "auxiliary_loss_clip": 0.01181307, "auxiliary_loss_mlp": 0.01027061, "balance_loss_clip": 0.97495675, "balance_loss_mlp": 1.01926446, "epoch": 0.5337581915469248, "flos": 21396691140480.0, "grad_norm": 2.0718413144486236, "language_loss": 0.80164158, "learning_rate": 1.8785551792411569e-06, "loss": 0.82372522, "num_input_tokens_seen": 95889755, "step": 4439, "time_per_iteration": 2.704508066177368 }, { "auxiliary_loss_clip": 0.01174545, "auxiliary_loss_mlp": 0.01028614, "balance_loss_clip": 0.97416091, "balance_loss_mlp": 1.02158999, "epoch": 0.5338784344375639, "flos": 14865905064960.0, "grad_norm": 1.7419364518791767, "language_loss": 0.82514322, "learning_rate": 1.8777776542677733e-06, "loss": 0.84717476, "num_input_tokens_seen": 95907805, "step": 4440, "time_per_iteration": 2.6315648555755615 }, { "auxiliary_loss_clip": 0.01164553, "auxiliary_loss_mlp": 0.01032084, "balance_loss_clip": 0.93083, "balance_loss_mlp": 1.02437091, "epoch": 0.5339986773282029, "flos": 20813501923200.0, "grad_norm": 1.8269074431498737, "language_loss": 0.73033959, "learning_rate": 1.8770001478354216e-06, "loss": 0.75230598, "num_input_tokens_seen": 95927480, "step": 4441, "time_per_iteration": 2.765068769454956 }, { "auxiliary_loss_clip": 0.01169817, "auxiliary_loss_mlp": 0.01029693, "balance_loss_clip": 1.01138234, "balance_loss_mlp": 1.02213502, "epoch": 0.5341189202188421, "flos": 17969089772160.0, "grad_norm": 2.150018793319201, "language_loss": 0.84320027, "learning_rate": 1.8762226600620504e-06, "loss": 0.86519533, "num_input_tokens_seen": 95946095, "step": 4442, "time_per_iteration": 3.5218727588653564 }, { "auxiliary_loss_clip": 0.01179541, "auxiliary_loss_mlp": 0.01029778, "balance_loss_clip": 0.97384655, "balance_loss_mlp": 1.02174377, "epoch": 0.5342391631094812, "flos": 11031866328960.0, "grad_norm": 2.4103546655654475, "language_loss": 0.59229159, "learning_rate": 1.8754451910656031e-06, "loss": 0.61438477, "num_input_tokens_seen": 95959995, "step": 4443, "time_per_iteration": 2.5864953994750977 }, { "auxiliary_loss_clip": 0.01175229, "auxiliary_loss_mlp": 0.0102635, "balance_loss_clip": 0.89732563, "balance_loss_mlp": 1.01847625, "epoch": 0.5343594060001202, "flos": 15339135772800.0, "grad_norm": 1.7485331732736837, "language_loss": 0.82656211, "learning_rate": 1.8746677409640212e-06, "loss": 0.84857786, "num_input_tokens_seen": 95977095, "step": 4444, "time_per_iteration": 2.708543062210083 }, { "auxiliary_loss_clip": 0.01177863, "auxiliary_loss_mlp": 0.01026217, "balance_loss_clip": 1.01293874, "balance_loss_mlp": 1.01813424, "epoch": 0.5344796488907594, "flos": 26900898514560.0, "grad_norm": 1.5956796927393602, "language_loss": 0.84801495, "learning_rate": 1.8738903098752432e-06, "loss": 0.87005579, "num_input_tokens_seen": 95996225, "step": 4445, "time_per_iteration": 2.718959093093872 }, { "auxiliary_loss_clip": 0.01173315, "auxiliary_loss_mlp": 0.01023912, "balance_loss_clip": 0.97447741, "balance_loss_mlp": 1.0161097, "epoch": 0.5345998917813984, "flos": 25411216740480.0, "grad_norm": 2.4393580112642654, "language_loss": 0.73249108, "learning_rate": 1.8731128979172052e-06, "loss": 0.75446337, "num_input_tokens_seen": 96015425, "step": 4446, "time_per_iteration": 3.6376760005950928 }, { "auxiliary_loss_clip": 0.01167989, "auxiliary_loss_mlp": 0.01026403, "balance_loss_clip": 0.97320622, "balance_loss_mlp": 1.01865387, "epoch": 0.5347201346720375, "flos": 32853379622400.0, "grad_norm": 2.1417142749210525, "language_loss": 0.67316127, "learning_rate": 1.8723355052078394e-06, "loss": 0.6951052, "num_input_tokens_seen": 96035460, "step": 4447, "time_per_iteration": 2.812887668609619 }, { "auxiliary_loss_clip": 0.01169482, "auxiliary_loss_mlp": 0.01027253, "balance_loss_clip": 1.01085711, "balance_loss_mlp": 1.01881313, "epoch": 0.5348403775626767, "flos": 17967940536960.0, "grad_norm": 1.9979703631799874, "language_loss": 0.77013886, "learning_rate": 1.8715581318650765e-06, "loss": 0.79210615, "num_input_tokens_seen": 96054515, "step": 4448, "time_per_iteration": 2.6616435050964355 }, { "auxiliary_loss_clip": 0.01186108, "auxiliary_loss_mlp": 0.01027739, "balance_loss_clip": 0.93971419, "balance_loss_mlp": 1.01910233, "epoch": 0.5349606204533157, "flos": 17603339535360.0, "grad_norm": 2.3700497813876975, "language_loss": 0.8180089, "learning_rate": 1.8707807780068422e-06, "loss": 0.84014738, "num_input_tokens_seen": 96072330, "step": 4449, "time_per_iteration": 2.661534309387207 }, { "auxiliary_loss_clip": 0.01173269, "auxiliary_loss_mlp": 0.01030076, "balance_loss_clip": 0.97388035, "balance_loss_mlp": 1.02204132, "epoch": 0.5350808633439548, "flos": 29167831710720.0, "grad_norm": 2.0066686961895646, "language_loss": 0.66247374, "learning_rate": 1.8700034437510611e-06, "loss": 0.68450719, "num_input_tokens_seen": 96092425, "step": 4450, "time_per_iteration": 2.751826047897339 }, { "auxiliary_loss_clip": 0.01162282, "auxiliary_loss_mlp": 0.01026886, "balance_loss_clip": 0.93433475, "balance_loss_mlp": 1.01916122, "epoch": 0.5352011062345938, "flos": 19499997381120.0, "grad_norm": 2.1717181692746497, "language_loss": 0.81304461, "learning_rate": 1.8692261292156549e-06, "loss": 0.83493626, "num_input_tokens_seen": 96111660, "step": 4451, "time_per_iteration": 2.701000213623047 }, { "auxiliary_loss_clip": 0.0117595, "auxiliary_loss_mlp": 0.01024989, "balance_loss_clip": 1.05335665, "balance_loss_mlp": 1.01715684, "epoch": 0.535321349125233, "flos": 23477642691840.0, "grad_norm": 2.3151423272870635, "language_loss": 0.82066929, "learning_rate": 1.8684488345185401e-06, "loss": 0.84267873, "num_input_tokens_seen": 96131835, "step": 4452, "time_per_iteration": 2.6536948680877686 }, { "auxiliary_loss_clip": 0.01179903, "auxiliary_loss_mlp": 0.0102784, "balance_loss_clip": 1.05443513, "balance_loss_mlp": 1.02019322, "epoch": 0.535441592015872, "flos": 20478059786880.0, "grad_norm": 2.1273178822269414, "language_loss": 0.78637171, "learning_rate": 1.8676715597776332e-06, "loss": 0.80844915, "num_input_tokens_seen": 96150180, "step": 4453, "time_per_iteration": 2.6552765369415283 }, { "auxiliary_loss_clip": 0.01158302, "auxiliary_loss_mlp": 0.01024378, "balance_loss_clip": 0.89493561, "balance_loss_mlp": 1.01730299, "epoch": 0.5355618349065111, "flos": 19573147428480.0, "grad_norm": 1.6973496839259974, "language_loss": 0.76025784, "learning_rate": 1.8668943051108455e-06, "loss": 0.78208458, "num_input_tokens_seen": 96167485, "step": 4454, "time_per_iteration": 2.7786037921905518 }, { "auxiliary_loss_clip": 0.01172126, "auxiliary_loss_mlp": 0.01025035, "balance_loss_clip": 0.97435558, "balance_loss_mlp": 1.01740003, "epoch": 0.5356820777971503, "flos": 24024633978240.0, "grad_norm": 1.7736945543432674, "language_loss": 0.76413167, "learning_rate": 1.8661170706360856e-06, "loss": 0.78610331, "num_input_tokens_seen": 96186650, "step": 4455, "time_per_iteration": 2.6575300693511963 }, { "auxiliary_loss_clip": 0.0117336, "auxiliary_loss_mlp": 0.01019508, "balance_loss_clip": 1.01522732, "balance_loss_mlp": 1.01259375, "epoch": 0.5358023206877893, "flos": 20884676722560.0, "grad_norm": 1.591752877129162, "language_loss": 0.81498945, "learning_rate": 1.8653398564712594e-06, "loss": 0.83691812, "num_input_tokens_seen": 96205595, "step": 4456, "time_per_iteration": 2.606964588165283 }, { "auxiliary_loss_clip": 0.01173471, "auxiliary_loss_mlp": 0.01028962, "balance_loss_clip": 1.01363468, "balance_loss_mlp": 1.02144563, "epoch": 0.5359225635784284, "flos": 22418996123520.0, "grad_norm": 1.5299684417867614, "language_loss": 0.82303667, "learning_rate": 1.8645626627342704e-06, "loss": 0.845061, "num_input_tokens_seen": 96226360, "step": 4457, "time_per_iteration": 2.6692404747009277 }, { "auxiliary_loss_clip": 0.01178262, "auxiliary_loss_mlp": 0.01026516, "balance_loss_clip": 1.01317883, "balance_loss_mlp": 1.01921415, "epoch": 0.5360428064690675, "flos": 24097784025600.0, "grad_norm": 2.3450865264130214, "language_loss": 0.80970395, "learning_rate": 1.8637854895430172e-06, "loss": 0.8317517, "num_input_tokens_seen": 96245625, "step": 4458, "time_per_iteration": 2.6275620460510254 }, { "auxiliary_loss_clip": 0.01163757, "auxiliary_loss_mlp": 0.01027232, "balance_loss_clip": 0.93399847, "balance_loss_mlp": 1.01882732, "epoch": 0.5361630493597066, "flos": 21434505183360.0, "grad_norm": 2.5376816477748827, "language_loss": 0.69350898, "learning_rate": 1.8630083370153978e-06, "loss": 0.71541888, "num_input_tokens_seen": 96265265, "step": 4459, "time_per_iteration": 2.7269716262817383 }, { "auxiliary_loss_clip": 0.01087942, "auxiliary_loss_mlp": 0.01006835, "balance_loss_clip": 0.83253086, "balance_loss_mlp": 1.00489235, "epoch": 0.5362832922503457, "flos": 68888696520960.0, "grad_norm": 0.7504317705183794, "language_loss": 0.55467606, "learning_rate": 1.8622312052693041e-06, "loss": 0.57562387, "num_input_tokens_seen": 96326445, "step": 4460, "time_per_iteration": 3.50167179107666 }, { "auxiliary_loss_clip": 0.0116634, "auxiliary_loss_mlp": 0.01026962, "balance_loss_clip": 1.0098896, "balance_loss_mlp": 1.0192256, "epoch": 0.5364035351409848, "flos": 9793702563840.0, "grad_norm": 2.103062870419619, "language_loss": 0.71643221, "learning_rate": 1.8614540944226267e-06, "loss": 0.73836529, "num_input_tokens_seen": 96343115, "step": 4461, "time_per_iteration": 4.250727891921997 }, { "auxiliary_loss_clip": 0.01171692, "auxiliary_loss_mlp": 0.01023529, "balance_loss_clip": 0.97671998, "balance_loss_mlp": 1.01604283, "epoch": 0.5365237780316239, "flos": 23290080848640.0, "grad_norm": 1.6463690141920666, "language_loss": 0.67946273, "learning_rate": 1.8606770045932537e-06, "loss": 0.70141494, "num_input_tokens_seen": 96362230, "step": 4462, "time_per_iteration": 2.7525479793548584 }, { "auxiliary_loss_clip": 0.01161886, "auxiliary_loss_mlp": 0.01026731, "balance_loss_clip": 0.93163681, "balance_loss_mlp": 1.01913154, "epoch": 0.5366440209222629, "flos": 26578133879040.0, "grad_norm": 2.637008200999252, "language_loss": 0.81723022, "learning_rate": 1.859899935899068e-06, "loss": 0.83911633, "num_input_tokens_seen": 96382085, "step": 4463, "time_per_iteration": 3.710582733154297 }, { "auxiliary_loss_clip": 0.01172957, "auxiliary_loss_mlp": 0.01031634, "balance_loss_clip": 0.97720242, "balance_loss_mlp": 1.0232892, "epoch": 0.5367642638129021, "flos": 19608052469760.0, "grad_norm": 1.4429533431432957, "language_loss": 0.78880787, "learning_rate": 1.8591228884579506e-06, "loss": 0.81085384, "num_input_tokens_seen": 96400580, "step": 4464, "time_per_iteration": 2.7541894912719727 }, { "auxiliary_loss_clip": 0.01174431, "auxiliary_loss_mlp": 0.01030229, "balance_loss_clip": 0.9356972, "balance_loss_mlp": 1.02241206, "epoch": 0.5368845067035412, "flos": 23915214172800.0, "grad_norm": 1.8531688950821659, "language_loss": 0.82185376, "learning_rate": 1.8583458623877795e-06, "loss": 0.84390038, "num_input_tokens_seen": 96419680, "step": 4465, "time_per_iteration": 2.777848482131958 }, { "auxiliary_loss_clip": 0.01176471, "auxiliary_loss_mlp": 0.01028167, "balance_loss_clip": 1.01285434, "balance_loss_mlp": 1.02078843, "epoch": 0.5370047495941802, "flos": 16873131951360.0, "grad_norm": 1.9802483612733106, "language_loss": 0.74175858, "learning_rate": 1.8575688578064281e-06, "loss": 0.76380497, "num_input_tokens_seen": 96437805, "step": 4466, "time_per_iteration": 2.5945327281951904 }, { "auxiliary_loss_clip": 0.01179915, "auxiliary_loss_mlp": 0.01032544, "balance_loss_clip": 1.01647472, "balance_loss_mlp": 1.02506328, "epoch": 0.5371249924848194, "flos": 20740926493440.0, "grad_norm": 1.8370864886285694, "language_loss": 0.76946628, "learning_rate": 1.8567918748317674e-06, "loss": 0.79159087, "num_input_tokens_seen": 96457155, "step": 4467, "time_per_iteration": 2.714538812637329 }, { "auxiliary_loss_clip": 0.01171131, "auxiliary_loss_mlp": 0.01030708, "balance_loss_clip": 0.93436909, "balance_loss_mlp": 1.02295935, "epoch": 0.5372452353754584, "flos": 17968120104960.0, "grad_norm": 1.9271447158093122, "language_loss": 0.82670283, "learning_rate": 1.8560149135816659e-06, "loss": 0.84872127, "num_input_tokens_seen": 96473990, "step": 4468, "time_per_iteration": 3.7060344219207764 }, { "auxiliary_loss_clip": 0.01169768, "auxiliary_loss_mlp": 0.01023351, "balance_loss_clip": 1.01102948, "balance_loss_mlp": 1.01600766, "epoch": 0.5373654782660975, "flos": 15377021642880.0, "grad_norm": 2.152863118482644, "language_loss": 0.84280324, "learning_rate": 1.8552379741739873e-06, "loss": 0.86473441, "num_input_tokens_seen": 96491335, "step": 4469, "time_per_iteration": 2.5889241695404053 }, { "auxiliary_loss_clip": 0.01085598, "auxiliary_loss_mlp": 0.01117091, "balance_loss_clip": 0.91004539, "balance_loss_mlp": 0.0, "epoch": 0.5374857211567367, "flos": 69000091574400.0, "grad_norm": 0.9071726969037395, "language_loss": 0.55670071, "learning_rate": 1.8544610567265935e-06, "loss": 0.5787276, "num_input_tokens_seen": 96545275, "step": 4470, "time_per_iteration": 3.296612024307251 }, { "auxiliary_loss_clip": 0.01173907, "auxiliary_loss_mlp": 0.01122557, "balance_loss_clip": 0.97772682, "balance_loss_mlp": 0.0, "epoch": 0.5376059640473757, "flos": 15085355207040.0, "grad_norm": 1.840848427207644, "language_loss": 0.83262831, "learning_rate": 1.853684161357341e-06, "loss": 0.85559291, "num_input_tokens_seen": 96562935, "step": 4471, "time_per_iteration": 2.6744742393493652 }, { "auxiliary_loss_clip": 0.01171513, "auxiliary_loss_mlp": 0.01123341, "balance_loss_clip": 1.01387334, "balance_loss_mlp": 0.0, "epoch": 0.5377262069380148, "flos": 19792597570560.0, "grad_norm": 1.6360756433955022, "language_loss": 0.77022254, "learning_rate": 1.852907288184085e-06, "loss": 0.79317105, "num_input_tokens_seen": 96581820, "step": 4472, "time_per_iteration": 3.605560779571533 }, { "auxiliary_loss_clip": 0.01178559, "auxiliary_loss_mlp": 0.01029718, "balance_loss_clip": 0.89912772, "balance_loss_mlp": 1.0218327, "epoch": 0.5378464498286539, "flos": 30003077640960.0, "grad_norm": 1.9425275677635285, "language_loss": 0.70256752, "learning_rate": 1.8521304373246762e-06, "loss": 0.72465032, "num_input_tokens_seen": 96602865, "step": 4473, "time_per_iteration": 2.839046001434326 }, { "auxiliary_loss_clip": 0.01174951, "auxiliary_loss_mlp": 0.01030415, "balance_loss_clip": 1.0112133, "balance_loss_mlp": 1.02199841, "epoch": 0.537966692719293, "flos": 21251217058560.0, "grad_norm": 2.1966245736423313, "language_loss": 0.88917267, "learning_rate": 1.8513536088969626e-06, "loss": 0.91122639, "num_input_tokens_seen": 96620530, "step": 4474, "time_per_iteration": 2.639350414276123 }, { "auxiliary_loss_clip": 0.01177165, "auxiliary_loss_mlp": 0.01029522, "balance_loss_clip": 1.01489079, "balance_loss_mlp": 1.02240205, "epoch": 0.538086935609932, "flos": 21543170803200.0, "grad_norm": 1.676958029092068, "language_loss": 0.80174971, "learning_rate": 1.8505768030187884e-06, "loss": 0.8238166, "num_input_tokens_seen": 96640660, "step": 4475, "time_per_iteration": 2.713242530822754 }, { "auxiliary_loss_clip": 0.01168661, "auxiliary_loss_mlp": 0.0102875, "balance_loss_clip": 0.97544897, "balance_loss_mlp": 1.02105474, "epoch": 0.5382071785005712, "flos": 22747219626240.0, "grad_norm": 1.6633842457422745, "language_loss": 0.80160695, "learning_rate": 1.849800019807995e-06, "loss": 0.8235811, "num_input_tokens_seen": 96661885, "step": 4476, "time_per_iteration": 2.695765256881714 }, { "auxiliary_loss_clip": 0.01173027, "auxiliary_loss_mlp": 0.01025188, "balance_loss_clip": 0.93768257, "balance_loss_mlp": 1.01787448, "epoch": 0.5383274213912103, "flos": 24934574240640.0, "grad_norm": 3.176450576808386, "language_loss": 0.70824409, "learning_rate": 1.8490232593824186e-06, "loss": 0.73022622, "num_input_tokens_seen": 96678340, "step": 4477, "time_per_iteration": 2.776684045791626 }, { "auxiliary_loss_clip": 0.01171423, "auxiliary_loss_mlp": 0.01028511, "balance_loss_clip": 0.97695625, "balance_loss_mlp": 1.02161217, "epoch": 0.5384476642818493, "flos": 22310186849280.0, "grad_norm": 1.6427562226039394, "language_loss": 0.84847617, "learning_rate": 1.8482465218598935e-06, "loss": 0.87047553, "num_input_tokens_seen": 96698285, "step": 4478, "time_per_iteration": 2.710658073425293 }, { "auxiliary_loss_clip": 0.01171888, "auxiliary_loss_mlp": 0.0102689, "balance_loss_clip": 0.93420303, "balance_loss_mlp": 1.01938629, "epoch": 0.5385679071724885, "flos": 22711021695360.0, "grad_norm": 1.789641602294654, "language_loss": 0.83307266, "learning_rate": 1.8474698073582508e-06, "loss": 0.85506046, "num_input_tokens_seen": 96719655, "step": 4479, "time_per_iteration": 2.7624974250793457 }, { "auxiliary_loss_clip": 0.01174829, "auxiliary_loss_mlp": 0.01031202, "balance_loss_clip": 0.93553853, "balance_loss_mlp": 1.02335835, "epoch": 0.5386881500631275, "flos": 15953746412160.0, "grad_norm": 2.10594424872241, "language_loss": 0.86874926, "learning_rate": 1.8466931159953166e-06, "loss": 0.89080954, "num_input_tokens_seen": 96736290, "step": 4480, "time_per_iteration": 2.6979758739471436 }, { "auxiliary_loss_clip": 0.01181068, "auxiliary_loss_mlp": 0.01023856, "balance_loss_clip": 0.97885776, "balance_loss_mlp": 1.01626277, "epoch": 0.5388083929537666, "flos": 24060041809920.0, "grad_norm": 1.6991158926405947, "language_loss": 0.84253508, "learning_rate": 1.8459164478889158e-06, "loss": 0.86458433, "num_input_tokens_seen": 96757685, "step": 4481, "time_per_iteration": 2.7090795040130615 }, { "auxiliary_loss_clip": 0.01161536, "auxiliary_loss_mlp": 0.01026199, "balance_loss_clip": 0.93357909, "balance_loss_mlp": 1.0191927, "epoch": 0.5389286358444056, "flos": 22236893147520.0, "grad_norm": 1.6984307152906624, "language_loss": 0.75718689, "learning_rate": 1.8451398031568663e-06, "loss": 0.7790643, "num_input_tokens_seen": 96777310, "step": 4482, "time_per_iteration": 2.674281120300293 }, { "auxiliary_loss_clip": 0.01170793, "auxiliary_loss_mlp": 0.01028966, "balance_loss_clip": 0.93582606, "balance_loss_mlp": 1.02127385, "epoch": 0.5390488787350448, "flos": 24281718595200.0, "grad_norm": 1.6103859881465232, "language_loss": 0.74730062, "learning_rate": 1.844363181916986e-06, "loss": 0.7692982, "num_input_tokens_seen": 96798035, "step": 4483, "time_per_iteration": 2.747267246246338 }, { "auxiliary_loss_clip": 0.01169138, "auxiliary_loss_mlp": 0.01029918, "balance_loss_clip": 1.01057601, "balance_loss_mlp": 1.02140033, "epoch": 0.5391691216256839, "flos": 16581393688320.0, "grad_norm": 2.12368133441766, "language_loss": 0.83117044, "learning_rate": 1.8435865842870868e-06, "loss": 0.85316098, "num_input_tokens_seen": 96815975, "step": 4484, "time_per_iteration": 2.6630005836486816 }, { "auxiliary_loss_clip": 0.01161415, "auxiliary_loss_mlp": 0.01123174, "balance_loss_clip": 0.96974027, "balance_loss_mlp": 0.0, "epoch": 0.5392893645163229, "flos": 23330049707520.0, "grad_norm": 1.9169332046129193, "language_loss": 0.71933746, "learning_rate": 1.8428100103849787e-06, "loss": 0.74218339, "num_input_tokens_seen": 96835770, "step": 4485, "time_per_iteration": 2.6739139556884766 }, { "auxiliary_loss_clip": 0.01172268, "auxiliary_loss_mlp": 0.01031397, "balance_loss_clip": 0.97754169, "balance_loss_mlp": 1.02366614, "epoch": 0.5394096074069621, "flos": 15669801400320.0, "grad_norm": 3.644257911198776, "language_loss": 0.7338289, "learning_rate": 1.842033460328467e-06, "loss": 0.75586557, "num_input_tokens_seen": 96854490, "step": 4486, "time_per_iteration": 2.73640775680542 }, { "auxiliary_loss_clip": 0.0117493, "auxiliary_loss_mlp": 0.01122829, "balance_loss_clip": 0.97335052, "balance_loss_mlp": 0.0, "epoch": 0.5395298502976011, "flos": 22893447893760.0, "grad_norm": 1.5783140676029233, "language_loss": 0.75063956, "learning_rate": 1.8412569342353541e-06, "loss": 0.77361715, "num_input_tokens_seen": 96874645, "step": 4487, "time_per_iteration": 3.744321584701538 }, { "auxiliary_loss_clip": 0.01178446, "auxiliary_loss_mlp": 0.01029515, "balance_loss_clip": 0.97583413, "balance_loss_mlp": 1.0214895, "epoch": 0.5396500931882402, "flos": 23842135952640.0, "grad_norm": 1.8723428195365568, "language_loss": 0.84549654, "learning_rate": 1.840480432223438e-06, "loss": 0.86757618, "num_input_tokens_seen": 96893650, "step": 4488, "time_per_iteration": 2.793437957763672 }, { "auxiliary_loss_clip": 0.01174441, "auxiliary_loss_mlp": 0.01025656, "balance_loss_clip": 0.97316426, "balance_loss_mlp": 1.01810122, "epoch": 0.5397703360788794, "flos": 26322988596480.0, "grad_norm": 1.9146957744479902, "language_loss": 0.77879608, "learning_rate": 1.8397039544105131e-06, "loss": 0.80079705, "num_input_tokens_seen": 96912735, "step": 4489, "time_per_iteration": 2.6838014125823975 }, { "auxiliary_loss_clip": 0.01161004, "auxiliary_loss_mlp": 0.01030293, "balance_loss_clip": 0.96808541, "balance_loss_mlp": 1.02244329, "epoch": 0.5398905789695184, "flos": 21214588164480.0, "grad_norm": 1.6232010865671822, "language_loss": 0.69626021, "learning_rate": 1.8389275009143711e-06, "loss": 0.71817327, "num_input_tokens_seen": 96932475, "step": 4490, "time_per_iteration": 3.6936161518096924 }, { "auxiliary_loss_clip": 0.01171506, "auxiliary_loss_mlp": 0.01023202, "balance_loss_clip": 1.04823196, "balance_loss_mlp": 1.01587641, "epoch": 0.5400108218601575, "flos": 25080335631360.0, "grad_norm": 1.6232011311579748, "language_loss": 0.73805726, "learning_rate": 1.8381510718527988e-06, "loss": 0.76000428, "num_input_tokens_seen": 96952085, "step": 4491, "time_per_iteration": 2.6641740798950195 }, { "auxiliary_loss_clip": 0.01171623, "auxiliary_loss_mlp": 0.01030029, "balance_loss_clip": 0.9703657, "balance_loss_mlp": 1.02191651, "epoch": 0.5401310647507966, "flos": 26357498588160.0, "grad_norm": 1.719522224823814, "language_loss": 0.63594377, "learning_rate": 1.8373746673435812e-06, "loss": 0.6579603, "num_input_tokens_seen": 96973110, "step": 4492, "time_per_iteration": 2.755009651184082 }, { "auxiliary_loss_clip": 0.01179373, "auxiliary_loss_mlp": 0.01022576, "balance_loss_clip": 1.05434394, "balance_loss_mlp": 1.01521778, "epoch": 0.5402513076414357, "flos": 27855332749440.0, "grad_norm": 1.7653930695851843, "language_loss": 0.79002702, "learning_rate": 1.8365982875044964e-06, "loss": 0.81204647, "num_input_tokens_seen": 96993420, "step": 4493, "time_per_iteration": 2.6672730445861816 }, { "auxiliary_loss_clip": 0.0117985, "auxiliary_loss_mlp": 0.01123946, "balance_loss_clip": 1.01307237, "balance_loss_mlp": 0.0, "epoch": 0.5403715505320748, "flos": 22893771116160.0, "grad_norm": 1.8957152432510225, "language_loss": 0.75915694, "learning_rate": 1.8358219324533217e-06, "loss": 0.78219491, "num_input_tokens_seen": 97013685, "step": 4494, "time_per_iteration": 3.6897222995758057 }, { "auxiliary_loss_clip": 0.01168387, "auxiliary_loss_mlp": 0.01023169, "balance_loss_clip": 0.97287267, "balance_loss_mlp": 1.01616502, "epoch": 0.5404917934227139, "flos": 30224143895040.0, "grad_norm": 1.711262792152538, "language_loss": 0.70192617, "learning_rate": 1.8350456023078292e-06, "loss": 0.72384173, "num_input_tokens_seen": 97036060, "step": 4495, "time_per_iteration": 2.7246758937835693 }, { "auxiliary_loss_clip": 0.01178909, "auxiliary_loss_mlp": 0.01024482, "balance_loss_clip": 1.05121279, "balance_loss_mlp": 1.01646531, "epoch": 0.540612036313353, "flos": 19938502615680.0, "grad_norm": 2.2408592939358947, "language_loss": 0.77749336, "learning_rate": 1.8342692971857874e-06, "loss": 0.79952729, "num_input_tokens_seen": 97055260, "step": 4496, "time_per_iteration": 2.611072540283203 }, { "auxiliary_loss_clip": 0.01171348, "auxiliary_loss_mlp": 0.01026914, "balance_loss_clip": 0.97547257, "balance_loss_mlp": 1.01959443, "epoch": 0.540732279203992, "flos": 24279599692800.0, "grad_norm": 2.6884794781548638, "language_loss": 0.71287394, "learning_rate": 1.833493017204962e-06, "loss": 0.73485661, "num_input_tokens_seen": 97075365, "step": 4497, "time_per_iteration": 2.737849473953247 }, { "auxiliary_loss_clip": 0.01177404, "auxiliary_loss_mlp": 0.01026349, "balance_loss_clip": 1.05235338, "balance_loss_mlp": 1.01885629, "epoch": 0.5408525220946312, "flos": 20193216935040.0, "grad_norm": 4.349173460847881, "language_loss": 0.77979296, "learning_rate": 1.8327167624831134e-06, "loss": 0.80183053, "num_input_tokens_seen": 97093095, "step": 4498, "time_per_iteration": 3.515509843826294 }, { "auxiliary_loss_clip": 0.01176535, "auxiliary_loss_mlp": 0.01028574, "balance_loss_clip": 1.05319333, "balance_loss_mlp": 1.02027738, "epoch": 0.5409727649852702, "flos": 24134448833280.0, "grad_norm": 1.6182322862727434, "language_loss": 0.70642781, "learning_rate": 1.831940533137999e-06, "loss": 0.72847891, "num_input_tokens_seen": 97112000, "step": 4499, "time_per_iteration": 2.629213333129883 }, { "auxiliary_loss_clip": 0.01176092, "auxiliary_loss_mlp": 0.010292, "balance_loss_clip": 1.01563585, "balance_loss_mlp": 1.02183855, "epoch": 0.5410930078759093, "flos": 23912700220800.0, "grad_norm": 2.1520726234473173, "language_loss": 0.72199297, "learning_rate": 1.8311643292873718e-06, "loss": 0.74404585, "num_input_tokens_seen": 97130820, "step": 4500, "time_per_iteration": 2.6958205699920654 }, { "auxiliary_loss_clip": 0.01169514, "auxiliary_loss_mlp": 0.01027376, "balance_loss_clip": 1.01380754, "balance_loss_mlp": 1.0204891, "epoch": 0.5412132507665485, "flos": 21105132445440.0, "grad_norm": 1.768618946479683, "language_loss": 0.88023567, "learning_rate": 1.8303881510489818e-06, "loss": 0.90220451, "num_input_tokens_seen": 97149210, "step": 4501, "time_per_iteration": 2.6087028980255127 }, { "auxiliary_loss_clip": 0.01175039, "auxiliary_loss_mlp": 0.01024836, "balance_loss_clip": 0.97568166, "balance_loss_mlp": 1.01759076, "epoch": 0.5413334936571875, "flos": 30227340205440.0, "grad_norm": 1.9245039294541366, "language_loss": 0.68946528, "learning_rate": 1.829611998540574e-06, "loss": 0.71146405, "num_input_tokens_seen": 97170415, "step": 4502, "time_per_iteration": 2.739171028137207 }, { "auxiliary_loss_clip": 0.01171699, "auxiliary_loss_mlp": 0.01122967, "balance_loss_clip": 1.0102824, "balance_loss_mlp": 0.0, "epoch": 0.5414537365478266, "flos": 24279635606400.0, "grad_norm": 1.696850602643885, "language_loss": 0.79443377, "learning_rate": 1.8288358718798914e-06, "loss": 0.81738043, "num_input_tokens_seen": 97189605, "step": 4503, "time_per_iteration": 2.692054271697998 }, { "auxiliary_loss_clip": 0.01171754, "auxiliary_loss_mlp": 0.01122943, "balance_loss_clip": 1.01415896, "balance_loss_mlp": 0.0, "epoch": 0.5415739794384657, "flos": 16654543735680.0, "grad_norm": 1.7175384019862086, "language_loss": 0.72592366, "learning_rate": 1.8280597711846703e-06, "loss": 0.74887061, "num_input_tokens_seen": 97207845, "step": 4504, "time_per_iteration": 2.660144329071045 }, { "auxiliary_loss_clip": 0.01174074, "auxiliary_loss_mlp": 0.01030215, "balance_loss_clip": 1.01397979, "balance_loss_mlp": 1.02245748, "epoch": 0.5416942223291048, "flos": 23185724860800.0, "grad_norm": 1.7814429985361522, "language_loss": 0.83333892, "learning_rate": 1.8272836965726455e-06, "loss": 0.85538179, "num_input_tokens_seen": 97226780, "step": 4505, "time_per_iteration": 2.6728224754333496 }, { "auxiliary_loss_clip": 0.01164183, "auxiliary_loss_mlp": 0.01030915, "balance_loss_clip": 0.85438335, "balance_loss_mlp": 1.02245736, "epoch": 0.5418144652197439, "flos": 20303247271680.0, "grad_norm": 1.80580381205194, "language_loss": 0.78302014, "learning_rate": 1.8265076481615461e-06, "loss": 0.8049711, "num_input_tokens_seen": 97246695, "step": 4506, "time_per_iteration": 2.924346923828125 }, { "auxiliary_loss_clip": 0.01175276, "auxiliary_loss_mlp": 0.01030112, "balance_loss_clip": 0.97833598, "balance_loss_mlp": 1.02175534, "epoch": 0.541934708110383, "flos": 12458633431680.0, "grad_norm": 6.684273886854852, "language_loss": 0.86793184, "learning_rate": 1.8257316260690987e-06, "loss": 0.88998568, "num_input_tokens_seen": 97264480, "step": 4507, "time_per_iteration": 2.6934430599212646 }, { "auxiliary_loss_clip": 0.01176244, "auxiliary_loss_mlp": 0.01029742, "balance_loss_clip": 1.01333332, "balance_loss_mlp": 1.02282155, "epoch": 0.5420549510010221, "flos": 21253802837760.0, "grad_norm": 1.4495018435377771, "language_loss": 0.76195264, "learning_rate": 1.8249556304130254e-06, "loss": 0.78401256, "num_input_tokens_seen": 97285760, "step": 4508, "time_per_iteration": 2.6912269592285156 }, { "auxiliary_loss_clip": 0.01160488, "auxiliary_loss_mlp": 0.01023869, "balance_loss_clip": 0.97069275, "balance_loss_mlp": 1.01609087, "epoch": 0.5421751938916611, "flos": 29490524519040.0, "grad_norm": 1.9657680457449707, "language_loss": 0.68487048, "learning_rate": 1.824179661311044e-06, "loss": 0.70671403, "num_input_tokens_seen": 97304510, "step": 4509, "time_per_iteration": 2.72220778465271 }, { "auxiliary_loss_clip": 0.01165764, "auxiliary_loss_mlp": 0.01033075, "balance_loss_clip": 0.89322937, "balance_loss_mlp": 1.0254935, "epoch": 0.5422954367823003, "flos": 18734238311040.0, "grad_norm": 1.966968897170057, "language_loss": 0.80005044, "learning_rate": 1.823403718880868e-06, "loss": 0.82203883, "num_input_tokens_seen": 97323270, "step": 4510, "time_per_iteration": 2.7411866188049316 }, { "auxiliary_loss_clip": 0.01169332, "auxiliary_loss_mlp": 0.0102784, "balance_loss_clip": 0.97041273, "balance_loss_mlp": 1.02044892, "epoch": 0.5424156796729394, "flos": 39969006940800.0, "grad_norm": 1.6115406690379366, "language_loss": 0.66332734, "learning_rate": 1.822627803240207e-06, "loss": 0.6852991, "num_input_tokens_seen": 97345600, "step": 4511, "time_per_iteration": 2.812331438064575 }, { "auxiliary_loss_clip": 0.01177178, "auxiliary_loss_mlp": 0.01030736, "balance_loss_clip": 0.93666941, "balance_loss_mlp": 1.0232079, "epoch": 0.5425359225635784, "flos": 11546538353280.0, "grad_norm": 1.9869339843036018, "language_loss": 0.84855676, "learning_rate": 1.8218519145067675e-06, "loss": 0.87063587, "num_input_tokens_seen": 97361220, "step": 4512, "time_per_iteration": 2.656986713409424 }, { "auxiliary_loss_clip": 0.0116519, "auxiliary_loss_mlp": 0.01028144, "balance_loss_clip": 0.9344722, "balance_loss_mlp": 1.02040398, "epoch": 0.5426561654542175, "flos": 20229702174720.0, "grad_norm": 1.8482775140224128, "language_loss": 0.89574754, "learning_rate": 1.8210760527982508e-06, "loss": 0.91768086, "num_input_tokens_seen": 97381505, "step": 4513, "time_per_iteration": 2.797795295715332 }, { "auxiliary_loss_clip": 0.01176627, "auxiliary_loss_mlp": 0.0112268, "balance_loss_clip": 0.9769243, "balance_loss_mlp": 0.0, "epoch": 0.5427764083448566, "flos": 21871681614720.0, "grad_norm": 1.7333984440565384, "language_loss": 0.75128341, "learning_rate": 1.8203002182323552e-06, "loss": 0.77427649, "num_input_tokens_seen": 97399060, "step": 4514, "time_per_iteration": 3.6578288078308105 }, { "auxiliary_loss_clip": 0.01178369, "auxiliary_loss_mlp": 0.01035504, "balance_loss_clip": 0.97626805, "balance_loss_mlp": 1.02812505, "epoch": 0.5428966512354957, "flos": 19640946349440.0, "grad_norm": 1.6557563097158754, "language_loss": 0.75568533, "learning_rate": 1.819524410926773e-06, "loss": 0.77782404, "num_input_tokens_seen": 97416740, "step": 4515, "time_per_iteration": 2.6507580280303955 }, { "auxiliary_loss_clip": 0.01166311, "auxiliary_loss_mlp": 0.0102803, "balance_loss_clip": 0.85926592, "balance_loss_mlp": 1.02032328, "epoch": 0.5430168941261347, "flos": 22382187661440.0, "grad_norm": 1.6001740020390396, "language_loss": 0.76787865, "learning_rate": 1.8187486309991944e-06, "loss": 0.78982204, "num_input_tokens_seen": 97437620, "step": 4516, "time_per_iteration": 3.825134754180908 }, { "auxiliary_loss_clip": 0.01180363, "auxiliary_loss_mlp": 0.01030278, "balance_loss_clip": 1.01388633, "balance_loss_mlp": 1.02316153, "epoch": 0.5431371370167739, "flos": 18764187275520.0, "grad_norm": 1.8077171998963375, "language_loss": 0.77356327, "learning_rate": 1.817972878567304e-06, "loss": 0.79566973, "num_input_tokens_seen": 97456275, "step": 4517, "time_per_iteration": 2.645902395248413 }, { "auxiliary_loss_clip": 0.01176641, "auxiliary_loss_mlp": 0.01022029, "balance_loss_clip": 0.97296607, "balance_loss_mlp": 1.0150013, "epoch": 0.543257379907413, "flos": 18806023641600.0, "grad_norm": 1.6842932357843434, "language_loss": 0.76180017, "learning_rate": 1.8171971537487834e-06, "loss": 0.78378683, "num_input_tokens_seen": 97474925, "step": 4518, "time_per_iteration": 2.7064061164855957 }, { "auxiliary_loss_clip": 0.01175955, "auxiliary_loss_mlp": 0.01026544, "balance_loss_clip": 1.04979765, "balance_loss_mlp": 1.01851487, "epoch": 0.543377622798052, "flos": 17493381025920.0, "grad_norm": 2.162753702113439, "language_loss": 0.80520213, "learning_rate": 1.8164214566613093e-06, "loss": 0.82722712, "num_input_tokens_seen": 97493550, "step": 4519, "time_per_iteration": 3.667018413543701 }, { "auxiliary_loss_clip": 0.0117162, "auxiliary_loss_mlp": 0.01031102, "balance_loss_clip": 1.04925799, "balance_loss_mlp": 1.02330577, "epoch": 0.5434978656886912, "flos": 18989311766400.0, "grad_norm": 2.5508339104383504, "language_loss": 0.65884477, "learning_rate": 1.8156457874225547e-06, "loss": 0.68087196, "num_input_tokens_seen": 97512010, "step": 4520, "time_per_iteration": 2.7294063568115234 }, { "auxiliary_loss_clip": 0.01168065, "auxiliary_loss_mlp": 0.01025442, "balance_loss_clip": 0.97604877, "balance_loss_mlp": 1.01816452, "epoch": 0.5436181085793302, "flos": 17274936464640.0, "grad_norm": 1.834788663508302, "language_loss": 0.80685198, "learning_rate": 1.814870146150187e-06, "loss": 0.82878709, "num_input_tokens_seen": 97530120, "step": 4521, "time_per_iteration": 2.6451730728149414 }, { "auxiliary_loss_clip": 0.01181024, "auxiliary_loss_mlp": 0.01024107, "balance_loss_clip": 0.97470766, "balance_loss_mlp": 1.01651347, "epoch": 0.5437383514699693, "flos": 19098587917440.0, "grad_norm": 1.9877433010908132, "language_loss": 0.78455853, "learning_rate": 1.814094532961871e-06, "loss": 0.80660987, "num_input_tokens_seen": 97548695, "step": 4522, "time_per_iteration": 2.6134426593780518 }, { "auxiliary_loss_clip": 0.01170514, "auxiliary_loss_mlp": 0.01029967, "balance_loss_clip": 0.89408457, "balance_loss_mlp": 1.02222395, "epoch": 0.5438585943606085, "flos": 22602715211520.0, "grad_norm": 1.8384308692238116, "language_loss": 0.83439642, "learning_rate": 1.8133189479752666e-06, "loss": 0.85640121, "num_input_tokens_seen": 97567625, "step": 4523, "time_per_iteration": 2.771425485610962 }, { "auxiliary_loss_clip": 0.01176482, "auxiliary_loss_mlp": 0.01025123, "balance_loss_clip": 1.05209649, "balance_loss_mlp": 1.01822984, "epoch": 0.5439788372512475, "flos": 21798495653760.0, "grad_norm": 1.9782442116962915, "language_loss": 0.8159368, "learning_rate": 1.8125433913080292e-06, "loss": 0.83795285, "num_input_tokens_seen": 97585325, "step": 4524, "time_per_iteration": 3.5453426837921143 }, { "auxiliary_loss_clip": 0.01155828, "auxiliary_loss_mlp": 0.01030293, "balance_loss_clip": 0.74194151, "balance_loss_mlp": 1.02318239, "epoch": 0.5440990801418866, "flos": 16399362539520.0, "grad_norm": 2.041498967250096, "language_loss": 0.82878661, "learning_rate": 1.811767863077811e-06, "loss": 0.85064781, "num_input_tokens_seen": 97604275, "step": 4525, "time_per_iteration": 3.050366163253784 }, { "auxiliary_loss_clip": 0.01162248, "auxiliary_loss_mlp": 0.01024685, "balance_loss_clip": 0.85933834, "balance_loss_mlp": 1.01749969, "epoch": 0.5442193230325257, "flos": 21615638492160.0, "grad_norm": 1.5510041061245226, "language_loss": 0.78399694, "learning_rate": 1.8109923634022577e-06, "loss": 0.8058663, "num_input_tokens_seen": 97624300, "step": 4526, "time_per_iteration": 2.969479560852051 }, { "auxiliary_loss_clip": 0.01177216, "auxiliary_loss_mlp": 0.0102937, "balance_loss_clip": 1.05080581, "balance_loss_mlp": 1.02194965, "epoch": 0.5443395659231648, "flos": 15481198062720.0, "grad_norm": 2.2406795239731965, "language_loss": 0.86131263, "learning_rate": 1.8102168923990128e-06, "loss": 0.88337851, "num_input_tokens_seen": 97637845, "step": 4527, "time_per_iteration": 2.578857421875 }, { "auxiliary_loss_clip": 0.01178699, "auxiliary_loss_mlp": 0.0112257, "balance_loss_clip": 1.01432681, "balance_loss_mlp": 0.0, "epoch": 0.5444598088138038, "flos": 18770436241920.0, "grad_norm": 1.7436190610408238, "language_loss": 0.79974151, "learning_rate": 1.809441450185714e-06, "loss": 0.8227542, "num_input_tokens_seen": 97656330, "step": 4528, "time_per_iteration": 2.712008476257324 }, { "auxiliary_loss_clip": 0.01175056, "auxiliary_loss_mlp": 0.01025852, "balance_loss_clip": 0.9704538, "balance_loss_mlp": 1.01881588, "epoch": 0.544580051704443, "flos": 21142335957120.0, "grad_norm": 2.123773620887498, "language_loss": 0.72942007, "learning_rate": 1.8086660368799958e-06, "loss": 0.75142908, "num_input_tokens_seen": 97674380, "step": 4529, "time_per_iteration": 2.6839191913604736 }, { "auxiliary_loss_clip": 0.01175568, "auxiliary_loss_mlp": 0.01028522, "balance_loss_clip": 0.97428876, "balance_loss_mlp": 1.02092266, "epoch": 0.5447002945950821, "flos": 32491508054400.0, "grad_norm": 1.63766792420767, "language_loss": 0.77114534, "learning_rate": 1.807890652599488e-06, "loss": 0.79318619, "num_input_tokens_seen": 97698765, "step": 4530, "time_per_iteration": 3.426009178161621 }, { "auxiliary_loss_clip": 0.01173684, "auxiliary_loss_mlp": 0.01025364, "balance_loss_clip": 1.05126226, "balance_loss_mlp": 1.01844084, "epoch": 0.5448205374857211, "flos": 11798307757440.0, "grad_norm": 1.8999275287796757, "language_loss": 0.82356691, "learning_rate": 1.8071152974618156e-06, "loss": 0.84555733, "num_input_tokens_seen": 97716565, "step": 4531, "time_per_iteration": 2.634549617767334 }, { "auxiliary_loss_clip": 0.01171795, "auxiliary_loss_mlp": 0.01122768, "balance_loss_clip": 0.93294072, "balance_loss_mlp": 0.0, "epoch": 0.5449407803763603, "flos": 24133766474880.0, "grad_norm": 2.153038445079365, "language_loss": 0.78286183, "learning_rate": 1.806339971584599e-06, "loss": 0.80580741, "num_input_tokens_seen": 97733225, "step": 4532, "time_per_iteration": 2.733930826187134 }, { "auxiliary_loss_clip": 0.01176208, "auxiliary_loss_mlp": 0.01028152, "balance_loss_clip": 1.05164039, "balance_loss_mlp": 1.02096939, "epoch": 0.5450610232669993, "flos": 23258551685760.0, "grad_norm": 1.703110592673255, "language_loss": 0.85320079, "learning_rate": 1.8055646750854546e-06, "loss": 0.87524438, "num_input_tokens_seen": 97752735, "step": 4533, "time_per_iteration": 2.66784405708313 }, { "auxiliary_loss_clip": 0.01179404, "auxiliary_loss_mlp": 0.0102966, "balance_loss_clip": 0.97579992, "balance_loss_mlp": 1.02176237, "epoch": 0.5451812661576384, "flos": 17785083375360.0, "grad_norm": 2.1697795204871864, "language_loss": 0.81677961, "learning_rate": 1.8047894080819945e-06, "loss": 0.83887017, "num_input_tokens_seen": 97769985, "step": 4534, "time_per_iteration": 2.616541624069214 }, { "auxiliary_loss_clip": 0.01080059, "auxiliary_loss_mlp": 0.01005153, "balance_loss_clip": 1.02181435, "balance_loss_mlp": 1.00346017, "epoch": 0.5453015090482776, "flos": 71062586513280.0, "grad_norm": 0.7265061635175725, "language_loss": 0.63223684, "learning_rate": 1.8040141706918258e-06, "loss": 0.65308893, "num_input_tokens_seen": 97831225, "step": 4535, "time_per_iteration": 3.325915813446045 }, { "auxiliary_loss_clip": 0.01178127, "auxiliary_loss_mlp": 0.01026521, "balance_loss_clip": 0.97758466, "balance_loss_mlp": 1.01898742, "epoch": 0.5454217519389166, "flos": 25552201622400.0, "grad_norm": 1.6719191684765227, "language_loss": 0.76723278, "learning_rate": 1.8032389630325525e-06, "loss": 0.78927934, "num_input_tokens_seen": 97849975, "step": 4536, "time_per_iteration": 2.726456880569458 }, { "auxiliary_loss_clip": 0.01171369, "auxiliary_loss_mlp": 0.01029249, "balance_loss_clip": 0.97115201, "balance_loss_mlp": 1.02179205, "epoch": 0.5455419948295557, "flos": 23658345037440.0, "grad_norm": 1.555448533144301, "language_loss": 0.75478524, "learning_rate": 1.8024637852217707e-06, "loss": 0.77679145, "num_input_tokens_seen": 97869700, "step": 4537, "time_per_iteration": 2.6975786685943604 }, { "auxiliary_loss_clip": 0.01176551, "auxiliary_loss_mlp": 0.01027785, "balance_loss_clip": 0.97659367, "balance_loss_mlp": 1.02090979, "epoch": 0.5456622377201948, "flos": 23403989854080.0, "grad_norm": 1.6283907473560677, "language_loss": 0.84424567, "learning_rate": 1.8016886373770766e-06, "loss": 0.86628902, "num_input_tokens_seen": 97888215, "step": 4538, "time_per_iteration": 2.68530011177063 }, { "auxiliary_loss_clip": 0.01169874, "auxiliary_loss_mlp": 0.01028035, "balance_loss_clip": 0.97347248, "balance_loss_mlp": 1.02029228, "epoch": 0.5457824806108339, "flos": 23988040997760.0, "grad_norm": 1.605939172476789, "language_loss": 0.7906847, "learning_rate": 1.8009135196160579e-06, "loss": 0.81266379, "num_input_tokens_seen": 97907090, "step": 4539, "time_per_iteration": 3.6937475204467773 }, { "auxiliary_loss_clip": 0.01166822, "auxiliary_loss_mlp": 0.01026525, "balance_loss_clip": 0.93525976, "balance_loss_mlp": 1.01948524, "epoch": 0.545902723501473, "flos": 22565870835840.0, "grad_norm": 1.8168645193206585, "language_loss": 0.83937877, "learning_rate": 1.8001384320563e-06, "loss": 0.86131221, "num_input_tokens_seen": 97927345, "step": 4540, "time_per_iteration": 2.7487196922302246 }, { "auxiliary_loss_clip": 0.01078887, "auxiliary_loss_mlp": 0.0100425, "balance_loss_clip": 1.02083111, "balance_loss_mlp": 1.002581, "epoch": 0.5460229663921121, "flos": 55198399685760.0, "grad_norm": 0.772296205842347, "language_loss": 0.57891464, "learning_rate": 1.7993633748153833e-06, "loss": 0.59974599, "num_input_tokens_seen": 97981950, "step": 4541, "time_per_iteration": 3.1754908561706543 }, { "auxiliary_loss_clip": 0.01179295, "auxiliary_loss_mlp": 0.0102427, "balance_loss_clip": 1.01316631, "balance_loss_mlp": 1.01724863, "epoch": 0.5461432092827512, "flos": 15413866018560.0, "grad_norm": 1.7736352180502037, "language_loss": 0.72689593, "learning_rate": 1.7985883480108834e-06, "loss": 0.74893159, "num_input_tokens_seen": 97999585, "step": 4542, "time_per_iteration": 3.919572114944458 }, { "auxiliary_loss_clip": 0.01170532, "auxiliary_loss_mlp": 0.010277, "balance_loss_clip": 1.01369524, "balance_loss_mlp": 1.0199455, "epoch": 0.5462634521733902, "flos": 24024921287040.0, "grad_norm": 1.5140074148186125, "language_loss": 0.72320449, "learning_rate": 1.797813351760371e-06, "loss": 0.74518687, "num_input_tokens_seen": 98021290, "step": 4543, "time_per_iteration": 2.8116278648376465 }, { "auxiliary_loss_clip": 0.01176362, "auxiliary_loss_mlp": 0.01027684, "balance_loss_clip": 1.05139709, "balance_loss_mlp": 1.01994109, "epoch": 0.5463836950640293, "flos": 22820944291200.0, "grad_norm": 1.8988315998827008, "language_loss": 0.78051829, "learning_rate": 1.7970383861814116e-06, "loss": 0.80255878, "num_input_tokens_seen": 98041060, "step": 4544, "time_per_iteration": 2.667147397994995 }, { "auxiliary_loss_clip": 0.01177123, "auxiliary_loss_mlp": 0.01032982, "balance_loss_clip": 1.01479435, "balance_loss_mlp": 1.02532828, "epoch": 0.5465039379546685, "flos": 20448290390400.0, "grad_norm": 2.0991554573876767, "language_loss": 0.74023288, "learning_rate": 1.7962634513915684e-06, "loss": 0.76233393, "num_input_tokens_seen": 98058410, "step": 4545, "time_per_iteration": 3.643336057662964 }, { "auxiliary_loss_clip": 0.01176196, "auxiliary_loss_mlp": 0.01027452, "balance_loss_clip": 1.05180657, "balance_loss_mlp": 1.02039814, "epoch": 0.5466241808453075, "flos": 17343310003200.0, "grad_norm": 1.5502106069908697, "language_loss": 0.79421103, "learning_rate": 1.7954885475083969e-06, "loss": 0.81624752, "num_input_tokens_seen": 98076080, "step": 4546, "time_per_iteration": 2.600972890853882 }, { "auxiliary_loss_clip": 0.01178785, "auxiliary_loss_mlp": 0.01027051, "balance_loss_clip": 1.05240667, "balance_loss_mlp": 1.0191946, "epoch": 0.5467444237359466, "flos": 21617039122560.0, "grad_norm": 2.008552739490441, "language_loss": 0.72706461, "learning_rate": 1.7947136746494513e-06, "loss": 0.74912292, "num_input_tokens_seen": 98096995, "step": 4547, "time_per_iteration": 2.606633424758911 }, { "auxiliary_loss_clip": 0.01173313, "auxiliary_loss_mlp": 0.01024672, "balance_loss_clip": 1.01474214, "balance_loss_mlp": 1.01672101, "epoch": 0.5468646666265857, "flos": 24170467196160.0, "grad_norm": 1.9312604726171245, "language_loss": 0.88236755, "learning_rate": 1.793938832932277e-06, "loss": 0.90434742, "num_input_tokens_seen": 98115105, "step": 4548, "time_per_iteration": 2.6548893451690674 }, { "auxiliary_loss_clip": 0.01174472, "auxiliary_loss_mlp": 0.01029917, "balance_loss_clip": 1.05059087, "balance_loss_mlp": 1.02275872, "epoch": 0.5469849095172248, "flos": 27527001505920.0, "grad_norm": 1.75908595286192, "language_loss": 0.70416701, "learning_rate": 1.7931640224744185e-06, "loss": 0.72621095, "num_input_tokens_seen": 98135655, "step": 4549, "time_per_iteration": 2.64452862739563 }, { "auxiliary_loss_clip": 0.01157591, "auxiliary_loss_mlp": 0.01026434, "balance_loss_clip": 0.93031359, "balance_loss_mlp": 1.01887929, "epoch": 0.5471051524078638, "flos": 27964680727680.0, "grad_norm": 1.4690718814564754, "language_loss": 0.73710543, "learning_rate": 1.7923892433934127e-06, "loss": 0.7589457, "num_input_tokens_seen": 98156730, "step": 4550, "time_per_iteration": 3.6835551261901855 }, { "auxiliary_loss_clip": 0.0117639, "auxiliary_loss_mlp": 0.0112331, "balance_loss_clip": 0.97593856, "balance_loss_mlp": 0.0, "epoch": 0.547225395298503, "flos": 18150510389760.0, "grad_norm": 2.6870365865263204, "language_loss": 0.78785241, "learning_rate": 1.7916144958067939e-06, "loss": 0.81084943, "num_input_tokens_seen": 98174590, "step": 4551, "time_per_iteration": 2.7090983390808105 }, { "auxiliary_loss_clip": 0.01174366, "auxiliary_loss_mlp": 0.01031338, "balance_loss_clip": 1.01148331, "balance_loss_mlp": 1.02349997, "epoch": 0.5473456381891421, "flos": 21361498790400.0, "grad_norm": 1.636380960497627, "language_loss": 0.7903772, "learning_rate": 1.7908397798320905e-06, "loss": 0.81243426, "num_input_tokens_seen": 98194325, "step": 4552, "time_per_iteration": 2.644164562225342 }, { "auxiliary_loss_clip": 0.01174692, "auxiliary_loss_mlp": 0.01123527, "balance_loss_clip": 1.01196289, "balance_loss_mlp": 0.0, "epoch": 0.5474658810797811, "flos": 19932145908480.0, "grad_norm": 2.095677284868044, "language_loss": 0.74949086, "learning_rate": 1.7900650955868265e-06, "loss": 0.7724731, "num_input_tokens_seen": 98213970, "step": 4553, "time_per_iteration": 2.6406326293945312 }, { "auxiliary_loss_clip": 0.01174549, "auxiliary_loss_mlp": 0.01122807, "balance_loss_clip": 1.01448631, "balance_loss_mlp": 0.0, "epoch": 0.5475861239704203, "flos": 50476217264640.0, "grad_norm": 1.3755020765127866, "language_loss": 0.76536191, "learning_rate": 1.7892904431885202e-06, "loss": 0.7883355, "num_input_tokens_seen": 98241145, "step": 4554, "time_per_iteration": 2.902128219604492 }, { "auxiliary_loss_clip": 0.01167752, "auxiliary_loss_mlp": 0.01026441, "balance_loss_clip": 0.89501137, "balance_loss_mlp": 1.01870465, "epoch": 0.5477063668610593, "flos": 20705123612160.0, "grad_norm": 1.6873951287372453, "language_loss": 0.75475812, "learning_rate": 1.788515822754686e-06, "loss": 0.77670002, "num_input_tokens_seen": 98261565, "step": 4555, "time_per_iteration": 2.753251791000366 }, { "auxiliary_loss_clip": 0.01170325, "auxiliary_loss_mlp": 0.01030813, "balance_loss_clip": 0.93354046, "balance_loss_mlp": 1.02202129, "epoch": 0.5478266097516984, "flos": 19609740408960.0, "grad_norm": 1.8258056897101305, "language_loss": 0.78151876, "learning_rate": 1.7877412344028335e-06, "loss": 0.8035301, "num_input_tokens_seen": 98281370, "step": 4556, "time_per_iteration": 2.6966466903686523 }, { "auxiliary_loss_clip": 0.01176585, "auxiliary_loss_mlp": 0.01027137, "balance_loss_clip": 1.01372099, "balance_loss_mlp": 1.01986194, "epoch": 0.5479468526423376, "flos": 12896599962240.0, "grad_norm": 2.3938555755131437, "language_loss": 0.77276731, "learning_rate": 1.7869666782504668e-06, "loss": 0.79480457, "num_input_tokens_seen": 98297950, "step": 4557, "time_per_iteration": 2.657944440841675 }, { "auxiliary_loss_clip": 0.01161483, "auxiliary_loss_mlp": 0.0102572, "balance_loss_clip": 0.97118139, "balance_loss_mlp": 1.01841271, "epoch": 0.5480670955329766, "flos": 18588800142720.0, "grad_norm": 1.916159954012353, "language_loss": 0.68827564, "learning_rate": 1.7861921544150867e-06, "loss": 0.71014774, "num_input_tokens_seen": 98316800, "step": 4558, "time_per_iteration": 2.702550172805786 }, { "auxiliary_loss_clip": 0.01156365, "auxiliary_loss_mlp": 0.01122968, "balance_loss_clip": 0.85882831, "balance_loss_mlp": 0.0, "epoch": 0.5481873384236157, "flos": 15954608338560.0, "grad_norm": 2.262030781485172, "language_loss": 0.76221383, "learning_rate": 1.7854176630141856e-06, "loss": 0.78500712, "num_input_tokens_seen": 98333935, "step": 4559, "time_per_iteration": 2.7546727657318115 }, { "auxiliary_loss_clip": 0.01179234, "auxiliary_loss_mlp": 0.01031356, "balance_loss_clip": 1.05206978, "balance_loss_mlp": 1.02305961, "epoch": 0.5483075813142548, "flos": 22783812606720.0, "grad_norm": 2.3021120063559413, "language_loss": 0.84553158, "learning_rate": 1.784643204165255e-06, "loss": 0.86763746, "num_input_tokens_seen": 98353255, "step": 4560, "time_per_iteration": 2.609367847442627 }, { "auxiliary_loss_clip": 0.01171959, "auxiliary_loss_mlp": 0.01028624, "balance_loss_clip": 1.01387024, "balance_loss_mlp": 1.02157271, "epoch": 0.5484278242048939, "flos": 19317212046720.0, "grad_norm": 2.2210536598912465, "language_loss": 0.77045023, "learning_rate": 1.7838687779857783e-06, "loss": 0.79245603, "num_input_tokens_seen": 98371130, "step": 4561, "time_per_iteration": 2.608546018600464 }, { "auxiliary_loss_clip": 0.01163145, "auxiliary_loss_mlp": 0.01025182, "balance_loss_clip": 0.97063953, "balance_loss_mlp": 1.01767528, "epoch": 0.5485480670955329, "flos": 22816024128000.0, "grad_norm": 2.187950531908411, "language_loss": 0.64064765, "learning_rate": 1.7830943845932366e-06, "loss": 0.6625309, "num_input_tokens_seen": 98390455, "step": 4562, "time_per_iteration": 2.6406452655792236 }, { "auxiliary_loss_clip": 0.0117788, "auxiliary_loss_mlp": 0.01024186, "balance_loss_clip": 0.975389, "balance_loss_mlp": 1.01661611, "epoch": 0.5486683099861721, "flos": 22671304231680.0, "grad_norm": 1.6324575848669702, "language_loss": 0.75272155, "learning_rate": 1.7823200241051044e-06, "loss": 0.77474225, "num_input_tokens_seen": 98409370, "step": 4563, "time_per_iteration": 2.7243690490722656 }, { "auxiliary_loss_clip": 0.01176324, "auxiliary_loss_mlp": 0.01025423, "balance_loss_clip": 1.05154669, "balance_loss_mlp": 1.018062, "epoch": 0.5487885528768112, "flos": 23149383275520.0, "grad_norm": 1.7612610542800171, "language_loss": 0.80545497, "learning_rate": 1.7815456966388513e-06, "loss": 0.82747245, "num_input_tokens_seen": 98428465, "step": 4564, "time_per_iteration": 2.657492160797119 }, { "auxiliary_loss_clip": 0.01172206, "auxiliary_loss_mlp": 0.01026637, "balance_loss_clip": 0.93352902, "balance_loss_mlp": 1.0181191, "epoch": 0.5489087957674502, "flos": 22053928245120.0, "grad_norm": 2.1015356699156875, "language_loss": 0.81025708, "learning_rate": 1.780771402311943e-06, "loss": 0.83224553, "num_input_tokens_seen": 98447300, "step": 4565, "time_per_iteration": 2.680356502532959 }, { "auxiliary_loss_clip": 0.01175487, "auxiliary_loss_mlp": 0.01026127, "balance_loss_clip": 0.97625685, "balance_loss_mlp": 1.01843786, "epoch": 0.5490290386580894, "flos": 24315977191680.0, "grad_norm": 1.6824986905686878, "language_loss": 0.7886498, "learning_rate": 1.7799971412418374e-06, "loss": 0.81066597, "num_input_tokens_seen": 98468695, "step": 4566, "time_per_iteration": 3.9915363788604736 }, { "auxiliary_loss_clip": 0.01171333, "auxiliary_loss_mlp": 0.01026961, "balance_loss_clip": 0.93472207, "balance_loss_mlp": 1.01885152, "epoch": 0.5491492815487284, "flos": 18294942977280.0, "grad_norm": 2.098477128763159, "language_loss": 0.73724341, "learning_rate": 1.7792229135459918e-06, "loss": 0.75922638, "num_input_tokens_seen": 98485345, "step": 4567, "time_per_iteration": 2.654310941696167 }, { "auxiliary_loss_clip": 0.01089691, "auxiliary_loss_mlp": 0.01004276, "balance_loss_clip": 0.88324785, "balance_loss_mlp": 1.00244009, "epoch": 0.5492695244393675, "flos": 64550257050240.0, "grad_norm": 0.7332107769124592, "language_loss": 0.61671722, "learning_rate": 1.7784487193418538e-06, "loss": 0.63765693, "num_input_tokens_seen": 98543195, "step": 4568, "time_per_iteration": 4.1942760944366455 }, { "auxiliary_loss_clip": 0.01152477, "auxiliary_loss_mlp": 0.01030868, "balance_loss_clip": 0.92859226, "balance_loss_mlp": 1.02305436, "epoch": 0.5493897673300067, "flos": 17379579761280.0, "grad_norm": 1.9985790150830982, "language_loss": 0.60854304, "learning_rate": 1.7776745587468698e-06, "loss": 0.63037646, "num_input_tokens_seen": 98560620, "step": 4569, "time_per_iteration": 2.71940279006958 }, { "auxiliary_loss_clip": 0.01174261, "auxiliary_loss_mlp": 0.01028811, "balance_loss_clip": 1.04930758, "balance_loss_mlp": 1.02126837, "epoch": 0.5495100102206457, "flos": 19901765980800.0, "grad_norm": 2.064893399502752, "language_loss": 0.817873, "learning_rate": 1.7769004318784776e-06, "loss": 0.83990377, "num_input_tokens_seen": 98578265, "step": 4570, "time_per_iteration": 2.6464736461639404 }, { "auxiliary_loss_clip": 0.01174159, "auxiliary_loss_mlp": 0.01023177, "balance_loss_clip": 1.01226902, "balance_loss_mlp": 1.01573813, "epoch": 0.5496302531112848, "flos": 16727190992640.0, "grad_norm": 1.6722189764060558, "language_loss": 0.806714, "learning_rate": 1.776126338854113e-06, "loss": 0.82868737, "num_input_tokens_seen": 98596055, "step": 4571, "time_per_iteration": 3.5595107078552246 }, { "auxiliary_loss_clip": 0.01173591, "auxiliary_loss_mlp": 0.01030127, "balance_loss_clip": 1.0146035, "balance_loss_mlp": 1.02323651, "epoch": 0.5497504960019239, "flos": 24572343536640.0, "grad_norm": 1.5186308869019698, "language_loss": 0.84390235, "learning_rate": 1.7753522797912044e-06, "loss": 0.86593956, "num_input_tokens_seen": 98616140, "step": 4572, "time_per_iteration": 2.936065912246704 }, { "auxiliary_loss_clip": 0.01176734, "auxiliary_loss_mlp": 0.01029112, "balance_loss_clip": 0.97144133, "balance_loss_mlp": 1.02136946, "epoch": 0.549870738892563, "flos": 15450494912640.0, "grad_norm": 1.9812520305336985, "language_loss": 0.69923574, "learning_rate": 1.7745782548071765e-06, "loss": 0.72129416, "num_input_tokens_seen": 98633035, "step": 4573, "time_per_iteration": 2.736241102218628 }, { "auxiliary_loss_clip": 0.01172704, "auxiliary_loss_mlp": 0.01027731, "balance_loss_clip": 0.9414916, "balance_loss_mlp": 1.02052224, "epoch": 0.549990981783202, "flos": 21069114082560.0, "grad_norm": 1.5535249077124589, "language_loss": 0.73960257, "learning_rate": 1.7738042640194482e-06, "loss": 0.76160693, "num_input_tokens_seen": 98652700, "step": 4574, "time_per_iteration": 2.686276435852051 }, { "auxiliary_loss_clip": 0.01175296, "auxiliary_loss_mlp": 0.0102008, "balance_loss_clip": 1.05104613, "balance_loss_mlp": 1.01302266, "epoch": 0.5501112246738411, "flos": 21395901041280.0, "grad_norm": 1.5842096598008277, "language_loss": 0.70305514, "learning_rate": 1.7730303075454335e-06, "loss": 0.7250089, "num_input_tokens_seen": 98671590, "step": 4575, "time_per_iteration": 2.6166276931762695 }, { "auxiliary_loss_clip": 0.01174355, "auxiliary_loss_mlp": 0.01027847, "balance_loss_clip": 0.93532497, "balance_loss_mlp": 1.02056026, "epoch": 0.5502314675644803, "flos": 17456931699840.0, "grad_norm": 1.808944597152072, "language_loss": 0.84909129, "learning_rate": 1.7722563855025402e-06, "loss": 0.8711133, "num_input_tokens_seen": 98689620, "step": 4576, "time_per_iteration": 3.647942066192627 }, { "auxiliary_loss_clip": 0.01170224, "auxiliary_loss_mlp": 0.0103345, "balance_loss_clip": 0.97096682, "balance_loss_mlp": 1.02541482, "epoch": 0.5503517104551193, "flos": 24310410583680.0, "grad_norm": 2.3242568355559454, "language_loss": 0.70832324, "learning_rate": 1.7714824980081721e-06, "loss": 0.73035997, "num_input_tokens_seen": 98708915, "step": 4577, "time_per_iteration": 2.702502489089966 }, { "auxiliary_loss_clip": 0.01171661, "auxiliary_loss_mlp": 0.01029573, "balance_loss_clip": 1.01367724, "balance_loss_mlp": 1.02169323, "epoch": 0.5504719533457584, "flos": 22419427086720.0, "grad_norm": 1.7258786586500496, "language_loss": 0.74136889, "learning_rate": 1.7707086451797276e-06, "loss": 0.76338124, "num_input_tokens_seen": 98729790, "step": 4578, "time_per_iteration": 2.6738874912261963 }, { "auxiliary_loss_clip": 0.01082221, "auxiliary_loss_mlp": 0.01003985, "balance_loss_clip": 0.90636522, "balance_loss_mlp": 1.00210178, "epoch": 0.5505921962363975, "flos": 67294155968640.0, "grad_norm": 0.7012205640638194, "language_loss": 0.52395421, "learning_rate": 1.7699348271345993e-06, "loss": 0.54481632, "num_input_tokens_seen": 98792415, "step": 4579, "time_per_iteration": 3.2287280559539795 }, { "auxiliary_loss_clip": 0.01089328, "auxiliary_loss_mlp": 0.01001876, "balance_loss_clip": 0.86923862, "balance_loss_mlp": 0.99995708, "epoch": 0.5507124391270366, "flos": 45685125578880.0, "grad_norm": 0.7123221283698115, "language_loss": 0.54504371, "learning_rate": 1.7691610439901753e-06, "loss": 0.56595576, "num_input_tokens_seen": 98855350, "step": 4580, "time_per_iteration": 3.351167917251587 }, { "auxiliary_loss_clip": 0.01179354, "auxiliary_loss_mlp": 0.01027154, "balance_loss_clip": 1.01503849, "balance_loss_mlp": 1.01949799, "epoch": 0.5508326820176757, "flos": 22273845264000.0, "grad_norm": 1.8829361244574536, "language_loss": 0.74914551, "learning_rate": 1.7683872958638367e-06, "loss": 0.77121067, "num_input_tokens_seen": 98874230, "step": 4581, "time_per_iteration": 2.6598165035247803 }, { "auxiliary_loss_clip": 0.01168936, "auxiliary_loss_mlp": 0.0102635, "balance_loss_clip": 0.97148073, "balance_loss_mlp": 1.01911378, "epoch": 0.5509529249083148, "flos": 20012442762240.0, "grad_norm": 1.9746558831261456, "language_loss": 0.84356582, "learning_rate": 1.7676135828729614e-06, "loss": 0.86551869, "num_input_tokens_seen": 98893940, "step": 4582, "time_per_iteration": 2.7135417461395264 }, { "auxiliary_loss_clip": 0.01174725, "auxiliary_loss_mlp": 0.01030931, "balance_loss_clip": 1.01349711, "balance_loss_mlp": 1.023278, "epoch": 0.5510731677989539, "flos": 21834801325440.0, "grad_norm": 2.040787505857809, "language_loss": 0.82815003, "learning_rate": 1.7668399051349205e-06, "loss": 0.85020655, "num_input_tokens_seen": 98913620, "step": 4583, "time_per_iteration": 2.6642723083496094 }, { "auxiliary_loss_clip": 0.01169037, "auxiliary_loss_mlp": 0.01027066, "balance_loss_clip": 0.93497324, "balance_loss_mlp": 1.01892376, "epoch": 0.5511934106895929, "flos": 21467901853440.0, "grad_norm": 2.502468809962186, "language_loss": 0.83213311, "learning_rate": 1.766066262767081e-06, "loss": 0.85409415, "num_input_tokens_seen": 98931460, "step": 4584, "time_per_iteration": 2.8164279460906982 }, { "auxiliary_loss_clip": 0.0116814, "auxiliary_loss_mlp": 0.01026636, "balance_loss_clip": 0.97389364, "balance_loss_mlp": 1.01886964, "epoch": 0.5513136535802321, "flos": 21068934514560.0, "grad_norm": 2.239313872592076, "language_loss": 0.77176821, "learning_rate": 1.765292655886803e-06, "loss": 0.79371601, "num_input_tokens_seen": 98950105, "step": 4585, "time_per_iteration": 2.659482002258301 }, { "auxiliary_loss_clip": 0.01179858, "auxiliary_loss_mlp": 0.01034233, "balance_loss_clip": 0.93499875, "balance_loss_mlp": 1.02660346, "epoch": 0.5514338964708712, "flos": 27815004754560.0, "grad_norm": 1.6531155045763637, "language_loss": 0.70406425, "learning_rate": 1.764519084611443e-06, "loss": 0.72620517, "num_input_tokens_seen": 98970560, "step": 4586, "time_per_iteration": 2.799955129623413 }, { "auxiliary_loss_clip": 0.011702, "auxiliary_loss_mlp": 0.01027158, "balance_loss_clip": 0.97227758, "balance_loss_mlp": 1.01918852, "epoch": 0.5515541393615102, "flos": 21908525990400.0, "grad_norm": 1.7091106536903062, "language_loss": 0.77525175, "learning_rate": 1.7637455490583505e-06, "loss": 0.79722536, "num_input_tokens_seen": 98989885, "step": 4587, "time_per_iteration": 2.6622867584228516 }, { "auxiliary_loss_clip": 0.0117241, "auxiliary_loss_mlp": 0.0102992, "balance_loss_clip": 1.01198387, "balance_loss_mlp": 1.02280366, "epoch": 0.5516743822521494, "flos": 20485422074880.0, "grad_norm": 1.9982203957518048, "language_loss": 0.7732408, "learning_rate": 1.7629720493448701e-06, "loss": 0.79526412, "num_input_tokens_seen": 99007180, "step": 4588, "time_per_iteration": 2.6199238300323486 }, { "auxiliary_loss_clip": 0.01180904, "auxiliary_loss_mlp": 0.01030747, "balance_loss_clip": 0.97584403, "balance_loss_mlp": 1.02405024, "epoch": 0.5517946251427884, "flos": 14940383915520.0, "grad_norm": 1.5449545349198728, "language_loss": 0.84951645, "learning_rate": 1.7621985855883418e-06, "loss": 0.87163299, "num_input_tokens_seen": 99023880, "step": 4589, "time_per_iteration": 2.637514591217041 }, { "auxiliary_loss_clip": 0.01168331, "auxiliary_loss_mlp": 0.0102828, "balance_loss_clip": 0.97367656, "balance_loss_mlp": 1.02114248, "epoch": 0.5519148680334275, "flos": 18404865573120.0, "grad_norm": 2.553458182709257, "language_loss": 0.72286755, "learning_rate": 1.7614251579060983e-06, "loss": 0.74483365, "num_input_tokens_seen": 99042475, "step": 4590, "time_per_iteration": 2.763655662536621 }, { "auxiliary_loss_clip": 0.01172908, "auxiliary_loss_mlp": 0.01025018, "balance_loss_clip": 0.93540937, "balance_loss_mlp": 1.01750779, "epoch": 0.5520351109240667, "flos": 25113337251840.0, "grad_norm": 2.055251425670415, "language_loss": 0.84411085, "learning_rate": 1.76065176641547e-06, "loss": 0.86609012, "num_input_tokens_seen": 99065185, "step": 4591, "time_per_iteration": 2.820150852203369 }, { "auxiliary_loss_clip": 0.01173914, "auxiliary_loss_mlp": 0.01021639, "balance_loss_clip": 1.01071763, "balance_loss_mlp": 1.01479948, "epoch": 0.5521553538147057, "flos": 21069545045760.0, "grad_norm": 4.296330765720839, "language_loss": 0.77471471, "learning_rate": 1.759878411233777e-06, "loss": 0.7966702, "num_input_tokens_seen": 99083645, "step": 4592, "time_per_iteration": 3.535870313644409 }, { "auxiliary_loss_clip": 0.01173622, "auxiliary_loss_mlp": 0.0102392, "balance_loss_clip": 1.01279688, "balance_loss_mlp": 1.01633191, "epoch": 0.5522755967053448, "flos": 18879999701760.0, "grad_norm": 2.637935263666629, "language_loss": 0.76087087, "learning_rate": 1.7591050924783388e-06, "loss": 0.78284633, "num_input_tokens_seen": 99100835, "step": 4593, "time_per_iteration": 2.671049118041992 }, { "auxiliary_loss_clip": 0.01080173, "auxiliary_loss_mlp": 0.01001319, "balance_loss_clip": 0.86714566, "balance_loss_mlp": 0.99964964, "epoch": 0.5523958395959839, "flos": 64675622494080.0, "grad_norm": 0.8320131636954873, "language_loss": 0.57913792, "learning_rate": 1.7583318102664661e-06, "loss": 0.59995282, "num_input_tokens_seen": 99168400, "step": 4594, "time_per_iteration": 3.353775978088379 }, { "auxiliary_loss_clip": 0.01174688, "auxiliary_loss_mlp": 0.01023458, "balance_loss_clip": 1.00954878, "balance_loss_mlp": 1.01593041, "epoch": 0.552516082486623, "flos": 10889732211840.0, "grad_norm": 1.845718073563209, "language_loss": 0.78798044, "learning_rate": 1.757558564715466e-06, "loss": 0.80996192, "num_input_tokens_seen": 99186475, "step": 4595, "time_per_iteration": 3.7518651485443115 }, { "auxiliary_loss_clip": 0.01172506, "auxiliary_loss_mlp": 0.01025369, "balance_loss_clip": 1.01049793, "balance_loss_mlp": 1.01762605, "epoch": 0.552636325377262, "flos": 22199797376640.0, "grad_norm": 3.0045373372968442, "language_loss": 0.73972118, "learning_rate": 1.7567853559426386e-06, "loss": 0.76169991, "num_input_tokens_seen": 99203525, "step": 4596, "time_per_iteration": 2.680792808532715 }, { "auxiliary_loss_clip": 0.01177341, "auxiliary_loss_mlp": 0.01022902, "balance_loss_clip": 1.01385355, "balance_loss_mlp": 1.01555908, "epoch": 0.5527565682679012, "flos": 23988184652160.0, "grad_norm": 1.872486206365953, "language_loss": 0.75450456, "learning_rate": 1.7560121840652797e-06, "loss": 0.77650696, "num_input_tokens_seen": 99222910, "step": 4597, "time_per_iteration": 3.6994097232818604 }, { "auxiliary_loss_clip": 0.01158868, "auxiliary_loss_mlp": 0.01022673, "balance_loss_clip": 0.93328178, "balance_loss_mlp": 1.01551175, "epoch": 0.5528768111585403, "flos": 19719267955200.0, "grad_norm": 1.7986180851475766, "language_loss": 0.69309908, "learning_rate": 1.7552390492006782e-06, "loss": 0.71491444, "num_input_tokens_seen": 99241230, "step": 4598, "time_per_iteration": 2.6972944736480713 }, { "auxiliary_loss_clip": 0.01174133, "auxiliary_loss_mlp": 0.01123087, "balance_loss_clip": 0.89418137, "balance_loss_mlp": 0.0, "epoch": 0.5529970540491793, "flos": 26215975002240.0, "grad_norm": 1.7052845526763487, "language_loss": 0.65013182, "learning_rate": 1.7544659514661184e-06, "loss": 0.67310393, "num_input_tokens_seen": 99264320, "step": 4599, "time_per_iteration": 2.8134331703186035 }, { "auxiliary_loss_clip": 0.01164705, "auxiliary_loss_mlp": 0.01025421, "balance_loss_clip": 0.97110009, "balance_loss_mlp": 1.01835489, "epoch": 0.5531172969398185, "flos": 24425971614720.0, "grad_norm": 2.1108560020802623, "language_loss": 0.79798776, "learning_rate": 1.7536928909788786e-06, "loss": 0.81988895, "num_input_tokens_seen": 99283625, "step": 4600, "time_per_iteration": 2.7388062477111816 }, { "auxiliary_loss_clip": 0.01087043, "auxiliary_loss_mlp": 0.01001387, "balance_loss_clip": 0.8663038, "balance_loss_mlp": 0.99947929, "epoch": 0.5532375398304575, "flos": 64907316195840.0, "grad_norm": 0.9374898142105726, "language_loss": 0.61982477, "learning_rate": 1.752919867856231e-06, "loss": 0.64070904, "num_input_tokens_seen": 99335270, "step": 4601, "time_per_iteration": 3.1736836433410645 }, { "auxiliary_loss_clip": 0.01162013, "auxiliary_loss_mlp": 0.01025319, "balance_loss_clip": 0.97041166, "balance_loss_mlp": 1.01810145, "epoch": 0.5533577827210966, "flos": 19683105937920.0, "grad_norm": 1.634851722048585, "language_loss": 0.7875486, "learning_rate": 1.7521468822154436e-06, "loss": 0.80942196, "num_input_tokens_seen": 99354185, "step": 4602, "time_per_iteration": 3.6234967708587646 }, { "auxiliary_loss_clip": 0.01168249, "auxiliary_loss_mlp": 0.01029414, "balance_loss_clip": 0.97577298, "balance_loss_mlp": 1.02200484, "epoch": 0.5534780256117358, "flos": 32306496076800.0, "grad_norm": 1.8485848912632932, "language_loss": 0.74900115, "learning_rate": 1.751373934173777e-06, "loss": 0.77097774, "num_input_tokens_seen": 99376930, "step": 4603, "time_per_iteration": 2.835470676422119 }, { "auxiliary_loss_clip": 0.0117566, "auxiliary_loss_mlp": 0.01026706, "balance_loss_clip": 1.04983056, "balance_loss_mlp": 1.01896334, "epoch": 0.5535982685023748, "flos": 23222425582080.0, "grad_norm": 1.8176231855121192, "language_loss": 0.72664905, "learning_rate": 1.750601023848487e-06, "loss": 0.74867272, "num_input_tokens_seen": 99397655, "step": 4604, "time_per_iteration": 2.628831148147583 }, { "auxiliary_loss_clip": 0.01172357, "auxiliary_loss_mlp": 0.01122697, "balance_loss_clip": 1.05023599, "balance_loss_mlp": 0.0, "epoch": 0.5537185113930139, "flos": 24352534258560.0, "grad_norm": 1.7939366361463975, "language_loss": 0.7403065, "learning_rate": 1.749828151356823e-06, "loss": 0.76325703, "num_input_tokens_seen": 99417850, "step": 4605, "time_per_iteration": 2.7795724868774414 }, { "auxiliary_loss_clip": 0.01172058, "auxiliary_loss_mlp": 0.01023489, "balance_loss_clip": 0.97360104, "balance_loss_mlp": 1.01628244, "epoch": 0.553838754283653, "flos": 23549068886400.0, "grad_norm": 2.457034564770766, "language_loss": 0.75447822, "learning_rate": 1.7490553168160297e-06, "loss": 0.77643371, "num_input_tokens_seen": 99438920, "step": 4606, "time_per_iteration": 2.755877733230591 }, { "auxiliary_loss_clip": 0.01171583, "auxiliary_loss_mlp": 0.0102319, "balance_loss_clip": 0.97232866, "balance_loss_mlp": 1.01593614, "epoch": 0.5539589971742921, "flos": 17275044205440.0, "grad_norm": 2.395900295951043, "language_loss": 0.76601112, "learning_rate": 1.748282520343345e-06, "loss": 0.78795886, "num_input_tokens_seen": 99457950, "step": 4607, "time_per_iteration": 2.6745994091033936 }, { "auxiliary_loss_clip": 0.01178779, "auxiliary_loss_mlp": 0.01029851, "balance_loss_clip": 1.01188254, "balance_loss_mlp": 1.0223918, "epoch": 0.5540792400649311, "flos": 27564169104000.0, "grad_norm": 1.760400458732099, "language_loss": 0.78603333, "learning_rate": 1.7475097620560023e-06, "loss": 0.80811965, "num_input_tokens_seen": 99478015, "step": 4608, "time_per_iteration": 2.692812919616699 }, { "auxiliary_loss_clip": 0.01173977, "auxiliary_loss_mlp": 0.01024663, "balance_loss_clip": 1.05009508, "balance_loss_mlp": 1.01758194, "epoch": 0.5541994829555702, "flos": 23878657105920.0, "grad_norm": 1.7886536092631484, "language_loss": 0.70725036, "learning_rate": 1.746737042071228e-06, "loss": 0.72923672, "num_input_tokens_seen": 99496520, "step": 4609, "time_per_iteration": 2.691563129425049 }, { "auxiliary_loss_clip": 0.01166154, "auxiliary_loss_mlp": 0.01023771, "balance_loss_clip": 0.9735747, "balance_loss_mlp": 1.01707792, "epoch": 0.5543197258462094, "flos": 20115721342080.0, "grad_norm": 1.741458799517779, "language_loss": 0.78946877, "learning_rate": 1.7459643605062424e-06, "loss": 0.81136805, "num_input_tokens_seen": 99513780, "step": 4610, "time_per_iteration": 2.6733522415161133 }, { "auxiliary_loss_clip": 0.01165563, "auxiliary_loss_mlp": 0.01030126, "balance_loss_clip": 0.89686561, "balance_loss_mlp": 1.02303648, "epoch": 0.5544399687368484, "flos": 20916565021440.0, "grad_norm": 1.6167702146003675, "language_loss": 0.80601996, "learning_rate": 1.745191717478262e-06, "loss": 0.82797682, "num_input_tokens_seen": 99532360, "step": 4611, "time_per_iteration": 2.7518253326416016 }, { "auxiliary_loss_clip": 0.01168205, "auxiliary_loss_mlp": 0.01025251, "balance_loss_clip": 0.97325885, "balance_loss_mlp": 1.01803327, "epoch": 0.5545602116274875, "flos": 25518661297920.0, "grad_norm": 1.598633741378701, "language_loss": 0.79528737, "learning_rate": 1.7444191131044948e-06, "loss": 0.81722194, "num_input_tokens_seen": 99552635, "step": 4612, "time_per_iteration": 2.7287440299987793 }, { "auxiliary_loss_clip": 0.01171144, "auxiliary_loss_mlp": 0.01028065, "balance_loss_clip": 0.9752847, "balance_loss_mlp": 1.02113342, "epoch": 0.5546804545181266, "flos": 20995568985600.0, "grad_norm": 1.7605210563620335, "language_loss": 0.72687989, "learning_rate": 1.7436465475021456e-06, "loss": 0.74887198, "num_input_tokens_seen": 99572685, "step": 4613, "time_per_iteration": 2.7073373794555664 }, { "auxiliary_loss_clip": 0.01158318, "auxiliary_loss_mlp": 0.01022617, "balance_loss_clip": 0.93279433, "balance_loss_mlp": 1.0157057, "epoch": 0.5548006974087657, "flos": 26833638297600.0, "grad_norm": 2.127964023921297, "language_loss": 0.71547985, "learning_rate": 1.7428740207884111e-06, "loss": 0.73728925, "num_input_tokens_seen": 99593565, "step": 4614, "time_per_iteration": 2.803891897201538 }, { "auxiliary_loss_clip": 0.0117032, "auxiliary_loss_mlp": 0.01028901, "balance_loss_clip": 0.89650857, "balance_loss_mlp": 1.02112842, "epoch": 0.5549209402994048, "flos": 33656414031360.0, "grad_norm": 1.8131748884629242, "language_loss": 0.60865182, "learning_rate": 1.7421015330804833e-06, "loss": 0.63064402, "num_input_tokens_seen": 99613485, "step": 4615, "time_per_iteration": 2.8246636390686035 }, { "auxiliary_loss_clip": 0.01174365, "auxiliary_loss_mlp": 0.01024093, "balance_loss_clip": 1.05088782, "balance_loss_mlp": 1.01611781, "epoch": 0.5550411831900439, "flos": 23769524609280.0, "grad_norm": 1.8230174396850982, "language_loss": 0.72100914, "learning_rate": 1.7413290844955475e-06, "loss": 0.74299377, "num_input_tokens_seen": 99633515, "step": 4616, "time_per_iteration": 2.647641658782959 }, { "auxiliary_loss_clip": 0.01162476, "auxiliary_loss_mlp": 0.01032869, "balance_loss_clip": 1.01133835, "balance_loss_mlp": 1.02581799, "epoch": 0.555161426080683, "flos": 21651189978240.0, "grad_norm": 1.7507267275533096, "language_loss": 0.78236628, "learning_rate": 1.7405566751507843e-06, "loss": 0.80431974, "num_input_tokens_seen": 99651560, "step": 4617, "time_per_iteration": 2.6495509147644043 }, { "auxiliary_loss_clip": 0.01167442, "auxiliary_loss_mlp": 0.01020367, "balance_loss_clip": 0.93297613, "balance_loss_mlp": 1.01375437, "epoch": 0.555281668971322, "flos": 49563116605440.0, "grad_norm": 1.5623884202614984, "language_loss": 0.67471159, "learning_rate": 1.7397843051633668e-06, "loss": 0.69658971, "num_input_tokens_seen": 99674255, "step": 4618, "time_per_iteration": 3.8524680137634277 }, { "auxiliary_loss_clip": 0.01168623, "auxiliary_loss_mlp": 0.01029879, "balance_loss_clip": 1.01091933, "balance_loss_mlp": 1.02200532, "epoch": 0.5554019118619612, "flos": 20741608851840.0, "grad_norm": 1.7105294141859009, "language_loss": 0.71341699, "learning_rate": 1.739011974650464e-06, "loss": 0.73540205, "num_input_tokens_seen": 99693585, "step": 4619, "time_per_iteration": 2.631053924560547 }, { "auxiliary_loss_clip": 0.01174555, "auxiliary_loss_mlp": 0.01029616, "balance_loss_clip": 0.89707023, "balance_loss_mlp": 1.0216645, "epoch": 0.5555221547526003, "flos": 25483217552640.0, "grad_norm": 1.825955303961391, "language_loss": 0.76442218, "learning_rate": 1.7382396837292365e-06, "loss": 0.78646386, "num_input_tokens_seen": 99714045, "step": 4620, "time_per_iteration": 2.7532148361206055 }, { "auxiliary_loss_clip": 0.01175113, "auxiliary_loss_mlp": 0.01024395, "balance_loss_clip": 1.05115891, "balance_loss_mlp": 1.01710582, "epoch": 0.5556423976432393, "flos": 21762513204480.0, "grad_norm": 1.8798024706986955, "language_loss": 0.73285168, "learning_rate": 1.737467432516841e-06, "loss": 0.75484681, "num_input_tokens_seen": 99734145, "step": 4621, "time_per_iteration": 3.6164915561676025 }, { "auxiliary_loss_clip": 0.01167339, "auxiliary_loss_mlp": 0.01021561, "balance_loss_clip": 0.97003496, "balance_loss_mlp": 1.01425695, "epoch": 0.5557626405338785, "flos": 24900171989760.0, "grad_norm": 3.014713508157937, "language_loss": 0.74454135, "learning_rate": 1.7366952211304274e-06, "loss": 0.76643038, "num_input_tokens_seen": 99751990, "step": 4622, "time_per_iteration": 2.6845834255218506 }, { "auxiliary_loss_clip": 0.01160566, "auxiliary_loss_mlp": 0.01022411, "balance_loss_clip": 0.97152942, "balance_loss_mlp": 1.01566112, "epoch": 0.5558828834245175, "flos": 18697501676160.0, "grad_norm": 2.0534662855679393, "language_loss": 0.8321383, "learning_rate": 1.735923049687139e-06, "loss": 0.85396808, "num_input_tokens_seen": 99768565, "step": 4623, "time_per_iteration": 3.693748950958252 }, { "auxiliary_loss_clip": 0.01165625, "auxiliary_loss_mlp": 0.01030776, "balance_loss_clip": 0.97259009, "balance_loss_mlp": 1.02336168, "epoch": 0.5560031263151566, "flos": 27272179445760.0, "grad_norm": 1.4032260541081658, "language_loss": 0.73932952, "learning_rate": 1.7351509183041144e-06, "loss": 0.76129353, "num_input_tokens_seen": 99788895, "step": 4624, "time_per_iteration": 2.7303383350372314 }, { "auxiliary_loss_clip": 0.01176051, "auxiliary_loss_mlp": 0.01024519, "balance_loss_clip": 1.05142665, "balance_loss_mlp": 1.01750326, "epoch": 0.5561233692057957, "flos": 23403738458880.0, "grad_norm": 1.640759825219395, "language_loss": 0.71583086, "learning_rate": 1.7343788270984852e-06, "loss": 0.7378366, "num_input_tokens_seen": 99808035, "step": 4625, "time_per_iteration": 2.64003586769104 }, { "auxiliary_loss_clip": 0.01173797, "auxiliary_loss_mlp": 0.01029163, "balance_loss_clip": 0.9766683, "balance_loss_mlp": 1.02148604, "epoch": 0.5562436120964348, "flos": 37670867804160.0, "grad_norm": 1.7007261529046787, "language_loss": 0.74511206, "learning_rate": 1.7336067761873764e-06, "loss": 0.7671417, "num_input_tokens_seen": 99830460, "step": 4626, "time_per_iteration": 2.8421645164489746 }, { "auxiliary_loss_clip": 0.01177356, "auxiliary_loss_mlp": 0.01024999, "balance_loss_clip": 1.0107646, "balance_loss_mlp": 1.01741147, "epoch": 0.5563638549870739, "flos": 25155245445120.0, "grad_norm": 23.097462559625914, "language_loss": 0.76442361, "learning_rate": 1.7328347656879076e-06, "loss": 0.78644717, "num_input_tokens_seen": 99850320, "step": 4627, "time_per_iteration": 2.750303030014038 }, { "auxiliary_loss_clip": 0.01168721, "auxiliary_loss_mlp": 0.0102605, "balance_loss_clip": 0.9335447, "balance_loss_mlp": 1.01822352, "epoch": 0.556484097877713, "flos": 13581810783360.0, "grad_norm": 2.0846820517224653, "language_loss": 0.68424958, "learning_rate": 1.7320627957171927e-06, "loss": 0.70619732, "num_input_tokens_seen": 99864980, "step": 4628, "time_per_iteration": 3.561199903488159 }, { "auxiliary_loss_clip": 0.01172228, "auxiliary_loss_mlp": 0.01030811, "balance_loss_clip": 1.05042708, "balance_loss_mlp": 1.02331889, "epoch": 0.5566043407683521, "flos": 24681368292480.0, "grad_norm": 2.181842919876161, "language_loss": 0.8137095, "learning_rate": 1.7312908663923382e-06, "loss": 0.83573991, "num_input_tokens_seen": 99881155, "step": 4629, "time_per_iteration": 2.817945957183838 }, { "auxiliary_loss_clip": 0.01163977, "auxiliary_loss_mlp": 0.01029913, "balance_loss_clip": 1.00895977, "balance_loss_mlp": 1.02236092, "epoch": 0.5567245836589911, "flos": 20588161950720.0, "grad_norm": 1.9270049485293728, "language_loss": 0.67014289, "learning_rate": 1.7305189778304463e-06, "loss": 0.69208181, "num_input_tokens_seen": 99899330, "step": 4630, "time_per_iteration": 2.646085262298584 }, { "auxiliary_loss_clip": 0.01170016, "auxiliary_loss_mlp": 0.01028094, "balance_loss_clip": 0.97438502, "balance_loss_mlp": 1.02072096, "epoch": 0.5568448265496303, "flos": 20704189858560.0, "grad_norm": 1.7227534438790282, "language_loss": 0.79726249, "learning_rate": 1.729747130148611e-06, "loss": 0.81924355, "num_input_tokens_seen": 99918525, "step": 4631, "time_per_iteration": 2.7368836402893066 }, { "auxiliary_loss_clip": 0.01176368, "auxiliary_loss_mlp": 0.01028265, "balance_loss_clip": 0.93544108, "balance_loss_mlp": 1.02018237, "epoch": 0.5569650694402694, "flos": 25302910256640.0, "grad_norm": 1.9497867597328198, "language_loss": 0.76729333, "learning_rate": 1.7289753234639208e-06, "loss": 0.78933966, "num_input_tokens_seen": 99937500, "step": 4632, "time_per_iteration": 2.7778615951538086 }, { "auxiliary_loss_clip": 0.01179037, "auxiliary_loss_mlp": 0.01027985, "balance_loss_clip": 1.01338065, "balance_loss_mlp": 1.02014148, "epoch": 0.5570853123309084, "flos": 19712623939200.0, "grad_norm": 1.7649878468316567, "language_loss": 0.76374108, "learning_rate": 1.7282035578934592e-06, "loss": 0.78581131, "num_input_tokens_seen": 99955665, "step": 4633, "time_per_iteration": 2.6397652626037598 }, { "auxiliary_loss_clip": 0.01166804, "auxiliary_loss_mlp": 0.01024919, "balance_loss_clip": 0.97525072, "balance_loss_mlp": 1.01718819, "epoch": 0.5572055552215476, "flos": 16108091153280.0, "grad_norm": 1.8968292601172938, "language_loss": 0.78793406, "learning_rate": 1.727431833554301e-06, "loss": 0.80985129, "num_input_tokens_seen": 99974140, "step": 4634, "time_per_iteration": 2.651608943939209 }, { "auxiliary_loss_clip": 0.01174439, "auxiliary_loss_mlp": 0.01030572, "balance_loss_clip": 0.85713589, "balance_loss_mlp": 1.02242422, "epoch": 0.5573257981121866, "flos": 17128815937920.0, "grad_norm": 1.7498905799532938, "language_loss": 0.77220798, "learning_rate": 1.7266601505635175e-06, "loss": 0.79425806, "num_input_tokens_seen": 99991480, "step": 4635, "time_per_iteration": 2.789574384689331 }, { "auxiliary_loss_clip": 0.01172417, "auxiliary_loss_mlp": 0.01027659, "balance_loss_clip": 1.01188636, "balance_loss_mlp": 1.02030063, "epoch": 0.5574460410028257, "flos": 18807029222400.0, "grad_norm": 2.137418245211094, "language_loss": 0.75883949, "learning_rate": 1.7258885090381717e-06, "loss": 0.78084028, "num_input_tokens_seen": 100009520, "step": 4636, "time_per_iteration": 2.647726535797119 }, { "auxiliary_loss_clip": 0.01172729, "auxiliary_loss_mlp": 0.01025523, "balance_loss_clip": 0.97252423, "balance_loss_mlp": 1.01849294, "epoch": 0.5575662838934649, "flos": 29642678530560.0, "grad_norm": 1.8134179821541025, "language_loss": 0.78317171, "learning_rate": 1.7251169090953213e-06, "loss": 0.8051542, "num_input_tokens_seen": 100029995, "step": 4637, "time_per_iteration": 2.7223315238952637 }, { "auxiliary_loss_clip": 0.01171675, "auxiliary_loss_mlp": 0.01025338, "balance_loss_clip": 1.01214814, "balance_loss_mlp": 1.01746464, "epoch": 0.5576865267841039, "flos": 22054466949120.0, "grad_norm": 2.3851652263784526, "language_loss": 0.75821602, "learning_rate": 1.7243453508520168e-06, "loss": 0.78018618, "num_input_tokens_seen": 100046980, "step": 4638, "time_per_iteration": 2.6787874698638916 }, { "auxiliary_loss_clip": 0.01170732, "auxiliary_loss_mlp": 0.01033077, "balance_loss_clip": 0.97117817, "balance_loss_mlp": 1.02542388, "epoch": 0.557806769674743, "flos": 17196040241280.0, "grad_norm": 2.0187484817648556, "language_loss": 0.84335178, "learning_rate": 1.7235738344253038e-06, "loss": 0.86538988, "num_input_tokens_seen": 100060610, "step": 4639, "time_per_iteration": 2.706834554672241 }, { "auxiliary_loss_clip": 0.01174875, "auxiliary_loss_mlp": 0.01029811, "balance_loss_clip": 1.01572776, "balance_loss_mlp": 1.02227736, "epoch": 0.557927012565382, "flos": 24712717887360.0, "grad_norm": 1.761414331891317, "language_loss": 0.83012491, "learning_rate": 1.72280235993222e-06, "loss": 0.85217178, "num_input_tokens_seen": 100078915, "step": 4640, "time_per_iteration": 2.7079732418060303 }, { "auxiliary_loss_clip": 0.01170113, "auxiliary_loss_mlp": 0.0112337, "balance_loss_clip": 1.01227474, "balance_loss_mlp": 0.0, "epoch": 0.5580472554560212, "flos": 16983090460800.0, "grad_norm": 2.544453035205309, "language_loss": 0.69665003, "learning_rate": 1.722030927489798e-06, "loss": 0.71958488, "num_input_tokens_seen": 100096195, "step": 4641, "time_per_iteration": 2.623210906982422 }, { "auxiliary_loss_clip": 0.01170908, "auxiliary_loss_mlp": 0.01030455, "balance_loss_clip": 0.93590885, "balance_loss_mlp": 1.02277493, "epoch": 0.5581674983466602, "flos": 23509100027520.0, "grad_norm": 1.616495871596787, "language_loss": 0.74228513, "learning_rate": 1.7212595372150634e-06, "loss": 0.7642988, "num_input_tokens_seen": 100116175, "step": 4642, "time_per_iteration": 2.7434604167938232 }, { "auxiliary_loss_clip": 0.01175793, "auxiliary_loss_mlp": 0.01024552, "balance_loss_clip": 1.05280364, "balance_loss_mlp": 1.0175488, "epoch": 0.5582877412372993, "flos": 13480291969920.0, "grad_norm": 2.14511276456334, "language_loss": 0.7270304, "learning_rate": 1.720488189225035e-06, "loss": 0.74903387, "num_input_tokens_seen": 100133875, "step": 4643, "time_per_iteration": 2.569561719894409 }, { "auxiliary_loss_clip": 0.01174682, "auxiliary_loss_mlp": 0.01026776, "balance_loss_clip": 1.01083374, "balance_loss_mlp": 1.01875329, "epoch": 0.5584079841279385, "flos": 21903605827200.0, "grad_norm": 3.0373852847119087, "language_loss": 0.79210675, "learning_rate": 1.7197168836367265e-06, "loss": 0.81412125, "num_input_tokens_seen": 100150685, "step": 4644, "time_per_iteration": 3.586256265640259 }, { "auxiliary_loss_clip": 0.01168349, "auxiliary_loss_mlp": 0.0112265, "balance_loss_clip": 1.00957263, "balance_loss_mlp": 0.0, "epoch": 0.5585282270185775, "flos": 18843550375680.0, "grad_norm": 2.0782923500456394, "language_loss": 0.81571805, "learning_rate": 1.7189456205671433e-06, "loss": 0.83862805, "num_input_tokens_seen": 100169530, "step": 4645, "time_per_iteration": 2.6399972438812256 }, { "auxiliary_loss_clip": 0.01181526, "auxiliary_loss_mlp": 0.010311, "balance_loss_clip": 1.01331186, "balance_loss_mlp": 1.02347088, "epoch": 0.5586484699092166, "flos": 21868449390720.0, "grad_norm": 2.0809755521659508, "language_loss": 0.81979978, "learning_rate": 1.7181744001332866e-06, "loss": 0.84192598, "num_input_tokens_seen": 100188140, "step": 4646, "time_per_iteration": 2.5844991207122803 }, { "auxiliary_loss_clip": 0.01172782, "auxiliary_loss_mlp": 0.01031591, "balance_loss_clip": 1.05127931, "balance_loss_mlp": 1.02430701, "epoch": 0.5587687127998557, "flos": 22893232412160.0, "grad_norm": 1.8298169631388037, "language_loss": 0.63500261, "learning_rate": 1.7174032224521493e-06, "loss": 0.65704632, "num_input_tokens_seen": 100206850, "step": 4647, "time_per_iteration": 3.6176435947418213 }, { "auxiliary_loss_clip": 0.01171184, "auxiliary_loss_mlp": 0.01027503, "balance_loss_clip": 1.01109672, "balance_loss_mlp": 1.02061892, "epoch": 0.5588889556904948, "flos": 20303067703680.0, "grad_norm": 1.5150730894332431, "language_loss": 0.69780231, "learning_rate": 1.7166320876407184e-06, "loss": 0.71978915, "num_input_tokens_seen": 100226270, "step": 4648, "time_per_iteration": 2.618736743927002 }, { "auxiliary_loss_clip": 0.01178224, "auxiliary_loss_mlp": 0.01122881, "balance_loss_clip": 1.05243158, "balance_loss_mlp": 0.0, "epoch": 0.5590091985811338, "flos": 16472153450880.0, "grad_norm": 2.025063938039886, "language_loss": 0.68062866, "learning_rate": 1.7158609958159742e-06, "loss": 0.70363969, "num_input_tokens_seen": 100243675, "step": 4649, "time_per_iteration": 2.735722064971924 }, { "auxiliary_loss_clip": 0.01182771, "auxiliary_loss_mlp": 0.01032394, "balance_loss_clip": 0.86101687, "balance_loss_mlp": 1.02417469, "epoch": 0.559129441471773, "flos": 14532186781440.0, "grad_norm": 2.0296453682517424, "language_loss": 0.78102446, "learning_rate": 1.7150899470948911e-06, "loss": 0.80317616, "num_input_tokens_seen": 100258940, "step": 4650, "time_per_iteration": 3.7731754779815674 }, { "auxiliary_loss_clip": 0.01076468, "auxiliary_loss_mlp": 0.01002676, "balance_loss_clip": 0.94320047, "balance_loss_mlp": 1.00087571, "epoch": 0.5592496843624121, "flos": 60521009852160.0, "grad_norm": 0.8027029975573141, "language_loss": 0.56669235, "learning_rate": 1.7143189415944365e-06, "loss": 0.58748376, "num_input_tokens_seen": 100323400, "step": 4651, "time_per_iteration": 3.2594847679138184 }, { "auxiliary_loss_clip": 0.01171067, "auxiliary_loss_mlp": 0.01023347, "balance_loss_clip": 1.012923, "balance_loss_mlp": 1.01570547, "epoch": 0.5593699272530511, "flos": 20886256920960.0, "grad_norm": 1.573595049210598, "language_loss": 0.76066566, "learning_rate": 1.7135479794315714e-06, "loss": 0.78260982, "num_input_tokens_seen": 100340355, "step": 4652, "time_per_iteration": 2.707627058029175 }, { "auxiliary_loss_clip": 0.01169763, "auxiliary_loss_mlp": 0.01029004, "balance_loss_clip": 0.9365536, "balance_loss_mlp": 1.02179217, "epoch": 0.5594901701436903, "flos": 12896743616640.0, "grad_norm": 1.8712136846649232, "language_loss": 0.78704667, "learning_rate": 1.7127770607232502e-06, "loss": 0.80903435, "num_input_tokens_seen": 100358900, "step": 4653, "time_per_iteration": 2.6990373134613037 }, { "auxiliary_loss_clip": 0.01179301, "auxiliary_loss_mlp": 0.01031993, "balance_loss_clip": 0.93510854, "balance_loss_mlp": 1.02435136, "epoch": 0.5596104130343293, "flos": 23112107936640.0, "grad_norm": 1.7703027923883383, "language_loss": 0.80015171, "learning_rate": 1.7120061855864204e-06, "loss": 0.82226461, "num_input_tokens_seen": 100378910, "step": 4654, "time_per_iteration": 3.6738758087158203 }, { "auxiliary_loss_clip": 0.01174758, "auxiliary_loss_mlp": 0.01028478, "balance_loss_clip": 1.01491427, "balance_loss_mlp": 1.02081251, "epoch": 0.5597306559249684, "flos": 25957812977280.0, "grad_norm": 1.7892911705115928, "language_loss": 0.70939314, "learning_rate": 1.7112353541380233e-06, "loss": 0.73142546, "num_input_tokens_seen": 100398770, "step": 4655, "time_per_iteration": 2.6949074268341064 }, { "auxiliary_loss_clip": 0.0117405, "auxiliary_loss_mlp": 0.01030047, "balance_loss_clip": 0.97666883, "balance_loss_mlp": 1.02256691, "epoch": 0.5598508988156076, "flos": 22492289825280.0, "grad_norm": 1.4757144852837967, "language_loss": 0.72184157, "learning_rate": 1.7104645664949931e-06, "loss": 0.7438826, "num_input_tokens_seen": 100421240, "step": 4656, "time_per_iteration": 2.750643253326416 }, { "auxiliary_loss_clip": 0.01171517, "auxiliary_loss_mlp": 0.01032051, "balance_loss_clip": 0.97116023, "balance_loss_mlp": 1.02435005, "epoch": 0.5599711417062466, "flos": 23112538899840.0, "grad_norm": 2.1452586585551563, "language_loss": 0.7143966, "learning_rate": 1.7096938227742584e-06, "loss": 0.73643219, "num_input_tokens_seen": 100442370, "step": 4657, "time_per_iteration": 2.6868388652801514 }, { "auxiliary_loss_clip": 0.01173556, "auxiliary_loss_mlp": 0.01026618, "balance_loss_clip": 1.05025887, "balance_loss_mlp": 1.01877356, "epoch": 0.5600913845968857, "flos": 22339345714560.0, "grad_norm": 2.002594822957731, "language_loss": 0.84062684, "learning_rate": 1.70892312309274e-06, "loss": 0.86262858, "num_input_tokens_seen": 100460260, "step": 4658, "time_per_iteration": 2.6263620853424072 }, { "auxiliary_loss_clip": 0.01168615, "auxiliary_loss_mlp": 0.01024429, "balance_loss_clip": 0.96791255, "balance_loss_mlp": 1.01652527, "epoch": 0.5602116274875248, "flos": 17633791290240.0, "grad_norm": 2.2781372197953322, "language_loss": 0.68428516, "learning_rate": 1.7081524675673523e-06, "loss": 0.70621562, "num_input_tokens_seen": 100475750, "step": 4659, "time_per_iteration": 2.6318469047546387 }, { "auxiliary_loss_clip": 0.01079425, "auxiliary_loss_mlp": 0.01006221, "balance_loss_clip": 0.9425801, "balance_loss_mlp": 1.00451648, "epoch": 0.5603318703781639, "flos": 70115945529600.0, "grad_norm": 0.7781516619115009, "language_loss": 0.5961042, "learning_rate": 1.7073818563150026e-06, "loss": 0.61696064, "num_input_tokens_seen": 100537830, "step": 4660, "time_per_iteration": 3.3622214794158936 }, { "auxiliary_loss_clip": 0.01166431, "auxiliary_loss_mlp": 0.01024405, "balance_loss_clip": 1.00859594, "balance_loss_mlp": 1.01673973, "epoch": 0.560452113268803, "flos": 18545850455040.0, "grad_norm": 2.639770390742843, "language_loss": 0.86285943, "learning_rate": 1.7066112894525935e-06, "loss": 0.88476777, "num_input_tokens_seen": 100555910, "step": 4661, "time_per_iteration": 2.678603172302246 }, { "auxiliary_loss_clip": 0.01162921, "auxiliary_loss_mlp": 0.01025679, "balance_loss_clip": 0.97215116, "balance_loss_mlp": 1.01783514, "epoch": 0.5605723561594421, "flos": 25264665250560.0, "grad_norm": 1.7395491217210668, "language_loss": 0.73000067, "learning_rate": 1.7058407670970177e-06, "loss": 0.75188673, "num_input_tokens_seen": 100577385, "step": 4662, "time_per_iteration": 2.732184410095215 }, { "auxiliary_loss_clip": 0.01177811, "auxiliary_loss_mlp": 0.01028216, "balance_loss_clip": 1.01138854, "balance_loss_mlp": 1.02008295, "epoch": 0.5606925990500812, "flos": 20594949621120.0, "grad_norm": 2.0303853295577254, "language_loss": 0.6159929, "learning_rate": 1.7050702893651643e-06, "loss": 0.63805318, "num_input_tokens_seen": 100596965, "step": 4663, "time_per_iteration": 2.6508381366729736 }, { "auxiliary_loss_clip": 0.01174478, "auxiliary_loss_mlp": 0.01028979, "balance_loss_clip": 1.01401544, "balance_loss_mlp": 1.02140284, "epoch": 0.5608128419407202, "flos": 35006044677120.0, "grad_norm": 2.0192520074694107, "language_loss": 0.75369871, "learning_rate": 1.7042998563739134e-06, "loss": 0.77573329, "num_input_tokens_seen": 100615315, "step": 4664, "time_per_iteration": 2.952866792678833 }, { "auxiliary_loss_clip": 0.01180873, "auxiliary_loss_mlp": 0.01029587, "balance_loss_clip": 0.97400838, "balance_loss_mlp": 1.02110517, "epoch": 0.5609330848313594, "flos": 24639819235200.0, "grad_norm": 1.9156488545482662, "language_loss": 0.71724463, "learning_rate": 1.703529468240139e-06, "loss": 0.73934919, "num_input_tokens_seen": 100634185, "step": 4665, "time_per_iteration": 2.6926910877227783 }, { "auxiliary_loss_clip": 0.0116872, "auxiliary_loss_mlp": 0.0102632, "balance_loss_clip": 0.97404706, "balance_loss_mlp": 1.01898813, "epoch": 0.5610533277219985, "flos": 18762894385920.0, "grad_norm": 2.0660098720944013, "language_loss": 0.7332561, "learning_rate": 1.7027591250807088e-06, "loss": 0.75520647, "num_input_tokens_seen": 100651360, "step": 4666, "time_per_iteration": 2.628932237625122 }, { "auxiliary_loss_clip": 0.01179232, "auxiliary_loss_mlp": 0.01027188, "balance_loss_clip": 1.05371499, "balance_loss_mlp": 1.01966596, "epoch": 0.5611735706126375, "flos": 15012384727680.0, "grad_norm": 4.855982603000018, "language_loss": 0.8435477, "learning_rate": 1.7019888270124825e-06, "loss": 0.86561191, "num_input_tokens_seen": 100668525, "step": 4667, "time_per_iteration": 2.5817058086395264 }, { "auxiliary_loss_clip": 0.01177178, "auxiliary_loss_mlp": 0.01034981, "balance_loss_clip": 1.01518893, "balance_loss_mlp": 1.02685714, "epoch": 0.5612938135032767, "flos": 16468167041280.0, "grad_norm": 1.7228905660213547, "language_loss": 0.81691885, "learning_rate": 1.7012185741523147e-06, "loss": 0.83904046, "num_input_tokens_seen": 100684850, "step": 4668, "time_per_iteration": 2.653822898864746 }, { "auxiliary_loss_clip": 0.01176265, "auxiliary_loss_mlp": 0.01026803, "balance_loss_clip": 1.05205536, "balance_loss_mlp": 1.0188992, "epoch": 0.5614140563939157, "flos": 25666433850240.0, "grad_norm": 3.7619458125121823, "language_loss": 0.62113959, "learning_rate": 1.7004483666170514e-06, "loss": 0.64317024, "num_input_tokens_seen": 100705345, "step": 4669, "time_per_iteration": 2.645946979522705 }, { "auxiliary_loss_clip": 0.01171855, "auxiliary_loss_mlp": 0.0102436, "balance_loss_clip": 1.01153111, "balance_loss_mlp": 1.01695132, "epoch": 0.5615342992845548, "flos": 24717566223360.0, "grad_norm": 1.9726075761564639, "language_loss": 0.80418289, "learning_rate": 1.699678204523533e-06, "loss": 0.82614499, "num_input_tokens_seen": 100725210, "step": 4670, "time_per_iteration": 3.6266820430755615 }, { "auxiliary_loss_clip": 0.01177787, "auxiliary_loss_mlp": 0.01030292, "balance_loss_clip": 0.97788107, "balance_loss_mlp": 1.02284145, "epoch": 0.5616545421751938, "flos": 22015934634240.0, "grad_norm": 4.480903504685959, "language_loss": 0.68641621, "learning_rate": 1.6989080879885918e-06, "loss": 0.70849699, "num_input_tokens_seen": 100743070, "step": 4671, "time_per_iteration": 2.6368956565856934 }, { "auxiliary_loss_clip": 0.0108237, "auxiliary_loss_mlp": 0.01000021, "balance_loss_clip": 0.90734887, "balance_loss_mlp": 0.99817336, "epoch": 0.561774785065833, "flos": 53760358690560.0, "grad_norm": 0.9012466629027077, "language_loss": 0.61060077, "learning_rate": 1.6981380171290544e-06, "loss": 0.63142473, "num_input_tokens_seen": 100804095, "step": 4672, "time_per_iteration": 3.287855386734009 }, { "auxiliary_loss_clip": 0.01168011, "auxiliary_loss_mlp": 0.01030685, "balance_loss_clip": 0.97284257, "balance_loss_mlp": 1.02280569, "epoch": 0.5618950279564721, "flos": 19750007018880.0, "grad_norm": 2.186398921970929, "language_loss": 0.74483407, "learning_rate": 1.6973679920617396e-06, "loss": 0.76682103, "num_input_tokens_seen": 100821630, "step": 4673, "time_per_iteration": 3.5977554321289062 }, { "auxiliary_loss_clip": 0.01173376, "auxiliary_loss_mlp": 0.01022114, "balance_loss_clip": 0.97795689, "balance_loss_mlp": 1.01471996, "epoch": 0.5620152708471111, "flos": 16800592435200.0, "grad_norm": 1.8555698712257727, "language_loss": 0.85151613, "learning_rate": 1.6965980129034603e-06, "loss": 0.87347102, "num_input_tokens_seen": 100839015, "step": 4674, "time_per_iteration": 2.645084857940674 }, { "auxiliary_loss_clip": 0.0117478, "auxiliary_loss_mlp": 0.01030216, "balance_loss_clip": 0.9769029, "balance_loss_mlp": 1.02219319, "epoch": 0.5621355137377503, "flos": 26797799502720.0, "grad_norm": 1.753641104145318, "language_loss": 0.76692724, "learning_rate": 1.6958280797710209e-06, "loss": 0.78897715, "num_input_tokens_seen": 100860940, "step": 4675, "time_per_iteration": 2.7102112770080566 }, { "auxiliary_loss_clip": 0.01079088, "auxiliary_loss_mlp": 0.00999598, "balance_loss_clip": 0.94425237, "balance_loss_mlp": 0.99789345, "epoch": 0.5622557566283893, "flos": 61207046686080.0, "grad_norm": 0.7115153384491768, "language_loss": 0.54760683, "learning_rate": 1.6950581927812198e-06, "loss": 0.56839365, "num_input_tokens_seen": 100920510, "step": 4676, "time_per_iteration": 3.9959428310394287 }, { "auxiliary_loss_clip": 0.0117477, "auxiliary_loss_mlp": 0.0102365, "balance_loss_clip": 1.012429, "balance_loss_mlp": 1.01620507, "epoch": 0.5623759995190284, "flos": 26468534505600.0, "grad_norm": 1.7917849875661171, "language_loss": 0.79138231, "learning_rate": 1.6942883520508486e-06, "loss": 0.81336647, "num_input_tokens_seen": 100939245, "step": 4677, "time_per_iteration": 2.702059507369995 }, { "auxiliary_loss_clip": 0.01175416, "auxiliary_loss_mlp": 0.01024686, "balance_loss_clip": 1.01315856, "balance_loss_mlp": 1.01742053, "epoch": 0.5624962424096676, "flos": 19390900798080.0, "grad_norm": 1.8910983570754336, "language_loss": 0.77232784, "learning_rate": 1.693518557696691e-06, "loss": 0.79432887, "num_input_tokens_seen": 100958385, "step": 4678, "time_per_iteration": 2.6586875915527344 }, { "auxiliary_loss_clip": 0.0116668, "auxiliary_loss_mlp": 0.01023423, "balance_loss_clip": 1.00874889, "balance_loss_mlp": 1.01601982, "epoch": 0.5626164853003066, "flos": 20667345482880.0, "grad_norm": 2.036378245139231, "language_loss": 0.892353, "learning_rate": 1.6927488098355252e-06, "loss": 0.91425401, "num_input_tokens_seen": 100976015, "step": 4679, "time_per_iteration": 2.6670329570770264 }, { "auxiliary_loss_clip": 0.01088488, "auxiliary_loss_mlp": 0.00998943, "balance_loss_clip": 0.86830163, "balance_loss_mlp": 0.99710697, "epoch": 0.5627367281909457, "flos": 62766071665920.0, "grad_norm": 0.9115170751515826, "language_loss": 0.63195902, "learning_rate": 1.6919791085841201e-06, "loss": 0.65283334, "num_input_tokens_seen": 101033425, "step": 4680, "time_per_iteration": 4.1379547119140625 }, { "auxiliary_loss_clip": 0.01165615, "auxiliary_loss_mlp": 0.01021902, "balance_loss_clip": 1.00965476, "balance_loss_mlp": 1.01380169, "epoch": 0.5628569710815848, "flos": 12787144243200.0, "grad_norm": 2.211914814669022, "language_loss": 0.79057497, "learning_rate": 1.6912094540592396e-06, "loss": 0.81245011, "num_input_tokens_seen": 101048945, "step": 4681, "time_per_iteration": 2.6724090576171875 }, { "auxiliary_loss_clip": 0.0117035, "auxiliary_loss_mlp": 0.01025162, "balance_loss_clip": 1.01190114, "balance_loss_mlp": 1.01713967, "epoch": 0.5629772139722239, "flos": 13762082165760.0, "grad_norm": 2.658302068666858, "language_loss": 0.8107655, "learning_rate": 1.6904398463776393e-06, "loss": 0.83272064, "num_input_tokens_seen": 101062745, "step": 4682, "time_per_iteration": 2.718466281890869 }, { "auxiliary_loss_clip": 0.01174648, "auxiliary_loss_mlp": 0.0102716, "balance_loss_clip": 1.01111603, "balance_loss_mlp": 1.01955783, "epoch": 0.5630974568628629, "flos": 21467830026240.0, "grad_norm": 2.119936479137371, "language_loss": 0.72645921, "learning_rate": 1.6896702856560683e-06, "loss": 0.74847734, "num_input_tokens_seen": 101081840, "step": 4683, "time_per_iteration": 2.6943249702453613 }, { "auxiliary_loss_clip": 0.01164721, "auxiliary_loss_mlp": 0.01029959, "balance_loss_clip": 0.93258655, "balance_loss_mlp": 1.02261603, "epoch": 0.5632176997535021, "flos": 14245907385600.0, "grad_norm": 2.499414839622862, "language_loss": 0.70023835, "learning_rate": 1.6889007720112677e-06, "loss": 0.72218513, "num_input_tokens_seen": 101099585, "step": 4684, "time_per_iteration": 2.687415361404419 }, { "auxiliary_loss_clip": 0.01177284, "auxiliary_loss_mlp": 0.01025195, "balance_loss_clip": 1.01454556, "balance_loss_mlp": 1.01806629, "epoch": 0.5633379426441412, "flos": 20812244947200.0, "grad_norm": 1.7016575794454158, "language_loss": 0.77282083, "learning_rate": 1.6881313055599734e-06, "loss": 0.79484558, "num_input_tokens_seen": 101119515, "step": 4685, "time_per_iteration": 2.712481737136841 }, { "auxiliary_loss_clip": 0.01154524, "auxiliary_loss_mlp": 0.01031769, "balance_loss_clip": 0.96831685, "balance_loss_mlp": 1.02340627, "epoch": 0.5634581855347802, "flos": 22600883617920.0, "grad_norm": 3.4429400361254228, "language_loss": 0.82008183, "learning_rate": 1.6873618864189117e-06, "loss": 0.84194475, "num_input_tokens_seen": 101135285, "step": 4686, "time_per_iteration": 2.6638081073760986 }, { "auxiliary_loss_clip": 0.01172022, "auxiliary_loss_mlp": 0.01024982, "balance_loss_clip": 1.00989532, "balance_loss_mlp": 1.01711392, "epoch": 0.5635784284254194, "flos": 21506972872320.0, "grad_norm": 2.002181134347284, "language_loss": 0.77845025, "learning_rate": 1.686592514704803e-06, "loss": 0.80042028, "num_input_tokens_seen": 101152680, "step": 4687, "time_per_iteration": 2.686345100402832 }, { "auxiliary_loss_clip": 0.01170664, "auxiliary_loss_mlp": 0.01020167, "balance_loss_clip": 0.97547072, "balance_loss_mlp": 1.01316667, "epoch": 0.5636986713160584, "flos": 19827466698240.0, "grad_norm": 2.0210765382351337, "language_loss": 0.71409905, "learning_rate": 1.685823190534361e-06, "loss": 0.73600733, "num_input_tokens_seen": 101170920, "step": 4688, "time_per_iteration": 2.655482292175293 }, { "auxiliary_loss_clip": 0.01176664, "auxiliary_loss_mlp": 0.01032853, "balance_loss_clip": 1.05090475, "balance_loss_mlp": 1.0250901, "epoch": 0.5638189142066975, "flos": 19792453916160.0, "grad_norm": 1.7690071086501822, "language_loss": 0.83631378, "learning_rate": 1.6850539140242907e-06, "loss": 0.85840893, "num_input_tokens_seen": 101190180, "step": 4689, "time_per_iteration": 2.6222214698791504 }, { "auxiliary_loss_clip": 0.01174747, "auxiliary_loss_mlp": 0.01026349, "balance_loss_clip": 1.01099849, "balance_loss_mlp": 1.01848137, "epoch": 0.5639391570973367, "flos": 22893771116160.0, "grad_norm": 1.7805855667948187, "language_loss": 0.82247519, "learning_rate": 1.684284685291292e-06, "loss": 0.84448612, "num_input_tokens_seen": 101211825, "step": 4690, "time_per_iteration": 2.6874799728393555 }, { "auxiliary_loss_clip": 0.01175785, "auxiliary_loss_mlp": 0.01029958, "balance_loss_clip": 1.05169845, "balance_loss_mlp": 1.02269852, "epoch": 0.5640593999879757, "flos": 23727077712000.0, "grad_norm": 2.111687286769386, "language_loss": 0.80704367, "learning_rate": 1.683515504452055e-06, "loss": 0.82910115, "num_input_tokens_seen": 101229200, "step": 4691, "time_per_iteration": 2.63234543800354 }, { "auxiliary_loss_clip": 0.0116293, "auxiliary_loss_mlp": 0.01023797, "balance_loss_clip": 0.93299156, "balance_loss_mlp": 1.01620901, "epoch": 0.5641796428786148, "flos": 22710123855360.0, "grad_norm": 1.423667122739211, "language_loss": 0.6676482, "learning_rate": 1.6827463716232648e-06, "loss": 0.68951547, "num_input_tokens_seen": 101249860, "step": 4692, "time_per_iteration": 2.7524521350860596 }, { "auxiliary_loss_clip": 0.01171584, "auxiliary_loss_mlp": 0.01122692, "balance_loss_clip": 1.01153743, "balance_loss_mlp": 0.0, "epoch": 0.5642998857692539, "flos": 19791987039360.0, "grad_norm": 1.7640078089053581, "language_loss": 0.75611454, "learning_rate": 1.6819772869215972e-06, "loss": 0.77905738, "num_input_tokens_seen": 101268940, "step": 4693, "time_per_iteration": 2.6680972576141357 }, { "auxiliary_loss_clip": 0.01176707, "auxiliary_loss_mlp": 0.01026962, "balance_loss_clip": 0.97401178, "balance_loss_mlp": 1.01905513, "epoch": 0.564420128659893, "flos": 23185904428800.0, "grad_norm": 1.6633345962862682, "language_loss": 0.81927705, "learning_rate": 1.6812082504637228e-06, "loss": 0.84131372, "num_input_tokens_seen": 101290260, "step": 4694, "time_per_iteration": 2.795771837234497 }, { "auxiliary_loss_clip": 0.01168035, "auxiliary_loss_mlp": 0.01026165, "balance_loss_clip": 1.01253569, "balance_loss_mlp": 1.01867628, "epoch": 0.564540371550532, "flos": 23258264376960.0, "grad_norm": 1.4384673745034138, "language_loss": 0.7432307, "learning_rate": 1.6804392623663025e-06, "loss": 0.76517272, "num_input_tokens_seen": 101311465, "step": 4695, "time_per_iteration": 3.703347682952881 }, { "auxiliary_loss_clip": 0.01168304, "auxiliary_loss_mlp": 0.01025454, "balance_loss_clip": 1.01244831, "balance_loss_mlp": 1.01846814, "epoch": 0.5646606144411712, "flos": 25010058672000.0, "grad_norm": 1.6425765070288223, "language_loss": 0.77836245, "learning_rate": 1.6796703227459935e-06, "loss": 0.8003, "num_input_tokens_seen": 101329420, "step": 4696, "time_per_iteration": 2.677619218826294 }, { "auxiliary_loss_clip": 0.01158601, "auxiliary_loss_mlp": 0.01026593, "balance_loss_clip": 0.89487398, "balance_loss_mlp": 1.0193454, "epoch": 0.5647808573318103, "flos": 36539645806080.0, "grad_norm": 1.6462647274254882, "language_loss": 0.75904095, "learning_rate": 1.6789014317194407e-06, "loss": 0.78089291, "num_input_tokens_seen": 101350900, "step": 4697, "time_per_iteration": 2.878448963165283 }, { "auxiliary_loss_clip": 0.01177938, "auxiliary_loss_mlp": 0.01027239, "balance_loss_clip": 0.97327268, "balance_loss_mlp": 1.01949668, "epoch": 0.5649011002224493, "flos": 22528451842560.0, "grad_norm": 2.3213480082713365, "language_loss": 0.72931707, "learning_rate": 1.6781325894032853e-06, "loss": 0.75136882, "num_input_tokens_seen": 101369860, "step": 4698, "time_per_iteration": 2.681361675262451 }, { "auxiliary_loss_clip": 0.01171943, "auxiliary_loss_mlp": 0.01029649, "balance_loss_clip": 0.9774847, "balance_loss_mlp": 1.02234721, "epoch": 0.5650213431130885, "flos": 18515147304960.0, "grad_norm": 5.586758450630948, "language_loss": 0.91707987, "learning_rate": 1.6773637959141608e-06, "loss": 0.93909574, "num_input_tokens_seen": 101386835, "step": 4699, "time_per_iteration": 3.600480794906616 }, { "auxiliary_loss_clip": 0.01163226, "auxiliary_loss_mlp": 0.01026363, "balance_loss_clip": 0.97327632, "balance_loss_mlp": 1.01918042, "epoch": 0.5651415860037275, "flos": 17526310819200.0, "grad_norm": 3.7740869082633, "language_loss": 0.66552258, "learning_rate": 1.6765950513686915e-06, "loss": 0.68741846, "num_input_tokens_seen": 101404945, "step": 4700, "time_per_iteration": 2.628237724304199 }, { "auxiliary_loss_clip": 0.01168864, "auxiliary_loss_mlp": 0.01031756, "balance_loss_clip": 0.89476097, "balance_loss_mlp": 1.02372718, "epoch": 0.5652618288943666, "flos": 25520026014720.0, "grad_norm": 1.9167733343700302, "language_loss": 0.76260805, "learning_rate": 1.675826355883496e-06, "loss": 0.78461432, "num_input_tokens_seen": 101424160, "step": 4701, "time_per_iteration": 2.791165351867676 }, { "auxiliary_loss_clip": 0.01167855, "auxiliary_loss_mlp": 0.01023553, "balance_loss_clip": 0.97638559, "balance_loss_mlp": 1.0158937, "epoch": 0.5653820717850057, "flos": 19683105937920.0, "grad_norm": 2.1834545761605897, "language_loss": 0.79132092, "learning_rate": 1.6750577095751848e-06, "loss": 0.81323504, "num_input_tokens_seen": 101443270, "step": 4702, "time_per_iteration": 3.6856863498687744 }, { "auxiliary_loss_clip": 0.01176604, "auxiliary_loss_mlp": 0.01025889, "balance_loss_clip": 1.05290532, "balance_loss_mlp": 1.0181371, "epoch": 0.5655023146756448, "flos": 26979722910720.0, "grad_norm": 1.6852066697785448, "language_loss": 0.72874498, "learning_rate": 1.6742891125603605e-06, "loss": 0.75076997, "num_input_tokens_seen": 101464175, "step": 4703, "time_per_iteration": 2.6446151733398438 }, { "auxiliary_loss_clip": 0.01173385, "auxiliary_loss_mlp": 0.01025275, "balance_loss_clip": 1.01322424, "balance_loss_mlp": 1.0176158, "epoch": 0.5656225575662839, "flos": 27669351104640.0, "grad_norm": 1.6886718177087907, "language_loss": 0.72032708, "learning_rate": 1.6735205649556185e-06, "loss": 0.74231374, "num_input_tokens_seen": 101484045, "step": 4704, "time_per_iteration": 2.760829210281372 }, { "auxiliary_loss_clip": 0.01174795, "auxiliary_loss_mlp": 0.01031412, "balance_loss_clip": 0.93624824, "balance_loss_mlp": 1.0242182, "epoch": 0.5657428004569229, "flos": 24349732997760.0, "grad_norm": 1.583589444622084, "language_loss": 0.84954679, "learning_rate": 1.6727520668775476e-06, "loss": 0.87160885, "num_input_tokens_seen": 101504330, "step": 4705, "time_per_iteration": 2.765378475189209 }, { "auxiliary_loss_clip": 0.01176797, "auxiliary_loss_mlp": 0.01027906, "balance_loss_clip": 1.05089355, "balance_loss_mlp": 1.02027678, "epoch": 0.5658630433475621, "flos": 21944041562880.0, "grad_norm": 1.487054163134341, "language_loss": 0.75243551, "learning_rate": 1.6719836184427275e-06, "loss": 0.77448261, "num_input_tokens_seen": 101524635, "step": 4706, "time_per_iteration": 3.5516374111175537 }, { "auxiliary_loss_clip": 0.0116591, "auxiliary_loss_mlp": 0.01022691, "balance_loss_clip": 0.97146618, "balance_loss_mlp": 1.01529348, "epoch": 0.5659832862382012, "flos": 30409012218240.0, "grad_norm": 1.71248508188343, "language_loss": 0.64732778, "learning_rate": 1.671215219767733e-06, "loss": 0.66921377, "num_input_tokens_seen": 101544095, "step": 4707, "time_per_iteration": 2.858273983001709 }, { "auxiliary_loss_clip": 0.01167742, "auxiliary_loss_mlp": 0.01030578, "balance_loss_clip": 0.89434886, "balance_loss_mlp": 1.0229013, "epoch": 0.5661035291288402, "flos": 13188194570880.0, "grad_norm": 2.1477416311648985, "language_loss": 0.7614249, "learning_rate": 1.670446870969127e-06, "loss": 0.78340816, "num_input_tokens_seen": 101561760, "step": 4708, "time_per_iteration": 2.7632627487182617 }, { "auxiliary_loss_clip": 0.01175972, "auxiliary_loss_mlp": 0.01032927, "balance_loss_clip": 0.97553003, "balance_loss_mlp": 1.0255239, "epoch": 0.5662237720194794, "flos": 16143032108160.0, "grad_norm": 2.0414016687088945, "language_loss": 0.80030489, "learning_rate": 1.6696785721634685e-06, "loss": 0.82239389, "num_input_tokens_seen": 101576245, "step": 4709, "time_per_iteration": 2.7199714183807373 }, { "auxiliary_loss_clip": 0.01174496, "auxiliary_loss_mlp": 0.01031695, "balance_loss_clip": 1.01288664, "balance_loss_mlp": 1.02318335, "epoch": 0.5663440149101184, "flos": 17676848718720.0, "grad_norm": 1.9057437769198378, "language_loss": 0.73677266, "learning_rate": 1.6689103234673086e-06, "loss": 0.7588346, "num_input_tokens_seen": 101594565, "step": 4710, "time_per_iteration": 2.629354238510132 }, { "auxiliary_loss_clip": 0.01173172, "auxiliary_loss_mlp": 0.01027038, "balance_loss_clip": 0.97619104, "balance_loss_mlp": 1.01939666, "epoch": 0.5664642578007575, "flos": 23368330627200.0, "grad_norm": 2.2275917344385534, "language_loss": 0.7700516, "learning_rate": 1.668142124997189e-06, "loss": 0.7920537, "num_input_tokens_seen": 101614225, "step": 4711, "time_per_iteration": 2.714325189590454 }, { "auxiliary_loss_clip": 0.01069232, "auxiliary_loss_mlp": 0.01002572, "balance_loss_clip": 0.94230366, "balance_loss_mlp": 1.00097454, "epoch": 0.5665845006913967, "flos": 65516470945920.0, "grad_norm": 0.7404001513035104, "language_loss": 0.59854656, "learning_rate": 1.6673739768696453e-06, "loss": 0.6192646, "num_input_tokens_seen": 101680795, "step": 4712, "time_per_iteration": 3.2927589416503906 }, { "auxiliary_loss_clip": 0.01176923, "auxiliary_loss_mlp": 0.0102926, "balance_loss_clip": 0.97312129, "balance_loss_mlp": 1.02182698, "epoch": 0.5667047435820357, "flos": 26140885620480.0, "grad_norm": 1.6296513958422951, "language_loss": 0.77403402, "learning_rate": 1.6666058792012052e-06, "loss": 0.79609585, "num_input_tokens_seen": 101701680, "step": 4713, "time_per_iteration": 2.731757402420044 }, { "auxiliary_loss_clip": 0.01075013, "auxiliary_loss_mlp": 0.01003953, "balance_loss_clip": 0.97806442, "balance_loss_mlp": 1.00226045, "epoch": 0.5668249864726748, "flos": 71866949725440.0, "grad_norm": 0.999325375590386, "language_loss": 0.68743938, "learning_rate": 1.6658378321083878e-06, "loss": 0.70822901, "num_input_tokens_seen": 101766010, "step": 4714, "time_per_iteration": 3.306774616241455 }, { "auxiliary_loss_clip": 0.01175545, "auxiliary_loss_mlp": 0.01028803, "balance_loss_clip": 0.86007249, "balance_loss_mlp": 1.0214057, "epoch": 0.5669452293633139, "flos": 22195667312640.0, "grad_norm": 1.6840304978772629, "language_loss": 0.82626092, "learning_rate": 1.6650698357077055e-06, "loss": 0.84830439, "num_input_tokens_seen": 101783055, "step": 4715, "time_per_iteration": 2.844304323196411 }, { "auxiliary_loss_clip": 0.01172428, "auxiliary_loss_mlp": 0.01028648, "balance_loss_clip": 0.97040623, "balance_loss_mlp": 1.02076268, "epoch": 0.567065472253953, "flos": 18223193560320.0, "grad_norm": 2.572769295949536, "language_loss": 0.80670482, "learning_rate": 1.6643018901156632e-06, "loss": 0.82871556, "num_input_tokens_seen": 101802150, "step": 4716, "time_per_iteration": 2.680852174758911 }, { "auxiliary_loss_clip": 0.0117783, "auxiliary_loss_mlp": 0.0102572, "balance_loss_clip": 0.97431529, "balance_loss_mlp": 1.01790547, "epoch": 0.567185715144592, "flos": 20371548983040.0, "grad_norm": 2.148523099456146, "language_loss": 0.79310155, "learning_rate": 1.6635339954487566e-06, "loss": 0.81513709, "num_input_tokens_seen": 101818025, "step": 4717, "time_per_iteration": 2.699856758117676 }, { "auxiliary_loss_clip": 0.01173756, "auxiliary_loss_mlp": 0.01033211, "balance_loss_clip": 0.97308159, "balance_loss_mlp": 1.02552807, "epoch": 0.5673059580352312, "flos": 23221348174080.0, "grad_norm": 1.863849048831106, "language_loss": 0.81985176, "learning_rate": 1.6627661518234765e-06, "loss": 0.84192145, "num_input_tokens_seen": 101837280, "step": 4718, "time_per_iteration": 2.7257943153381348 }, { "auxiliary_loss_clip": 0.01174374, "auxiliary_loss_mlp": 0.01024822, "balance_loss_clip": 0.89885068, "balance_loss_mlp": 1.01709688, "epoch": 0.5674262009258703, "flos": 21719599430400.0, "grad_norm": 1.5810766195283208, "language_loss": 0.85530871, "learning_rate": 1.661998359356302e-06, "loss": 0.87730062, "num_input_tokens_seen": 101856310, "step": 4719, "time_per_iteration": 2.7810370922088623 }, { "auxiliary_loss_clip": 0.0107015, "auxiliary_loss_mlp": 0.01005171, "balance_loss_clip": 1.01417351, "balance_loss_mlp": 1.00345409, "epoch": 0.5675464438165093, "flos": 67470369114240.0, "grad_norm": 0.7421974344085144, "language_loss": 0.55870813, "learning_rate": 1.6612306181637077e-06, "loss": 0.57946134, "num_input_tokens_seen": 101915635, "step": 4720, "time_per_iteration": 3.1846373081207275 }, { "auxiliary_loss_clip": 0.01169985, "auxiliary_loss_mlp": 0.01026359, "balance_loss_clip": 0.93450242, "balance_loss_mlp": 1.01921213, "epoch": 0.5676666867071485, "flos": 18879173688960.0, "grad_norm": 2.1621083930153318, "language_loss": 0.66010261, "learning_rate": 1.6604629283621598e-06, "loss": 0.68206608, "num_input_tokens_seen": 101933565, "step": 4721, "time_per_iteration": 3.6268954277038574 }, { "auxiliary_loss_clip": 0.01179437, "auxiliary_loss_mlp": 0.01028273, "balance_loss_clip": 1.05267906, "balance_loss_mlp": 1.020679, "epoch": 0.5677869295977875, "flos": 33546778744320.0, "grad_norm": 1.9942710892160898, "language_loss": 0.7483145, "learning_rate": 1.6596952900681152e-06, "loss": 0.77039158, "num_input_tokens_seen": 101954325, "step": 4722, "time_per_iteration": 2.7123188972473145 }, { "auxiliary_loss_clip": 0.01165162, "auxiliary_loss_mlp": 0.0103092, "balance_loss_clip": 0.90124357, "balance_loss_mlp": 1.02308226, "epoch": 0.5679071724884266, "flos": 28037256157440.0, "grad_norm": 1.9988902246332048, "language_loss": 0.81699502, "learning_rate": 1.658927703398025e-06, "loss": 0.83895588, "num_input_tokens_seen": 101974390, "step": 4723, "time_per_iteration": 2.8030877113342285 }, { "auxiliary_loss_clip": 0.0116589, "auxiliary_loss_mlp": 0.01023064, "balance_loss_clip": 0.89258683, "balance_loss_mlp": 1.01594138, "epoch": 0.5680274153790658, "flos": 23550110380800.0, "grad_norm": 2.1176772405040163, "language_loss": 0.78035873, "learning_rate": 1.6581601684683309e-06, "loss": 0.80224824, "num_input_tokens_seen": 101994815, "step": 4724, "time_per_iteration": 2.7520222663879395 }, { "auxiliary_loss_clip": 0.01172774, "auxiliary_loss_mlp": 0.01023474, "balance_loss_clip": 1.011374, "balance_loss_mlp": 1.01562691, "epoch": 0.5681476582697048, "flos": 22455158140800.0, "grad_norm": 4.593705949270212, "language_loss": 0.68472016, "learning_rate": 1.6573926853954674e-06, "loss": 0.70668262, "num_input_tokens_seen": 102012400, "step": 4725, "time_per_iteration": 3.657846689224243 }, { "auxiliary_loss_clip": 0.01163097, "auxiliary_loss_mlp": 0.01028242, "balance_loss_clip": 0.9704054, "balance_loss_mlp": 1.02096415, "epoch": 0.5682679011603439, "flos": 19536913584000.0, "grad_norm": 2.148057278187096, "language_loss": 0.83021867, "learning_rate": 1.6566252542958608e-06, "loss": 0.85213202, "num_input_tokens_seen": 102031900, "step": 4726, "time_per_iteration": 2.7036218643188477 }, { "auxiliary_loss_clip": 0.01160445, "auxiliary_loss_mlp": 0.01022747, "balance_loss_clip": 0.93493599, "balance_loss_mlp": 1.01524234, "epoch": 0.568388144050983, "flos": 28765488493440.0, "grad_norm": 1.8122234543919784, "language_loss": 0.78400767, "learning_rate": 1.6558578752859305e-06, "loss": 0.80583954, "num_input_tokens_seen": 102050860, "step": 4727, "time_per_iteration": 2.7622835636138916 }, { "auxiliary_loss_clip": 0.01165415, "auxiliary_loss_mlp": 0.01027068, "balance_loss_clip": 0.93199652, "balance_loss_mlp": 1.01934361, "epoch": 0.5685083869416221, "flos": 21209452519680.0, "grad_norm": 1.7477053568224366, "language_loss": 0.78596908, "learning_rate": 1.6550905484820865e-06, "loss": 0.80789387, "num_input_tokens_seen": 102069320, "step": 4728, "time_per_iteration": 3.5730464458465576 }, { "auxiliary_loss_clip": 0.01174396, "auxiliary_loss_mlp": 0.01026468, "balance_loss_clip": 1.04899859, "balance_loss_mlp": 1.01861262, "epoch": 0.5686286298322611, "flos": 24827021942400.0, "grad_norm": 2.1093368140561473, "language_loss": 0.79129207, "learning_rate": 1.6543232740007328e-06, "loss": 0.81330067, "num_input_tokens_seen": 102086435, "step": 4729, "time_per_iteration": 2.624187469482422 }, { "auxiliary_loss_clip": 0.01176591, "auxiliary_loss_mlp": 0.01029795, "balance_loss_clip": 1.01372623, "balance_loss_mlp": 1.02204633, "epoch": 0.5687488727229003, "flos": 26615121909120.0, "grad_norm": 2.7410923100790834, "language_loss": 0.67068863, "learning_rate": 1.653556051958263e-06, "loss": 0.69275248, "num_input_tokens_seen": 102106115, "step": 4730, "time_per_iteration": 2.6919617652893066 }, { "auxiliary_loss_clip": 0.01150053, "auxiliary_loss_mlp": 0.01024591, "balance_loss_clip": 0.85747713, "balance_loss_mlp": 1.01691973, "epoch": 0.5688691156135394, "flos": 20808725414400.0, "grad_norm": 1.6234860811729417, "language_loss": 0.73883748, "learning_rate": 1.6527888824710642e-06, "loss": 0.760584, "num_input_tokens_seen": 102125715, "step": 4731, "time_per_iteration": 3.7288870811462402 }, { "auxiliary_loss_clip": 0.01160485, "auxiliary_loss_mlp": 0.01027762, "balance_loss_clip": 0.9315573, "balance_loss_mlp": 1.01951861, "epoch": 0.5689893585041784, "flos": 25880963829120.0, "grad_norm": 2.8633053061204286, "language_loss": 0.76189291, "learning_rate": 1.6520217656555166e-06, "loss": 0.78377545, "num_input_tokens_seen": 102145005, "step": 4732, "time_per_iteration": 2.732875347137451 }, { "auxiliary_loss_clip": 0.0115693, "auxiliary_loss_mlp": 0.01024137, "balance_loss_clip": 0.97036564, "balance_loss_mlp": 1.01688945, "epoch": 0.5691096013948175, "flos": 23477463123840.0, "grad_norm": 1.3894788516760102, "language_loss": 0.70700735, "learning_rate": 1.65125470162799e-06, "loss": 0.72881806, "num_input_tokens_seen": 102165360, "step": 4733, "time_per_iteration": 2.738770008087158 }, { "auxiliary_loss_clip": 0.01172918, "auxiliary_loss_mlp": 0.01023671, "balance_loss_clip": 0.93473047, "balance_loss_mlp": 1.01582718, "epoch": 0.5692298442854566, "flos": 18075600576000.0, "grad_norm": 4.407061538691369, "language_loss": 0.69437271, "learning_rate": 1.6504876905048485e-06, "loss": 0.71633863, "num_input_tokens_seen": 102182320, "step": 4734, "time_per_iteration": 2.6807284355163574 }, { "auxiliary_loss_clip": 0.01174314, "auxiliary_loss_mlp": 0.01027306, "balance_loss_clip": 1.05232072, "balance_loss_mlp": 1.02009392, "epoch": 0.5693500871760957, "flos": 23039317025280.0, "grad_norm": 1.6104928706774648, "language_loss": 0.72290313, "learning_rate": 1.6497207324024464e-06, "loss": 0.7449193, "num_input_tokens_seen": 102201220, "step": 4735, "time_per_iteration": 2.669229507446289 }, { "auxiliary_loss_clip": 0.01178573, "auxiliary_loss_mlp": 0.01027987, "balance_loss_clip": 0.97255075, "balance_loss_mlp": 1.02017307, "epoch": 0.5694703300667348, "flos": 18989670902400.0, "grad_norm": 2.6952107407727146, "language_loss": 0.83023071, "learning_rate": 1.6489538274371305e-06, "loss": 0.85229635, "num_input_tokens_seen": 102219825, "step": 4736, "time_per_iteration": 2.791064739227295 }, { "auxiliary_loss_clip": 0.01169692, "auxiliary_loss_mlp": 0.01027401, "balance_loss_clip": 1.01390123, "balance_loss_mlp": 1.01993299, "epoch": 0.5695905729573739, "flos": 21908705558400.0, "grad_norm": 1.8787833095572182, "language_loss": 0.82850647, "learning_rate": 1.6481869757252396e-06, "loss": 0.8504774, "num_input_tokens_seen": 102238160, "step": 4737, "time_per_iteration": 2.678405284881592 }, { "auxiliary_loss_clip": 0.0117375, "auxiliary_loss_mlp": 0.01026541, "balance_loss_clip": 1.01316428, "balance_loss_mlp": 1.01927257, "epoch": 0.569710815848013, "flos": 28476659232000.0, "grad_norm": 1.314083876610931, "language_loss": 0.71678555, "learning_rate": 1.647420177383105e-06, "loss": 0.73878843, "num_input_tokens_seen": 102261030, "step": 4738, "time_per_iteration": 2.70883846282959 }, { "auxiliary_loss_clip": 0.01170677, "auxiliary_loss_mlp": 0.010305, "balance_loss_clip": 1.01406765, "balance_loss_mlp": 1.02323413, "epoch": 0.569831058738652, "flos": 28366162018560.0, "grad_norm": 1.7086835090041654, "language_loss": 0.72620237, "learning_rate": 1.646653432527049e-06, "loss": 0.74821413, "num_input_tokens_seen": 102281670, "step": 4739, "time_per_iteration": 2.761516809463501 }, { "auxiliary_loss_clip": 0.01172045, "auxiliary_loss_mlp": 0.0102874, "balance_loss_clip": 0.93634468, "balance_loss_mlp": 1.02166486, "epoch": 0.5699513016292912, "flos": 25849973370240.0, "grad_norm": 1.419723249428737, "language_loss": 0.74011272, "learning_rate": 1.645886741273387e-06, "loss": 0.7621206, "num_input_tokens_seen": 102303485, "step": 4740, "time_per_iteration": 2.7844183444976807 }, { "auxiliary_loss_clip": 0.01173195, "auxiliary_loss_mlp": 0.01031507, "balance_loss_clip": 0.93975806, "balance_loss_mlp": 1.02324581, "epoch": 0.5700715445199303, "flos": 18037858360320.0, "grad_norm": 2.3441357432506136, "language_loss": 0.73310649, "learning_rate": 1.645120103738424e-06, "loss": 0.75515354, "num_input_tokens_seen": 102320995, "step": 4741, "time_per_iteration": 2.677733898162842 }, { "auxiliary_loss_clip": 0.01159767, "auxiliary_loss_mlp": 0.01122531, "balance_loss_clip": 1.01045728, "balance_loss_mlp": 0.0, "epoch": 0.5701917874105693, "flos": 11473352392320.0, "grad_norm": 2.100187323344353, "language_loss": 0.83790815, "learning_rate": 1.6443535200384591e-06, "loss": 0.86073112, "num_input_tokens_seen": 102339170, "step": 4742, "time_per_iteration": 2.6587533950805664 }, { "auxiliary_loss_clip": 0.01173708, "auxiliary_loss_mlp": 0.01028957, "balance_loss_clip": 1.05205595, "balance_loss_mlp": 1.02109528, "epoch": 0.5703120303012085, "flos": 21761759018880.0, "grad_norm": 1.5273493201318196, "language_loss": 0.70591605, "learning_rate": 1.6435869902897827e-06, "loss": 0.72794271, "num_input_tokens_seen": 102357750, "step": 4743, "time_per_iteration": 2.6389923095703125 }, { "auxiliary_loss_clip": 0.0107523, "auxiliary_loss_mlp": 0.01000032, "balance_loss_clip": 0.90207839, "balance_loss_mlp": 0.99826759, "epoch": 0.5704322731918475, "flos": 56746258513920.0, "grad_norm": 0.8001991900012321, "language_loss": 0.61969638, "learning_rate": 1.6428205146086764e-06, "loss": 0.64044905, "num_input_tokens_seen": 102419730, "step": 4744, "time_per_iteration": 3.348389148712158 }, { "auxiliary_loss_clip": 0.01176283, "auxiliary_loss_mlp": 0.01022321, "balance_loss_clip": 0.97368252, "balance_loss_mlp": 1.01479244, "epoch": 0.5705525160824866, "flos": 20741141975040.0, "grad_norm": 1.5019344876155356, "language_loss": 0.70847118, "learning_rate": 1.6420540931114142e-06, "loss": 0.73045719, "num_input_tokens_seen": 102440320, "step": 4745, "time_per_iteration": 2.6813278198242188 }, { "auxiliary_loss_clip": 0.01176429, "auxiliary_loss_mlp": 0.01028252, "balance_loss_clip": 0.97412622, "balance_loss_mlp": 1.02053905, "epoch": 0.5706727589731257, "flos": 18771262254720.0, "grad_norm": 1.6043075690117343, "language_loss": 0.79136777, "learning_rate": 1.6412877259142616e-06, "loss": 0.81341457, "num_input_tokens_seen": 102460240, "step": 4746, "time_per_iteration": 2.696263074874878 }, { "auxiliary_loss_clip": 0.01168875, "auxiliary_loss_mlp": 0.01026773, "balance_loss_clip": 0.97496474, "balance_loss_mlp": 1.01955462, "epoch": 0.5707930018637648, "flos": 27634733372160.0, "grad_norm": 2.222708711526621, "language_loss": 0.7390312, "learning_rate": 1.6405214131334757e-06, "loss": 0.76098764, "num_input_tokens_seen": 102478765, "step": 4747, "time_per_iteration": 2.770901679992676 }, { "auxiliary_loss_clip": 0.01164294, "auxiliary_loss_mlp": 0.01031228, "balance_loss_clip": 0.89837623, "balance_loss_mlp": 1.02334774, "epoch": 0.5709132447544039, "flos": 27597673514880.0, "grad_norm": 1.688696082041837, "language_loss": 0.79649174, "learning_rate": 1.6397551548853052e-06, "loss": 0.81844693, "num_input_tokens_seen": 102496930, "step": 4748, "time_per_iteration": 3.7098007202148438 }, { "auxiliary_loss_clip": 0.01171839, "auxiliary_loss_mlp": 0.01025211, "balance_loss_clip": 0.97494471, "balance_loss_mlp": 1.01716089, "epoch": 0.571033487645043, "flos": 21686095019520.0, "grad_norm": 1.6410677105662794, "language_loss": 0.70870233, "learning_rate": 1.6389889512859917e-06, "loss": 0.7306729, "num_input_tokens_seen": 102516590, "step": 4749, "time_per_iteration": 2.6911849975585938 }, { "auxiliary_loss_clip": 0.01074474, "auxiliary_loss_mlp": 0.01001421, "balance_loss_clip": 0.94041681, "balance_loss_mlp": 0.99975187, "epoch": 0.5711537305356821, "flos": 70181445980160.0, "grad_norm": 0.8243586683075905, "language_loss": 0.60389978, "learning_rate": 1.638222802451767e-06, "loss": 0.62465876, "num_input_tokens_seen": 102578070, "step": 4750, "time_per_iteration": 3.275149345397949 }, { "auxiliary_loss_clip": 0.01166903, "auxiliary_loss_mlp": 0.01025271, "balance_loss_clip": 1.01257575, "balance_loss_mlp": 1.01793408, "epoch": 0.5712739734263211, "flos": 24717494396160.0, "grad_norm": 1.5613844825268155, "language_loss": 0.75275981, "learning_rate": 1.6374567084988561e-06, "loss": 0.77468157, "num_input_tokens_seen": 102599255, "step": 4751, "time_per_iteration": 2.6823980808258057 }, { "auxiliary_loss_clip": 0.01180497, "auxiliary_loss_mlp": 0.01036344, "balance_loss_clip": 0.97824252, "balance_loss_mlp": 1.02821934, "epoch": 0.5713942163169603, "flos": 26578169792640.0, "grad_norm": 1.8225584981048755, "language_loss": 0.76408833, "learning_rate": 1.6366906695434738e-06, "loss": 0.78625673, "num_input_tokens_seen": 102621775, "step": 4752, "time_per_iteration": 3.6022839546203613 }, { "auxiliary_loss_clip": 0.01173807, "auxiliary_loss_mlp": 0.01028197, "balance_loss_clip": 1.01384282, "balance_loss_mlp": 1.02078235, "epoch": 0.5715144592075994, "flos": 21142443697920.0, "grad_norm": 2.430291003350859, "language_loss": 0.85768527, "learning_rate": 1.6359246857018275e-06, "loss": 0.87970531, "num_input_tokens_seen": 102639305, "step": 4753, "time_per_iteration": 2.609912395477295 }, { "auxiliary_loss_clip": 0.01163528, "auxiliary_loss_mlp": 0.01025193, "balance_loss_clip": 0.89313173, "balance_loss_mlp": 1.01799595, "epoch": 0.5716347020982384, "flos": 23330265189120.0, "grad_norm": 1.7811179777120951, "language_loss": 0.78110683, "learning_rate": 1.6351587570901178e-06, "loss": 0.80299401, "num_input_tokens_seen": 102659430, "step": 4754, "time_per_iteration": 3.7311887741088867 }, { "auxiliary_loss_clip": 0.01172526, "auxiliary_loss_mlp": 0.01025225, "balance_loss_clip": 0.93687803, "balance_loss_mlp": 1.01754808, "epoch": 0.5717549449888776, "flos": 17009555806080.0, "grad_norm": 2.741784543450186, "language_loss": 0.76218492, "learning_rate": 1.634392883824534e-06, "loss": 0.7841624, "num_input_tokens_seen": 102671430, "step": 4755, "time_per_iteration": 2.63808274269104 }, { "auxiliary_loss_clip": 0.01174214, "auxiliary_loss_mlp": 0.01028858, "balance_loss_clip": 0.8967427, "balance_loss_mlp": 1.02129364, "epoch": 0.5718751878795166, "flos": 35518130922240.0, "grad_norm": 1.6102037032520118, "language_loss": 0.67592037, "learning_rate": 1.6336270660212595e-06, "loss": 0.69795114, "num_input_tokens_seen": 102693025, "step": 4756, "time_per_iteration": 2.8633482456207275 }, { "auxiliary_loss_clip": 0.011739, "auxiliary_loss_mlp": 0.01024781, "balance_loss_clip": 0.97751272, "balance_loss_mlp": 1.01657319, "epoch": 0.5719954307701557, "flos": 38613989255040.0, "grad_norm": 1.956902567418706, "language_loss": 0.66025215, "learning_rate": 1.6328613037964676e-06, "loss": 0.682239, "num_input_tokens_seen": 102716090, "step": 4757, "time_per_iteration": 3.717622995376587 }, { "auxiliary_loss_clip": 0.01172674, "auxiliary_loss_mlp": 0.01024733, "balance_loss_clip": 1.01195133, "balance_loss_mlp": 1.01702023, "epoch": 0.5721156736607949, "flos": 20631111638400.0, "grad_norm": 2.1447276975347087, "language_loss": 0.67949605, "learning_rate": 1.6320955972663241e-06, "loss": 0.70147014, "num_input_tokens_seen": 102735685, "step": 4758, "time_per_iteration": 2.7026333808898926 }, { "auxiliary_loss_clip": 0.01174321, "auxiliary_loss_mlp": 0.01026218, "balance_loss_clip": 1.01167679, "balance_loss_mlp": 1.01875544, "epoch": 0.5722359165514339, "flos": 37415076076800.0, "grad_norm": 1.678283771622943, "language_loss": 0.65292621, "learning_rate": 1.6313299465469857e-06, "loss": 0.67493165, "num_input_tokens_seen": 102758415, "step": 4759, "time_per_iteration": 2.812150716781616 }, { "auxiliary_loss_clip": 0.01166092, "auxiliary_loss_mlp": 0.01029675, "balance_loss_clip": 1.01084888, "balance_loss_mlp": 1.02170563, "epoch": 0.572356159442073, "flos": 21972877205760.0, "grad_norm": 2.442846494854772, "language_loss": 0.79537207, "learning_rate": 1.6305643517546014e-06, "loss": 0.81732976, "num_input_tokens_seen": 102773795, "step": 4760, "time_per_iteration": 2.6285347938537598 }, { "auxiliary_loss_clip": 0.01172823, "auxiliary_loss_mlp": 0.01025293, "balance_loss_clip": 1.05152512, "balance_loss_mlp": 1.01809299, "epoch": 0.5724764023327121, "flos": 19135540033920.0, "grad_norm": 1.8488810105844748, "language_loss": 0.84791791, "learning_rate": 1.629798813005311e-06, "loss": 0.86989909, "num_input_tokens_seen": 102793515, "step": 4761, "time_per_iteration": 2.6362738609313965 }, { "auxiliary_loss_clip": 0.01174867, "auxiliary_loss_mlp": 0.01025309, "balance_loss_clip": 0.90139019, "balance_loss_mlp": 1.01759589, "epoch": 0.5725966452233512, "flos": 22819759142400.0, "grad_norm": 1.9989615431461742, "language_loss": 0.71057761, "learning_rate": 1.6290333304152473e-06, "loss": 0.73257935, "num_input_tokens_seen": 102813390, "step": 4762, "time_per_iteration": 2.7919795513153076 }, { "auxiliary_loss_clip": 0.01171596, "auxiliary_loss_mlp": 0.01026669, "balance_loss_clip": 0.97690493, "balance_loss_mlp": 1.01860452, "epoch": 0.5727168881139902, "flos": 41496610498560.0, "grad_norm": 1.8746057721356904, "language_loss": 0.56850934, "learning_rate": 1.6282679041005314e-06, "loss": 0.59049195, "num_input_tokens_seen": 102838980, "step": 4763, "time_per_iteration": 2.8101730346679688 }, { "auxiliary_loss_clip": 0.0116077, "auxiliary_loss_mlp": 0.01028682, "balance_loss_clip": 0.97094542, "balance_loss_mlp": 1.02124953, "epoch": 0.5728371310046293, "flos": 14647675985280.0, "grad_norm": 2.0351558808883325, "language_loss": 0.86791599, "learning_rate": 1.6275025341772789e-06, "loss": 0.88981056, "num_input_tokens_seen": 102855285, "step": 4764, "time_per_iteration": 2.6706180572509766 }, { "auxiliary_loss_clip": 0.01172797, "auxiliary_loss_mlp": 0.01038671, "balance_loss_clip": 0.97424316, "balance_loss_mlp": 1.03039765, "epoch": 0.5729573738952685, "flos": 21506613736320.0, "grad_norm": 12.605694393460332, "language_loss": 0.81779039, "learning_rate": 1.626737220761596e-06, "loss": 0.83990502, "num_input_tokens_seen": 102872750, "step": 4765, "time_per_iteration": 2.6634650230407715 }, { "auxiliary_loss_clip": 0.01174336, "auxiliary_loss_mlp": 0.0103118, "balance_loss_clip": 1.01557326, "balance_loss_mlp": 1.0237056, "epoch": 0.5730776167859075, "flos": 23621680229760.0, "grad_norm": 1.8488287388489455, "language_loss": 0.78566039, "learning_rate": 1.62597196396958e-06, "loss": 0.80771554, "num_input_tokens_seen": 102890920, "step": 4766, "time_per_iteration": 2.631263017654419 }, { "auxiliary_loss_clip": 0.01172179, "auxiliary_loss_mlp": 0.01025664, "balance_loss_clip": 1.0126791, "balance_loss_mlp": 1.0180881, "epoch": 0.5731978596765466, "flos": 25739224761600.0, "grad_norm": 1.864346494064599, "language_loss": 0.85336709, "learning_rate": 1.6252067639173197e-06, "loss": 0.87534559, "num_input_tokens_seen": 102912830, "step": 4767, "time_per_iteration": 2.6816160678863525 }, { "auxiliary_loss_clip": 0.01174123, "auxiliary_loss_mlp": 0.01027791, "balance_loss_clip": 1.01251638, "balance_loss_mlp": 1.01956582, "epoch": 0.5733181025671857, "flos": 26359509749760.0, "grad_norm": 2.040737834314138, "language_loss": 0.69746053, "learning_rate": 1.6244416207208956e-06, "loss": 0.71947968, "num_input_tokens_seen": 102933765, "step": 4768, "time_per_iteration": 2.7073161602020264 }, { "auxiliary_loss_clip": 0.01173431, "auxiliary_loss_mlp": 0.0102581, "balance_loss_clip": 0.9370122, "balance_loss_mlp": 1.01756692, "epoch": 0.5734383454578248, "flos": 29423874833280.0, "grad_norm": 1.9693823241116069, "language_loss": 0.73497069, "learning_rate": 1.6236765344963787e-06, "loss": 0.75696307, "num_input_tokens_seen": 102955025, "step": 4769, "time_per_iteration": 2.8025882244110107 }, { "auxiliary_loss_clip": 0.01170727, "auxiliary_loss_mlp": 0.01025939, "balance_loss_clip": 0.97561687, "balance_loss_mlp": 1.01821399, "epoch": 0.5735585883484638, "flos": 34969954487040.0, "grad_norm": 1.9393744449740848, "language_loss": 0.69038719, "learning_rate": 1.6229115053598322e-06, "loss": 0.71235389, "num_input_tokens_seen": 102976780, "step": 4770, "time_per_iteration": 2.824862480163574 }, { "auxiliary_loss_clip": 0.01173648, "auxiliary_loss_mlp": 0.01025462, "balance_loss_clip": 1.01385128, "balance_loss_mlp": 1.01847315, "epoch": 0.573678831239103, "flos": 18770759464320.0, "grad_norm": 1.7060467627939873, "language_loss": 0.71983063, "learning_rate": 1.6221465334273108e-06, "loss": 0.74182171, "num_input_tokens_seen": 102995990, "step": 4771, "time_per_iteration": 2.6687376499176025 }, { "auxiliary_loss_clip": 0.01177118, "auxiliary_loss_mlp": 0.01022211, "balance_loss_clip": 0.93732089, "balance_loss_mlp": 1.01442671, "epoch": 0.5737990741297421, "flos": 25702883176320.0, "grad_norm": 1.9653903056684936, "language_loss": 0.61979723, "learning_rate": 1.6213816188148593e-06, "loss": 0.64179051, "num_input_tokens_seen": 103014695, "step": 4772, "time_per_iteration": 2.761899471282959 }, { "auxiliary_loss_clip": 0.01166016, "auxiliary_loss_mlp": 0.0103011, "balance_loss_clip": 0.97777212, "balance_loss_mlp": 1.02301133, "epoch": 0.5739193170203811, "flos": 27269234530560.0, "grad_norm": 1.5365047236456009, "language_loss": 0.77026248, "learning_rate": 1.6206167616385162e-06, "loss": 0.79222375, "num_input_tokens_seen": 103035760, "step": 4773, "time_per_iteration": 3.6487390995025635 }, { "auxiliary_loss_clip": 0.01180631, "auxiliary_loss_mlp": 0.01027509, "balance_loss_clip": 0.97791505, "balance_loss_mlp": 1.01926541, "epoch": 0.5740395599110203, "flos": 12239721993600.0, "grad_norm": 1.9235046245666862, "language_loss": 0.73354137, "learning_rate": 1.6198519620143078e-06, "loss": 0.75562274, "num_input_tokens_seen": 103052915, "step": 4774, "time_per_iteration": 2.6626038551330566 }, { "auxiliary_loss_clip": 0.01173976, "auxiliary_loss_mlp": 0.0103084, "balance_loss_clip": 0.93681097, "balance_loss_mlp": 1.02362823, "epoch": 0.5741598028016593, "flos": 25921399564800.0, "grad_norm": 1.7166789145351782, "language_loss": 0.77938247, "learning_rate": 1.6190872200582546e-06, "loss": 0.80143058, "num_input_tokens_seen": 103074655, "step": 4775, "time_per_iteration": 2.785268545150757 }, { "auxiliary_loss_clip": 0.01164716, "auxiliary_loss_mlp": 0.01123088, "balance_loss_clip": 0.97218621, "balance_loss_mlp": 0.0, "epoch": 0.5742800456922984, "flos": 19244133826560.0, "grad_norm": 2.010838180539087, "language_loss": 0.78247792, "learning_rate": 1.6183225358863676e-06, "loss": 0.80535591, "num_input_tokens_seen": 103091550, "step": 4776, "time_per_iteration": 2.634241819381714 }, { "auxiliary_loss_clip": 0.01164376, "auxiliary_loss_mlp": 0.01033437, "balance_loss_clip": 0.97201455, "balance_loss_mlp": 1.02545619, "epoch": 0.5744002885829376, "flos": 30920487932160.0, "grad_norm": 2.02756106894906, "language_loss": 0.71555859, "learning_rate": 1.617557909614648e-06, "loss": 0.73753679, "num_input_tokens_seen": 103110985, "step": 4777, "time_per_iteration": 2.7450969219207764 }, { "auxiliary_loss_clip": 0.01164541, "auxiliary_loss_mlp": 0.0103338, "balance_loss_clip": 0.93396753, "balance_loss_mlp": 1.02581656, "epoch": 0.5745205314735766, "flos": 23840017050240.0, "grad_norm": 1.8289295357045885, "language_loss": 0.86030221, "learning_rate": 1.6167933413590899e-06, "loss": 0.88228142, "num_input_tokens_seen": 103129890, "step": 4778, "time_per_iteration": 3.763535737991333 }, { "auxiliary_loss_clip": 0.01169176, "auxiliary_loss_mlp": 0.0102594, "balance_loss_clip": 1.01101494, "balance_loss_mlp": 1.01847148, "epoch": 0.5746407743642157, "flos": 12311902373760.0, "grad_norm": 1.9326819437535177, "language_loss": 0.90535617, "learning_rate": 1.6160288312356773e-06, "loss": 0.92730737, "num_input_tokens_seen": 103147020, "step": 4779, "time_per_iteration": 2.6137964725494385 }, { "auxiliary_loss_clip": 0.01178041, "auxiliary_loss_mlp": 0.01028639, "balance_loss_clip": 1.0131495, "balance_loss_mlp": 1.02075887, "epoch": 0.5747610172548548, "flos": 24133658734080.0, "grad_norm": 1.6297212502493128, "language_loss": 0.81600231, "learning_rate": 1.6152643793603857e-06, "loss": 0.83806914, "num_input_tokens_seen": 103167370, "step": 4780, "time_per_iteration": 3.5824038982391357 }, { "auxiliary_loss_clip": 0.01175946, "auxiliary_loss_mlp": 0.01025217, "balance_loss_clip": 1.05208802, "balance_loss_mlp": 1.01720309, "epoch": 0.5748812601454939, "flos": 25408451393280.0, "grad_norm": 1.6816300407431368, "language_loss": 0.87802023, "learning_rate": 1.6144999858491815e-06, "loss": 0.90003181, "num_input_tokens_seen": 103186000, "step": 4781, "time_per_iteration": 2.638383388519287 }, { "auxiliary_loss_clip": 0.0117695, "auxiliary_loss_mlp": 0.01024848, "balance_loss_clip": 0.97474432, "balance_loss_mlp": 1.01699221, "epoch": 0.575001503036133, "flos": 30624942827520.0, "grad_norm": 1.5993120857504985, "language_loss": 0.85962588, "learning_rate": 1.6137356508180232e-06, "loss": 0.88164383, "num_input_tokens_seen": 103207710, "step": 4782, "time_per_iteration": 2.7295517921447754 }, { "auxiliary_loss_clip": 0.01174654, "auxiliary_loss_mlp": 0.01123349, "balance_loss_clip": 1.05057049, "balance_loss_mlp": 0.0, "epoch": 0.5751217459267721, "flos": 21726566668800.0, "grad_norm": 1.7633871687166438, "language_loss": 0.8110733, "learning_rate": 1.6129713743828593e-06, "loss": 0.83405334, "num_input_tokens_seen": 103226720, "step": 4783, "time_per_iteration": 3.547368288040161 }, { "auxiliary_loss_clip": 0.01169262, "auxiliary_loss_mlp": 0.01026947, "balance_loss_clip": 0.97139376, "balance_loss_mlp": 1.02021503, "epoch": 0.5752419888174112, "flos": 21651620941440.0, "grad_norm": 1.5098922076866907, "language_loss": 0.75485927, "learning_rate": 1.6122071566596306e-06, "loss": 0.77682137, "num_input_tokens_seen": 103246995, "step": 4784, "time_per_iteration": 2.770841360092163 }, { "auxiliary_loss_clip": 0.01179361, "auxiliary_loss_mlp": 0.01031477, "balance_loss_clip": 1.01503778, "balance_loss_mlp": 1.02410984, "epoch": 0.5753622317080502, "flos": 17775997234560.0, "grad_norm": 2.0372565874455986, "language_loss": 0.83064777, "learning_rate": 1.6114429977642674e-06, "loss": 0.85275614, "num_input_tokens_seen": 103261500, "step": 4785, "time_per_iteration": 2.621615409851074 }, { "auxiliary_loss_clip": 0.01172297, "auxiliary_loss_mlp": 0.01027575, "balance_loss_clip": 1.01318598, "balance_loss_mlp": 1.02009499, "epoch": 0.5754824745986894, "flos": 19789616741760.0, "grad_norm": 1.9945191568043545, "language_loss": 0.73723733, "learning_rate": 1.6106788978126926e-06, "loss": 0.75923604, "num_input_tokens_seen": 103280475, "step": 4786, "time_per_iteration": 2.626302719116211 }, { "auxiliary_loss_clip": 0.01163885, "auxiliary_loss_mlp": 0.01029905, "balance_loss_clip": 0.8940891, "balance_loss_mlp": 1.02231193, "epoch": 0.5756027174893285, "flos": 30985665160320.0, "grad_norm": 2.1103448309552437, "language_loss": 0.78351587, "learning_rate": 1.6099148569208196e-06, "loss": 0.80545378, "num_input_tokens_seen": 103297695, "step": 4787, "time_per_iteration": 2.8225388526916504 }, { "auxiliary_loss_clip": 0.01174694, "auxiliary_loss_mlp": 0.01034902, "balance_loss_clip": 0.97673941, "balance_loss_mlp": 1.02686715, "epoch": 0.5757229603799675, "flos": 28546864364160.0, "grad_norm": 1.7174558803868474, "language_loss": 0.63150299, "learning_rate": 1.6091508752045523e-06, "loss": 0.6535989, "num_input_tokens_seen": 103318575, "step": 4788, "time_per_iteration": 2.7110037803649902 }, { "auxiliary_loss_clip": 0.01158976, "auxiliary_loss_mlp": 0.01028634, "balance_loss_clip": 0.93254602, "balance_loss_mlp": 1.0210402, "epoch": 0.5758432032706067, "flos": 22999024944000.0, "grad_norm": 1.5535963437084315, "language_loss": 0.86360288, "learning_rate": 1.608386952779787e-06, "loss": 0.88547897, "num_input_tokens_seen": 103337945, "step": 4789, "time_per_iteration": 2.7513442039489746 }, { "auxiliary_loss_clip": 0.01177626, "auxiliary_loss_mlp": 0.01026665, "balance_loss_clip": 0.97576386, "balance_loss_mlp": 1.01908028, "epoch": 0.5759634461612457, "flos": 25739727552000.0, "grad_norm": 1.6109983527383334, "language_loss": 0.74700129, "learning_rate": 1.6076230897624098e-06, "loss": 0.76904416, "num_input_tokens_seen": 103360150, "step": 4790, "time_per_iteration": 2.6921310424804688 }, { "auxiliary_loss_clip": 0.01172498, "auxiliary_loss_mlp": 0.01025093, "balance_loss_clip": 1.00992095, "balance_loss_mlp": 1.01677775, "epoch": 0.5760836890518848, "flos": 30591761639040.0, "grad_norm": 1.8373559273573667, "language_loss": 0.77194774, "learning_rate": 1.6068592862682974e-06, "loss": 0.79392362, "num_input_tokens_seen": 103378305, "step": 4791, "time_per_iteration": 2.744241237640381 }, { "auxiliary_loss_clip": 0.01174679, "auxiliary_loss_mlp": 0.01024821, "balance_loss_clip": 0.97492939, "balance_loss_mlp": 1.01703691, "epoch": 0.576203931942524, "flos": 36538963447680.0, "grad_norm": 1.7925732366542797, "language_loss": 0.73588884, "learning_rate": 1.6060955424133187e-06, "loss": 0.75788379, "num_input_tokens_seen": 103399230, "step": 4792, "time_per_iteration": 2.8181698322296143 }, { "auxiliary_loss_clip": 0.01173451, "auxiliary_loss_mlp": 0.01028156, "balance_loss_clip": 1.01336002, "balance_loss_mlp": 1.02031767, "epoch": 0.576324174833163, "flos": 25516937445120.0, "grad_norm": 1.7179898968276788, "language_loss": 0.89358699, "learning_rate": 1.6053318583133332e-06, "loss": 0.91560304, "num_input_tokens_seen": 103420100, "step": 4793, "time_per_iteration": 2.6486005783081055 }, { "auxiliary_loss_clip": 0.01172076, "auxiliary_loss_mlp": 0.0102723, "balance_loss_clip": 1.01252174, "balance_loss_mlp": 1.01994371, "epoch": 0.5764444177238021, "flos": 25119262995840.0, "grad_norm": 1.8064176760110027, "language_loss": 0.75111997, "learning_rate": 1.6045682340841907e-06, "loss": 0.77311301, "num_input_tokens_seen": 103439025, "step": 4794, "time_per_iteration": 2.683978319168091 }, { "auxiliary_loss_clip": 0.01076321, "auxiliary_loss_mlp": 0.01116749, "balance_loss_clip": 0.90323544, "balance_loss_mlp": 0.0, "epoch": 0.5765646606144411, "flos": 62212687758720.0, "grad_norm": 0.7576528681290013, "language_loss": 0.58066475, "learning_rate": 1.6038046698417336e-06, "loss": 0.60259545, "num_input_tokens_seen": 103499920, "step": 4795, "time_per_iteration": 3.2089178562164307 }, { "auxiliary_loss_clip": 0.0117232, "auxiliary_loss_mlp": 0.01026363, "balance_loss_clip": 1.01184225, "balance_loss_mlp": 1.01834035, "epoch": 0.5766849035050803, "flos": 25118760205440.0, "grad_norm": 1.9065628710219116, "language_loss": 0.68909943, "learning_rate": 1.6030411657017919e-06, "loss": 0.71108627, "num_input_tokens_seen": 103519575, "step": 4796, "time_per_iteration": 2.6710429191589355 }, { "auxiliary_loss_clip": 0.01164377, "auxiliary_loss_mlp": 0.01030553, "balance_loss_clip": 1.01108313, "balance_loss_mlp": 1.02363276, "epoch": 0.5768051463957193, "flos": 15991093578240.0, "grad_norm": 1.6906500363476953, "language_loss": 0.84352171, "learning_rate": 1.6022777217801903e-06, "loss": 0.86547107, "num_input_tokens_seen": 103536530, "step": 4797, "time_per_iteration": 2.6360442638397217 }, { "auxiliary_loss_clip": 0.01174188, "auxiliary_loss_mlp": 0.01029175, "balance_loss_clip": 0.93869764, "balance_loss_mlp": 1.02100313, "epoch": 0.5769253892863584, "flos": 22163635359360.0, "grad_norm": 1.976383219402723, "language_loss": 0.73688126, "learning_rate": 1.601514338192742e-06, "loss": 0.75891489, "num_input_tokens_seen": 103556460, "step": 4798, "time_per_iteration": 2.6946823596954346 }, { "auxiliary_loss_clip": 0.01170596, "auxiliary_loss_mlp": 0.0102406, "balance_loss_clip": 1.04954839, "balance_loss_mlp": 1.01700246, "epoch": 0.5770456321769976, "flos": 22856388036480.0, "grad_norm": 2.010467377041633, "language_loss": 0.71476889, "learning_rate": 1.6007510150552514e-06, "loss": 0.73671538, "num_input_tokens_seen": 103574520, "step": 4799, "time_per_iteration": 3.569782018661499 }, { "auxiliary_loss_clip": 0.01176534, "auxiliary_loss_mlp": 0.01028439, "balance_loss_clip": 1.01108801, "balance_loss_mlp": 1.02069354, "epoch": 0.5771658750676366, "flos": 46353672489600.0, "grad_norm": 1.4992809212535814, "language_loss": 0.62003148, "learning_rate": 1.599987752483515e-06, "loss": 0.64208126, "num_input_tokens_seen": 103598965, "step": 4800, "time_per_iteration": 2.83721661567688 }, { "auxiliary_loss_clip": 0.0116303, "auxiliary_loss_mlp": 0.01031362, "balance_loss_clip": 0.93222332, "balance_loss_mlp": 1.02372646, "epoch": 0.5772861179582757, "flos": 22159972172160.0, "grad_norm": 1.8115348989296236, "language_loss": 0.68378878, "learning_rate": 1.5992245505933184e-06, "loss": 0.7057327, "num_input_tokens_seen": 103618665, "step": 4801, "time_per_iteration": 2.7070794105529785 }, { "auxiliary_loss_clip": 0.01177784, "auxiliary_loss_mlp": 0.01027271, "balance_loss_clip": 1.05318367, "balance_loss_mlp": 1.01960588, "epoch": 0.5774063608489148, "flos": 31248926916480.0, "grad_norm": 1.7395487130095713, "language_loss": 0.70865446, "learning_rate": 1.5984614095004388e-06, "loss": 0.73070502, "num_input_tokens_seen": 103639800, "step": 4802, "time_per_iteration": 2.687243938446045 }, { "auxiliary_loss_clip": 0.01165822, "auxiliary_loss_mlp": 0.01027321, "balance_loss_clip": 1.01031983, "balance_loss_mlp": 1.01974857, "epoch": 0.5775266037395539, "flos": 22527123039360.0, "grad_norm": 2.186316907706923, "language_loss": 0.80803216, "learning_rate": 1.5976983293206438e-06, "loss": 0.82996356, "num_input_tokens_seen": 103655605, "step": 4803, "time_per_iteration": 2.665675163269043 }, { "auxiliary_loss_clip": 0.0116341, "auxiliary_loss_mlp": 0.01023038, "balance_loss_clip": 0.96970308, "balance_loss_mlp": 1.01540279, "epoch": 0.577646846630193, "flos": 21068790860160.0, "grad_norm": 1.5481445788396035, "language_loss": 0.71270812, "learning_rate": 1.5969353101696928e-06, "loss": 0.73457265, "num_input_tokens_seen": 103674045, "step": 4804, "time_per_iteration": 3.5567944049835205 }, { "auxiliary_loss_clip": 0.01172673, "auxiliary_loss_mlp": 0.01023446, "balance_loss_clip": 1.01193631, "balance_loss_mlp": 1.01639771, "epoch": 0.5777670895208321, "flos": 29714284293120.0, "grad_norm": 1.6578809473770688, "language_loss": 0.79563934, "learning_rate": 1.5961723521633341e-06, "loss": 0.81760055, "num_input_tokens_seen": 103695285, "step": 4805, "time_per_iteration": 2.8790242671966553 }, { "auxiliary_loss_clip": 0.01164709, "auxiliary_loss_mlp": 0.01031099, "balance_loss_clip": 0.9703114, "balance_loss_mlp": 1.02322555, "epoch": 0.5778873324114712, "flos": 19500428344320.0, "grad_norm": 2.1163780067994002, "language_loss": 0.90676647, "learning_rate": 1.5954094554173097e-06, "loss": 0.92872459, "num_input_tokens_seen": 103713275, "step": 4806, "time_per_iteration": 2.677145004272461 }, { "auxiliary_loss_clip": 0.01173907, "auxiliary_loss_mlp": 0.01032476, "balance_loss_clip": 0.97383773, "balance_loss_mlp": 1.02516222, "epoch": 0.5780075753021102, "flos": 14136846716160.0, "grad_norm": 1.9935844601329589, "language_loss": 0.79668415, "learning_rate": 1.5946466200473482e-06, "loss": 0.81874794, "num_input_tokens_seen": 103731185, "step": 4807, "time_per_iteration": 3.611171007156372 }, { "auxiliary_loss_clip": 0.01176322, "auxiliary_loss_mlp": 0.01027432, "balance_loss_clip": 0.97370976, "balance_loss_mlp": 1.02073205, "epoch": 0.5781278181927494, "flos": 15262178883840.0, "grad_norm": 1.816667868388869, "language_loss": 0.83301526, "learning_rate": 1.5938838461691723e-06, "loss": 0.85505283, "num_input_tokens_seen": 103748095, "step": 4808, "time_per_iteration": 3.5683324337005615 }, { "auxiliary_loss_clip": 0.01177191, "auxiliary_loss_mlp": 0.01031077, "balance_loss_clip": 1.05296552, "balance_loss_mlp": 1.0235815, "epoch": 0.5782480610833884, "flos": 16726831856640.0, "grad_norm": 2.533825461020709, "language_loss": 0.83138001, "learning_rate": 1.593121133898494e-06, "loss": 0.8534627, "num_input_tokens_seen": 103765300, "step": 4809, "time_per_iteration": 2.570295572280884 }, { "auxiliary_loss_clip": 0.01176983, "auxiliary_loss_mlp": 0.01034964, "balance_loss_clip": 1.01167476, "balance_loss_mlp": 1.02736461, "epoch": 0.5783683039740275, "flos": 25482140144640.0, "grad_norm": 2.7992782217281755, "language_loss": 0.79472244, "learning_rate": 1.592358483351016e-06, "loss": 0.81684196, "num_input_tokens_seen": 103785475, "step": 4810, "time_per_iteration": 2.660071611404419 }, { "auxiliary_loss_clip": 0.01169056, "auxiliary_loss_mlp": 0.01022816, "balance_loss_clip": 1.01085091, "balance_loss_mlp": 1.01571429, "epoch": 0.5784885468646667, "flos": 18405835240320.0, "grad_norm": 1.931845398880066, "language_loss": 0.72281003, "learning_rate": 1.5915958946424326e-06, "loss": 0.7447288, "num_input_tokens_seen": 103804160, "step": 4811, "time_per_iteration": 2.5925042629241943 }, { "auxiliary_loss_clip": 0.01174947, "auxiliary_loss_mlp": 0.01123788, "balance_loss_clip": 0.93617356, "balance_loss_mlp": 0.0, "epoch": 0.5786087897553057, "flos": 46100717936640.0, "grad_norm": 1.5071514471523701, "language_loss": 0.74481797, "learning_rate": 1.5908333678884271e-06, "loss": 0.76780534, "num_input_tokens_seen": 103830580, "step": 4812, "time_per_iteration": 2.90096378326416 }, { "auxiliary_loss_clip": 0.01172415, "auxiliary_loss_mlp": 0.01027671, "balance_loss_clip": 1.01283288, "balance_loss_mlp": 1.0201602, "epoch": 0.5787290326459448, "flos": 12385950261120.0, "grad_norm": 1.7883438721789156, "language_loss": 0.73852217, "learning_rate": 1.5900709032046743e-06, "loss": 0.76052302, "num_input_tokens_seen": 103848655, "step": 4813, "time_per_iteration": 2.6896591186523438 }, { "auxiliary_loss_clip": 0.01169906, "auxiliary_loss_mlp": 0.01025009, "balance_loss_clip": 0.97692597, "balance_loss_mlp": 1.01794839, "epoch": 0.5788492755365839, "flos": 23290332243840.0, "grad_norm": 2.3550460786378427, "language_loss": 0.78467172, "learning_rate": 1.5893085007068391e-06, "loss": 0.8066209, "num_input_tokens_seen": 103866215, "step": 4814, "time_per_iteration": 2.676488161087036 }, { "auxiliary_loss_clip": 0.01156937, "auxiliary_loss_mlp": 0.01027025, "balance_loss_clip": 0.97067153, "balance_loss_mlp": 1.0195744, "epoch": 0.578969518427223, "flos": 24061047390720.0, "grad_norm": 2.1907880252530094, "language_loss": 0.71455336, "learning_rate": 1.5885461605105786e-06, "loss": 0.73639292, "num_input_tokens_seen": 103887815, "step": 4815, "time_per_iteration": 2.721961498260498 }, { "auxiliary_loss_clip": 0.01177729, "auxiliary_loss_mlp": 0.01025114, "balance_loss_clip": 0.97576761, "balance_loss_mlp": 1.01780057, "epoch": 0.579089761317862, "flos": 21871825269120.0, "grad_norm": 1.8835736790974649, "language_loss": 0.76604694, "learning_rate": 1.5877838827315375e-06, "loss": 0.78807533, "num_input_tokens_seen": 103906360, "step": 4816, "time_per_iteration": 2.6390323638916016 }, { "auxiliary_loss_clip": 0.0117507, "auxiliary_loss_mlp": 0.01024168, "balance_loss_clip": 1.05262744, "balance_loss_mlp": 1.0163126, "epoch": 0.5792100042085012, "flos": 22929681738240.0, "grad_norm": 1.7579351546201778, "language_loss": 0.70133483, "learning_rate": 1.587021667485355e-06, "loss": 0.72332716, "num_input_tokens_seen": 103925730, "step": 4817, "time_per_iteration": 2.686959743499756 }, { "auxiliary_loss_clip": 0.01175183, "auxiliary_loss_mlp": 0.01023898, "balance_loss_clip": 0.9729054, "balance_loss_mlp": 1.01696265, "epoch": 0.5793302470991403, "flos": 21470056669440.0, "grad_norm": 1.6876599165559552, "language_loss": 0.78231895, "learning_rate": 1.5862595148876559e-06, "loss": 0.80430979, "num_input_tokens_seen": 103945835, "step": 4818, "time_per_iteration": 2.7022931575775146 }, { "auxiliary_loss_clip": 0.01172827, "auxiliary_loss_mlp": 0.01027985, "balance_loss_clip": 0.89813596, "balance_loss_mlp": 1.02026057, "epoch": 0.5794504899897793, "flos": 12711013367040.0, "grad_norm": 2.1326507520562847, "language_loss": 0.76118702, "learning_rate": 1.58549742505406e-06, "loss": 0.78319514, "num_input_tokens_seen": 103960580, "step": 4819, "time_per_iteration": 2.680922508239746 }, { "auxiliary_loss_clip": 0.01174717, "auxiliary_loss_mlp": 0.01025343, "balance_loss_clip": 1.05049455, "balance_loss_mlp": 1.01776135, "epoch": 0.5795707328804185, "flos": 14867054300160.0, "grad_norm": 1.984952051394037, "language_loss": 0.75937259, "learning_rate": 1.5847353981001747e-06, "loss": 0.7813732, "num_input_tokens_seen": 103977760, "step": 4820, "time_per_iteration": 2.674656391143799 }, { "auxiliary_loss_clip": 0.01165671, "auxiliary_loss_mlp": 0.0102876, "balance_loss_clip": 0.97284174, "balance_loss_mlp": 1.02123153, "epoch": 0.5796909757710575, "flos": 36430046432640.0, "grad_norm": 1.8394222113279566, "language_loss": 0.69731832, "learning_rate": 1.5839734341415993e-06, "loss": 0.71926266, "num_input_tokens_seen": 103999960, "step": 4821, "time_per_iteration": 2.7485368251800537 }, { "auxiliary_loss_clip": 0.01171258, "auxiliary_loss_mlp": 0.01025166, "balance_loss_clip": 1.01611555, "balance_loss_mlp": 1.01824021, "epoch": 0.5798112186616966, "flos": 23039891642880.0, "grad_norm": 1.6390237952245776, "language_loss": 0.76762414, "learning_rate": 1.5832115332939238e-06, "loss": 0.78958833, "num_input_tokens_seen": 104018400, "step": 4822, "time_per_iteration": 2.6090774536132812 }, { "auxiliary_loss_clip": 0.01178017, "auxiliary_loss_mlp": 0.01028395, "balance_loss_clip": 1.01514995, "balance_loss_mlp": 1.02087915, "epoch": 0.5799314615523358, "flos": 16652604401280.0, "grad_norm": 1.6070859209395096, "language_loss": 0.74687082, "learning_rate": 1.5824496956727272e-06, "loss": 0.76893497, "num_input_tokens_seen": 104035605, "step": 4823, "time_per_iteration": 2.6408848762512207 }, { "auxiliary_loss_clip": 0.01171574, "auxiliary_loss_mlp": 0.01029985, "balance_loss_clip": 0.97391367, "balance_loss_mlp": 1.0229038, "epoch": 0.5800517044429748, "flos": 20485673470080.0, "grad_norm": 1.5621328708745625, "language_loss": 0.72900462, "learning_rate": 1.5816879213935797e-06, "loss": 0.75102019, "num_input_tokens_seen": 104054415, "step": 4824, "time_per_iteration": 2.684293270111084 }, { "auxiliary_loss_clip": 0.0116862, "auxiliary_loss_mlp": 0.01026432, "balance_loss_clip": 1.01206017, "balance_loss_mlp": 1.01955366, "epoch": 0.5801719473336139, "flos": 31538258968320.0, "grad_norm": 1.5483983625063358, "language_loss": 0.79835415, "learning_rate": 1.5809262105720416e-06, "loss": 0.82030475, "num_input_tokens_seen": 104075455, "step": 4825, "time_per_iteration": 3.6522231101989746 }, { "auxiliary_loss_clip": 0.011737, "auxiliary_loss_mlp": 0.01025261, "balance_loss_clip": 1.05132079, "balance_loss_mlp": 1.01821864, "epoch": 0.580292190224253, "flos": 20375966355840.0, "grad_norm": 1.502864553199058, "language_loss": 0.7937004, "learning_rate": 1.5801645633236644e-06, "loss": 0.81569004, "num_input_tokens_seen": 104096440, "step": 4826, "time_per_iteration": 2.659268379211426 }, { "auxiliary_loss_clip": 0.01162239, "auxiliary_loss_mlp": 0.01029843, "balance_loss_clip": 0.97124076, "balance_loss_mlp": 1.02271104, "epoch": 0.5804124331148921, "flos": 26615373304320.0, "grad_norm": 1.7790990137607678, "language_loss": 0.77096182, "learning_rate": 1.579402979763989e-06, "loss": 0.79288268, "num_input_tokens_seen": 104116775, "step": 4827, "time_per_iteration": 2.6967458724975586 }, { "auxiliary_loss_clip": 0.01182507, "auxiliary_loss_mlp": 0.01025116, "balance_loss_clip": 0.86065423, "balance_loss_mlp": 1.01733756, "epoch": 0.5805326760055312, "flos": 13478496289920.0, "grad_norm": 2.6292619736278007, "language_loss": 0.81140441, "learning_rate": 1.578641460008548e-06, "loss": 0.8334806, "num_input_tokens_seen": 104134510, "step": 4828, "time_per_iteration": 2.759833574295044 }, { "auxiliary_loss_clip": 0.01170249, "auxiliary_loss_mlp": 0.0102971, "balance_loss_clip": 1.01189804, "balance_loss_mlp": 1.02271819, "epoch": 0.5806529188961702, "flos": 12091374823680.0, "grad_norm": 3.508136286963905, "language_loss": 0.67860591, "learning_rate": 1.5778800041728613e-06, "loss": 0.70060551, "num_input_tokens_seen": 104150800, "step": 4829, "time_per_iteration": 2.5742692947387695 }, { "auxiliary_loss_clip": 0.01168611, "auxiliary_loss_mlp": 0.01023049, "balance_loss_clip": 1.01374149, "balance_loss_mlp": 1.01618266, "epoch": 0.5807731617868094, "flos": 26214107495040.0, "grad_norm": 1.3981228932831606, "language_loss": 0.66234452, "learning_rate": 1.577118612372443e-06, "loss": 0.68426114, "num_input_tokens_seen": 104172640, "step": 4830, "time_per_iteration": 3.6154134273529053 }, { "auxiliary_loss_clip": 0.01166954, "auxiliary_loss_mlp": 0.01123084, "balance_loss_clip": 0.97250551, "balance_loss_mlp": 0.0, "epoch": 0.5808934046774484, "flos": 37962139190400.0, "grad_norm": 1.7750805697661352, "language_loss": 0.70218241, "learning_rate": 1.5763572847227943e-06, "loss": 0.72508276, "num_input_tokens_seen": 104193525, "step": 4831, "time_per_iteration": 2.8574776649475098 }, { "auxiliary_loss_clip": 0.01172028, "auxiliary_loss_mlp": 0.01023611, "balance_loss_clip": 1.01161623, "balance_loss_mlp": 1.01655376, "epoch": 0.5810136475680875, "flos": 20485853038080.0, "grad_norm": 1.7782667467791837, "language_loss": 0.81191349, "learning_rate": 1.5755960213394091e-06, "loss": 0.83386993, "num_input_tokens_seen": 104210625, "step": 4832, "time_per_iteration": 2.605346918106079 }, { "auxiliary_loss_clip": 0.01172854, "auxiliary_loss_mlp": 0.01025789, "balance_loss_clip": 0.93520916, "balance_loss_mlp": 1.01865125, "epoch": 0.5811338904587267, "flos": 17530153574400.0, "grad_norm": 1.8121144735561145, "language_loss": 0.78526461, "learning_rate": 1.5748348223377703e-06, "loss": 0.80725104, "num_input_tokens_seen": 104228180, "step": 4833, "time_per_iteration": 3.5895674228668213 }, { "auxiliary_loss_clip": 0.01171404, "auxiliary_loss_mlp": 0.01024949, "balance_loss_clip": 0.97552991, "balance_loss_mlp": 1.0178113, "epoch": 0.5812541333493657, "flos": 19458017360640.0, "grad_norm": 1.440086264705817, "language_loss": 0.77943492, "learning_rate": 1.5740736878333507e-06, "loss": 0.8013984, "num_input_tokens_seen": 104246020, "step": 4834, "time_per_iteration": 3.506565570831299 }, { "auxiliary_loss_clip": 0.0117405, "auxiliary_loss_mlp": 0.01026709, "balance_loss_clip": 0.97371483, "balance_loss_mlp": 1.01919913, "epoch": 0.5813743762400048, "flos": 20594949621120.0, "grad_norm": 2.1235468927704595, "language_loss": 0.7769109, "learning_rate": 1.5733126179416143e-06, "loss": 0.7989186, "num_input_tokens_seen": 104260505, "step": 4835, "time_per_iteration": 2.6546237468719482 }, { "auxiliary_loss_clip": 0.01170921, "auxiliary_loss_mlp": 0.01023928, "balance_loss_clip": 1.01072598, "balance_loss_mlp": 1.01676941, "epoch": 0.5814946191306439, "flos": 33178227246720.0, "grad_norm": 2.019343525534334, "language_loss": 0.72531438, "learning_rate": 1.5725516127780137e-06, "loss": 0.7472629, "num_input_tokens_seen": 104282640, "step": 4836, "time_per_iteration": 2.745758295059204 }, { "auxiliary_loss_clip": 0.01176998, "auxiliary_loss_mlp": 0.0102831, "balance_loss_clip": 1.01063073, "balance_loss_mlp": 1.02034116, "epoch": 0.581614862021283, "flos": 16143283503360.0, "grad_norm": 2.3958879398506654, "language_loss": 0.88317877, "learning_rate": 1.5717906724579943e-06, "loss": 0.90523189, "num_input_tokens_seen": 104299700, "step": 4837, "time_per_iteration": 2.5952954292297363 }, { "auxiliary_loss_clip": 0.0117649, "auxiliary_loss_mlp": 0.01027478, "balance_loss_clip": 0.9355222, "balance_loss_mlp": 1.02038789, "epoch": 0.581735104911922, "flos": 33802642298880.0, "grad_norm": 2.0023443060021044, "language_loss": 0.67871058, "learning_rate": 1.571029797096989e-06, "loss": 0.70075023, "num_input_tokens_seen": 104320805, "step": 4838, "time_per_iteration": 2.8327181339263916 }, { "auxiliary_loss_clip": 0.01172424, "auxiliary_loss_mlp": 0.01024058, "balance_loss_clip": 1.05086136, "balance_loss_mlp": 1.01724267, "epoch": 0.5818553478025612, "flos": 23331163029120.0, "grad_norm": 1.8986486034791459, "language_loss": 0.78988826, "learning_rate": 1.570268986810423e-06, "loss": 0.81185305, "num_input_tokens_seen": 104340700, "step": 4839, "time_per_iteration": 2.605787992477417 }, { "auxiliary_loss_clip": 0.01168623, "auxiliary_loss_mlp": 0.01020577, "balance_loss_clip": 0.97334981, "balance_loss_mlp": 1.01350832, "epoch": 0.5819755906932003, "flos": 20996143603200.0, "grad_norm": 1.82954556381647, "language_loss": 0.74795783, "learning_rate": 1.5695082417137096e-06, "loss": 0.76984984, "num_input_tokens_seen": 104358575, "step": 4840, "time_per_iteration": 2.6954550743103027 }, { "auxiliary_loss_clip": 0.01168523, "auxiliary_loss_mlp": 0.01026566, "balance_loss_clip": 0.97158241, "balance_loss_mlp": 1.01955938, "epoch": 0.5820958335838393, "flos": 21431668008960.0, "grad_norm": 1.5716964681699417, "language_loss": 0.74921167, "learning_rate": 1.5687475619222539e-06, "loss": 0.77116257, "num_input_tokens_seen": 104378530, "step": 4841, "time_per_iteration": 2.703399181365967 }, { "auxiliary_loss_clip": 0.01160901, "auxiliary_loss_mlp": 0.01024389, "balance_loss_clip": 0.9690907, "balance_loss_mlp": 1.01614523, "epoch": 0.5822160764744785, "flos": 17967473660160.0, "grad_norm": 2.12672753220257, "language_loss": 0.73301136, "learning_rate": 1.5679869475514496e-06, "loss": 0.75486422, "num_input_tokens_seen": 104395465, "step": 4842, "time_per_iteration": 2.631713390350342 }, { "auxiliary_loss_clip": 0.0117068, "auxiliary_loss_mlp": 0.010233, "balance_loss_clip": 1.01124668, "balance_loss_mlp": 1.01614439, "epoch": 0.5823363193651175, "flos": 23033858158080.0, "grad_norm": 1.9340073981643309, "language_loss": 0.81291336, "learning_rate": 1.567226398716682e-06, "loss": 0.83485317, "num_input_tokens_seen": 104415380, "step": 4843, "time_per_iteration": 2.659597396850586 }, { "auxiliary_loss_clip": 0.01179235, "auxiliary_loss_mlp": 0.01023989, "balance_loss_clip": 0.9745875, "balance_loss_mlp": 1.01628208, "epoch": 0.5824565622557566, "flos": 32891840110080.0, "grad_norm": 1.8350967888746383, "language_loss": 0.6177572, "learning_rate": 1.566465915533326e-06, "loss": 0.63978946, "num_input_tokens_seen": 104437410, "step": 4844, "time_per_iteration": 2.7843053340911865 }, { "auxiliary_loss_clip": 0.01169747, "auxiliary_loss_mlp": 0.01027901, "balance_loss_clip": 1.01148713, "balance_loss_mlp": 1.02055752, "epoch": 0.5825768051463958, "flos": 22229674513920.0, "grad_norm": 2.317093835013644, "language_loss": 0.88304651, "learning_rate": 1.5657054981167458e-06, "loss": 0.90502298, "num_input_tokens_seen": 104456305, "step": 4845, "time_per_iteration": 2.664926528930664 }, { "auxiliary_loss_clip": 0.01167988, "auxiliary_loss_mlp": 0.01023677, "balance_loss_clip": 1.01151061, "balance_loss_mlp": 1.0167625, "epoch": 0.5826970480370348, "flos": 28001561016960.0, "grad_norm": 1.816573892279501, "language_loss": 0.67825067, "learning_rate": 1.5649451465822965e-06, "loss": 0.70016724, "num_input_tokens_seen": 104477695, "step": 4846, "time_per_iteration": 2.717822790145874 }, { "auxiliary_loss_clip": 0.01168934, "auxiliary_loss_mlp": 0.01025294, "balance_loss_clip": 0.8997286, "balance_loss_mlp": 1.01874948, "epoch": 0.5828172909276739, "flos": 17858053854720.0, "grad_norm": 1.569524277471846, "language_loss": 0.8359881, "learning_rate": 1.5641848610453218e-06, "loss": 0.85793036, "num_input_tokens_seen": 104496355, "step": 4847, "time_per_iteration": 2.7401955127716064 }, { "auxiliary_loss_clip": 0.01169825, "auxiliary_loss_mlp": 0.01023091, "balance_loss_clip": 1.01337373, "balance_loss_mlp": 1.01598012, "epoch": 0.582937533818313, "flos": 19865244827520.0, "grad_norm": 1.9229978814030595, "language_loss": 0.8608613, "learning_rate": 1.563424641621158e-06, "loss": 0.88279045, "num_input_tokens_seen": 104515535, "step": 4848, "time_per_iteration": 2.6726441383361816 }, { "auxiliary_loss_clip": 0.01174099, "auxiliary_loss_mlp": 0.01028269, "balance_loss_clip": 0.97337019, "balance_loss_mlp": 1.02105391, "epoch": 0.5830577767089521, "flos": 26870734068480.0, "grad_norm": 1.7241042568338305, "language_loss": 0.69643652, "learning_rate": 1.5626644884251282e-06, "loss": 0.7184602, "num_input_tokens_seen": 104535055, "step": 4849, "time_per_iteration": 2.7362890243530273 }, { "auxiliary_loss_clip": 0.01171164, "auxiliary_loss_mlp": 0.01026009, "balance_loss_clip": 1.04985285, "balance_loss_mlp": 1.01902294, "epoch": 0.5831780195995911, "flos": 25298205575040.0, "grad_norm": 1.6049557520150999, "language_loss": 0.88109803, "learning_rate": 1.5619044015725488e-06, "loss": 0.90306973, "num_input_tokens_seen": 104554745, "step": 4850, "time_per_iteration": 2.5907607078552246 }, { "auxiliary_loss_clip": 0.01181301, "auxiliary_loss_mlp": 0.01031634, "balance_loss_clip": 1.0542407, "balance_loss_mlp": 1.02371836, "epoch": 0.5832982624902303, "flos": 14756988049920.0, "grad_norm": 2.220173552222601, "language_loss": 0.86673439, "learning_rate": 1.5611443811787224e-06, "loss": 0.8888638, "num_input_tokens_seen": 104568870, "step": 4851, "time_per_iteration": 3.525029182434082 }, { "auxiliary_loss_clip": 0.01175478, "auxiliary_loss_mlp": 0.01024146, "balance_loss_clip": 1.01642394, "balance_loss_mlp": 1.01756597, "epoch": 0.5834185053808694, "flos": 20444555376000.0, "grad_norm": 2.086129781153764, "language_loss": 0.69578373, "learning_rate": 1.560384427358945e-06, "loss": 0.71777999, "num_input_tokens_seen": 104588415, "step": 4852, "time_per_iteration": 2.661118507385254 }, { "auxiliary_loss_clip": 0.01161551, "auxiliary_loss_mlp": 0.01028804, "balance_loss_clip": 0.96917093, "balance_loss_mlp": 1.02126122, "epoch": 0.5835387482715084, "flos": 27200394115200.0, "grad_norm": 1.409446318427351, "language_loss": 0.73020393, "learning_rate": 1.5596245402284998e-06, "loss": 0.7521075, "num_input_tokens_seen": 104611940, "step": 4853, "time_per_iteration": 2.7532482147216797 }, { "auxiliary_loss_clip": 0.0117669, "auxiliary_loss_mlp": 0.01028021, "balance_loss_clip": 1.01562858, "balance_loss_mlp": 1.02037358, "epoch": 0.5836589911621476, "flos": 16654615562880.0, "grad_norm": 1.7092310513900153, "language_loss": 0.81843221, "learning_rate": 1.5588647199026619e-06, "loss": 0.84047937, "num_input_tokens_seen": 104629675, "step": 4854, "time_per_iteration": 2.6649169921875 }, { "auxiliary_loss_clip": 0.01179117, "auxiliary_loss_mlp": 0.01031411, "balance_loss_clip": 1.05392373, "balance_loss_mlp": 1.02372241, "epoch": 0.5837792340527866, "flos": 20446817932800.0, "grad_norm": 2.292507607394243, "language_loss": 0.87802529, "learning_rate": 1.5581049664966956e-06, "loss": 0.90013057, "num_input_tokens_seen": 104647435, "step": 4855, "time_per_iteration": 2.562539577484131 }, { "auxiliary_loss_clip": 0.01085388, "auxiliary_loss_mlp": 0.01009162, "balance_loss_clip": 0.83109885, "balance_loss_mlp": 1.00736189, "epoch": 0.5838994769434257, "flos": 65995480765440.0, "grad_norm": 0.9933767824581061, "language_loss": 0.65178984, "learning_rate": 1.5573452801258545e-06, "loss": 0.67273533, "num_input_tokens_seen": 104694605, "step": 4856, "time_per_iteration": 4.034788131713867 }, { "auxiliary_loss_clip": 0.01181019, "auxiliary_loss_mlp": 0.01032333, "balance_loss_clip": 1.01459599, "balance_loss_mlp": 1.02460766, "epoch": 0.5840197198340649, "flos": 21470523546240.0, "grad_norm": 2.083545256088693, "language_loss": 0.63721377, "learning_rate": 1.5565856609053824e-06, "loss": 0.6593473, "num_input_tokens_seen": 104713400, "step": 4857, "time_per_iteration": 2.601360321044922 }, { "auxiliary_loss_clip": 0.01176046, "auxiliary_loss_mlp": 0.01025415, "balance_loss_clip": 1.05230951, "balance_loss_mlp": 1.01720142, "epoch": 0.5841399627247039, "flos": 19135144984320.0, "grad_norm": 1.7401298886476373, "language_loss": 0.80298036, "learning_rate": 1.5558261089505127e-06, "loss": 0.82499492, "num_input_tokens_seen": 104732130, "step": 4858, "time_per_iteration": 2.634106159210205 }, { "auxiliary_loss_clip": 0.01172049, "auxiliary_loss_mlp": 0.01026192, "balance_loss_clip": 1.013129, "balance_loss_mlp": 1.01849055, "epoch": 0.584260205615343, "flos": 26425692558720.0, "grad_norm": 1.8286966539015057, "language_loss": 0.79775894, "learning_rate": 1.5550666243764697e-06, "loss": 0.81974137, "num_input_tokens_seen": 104750290, "step": 4859, "time_per_iteration": 3.5606882572174072 }, { "auxiliary_loss_clip": 0.01172613, "auxiliary_loss_mlp": 0.01025666, "balance_loss_clip": 1.01374269, "balance_loss_mlp": 1.01845145, "epoch": 0.584380448505982, "flos": 13881809174400.0, "grad_norm": 2.047197241299626, "language_loss": 0.77404892, "learning_rate": 1.554307207298465e-06, "loss": 0.79603171, "num_input_tokens_seen": 104768550, "step": 4860, "time_per_iteration": 3.5945703983306885 }, { "auxiliary_loss_clip": 0.01177887, "auxiliary_loss_mlp": 0.01029519, "balance_loss_clip": 1.05187535, "balance_loss_mlp": 1.0217824, "epoch": 0.5845006913966212, "flos": 21543709507200.0, "grad_norm": 1.8478677857255037, "language_loss": 0.78800416, "learning_rate": 1.553547857831704e-06, "loss": 0.81007826, "num_input_tokens_seen": 104785060, "step": 4861, "time_per_iteration": 2.5828683376312256 }, { "auxiliary_loss_clip": 0.01073512, "auxiliary_loss_mlp": 0.010091, "balance_loss_clip": 1.01784801, "balance_loss_mlp": 1.00747836, "epoch": 0.5846209342872603, "flos": 58375452712320.0, "grad_norm": 0.8902650920891464, "language_loss": 0.64244139, "learning_rate": 1.5527885760913771e-06, "loss": 0.66326749, "num_input_tokens_seen": 104834950, "step": 4862, "time_per_iteration": 2.983236074447632 }, { "auxiliary_loss_clip": 0.01171073, "auxiliary_loss_mlp": 0.01029029, "balance_loss_clip": 0.97626692, "balance_loss_mlp": 1.02196634, "epoch": 0.5847411771778993, "flos": 18588045957120.0, "grad_norm": 1.5757426500061504, "language_loss": 0.76572561, "learning_rate": 1.552029362192668e-06, "loss": 0.78772664, "num_input_tokens_seen": 104854210, "step": 4863, "time_per_iteration": 2.718829393386841 }, { "auxiliary_loss_clip": 0.0116584, "auxiliary_loss_mlp": 0.01024624, "balance_loss_clip": 0.93571508, "balance_loss_mlp": 1.01719689, "epoch": 0.5848614200685385, "flos": 24240780069120.0, "grad_norm": 1.7299396597179098, "language_loss": 0.72172272, "learning_rate": 1.5512702162507478e-06, "loss": 0.74362731, "num_input_tokens_seen": 104874525, "step": 4864, "time_per_iteration": 2.798288583755493 }, { "auxiliary_loss_clip": 0.01075185, "auxiliary_loss_mlp": 0.01005236, "balance_loss_clip": 0.9421227, "balance_loss_mlp": 1.00360286, "epoch": 0.5849816629591775, "flos": 71660245933440.0, "grad_norm": 1.1225930247946625, "language_loss": 0.55732775, "learning_rate": 1.5505111383807792e-06, "loss": 0.57813191, "num_input_tokens_seen": 104937195, "step": 4865, "time_per_iteration": 3.307067394256592 }, { "auxiliary_loss_clip": 0.01169718, "auxiliary_loss_mlp": 0.01025663, "balance_loss_clip": 0.89579856, "balance_loss_mlp": 1.0181706, "epoch": 0.5851019058498166, "flos": 23802095266560.0, "grad_norm": 1.6758946545306375, "language_loss": 0.80338699, "learning_rate": 1.5497521286979138e-06, "loss": 0.82534087, "num_input_tokens_seen": 104957435, "step": 4866, "time_per_iteration": 2.8051931858062744 }, { "auxiliary_loss_clip": 0.01176527, "auxiliary_loss_mlp": 0.01027696, "balance_loss_clip": 0.93579972, "balance_loss_mlp": 1.01970279, "epoch": 0.5852221487404557, "flos": 24388516707840.0, "grad_norm": 1.8379666337410383, "language_loss": 0.74288732, "learning_rate": 1.5489931873172927e-06, "loss": 0.76492959, "num_input_tokens_seen": 104978755, "step": 4867, "time_per_iteration": 2.744908571243286 }, { "auxiliary_loss_clip": 0.01146706, "auxiliary_loss_mlp": 0.01026746, "balance_loss_clip": 0.85303879, "balance_loss_mlp": 1.01939106, "epoch": 0.5853423916310948, "flos": 27271425260160.0, "grad_norm": 1.5880807054102906, "language_loss": 0.79091311, "learning_rate": 1.5482343143540467e-06, "loss": 0.81264764, "num_input_tokens_seen": 105000020, "step": 4868, "time_per_iteration": 2.8599061965942383 }, { "auxiliary_loss_clip": 0.01173506, "auxiliary_loss_mlp": 0.01122255, "balance_loss_clip": 0.93539482, "balance_loss_mlp": 0.0, "epoch": 0.5854626345217339, "flos": 11983786611840.0, "grad_norm": 1.876321061920366, "language_loss": 0.82823694, "learning_rate": 1.547475509923295e-06, "loss": 0.85119456, "num_input_tokens_seen": 105017060, "step": 4869, "time_per_iteration": 2.7250192165374756 }, { "auxiliary_loss_clip": 0.01084604, "auxiliary_loss_mlp": 0.01003722, "balance_loss_clip": 0.86769861, "balance_loss_mlp": 1.00189829, "epoch": 0.585582877412373, "flos": 64342335173760.0, "grad_norm": 0.8029360249755917, "language_loss": 0.56085348, "learning_rate": 1.5467167741401495e-06, "loss": 0.58173674, "num_input_tokens_seen": 105078540, "step": 4870, "time_per_iteration": 3.2937159538269043 }, { "auxiliary_loss_clip": 0.01167306, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 0.97233474, "balance_loss_mlp": 1.024194, "epoch": 0.5857031203030121, "flos": 17011926103680.0, "grad_norm": 2.03618860181037, "language_loss": 0.71395314, "learning_rate": 1.5459581071197083e-06, "loss": 0.73594582, "num_input_tokens_seen": 105094200, "step": 4871, "time_per_iteration": 2.7232258319854736 }, { "auxiliary_loss_clip": 0.0117669, "auxiliary_loss_mlp": 0.01027156, "balance_loss_clip": 1.01466954, "balance_loss_mlp": 1.01966417, "epoch": 0.5858233631936511, "flos": 20885682303360.0, "grad_norm": 2.0639060229230366, "language_loss": 0.83494574, "learning_rate": 1.5451995089770624e-06, "loss": 0.85698414, "num_input_tokens_seen": 105113985, "step": 4872, "time_per_iteration": 2.70143985748291 }, { "auxiliary_loss_clip": 0.01172972, "auxiliary_loss_mlp": 0.01022628, "balance_loss_clip": 1.05247962, "balance_loss_mlp": 1.01521945, "epoch": 0.5859436060842903, "flos": 23191902000000.0, "grad_norm": 2.2516368609466055, "language_loss": 0.71783054, "learning_rate": 1.5444409798272885e-06, "loss": 0.73978651, "num_input_tokens_seen": 105138075, "step": 4873, "time_per_iteration": 2.710498809814453 }, { "auxiliary_loss_clip": 0.01170084, "auxiliary_loss_mlp": 0.0102624, "balance_loss_clip": 0.93475616, "balance_loss_mlp": 1.01839614, "epoch": 0.5860638489749294, "flos": 22492648961280.0, "grad_norm": 1.809512110485991, "language_loss": 0.80969959, "learning_rate": 1.543682519785456e-06, "loss": 0.83166283, "num_input_tokens_seen": 105156555, "step": 4874, "time_per_iteration": 2.7812342643737793 }, { "auxiliary_loss_clip": 0.01169907, "auxiliary_loss_mlp": 0.01031883, "balance_loss_clip": 0.97370172, "balance_loss_mlp": 1.02418804, "epoch": 0.5861840918655684, "flos": 17566243764480.0, "grad_norm": 2.6205969511973035, "language_loss": 0.806171, "learning_rate": 1.5429241289666219e-06, "loss": 0.8281889, "num_input_tokens_seen": 105174055, "step": 4875, "time_per_iteration": 2.702301502227783 }, { "auxiliary_loss_clip": 0.0116267, "auxiliary_loss_mlp": 0.01023098, "balance_loss_clip": 0.97231132, "balance_loss_mlp": 1.01644063, "epoch": 0.5863043347562076, "flos": 25556152118400.0, "grad_norm": 1.9998250328421714, "language_loss": 0.69867969, "learning_rate": 1.5421658074858342e-06, "loss": 0.72053742, "num_input_tokens_seen": 105192160, "step": 4876, "time_per_iteration": 2.6804587841033936 }, { "auxiliary_loss_clip": 0.01166362, "auxiliary_loss_mlp": 0.01025192, "balance_loss_clip": 0.97409844, "balance_loss_mlp": 1.01756835, "epoch": 0.5864245776468466, "flos": 20667525050880.0, "grad_norm": 2.474628781312841, "language_loss": 0.66394126, "learning_rate": 1.5414075554581298e-06, "loss": 0.68585682, "num_input_tokens_seen": 105210205, "step": 4877, "time_per_iteration": 3.624894380569458 }, { "auxiliary_loss_clip": 0.01176449, "auxiliary_loss_mlp": 0.01028418, "balance_loss_clip": 1.0516417, "balance_loss_mlp": 1.0207175, "epoch": 0.5865448205374857, "flos": 28913907490560.0, "grad_norm": 2.2535648546447917, "language_loss": 0.78582656, "learning_rate": 1.5406493729985348e-06, "loss": 0.80787528, "num_input_tokens_seen": 105229400, "step": 4878, "time_per_iteration": 2.6729607582092285 }, { "auxiliary_loss_clip": 0.01175511, "auxiliary_loss_mlp": 0.01122966, "balance_loss_clip": 0.89909482, "balance_loss_mlp": 0.0, "epoch": 0.5866650634281249, "flos": 25842575168640.0, "grad_norm": 1.9081515354110243, "language_loss": 0.72188669, "learning_rate": 1.5398912602220644e-06, "loss": 0.7448715, "num_input_tokens_seen": 105248675, "step": 4879, "time_per_iteration": 2.73886775970459 }, { "auxiliary_loss_clip": 0.01184326, "auxiliary_loss_mlp": 0.01027166, "balance_loss_clip": 0.89897752, "balance_loss_mlp": 1.0195905, "epoch": 0.5867853063187639, "flos": 17052325925760.0, "grad_norm": 1.8056677649190256, "language_loss": 0.78512084, "learning_rate": 1.539133217243724e-06, "loss": 0.80723572, "num_input_tokens_seen": 105265695, "step": 4880, "time_per_iteration": 2.7831320762634277 }, { "auxiliary_loss_clip": 0.01177462, "auxiliary_loss_mlp": 0.01025255, "balance_loss_clip": 0.93719304, "balance_loss_mlp": 1.01727378, "epoch": 0.586905549209403, "flos": 24645026707200.0, "grad_norm": 2.072730230869904, "language_loss": 0.75918233, "learning_rate": 1.5383752441785081e-06, "loss": 0.78120947, "num_input_tokens_seen": 105284920, "step": 4881, "time_per_iteration": 2.7938811779022217 }, { "auxiliary_loss_clip": 0.01178981, "auxiliary_loss_mlp": 0.01036926, "balance_loss_clip": 1.01581597, "balance_loss_mlp": 1.02922761, "epoch": 0.5870257921000421, "flos": 14720538723840.0, "grad_norm": 2.0882048441616945, "language_loss": 0.84756225, "learning_rate": 1.5376173411414003e-06, "loss": 0.86972129, "num_input_tokens_seen": 105302960, "step": 4882, "time_per_iteration": 2.6713125705718994 }, { "auxiliary_loss_clip": 0.01167564, "auxiliary_loss_mlp": 0.01029105, "balance_loss_clip": 0.96907961, "balance_loss_mlp": 1.02173841, "epoch": 0.5871460349906812, "flos": 23914998691200.0, "grad_norm": 1.7914712331244955, "language_loss": 0.78697288, "learning_rate": 1.5368595082473753e-06, "loss": 0.80893958, "num_input_tokens_seen": 105321260, "step": 4883, "time_per_iteration": 3.6498873233795166 }, { "auxiliary_loss_clip": 0.01176202, "auxiliary_loss_mlp": 0.01024886, "balance_loss_clip": 1.01299024, "balance_loss_mlp": 1.01746488, "epoch": 0.5872662778813202, "flos": 22164174063360.0, "grad_norm": 1.5884313555564074, "language_loss": 0.77926266, "learning_rate": 1.5361017456113935e-06, "loss": 0.80127358, "num_input_tokens_seen": 105341610, "step": 4884, "time_per_iteration": 2.621384620666504 }, { "auxiliary_loss_clip": 0.01172602, "auxiliary_loss_mlp": 0.0102575, "balance_loss_clip": 1.01178348, "balance_loss_mlp": 1.01821554, "epoch": 0.5873865207719594, "flos": 18441925430400.0, "grad_norm": 1.9295770472524352, "language_loss": 0.86291856, "learning_rate": 1.5353440533484085e-06, "loss": 0.88490206, "num_input_tokens_seen": 105360465, "step": 4885, "time_per_iteration": 3.5614209175109863 }, { "auxiliary_loss_clip": 0.01175527, "auxiliary_loss_mlp": 0.01029799, "balance_loss_clip": 0.97610915, "balance_loss_mlp": 1.0215795, "epoch": 0.5875067636625985, "flos": 54015321427200.0, "grad_norm": 2.0791800313582764, "language_loss": 0.66471756, "learning_rate": 1.534586431573361e-06, "loss": 0.6867708, "num_input_tokens_seen": 105385405, "step": 4886, "time_per_iteration": 3.8691866397857666 }, { "auxiliary_loss_clip": 0.0116643, "auxiliary_loss_mlp": 0.01031345, "balance_loss_clip": 0.85754514, "balance_loss_mlp": 1.02313781, "epoch": 0.5876270065532375, "flos": 27995707100160.0, "grad_norm": 1.9146097553962744, "language_loss": 0.79238784, "learning_rate": 1.5338288804011817e-06, "loss": 0.81436557, "num_input_tokens_seen": 105404905, "step": 4887, "time_per_iteration": 2.9037067890167236 }, { "auxiliary_loss_clip": 0.01166641, "auxiliary_loss_mlp": 0.01026556, "balance_loss_clip": 0.97176707, "balance_loss_mlp": 1.01869106, "epoch": 0.5877472494438767, "flos": 21361462876800.0, "grad_norm": 1.9349151490577459, "language_loss": 0.71079868, "learning_rate": 1.533071399946791e-06, "loss": 0.73273063, "num_input_tokens_seen": 105423650, "step": 4888, "time_per_iteration": 2.761176347732544 }, { "auxiliary_loss_clip": 0.0117534, "auxiliary_loss_mlp": 0.01027084, "balance_loss_clip": 0.97494471, "balance_loss_mlp": 1.01954961, "epoch": 0.5878674923345157, "flos": 22383013674240.0, "grad_norm": 1.7547523856337945, "language_loss": 0.57266665, "learning_rate": 1.5323139903250977e-06, "loss": 0.59469086, "num_input_tokens_seen": 105444255, "step": 4889, "time_per_iteration": 2.631535530090332 }, { "auxiliary_loss_clip": 0.01178951, "auxiliary_loss_mlp": 0.01026357, "balance_loss_clip": 0.97942406, "balance_loss_mlp": 1.01910305, "epoch": 0.5879877352251548, "flos": 21868664872320.0, "grad_norm": 1.5085379955896852, "language_loss": 0.77148753, "learning_rate": 1.5315566516510002e-06, "loss": 0.7935406, "num_input_tokens_seen": 105462425, "step": 4890, "time_per_iteration": 2.716266393661499 }, { "auxiliary_loss_clip": 0.01176092, "auxiliary_loss_mlp": 0.01025722, "balance_loss_clip": 1.05219722, "balance_loss_mlp": 1.01787198, "epoch": 0.5881079781157939, "flos": 17493811989120.0, "grad_norm": 1.762633365566265, "language_loss": 0.67221546, "learning_rate": 1.5307993840393857e-06, "loss": 0.6942336, "num_input_tokens_seen": 105480505, "step": 4891, "time_per_iteration": 2.630439043045044 }, { "auxiliary_loss_clip": 0.01172509, "auxiliary_loss_mlp": 0.01025574, "balance_loss_clip": 1.05004263, "balance_loss_mlp": 1.01839149, "epoch": 0.588228221006433, "flos": 22601853285120.0, "grad_norm": 2.073421471144809, "language_loss": 0.80644011, "learning_rate": 1.530042187605132e-06, "loss": 0.82842088, "num_input_tokens_seen": 105499760, "step": 4892, "time_per_iteration": 2.6046838760375977 }, { "auxiliary_loss_clip": 0.01176537, "auxiliary_loss_mlp": 0.01122207, "balance_loss_clip": 1.01443326, "balance_loss_mlp": 0.0, "epoch": 0.5883484638970721, "flos": 26176939896960.0, "grad_norm": 1.4774184573057711, "language_loss": 0.8421979, "learning_rate": 1.5292850624631044e-06, "loss": 0.86518526, "num_input_tokens_seen": 105521955, "step": 4893, "time_per_iteration": 2.65144681930542 }, { "auxiliary_loss_clip": 0.01173549, "auxiliary_loss_mlp": 0.01022032, "balance_loss_clip": 1.01601171, "balance_loss_mlp": 1.01502287, "epoch": 0.5884687067877111, "flos": 30443737691520.0, "grad_norm": 1.852174967171871, "language_loss": 0.8011688, "learning_rate": 1.5285280087281593e-06, "loss": 0.82312459, "num_input_tokens_seen": 105542685, "step": 4894, "time_per_iteration": 2.687023639678955 }, { "auxiliary_loss_clip": 0.01075942, "auxiliary_loss_mlp": 0.01000534, "balance_loss_clip": 0.93984044, "balance_loss_mlp": 0.99891245, "epoch": 0.5885889496783503, "flos": 70507550580480.0, "grad_norm": 0.7290814745416798, "language_loss": 0.5663296, "learning_rate": 1.5277710265151398e-06, "loss": 0.58709431, "num_input_tokens_seen": 105612165, "step": 4895, "time_per_iteration": 3.4053585529327393 }, { "auxiliary_loss_clip": 0.01172238, "auxiliary_loss_mlp": 0.01028201, "balance_loss_clip": 1.01243865, "balance_loss_mlp": 1.02074409, "epoch": 0.5887091925689893, "flos": 19098767485440.0, "grad_norm": 2.767487762358123, "language_loss": 0.76522386, "learning_rate": 1.5270141159388803e-06, "loss": 0.78722829, "num_input_tokens_seen": 105629185, "step": 4896, "time_per_iteration": 2.620994806289673 }, { "auxiliary_loss_clip": 0.01172826, "auxiliary_loss_mlp": 0.01029599, "balance_loss_clip": 1.04999471, "balance_loss_mlp": 1.02210045, "epoch": 0.5888294354596284, "flos": 23294282739840.0, "grad_norm": 1.5567784278718602, "language_loss": 0.79949558, "learning_rate": 1.526257277114203e-06, "loss": 0.82151985, "num_input_tokens_seen": 105650260, "step": 4897, "time_per_iteration": 2.6511850357055664 }, { "auxiliary_loss_clip": 0.01170152, "auxiliary_loss_mlp": 0.01027944, "balance_loss_clip": 0.97638935, "balance_loss_mlp": 1.02078271, "epoch": 0.5889496783502676, "flos": 21981532383360.0, "grad_norm": 1.740984618685435, "language_loss": 0.79469824, "learning_rate": 1.5255005101559201e-06, "loss": 0.81667924, "num_input_tokens_seen": 105667870, "step": 4898, "time_per_iteration": 2.682520866394043 }, { "auxiliary_loss_clip": 0.01174321, "auxiliary_loss_mlp": 0.01030505, "balance_loss_clip": 1.01178598, "balance_loss_mlp": 1.02287829, "epoch": 0.5890699212409066, "flos": 21685233093120.0, "grad_norm": 1.8179718117668526, "language_loss": 0.76888973, "learning_rate": 1.524743815178833e-06, "loss": 0.79093802, "num_input_tokens_seen": 105685830, "step": 4899, "time_per_iteration": 2.643958806991577 }, { "auxiliary_loss_clip": 0.01169419, "auxiliary_loss_mlp": 0.01021135, "balance_loss_clip": 0.97073168, "balance_loss_mlp": 1.0142777, "epoch": 0.5891901641315457, "flos": 19464553635840.0, "grad_norm": 1.7139485741150702, "language_loss": 0.80732989, "learning_rate": 1.5239871922977315e-06, "loss": 0.82923543, "num_input_tokens_seen": 105705745, "step": 4900, "time_per_iteration": 2.727644920349121 }, { "auxiliary_loss_clip": 0.01167296, "auxiliary_loss_mlp": 0.01032558, "balance_loss_clip": 0.97171962, "balance_loss_mlp": 1.02536952, "epoch": 0.5893104070221848, "flos": 19609884063360.0, "grad_norm": 1.6588000406296086, "language_loss": 0.89452255, "learning_rate": 1.523230641627394e-06, "loss": 0.91652107, "num_input_tokens_seen": 105724730, "step": 4901, "time_per_iteration": 2.6940391063690186 }, { "auxiliary_loss_clip": 0.01168886, "auxiliary_loss_mlp": 0.01028505, "balance_loss_clip": 0.89283121, "balance_loss_mlp": 1.02164412, "epoch": 0.5894306499128239, "flos": 29060063930880.0, "grad_norm": 1.8978950731376292, "language_loss": 0.72449327, "learning_rate": 1.5224741632825888e-06, "loss": 0.74646717, "num_input_tokens_seen": 105744920, "step": 4902, "time_per_iteration": 2.9610955715179443 }, { "auxiliary_loss_clip": 0.01175172, "auxiliary_loss_mlp": 0.01027151, "balance_loss_clip": 1.05045068, "balance_loss_mlp": 1.0195756, "epoch": 0.589550892803463, "flos": 42298890721920.0, "grad_norm": 1.8217146681770207, "language_loss": 0.69352794, "learning_rate": 1.521717757378074e-06, "loss": 0.71555114, "num_input_tokens_seen": 105765465, "step": 4903, "time_per_iteration": 3.6702966690063477 }, { "auxiliary_loss_clip": 0.01179266, "auxiliary_loss_mlp": 0.01023033, "balance_loss_clip": 1.01416528, "balance_loss_mlp": 1.01540351, "epoch": 0.5896711356941021, "flos": 14137062197760.0, "grad_norm": 2.0920998655393963, "language_loss": 0.69307899, "learning_rate": 1.5209614240285943e-06, "loss": 0.71510196, "num_input_tokens_seen": 105783120, "step": 4904, "time_per_iteration": 2.5866050720214844 }, { "auxiliary_loss_clip": 0.01171526, "auxiliary_loss_mlp": 0.01122534, "balance_loss_clip": 1.0496515, "balance_loss_mlp": 0.0, "epoch": 0.5897913785847412, "flos": 17201355454080.0, "grad_norm": 2.235767540698618, "language_loss": 0.84494293, "learning_rate": 1.520205163348887e-06, "loss": 0.86788356, "num_input_tokens_seen": 105801055, "step": 4905, "time_per_iteration": 2.5809361934661865 }, { "auxiliary_loss_clip": 0.01081738, "auxiliary_loss_mlp": 0.01001089, "balance_loss_clip": 0.90299714, "balance_loss_mlp": 0.99940825, "epoch": 0.5899116214753802, "flos": 48794164202880.0, "grad_norm": 0.7411165497851017, "language_loss": 0.56996459, "learning_rate": 1.519448975453674e-06, "loss": 0.59079283, "num_input_tokens_seen": 105856155, "step": 4906, "time_per_iteration": 3.143397092819214 }, { "auxiliary_loss_clip": 0.01176346, "auxiliary_loss_mlp": 0.0112268, "balance_loss_clip": 1.01626468, "balance_loss_mlp": 0.0, "epoch": 0.5900318643660194, "flos": 21103659987840.0, "grad_norm": 1.886705544179361, "language_loss": 0.76176947, "learning_rate": 1.5186928604576696e-06, "loss": 0.78475976, "num_input_tokens_seen": 105873350, "step": 4907, "time_per_iteration": 2.621286392211914 }, { "auxiliary_loss_clip": 0.01170578, "auxiliary_loss_mlp": 0.01029063, "balance_loss_clip": 0.97199368, "balance_loss_mlp": 1.02174973, "epoch": 0.5901521072566585, "flos": 21178390233600.0, "grad_norm": 2.200575000912749, "language_loss": 0.77268505, "learning_rate": 1.5179368184755752e-06, "loss": 0.79468143, "num_input_tokens_seen": 105891435, "step": 4908, "time_per_iteration": 2.7102293968200684 }, { "auxiliary_loss_clip": 0.01172481, "auxiliary_loss_mlp": 0.01027775, "balance_loss_clip": 0.97447199, "balance_loss_mlp": 1.02076828, "epoch": 0.5902723501472975, "flos": 20225967160320.0, "grad_norm": 1.674126379568804, "language_loss": 0.82855105, "learning_rate": 1.5171808496220821e-06, "loss": 0.85055357, "num_input_tokens_seen": 105910190, "step": 4909, "time_per_iteration": 3.6590065956115723 }, { "auxiliary_loss_clip": 0.0117526, "auxiliary_loss_mlp": 0.01025942, "balance_loss_clip": 0.97298682, "balance_loss_mlp": 1.01869154, "epoch": 0.5903925930379367, "flos": 22964407211520.0, "grad_norm": 1.6541726339979037, "language_loss": 0.81678057, "learning_rate": 1.5164249540118708e-06, "loss": 0.83879262, "num_input_tokens_seen": 105929315, "step": 4910, "time_per_iteration": 2.6811037063598633 }, { "auxiliary_loss_clip": 0.01172533, "auxiliary_loss_mlp": 0.01027023, "balance_loss_clip": 0.85918546, "balance_loss_mlp": 1.02018344, "epoch": 0.5905128359285757, "flos": 23367720096000.0, "grad_norm": 1.5894906218066958, "language_loss": 0.83233786, "learning_rate": 1.5156691317596093e-06, "loss": 0.8543334, "num_input_tokens_seen": 105950740, "step": 4911, "time_per_iteration": 3.70139217376709 }, { "auxiliary_loss_clip": 0.01175921, "auxiliary_loss_mlp": 0.01122081, "balance_loss_clip": 1.01236725, "balance_loss_mlp": 0.0, "epoch": 0.5906330788192148, "flos": 28032335994240.0, "grad_norm": 1.9108505096982882, "language_loss": 0.66458142, "learning_rate": 1.5149133829799556e-06, "loss": 0.68756145, "num_input_tokens_seen": 105968735, "step": 4912, "time_per_iteration": 3.6610803604125977 }, { "auxiliary_loss_clip": 0.01180916, "auxiliary_loss_mlp": 0.01032231, "balance_loss_clip": 0.97446507, "balance_loss_mlp": 1.02490592, "epoch": 0.590753321709854, "flos": 18477943793280.0, "grad_norm": 2.0944370307982605, "language_loss": 0.80823696, "learning_rate": 1.5141577077875556e-06, "loss": 0.83036846, "num_input_tokens_seen": 105986060, "step": 4913, "time_per_iteration": 2.6877400875091553 }, { "auxiliary_loss_clip": 0.01176421, "auxiliary_loss_mlp": 0.01024755, "balance_loss_clip": 1.01212788, "balance_loss_mlp": 1.01769781, "epoch": 0.590873564600493, "flos": 16873706568960.0, "grad_norm": 2.0195893711407926, "language_loss": 0.72591841, "learning_rate": 1.5134021062970451e-06, "loss": 0.74793017, "num_input_tokens_seen": 106004440, "step": 4914, "time_per_iteration": 2.65054988861084 }, { "auxiliary_loss_clip": 0.01163094, "auxiliary_loss_mlp": 0.01028857, "balance_loss_clip": 0.937841, "balance_loss_mlp": 1.02126312, "epoch": 0.5909938074911321, "flos": 13516166678400.0, "grad_norm": 1.919872407201989, "language_loss": 0.80773991, "learning_rate": 1.5126465786230483e-06, "loss": 0.82965946, "num_input_tokens_seen": 106021215, "step": 4915, "time_per_iteration": 2.6977415084838867 }, { "auxiliary_loss_clip": 0.01173507, "auxiliary_loss_mlp": 0.01025221, "balance_loss_clip": 1.05054379, "balance_loss_mlp": 1.01800311, "epoch": 0.5911140503817712, "flos": 26024067613440.0, "grad_norm": 2.2513723489785775, "language_loss": 0.82194388, "learning_rate": 1.5118911248801787e-06, "loss": 0.8439312, "num_input_tokens_seen": 106039225, "step": 4916, "time_per_iteration": 2.650275230407715 }, { "auxiliary_loss_clip": 0.01169735, "auxiliary_loss_mlp": 0.01022135, "balance_loss_clip": 1.01164484, "balance_loss_mlp": 1.01528955, "epoch": 0.5912342932724103, "flos": 23258731253760.0, "grad_norm": 1.7851617082728015, "language_loss": 0.79827917, "learning_rate": 1.5111357451830364e-06, "loss": 0.82019788, "num_input_tokens_seen": 106057920, "step": 4917, "time_per_iteration": 2.624985456466675 }, { "auxiliary_loss_clip": 0.01174733, "auxiliary_loss_mlp": 0.01019725, "balance_loss_clip": 1.01247573, "balance_loss_mlp": 1.01320159, "epoch": 0.5913545361630493, "flos": 19573039687680.0, "grad_norm": 2.4745377288321597, "language_loss": 0.70492554, "learning_rate": 1.5103804396462131e-06, "loss": 0.72687012, "num_input_tokens_seen": 106077855, "step": 4918, "time_per_iteration": 2.668116807937622 }, { "auxiliary_loss_clip": 0.01176349, "auxiliary_loss_mlp": 0.01029946, "balance_loss_clip": 1.01149702, "balance_loss_mlp": 1.02198565, "epoch": 0.5914747790536885, "flos": 26213532877440.0, "grad_norm": 2.97760730230223, "language_loss": 0.79512972, "learning_rate": 1.5096252083842877e-06, "loss": 0.81719267, "num_input_tokens_seen": 106097065, "step": 4919, "time_per_iteration": 2.663921594619751 }, { "auxiliary_loss_clip": 0.0117017, "auxiliary_loss_mlp": 0.01027487, "balance_loss_clip": 1.0098455, "balance_loss_mlp": 1.02034044, "epoch": 0.5915950219443276, "flos": 27417545786880.0, "grad_norm": 1.7908560047202573, "language_loss": 0.8525784, "learning_rate": 1.5088700515118285e-06, "loss": 0.87455499, "num_input_tokens_seen": 106116385, "step": 4920, "time_per_iteration": 2.7054617404937744 }, { "auxiliary_loss_clip": 0.01164359, "auxiliary_loss_mlp": 0.01027583, "balance_loss_clip": 0.93505383, "balance_loss_mlp": 1.02009034, "epoch": 0.5917152648349666, "flos": 21907879545600.0, "grad_norm": 1.473502040998763, "language_loss": 0.66633046, "learning_rate": 1.508114969143392e-06, "loss": 0.68824989, "num_input_tokens_seen": 106136370, "step": 4921, "time_per_iteration": 2.69323992729187 }, { "auxiliary_loss_clip": 0.01169476, "auxiliary_loss_mlp": 0.0102436, "balance_loss_clip": 0.97022712, "balance_loss_mlp": 1.01753807, "epoch": 0.5918355077256057, "flos": 28109185142400.0, "grad_norm": 1.3888761105132317, "language_loss": 0.77443475, "learning_rate": 1.5073599613935238e-06, "loss": 0.79637313, "num_input_tokens_seen": 106158490, "step": 4922, "time_per_iteration": 2.7846755981445312 }, { "auxiliary_loss_clip": 0.01168472, "auxiliary_loss_mlp": 0.01026833, "balance_loss_clip": 0.97265017, "balance_loss_mlp": 1.019382, "epoch": 0.5919557506162448, "flos": 28183807647360.0, "grad_norm": 1.8004729284966274, "language_loss": 0.57419157, "learning_rate": 1.5066050283767574e-06, "loss": 0.59614468, "num_input_tokens_seen": 106179170, "step": 4923, "time_per_iteration": 2.7697336673736572 }, { "auxiliary_loss_clip": 0.01167875, "auxiliary_loss_mlp": 0.01022858, "balance_loss_clip": 0.9735707, "balance_loss_mlp": 1.01583362, "epoch": 0.5920759935068839, "flos": 12094355652480.0, "grad_norm": 3.5321157212198715, "language_loss": 0.82573199, "learning_rate": 1.505850170207616e-06, "loss": 0.84763932, "num_input_tokens_seen": 106196035, "step": 4924, "time_per_iteration": 2.7610888481140137 }, { "auxiliary_loss_clip": 0.01168234, "auxiliary_loss_mlp": 0.01025118, "balance_loss_clip": 0.97173786, "balance_loss_mlp": 1.01772738, "epoch": 0.592196236397523, "flos": 29424772673280.0, "grad_norm": 2.0998529844703877, "language_loss": 0.78325427, "learning_rate": 1.505095387000611e-06, "loss": 0.80518782, "num_input_tokens_seen": 106218335, "step": 4925, "time_per_iteration": 2.7562217712402344 }, { "auxiliary_loss_clip": 0.01159616, "auxiliary_loss_mlp": 0.01026251, "balance_loss_clip": 0.97157931, "balance_loss_mlp": 1.01917005, "epoch": 0.5923164792881621, "flos": 24384709866240.0, "grad_norm": 1.8567338691241775, "language_loss": 0.74348867, "learning_rate": 1.504340678870242e-06, "loss": 0.76534736, "num_input_tokens_seen": 106236550, "step": 4926, "time_per_iteration": 2.688044786453247 }, { "auxiliary_loss_clip": 0.01169991, "auxiliary_loss_mlp": 0.01025045, "balance_loss_clip": 1.01148343, "balance_loss_mlp": 1.01739717, "epoch": 0.5924367221788012, "flos": 24024238928640.0, "grad_norm": 2.7635843139706844, "language_loss": 0.89654171, "learning_rate": 1.5035860459309989e-06, "loss": 0.91849214, "num_input_tokens_seen": 106254265, "step": 4927, "time_per_iteration": 2.645669937133789 }, { "auxiliary_loss_clip": 0.01166003, "auxiliary_loss_mlp": 0.01032399, "balance_loss_clip": 0.97222477, "balance_loss_mlp": 1.023947, "epoch": 0.5925569650694402, "flos": 26870590414080.0, "grad_norm": 2.1055871669481814, "language_loss": 0.63553512, "learning_rate": 1.5028314882973568e-06, "loss": 0.65751922, "num_input_tokens_seen": 106274670, "step": 4928, "time_per_iteration": 2.704340696334839 }, { "auxiliary_loss_clip": 0.01174619, "auxiliary_loss_mlp": 0.01032311, "balance_loss_clip": 0.97597891, "balance_loss_mlp": 1.02463436, "epoch": 0.5926772079600794, "flos": 22302788647680.0, "grad_norm": 1.6975241086763877, "language_loss": 0.84439045, "learning_rate": 1.502077006083783e-06, "loss": 0.86645973, "num_input_tokens_seen": 106293330, "step": 4929, "time_per_iteration": 3.632917642593384 }, { "auxiliary_loss_clip": 0.01178847, "auxiliary_loss_mlp": 0.01122525, "balance_loss_clip": 1.0142746, "balance_loss_mlp": 0.0, "epoch": 0.5927974508507184, "flos": 19865244827520.0, "grad_norm": 1.7420837986804534, "language_loss": 0.76289731, "learning_rate": 1.5013225994047315e-06, "loss": 0.78591108, "num_input_tokens_seen": 106310960, "step": 4930, "time_per_iteration": 2.6323928833007812 }, { "auxiliary_loss_clip": 0.01176752, "auxiliary_loss_mlp": 0.01122672, "balance_loss_clip": 1.01570344, "balance_loss_mlp": 0.0, "epoch": 0.5929176937413575, "flos": 15776743167360.0, "grad_norm": 1.5854140698275405, "language_loss": 0.8054769, "learning_rate": 1.5005682683746452e-06, "loss": 0.82847118, "num_input_tokens_seen": 106329475, "step": 4931, "time_per_iteration": 2.6671388149261475 }, { "auxiliary_loss_clip": 0.01171142, "auxiliary_loss_mlp": 0.01027664, "balance_loss_clip": 1.01259851, "balance_loss_mlp": 1.01994503, "epoch": 0.5930379366319967, "flos": 17601472028160.0, "grad_norm": 2.6137245890507805, "language_loss": 0.72783744, "learning_rate": 1.4998140131079553e-06, "loss": 0.74982554, "num_input_tokens_seen": 106345565, "step": 4932, "time_per_iteration": 2.674422025680542 }, { "auxiliary_loss_clip": 0.01167212, "auxiliary_loss_mlp": 0.0112241, "balance_loss_clip": 0.85753787, "balance_loss_mlp": 0.0, "epoch": 0.5931581795226357, "flos": 17704283731200.0, "grad_norm": 1.9673771412754826, "language_loss": 0.73426574, "learning_rate": 1.4990598337190821e-06, "loss": 0.75716197, "num_input_tokens_seen": 106361920, "step": 4933, "time_per_iteration": 2.676201343536377 }, { "auxiliary_loss_clip": 0.01172495, "auxiliary_loss_mlp": 0.01122911, "balance_loss_clip": 1.05022478, "balance_loss_mlp": 0.0, "epoch": 0.5932784224132748, "flos": 24280102483200.0, "grad_norm": 1.8821935451429754, "language_loss": 0.67615056, "learning_rate": 1.4983057303224338e-06, "loss": 0.69910455, "num_input_tokens_seen": 106381735, "step": 4934, "time_per_iteration": 2.7006118297576904 }, { "auxiliary_loss_clip": 0.01166062, "auxiliary_loss_mlp": 0.01026616, "balance_loss_clip": 0.8956784, "balance_loss_mlp": 1.01962101, "epoch": 0.5933986653039139, "flos": 22926700909440.0, "grad_norm": 1.7085380168535442, "language_loss": 0.8778528, "learning_rate": 1.4975517030324072e-06, "loss": 0.89977956, "num_input_tokens_seen": 106399745, "step": 4935, "time_per_iteration": 3.636960506439209 }, { "auxiliary_loss_clip": 0.01069743, "auxiliary_loss_mlp": 0.01116659, "balance_loss_clip": 1.01477599, "balance_loss_mlp": 0.0, "epoch": 0.593518908194553, "flos": 71121730256640.0, "grad_norm": 0.7848478805896684, "language_loss": 0.61825937, "learning_rate": 1.4967977519633882e-06, "loss": 0.64012343, "num_input_tokens_seen": 106457205, "step": 4936, "time_per_iteration": 3.301440954208374 }, { "auxiliary_loss_clip": 0.01166926, "auxiliary_loss_mlp": 0.01032385, "balance_loss_clip": 0.9349696, "balance_loss_mlp": 1.02437735, "epoch": 0.593639151085192, "flos": 20448649526400.0, "grad_norm": 2.5025324558968807, "language_loss": 0.7839421, "learning_rate": 1.4960438772297494e-06, "loss": 0.80593526, "num_input_tokens_seen": 106474250, "step": 4937, "time_per_iteration": 3.5244929790496826 }, { "auxiliary_loss_clip": 0.0117064, "auxiliary_loss_mlp": 0.01028343, "balance_loss_clip": 0.97093517, "balance_loss_mlp": 1.02033758, "epoch": 0.5937593939758312, "flos": 30883428074880.0, "grad_norm": 1.9924182292173762, "language_loss": 0.73880905, "learning_rate": 1.495290078945855e-06, "loss": 0.76079887, "num_input_tokens_seen": 106494015, "step": 4938, "time_per_iteration": 3.6357271671295166 }, { "auxiliary_loss_clip": 0.01171885, "auxiliary_loss_mlp": 0.01022594, "balance_loss_clip": 1.05081391, "balance_loss_mlp": 1.01585019, "epoch": 0.5938796368664703, "flos": 36898069668480.0, "grad_norm": 1.7015265233048205, "language_loss": 0.74250734, "learning_rate": 1.4945363572260529e-06, "loss": 0.76445222, "num_input_tokens_seen": 106515010, "step": 4939, "time_per_iteration": 2.6948301792144775 }, { "auxiliary_loss_clip": 0.01171792, "auxiliary_loss_mlp": 0.01023467, "balance_loss_clip": 1.01030231, "balance_loss_mlp": 1.01606452, "epoch": 0.5939998797571093, "flos": 23842926051840.0, "grad_norm": 1.767845072960063, "language_loss": 0.67995071, "learning_rate": 1.4937827121846845e-06, "loss": 0.70190322, "num_input_tokens_seen": 106535265, "step": 4940, "time_per_iteration": 2.6700191497802734 }, { "auxiliary_loss_clip": 0.01167028, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 0.93623108, "balance_loss_mlp": 1.0296371, "epoch": 0.5941201226477485, "flos": 25191407462400.0, "grad_norm": 1.4969028019130797, "language_loss": 0.73391008, "learning_rate": 1.4930291439360755e-06, "loss": 0.75594813, "num_input_tokens_seen": 106557830, "step": 4941, "time_per_iteration": 2.8364691734313965 }, { "auxiliary_loss_clip": 0.01173254, "auxiliary_loss_mlp": 0.01027372, "balance_loss_clip": 1.01270652, "balance_loss_mlp": 1.01995182, "epoch": 0.5942403655383875, "flos": 22418996123520.0, "grad_norm": 1.9366642761244772, "language_loss": 0.79192799, "learning_rate": 1.4922756525945427e-06, "loss": 0.81393427, "num_input_tokens_seen": 106577140, "step": 4942, "time_per_iteration": 2.6875314712524414 }, { "auxiliary_loss_clip": 0.01074545, "auxiliary_loss_mlp": 0.01003275, "balance_loss_clip": 0.97780085, "balance_loss_mlp": 1.00161827, "epoch": 0.5943606084290266, "flos": 67629310796160.0, "grad_norm": 0.7751151997092094, "language_loss": 0.59622079, "learning_rate": 1.4915222382743894e-06, "loss": 0.61699903, "num_input_tokens_seen": 106635975, "step": 4943, "time_per_iteration": 3.2700698375701904 }, { "auxiliary_loss_clip": 0.01173067, "auxiliary_loss_mlp": 0.01027744, "balance_loss_clip": 1.01290298, "balance_loss_mlp": 1.01997399, "epoch": 0.5944808513196658, "flos": 18223157646720.0, "grad_norm": 1.978546614901157, "language_loss": 0.7171275, "learning_rate": 1.4907689010899085e-06, "loss": 0.73913562, "num_input_tokens_seen": 106653555, "step": 4944, "time_per_iteration": 2.661505937576294 }, { "auxiliary_loss_clip": 0.01172155, "auxiliary_loss_mlp": 0.01022086, "balance_loss_clip": 0.97266793, "balance_loss_mlp": 1.01472461, "epoch": 0.5946010942103048, "flos": 24790824011520.0, "grad_norm": 1.7221468075319668, "language_loss": 0.62857008, "learning_rate": 1.4900156411553804e-06, "loss": 0.65051246, "num_input_tokens_seen": 106673385, "step": 4945, "time_per_iteration": 2.762211561203003 }, { "auxiliary_loss_clip": 0.01173991, "auxiliary_loss_mlp": 0.0103337, "balance_loss_clip": 0.97536236, "balance_loss_mlp": 1.02526999, "epoch": 0.5947213371009439, "flos": 15231619388160.0, "grad_norm": 2.017967584475808, "language_loss": 0.85509092, "learning_rate": 1.4892624585850739e-06, "loss": 0.87716454, "num_input_tokens_seen": 106691740, "step": 4946, "time_per_iteration": 2.7046072483062744 }, { "auxiliary_loss_clip": 0.0117723, "auxiliary_loss_mlp": 0.01026079, "balance_loss_clip": 1.0521481, "balance_loss_mlp": 1.01862574, "epoch": 0.594841579991583, "flos": 25848069949440.0, "grad_norm": 1.8503433978194652, "language_loss": 0.79560262, "learning_rate": 1.4885093534932465e-06, "loss": 0.81763577, "num_input_tokens_seen": 106709705, "step": 4947, "time_per_iteration": 2.595560073852539 }, { "auxiliary_loss_clip": 0.0116824, "auxiliary_loss_mlp": 0.0103206, "balance_loss_clip": 0.97623461, "balance_loss_mlp": 1.02534842, "epoch": 0.5949618228822221, "flos": 23981109672960.0, "grad_norm": 1.886255005004411, "language_loss": 0.7129885, "learning_rate": 1.4877563259941433e-06, "loss": 0.73499143, "num_input_tokens_seen": 106727560, "step": 4948, "time_per_iteration": 2.6846907138824463 }, { "auxiliary_loss_clip": 0.01181427, "auxiliary_loss_mlp": 0.01026268, "balance_loss_clip": 1.01482093, "balance_loss_mlp": 1.01824522, "epoch": 0.5950820657728612, "flos": 40547491476480.0, "grad_norm": 2.042527968632584, "language_loss": 0.67886126, "learning_rate": 1.4870033762019988e-06, "loss": 0.70093822, "num_input_tokens_seen": 106747725, "step": 4949, "time_per_iteration": 2.8002097606658936 }, { "auxiliary_loss_clip": 0.01169629, "auxiliary_loss_mlp": 0.0103124, "balance_loss_clip": 0.97379827, "balance_loss_mlp": 1.02320552, "epoch": 0.5952023086635003, "flos": 23184467884800.0, "grad_norm": 1.545299629711732, "language_loss": 0.7342909, "learning_rate": 1.4862505042310334e-06, "loss": 0.75629956, "num_input_tokens_seen": 106767010, "step": 4950, "time_per_iteration": 2.693079710006714 }, { "auxiliary_loss_clip": 0.0116287, "auxiliary_loss_mlp": 0.01028061, "balance_loss_clip": 0.97361225, "balance_loss_mlp": 1.02083373, "epoch": 0.5953225515541394, "flos": 33653289548160.0, "grad_norm": 1.7926706498157896, "language_loss": 0.69537604, "learning_rate": 1.4854977101954587e-06, "loss": 0.71728545, "num_input_tokens_seen": 106789230, "step": 4951, "time_per_iteration": 2.7795684337615967 }, { "auxiliary_loss_clip": 0.01170911, "auxiliary_loss_mlp": 0.01024191, "balance_loss_clip": 1.00870013, "balance_loss_mlp": 1.01686859, "epoch": 0.5954427944447784, "flos": 24459619680000.0, "grad_norm": 1.7401807070428719, "language_loss": 0.86440837, "learning_rate": 1.4847449942094716e-06, "loss": 0.88635939, "num_input_tokens_seen": 106808110, "step": 4952, "time_per_iteration": 2.663518190383911 }, { "auxiliary_loss_clip": 0.01166102, "auxiliary_loss_mlp": 0.01028995, "balance_loss_clip": 0.97224051, "balance_loss_mlp": 1.02191734, "epoch": 0.5955630373354175, "flos": 18551848026240.0, "grad_norm": 1.8398936119166531, "language_loss": 0.86145794, "learning_rate": 1.4839923563872598e-06, "loss": 0.8834089, "num_input_tokens_seen": 106826650, "step": 4953, "time_per_iteration": 2.6708157062530518 }, { "auxiliary_loss_clip": 0.01175297, "auxiliary_loss_mlp": 0.01030482, "balance_loss_clip": 0.93875128, "balance_loss_mlp": 1.02269483, "epoch": 0.5956832802260567, "flos": 19791699730560.0, "grad_norm": 1.8082640152543152, "language_loss": 0.76183939, "learning_rate": 1.483239796842997e-06, "loss": 0.78389716, "num_input_tokens_seen": 106844680, "step": 4954, "time_per_iteration": 2.6663479804992676 }, { "auxiliary_loss_clip": 0.0117078, "auxiliary_loss_mlp": 0.01034423, "balance_loss_clip": 0.93731076, "balance_loss_mlp": 1.0275805, "epoch": 0.5958035231166957, "flos": 19750868945280.0, "grad_norm": 2.020088534817219, "language_loss": 0.83620018, "learning_rate": 1.4824873156908462e-06, "loss": 0.85825223, "num_input_tokens_seen": 106862605, "step": 4955, "time_per_iteration": 3.6031906604766846 }, { "auxiliary_loss_clip": 0.01174049, "auxiliary_loss_mlp": 0.0112328, "balance_loss_clip": 1.01416135, "balance_loss_mlp": 0.0, "epoch": 0.5959237660073348, "flos": 21652806090240.0, "grad_norm": 1.4752142108655906, "language_loss": 0.75484419, "learning_rate": 1.4817349130449584e-06, "loss": 0.77781749, "num_input_tokens_seen": 106882325, "step": 4956, "time_per_iteration": 2.6706173419952393 }, { "auxiliary_loss_clip": 0.01170168, "auxiliary_loss_mlp": 0.01027105, "balance_loss_clip": 1.01349235, "balance_loss_mlp": 1.02027464, "epoch": 0.5960440088979739, "flos": 21171207513600.0, "grad_norm": 1.9640426248201777, "language_loss": 0.82915705, "learning_rate": 1.4809825890194717e-06, "loss": 0.85112977, "num_input_tokens_seen": 106900995, "step": 4957, "time_per_iteration": 2.7085421085357666 }, { "auxiliary_loss_clip": 0.01163688, "auxiliary_loss_mlp": 0.01030983, "balance_loss_clip": 0.97152328, "balance_loss_mlp": 1.02371395, "epoch": 0.596164251788613, "flos": 14757526753920.0, "grad_norm": 2.64396174167258, "language_loss": 0.7742753, "learning_rate": 1.4802303437285139e-06, "loss": 0.79622197, "num_input_tokens_seen": 106918265, "step": 4958, "time_per_iteration": 2.692646026611328 }, { "auxiliary_loss_clip": 0.01167137, "auxiliary_loss_mlp": 0.0102873, "balance_loss_clip": 0.97156531, "balance_loss_mlp": 1.02184558, "epoch": 0.596284494679252, "flos": 20485924865280.0, "grad_norm": 2.241347242159276, "language_loss": 0.80264747, "learning_rate": 1.4794781772861994e-06, "loss": 0.82460618, "num_input_tokens_seen": 106934760, "step": 4959, "time_per_iteration": 2.797010898590088 }, { "auxiliary_loss_clip": 0.01168122, "auxiliary_loss_mlp": 0.01122402, "balance_loss_clip": 0.97199988, "balance_loss_mlp": 0.0, "epoch": 0.5964047375698912, "flos": 31212262108800.0, "grad_norm": 2.001622475147847, "language_loss": 0.66566944, "learning_rate": 1.4787260898066324e-06, "loss": 0.68857473, "num_input_tokens_seen": 106954760, "step": 4960, "time_per_iteration": 2.7452962398529053 }, { "auxiliary_loss_clip": 0.01171401, "auxiliary_loss_mlp": 0.01023652, "balance_loss_clip": 1.05050397, "balance_loss_mlp": 1.0165112, "epoch": 0.5965249804605303, "flos": 27483620855040.0, "grad_norm": 2.05807261471852, "language_loss": 0.86203045, "learning_rate": 1.4779740814039023e-06, "loss": 0.88398099, "num_input_tokens_seen": 106974845, "step": 4961, "time_per_iteration": 3.5626401901245117 }, { "auxiliary_loss_clip": 0.01172202, "auxiliary_loss_mlp": 0.01025529, "balance_loss_clip": 1.04912615, "balance_loss_mlp": 1.01802468, "epoch": 0.5966452233511693, "flos": 30773936442240.0, "grad_norm": 1.7667891035940275, "language_loss": 0.68361664, "learning_rate": 1.4772221521920894e-06, "loss": 0.70559394, "num_input_tokens_seen": 106994870, "step": 4962, "time_per_iteration": 2.637964963912964 }, { "auxiliary_loss_clip": 0.01172212, "auxiliary_loss_mlp": 0.0102866, "balance_loss_clip": 0.97606254, "balance_loss_mlp": 1.02120376, "epoch": 0.5967654662418085, "flos": 25481170477440.0, "grad_norm": 2.240675031982481, "language_loss": 0.74107134, "learning_rate": 1.4764703022852598e-06, "loss": 0.76308, "num_input_tokens_seen": 107015390, "step": 4963, "time_per_iteration": 3.6528568267822266 }, { "auxiliary_loss_clip": 0.0115354, "auxiliary_loss_mlp": 0.01026056, "balance_loss_clip": 0.85608947, "balance_loss_mlp": 1.01913285, "epoch": 0.5968857091324475, "flos": 19099126621440.0, "grad_norm": 1.6199977450527692, "language_loss": 0.77020895, "learning_rate": 1.4757185317974696e-06, "loss": 0.79200494, "num_input_tokens_seen": 107033775, "step": 4964, "time_per_iteration": 3.618755340576172 }, { "auxiliary_loss_clip": 0.01171133, "auxiliary_loss_mlp": 0.01024463, "balance_loss_clip": 1.01026559, "balance_loss_mlp": 1.01684523, "epoch": 0.5970059520230866, "flos": 23692711374720.0, "grad_norm": 2.3964976700036726, "language_loss": 0.70685643, "learning_rate": 1.474966840842761e-06, "loss": 0.7288124, "num_input_tokens_seen": 107053355, "step": 4965, "time_per_iteration": 2.7206602096557617 }, { "auxiliary_loss_clip": 0.01175322, "auxiliary_loss_mlp": 0.01027099, "balance_loss_clip": 1.01234889, "balance_loss_mlp": 1.01970172, "epoch": 0.5971261949137258, "flos": 23185545292800.0, "grad_norm": 2.110011760531482, "language_loss": 0.87058616, "learning_rate": 1.4742152295351655e-06, "loss": 0.89261031, "num_input_tokens_seen": 107072510, "step": 4966, "time_per_iteration": 2.6321871280670166 }, { "auxiliary_loss_clip": 0.01171682, "auxiliary_loss_mlp": 0.01123042, "balance_loss_clip": 1.01213026, "balance_loss_mlp": 0.0, "epoch": 0.5972464378043648, "flos": 20557710195840.0, "grad_norm": 6.669028647632212, "language_loss": 0.63925165, "learning_rate": 1.4734636979887016e-06, "loss": 0.6621989, "num_input_tokens_seen": 107089970, "step": 4967, "time_per_iteration": 2.652977466583252 }, { "auxiliary_loss_clip": 0.01173183, "auxiliary_loss_mlp": 0.01033894, "balance_loss_clip": 0.93348777, "balance_loss_mlp": 1.02633584, "epoch": 0.5973666806950039, "flos": 29387030457600.0, "grad_norm": 1.8614321725895009, "language_loss": 0.89956957, "learning_rate": 1.4727122463173755e-06, "loss": 0.9216404, "num_input_tokens_seen": 107108500, "step": 4968, "time_per_iteration": 2.822801351547241 }, { "auxiliary_loss_clip": 0.01171593, "auxiliary_loss_mlp": 0.01023335, "balance_loss_clip": 0.97529823, "balance_loss_mlp": 1.01629281, "epoch": 0.597486923585643, "flos": 22273522041600.0, "grad_norm": 1.9807999912063095, "language_loss": 0.64014786, "learning_rate": 1.471960874635183e-06, "loss": 0.6620971, "num_input_tokens_seen": 107128060, "step": 4969, "time_per_iteration": 2.6235291957855225 }, { "auxiliary_loss_clip": 0.01164098, "auxiliary_loss_mlp": 0.01021601, "balance_loss_clip": 0.97199178, "balance_loss_mlp": 1.01442814, "epoch": 0.5976071664762821, "flos": 13772461196160.0, "grad_norm": 2.011165595149073, "language_loss": 0.70677853, "learning_rate": 1.4712095830561055e-06, "loss": 0.72863549, "num_input_tokens_seen": 107146550, "step": 4970, "time_per_iteration": 2.7040295600891113 }, { "auxiliary_loss_clip": 0.01169341, "auxiliary_loss_mlp": 0.01025425, "balance_loss_clip": 0.9714793, "balance_loss_mlp": 1.0184809, "epoch": 0.5977274093669211, "flos": 19098623831040.0, "grad_norm": 1.7215224130824318, "language_loss": 0.80957627, "learning_rate": 1.4704583716941147e-06, "loss": 0.8315239, "num_input_tokens_seen": 107165415, "step": 4971, "time_per_iteration": 2.6670219898223877 }, { "auxiliary_loss_clip": 0.01162392, "auxiliary_loss_mlp": 0.01026742, "balance_loss_clip": 1.0103538, "balance_loss_mlp": 1.01935697, "epoch": 0.5978476522575603, "flos": 20376002269440.0, "grad_norm": 1.9061012857603425, "language_loss": 0.72178036, "learning_rate": 1.4697072406631672e-06, "loss": 0.74367166, "num_input_tokens_seen": 107185320, "step": 4972, "time_per_iteration": 2.665391445159912 }, { "auxiliary_loss_clip": 0.01174634, "auxiliary_loss_mlp": 0.01026783, "balance_loss_clip": 0.89998084, "balance_loss_mlp": 1.01881373, "epoch": 0.5979678951481994, "flos": 29023147728000.0, "grad_norm": 1.6618332504922355, "language_loss": 0.72561365, "learning_rate": 1.4689561900772097e-06, "loss": 0.74762785, "num_input_tokens_seen": 107205380, "step": 4973, "time_per_iteration": 2.750819683074951 }, { "auxiliary_loss_clip": 0.01169957, "auxiliary_loss_mlp": 0.01026683, "balance_loss_clip": 0.97393173, "balance_loss_mlp": 1.01962578, "epoch": 0.5980881380388384, "flos": 17967689141760.0, "grad_norm": 2.1053839284900224, "language_loss": 0.72794664, "learning_rate": 1.4682052200501758e-06, "loss": 0.74991304, "num_input_tokens_seen": 107222585, "step": 4974, "time_per_iteration": 2.6712186336517334 }, { "auxiliary_loss_clip": 0.01171158, "auxiliary_loss_mlp": 0.01024551, "balance_loss_clip": 1.04830945, "balance_loss_mlp": 1.01803637, "epoch": 0.5982083809294776, "flos": 22962827013120.0, "grad_norm": 1.6205421952834211, "language_loss": 0.80039573, "learning_rate": 1.4674543306959876e-06, "loss": 0.82235277, "num_input_tokens_seen": 107242055, "step": 4975, "time_per_iteration": 2.611116647720337 }, { "auxiliary_loss_clip": 0.0117707, "auxiliary_loss_mlp": 0.01026785, "balance_loss_clip": 0.97494054, "balance_loss_mlp": 1.01953125, "epoch": 0.5983286238201166, "flos": 20991941712000.0, "grad_norm": 2.4552939046626525, "language_loss": 0.84125072, "learning_rate": 1.4667035221285535e-06, "loss": 0.8632893, "num_input_tokens_seen": 107259695, "step": 4976, "time_per_iteration": 2.656154155731201 }, { "auxiliary_loss_clip": 0.01170758, "auxiliary_loss_mlp": 0.01030168, "balance_loss_clip": 1.01299357, "balance_loss_mlp": 1.02311659, "epoch": 0.5984488667107557, "flos": 28183448511360.0, "grad_norm": 1.9978569459669984, "language_loss": 0.73902637, "learning_rate": 1.4659527944617715e-06, "loss": 0.76103562, "num_input_tokens_seen": 107279640, "step": 4977, "time_per_iteration": 2.6134183406829834 }, { "auxiliary_loss_clip": 0.01157703, "auxiliary_loss_mlp": 0.01025181, "balance_loss_clip": 0.85634327, "balance_loss_mlp": 1.01819205, "epoch": 0.5985691096013949, "flos": 16471794314880.0, "grad_norm": 1.855379600672011, "language_loss": 0.76487565, "learning_rate": 1.465202147809526e-06, "loss": 0.78670454, "num_input_tokens_seen": 107298135, "step": 4978, "time_per_iteration": 2.6474483013153076 }, { "auxiliary_loss_clip": 0.01175966, "auxiliary_loss_mlp": 0.01025294, "balance_loss_clip": 1.05262089, "balance_loss_mlp": 1.01787364, "epoch": 0.5986893524920339, "flos": 26719046933760.0, "grad_norm": 1.7711258762092028, "language_loss": 0.76404756, "learning_rate": 1.4644515822856888e-06, "loss": 0.78606015, "num_input_tokens_seen": 107316570, "step": 4979, "time_per_iteration": 2.5164992809295654 }, { "auxiliary_loss_clip": 0.01075964, "auxiliary_loss_mlp": 0.0100052, "balance_loss_clip": 0.90303349, "balance_loss_mlp": 0.99883884, "epoch": 0.598809595382673, "flos": 61608061100160.0, "grad_norm": 0.7606024628554326, "language_loss": 0.56480408, "learning_rate": 1.4637010980041215e-06, "loss": 0.5855689, "num_input_tokens_seen": 107378680, "step": 4980, "time_per_iteration": 3.2080395221710205 }, { "auxiliary_loss_clip": 0.0117576, "auxiliary_loss_mlp": 0.01026081, "balance_loss_clip": 1.05129647, "balance_loss_mlp": 1.01895475, "epoch": 0.5989298382733121, "flos": 11801719549440.0, "grad_norm": 1.9573158995543414, "language_loss": 0.89272833, "learning_rate": 1.4629506950786707e-06, "loss": 0.91474676, "num_input_tokens_seen": 107394860, "step": 4981, "time_per_iteration": 3.5111851692199707 }, { "auxiliary_loss_clip": 0.01069863, "auxiliary_loss_mlp": 0.01001911, "balance_loss_clip": 1.01516581, "balance_loss_mlp": 1.00030124, "epoch": 0.5990500811639512, "flos": 60025800021120.0, "grad_norm": 0.8264667938778889, "language_loss": 0.56099391, "learning_rate": 1.4622003736231733e-06, "loss": 0.58171165, "num_input_tokens_seen": 107453850, "step": 4982, "time_per_iteration": 3.1935343742370605 }, { "auxiliary_loss_clip": 0.01173857, "auxiliary_loss_mlp": 0.01024025, "balance_loss_clip": 1.01352298, "balance_loss_mlp": 1.01663411, "epoch": 0.5991703240545903, "flos": 18222726683520.0, "grad_norm": 1.7796261489002208, "language_loss": 0.80673665, "learning_rate": 1.461450133751451e-06, "loss": 0.82871544, "num_input_tokens_seen": 107471920, "step": 4983, "time_per_iteration": 2.6606602668762207 }, { "auxiliary_loss_clip": 0.01175481, "auxiliary_loss_mlp": 0.01024861, "balance_loss_clip": 1.01185167, "balance_loss_mlp": 1.01792848, "epoch": 0.5992905669452293, "flos": 27709894581120.0, "grad_norm": 1.7622741866874854, "language_loss": 0.7593534, "learning_rate": 1.4606999755773153e-06, "loss": 0.78135681, "num_input_tokens_seen": 107493125, "step": 4984, "time_per_iteration": 2.6542890071868896 }, { "auxiliary_loss_clip": 0.01172336, "auxiliary_loss_mlp": 0.01024474, "balance_loss_clip": 1.05014312, "balance_loss_mlp": 1.01755977, "epoch": 0.5994108098358685, "flos": 20449008662400.0, "grad_norm": 1.5078353673139473, "language_loss": 0.82457471, "learning_rate": 1.4599498992145643e-06, "loss": 0.84654284, "num_input_tokens_seen": 107513150, "step": 4985, "time_per_iteration": 2.581233024597168 }, { "auxiliary_loss_clip": 0.01177836, "auxiliary_loss_mlp": 0.01122904, "balance_loss_clip": 0.97518152, "balance_loss_mlp": 0.0, "epoch": 0.5995310527265075, "flos": 22269966595200.0, "grad_norm": 1.7692129612677834, "language_loss": 0.70893961, "learning_rate": 1.4591999047769846e-06, "loss": 0.731947, "num_input_tokens_seen": 107532005, "step": 4986, "time_per_iteration": 2.64294171333313 }, { "auxiliary_loss_clip": 0.01160137, "auxiliary_loss_mlp": 0.01027262, "balance_loss_clip": 0.85614228, "balance_loss_mlp": 1.01990724, "epoch": 0.5996512956171466, "flos": 18916951818240.0, "grad_norm": 1.6665374231107406, "language_loss": 0.75786424, "learning_rate": 1.4584499923783486e-06, "loss": 0.77973819, "num_input_tokens_seen": 107550585, "step": 4987, "time_per_iteration": 3.604705572128296 }, { "auxiliary_loss_clip": 0.01170858, "auxiliary_loss_mlp": 0.0102786, "balance_loss_clip": 0.97401655, "balance_loss_mlp": 1.02072191, "epoch": 0.5997715385077858, "flos": 15370916330880.0, "grad_norm": 1.6453927341934536, "language_loss": 0.76231122, "learning_rate": 1.457700162132419e-06, "loss": 0.78429842, "num_input_tokens_seen": 107567575, "step": 4988, "time_per_iteration": 2.6185405254364014 }, { "auxiliary_loss_clip": 0.01165373, "auxiliary_loss_mlp": 0.01031411, "balance_loss_clip": 0.89665323, "balance_loss_mlp": 1.02384686, "epoch": 0.5998917813984248, "flos": 25264844818560.0, "grad_norm": 2.511572468467276, "language_loss": 0.72510517, "learning_rate": 1.4569504141529433e-06, "loss": 0.74707305, "num_input_tokens_seen": 107585410, "step": 4989, "time_per_iteration": 3.7675373554229736 }, { "auxiliary_loss_clip": 0.01173656, "auxiliary_loss_mlp": 0.01026064, "balance_loss_clip": 1.01404881, "balance_loss_mlp": 1.018327, "epoch": 0.6000120242890639, "flos": 22054502862720.0, "grad_norm": 2.279009500653632, "language_loss": 0.71765614, "learning_rate": 1.456200748553658e-06, "loss": 0.73965341, "num_input_tokens_seen": 107603405, "step": 4990, "time_per_iteration": 3.590041160583496 }, { "auxiliary_loss_clip": 0.01177093, "auxiliary_loss_mlp": 0.01029024, "balance_loss_clip": 1.05204129, "balance_loss_mlp": 1.02205646, "epoch": 0.600132267179703, "flos": 29863421562240.0, "grad_norm": 1.4464099013490201, "language_loss": 0.78534836, "learning_rate": 1.455451165448287e-06, "loss": 0.80740952, "num_input_tokens_seen": 107626060, "step": 4991, "time_per_iteration": 2.8230514526367188 }, { "auxiliary_loss_clip": 0.01171491, "auxiliary_loss_mlp": 0.0102631, "balance_loss_clip": 0.97426039, "balance_loss_mlp": 1.01908302, "epoch": 0.6002525100703421, "flos": 25045358762880.0, "grad_norm": 2.834434575032191, "language_loss": 0.73592675, "learning_rate": 1.4547016649505407e-06, "loss": 0.75790477, "num_input_tokens_seen": 107644070, "step": 4992, "time_per_iteration": 2.6731679439544678 }, { "auxiliary_loss_clip": 0.01162905, "auxiliary_loss_mlp": 0.01031238, "balance_loss_clip": 0.93158114, "balance_loss_mlp": 1.02351916, "epoch": 0.6003727529609811, "flos": 20849592113280.0, "grad_norm": 1.9441324720704534, "language_loss": 0.85185182, "learning_rate": 1.4539522471741193e-06, "loss": 0.87379324, "num_input_tokens_seen": 107661495, "step": 4993, "time_per_iteration": 321.68075823783875 }, { "auxiliary_loss_clip": 0.01176228, "auxiliary_loss_mlp": 0.01029211, "balance_loss_clip": 1.01128864, "balance_loss_mlp": 1.02063429, "epoch": 0.6004929958516203, "flos": 15594604277760.0, "grad_norm": 2.0979119606071586, "language_loss": 0.70800781, "learning_rate": 1.4532029122327067e-06, "loss": 0.73006219, "num_input_tokens_seen": 107678280, "step": 4994, "time_per_iteration": 2.704411745071411 }, { "auxiliary_loss_clip": 0.01165984, "auxiliary_loss_mlp": 0.01027873, "balance_loss_clip": 0.93778384, "balance_loss_mlp": 1.02075624, "epoch": 0.6006132387422594, "flos": 21763267390080.0, "grad_norm": 1.796928660662483, "language_loss": 0.75845385, "learning_rate": 1.4524536602399783e-06, "loss": 0.78039241, "num_input_tokens_seen": 107697370, "step": 4995, "time_per_iteration": 2.7671751976013184 }, { "auxiliary_loss_clip": 0.01167706, "auxiliary_loss_mlp": 0.01027994, "balance_loss_clip": 0.97671103, "balance_loss_mlp": 1.02115774, "epoch": 0.6007334816328984, "flos": 22858542852480.0, "grad_norm": 1.6665760995198922, "language_loss": 0.77365005, "learning_rate": 1.4517044913095938e-06, "loss": 0.79560703, "num_input_tokens_seen": 107717790, "step": 4996, "time_per_iteration": 2.757735252380371 }, { "auxiliary_loss_clip": 0.01172147, "auxiliary_loss_mlp": 0.010305, "balance_loss_clip": 1.01291728, "balance_loss_mlp": 1.0228529, "epoch": 0.6008537245235376, "flos": 28324577047680.0, "grad_norm": 1.5604417571535965, "language_loss": 0.8186152, "learning_rate": 1.4509554055552022e-06, "loss": 0.84064162, "num_input_tokens_seen": 107738020, "step": 4997, "time_per_iteration": 2.7172293663024902 }, { "auxiliary_loss_clip": 0.01167781, "auxiliary_loss_mlp": 0.01032776, "balance_loss_clip": 0.97374141, "balance_loss_mlp": 1.025105, "epoch": 0.6009739674141766, "flos": 20886113266560.0, "grad_norm": 2.3524746151510794, "language_loss": 0.84136719, "learning_rate": 1.450206403090439e-06, "loss": 0.86337262, "num_input_tokens_seen": 107756215, "step": 4998, "time_per_iteration": 2.6787848472595215 }, { "auxiliary_loss_clip": 0.01166531, "auxiliary_loss_mlp": 0.01028716, "balance_loss_clip": 1.012187, "balance_loss_mlp": 1.02136111, "epoch": 0.6010942103048157, "flos": 20481004702080.0, "grad_norm": 2.2563306863911414, "language_loss": 0.86241907, "learning_rate": 1.4494574840289274e-06, "loss": 0.88437158, "num_input_tokens_seen": 107773330, "step": 4999, "time_per_iteration": 2.7049922943115234 }, { "auxiliary_loss_clip": 0.0117903, "auxiliary_loss_mlp": 0.01025396, "balance_loss_clip": 1.01291537, "balance_loss_mlp": 1.01773691, "epoch": 0.6012144531954549, "flos": 23805973935360.0, "grad_norm": 1.736882516438286, "language_loss": 0.7392087, "learning_rate": 1.4487086484842782e-06, "loss": 0.761253, "num_input_tokens_seen": 107791975, "step": 5000, "time_per_iteration": 2.6522634029388428 }, { "auxiliary_loss_clip": 0.01173407, "auxiliary_loss_mlp": 0.01031983, "balance_loss_clip": 1.0519253, "balance_loss_mlp": 1.02499473, "epoch": 0.6013346960860939, "flos": 18988378012800.0, "grad_norm": 1.8430172236746492, "language_loss": 0.60120678, "learning_rate": 1.4479598965700878e-06, "loss": 0.62326068, "num_input_tokens_seen": 107809240, "step": 5001, "time_per_iteration": 2.5957906246185303 }, { "auxiliary_loss_clip": 0.01165414, "auxiliary_loss_mlp": 0.01034683, "balance_loss_clip": 0.9333849, "balance_loss_mlp": 1.02764034, "epoch": 0.601454938976733, "flos": 24025316336640.0, "grad_norm": 2.212631110157406, "language_loss": 0.69169533, "learning_rate": 1.4472112283999427e-06, "loss": 0.7136963, "num_input_tokens_seen": 107827895, "step": 5002, "time_per_iteration": 2.718151807785034 }, { "auxiliary_loss_clip": 0.0117068, "auxiliary_loss_mlp": 0.01026941, "balance_loss_clip": 1.01357484, "balance_loss_mlp": 1.01961839, "epoch": 0.6015751818673721, "flos": 26427129102720.0, "grad_norm": 1.8651789510017303, "language_loss": 0.6903019, "learning_rate": 1.4464626440874143e-06, "loss": 0.71227813, "num_input_tokens_seen": 107847010, "step": 5003, "time_per_iteration": 2.678391933441162 }, { "auxiliary_loss_clip": 0.01176152, "auxiliary_loss_mlp": 0.01026343, "balance_loss_clip": 0.89482534, "balance_loss_mlp": 1.01834965, "epoch": 0.6016954247580112, "flos": 13115260005120.0, "grad_norm": 2.754389267125356, "language_loss": 0.74282336, "learning_rate": 1.4457141437460636e-06, "loss": 0.76484835, "num_input_tokens_seen": 107864235, "step": 5004, "time_per_iteration": 2.748781204223633 }, { "auxiliary_loss_clip": 0.01173354, "auxiliary_loss_mlp": 0.01032076, "balance_loss_clip": 0.97412777, "balance_loss_mlp": 1.0237608, "epoch": 0.6018156676486502, "flos": 23768447201280.0, "grad_norm": 1.6878071813549558, "language_loss": 0.7317332, "learning_rate": 1.444965727489436e-06, "loss": 0.75378752, "num_input_tokens_seen": 107883680, "step": 5005, "time_per_iteration": 2.671736478805542 }, { "auxiliary_loss_clip": 0.01163271, "auxiliary_loss_mlp": 0.01024179, "balance_loss_clip": 0.93151367, "balance_loss_mlp": 1.01733947, "epoch": 0.6019359105392894, "flos": 26469360518400.0, "grad_norm": 2.127298622499147, "language_loss": 0.62797213, "learning_rate": 1.444217395431066e-06, "loss": 0.64984661, "num_input_tokens_seen": 107906220, "step": 5006, "time_per_iteration": 2.8040719032287598 }, { "auxiliary_loss_clip": 0.01073181, "auxiliary_loss_mlp": 0.01012366, "balance_loss_clip": 0.9021076, "balance_loss_mlp": 1.0108937, "epoch": 0.6020561534299285, "flos": 69190849728000.0, "grad_norm": 0.7949881992888984, "language_loss": 0.5586074, "learning_rate": 1.4434691476844755e-06, "loss": 0.57946283, "num_input_tokens_seen": 107967195, "step": 5007, "time_per_iteration": 4.191433429718018 }, { "auxiliary_loss_clip": 0.01170214, "auxiliary_loss_mlp": 0.01034285, "balance_loss_clip": 0.97681576, "balance_loss_mlp": 1.02720976, "epoch": 0.6021763963205675, "flos": 21835304115840.0, "grad_norm": 2.014361084075841, "language_loss": 0.66780698, "learning_rate": 1.4427209843631729e-06, "loss": 0.689852, "num_input_tokens_seen": 107984245, "step": 5008, "time_per_iteration": 2.6844873428344727 }, { "auxiliary_loss_clip": 0.01170541, "auxiliary_loss_mlp": 0.01122473, "balance_loss_clip": 1.05075526, "balance_loss_mlp": 0.0, "epoch": 0.6022966392112067, "flos": 26578636669440.0, "grad_norm": 1.656021281556466, "language_loss": 0.80917406, "learning_rate": 1.4419729055806534e-06, "loss": 0.83210421, "num_input_tokens_seen": 108003680, "step": 5009, "time_per_iteration": 2.683014392852783 }, { "auxiliary_loss_clip": 0.01167111, "auxiliary_loss_mlp": 0.01122777, "balance_loss_clip": 0.97638988, "balance_loss_mlp": 0.0, "epoch": 0.6024168821018457, "flos": 20703722981760.0, "grad_norm": 1.8590208455059514, "language_loss": 0.82114869, "learning_rate": 1.441224911450401e-06, "loss": 0.84404755, "num_input_tokens_seen": 108019635, "step": 5010, "time_per_iteration": 2.6880130767822266 }, { "auxiliary_loss_clip": 0.01174883, "auxiliary_loss_mlp": 0.01028302, "balance_loss_clip": 1.01160014, "balance_loss_mlp": 1.02082467, "epoch": 0.6025371249924848, "flos": 24680973242880.0, "grad_norm": 1.6056521010592602, "language_loss": 0.82147932, "learning_rate": 1.4404770020858851e-06, "loss": 0.84351122, "num_input_tokens_seen": 108039120, "step": 5011, "time_per_iteration": 2.636472463607788 }, { "auxiliary_loss_clip": 0.01166198, "auxiliary_loss_mlp": 0.01026027, "balance_loss_clip": 1.01078081, "balance_loss_mlp": 1.01869559, "epoch": 0.602657367883124, "flos": 25955801815680.0, "grad_norm": 1.583064274549858, "language_loss": 0.85995436, "learning_rate": 1.439729177600563e-06, "loss": 0.88187665, "num_input_tokens_seen": 108059615, "step": 5012, "time_per_iteration": 3.6496455669403076 }, { "auxiliary_loss_clip": 0.01170788, "auxiliary_loss_mlp": 0.01029295, "balance_loss_clip": 1.01333976, "balance_loss_mlp": 1.0222435, "epoch": 0.602777610773763, "flos": 16690633925760.0, "grad_norm": 1.6998217575796017, "language_loss": 0.72765535, "learning_rate": 1.4389814381078793e-06, "loss": 0.74965614, "num_input_tokens_seen": 108078855, "step": 5013, "time_per_iteration": 2.5956473350524902 }, { "auxiliary_loss_clip": 0.01167896, "auxiliary_loss_mlp": 0.01031327, "balance_loss_clip": 0.74541277, "balance_loss_mlp": 1.02435911, "epoch": 0.6028978536644021, "flos": 13334243270400.0, "grad_norm": 1.9861655010421113, "language_loss": 0.80186391, "learning_rate": 1.438233783721265e-06, "loss": 0.82385612, "num_input_tokens_seen": 108095020, "step": 5014, "time_per_iteration": 4.00597071647644 }, { "auxiliary_loss_clip": 0.01168565, "auxiliary_loss_mlp": 0.01026184, "balance_loss_clip": 0.97619456, "balance_loss_mlp": 1.01931119, "epoch": 0.6030180965550412, "flos": 19644825018240.0, "grad_norm": 3.8811942703955333, "language_loss": 0.77908659, "learning_rate": 1.43748621455414e-06, "loss": 0.80103409, "num_input_tokens_seen": 108111455, "step": 5015, "time_per_iteration": 4.112343788146973 }, { "auxiliary_loss_clip": 0.01170076, "auxiliary_loss_mlp": 0.01027048, "balance_loss_clip": 0.97460073, "balance_loss_mlp": 1.01985669, "epoch": 0.6031383394456803, "flos": 14458390289280.0, "grad_norm": 2.2740653326221145, "language_loss": 0.8053689, "learning_rate": 1.4367387307199082e-06, "loss": 0.82734013, "num_input_tokens_seen": 108128305, "step": 5016, "time_per_iteration": 2.712754726409912 }, { "auxiliary_loss_clip": 0.01163316, "auxiliary_loss_mlp": 0.0102624, "balance_loss_clip": 1.00842643, "balance_loss_mlp": 1.01873541, "epoch": 0.6032585823363193, "flos": 13917791623680.0, "grad_norm": 1.971466593697614, "language_loss": 0.82277751, "learning_rate": 1.4359913323319632e-06, "loss": 0.84467304, "num_input_tokens_seen": 108145475, "step": 5017, "time_per_iteration": 2.6487503051757812 }, { "auxiliary_loss_clip": 0.01154343, "auxiliary_loss_mlp": 0.01024809, "balance_loss_clip": 0.855066, "balance_loss_mlp": 1.01708412, "epoch": 0.6033788252269584, "flos": 24353252530560.0, "grad_norm": 1.6573828644384965, "language_loss": 0.77372587, "learning_rate": 1.4352440195036847e-06, "loss": 0.79551739, "num_input_tokens_seen": 108165650, "step": 5018, "time_per_iteration": 2.787829637527466 }, { "auxiliary_loss_clip": 0.01166505, "auxiliary_loss_mlp": 0.01024976, "balance_loss_clip": 0.81538922, "balance_loss_mlp": 1.0178895, "epoch": 0.6034990681175976, "flos": 25521247077120.0, "grad_norm": 2.040986518892681, "language_loss": 0.7989558, "learning_rate": 1.4344967923484395e-06, "loss": 0.82087064, "num_input_tokens_seen": 108187620, "step": 5019, "time_per_iteration": 2.855949878692627 }, { "auxiliary_loss_clip": 0.01168069, "auxiliary_loss_mlp": 0.01029424, "balance_loss_clip": 1.01114678, "balance_loss_mlp": 1.02208686, "epoch": 0.6036193110082366, "flos": 25958387594880.0, "grad_norm": 2.290470542444935, "language_loss": 0.72464275, "learning_rate": 1.433749650979581e-06, "loss": 0.74661773, "num_input_tokens_seen": 108207605, "step": 5020, "time_per_iteration": 2.6892223358154297 }, { "auxiliary_loss_clip": 0.01173858, "auxiliary_loss_mlp": 0.01026012, "balance_loss_clip": 0.93386316, "balance_loss_mlp": 1.01829362, "epoch": 0.6037395538988757, "flos": 25593427457280.0, "grad_norm": 2.265679941164304, "language_loss": 0.67870545, "learning_rate": 1.433002595510451e-06, "loss": 0.7007041, "num_input_tokens_seen": 108226385, "step": 5021, "time_per_iteration": 2.828340530395508 }, { "auxiliary_loss_clip": 0.01166187, "auxiliary_loss_mlp": 0.01123011, "balance_loss_clip": 0.97188616, "balance_loss_mlp": 0.0, "epoch": 0.6038597967895148, "flos": 17816253402240.0, "grad_norm": 1.7357803018255762, "language_loss": 0.71373785, "learning_rate": 1.4322556260543757e-06, "loss": 0.73662984, "num_input_tokens_seen": 108242960, "step": 5022, "time_per_iteration": 2.669541597366333 }, { "auxiliary_loss_clip": 0.01080301, "auxiliary_loss_mlp": 0.01000131, "balance_loss_clip": 0.90401125, "balance_loss_mlp": 0.9984979, "epoch": 0.6039800396801539, "flos": 65169213235200.0, "grad_norm": 0.9021438431877629, "language_loss": 0.62742203, "learning_rate": 1.4315087427246703e-06, "loss": 0.64822632, "num_input_tokens_seen": 108296785, "step": 5023, "time_per_iteration": 3.123610258102417 }, { "auxiliary_loss_clip": 0.01068911, "auxiliary_loss_mlp": 0.01001096, "balance_loss_clip": 1.01381063, "balance_loss_mlp": 0.99943876, "epoch": 0.604100282570793, "flos": 67386409073280.0, "grad_norm": 0.8815009190903949, "language_loss": 0.58555567, "learning_rate": 1.4307619456346372e-06, "loss": 0.60625571, "num_input_tokens_seen": 108341090, "step": 5024, "time_per_iteration": 2.950090169906616 }, { "auxiliary_loss_clip": 0.01171711, "auxiliary_loss_mlp": 0.01028769, "balance_loss_clip": 1.00852597, "balance_loss_mlp": 1.02162838, "epoch": 0.6042205254614321, "flos": 35297495631360.0, "grad_norm": 1.9028580848209, "language_loss": 0.73967886, "learning_rate": 1.430015234897564e-06, "loss": 0.7616837, "num_input_tokens_seen": 108364370, "step": 5025, "time_per_iteration": 2.734583616256714 }, { "auxiliary_loss_clip": 0.01170937, "auxiliary_loss_mlp": 0.01122376, "balance_loss_clip": 1.04976773, "balance_loss_mlp": 0.0, "epoch": 0.6043407683520712, "flos": 45658262206080.0, "grad_norm": 1.6915122553409727, "language_loss": 0.66420937, "learning_rate": 1.4292686106267274e-06, "loss": 0.68714249, "num_input_tokens_seen": 108387220, "step": 5026, "time_per_iteration": 2.836838722229004 }, { "auxiliary_loss_clip": 0.01176462, "auxiliary_loss_mlp": 0.0103143, "balance_loss_clip": 1.01273489, "balance_loss_mlp": 1.0239557, "epoch": 0.6044610112427102, "flos": 16180020138240.0, "grad_norm": 1.8913788564754954, "language_loss": 0.77298802, "learning_rate": 1.4285220729353876e-06, "loss": 0.79506695, "num_input_tokens_seen": 108405760, "step": 5027, "time_per_iteration": 2.64276123046875 }, { "auxiliary_loss_clip": 0.01164736, "auxiliary_loss_mlp": 0.01026418, "balance_loss_clip": 0.96889389, "balance_loss_mlp": 1.01916373, "epoch": 0.6045812541333494, "flos": 13804062186240.0, "grad_norm": 1.9715575311832427, "language_loss": 0.77766597, "learning_rate": 1.4277756219367957e-06, "loss": 0.79957759, "num_input_tokens_seen": 108422785, "step": 5028, "time_per_iteration": 2.699321746826172 }, { "auxiliary_loss_clip": 0.01176474, "auxiliary_loss_mlp": 0.01025227, "balance_loss_clip": 0.93603289, "balance_loss_mlp": 1.01752019, "epoch": 0.6047014970239885, "flos": 19975059682560.0, "grad_norm": 1.9242093916438054, "language_loss": 0.79589856, "learning_rate": 1.4270292577441864e-06, "loss": 0.81791556, "num_input_tokens_seen": 108442290, "step": 5029, "time_per_iteration": 2.677422046661377 }, { "auxiliary_loss_clip": 0.01176091, "auxiliary_loss_mlp": 0.01031069, "balance_loss_clip": 1.01100326, "balance_loss_mlp": 1.02355242, "epoch": 0.6048217399146275, "flos": 25337097025920.0, "grad_norm": 1.645405166339408, "language_loss": 0.72068191, "learning_rate": 1.4262829804707836e-06, "loss": 0.74275351, "num_input_tokens_seen": 108464280, "step": 5030, "time_per_iteration": 2.7149698734283447 }, { "auxiliary_loss_clip": 0.01171465, "auxiliary_loss_mlp": 0.01032689, "balance_loss_clip": 1.00939119, "balance_loss_mlp": 1.02444577, "epoch": 0.6049419828052667, "flos": 26030819370240.0, "grad_norm": 1.432828571300034, "language_loss": 0.69765985, "learning_rate": 1.4255367902297958e-06, "loss": 0.71970141, "num_input_tokens_seen": 108485610, "step": 5031, "time_per_iteration": 2.6996657848358154 }, { "auxiliary_loss_clip": 0.01171098, "auxiliary_loss_mlp": 0.0102532, "balance_loss_clip": 1.05042434, "balance_loss_mlp": 1.01828063, "epoch": 0.6050622256959057, "flos": 14648106948480.0, "grad_norm": 2.1118502022580907, "language_loss": 0.7862289, "learning_rate": 1.4247906871344215e-06, "loss": 0.80819309, "num_input_tokens_seen": 108501005, "step": 5032, "time_per_iteration": 2.6895368099212646 }, { "auxiliary_loss_clip": 0.01162509, "auxiliary_loss_mlp": 0.01024553, "balance_loss_clip": 0.9697845, "balance_loss_mlp": 1.01783299, "epoch": 0.6051824685865448, "flos": 23331450337920.0, "grad_norm": 2.7093102160503824, "language_loss": 0.75189137, "learning_rate": 1.4240446712978415e-06, "loss": 0.77376199, "num_input_tokens_seen": 108519990, "step": 5033, "time_per_iteration": 3.989734411239624 }, { "auxiliary_loss_clip": 0.01174548, "auxiliary_loss_mlp": 0.01026171, "balance_loss_clip": 1.01114452, "balance_loss_mlp": 1.01851141, "epoch": 0.605302711477184, "flos": 27563307177600.0, "grad_norm": 2.3674089327608723, "language_loss": 0.74328959, "learning_rate": 1.423298742833227e-06, "loss": 0.7652967, "num_input_tokens_seen": 108538650, "step": 5034, "time_per_iteration": 2.6990530490875244 }, { "auxiliary_loss_clip": 0.01173756, "auxiliary_loss_mlp": 0.01028924, "balance_loss_clip": 0.93389893, "balance_loss_mlp": 1.02174175, "epoch": 0.605422954367823, "flos": 15154698412800.0, "grad_norm": 1.7266789031868994, "language_loss": 0.71823847, "learning_rate": 1.4225529018537352e-06, "loss": 0.74026525, "num_input_tokens_seen": 108554155, "step": 5035, "time_per_iteration": 2.7026867866516113 }, { "auxiliary_loss_clip": 0.01171127, "auxiliary_loss_mlp": 0.01028899, "balance_loss_clip": 1.05007792, "balance_loss_mlp": 1.02191925, "epoch": 0.6055431972584621, "flos": 27673912131840.0, "grad_norm": 1.8679405997769958, "language_loss": 0.77732933, "learning_rate": 1.4218071484725082e-06, "loss": 0.79932958, "num_input_tokens_seen": 108576275, "step": 5036, "time_per_iteration": 2.652538776397705 }, { "auxiliary_loss_clip": 0.01169381, "auxiliary_loss_mlp": 0.01029724, "balance_loss_clip": 0.97567153, "balance_loss_mlp": 1.02188349, "epoch": 0.6056634401491012, "flos": 19387489006080.0, "grad_norm": 1.9114424971154256, "language_loss": 0.76015389, "learning_rate": 1.4210614828026786e-06, "loss": 0.7821449, "num_input_tokens_seen": 108594125, "step": 5037, "time_per_iteration": 2.68062162399292 }, { "auxiliary_loss_clip": 0.0116904, "auxiliary_loss_mlp": 0.01027202, "balance_loss_clip": 1.04694653, "balance_loss_mlp": 1.02030587, "epoch": 0.6057836830397403, "flos": 24789459294720.0, "grad_norm": 1.462996390645732, "language_loss": 0.74582869, "learning_rate": 1.4203159049573605e-06, "loss": 0.76779115, "num_input_tokens_seen": 108615360, "step": 5038, "time_per_iteration": 3.6866233348846436 }, { "auxiliary_loss_clip": 0.01177349, "auxiliary_loss_mlp": 0.01029436, "balance_loss_clip": 0.97213769, "balance_loss_mlp": 1.02219665, "epoch": 0.6059039259303793, "flos": 20558248899840.0, "grad_norm": 1.9186856201185936, "language_loss": 0.86904937, "learning_rate": 1.4195704150496593e-06, "loss": 0.89111722, "num_input_tokens_seen": 108633075, "step": 5039, "time_per_iteration": 2.7784807682037354 }, { "auxiliary_loss_clip": 0.01174873, "auxiliary_loss_mlp": 0.0102271, "balance_loss_clip": 0.97709143, "balance_loss_mlp": 1.01540291, "epoch": 0.6060241688210185, "flos": 21069724613760.0, "grad_norm": 1.6626566394784767, "language_loss": 0.73637533, "learning_rate": 1.4188250131926639e-06, "loss": 0.75835121, "num_input_tokens_seen": 108651875, "step": 5040, "time_per_iteration": 3.5782103538513184 }, { "auxiliary_loss_clip": 0.0116983, "auxiliary_loss_mlp": 0.01029516, "balance_loss_clip": 0.9726488, "balance_loss_mlp": 1.02106404, "epoch": 0.6061444117116576, "flos": 16361081619840.0, "grad_norm": 1.844985516891751, "language_loss": 0.80596972, "learning_rate": 1.4180796994994525e-06, "loss": 0.82796317, "num_input_tokens_seen": 108669290, "step": 5041, "time_per_iteration": 3.5492374897003174 }, { "auxiliary_loss_clip": 0.01165644, "auxiliary_loss_mlp": 0.01027453, "balance_loss_clip": 0.97153383, "balance_loss_mlp": 1.02061903, "epoch": 0.6062646546022966, "flos": 21507296094720.0, "grad_norm": 1.8968120528794141, "language_loss": 0.72160745, "learning_rate": 1.4173344740830877e-06, "loss": 0.74353838, "num_input_tokens_seen": 108688420, "step": 5042, "time_per_iteration": 2.6662251949310303 }, { "auxiliary_loss_clip": 0.01165943, "auxiliary_loss_mlp": 0.01031052, "balance_loss_clip": 0.97534883, "balance_loss_mlp": 1.0236845, "epoch": 0.6063848974929358, "flos": 38983151283840.0, "grad_norm": 1.4736920293246427, "language_loss": 0.70293814, "learning_rate": 1.4165893370566206e-06, "loss": 0.72490811, "num_input_tokens_seen": 108712175, "step": 5043, "time_per_iteration": 2.806328535079956 }, { "auxiliary_loss_clip": 0.0116615, "auxiliary_loss_mlp": 0.01025905, "balance_loss_clip": 1.00982666, "balance_loss_mlp": 1.01807904, "epoch": 0.6065051403835748, "flos": 19646584784640.0, "grad_norm": 1.5335055092783683, "language_loss": 0.77468729, "learning_rate": 1.4158442885330865e-06, "loss": 0.79660785, "num_input_tokens_seen": 108730745, "step": 5044, "time_per_iteration": 2.6100852489471436 }, { "auxiliary_loss_clip": 0.01163203, "auxiliary_loss_mlp": 0.01025959, "balance_loss_clip": 1.00840497, "balance_loss_mlp": 1.01815701, "epoch": 0.6066253832742139, "flos": 23513086437120.0, "grad_norm": 2.1950885948220726, "language_loss": 0.78702253, "learning_rate": 1.4150993286255094e-06, "loss": 0.80891418, "num_input_tokens_seen": 108749995, "step": 5045, "time_per_iteration": 2.67533278465271 }, { "auxiliary_loss_clip": 0.01169413, "auxiliary_loss_mlp": 0.01026553, "balance_loss_clip": 1.0480684, "balance_loss_mlp": 1.01953721, "epoch": 0.6067456261648531, "flos": 19133708440320.0, "grad_norm": 2.3362947268263987, "language_loss": 0.7989288, "learning_rate": 1.4143544574468993e-06, "loss": 0.8208884, "num_input_tokens_seen": 108768355, "step": 5046, "time_per_iteration": 2.5896360874176025 }, { "auxiliary_loss_clip": 0.01166647, "auxiliary_loss_mlp": 0.01024357, "balance_loss_clip": 1.01160586, "balance_loss_mlp": 1.01776755, "epoch": 0.6068658690554921, "flos": 20520614424960.0, "grad_norm": 1.5448244412277152, "language_loss": 0.8224982, "learning_rate": 1.4136096751102523e-06, "loss": 0.84440827, "num_input_tokens_seen": 108786685, "step": 5047, "time_per_iteration": 2.663482189178467 }, { "auxiliary_loss_clip": 0.01172211, "auxiliary_loss_mlp": 0.01025614, "balance_loss_clip": 0.97283125, "balance_loss_mlp": 1.0186522, "epoch": 0.6069861119461312, "flos": 27374560185600.0, "grad_norm": 2.5632360724866183, "language_loss": 0.83030862, "learning_rate": 1.4128649817285516e-06, "loss": 0.85228682, "num_input_tokens_seen": 108804820, "step": 5048, "time_per_iteration": 2.6880712509155273 }, { "auxiliary_loss_clip": 0.01168538, "auxiliary_loss_mlp": 0.01030236, "balance_loss_clip": 0.97046417, "balance_loss_mlp": 1.02251768, "epoch": 0.6071063548367702, "flos": 25626500904960.0, "grad_norm": 1.7658966638857956, "language_loss": 0.63589895, "learning_rate": 1.412120377414766e-06, "loss": 0.65788674, "num_input_tokens_seen": 108825010, "step": 5049, "time_per_iteration": 2.7260966300964355 }, { "auxiliary_loss_clip": 0.0117243, "auxiliary_loss_mlp": 0.01025705, "balance_loss_clip": 1.05174947, "balance_loss_mlp": 1.0188241, "epoch": 0.6072265977274094, "flos": 24460517520000.0, "grad_norm": 1.4801692185703939, "language_loss": 0.71510625, "learning_rate": 1.4113758622818522e-06, "loss": 0.73708755, "num_input_tokens_seen": 108845075, "step": 5050, "time_per_iteration": 2.86273193359375 }, { "auxiliary_loss_clip": 0.01175652, "auxiliary_loss_mlp": 0.011217, "balance_loss_clip": 0.97479606, "balance_loss_mlp": 0.0, "epoch": 0.6073468406180484, "flos": 18149253413760.0, "grad_norm": 1.8781945404794407, "language_loss": 0.82700741, "learning_rate": 1.410631436442751e-06, "loss": 0.84998095, "num_input_tokens_seen": 108863870, "step": 5051, "time_per_iteration": 2.667175531387329 }, { "auxiliary_loss_clip": 0.01177285, "auxiliary_loss_mlp": 0.01027938, "balance_loss_clip": 1.01166403, "balance_loss_mlp": 1.02020693, "epoch": 0.6074670835086875, "flos": 20697617669760.0, "grad_norm": 1.8436335924086036, "language_loss": 0.86804366, "learning_rate": 1.4098871000103936e-06, "loss": 0.89009589, "num_input_tokens_seen": 108882470, "step": 5052, "time_per_iteration": 2.6355106830596924 }, { "auxiliary_loss_clip": 0.01170221, "auxiliary_loss_mlp": 0.01023138, "balance_loss_clip": 0.97214097, "balance_loss_mlp": 1.01597035, "epoch": 0.6075873263993267, "flos": 23769955572480.0, "grad_norm": 1.6520457527687513, "language_loss": 0.82714558, "learning_rate": 1.409142853097693e-06, "loss": 0.84907919, "num_input_tokens_seen": 108902710, "step": 5053, "time_per_iteration": 2.7476792335510254 }, { "auxiliary_loss_clip": 0.01173928, "auxiliary_loss_mlp": 0.01027749, "balance_loss_clip": 0.97354609, "balance_loss_mlp": 1.0202148, "epoch": 0.6077075692899657, "flos": 24454484035200.0, "grad_norm": 5.182148516454582, "language_loss": 0.79632312, "learning_rate": 1.408398695817553e-06, "loss": 0.81833988, "num_input_tokens_seen": 108919935, "step": 5054, "time_per_iteration": 2.7183568477630615 }, { "auxiliary_loss_clip": 0.01168155, "auxiliary_loss_mlp": 0.01028397, "balance_loss_clip": 0.97190928, "balance_loss_mlp": 1.01993883, "epoch": 0.6078278121806048, "flos": 27382102041600.0, "grad_norm": 1.793171385620784, "language_loss": 0.70343161, "learning_rate": 1.4076546282828593e-06, "loss": 0.72539705, "num_input_tokens_seen": 108942790, "step": 5055, "time_per_iteration": 2.699187755584717 }, { "auxiliary_loss_clip": 0.01169932, "auxiliary_loss_mlp": 0.01030083, "balance_loss_clip": 0.96901429, "balance_loss_mlp": 1.02248597, "epoch": 0.6079480550712439, "flos": 38436447306240.0, "grad_norm": 2.805555169135198, "language_loss": 0.66169685, "learning_rate": 1.4069106506064874e-06, "loss": 0.68369699, "num_input_tokens_seen": 108964215, "step": 5056, "time_per_iteration": 2.8143255710601807 }, { "auxiliary_loss_clip": 0.01167132, "auxiliary_loss_mlp": 0.0102246, "balance_loss_clip": 0.97507131, "balance_loss_mlp": 1.01571846, "epoch": 0.608068297961883, "flos": 25336271013120.0, "grad_norm": 1.6877505271670195, "language_loss": 0.78246677, "learning_rate": 1.4061667629012989e-06, "loss": 0.80436271, "num_input_tokens_seen": 108984885, "step": 5057, "time_per_iteration": 2.669180393218994 }, { "auxiliary_loss_clip": 0.01158584, "auxiliary_loss_mlp": 0.01025823, "balance_loss_clip": 0.97349215, "balance_loss_mlp": 1.01880169, "epoch": 0.608188540852522, "flos": 24202463235840.0, "grad_norm": 1.6070478391916454, "language_loss": 0.83084691, "learning_rate": 1.40542296528014e-06, "loss": 0.85269094, "num_input_tokens_seen": 109004545, "step": 5058, "time_per_iteration": 2.6848108768463135 }, { "auxiliary_loss_clip": 0.01167957, "auxiliary_loss_mlp": 0.0103597, "balance_loss_clip": 1.00965309, "balance_loss_mlp": 1.02802515, "epoch": 0.6083087837431612, "flos": 21284146851840.0, "grad_norm": 2.025885007764381, "language_loss": 0.75925875, "learning_rate": 1.4046792578558452e-06, "loss": 0.78129792, "num_input_tokens_seen": 109022440, "step": 5059, "time_per_iteration": 3.5570240020751953 }, { "auxiliary_loss_clip": 0.01166604, "auxiliary_loss_mlp": 0.01021133, "balance_loss_clip": 0.97318316, "balance_loss_mlp": 1.01428485, "epoch": 0.6084290266338003, "flos": 16471435178880.0, "grad_norm": 2.282328163671358, "language_loss": 0.76012057, "learning_rate": 1.4039356407412325e-06, "loss": 0.78199792, "num_input_tokens_seen": 109035680, "step": 5060, "time_per_iteration": 2.65321683883667 }, { "auxiliary_loss_clip": 0.01072397, "auxiliary_loss_mlp": 0.01002621, "balance_loss_clip": 0.97736216, "balance_loss_mlp": 1.00111854, "epoch": 0.6085492695244393, "flos": 66443574931200.0, "grad_norm": 0.7942883685654797, "language_loss": 0.57259929, "learning_rate": 1.40319211404911e-06, "loss": 0.59334946, "num_input_tokens_seen": 109090680, "step": 5061, "time_per_iteration": 3.1945087909698486 }, { "auxiliary_loss_clip": 0.01173931, "auxiliary_loss_mlp": 0.01029093, "balance_loss_clip": 1.05004346, "balance_loss_mlp": 1.02186251, "epoch": 0.6086695124150785, "flos": 23618986709760.0, "grad_norm": 1.7421953053133394, "language_loss": 0.90589041, "learning_rate": 1.4024486778922691e-06, "loss": 0.92792058, "num_input_tokens_seen": 109108995, "step": 5062, "time_per_iteration": 2.6014606952667236 }, { "auxiliary_loss_clip": 0.01172877, "auxiliary_loss_mlp": 0.0103262, "balance_loss_clip": 0.97039533, "balance_loss_mlp": 1.02528298, "epoch": 0.6087897553057176, "flos": 20157054917760.0, "grad_norm": 1.8079653600584398, "language_loss": 0.77178907, "learning_rate": 1.4017053323834884e-06, "loss": 0.79384398, "num_input_tokens_seen": 109128825, "step": 5063, "time_per_iteration": 2.6701765060424805 }, { "auxiliary_loss_clip": 0.01169668, "auxiliary_loss_mlp": 0.01021572, "balance_loss_clip": 0.97035265, "balance_loss_mlp": 1.01451468, "epoch": 0.6089099981963566, "flos": 25482535194240.0, "grad_norm": 1.9289558039272012, "language_loss": 0.76014829, "learning_rate": 1.4009620776355333e-06, "loss": 0.78206068, "num_input_tokens_seen": 109150425, "step": 5064, "time_per_iteration": 3.744518280029297 }, { "auxiliary_loss_clip": 0.01167235, "auxiliary_loss_mlp": 0.01024848, "balance_loss_clip": 1.00994062, "balance_loss_mlp": 1.01706409, "epoch": 0.6090302410869958, "flos": 25332895134720.0, "grad_norm": 1.6213961748707257, "language_loss": 0.79210126, "learning_rate": 1.4002189137611553e-06, "loss": 0.81402212, "num_input_tokens_seen": 109169765, "step": 5065, "time_per_iteration": 2.797499895095825 }, { "auxiliary_loss_clip": 0.01168093, "auxiliary_loss_mlp": 0.01024031, "balance_loss_clip": 1.01009583, "balance_loss_mlp": 1.01690519, "epoch": 0.6091504839776348, "flos": 23987358639360.0, "grad_norm": 2.5286855888358986, "language_loss": 0.69809139, "learning_rate": 1.3994758408730901e-06, "loss": 0.72001266, "num_input_tokens_seen": 109188950, "step": 5066, "time_per_iteration": 3.634504556655884 }, { "auxiliary_loss_clip": 0.01171869, "auxiliary_loss_mlp": 0.010279, "balance_loss_clip": 0.97574604, "balance_loss_mlp": 1.02039623, "epoch": 0.6092707268682739, "flos": 29643037666560.0, "grad_norm": 1.848742223012662, "language_loss": 0.76431251, "learning_rate": 1.3987328590840629e-06, "loss": 0.7863102, "num_input_tokens_seen": 109209895, "step": 5067, "time_per_iteration": 3.578697919845581 }, { "auxiliary_loss_clip": 0.01165906, "auxiliary_loss_mlp": 0.01024432, "balance_loss_clip": 1.00948906, "balance_loss_mlp": 1.01726413, "epoch": 0.609390969758913, "flos": 24024957200640.0, "grad_norm": 1.7694094087346646, "language_loss": 0.86592126, "learning_rate": 1.397989968506783e-06, "loss": 0.88782465, "num_input_tokens_seen": 109228905, "step": 5068, "time_per_iteration": 2.6851580142974854 }, { "auxiliary_loss_clip": 0.01178715, "auxiliary_loss_mlp": 0.01030918, "balance_loss_clip": 1.05329514, "balance_loss_mlp": 1.02354169, "epoch": 0.6095112126495521, "flos": 11102143288320.0, "grad_norm": 1.9584455846095052, "language_loss": 0.72087407, "learning_rate": 1.3972471692539458e-06, "loss": 0.74297029, "num_input_tokens_seen": 109243620, "step": 5069, "time_per_iteration": 2.537277936935425 }, { "auxiliary_loss_clip": 0.01167691, "auxiliary_loss_mlp": 0.01028087, "balance_loss_clip": 0.97519517, "balance_loss_mlp": 1.02101755, "epoch": 0.6096314555401912, "flos": 17265491187840.0, "grad_norm": 2.1744649141747012, "language_loss": 0.75562739, "learning_rate": 1.3965044614382348e-06, "loss": 0.77758521, "num_input_tokens_seen": 109259070, "step": 5070, "time_per_iteration": 2.665353775024414 }, { "auxiliary_loss_clip": 0.01175706, "auxiliary_loss_mlp": 0.01030654, "balance_loss_clip": 1.0522337, "balance_loss_mlp": 1.02253866, "epoch": 0.6097516984308303, "flos": 21645910679040.0, "grad_norm": 2.3389111750839295, "language_loss": 0.75626427, "learning_rate": 1.3957618451723162e-06, "loss": 0.77832788, "num_input_tokens_seen": 109275100, "step": 5071, "time_per_iteration": 2.70641827583313 }, { "auxiliary_loss_clip": 0.01170899, "auxiliary_loss_mlp": 0.01028725, "balance_loss_clip": 0.97213811, "balance_loss_mlp": 1.02173972, "epoch": 0.6098719413214694, "flos": 27199208966400.0, "grad_norm": 1.7824986862144814, "language_loss": 0.71444488, "learning_rate": 1.3950193205688457e-06, "loss": 0.73644114, "num_input_tokens_seen": 109294825, "step": 5072, "time_per_iteration": 2.7088747024536133 }, { "auxiliary_loss_clip": 0.01169794, "auxiliary_loss_mlp": 0.01025542, "balance_loss_clip": 0.97443074, "balance_loss_mlp": 1.01884866, "epoch": 0.6099921842121084, "flos": 20412954385920.0, "grad_norm": 1.7864431727821086, "language_loss": 0.8367036, "learning_rate": 1.3942768877404627e-06, "loss": 0.85865688, "num_input_tokens_seen": 109313790, "step": 5073, "time_per_iteration": 2.6974055767059326 }, { "auxiliary_loss_clip": 0.01173773, "auxiliary_loss_mlp": 0.01027288, "balance_loss_clip": 1.05080581, "balance_loss_mlp": 1.02043343, "epoch": 0.6101124271027476, "flos": 23366139897600.0, "grad_norm": 3.3887756418030905, "language_loss": 0.73555005, "learning_rate": 1.393534546799795e-06, "loss": 0.75756067, "num_input_tokens_seen": 109333490, "step": 5074, "time_per_iteration": 2.6098763942718506 }, { "auxiliary_loss_clip": 0.01158703, "auxiliary_loss_mlp": 0.01025161, "balance_loss_clip": 0.97214311, "balance_loss_mlp": 1.01800871, "epoch": 0.6102326699933867, "flos": 26687840993280.0, "grad_norm": 1.8211509049046632, "language_loss": 0.68033648, "learning_rate": 1.3927922978594536e-06, "loss": 0.70217514, "num_input_tokens_seen": 109354575, "step": 5075, "time_per_iteration": 2.715085506439209 }, { "auxiliary_loss_clip": 0.0106859, "auxiliary_loss_mlp": 0.00999955, "balance_loss_clip": 0.97667849, "balance_loss_mlp": 0.99832171, "epoch": 0.6103529128840257, "flos": 60644612551680.0, "grad_norm": 0.7883969468300963, "language_loss": 0.57469743, "learning_rate": 1.3920501410320387e-06, "loss": 0.59538293, "num_input_tokens_seen": 109410690, "step": 5076, "time_per_iteration": 3.2082626819610596 }, { "auxiliary_loss_clip": 0.01169745, "auxiliary_loss_mlp": 0.01027953, "balance_loss_clip": 0.97226977, "balance_loss_mlp": 1.02046692, "epoch": 0.6104731557746649, "flos": 19021307806080.0, "grad_norm": 2.244901412988615, "language_loss": 0.76155621, "learning_rate": 1.3913080764301333e-06, "loss": 0.78353322, "num_input_tokens_seen": 109427650, "step": 5077, "time_per_iteration": 2.675795078277588 }, { "auxiliary_loss_clip": 0.01175908, "auxiliary_loss_mlp": 0.01032572, "balance_loss_clip": 0.89479893, "balance_loss_mlp": 1.02519274, "epoch": 0.6105933986653039, "flos": 23366894083200.0, "grad_norm": 1.6919897705342712, "language_loss": 0.71256042, "learning_rate": 1.3905661041663085e-06, "loss": 0.73464519, "num_input_tokens_seen": 109448835, "step": 5078, "time_per_iteration": 2.7585692405700684 }, { "auxiliary_loss_clip": 0.01170912, "auxiliary_loss_mlp": 0.01023763, "balance_loss_clip": 1.01187038, "balance_loss_mlp": 1.01638436, "epoch": 0.610713641555943, "flos": 34637565006720.0, "grad_norm": 2.5213755577175525, "language_loss": 0.6515764, "learning_rate": 1.389824224353122e-06, "loss": 0.67352313, "num_input_tokens_seen": 109470425, "step": 5079, "time_per_iteration": 2.7373359203338623 }, { "auxiliary_loss_clip": 0.01169605, "auxiliary_loss_mlp": 0.01023786, "balance_loss_clip": 1.01239097, "balance_loss_mlp": 1.01651978, "epoch": 0.610833884446582, "flos": 26646471504000.0, "grad_norm": 1.6917616085299172, "language_loss": 0.76801229, "learning_rate": 1.389082437103115e-06, "loss": 0.7899462, "num_input_tokens_seen": 109489695, "step": 5080, "time_per_iteration": 2.719517707824707 }, { "auxiliary_loss_clip": 0.01167588, "auxiliary_loss_mlp": 0.01025948, "balance_loss_clip": 0.93313628, "balance_loss_mlp": 1.01799071, "epoch": 0.6109541273372212, "flos": 21215126868480.0, "grad_norm": 2.0556581281756072, "language_loss": 0.77907181, "learning_rate": 1.3883407425288172e-06, "loss": 0.80100715, "num_input_tokens_seen": 109510030, "step": 5081, "time_per_iteration": 2.7013585567474365 }, { "auxiliary_loss_clip": 0.01163837, "auxiliary_loss_mlp": 0.01026195, "balance_loss_clip": 0.97058707, "balance_loss_mlp": 1.01851201, "epoch": 0.6110743702278603, "flos": 20084084438400.0, "grad_norm": 1.976057999585423, "language_loss": 0.79870266, "learning_rate": 1.3875991407427417e-06, "loss": 0.82060301, "num_input_tokens_seen": 109528255, "step": 5082, "time_per_iteration": 2.7414562702178955 }, { "auxiliary_loss_clip": 0.01078597, "auxiliary_loss_mlp": 0.01000546, "balance_loss_clip": 0.9020462, "balance_loss_mlp": 0.99890101, "epoch": 0.6111946131184993, "flos": 68302957438080.0, "grad_norm": 0.8236426654244312, "language_loss": 0.5822078, "learning_rate": 1.38685763185739e-06, "loss": 0.60299921, "num_input_tokens_seen": 109581915, "step": 5083, "time_per_iteration": 3.2954108715057373 }, { "auxiliary_loss_clip": 0.01170593, "auxiliary_loss_mlp": 0.01026685, "balance_loss_clip": 1.04973602, "balance_loss_mlp": 1.0192821, "epoch": 0.6113148560091385, "flos": 19937676602880.0, "grad_norm": 2.857492418773513, "language_loss": 0.67310822, "learning_rate": 1.3861162159852476e-06, "loss": 0.69508094, "num_input_tokens_seen": 109600050, "step": 5084, "time_per_iteration": 2.6239521503448486 }, { "auxiliary_loss_clip": 0.01175063, "auxiliary_loss_mlp": 0.01029626, "balance_loss_clip": 0.97491431, "balance_loss_mlp": 1.02218771, "epoch": 0.6114350988997775, "flos": 23731854220800.0, "grad_norm": 1.6563588280716481, "language_loss": 0.79512674, "learning_rate": 1.3853748932387875e-06, "loss": 0.8171736, "num_input_tokens_seen": 109620690, "step": 5085, "time_per_iteration": 2.7596635818481445 }, { "auxiliary_loss_clip": 0.0115936, "auxiliary_loss_mlp": 0.0103078, "balance_loss_clip": 0.9709543, "balance_loss_mlp": 1.02295995, "epoch": 0.6115553417904166, "flos": 24023700224640.0, "grad_norm": 2.3436931825481655, "language_loss": 0.75175554, "learning_rate": 1.3846336637304671e-06, "loss": 0.77365696, "num_input_tokens_seen": 109638960, "step": 5086, "time_per_iteration": 3.623995542526245 }, { "auxiliary_loss_clip": 0.01166162, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 0.97462004, "balance_loss_mlp": 1.02146208, "epoch": 0.6116755846810558, "flos": 23733542160000.0, "grad_norm": 1.8164313917848418, "language_loss": 0.82558608, "learning_rate": 1.3838925275727316e-06, "loss": 0.84753686, "num_input_tokens_seen": 109659700, "step": 5087, "time_per_iteration": 2.7765262126922607 }, { "auxiliary_loss_clip": 0.01172532, "auxiliary_loss_mlp": 0.01028055, "balance_loss_clip": 1.05076146, "balance_loss_mlp": 1.02036881, "epoch": 0.6117958275716948, "flos": 18661626967680.0, "grad_norm": 1.7211631433848353, "language_loss": 0.78923976, "learning_rate": 1.3831514848780089e-06, "loss": 0.81124562, "num_input_tokens_seen": 109679275, "step": 5088, "time_per_iteration": 2.601261615753174 }, { "auxiliary_loss_clip": 0.01163722, "auxiliary_loss_mlp": 0.01028668, "balance_loss_clip": 1.01060867, "balance_loss_mlp": 1.02118182, "epoch": 0.6119160704623339, "flos": 16471183783680.0, "grad_norm": 2.1645442605940572, "language_loss": 0.91705406, "learning_rate": 1.3824105357587152e-06, "loss": 0.93897796, "num_input_tokens_seen": 109696380, "step": 5089, "time_per_iteration": 2.6084957122802734 }, { "auxiliary_loss_clip": 0.01162951, "auxiliary_loss_mlp": 0.01026671, "balance_loss_clip": 0.97102356, "balance_loss_mlp": 1.01930428, "epoch": 0.612036313352973, "flos": 23915465568000.0, "grad_norm": 2.6315678025552063, "language_loss": 0.82311237, "learning_rate": 1.381669680327253e-06, "loss": 0.84500855, "num_input_tokens_seen": 109718060, "step": 5090, "time_per_iteration": 3.6556310653686523 }, { "auxiliary_loss_clip": 0.01164187, "auxiliary_loss_mlp": 0.01030571, "balance_loss_clip": 0.97469926, "balance_loss_mlp": 1.0229919, "epoch": 0.6121565562436121, "flos": 26974766833920.0, "grad_norm": 2.0506503010728236, "language_loss": 0.70666647, "learning_rate": 1.380928918696008e-06, "loss": 0.72861409, "num_input_tokens_seen": 109736830, "step": 5091, "time_per_iteration": 2.6931121349334717 }, { "auxiliary_loss_clip": 0.01170572, "auxiliary_loss_mlp": 0.0102751, "balance_loss_clip": 1.00981927, "balance_loss_mlp": 1.01982057, "epoch": 0.6122767991342511, "flos": 15668867646720.0, "grad_norm": 2.253643345617657, "language_loss": 0.71635991, "learning_rate": 1.3801882509773548e-06, "loss": 0.73834074, "num_input_tokens_seen": 109754690, "step": 5092, "time_per_iteration": 3.5368456840515137 }, { "auxiliary_loss_clip": 0.01163367, "auxiliary_loss_mlp": 0.01031047, "balance_loss_clip": 1.00784183, "balance_loss_mlp": 1.02291107, "epoch": 0.6123970420248903, "flos": 27964321591680.0, "grad_norm": 1.4858692515833114, "language_loss": 0.8181771, "learning_rate": 1.3794476772836503e-06, "loss": 0.84012127, "num_input_tokens_seen": 109775790, "step": 5093, "time_per_iteration": 3.609910726547241 }, { "auxiliary_loss_clip": 0.01161306, "auxiliary_loss_mlp": 0.01025222, "balance_loss_clip": 0.93561941, "balance_loss_mlp": 1.01795626, "epoch": 0.6125172849155294, "flos": 21468727866240.0, "grad_norm": 1.5409156636897028, "language_loss": 0.84695387, "learning_rate": 1.3787071977272402e-06, "loss": 0.86881912, "num_input_tokens_seen": 109795050, "step": 5094, "time_per_iteration": 2.758223056793213 }, { "auxiliary_loss_clip": 0.01165418, "auxiliary_loss_mlp": 0.01022872, "balance_loss_clip": 0.8989473, "balance_loss_mlp": 1.01562405, "epoch": 0.6126375278061684, "flos": 16248321849600.0, "grad_norm": 2.9258025480919643, "language_loss": 0.71621943, "learning_rate": 1.3779668124204535e-06, "loss": 0.73810232, "num_input_tokens_seen": 109811465, "step": 5095, "time_per_iteration": 2.7176833152770996 }, { "auxiliary_loss_clip": 0.01163593, "auxiliary_loss_mlp": 0.01027123, "balance_loss_clip": 0.97361797, "balance_loss_mlp": 1.01965475, "epoch": 0.6127577706968076, "flos": 20448865008000.0, "grad_norm": 1.6318455071939066, "language_loss": 0.80890012, "learning_rate": 1.3772265214756074e-06, "loss": 0.83080727, "num_input_tokens_seen": 109831225, "step": 5096, "time_per_iteration": 2.681588888168335 }, { "auxiliary_loss_clip": 0.01171973, "auxiliary_loss_mlp": 0.01028135, "balance_loss_clip": 1.00898516, "balance_loss_mlp": 1.02073181, "epoch": 0.6128780135874466, "flos": 18260397072000.0, "grad_norm": 2.124353579391665, "language_loss": 0.74981141, "learning_rate": 1.3764863250050025e-06, "loss": 0.77181256, "num_input_tokens_seen": 109849465, "step": 5097, "time_per_iteration": 2.588686227798462 }, { "auxiliary_loss_clip": 0.01165721, "auxiliary_loss_mlp": 0.01027193, "balance_loss_clip": 0.93208051, "balance_loss_mlp": 1.0199213, "epoch": 0.6129982564780857, "flos": 24937088192640.0, "grad_norm": 1.7522997005132805, "language_loss": 0.80575562, "learning_rate": 1.3757462231209272e-06, "loss": 0.82768476, "num_input_tokens_seen": 109869770, "step": 5098, "time_per_iteration": 2.7357351779937744 }, { "auxiliary_loss_clip": 0.01162601, "auxiliary_loss_mlp": 0.01028918, "balance_loss_clip": 0.97309858, "balance_loss_mlp": 1.02165771, "epoch": 0.6131184993687249, "flos": 22492038430080.0, "grad_norm": 1.819666846796521, "language_loss": 0.88787699, "learning_rate": 1.3750062159356525e-06, "loss": 0.90979218, "num_input_tokens_seen": 109889120, "step": 5099, "time_per_iteration": 2.732231855392456 }, { "auxiliary_loss_clip": 0.0115694, "auxiliary_loss_mlp": 0.01022752, "balance_loss_clip": 0.93357742, "balance_loss_mlp": 1.01529527, "epoch": 0.6132387422593639, "flos": 15885839750400.0, "grad_norm": 1.6347410546052752, "language_loss": 0.82913506, "learning_rate": 1.3742663035614382e-06, "loss": 0.85093188, "num_input_tokens_seen": 109906490, "step": 5100, "time_per_iteration": 2.650526523590088 }, { "auxiliary_loss_clip": 0.01173591, "auxiliary_loss_mlp": 0.01026813, "balance_loss_clip": 1.05032277, "balance_loss_mlp": 1.01950574, "epoch": 0.613358985150003, "flos": 25411539962880.0, "grad_norm": 1.6385025588939435, "language_loss": 0.80213243, "learning_rate": 1.3735264861105283e-06, "loss": 0.8241365, "num_input_tokens_seen": 109927130, "step": 5101, "time_per_iteration": 2.6835505962371826 }, { "auxiliary_loss_clip": 0.01167388, "auxiliary_loss_mlp": 0.01030915, "balance_loss_clip": 0.93376517, "balance_loss_mlp": 1.02396846, "epoch": 0.6134792280406421, "flos": 21361283308800.0, "grad_norm": 4.021898268686976, "language_loss": 0.78171575, "learning_rate": 1.372786763695152e-06, "loss": 0.80369884, "num_input_tokens_seen": 109945890, "step": 5102, "time_per_iteration": 2.691572427749634 }, { "auxiliary_loss_clip": 0.01172253, "auxiliary_loss_mlp": 0.01026884, "balance_loss_clip": 1.01124322, "balance_loss_mlp": 1.01876593, "epoch": 0.6135994709312812, "flos": 21211248199680.0, "grad_norm": 1.6674976761077636, "language_loss": 0.77229649, "learning_rate": 1.3720471364275257e-06, "loss": 0.79428786, "num_input_tokens_seen": 109965535, "step": 5103, "time_per_iteration": 2.6854259967803955 }, { "auxiliary_loss_clip": 0.01162796, "auxiliary_loss_mlp": 0.01123086, "balance_loss_clip": 0.9326911, "balance_loss_mlp": 0.0, "epoch": 0.6137197138219203, "flos": 14794047907200.0, "grad_norm": 2.097120597200302, "language_loss": 0.77987558, "learning_rate": 1.3713076044198486e-06, "loss": 0.80273438, "num_input_tokens_seen": 109982345, "step": 5104, "time_per_iteration": 2.676046371459961 }, { "auxiliary_loss_clip": 0.01160571, "auxiliary_loss_mlp": 0.01024238, "balance_loss_clip": 0.97080451, "balance_loss_mlp": 1.01693916, "epoch": 0.6138399567125594, "flos": 20084515401600.0, "grad_norm": 2.3237227982301976, "language_loss": 0.80850339, "learning_rate": 1.3705681677843086e-06, "loss": 0.83035147, "num_input_tokens_seen": 110000940, "step": 5105, "time_per_iteration": 2.743741750717163 }, { "auxiliary_loss_clip": 0.01068293, "auxiliary_loss_mlp": 0.01002312, "balance_loss_clip": 1.01388752, "balance_loss_mlp": 1.00075078, "epoch": 0.6139601996031985, "flos": 60123838193280.0, "grad_norm": 0.781158858008484, "language_loss": 0.60631371, "learning_rate": 1.3698288266330768e-06, "loss": 0.62701976, "num_input_tokens_seen": 110061565, "step": 5106, "time_per_iteration": 3.2781553268432617 }, { "auxiliary_loss_clip": 0.01168836, "auxiliary_loss_mlp": 0.01026875, "balance_loss_clip": 0.97706348, "balance_loss_mlp": 1.01995778, "epoch": 0.6140804424938375, "flos": 23586703361280.0, "grad_norm": 2.615278354060527, "language_loss": 0.72866988, "learning_rate": 1.3690895810783113e-06, "loss": 0.75062692, "num_input_tokens_seen": 110080360, "step": 5107, "time_per_iteration": 2.6816482543945312 }, { "auxiliary_loss_clip": 0.01174968, "auxiliary_loss_mlp": 0.01122932, "balance_loss_clip": 0.85645932, "balance_loss_mlp": 0.0, "epoch": 0.6142006853844767, "flos": 21398199511680.0, "grad_norm": 2.2471045660608, "language_loss": 0.71227193, "learning_rate": 1.3683504312321543e-06, "loss": 0.73525095, "num_input_tokens_seen": 110100695, "step": 5108, "time_per_iteration": 2.862395763397217 }, { "auxiliary_loss_clip": 0.01173895, "auxiliary_loss_mlp": 0.01025451, "balance_loss_clip": 1.01021266, "balance_loss_mlp": 1.01841736, "epoch": 0.6143209282751158, "flos": 12057367622400.0, "grad_norm": 1.7506186919713371, "language_loss": 0.79979837, "learning_rate": 1.3676113772067355e-06, "loss": 0.82179183, "num_input_tokens_seen": 110117750, "step": 5109, "time_per_iteration": 2.9608728885650635 }, { "auxiliary_loss_clip": 0.01173315, "auxiliary_loss_mlp": 0.01028076, "balance_loss_clip": 0.89721334, "balance_loss_mlp": 1.02009463, "epoch": 0.6144411711657548, "flos": 25082274965760.0, "grad_norm": 1.838831790264973, "language_loss": 0.72533357, "learning_rate": 1.3668724191141671e-06, "loss": 0.74734747, "num_input_tokens_seen": 110137020, "step": 5110, "time_per_iteration": 2.776017904281616 }, { "auxiliary_loss_clip": 0.01168901, "auxiliary_loss_mlp": 0.01025679, "balance_loss_clip": 0.94048285, "balance_loss_mlp": 1.01796651, "epoch": 0.6145614140563939, "flos": 20114069316480.0, "grad_norm": 1.881121728124457, "language_loss": 0.66134882, "learning_rate": 1.3661335570665493e-06, "loss": 0.68329465, "num_input_tokens_seen": 110154930, "step": 5111, "time_per_iteration": 3.697387933731079 }, { "auxiliary_loss_clip": 0.01173003, "auxiliary_loss_mlp": 0.01029975, "balance_loss_clip": 0.97464591, "balance_loss_mlp": 1.02247643, "epoch": 0.614681656947033, "flos": 16800376953600.0, "grad_norm": 3.289901846442159, "language_loss": 0.70051515, "learning_rate": 1.3653947911759676e-06, "loss": 0.72254491, "num_input_tokens_seen": 110172480, "step": 5112, "time_per_iteration": 2.671170711517334 }, { "auxiliary_loss_clip": 0.01159553, "auxiliary_loss_mlp": 0.01031783, "balance_loss_clip": 0.89493501, "balance_loss_mlp": 1.02368307, "epoch": 0.6148018998376721, "flos": 38801587011840.0, "grad_norm": 1.6274641485251453, "language_loss": 0.74412328, "learning_rate": 1.3646561215544904e-06, "loss": 0.76603663, "num_input_tokens_seen": 110197120, "step": 5113, "time_per_iteration": 2.948216676712036 }, { "auxiliary_loss_clip": 0.01168566, "auxiliary_loss_mlp": 0.01024654, "balance_loss_clip": 1.01090288, "balance_loss_mlp": 1.01700115, "epoch": 0.6149221427283111, "flos": 23327032965120.0, "grad_norm": 1.9778871140150691, "language_loss": 0.79641962, "learning_rate": 1.363917548314176e-06, "loss": 0.81835186, "num_input_tokens_seen": 110216385, "step": 5114, "time_per_iteration": 2.6274309158325195 }, { "auxiliary_loss_clip": 0.01177111, "auxiliary_loss_mlp": 0.01024548, "balance_loss_clip": 1.01092696, "balance_loss_mlp": 1.01725876, "epoch": 0.6150423856189503, "flos": 22379494141440.0, "grad_norm": 1.5910605650244694, "language_loss": 0.73367655, "learning_rate": 1.3631790715670626e-06, "loss": 0.7556932, "num_input_tokens_seen": 110234790, "step": 5115, "time_per_iteration": 2.6772048473358154 }, { "auxiliary_loss_clip": 0.01150929, "auxiliary_loss_mlp": 0.01023383, "balance_loss_clip": 0.81934547, "balance_loss_mlp": 1.01630807, "epoch": 0.6151626285095894, "flos": 18692078722560.0, "grad_norm": 6.349920378318319, "language_loss": 0.85614753, "learning_rate": 1.3624406914251783e-06, "loss": 0.87789065, "num_input_tokens_seen": 110251910, "step": 5116, "time_per_iteration": 3.741163969039917 }, { "auxiliary_loss_clip": 0.01169828, "auxiliary_loss_mlp": 0.0102505, "balance_loss_clip": 1.01045251, "balance_loss_mlp": 1.01789176, "epoch": 0.6152828714002284, "flos": 15851688894720.0, "grad_norm": 1.9900499957939326, "language_loss": 0.88324034, "learning_rate": 1.3617024080005335e-06, "loss": 0.9051891, "num_input_tokens_seen": 110268810, "step": 5117, "time_per_iteration": 2.7104341983795166 }, { "auxiliary_loss_clip": 0.01173644, "auxiliary_loss_mlp": 0.01122746, "balance_loss_clip": 0.9729172, "balance_loss_mlp": 0.0, "epoch": 0.6154031142908676, "flos": 24869792062080.0, "grad_norm": 1.7616398376649378, "language_loss": 0.74229538, "learning_rate": 1.3609642214051266e-06, "loss": 0.76525927, "num_input_tokens_seen": 110293035, "step": 5118, "time_per_iteration": 3.622174024581909 }, { "auxiliary_loss_clip": 0.0116299, "auxiliary_loss_mlp": 0.01024432, "balance_loss_clip": 0.97308463, "balance_loss_mlp": 1.01676679, "epoch": 0.6155233571815066, "flos": 19244744357760.0, "grad_norm": 1.7766426550925385, "language_loss": 0.66218597, "learning_rate": 1.3602261317509385e-06, "loss": 0.68406016, "num_input_tokens_seen": 110309695, "step": 5119, "time_per_iteration": 2.6710779666900635 }, { "auxiliary_loss_clip": 0.01175843, "auxiliary_loss_mlp": 0.01026751, "balance_loss_clip": 1.01365089, "balance_loss_mlp": 1.0192349, "epoch": 0.6156436000721457, "flos": 18770077105920.0, "grad_norm": 2.60946760296633, "language_loss": 0.82222772, "learning_rate": 1.3594881391499387e-06, "loss": 0.84425366, "num_input_tokens_seen": 110328610, "step": 5120, "time_per_iteration": 2.6860201358795166 }, { "auxiliary_loss_clip": 0.01174142, "auxiliary_loss_mlp": 0.01025181, "balance_loss_clip": 0.97489285, "balance_loss_mlp": 1.01804018, "epoch": 0.6157638429627849, "flos": 18041198325120.0, "grad_norm": 2.318533750832802, "language_loss": 0.79307759, "learning_rate": 1.3587502437140778e-06, "loss": 0.81507087, "num_input_tokens_seen": 110346775, "step": 5121, "time_per_iteration": 2.619142770767212 }, { "auxiliary_loss_clip": 0.01172425, "auxiliary_loss_mlp": 0.01027732, "balance_loss_clip": 0.97128278, "balance_loss_mlp": 1.02086258, "epoch": 0.6158840858534239, "flos": 25556726736000.0, "grad_norm": 2.177568775158125, "language_loss": 0.84791517, "learning_rate": 1.3580124455552952e-06, "loss": 0.86991668, "num_input_tokens_seen": 110366140, "step": 5122, "time_per_iteration": 2.7820191383361816 }, { "auxiliary_loss_clip": 0.01172796, "auxiliary_loss_mlp": 0.01122499, "balance_loss_clip": 1.01234424, "balance_loss_mlp": 0.0, "epoch": 0.616004328744063, "flos": 24640788902400.0, "grad_norm": 1.6307965817850631, "language_loss": 0.87007535, "learning_rate": 1.3572747447855148e-06, "loss": 0.89302826, "num_input_tokens_seen": 110386550, "step": 5123, "time_per_iteration": 2.7277889251708984 }, { "auxiliary_loss_clip": 0.01174297, "auxiliary_loss_mlp": 0.0102804, "balance_loss_clip": 1.05057967, "balance_loss_mlp": 1.01992774, "epoch": 0.6161245716347021, "flos": 21689686379520.0, "grad_norm": 1.8107880713866467, "language_loss": 0.69154847, "learning_rate": 1.356537141516644e-06, "loss": 0.71357179, "num_input_tokens_seen": 110403970, "step": 5124, "time_per_iteration": 2.685330390930176 }, { "auxiliary_loss_clip": 0.01171919, "auxiliary_loss_mlp": 0.01026316, "balance_loss_clip": 1.01493406, "balance_loss_mlp": 1.01911628, "epoch": 0.6162448145253412, "flos": 35189225061120.0, "grad_norm": 1.855653394508742, "language_loss": 0.61854666, "learning_rate": 1.3557996358605775e-06, "loss": 0.64052904, "num_input_tokens_seen": 110423890, "step": 5125, "time_per_iteration": 2.8005893230438232 }, { "auxiliary_loss_clip": 0.01170184, "auxiliary_loss_mlp": 0.01024517, "balance_loss_clip": 1.01102018, "balance_loss_mlp": 1.01749527, "epoch": 0.6163650574159802, "flos": 21615279356160.0, "grad_norm": 2.1328725418052135, "language_loss": 0.70044863, "learning_rate": 1.3550622279291941e-06, "loss": 0.72239566, "num_input_tokens_seen": 110442035, "step": 5126, "time_per_iteration": 2.677809000015259 }, { "auxiliary_loss_clip": 0.01154754, "auxiliary_loss_mlp": 0.01026559, "balance_loss_clip": 0.8929792, "balance_loss_mlp": 1.01947236, "epoch": 0.6164853003066194, "flos": 24572163968640.0, "grad_norm": 1.4469548120213547, "language_loss": 0.83093703, "learning_rate": 1.354324917834358e-06, "loss": 0.85275018, "num_input_tokens_seen": 110463280, "step": 5127, "time_per_iteration": 2.7677581310272217 }, { "auxiliary_loss_clip": 0.01161308, "auxiliary_loss_mlp": 0.01123034, "balance_loss_clip": 0.85660285, "balance_loss_mlp": 0.0, "epoch": 0.6166055431972585, "flos": 21835986474240.0, "grad_norm": 1.7103413446590028, "language_loss": 0.76817405, "learning_rate": 1.353587705687918e-06, "loss": 0.79101741, "num_input_tokens_seen": 110481455, "step": 5128, "time_per_iteration": 2.7403719425201416 }, { "auxiliary_loss_clip": 0.01178051, "auxiliary_loss_mlp": 0.01029411, "balance_loss_clip": 0.97612375, "balance_loss_mlp": 1.02160871, "epoch": 0.6167257860878975, "flos": 17785262943360.0, "grad_norm": 2.5639164810286617, "language_loss": 0.72665828, "learning_rate": 1.3528505916017096e-06, "loss": 0.74873292, "num_input_tokens_seen": 110499155, "step": 5129, "time_per_iteration": 2.6679186820983887 }, { "auxiliary_loss_clip": 0.01169112, "auxiliary_loss_mlp": 0.01027957, "balance_loss_clip": 1.00960374, "balance_loss_mlp": 1.02038074, "epoch": 0.6168460289785367, "flos": 23214811898880.0, "grad_norm": 2.541170488111723, "language_loss": 0.88814008, "learning_rate": 1.3521135756875514e-06, "loss": 0.91011077, "num_input_tokens_seen": 110515470, "step": 5130, "time_per_iteration": 2.58976149559021 }, { "auxiliary_loss_clip": 0.01155883, "auxiliary_loss_mlp": 0.01026519, "balance_loss_clip": 0.85636258, "balance_loss_mlp": 1.01950383, "epoch": 0.6169662718691757, "flos": 26213281482240.0, "grad_norm": 1.4477185632211904, "language_loss": 0.86174226, "learning_rate": 1.3513766580572496e-06, "loss": 0.88356626, "num_input_tokens_seen": 110538290, "step": 5131, "time_per_iteration": 2.9317967891693115 }, { "auxiliary_loss_clip": 0.01170926, "auxiliary_loss_mlp": 0.01020983, "balance_loss_clip": 1.01271558, "balance_loss_mlp": 1.01418257, "epoch": 0.6170865147598148, "flos": 19026120228480.0, "grad_norm": 2.2090537608701593, "language_loss": 0.77303147, "learning_rate": 1.3506398388225924e-06, "loss": 0.7949506, "num_input_tokens_seen": 110555610, "step": 5132, "time_per_iteration": 2.6159582138061523 }, { "auxiliary_loss_clip": 0.01170218, "auxiliary_loss_mlp": 0.01029511, "balance_loss_clip": 1.05010688, "balance_loss_mlp": 1.02221251, "epoch": 0.617206757650454, "flos": 18260361158400.0, "grad_norm": 1.605494202489554, "language_loss": 0.71628386, "learning_rate": 1.349903118095355e-06, "loss": 0.73828119, "num_input_tokens_seen": 110574745, "step": 5133, "time_per_iteration": 2.6602354049682617 }, { "auxiliary_loss_clip": 0.01177075, "auxiliary_loss_mlp": 0.01024587, "balance_loss_clip": 1.01222825, "balance_loss_mlp": 1.01750636, "epoch": 0.617327000541093, "flos": 18186959715840.0, "grad_norm": 1.6032144572970708, "language_loss": 0.73505092, "learning_rate": 1.349166495987298e-06, "loss": 0.75706756, "num_input_tokens_seen": 110593310, "step": 5134, "time_per_iteration": 2.6422648429870605 }, { "auxiliary_loss_clip": 0.01084014, "auxiliary_loss_mlp": 0.01001103, "balance_loss_clip": 0.95353186, "balance_loss_mlp": 0.99938625, "epoch": 0.6174472434317321, "flos": 61833796122240.0, "grad_norm": 0.836756107084329, "language_loss": 0.60917735, "learning_rate": 1.348429972610166e-06, "loss": 0.63002849, "num_input_tokens_seen": 110657615, "step": 5135, "time_per_iteration": 3.287794589996338 }, { "auxiliary_loss_clip": 0.01089805, "auxiliary_loss_mlp": 0.01000309, "balance_loss_clip": 0.88354844, "balance_loss_mlp": 0.99874717, "epoch": 0.6175674863223712, "flos": 71230970494080.0, "grad_norm": 0.8511215956885204, "language_loss": 0.57886875, "learning_rate": 1.3476935480756897e-06, "loss": 0.59976989, "num_input_tokens_seen": 110714365, "step": 5136, "time_per_iteration": 3.210911273956299 }, { "auxiliary_loss_clip": 0.01158544, "auxiliary_loss_mlp": 0.01029312, "balance_loss_clip": 0.9332298, "balance_loss_mlp": 1.02177179, "epoch": 0.6176877292130103, "flos": 21835447770240.0, "grad_norm": 2.0498866553764827, "language_loss": 0.75300533, "learning_rate": 1.346957222495583e-06, "loss": 0.77488387, "num_input_tokens_seen": 110732160, "step": 5137, "time_per_iteration": 3.74656081199646 }, { "auxiliary_loss_clip": 0.01178098, "auxiliary_loss_mlp": 0.01122566, "balance_loss_clip": 0.97688907, "balance_loss_mlp": 0.0, "epoch": 0.6178079721036493, "flos": 17741738638080.0, "grad_norm": 2.6503567501687098, "language_loss": 0.7083019, "learning_rate": 1.3462209959815466e-06, "loss": 0.73130846, "num_input_tokens_seen": 110746900, "step": 5138, "time_per_iteration": 2.6938540935516357 }, { "auxiliary_loss_clip": 0.01173535, "auxiliary_loss_mlp": 0.01030201, "balance_loss_clip": 0.97493649, "balance_loss_mlp": 1.02267325, "epoch": 0.6179282149942885, "flos": 22633131052800.0, "grad_norm": 1.7668153993644513, "language_loss": 0.74189246, "learning_rate": 1.345484868645265e-06, "loss": 0.76392978, "num_input_tokens_seen": 110765710, "step": 5139, "time_per_iteration": 2.8336353302001953 }, { "auxiliary_loss_clip": 0.01177753, "auxiliary_loss_mlp": 0.01024465, "balance_loss_clip": 0.93530083, "balance_loss_mlp": 1.01752067, "epoch": 0.6180484578849276, "flos": 22310330503680.0, "grad_norm": 2.108132268764228, "language_loss": 0.78680897, "learning_rate": 1.3447488405984088e-06, "loss": 0.80883116, "num_input_tokens_seen": 110783970, "step": 5140, "time_per_iteration": 2.7559008598327637 }, { "auxiliary_loss_clip": 0.01166563, "auxiliary_loss_mlp": 0.01029194, "balance_loss_clip": 0.97237551, "balance_loss_mlp": 1.02169538, "epoch": 0.6181687007755666, "flos": 35225458905600.0, "grad_norm": 2.743270818347703, "language_loss": 0.6946072, "learning_rate": 1.3440129119526322e-06, "loss": 0.71656477, "num_input_tokens_seen": 110806395, "step": 5141, "time_per_iteration": 2.8307430744171143 }, { "auxiliary_loss_clip": 0.01069135, "auxiliary_loss_mlp": 0.01005332, "balance_loss_clip": 1.01444125, "balance_loss_mlp": 1.00374627, "epoch": 0.6182889436662057, "flos": 61547370094080.0, "grad_norm": 1.5174722751227943, "language_loss": 0.51189142, "learning_rate": 1.3432770828195762e-06, "loss": 0.53263611, "num_input_tokens_seen": 110867380, "step": 5142, "time_per_iteration": 3.3678927421569824 }, { "auxiliary_loss_clip": 0.01158168, "auxiliary_loss_mlp": 0.01030016, "balance_loss_clip": 0.93150014, "balance_loss_mlp": 1.02264595, "epoch": 0.6184091865568448, "flos": 19609991804160.0, "grad_norm": 2.6948225908697547, "language_loss": 0.70692092, "learning_rate": 1.3425413533108635e-06, "loss": 0.72880274, "num_input_tokens_seen": 110885980, "step": 5143, "time_per_iteration": 3.6640286445617676 }, { "auxiliary_loss_clip": 0.01171091, "auxiliary_loss_mlp": 0.0102843, "balance_loss_clip": 0.90133113, "balance_loss_mlp": 1.02145028, "epoch": 0.6185294294474839, "flos": 23586882929280.0, "grad_norm": 1.981436932787451, "language_loss": 0.71016192, "learning_rate": 1.341805723538105e-06, "loss": 0.73215711, "num_input_tokens_seen": 110906085, "step": 5144, "time_per_iteration": 4.521790504455566 }, { "auxiliary_loss_clip": 0.01176112, "auxiliary_loss_mlp": 0.01027227, "balance_loss_clip": 0.97392523, "balance_loss_mlp": 1.02021503, "epoch": 0.618649672338123, "flos": 26762032535040.0, "grad_norm": 1.5756777844494658, "language_loss": 0.77588665, "learning_rate": 1.3410701936128948e-06, "loss": 0.79792011, "num_input_tokens_seen": 110928865, "step": 5145, "time_per_iteration": 2.6688072681427 }, { "auxiliary_loss_clip": 0.01170944, "auxiliary_loss_mlp": 0.01029101, "balance_loss_clip": 1.01307118, "balance_loss_mlp": 1.021981, "epoch": 0.6187699152287621, "flos": 14456630522880.0, "grad_norm": 2.3996157865982215, "language_loss": 0.84965694, "learning_rate": 1.340334763646812e-06, "loss": 0.87165731, "num_input_tokens_seen": 110943000, "step": 5146, "time_per_iteration": 2.637018918991089 }, { "auxiliary_loss_clip": 0.01174908, "auxiliary_loss_mlp": 0.01027727, "balance_loss_clip": 1.0497303, "balance_loss_mlp": 1.01979327, "epoch": 0.6188901581194012, "flos": 20084766796800.0, "grad_norm": 1.6593610644060053, "language_loss": 0.74490333, "learning_rate": 1.3395994337514218e-06, "loss": 0.76692975, "num_input_tokens_seen": 110963170, "step": 5147, "time_per_iteration": 2.598961591720581 }, { "auxiliary_loss_clip": 0.01161665, "auxiliary_loss_mlp": 0.01024086, "balance_loss_clip": 1.00921583, "balance_loss_mlp": 1.01727295, "epoch": 0.6190104010100402, "flos": 25700728360320.0, "grad_norm": 1.6498413938421788, "language_loss": 0.78661752, "learning_rate": 1.3388642040382725e-06, "loss": 0.80847502, "num_input_tokens_seen": 110983595, "step": 5148, "time_per_iteration": 2.6911680698394775 }, { "auxiliary_loss_clip": 0.01167641, "auxiliary_loss_mlp": 0.01028083, "balance_loss_clip": 0.9309603, "balance_loss_mlp": 1.02054906, "epoch": 0.6191306439006794, "flos": 30442372974720.0, "grad_norm": 1.5919160508964596, "language_loss": 0.84300351, "learning_rate": 1.3381290746188975e-06, "loss": 0.86496073, "num_input_tokens_seen": 111002965, "step": 5149, "time_per_iteration": 2.8394083976745605 }, { "auxiliary_loss_clip": 0.01175318, "auxiliary_loss_mlp": 0.01029665, "balance_loss_clip": 1.01743877, "balance_loss_mlp": 1.02230418, "epoch": 0.6192508867913185, "flos": 26685793918080.0, "grad_norm": 1.96265967094198, "language_loss": 0.6710304, "learning_rate": 1.3373940456048152e-06, "loss": 0.69308031, "num_input_tokens_seen": 111022990, "step": 5150, "time_per_iteration": 2.6831839084625244 }, { "auxiliary_loss_clip": 0.01171203, "auxiliary_loss_mlp": 0.0102619, "balance_loss_clip": 1.05071425, "balance_loss_mlp": 1.0190022, "epoch": 0.6193711296819575, "flos": 36722036090880.0, "grad_norm": 1.5667323494368979, "language_loss": 0.58823007, "learning_rate": 1.3366591171075299e-06, "loss": 0.61020398, "num_input_tokens_seen": 111046495, "step": 5151, "time_per_iteration": 2.806478977203369 }, { "auxiliary_loss_clip": 0.01168842, "auxiliary_loss_mlp": 0.01026713, "balance_loss_clip": 0.97377992, "balance_loss_mlp": 1.01883376, "epoch": 0.6194913725725967, "flos": 25192556697600.0, "grad_norm": 1.8310216267563213, "language_loss": 0.91136098, "learning_rate": 1.335924289238529e-06, "loss": 0.93331647, "num_input_tokens_seen": 111065705, "step": 5152, "time_per_iteration": 2.813859701156616 }, { "auxiliary_loss_clip": 0.01177205, "auxiliary_loss_mlp": 0.01123216, "balance_loss_clip": 1.01661193, "balance_loss_mlp": 0.0, "epoch": 0.6196116154632357, "flos": 21178821196800.0, "grad_norm": 1.6156622882325755, "language_loss": 0.76993591, "learning_rate": 1.3351895621092859e-06, "loss": 0.79294008, "num_input_tokens_seen": 111086050, "step": 5153, "time_per_iteration": 2.723015308380127 }, { "auxiliary_loss_clip": 0.01144627, "auxiliary_loss_mlp": 0.01027932, "balance_loss_clip": 0.77230799, "balance_loss_mlp": 1.02049351, "epoch": 0.6197318583538748, "flos": 16253744803200.0, "grad_norm": 2.084759566670751, "language_loss": 0.76635647, "learning_rate": 1.3344549358312567e-06, "loss": 0.788082, "num_input_tokens_seen": 111104450, "step": 5154, "time_per_iteration": 2.9589200019836426 }, { "auxiliary_loss_clip": 0.01176772, "auxiliary_loss_mlp": 0.01025527, "balance_loss_clip": 1.01309729, "balance_loss_mlp": 1.01799893, "epoch": 0.619852101244514, "flos": 24425612478720.0, "grad_norm": 1.8957683201636908, "language_loss": 0.77965367, "learning_rate": 1.3337204105158852e-06, "loss": 0.80167663, "num_input_tokens_seen": 111123320, "step": 5155, "time_per_iteration": 3.240668296813965 }, { "auxiliary_loss_clip": 0.01149545, "auxiliary_loss_mlp": 0.01023553, "balance_loss_clip": 0.9258914, "balance_loss_mlp": 1.01660895, "epoch": 0.619972344135153, "flos": 16727298733440.0, "grad_norm": 1.8300587253700602, "language_loss": 0.72808355, "learning_rate": 1.332985986274597e-06, "loss": 0.74981451, "num_input_tokens_seen": 111140950, "step": 5156, "time_per_iteration": 2.8121705055236816 }, { "auxiliary_loss_clip": 0.01162524, "auxiliary_loss_mlp": 0.01121836, "balance_loss_clip": 0.86066788, "balance_loss_mlp": 0.0, "epoch": 0.6200925870257921, "flos": 12495190498560.0, "grad_norm": 1.9401159847603493, "language_loss": 0.75185388, "learning_rate": 1.3322516632188047e-06, "loss": 0.77469754, "num_input_tokens_seen": 111157845, "step": 5157, "time_per_iteration": 2.8073208332061768 }, { "auxiliary_loss_clip": 0.01163388, "auxiliary_loss_mlp": 0.01030256, "balance_loss_clip": 0.93371463, "balance_loss_mlp": 1.02319312, "epoch": 0.6202128299164312, "flos": 26539350168960.0, "grad_norm": 1.731475882080453, "language_loss": 0.67074561, "learning_rate": 1.3315174414599045e-06, "loss": 0.69268203, "num_input_tokens_seen": 111179165, "step": 5158, "time_per_iteration": 2.7982733249664307 }, { "auxiliary_loss_clip": 0.0116463, "auxiliary_loss_mlp": 0.01026648, "balance_loss_clip": 1.00985384, "balance_loss_mlp": 1.01926577, "epoch": 0.6203330728070703, "flos": 18770508069120.0, "grad_norm": 1.6003825836281782, "language_loss": 0.75222766, "learning_rate": 1.3307833211092768e-06, "loss": 0.77414042, "num_input_tokens_seen": 111197830, "step": 5159, "time_per_iteration": 2.6258270740509033 }, { "auxiliary_loss_clip": 0.01175278, "auxiliary_loss_mlp": 0.01023162, "balance_loss_clip": 1.05332303, "balance_loss_mlp": 1.01626527, "epoch": 0.6204533156977093, "flos": 20629782835200.0, "grad_norm": 1.5668857827691793, "language_loss": 0.749466, "learning_rate": 1.3300493022782873e-06, "loss": 0.7714504, "num_input_tokens_seen": 111218400, "step": 5160, "time_per_iteration": 2.6662561893463135 }, { "auxiliary_loss_clip": 0.01156022, "auxiliary_loss_mlp": 0.01123269, "balance_loss_clip": 0.89539123, "balance_loss_mlp": 0.0, "epoch": 0.6205735585883485, "flos": 17348050598400.0, "grad_norm": 2.9366996072770384, "language_loss": 0.72450036, "learning_rate": 1.3293153850782855e-06, "loss": 0.74729329, "num_input_tokens_seen": 111236720, "step": 5161, "time_per_iteration": 2.7111101150512695 }, { "auxiliary_loss_clip": 0.01158788, "auxiliary_loss_mlp": 0.0102981, "balance_loss_clip": 0.93386841, "balance_loss_mlp": 1.02139974, "epoch": 0.6206938014789876, "flos": 22965017742720.0, "grad_norm": 1.791093709821655, "language_loss": 0.71062887, "learning_rate": 1.3285815696206069e-06, "loss": 0.73251486, "num_input_tokens_seen": 111258265, "step": 5162, "time_per_iteration": 2.7876901626586914 }, { "auxiliary_loss_clip": 0.01167342, "auxiliary_loss_mlp": 0.0102938, "balance_loss_clip": 0.93115652, "balance_loss_mlp": 1.02157199, "epoch": 0.6208140443696266, "flos": 23983192661760.0, "grad_norm": 2.172231731000855, "language_loss": 0.76983011, "learning_rate": 1.32784785601657e-06, "loss": 0.79179734, "num_input_tokens_seen": 111277675, "step": 5163, "time_per_iteration": 4.057806015014648 }, { "auxiliary_loss_clip": 0.01173517, "auxiliary_loss_mlp": 0.01026586, "balance_loss_clip": 0.97194636, "balance_loss_mlp": 1.01927209, "epoch": 0.6209342872602658, "flos": 35077291303680.0, "grad_norm": 1.6441276364580137, "language_loss": 0.7403996, "learning_rate": 1.3271142443774798e-06, "loss": 0.76240063, "num_input_tokens_seen": 111299910, "step": 5164, "time_per_iteration": 2.776503086090088 }, { "auxiliary_loss_clip": 0.01169462, "auxiliary_loss_mlp": 0.01024371, "balance_loss_clip": 0.97578788, "balance_loss_mlp": 1.01741469, "epoch": 0.6210545301509048, "flos": 26979327861120.0, "grad_norm": 1.8650580755921193, "language_loss": 0.81352365, "learning_rate": 1.3263807348146228e-06, "loss": 0.83546197, "num_input_tokens_seen": 111319765, "step": 5165, "time_per_iteration": 2.8114101886749268 }, { "auxiliary_loss_clip": 0.01165265, "auxiliary_loss_mlp": 0.01027201, "balance_loss_clip": 0.96969646, "balance_loss_mlp": 1.01985502, "epoch": 0.6211747730415439, "flos": 33618240852480.0, "grad_norm": 2.3683299052668225, "language_loss": 0.73466468, "learning_rate": 1.3256473274392733e-06, "loss": 0.75658935, "num_input_tokens_seen": 111341110, "step": 5166, "time_per_iteration": 2.800865411758423 }, { "auxiliary_loss_clip": 0.01174396, "auxiliary_loss_mlp": 0.01026941, "balance_loss_clip": 1.05153465, "balance_loss_mlp": 1.01910949, "epoch": 0.6212950159321831, "flos": 34167099646080.0, "grad_norm": 1.770076622670259, "language_loss": 0.69903183, "learning_rate": 1.3249140223626873e-06, "loss": 0.7210452, "num_input_tokens_seen": 111362730, "step": 5167, "time_per_iteration": 2.7441906929016113 }, { "auxiliary_loss_clip": 0.01168002, "auxiliary_loss_mlp": 0.01024639, "balance_loss_clip": 1.01218486, "balance_loss_mlp": 1.01748681, "epoch": 0.6214152588228221, "flos": 27965758135680.0, "grad_norm": 2.164341818551392, "language_loss": 0.75417578, "learning_rate": 1.3241808196961077e-06, "loss": 0.77610219, "num_input_tokens_seen": 111383855, "step": 5168, "time_per_iteration": 3.6443674564361572 }, { "auxiliary_loss_clip": 0.01156987, "auxiliary_loss_mlp": 0.01021047, "balance_loss_clip": 0.97205049, "balance_loss_mlp": 1.01429045, "epoch": 0.6215355017134612, "flos": 20230204965120.0, "grad_norm": 1.6973813241441724, "language_loss": 0.70465803, "learning_rate": 1.3234477195507608e-06, "loss": 0.7264384, "num_input_tokens_seen": 111402685, "step": 5169, "time_per_iteration": 2.6465513706207275 }, { "auxiliary_loss_clip": 0.01171374, "auxiliary_loss_mlp": 0.0102542, "balance_loss_clip": 0.93677711, "balance_loss_mlp": 1.01770711, "epoch": 0.6216557446041003, "flos": 41428129219200.0, "grad_norm": 1.8398232566643073, "language_loss": 0.62234229, "learning_rate": 1.322714722037857e-06, "loss": 0.64431024, "num_input_tokens_seen": 111424130, "step": 5170, "time_per_iteration": 3.8020007610321045 }, { "auxiliary_loss_clip": 0.01176229, "auxiliary_loss_mlp": 0.01033777, "balance_loss_clip": 0.93457907, "balance_loss_mlp": 1.0258373, "epoch": 0.6217759874947394, "flos": 27928770105600.0, "grad_norm": 1.8864215402609985, "language_loss": 0.77485114, "learning_rate": 1.321981827268591e-06, "loss": 0.79695117, "num_input_tokens_seen": 111444785, "step": 5171, "time_per_iteration": 3.675651788711548 }, { "auxiliary_loss_clip": 0.01172838, "auxiliary_loss_mlp": 0.01024353, "balance_loss_clip": 0.97249645, "balance_loss_mlp": 1.01758218, "epoch": 0.6218962303853784, "flos": 21765673601280.0, "grad_norm": 1.9950093654419827, "language_loss": 0.81414926, "learning_rate": 1.3212490353541426e-06, "loss": 0.8361212, "num_input_tokens_seen": 111467045, "step": 5172, "time_per_iteration": 2.6907148361206055 }, { "auxiliary_loss_clip": 0.01174196, "auxiliary_loss_mlp": 0.010297, "balance_loss_clip": 1.05148911, "balance_loss_mlp": 1.02233863, "epoch": 0.6220164732760175, "flos": 21246260981760.0, "grad_norm": 2.750267467236352, "language_loss": 0.80284733, "learning_rate": 1.3205163464056762e-06, "loss": 0.82488632, "num_input_tokens_seen": 111483650, "step": 5173, "time_per_iteration": 2.591557502746582 }, { "auxiliary_loss_clip": 0.01169733, "auxiliary_loss_mlp": 0.01026717, "balance_loss_clip": 1.01190436, "balance_loss_mlp": 1.01927781, "epoch": 0.6221367161666567, "flos": 26136360506880.0, "grad_norm": 1.884058777130553, "language_loss": 0.72916996, "learning_rate": 1.319783760534339e-06, "loss": 0.7511344, "num_input_tokens_seen": 111502895, "step": 5174, "time_per_iteration": 2.7457847595214844 }, { "auxiliary_loss_clip": 0.01171293, "auxiliary_loss_mlp": 0.01033058, "balance_loss_clip": 1.01190555, "balance_loss_mlp": 1.02568173, "epoch": 0.6222569590572957, "flos": 16284196558080.0, "grad_norm": 2.057516992787506, "language_loss": 0.75359476, "learning_rate": 1.319051277851266e-06, "loss": 0.77563828, "num_input_tokens_seen": 111519180, "step": 5175, "time_per_iteration": 2.703245162963867 }, { "auxiliary_loss_clip": 0.01171764, "auxiliary_loss_mlp": 0.01024055, "balance_loss_clip": 1.01038742, "balance_loss_mlp": 1.01732898, "epoch": 0.6223772019479348, "flos": 18223840005120.0, "grad_norm": 2.52238292789011, "language_loss": 0.8405447, "learning_rate": 1.3183188984675716e-06, "loss": 0.86250293, "num_input_tokens_seen": 111537545, "step": 5176, "time_per_iteration": 2.6350066661834717 }, { "auxiliary_loss_clip": 0.01168907, "auxiliary_loss_mlp": 0.01027895, "balance_loss_clip": 0.97524995, "balance_loss_mlp": 1.02060282, "epoch": 0.6224974448385739, "flos": 27489797994240.0, "grad_norm": 2.0921720017197147, "language_loss": 0.71327657, "learning_rate": 1.3175866224943586e-06, "loss": 0.73524463, "num_input_tokens_seen": 111556265, "step": 5177, "time_per_iteration": 2.7389261722564697 }, { "auxiliary_loss_clip": 0.01175443, "auxiliary_loss_mlp": 0.01030239, "balance_loss_clip": 0.97452754, "balance_loss_mlp": 1.02231133, "epoch": 0.622617687729213, "flos": 19791951125760.0, "grad_norm": 2.5185628099374053, "language_loss": 0.73520911, "learning_rate": 1.316854450042712e-06, "loss": 0.75726593, "num_input_tokens_seen": 111574205, "step": 5178, "time_per_iteration": 2.7142696380615234 }, { "auxiliary_loss_clip": 0.01176684, "auxiliary_loss_mlp": 0.01022338, "balance_loss_clip": 1.0121491, "balance_loss_mlp": 1.01480412, "epoch": 0.622737930619852, "flos": 23038886062080.0, "grad_norm": 1.7640312458526837, "language_loss": 0.74651122, "learning_rate": 1.3161223812237024e-06, "loss": 0.76850152, "num_input_tokens_seen": 111593560, "step": 5179, "time_per_iteration": 2.703012466430664 }, { "auxiliary_loss_clip": 0.01170764, "auxiliary_loss_mlp": 0.0102276, "balance_loss_clip": 1.04821444, "balance_loss_mlp": 1.0151422, "epoch": 0.6228581735104912, "flos": 12634271959680.0, "grad_norm": 2.2435883148697395, "language_loss": 0.85249031, "learning_rate": 1.3153904161483842e-06, "loss": 0.87442553, "num_input_tokens_seen": 111608860, "step": 5180, "time_per_iteration": 2.67287278175354 }, { "auxiliary_loss_clip": 0.01163793, "auxiliary_loss_mlp": 0.01032394, "balance_loss_clip": 0.93373716, "balance_loss_mlp": 1.0247705, "epoch": 0.6229784164011303, "flos": 23802813538560.0, "grad_norm": 2.1757492914789207, "language_loss": 0.85373265, "learning_rate": 1.3146585549277953e-06, "loss": 0.87569451, "num_input_tokens_seen": 111627500, "step": 5181, "time_per_iteration": 2.7620763778686523 }, { "auxiliary_loss_clip": 0.01182654, "auxiliary_loss_mlp": 0.01028145, "balance_loss_clip": 0.97626412, "balance_loss_mlp": 1.0207752, "epoch": 0.6230986592917693, "flos": 22414219614720.0, "grad_norm": 1.9121272635033535, "language_loss": 0.78597248, "learning_rate": 1.3139267976729591e-06, "loss": 0.80808043, "num_input_tokens_seen": 111647690, "step": 5182, "time_per_iteration": 2.7486302852630615 }, { "auxiliary_loss_clip": 0.01175835, "auxiliary_loss_mlp": 0.01025577, "balance_loss_clip": 1.0122999, "balance_loss_mlp": 1.01815653, "epoch": 0.6232189021824085, "flos": 34528217028480.0, "grad_norm": 1.6335450145571684, "language_loss": 0.72081411, "learning_rate": 1.3131951444948815e-06, "loss": 0.74282819, "num_input_tokens_seen": 111667090, "step": 5183, "time_per_iteration": 2.720386028289795 }, { "auxiliary_loss_clip": 0.01177311, "auxiliary_loss_mlp": 0.01029069, "balance_loss_clip": 0.9773227, "balance_loss_mlp": 1.02119827, "epoch": 0.6233391450730476, "flos": 22237000888320.0, "grad_norm": 1.801287807331537, "language_loss": 0.75996321, "learning_rate": 1.3124635955045546e-06, "loss": 0.78202701, "num_input_tokens_seen": 111686905, "step": 5184, "time_per_iteration": 2.6923959255218506 }, { "auxiliary_loss_clip": 0.01152383, "auxiliary_loss_mlp": 0.0112254, "balance_loss_clip": 0.89204252, "balance_loss_mlp": 0.0, "epoch": 0.6234593879636866, "flos": 20332693445760.0, "grad_norm": 1.835102466432563, "language_loss": 0.84318155, "learning_rate": 1.3117321508129537e-06, "loss": 0.8659308, "num_input_tokens_seen": 111704985, "step": 5185, "time_per_iteration": 2.6915438175201416 }, { "auxiliary_loss_clip": 0.01172912, "auxiliary_loss_mlp": 0.0102491, "balance_loss_clip": 0.97517133, "balance_loss_mlp": 1.01731622, "epoch": 0.6235796308543258, "flos": 20664903358080.0, "grad_norm": 1.406022613210267, "language_loss": 0.76422167, "learning_rate": 1.3110008105310388e-06, "loss": 0.78619993, "num_input_tokens_seen": 111724805, "step": 5186, "time_per_iteration": 2.6984620094299316 }, { "auxiliary_loss_clip": 0.0117324, "auxiliary_loss_mlp": 0.01028618, "balance_loss_clip": 1.04872978, "balance_loss_mlp": 1.021191, "epoch": 0.6236998737449648, "flos": 26618641441920.0, "grad_norm": 1.6119197147476583, "language_loss": 0.77547181, "learning_rate": 1.3102695747697526e-06, "loss": 0.79749042, "num_input_tokens_seen": 111747675, "step": 5187, "time_per_iteration": 2.654129981994629 }, { "auxiliary_loss_clip": 0.01170678, "auxiliary_loss_mlp": 0.01030627, "balance_loss_clip": 0.86066914, "balance_loss_mlp": 1.02289009, "epoch": 0.6238201166356039, "flos": 12674599954560.0, "grad_norm": 2.4430061838072996, "language_loss": 0.90343988, "learning_rate": 1.3095384436400237e-06, "loss": 0.92545289, "num_input_tokens_seen": 111759205, "step": 5188, "time_per_iteration": 2.7159860134124756 }, { "auxiliary_loss_clip": 0.01177846, "auxiliary_loss_mlp": 0.01027953, "balance_loss_clip": 0.97310412, "balance_loss_mlp": 1.02041268, "epoch": 0.623940359526243, "flos": 10452160730880.0, "grad_norm": 2.1097464253096225, "language_loss": 0.8206141, "learning_rate": 1.3088074172527633e-06, "loss": 0.84267211, "num_input_tokens_seen": 111776335, "step": 5189, "time_per_iteration": 3.681229591369629 }, { "auxiliary_loss_clip": 0.01171975, "auxiliary_loss_mlp": 0.01028687, "balance_loss_clip": 0.97073013, "balance_loss_mlp": 1.02124834, "epoch": 0.6240606024168821, "flos": 29059525226880.0, "grad_norm": 2.014748275551256, "language_loss": 0.7144717, "learning_rate": 1.3080764957188684e-06, "loss": 0.73647827, "num_input_tokens_seen": 111796580, "step": 5190, "time_per_iteration": 2.684199810028076 }, { "auxiliary_loss_clip": 0.01168953, "auxiliary_loss_mlp": 0.01029578, "balance_loss_clip": 0.89337528, "balance_loss_mlp": 1.02177584, "epoch": 0.6241808453075212, "flos": 22018089450240.0, "grad_norm": 1.6845030729054675, "language_loss": 0.70580631, "learning_rate": 1.3073456791492192e-06, "loss": 0.72779161, "num_input_tokens_seen": 111816290, "step": 5191, "time_per_iteration": 2.8000335693359375 }, { "auxiliary_loss_clip": 0.01171098, "auxiliary_loss_mlp": 0.01031013, "balance_loss_clip": 0.9719224, "balance_loss_mlp": 1.02398276, "epoch": 0.6243010881981603, "flos": 21138708683520.0, "grad_norm": 1.9013233096861357, "language_loss": 0.78327835, "learning_rate": 1.3066149676546801e-06, "loss": 0.80529952, "num_input_tokens_seen": 111834470, "step": 5192, "time_per_iteration": 2.6556384563446045 }, { "auxiliary_loss_clip": 0.01171348, "auxiliary_loss_mlp": 0.01024461, "balance_loss_clip": 0.97747934, "balance_loss_mlp": 1.01692986, "epoch": 0.6244213310887994, "flos": 22344948236160.0, "grad_norm": 2.2153442298496113, "language_loss": 0.66038167, "learning_rate": 1.3058843613460985e-06, "loss": 0.68233967, "num_input_tokens_seen": 111852410, "step": 5193, "time_per_iteration": 2.6778433322906494 }, { "auxiliary_loss_clip": 0.01180964, "auxiliary_loss_mlp": 0.01029696, "balance_loss_clip": 0.93691003, "balance_loss_mlp": 1.02153039, "epoch": 0.6245415739794384, "flos": 15231978524160.0, "grad_norm": 1.8254483075805887, "language_loss": 0.74098051, "learning_rate": 1.3051538603343075e-06, "loss": 0.76308709, "num_input_tokens_seen": 111870340, "step": 5194, "time_per_iteration": 3.6838760375976562 }, { "auxiliary_loss_clip": 0.01172945, "auxiliary_loss_mlp": 0.01027283, "balance_loss_clip": 1.01521814, "balance_loss_mlp": 1.02035379, "epoch": 0.6246618168700776, "flos": 18879891960960.0, "grad_norm": 2.031132943761988, "language_loss": 0.67664945, "learning_rate": 1.3044234647301235e-06, "loss": 0.69865167, "num_input_tokens_seen": 111888365, "step": 5195, "time_per_iteration": 2.6641032695770264 }, { "auxiliary_loss_clip": 0.01166824, "auxiliary_loss_mlp": 0.0102472, "balance_loss_clip": 1.01127291, "balance_loss_mlp": 1.01753759, "epoch": 0.6247820597607167, "flos": 14319201087360.0, "grad_norm": 1.637958211180795, "language_loss": 0.72239506, "learning_rate": 1.303693174644347e-06, "loss": 0.74431044, "num_input_tokens_seen": 111905840, "step": 5196, "time_per_iteration": 3.5495119094848633 }, { "auxiliary_loss_clip": 0.0116359, "auxiliary_loss_mlp": 0.01029604, "balance_loss_clip": 0.97245085, "balance_loss_mlp": 1.02194452, "epoch": 0.6249023026513557, "flos": 22637979388800.0, "grad_norm": 1.8092131227228347, "language_loss": 0.80479336, "learning_rate": 1.3029629901877625e-06, "loss": 0.82672524, "num_input_tokens_seen": 111925215, "step": 5197, "time_per_iteration": 3.634117603302002 }, { "auxiliary_loss_clip": 0.01179432, "auxiliary_loss_mlp": 0.01033743, "balance_loss_clip": 1.01318216, "balance_loss_mlp": 1.02570188, "epoch": 0.6250225455419949, "flos": 20266690204800.0, "grad_norm": 2.6406079194185437, "language_loss": 0.77620387, "learning_rate": 1.3022329114711376e-06, "loss": 0.79833567, "num_input_tokens_seen": 111943925, "step": 5198, "time_per_iteration": 2.6164722442626953 }, { "auxiliary_loss_clip": 0.01164836, "auxiliary_loss_mlp": 0.01027272, "balance_loss_clip": 0.97286958, "balance_loss_mlp": 1.01979792, "epoch": 0.6251427884326339, "flos": 23437853400960.0, "grad_norm": 1.8396213053144865, "language_loss": 0.69614643, "learning_rate": 1.3015029386052256e-06, "loss": 0.71806753, "num_input_tokens_seen": 111964095, "step": 5199, "time_per_iteration": 2.711641550064087 }, { "auxiliary_loss_clip": 0.01182094, "auxiliary_loss_mlp": 0.01033186, "balance_loss_clip": 0.93658811, "balance_loss_mlp": 1.02505302, "epoch": 0.625263031323273, "flos": 31723055464320.0, "grad_norm": 1.9305192476339044, "language_loss": 0.73149371, "learning_rate": 1.3007730717007622e-06, "loss": 0.75364649, "num_input_tokens_seen": 111984910, "step": 5200, "time_per_iteration": 2.8176238536834717 }, { "auxiliary_loss_clip": 0.01176538, "auxiliary_loss_mlp": 0.0102611, "balance_loss_clip": 1.05098426, "balance_loss_mlp": 1.01796174, "epoch": 0.6253832742139122, "flos": 24134341092480.0, "grad_norm": 1.6951273122277561, "language_loss": 0.75531423, "learning_rate": 1.3000433108684676e-06, "loss": 0.77734077, "num_input_tokens_seen": 112005410, "step": 5201, "time_per_iteration": 2.6652424335479736 }, { "auxiliary_loss_clip": 0.01169225, "auxiliary_loss_mlp": 0.01030865, "balance_loss_clip": 1.0119133, "balance_loss_mlp": 1.02370644, "epoch": 0.6255035171045512, "flos": 27668812400640.0, "grad_norm": 5.307970214042332, "language_loss": 0.80171007, "learning_rate": 1.2993136562190467e-06, "loss": 0.82371104, "num_input_tokens_seen": 112024530, "step": 5202, "time_per_iteration": 2.6720194816589355 }, { "auxiliary_loss_clip": 0.01174342, "auxiliary_loss_mlp": 0.01021851, "balance_loss_clip": 0.97485799, "balance_loss_mlp": 1.01453769, "epoch": 0.6256237599951903, "flos": 20227798753920.0, "grad_norm": 1.484075597642645, "language_loss": 0.70234942, "learning_rate": 1.2985841078631871e-06, "loss": 0.72431129, "num_input_tokens_seen": 112043850, "step": 5203, "time_per_iteration": 2.712054967880249 }, { "auxiliary_loss_clip": 0.01161356, "auxiliary_loss_mlp": 0.01029306, "balance_loss_clip": 0.85359567, "balance_loss_mlp": 1.02189696, "epoch": 0.6257440028858293, "flos": 24170574936960.0, "grad_norm": 1.5813507043044366, "language_loss": 0.78009319, "learning_rate": 1.2978546659115608e-06, "loss": 0.80199981, "num_input_tokens_seen": 112061930, "step": 5204, "time_per_iteration": 2.8245487213134766 }, { "auxiliary_loss_clip": 0.01172885, "auxiliary_loss_mlp": 0.01028279, "balance_loss_clip": 0.9734208, "balance_loss_mlp": 1.02092957, "epoch": 0.6258642457764685, "flos": 15851940289920.0, "grad_norm": 2.4911493980279062, "language_loss": 0.85274875, "learning_rate": 1.2971253304748228e-06, "loss": 0.87476039, "num_input_tokens_seen": 112079645, "step": 5205, "time_per_iteration": 2.669036626815796 }, { "auxiliary_loss_clip": 0.01174921, "auxiliary_loss_mlp": 0.01027415, "balance_loss_clip": 1.01433325, "balance_loss_mlp": 1.01967216, "epoch": 0.6259844886671075, "flos": 11911354836480.0, "grad_norm": 1.6908961135025182, "language_loss": 0.74613202, "learning_rate": 1.296396101663614e-06, "loss": 0.76815546, "num_input_tokens_seen": 112096205, "step": 5206, "time_per_iteration": 2.586435079574585 }, { "auxiliary_loss_clip": 0.01174472, "auxiliary_loss_mlp": 0.0102987, "balance_loss_clip": 1.01289701, "balance_loss_mlp": 1.02252948, "epoch": 0.6261047315577466, "flos": 15887958652800.0, "grad_norm": 2.1143751653177087, "language_loss": 0.83947134, "learning_rate": 1.2956669795885565e-06, "loss": 0.86151475, "num_input_tokens_seen": 112112835, "step": 5207, "time_per_iteration": 2.663515567779541 }, { "auxiliary_loss_clip": 0.01164289, "auxiliary_loss_mlp": 0.01029701, "balance_loss_clip": 0.93730485, "balance_loss_mlp": 1.02205324, "epoch": 0.6262249744483858, "flos": 31248926916480.0, "grad_norm": 2.0013543806319958, "language_loss": 0.68094963, "learning_rate": 1.294937964360259e-06, "loss": 0.70288956, "num_input_tokens_seen": 112133105, "step": 5208, "time_per_iteration": 2.8330795764923096 }, { "auxiliary_loss_clip": 0.01177083, "auxiliary_loss_mlp": 0.01028407, "balance_loss_clip": 0.97195977, "balance_loss_mlp": 1.02070594, "epoch": 0.6263452173390248, "flos": 27198598435200.0, "grad_norm": 2.177915637932996, "language_loss": 0.71260989, "learning_rate": 1.2942090560893108e-06, "loss": 0.7346648, "num_input_tokens_seen": 112152510, "step": 5209, "time_per_iteration": 2.7487235069274902 }, { "auxiliary_loss_clip": 0.01172791, "auxiliary_loss_mlp": 0.0102316, "balance_loss_clip": 1.05123818, "balance_loss_mlp": 1.01594758, "epoch": 0.6264654602296639, "flos": 37342069683840.0, "grad_norm": 1.887128796729829, "language_loss": 0.60692507, "learning_rate": 1.2934802548862882e-06, "loss": 0.62888455, "num_input_tokens_seen": 112175295, "step": 5210, "time_per_iteration": 2.7548558712005615 }, { "auxiliary_loss_clip": 0.01165515, "auxiliary_loss_mlp": 0.0102638, "balance_loss_clip": 0.97138119, "balance_loss_mlp": 1.01874471, "epoch": 0.626585703120303, "flos": 14756952136320.0, "grad_norm": 1.8175057145683415, "language_loss": 0.82686806, "learning_rate": 1.292751560861749e-06, "loss": 0.84878695, "num_input_tokens_seen": 112190200, "step": 5211, "time_per_iteration": 2.7195546627044678 }, { "auxiliary_loss_clip": 0.01177519, "auxiliary_loss_mlp": 0.01028147, "balance_loss_clip": 1.05267358, "balance_loss_mlp": 1.02044046, "epoch": 0.6267059460109421, "flos": 22347318533760.0, "grad_norm": 1.7495970124576123, "language_loss": 0.79481971, "learning_rate": 1.2920229741262354e-06, "loss": 0.81687629, "num_input_tokens_seen": 112208205, "step": 5212, "time_per_iteration": 2.6946768760681152 }, { "auxiliary_loss_clip": 0.01169346, "auxiliary_loss_mlp": 0.01027195, "balance_loss_clip": 0.97208816, "balance_loss_mlp": 1.01978004, "epoch": 0.6268261889015811, "flos": 17748813617280.0, "grad_norm": 2.0125825390545606, "language_loss": 0.75428653, "learning_rate": 1.2912944947902739e-06, "loss": 0.77625197, "num_input_tokens_seen": 112224690, "step": 5213, "time_per_iteration": 2.6322410106658936 }, { "auxiliary_loss_clip": 0.01178225, "auxiliary_loss_mlp": 0.01026732, "balance_loss_clip": 0.97345531, "balance_loss_mlp": 1.01901364, "epoch": 0.6269464317922203, "flos": 32846484211200.0, "grad_norm": 2.0567979758603827, "language_loss": 0.71550107, "learning_rate": 1.2905661229643742e-06, "loss": 0.73755062, "num_input_tokens_seen": 112244450, "step": 5214, "time_per_iteration": 2.770723342895508 }, { "auxiliary_loss_clip": 0.01173246, "auxiliary_loss_mlp": 0.01023862, "balance_loss_clip": 1.04964936, "balance_loss_mlp": 1.01675081, "epoch": 0.6270666746828594, "flos": 17929192740480.0, "grad_norm": 2.0702118944462775, "language_loss": 0.84432709, "learning_rate": 1.2898378587590299e-06, "loss": 0.8662982, "num_input_tokens_seen": 112261050, "step": 5215, "time_per_iteration": 3.4525790214538574 }, { "auxiliary_loss_clip": 0.01170805, "auxiliary_loss_mlp": 0.01030244, "balance_loss_clip": 1.0129571, "balance_loss_mlp": 1.02275503, "epoch": 0.6271869175734984, "flos": 17457326749440.0, "grad_norm": 1.7924639844901717, "language_loss": 0.8767457, "learning_rate": 1.2891097022847173e-06, "loss": 0.89875621, "num_input_tokens_seen": 112278395, "step": 5216, "time_per_iteration": 2.592158555984497 }, { "auxiliary_loss_clip": 0.01170243, "auxiliary_loss_mlp": 0.01023654, "balance_loss_clip": 0.97321224, "balance_loss_mlp": 1.01561332, "epoch": 0.6273071604641376, "flos": 26868615166080.0, "grad_norm": 1.780267244985506, "language_loss": 0.66690743, "learning_rate": 1.2883816536518978e-06, "loss": 0.68884641, "num_input_tokens_seen": 112299535, "step": 5217, "time_per_iteration": 2.7018723487854004 }, { "auxiliary_loss_clip": 0.01167403, "auxiliary_loss_mlp": 0.0102603, "balance_loss_clip": 1.00939822, "balance_loss_mlp": 1.01902056, "epoch": 0.6274274033547766, "flos": 26062384446720.0, "grad_norm": 1.7396025310192866, "language_loss": 0.81967294, "learning_rate": 1.2876537129710155e-06, "loss": 0.84160727, "num_input_tokens_seen": 112317265, "step": 5218, "time_per_iteration": 2.669987916946411 }, { "auxiliary_loss_clip": 0.01170579, "auxiliary_loss_mlp": 0.01026167, "balance_loss_clip": 0.97555381, "balance_loss_mlp": 1.01853776, "epoch": 0.6275476462454157, "flos": 20266259241600.0, "grad_norm": 8.055686011990947, "language_loss": 0.74800247, "learning_rate": 1.286925880352499e-06, "loss": 0.76996994, "num_input_tokens_seen": 112336125, "step": 5219, "time_per_iteration": 2.6414635181427 }, { "auxiliary_loss_clip": 0.01168253, "auxiliary_loss_mlp": 0.01023585, "balance_loss_clip": 0.97252226, "balance_loss_mlp": 1.01620328, "epoch": 0.6276678891360549, "flos": 26320402817280.0, "grad_norm": 1.5682938316617943, "language_loss": 0.71366346, "learning_rate": 1.2861981559067592e-06, "loss": 0.73558187, "num_input_tokens_seen": 112356730, "step": 5220, "time_per_iteration": 3.7046496868133545 }, { "auxiliary_loss_clip": 0.01160559, "auxiliary_loss_mlp": 0.01023123, "balance_loss_clip": 0.89657485, "balance_loss_mlp": 1.01546371, "epoch": 0.6277881320266939, "flos": 13912512324480.0, "grad_norm": 1.866840210124375, "language_loss": 0.80162799, "learning_rate": 1.2854705397441917e-06, "loss": 0.82346481, "num_input_tokens_seen": 112372270, "step": 5221, "time_per_iteration": 2.7463810443878174 }, { "auxiliary_loss_clip": 0.01161194, "auxiliary_loss_mlp": 0.01024394, "balance_loss_clip": 0.93155545, "balance_loss_mlp": 1.01727152, "epoch": 0.627908374917333, "flos": 27048922462080.0, "grad_norm": 2.0642977365416, "language_loss": 0.77431351, "learning_rate": 1.2847430319751747e-06, "loss": 0.7961694, "num_input_tokens_seen": 112390365, "step": 5222, "time_per_iteration": 3.5707414150238037 }, { "auxiliary_loss_clip": 0.01167349, "auxiliary_loss_mlp": 0.01027112, "balance_loss_clip": 1.01201451, "balance_loss_mlp": 1.01984334, "epoch": 0.6280286178079721, "flos": 23769201386880.0, "grad_norm": 2.07030593675051, "language_loss": 0.67219675, "learning_rate": 1.2840156327100712e-06, "loss": 0.69414139, "num_input_tokens_seen": 112407490, "step": 5223, "time_per_iteration": 3.550328016281128 }, { "auxiliary_loss_clip": 0.0117239, "auxiliary_loss_mlp": 0.01027752, "balance_loss_clip": 1.05076861, "balance_loss_mlp": 1.02035475, "epoch": 0.6281488606986112, "flos": 26359150613760.0, "grad_norm": 1.9716301719258096, "language_loss": 0.72031981, "learning_rate": 1.2832883420592272e-06, "loss": 0.74232125, "num_input_tokens_seen": 112426385, "step": 5224, "time_per_iteration": 2.681023597717285 }, { "auxiliary_loss_clip": 0.01164459, "auxiliary_loss_mlp": 0.01023956, "balance_loss_clip": 0.9728148, "balance_loss_mlp": 1.01639247, "epoch": 0.6282691035892503, "flos": 36137194848000.0, "grad_norm": 2.4780087499406456, "language_loss": 0.64536703, "learning_rate": 1.282561160132972e-06, "loss": 0.66725123, "num_input_tokens_seen": 112446905, "step": 5225, "time_per_iteration": 2.825942277908325 }, { "auxiliary_loss_clip": 0.01172414, "auxiliary_loss_mlp": 0.01034007, "balance_loss_clip": 0.9695152, "balance_loss_mlp": 1.02615666, "epoch": 0.6283893464798894, "flos": 26537231266560.0, "grad_norm": 1.5761675497946552, "language_loss": 0.80806327, "learning_rate": 1.2818340870416186e-06, "loss": 0.83012748, "num_input_tokens_seen": 112468040, "step": 5226, "time_per_iteration": 2.720932722091675 }, { "auxiliary_loss_clip": 0.01176635, "auxiliary_loss_mlp": 0.01032607, "balance_loss_clip": 0.93274021, "balance_loss_mlp": 1.02487636, "epoch": 0.6285095893705285, "flos": 22237216369920.0, "grad_norm": 1.8969533016861824, "language_loss": 0.76007956, "learning_rate": 1.2811071228954626e-06, "loss": 0.78217202, "num_input_tokens_seen": 112486675, "step": 5227, "time_per_iteration": 2.6635565757751465 }, { "auxiliary_loss_clip": 0.01170962, "auxiliary_loss_mlp": 0.01020314, "balance_loss_clip": 0.97375274, "balance_loss_mlp": 1.0126102, "epoch": 0.6286298322611675, "flos": 26542259170560.0, "grad_norm": 1.7317810734383405, "language_loss": 0.80982816, "learning_rate": 1.2803802678047846e-06, "loss": 0.83174098, "num_input_tokens_seen": 112506825, "step": 5228, "time_per_iteration": 2.7742912769317627 }, { "auxiliary_loss_clip": 0.01174996, "auxiliary_loss_mlp": 0.01025899, "balance_loss_clip": 0.9761073, "balance_loss_mlp": 1.01791811, "epoch": 0.6287500751518067, "flos": 21795227516160.0, "grad_norm": 1.7863032806427441, "language_loss": 0.73880291, "learning_rate": 1.279653521879848e-06, "loss": 0.76081192, "num_input_tokens_seen": 112526890, "step": 5229, "time_per_iteration": 2.627657651901245 }, { "auxiliary_loss_clip": 0.01154588, "auxiliary_loss_mlp": 0.01032578, "balance_loss_clip": 0.8180083, "balance_loss_mlp": 1.02571177, "epoch": 0.6288703180424458, "flos": 20009605587840.0, "grad_norm": 1.9745794024038261, "language_loss": 0.84007388, "learning_rate": 1.2789268852308997e-06, "loss": 0.86194551, "num_input_tokens_seen": 112542100, "step": 5230, "time_per_iteration": 2.8944127559661865 }, { "auxiliary_loss_clip": 0.01164733, "auxiliary_loss_mlp": 0.01024316, "balance_loss_clip": 1.01176572, "balance_loss_mlp": 1.01644206, "epoch": 0.6289905609330848, "flos": 22124923476480.0, "grad_norm": 2.5554194332708047, "language_loss": 0.70463097, "learning_rate": 1.2782003579681688e-06, "loss": 0.72652149, "num_input_tokens_seen": 112561630, "step": 5231, "time_per_iteration": 2.932400703430176 }, { "auxiliary_loss_clip": 0.01177525, "auxiliary_loss_mlp": 0.0103468, "balance_loss_clip": 1.05241108, "balance_loss_mlp": 1.02712262, "epoch": 0.629110803823724, "flos": 25518481729920.0, "grad_norm": 2.34688168613981, "language_loss": 0.7441743, "learning_rate": 1.2774739402018701e-06, "loss": 0.76629639, "num_input_tokens_seen": 112582465, "step": 5232, "time_per_iteration": 2.614301919937134 }, { "auxiliary_loss_clip": 0.01174546, "auxiliary_loss_mlp": 0.01031034, "balance_loss_clip": 1.01458192, "balance_loss_mlp": 1.02338052, "epoch": 0.629231046714363, "flos": 20886616056960.0, "grad_norm": 1.6057319966573838, "language_loss": 0.73123705, "learning_rate": 1.2767476320422002e-06, "loss": 0.7532928, "num_input_tokens_seen": 112602390, "step": 5233, "time_per_iteration": 2.6554653644561768 }, { "auxiliary_loss_clip": 0.01080742, "auxiliary_loss_mlp": 0.01002212, "balance_loss_clip": 0.90330267, "balance_loss_mlp": 1.00063848, "epoch": 0.6293512896050021, "flos": 65050027908480.0, "grad_norm": 0.7085747051400689, "language_loss": 0.57220399, "learning_rate": 1.2760214335993392e-06, "loss": 0.59303355, "num_input_tokens_seen": 112669035, "step": 5234, "time_per_iteration": 3.340287208557129 }, { "auxiliary_loss_clip": 0.01161018, "auxiliary_loss_mlp": 0.01024555, "balance_loss_clip": 1.00880849, "balance_loss_mlp": 1.01782608, "epoch": 0.6294715324956413, "flos": 34677857088000.0, "grad_norm": 1.8533995414243307, "language_loss": 0.58971477, "learning_rate": 1.2752953449834514e-06, "loss": 0.61157048, "num_input_tokens_seen": 112691485, "step": 5235, "time_per_iteration": 2.7346839904785156 }, { "auxiliary_loss_clip": 0.01173529, "auxiliary_loss_mlp": 0.01029558, "balance_loss_clip": 1.05118418, "balance_loss_mlp": 1.02262318, "epoch": 0.6295917753862803, "flos": 22784207656320.0, "grad_norm": 1.6950794051511135, "language_loss": 0.80098653, "learning_rate": 1.2745693663046836e-06, "loss": 0.82301742, "num_input_tokens_seen": 112710555, "step": 5236, "time_per_iteration": 2.5769805908203125 }, { "auxiliary_loss_clip": 0.01166836, "auxiliary_loss_mlp": 0.01027857, "balance_loss_clip": 1.01217103, "balance_loss_mlp": 1.02101469, "epoch": 0.6297120182769194, "flos": 20850454039680.0, "grad_norm": 1.676080168416026, "language_loss": 0.80709296, "learning_rate": 1.2738434976731662e-06, "loss": 0.82903987, "num_input_tokens_seen": 112728740, "step": 5237, "time_per_iteration": 2.6575307846069336 }, { "auxiliary_loss_clip": 0.0116905, "auxiliary_loss_mlp": 0.01026414, "balance_loss_clip": 0.97338074, "balance_loss_mlp": 1.01889825, "epoch": 0.6298322611675584, "flos": 19497662997120.0, "grad_norm": 1.4531342037736965, "language_loss": 0.75210607, "learning_rate": 1.2731177391990125e-06, "loss": 0.77406073, "num_input_tokens_seen": 112748665, "step": 5238, "time_per_iteration": 2.6348581314086914 }, { "auxiliary_loss_clip": 0.01170616, "auxiliary_loss_mlp": 0.01026035, "balance_loss_clip": 0.97205937, "balance_loss_mlp": 1.01865566, "epoch": 0.6299525040581976, "flos": 12604466649600.0, "grad_norm": 2.474190693664044, "language_loss": 0.81245947, "learning_rate": 1.2723920909923203e-06, "loss": 0.83442605, "num_input_tokens_seen": 112764410, "step": 5239, "time_per_iteration": 2.675504446029663 }, { "auxiliary_loss_clip": 0.01071447, "auxiliary_loss_mlp": 0.01003027, "balance_loss_clip": 1.01629782, "balance_loss_mlp": 1.00147688, "epoch": 0.6300727469488366, "flos": 57725685636480.0, "grad_norm": 0.849443690305279, "language_loss": 0.60438883, "learning_rate": 1.2716665531631688e-06, "loss": 0.62513363, "num_input_tokens_seen": 112818695, "step": 5240, "time_per_iteration": 3.1347122192382812 }, { "auxiliary_loss_clip": 0.01178929, "auxiliary_loss_mlp": 0.01029357, "balance_loss_clip": 1.01220953, "balance_loss_mlp": 1.02176976, "epoch": 0.6301929898394757, "flos": 22527302607360.0, "grad_norm": 1.7293009555753784, "language_loss": 0.77518493, "learning_rate": 1.270941125821623e-06, "loss": 0.79726779, "num_input_tokens_seen": 112839120, "step": 5241, "time_per_iteration": 3.577697515487671 }, { "auxiliary_loss_clip": 0.01165727, "auxiliary_loss_mlp": 0.01026861, "balance_loss_clip": 1.00930417, "balance_loss_mlp": 1.01956868, "epoch": 0.6303132327301149, "flos": 28293550675200.0, "grad_norm": 1.4670191064121383, "language_loss": 0.753353, "learning_rate": 1.2702158090777278e-06, "loss": 0.77527893, "num_input_tokens_seen": 112860210, "step": 5242, "time_per_iteration": 2.7339212894439697 }, { "auxiliary_loss_clip": 0.01164876, "auxiliary_loss_mlp": 0.01027603, "balance_loss_clip": 0.93479145, "balance_loss_mlp": 1.02057576, "epoch": 0.6304334756207539, "flos": 25264521596160.0, "grad_norm": 1.72591243324512, "language_loss": 0.74744111, "learning_rate": 1.2694906030415148e-06, "loss": 0.76936591, "num_input_tokens_seen": 112877955, "step": 5243, "time_per_iteration": 2.711637020111084 }, { "auxiliary_loss_clip": 0.0117965, "auxiliary_loss_mlp": 0.01025918, "balance_loss_clip": 0.97340947, "balance_loss_mlp": 1.01800263, "epoch": 0.630553718511393, "flos": 18033548728320.0, "grad_norm": 5.478442248608229, "language_loss": 0.82083428, "learning_rate": 1.2687655078229958e-06, "loss": 0.8428899, "num_input_tokens_seen": 112892285, "step": 5244, "time_per_iteration": 2.649688482284546 }, { "auxiliary_loss_clip": 0.01167359, "auxiliary_loss_mlp": 0.01024673, "balance_loss_clip": 0.97380042, "balance_loss_mlp": 1.01703453, "epoch": 0.6306739614020321, "flos": 27304103658240.0, "grad_norm": 2.216927418540359, "language_loss": 0.6897912, "learning_rate": 1.2680405235321678e-06, "loss": 0.71171153, "num_input_tokens_seen": 112913620, "step": 5245, "time_per_iteration": 2.6834630966186523 }, { "auxiliary_loss_clip": 0.01175023, "auxiliary_loss_mlp": 0.01123061, "balance_loss_clip": 0.97675955, "balance_loss_mlp": 0.0, "epoch": 0.6307942042926712, "flos": 15341434243200.0, "grad_norm": 1.8798127378967786, "language_loss": 0.78712791, "learning_rate": 1.267315650279011e-06, "loss": 0.81010878, "num_input_tokens_seen": 112932090, "step": 5246, "time_per_iteration": 3.632122039794922 }, { "auxiliary_loss_clip": 0.0116627, "auxiliary_loss_mlp": 0.0102465, "balance_loss_clip": 0.93728173, "balance_loss_mlp": 1.01731229, "epoch": 0.6309144471833102, "flos": 19606400444160.0, "grad_norm": 1.8581789913572349, "language_loss": 0.73807704, "learning_rate": 1.2665908881734874e-06, "loss": 0.75998628, "num_input_tokens_seen": 112950925, "step": 5247, "time_per_iteration": 2.682756185531616 }, { "auxiliary_loss_clip": 0.01173113, "auxiliary_loss_mlp": 0.01027925, "balance_loss_clip": 1.01337874, "balance_loss_mlp": 1.02050459, "epoch": 0.6310346900739494, "flos": 17493345112320.0, "grad_norm": 2.017850611513808, "language_loss": 0.84494936, "learning_rate": 1.2658662373255432e-06, "loss": 0.86695975, "num_input_tokens_seen": 112969315, "step": 5248, "time_per_iteration": 3.464848756790161 }, { "auxiliary_loss_clip": 0.0107547, "auxiliary_loss_mlp": 0.01003213, "balance_loss_clip": 0.9424926, "balance_loss_mlp": 1.00155628, "epoch": 0.6311549329645885, "flos": 55070164131840.0, "grad_norm": 0.705862048756631, "language_loss": 0.52294767, "learning_rate": 1.2651416978451063e-06, "loss": 0.54373455, "num_input_tokens_seen": 113034700, "step": 5249, "time_per_iteration": 4.116872072219849 }, { "auxiliary_loss_clip": 0.01176446, "auxiliary_loss_mlp": 0.01032098, "balance_loss_clip": 1.05070722, "balance_loss_mlp": 1.0242542, "epoch": 0.6312751758552275, "flos": 41902545075840.0, "grad_norm": 1.737670848427465, "language_loss": 0.64854348, "learning_rate": 1.2644172698420903e-06, "loss": 0.67062891, "num_input_tokens_seen": 113056805, "step": 5250, "time_per_iteration": 2.808302879333496 }, { "auxiliary_loss_clip": 0.01172087, "auxiliary_loss_mlp": 0.01027122, "balance_loss_clip": 0.9368304, "balance_loss_mlp": 1.019925, "epoch": 0.6313954187458667, "flos": 19646800266240.0, "grad_norm": 1.9568922039698655, "language_loss": 0.84661013, "learning_rate": 1.2636929534263892e-06, "loss": 0.86860228, "num_input_tokens_seen": 113075790, "step": 5251, "time_per_iteration": 2.687039852142334 }, { "auxiliary_loss_clip": 0.01166392, "auxiliary_loss_mlp": 0.01028299, "balance_loss_clip": 0.92915148, "balance_loss_mlp": 1.02079511, "epoch": 0.6315156616365057, "flos": 22894273906560.0, "grad_norm": 1.6465495821799563, "language_loss": 0.77616298, "learning_rate": 1.2629687487078821e-06, "loss": 0.79810995, "num_input_tokens_seen": 113094600, "step": 5252, "time_per_iteration": 2.750751256942749 }, { "auxiliary_loss_clip": 0.01174422, "auxiliary_loss_mlp": 0.01024736, "balance_loss_clip": 1.01095843, "balance_loss_mlp": 1.01738703, "epoch": 0.6316359045271448, "flos": 23726251699200.0, "grad_norm": 1.9432449296879821, "language_loss": 0.76541525, "learning_rate": 1.2622446557964293e-06, "loss": 0.7874068, "num_input_tokens_seen": 113112605, "step": 5253, "time_per_iteration": 2.650904417037964 }, { "auxiliary_loss_clip": 0.01167864, "auxiliary_loss_mlp": 0.01029998, "balance_loss_clip": 0.96910727, "balance_loss_mlp": 1.02275562, "epoch": 0.631756147417784, "flos": 33108417164160.0, "grad_norm": 1.6353509979476735, "language_loss": 0.7139734, "learning_rate": 1.261520674801876e-06, "loss": 0.73595202, "num_input_tokens_seen": 113133200, "step": 5254, "time_per_iteration": 2.788424253463745 }, { "auxiliary_loss_clip": 0.01172753, "auxiliary_loss_mlp": 0.01025828, "balance_loss_clip": 0.97646534, "balance_loss_mlp": 1.01821899, "epoch": 0.631876390308423, "flos": 31248424126080.0, "grad_norm": 3.002496371830209, "language_loss": 0.7209481, "learning_rate": 1.2607968058340488e-06, "loss": 0.74293399, "num_input_tokens_seen": 113152895, "step": 5255, "time_per_iteration": 2.7852282524108887 }, { "auxiliary_loss_clip": 0.01165526, "auxiliary_loss_mlp": 0.01023533, "balance_loss_clip": 0.97214341, "balance_loss_mlp": 1.01641583, "epoch": 0.6319966331990621, "flos": 24681152810880.0, "grad_norm": 1.8128738379079061, "language_loss": 0.72650123, "learning_rate": 1.2600730490027583e-06, "loss": 0.74839181, "num_input_tokens_seen": 113173135, "step": 5256, "time_per_iteration": 2.683102607727051 }, { "auxiliary_loss_clip": 0.01166284, "auxiliary_loss_mlp": 0.01028222, "balance_loss_clip": 0.93446851, "balance_loss_mlp": 1.02086425, "epoch": 0.6321168760897012, "flos": 17491764913920.0, "grad_norm": 1.6560908931572396, "language_loss": 0.80318344, "learning_rate": 1.2593494044177984e-06, "loss": 0.82512856, "num_input_tokens_seen": 113191440, "step": 5257, "time_per_iteration": 2.729807138442993 }, { "auxiliary_loss_clip": 0.01177743, "auxiliary_loss_mlp": 0.01024182, "balance_loss_clip": 1.04999506, "balance_loss_mlp": 1.01599848, "epoch": 0.6322371189803403, "flos": 18295373940480.0, "grad_norm": 2.1640185273919026, "language_loss": 0.80857897, "learning_rate": 1.2586258721889448e-06, "loss": 0.83059824, "num_input_tokens_seen": 113208790, "step": 5258, "time_per_iteration": 2.6812102794647217 }, { "auxiliary_loss_clip": 0.01159243, "auxiliary_loss_mlp": 0.01024492, "balance_loss_clip": 0.89676511, "balance_loss_mlp": 1.01725042, "epoch": 0.6323573618709794, "flos": 20157270399360.0, "grad_norm": 1.9737355183919922, "language_loss": 0.81779051, "learning_rate": 1.2579024524259573e-06, "loss": 0.8396278, "num_input_tokens_seen": 113225050, "step": 5259, "time_per_iteration": 2.7128593921661377 }, { "auxiliary_loss_clip": 0.01162452, "auxiliary_loss_mlp": 0.01026595, "balance_loss_clip": 0.9689135, "balance_loss_mlp": 1.01909041, "epoch": 0.6324776047616185, "flos": 20042391726720.0, "grad_norm": 1.9167178257671231, "language_loss": 0.91457993, "learning_rate": 1.2571791452385768e-06, "loss": 0.93647039, "num_input_tokens_seen": 113242315, "step": 5260, "time_per_iteration": 2.6505870819091797 }, { "auxiliary_loss_clip": 0.01170352, "auxiliary_loss_mlp": 0.01022328, "balance_loss_clip": 0.97356093, "balance_loss_mlp": 1.01530099, "epoch": 0.6325978476522576, "flos": 30848235724800.0, "grad_norm": 1.6497111101107609, "language_loss": 0.77111697, "learning_rate": 1.2564559507365301e-06, "loss": 0.79304379, "num_input_tokens_seen": 113264720, "step": 5261, "time_per_iteration": 2.8194589614868164 }, { "auxiliary_loss_clip": 0.0117584, "auxiliary_loss_mlp": 0.01028321, "balance_loss_clip": 0.97560066, "balance_loss_mlp": 1.02078056, "epoch": 0.6327180905428966, "flos": 24535104111360.0, "grad_norm": 2.034114683027051, "language_loss": 0.78543472, "learning_rate": 1.2557328690295244e-06, "loss": 0.80747628, "num_input_tokens_seen": 113282910, "step": 5262, "time_per_iteration": 2.704791784286499 }, { "auxiliary_loss_clip": 0.01176778, "auxiliary_loss_mlp": 0.01025347, "balance_loss_clip": 0.93603933, "balance_loss_mlp": 1.0182004, "epoch": 0.6328383334335358, "flos": 21575274583680.0, "grad_norm": 1.626275943673385, "language_loss": 0.76257873, "learning_rate": 1.255009900227251e-06, "loss": 0.78460002, "num_input_tokens_seen": 113301935, "step": 5263, "time_per_iteration": 2.8719987869262695 }, { "auxiliary_loss_clip": 0.01171901, "auxiliary_loss_mlp": 0.01029543, "balance_loss_clip": 1.05227661, "balance_loss_mlp": 1.02265906, "epoch": 0.6329585763241748, "flos": 22929861306240.0, "grad_norm": 1.8775463910977963, "language_loss": 0.79281181, "learning_rate": 1.254287044439383e-06, "loss": 0.81482625, "num_input_tokens_seen": 113321540, "step": 5264, "time_per_iteration": 2.6518213748931885 }, { "auxiliary_loss_clip": 0.01071171, "auxiliary_loss_mlp": 0.01002288, "balance_loss_clip": 1.0159781, "balance_loss_mlp": 1.00075042, "epoch": 0.6330788192148139, "flos": 70936897847040.0, "grad_norm": 0.7713499601648359, "language_loss": 0.54438949, "learning_rate": 1.2535643017755776e-06, "loss": 0.56512409, "num_input_tokens_seen": 113383730, "step": 5265, "time_per_iteration": 3.2608680725097656 }, { "auxiliary_loss_clip": 0.01166782, "auxiliary_loss_mlp": 0.01028541, "balance_loss_clip": 0.93194389, "balance_loss_mlp": 1.02137685, "epoch": 0.6331990621054531, "flos": 21244501215360.0, "grad_norm": 2.165821407695865, "language_loss": 0.72030163, "learning_rate": 1.2528416723454737e-06, "loss": 0.74225485, "num_input_tokens_seen": 113400400, "step": 5266, "time_per_iteration": 2.7386386394500732 }, { "auxiliary_loss_clip": 0.01169535, "auxiliary_loss_mlp": 0.01032024, "balance_loss_clip": 1.05062532, "balance_loss_mlp": 1.02502966, "epoch": 0.6333193049960921, "flos": 34459412526720.0, "grad_norm": 1.630467595528921, "language_loss": 0.71198726, "learning_rate": 1.2521191562586945e-06, "loss": 0.73400277, "num_input_tokens_seen": 113424050, "step": 5267, "time_per_iteration": 3.7575998306274414 }, { "auxiliary_loss_clip": 0.01172805, "auxiliary_loss_mlp": 0.01122467, "balance_loss_clip": 1.05192661, "balance_loss_mlp": 0.0, "epoch": 0.6334395478867312, "flos": 18329883932160.0, "grad_norm": 1.924944879719053, "language_loss": 0.76515067, "learning_rate": 1.2513967536248445e-06, "loss": 0.78810334, "num_input_tokens_seen": 113440370, "step": 5268, "time_per_iteration": 2.5950822830200195 }, { "auxiliary_loss_clip": 0.01170263, "auxiliary_loss_mlp": 0.0102908, "balance_loss_clip": 1.01287234, "balance_loss_mlp": 1.02136099, "epoch": 0.6335597907773702, "flos": 23623152687360.0, "grad_norm": 1.5582838815943747, "language_loss": 0.81298304, "learning_rate": 1.2506744645535117e-06, "loss": 0.83497643, "num_input_tokens_seen": 113460800, "step": 5269, "time_per_iteration": 2.6705162525177 }, { "auxiliary_loss_clip": 0.01157296, "auxiliary_loss_mlp": 0.01024553, "balance_loss_clip": 0.96586019, "balance_loss_mlp": 1.01691771, "epoch": 0.6336800336680094, "flos": 22710913954560.0, "grad_norm": 1.811029840254445, "language_loss": 0.60333401, "learning_rate": 1.249952289154267e-06, "loss": 0.62515259, "num_input_tokens_seen": 113480840, "step": 5270, "time_per_iteration": 2.691159725189209 }, { "auxiliary_loss_clip": 0.01151493, "auxiliary_loss_mlp": 0.01029242, "balance_loss_clip": 0.8569144, "balance_loss_mlp": 1.02277827, "epoch": 0.6338002765586485, "flos": 23622757637760.0, "grad_norm": 1.5768506350154847, "language_loss": 0.76154208, "learning_rate": 1.2492302275366635e-06, "loss": 0.78334945, "num_input_tokens_seen": 113500515, "step": 5271, "time_per_iteration": 2.823188543319702 }, { "auxiliary_loss_clip": 0.01165639, "auxiliary_loss_mlp": 0.01026906, "balance_loss_clip": 1.010481, "balance_loss_mlp": 1.01897252, "epoch": 0.6339205194492875, "flos": 26505450708480.0, "grad_norm": 2.3399635816959847, "language_loss": 0.66119164, "learning_rate": 1.2485082798102377e-06, "loss": 0.68311715, "num_input_tokens_seen": 113520930, "step": 5272, "time_per_iteration": 3.5868661403656006 }, { "auxiliary_loss_clip": 0.01175633, "auxiliary_loss_mlp": 0.01025449, "balance_loss_clip": 0.9351368, "balance_loss_mlp": 1.01804006, "epoch": 0.6340407623399267, "flos": 18544306170240.0, "grad_norm": 3.3141558898436623, "language_loss": 0.68172938, "learning_rate": 1.2477864460845084e-06, "loss": 0.70374024, "num_input_tokens_seen": 113537330, "step": 5273, "time_per_iteration": 3.6418583393096924 }, { "auxiliary_loss_clip": 0.01168786, "auxiliary_loss_mlp": 0.01028822, "balance_loss_clip": 0.97265929, "balance_loss_mlp": 1.02064443, "epoch": 0.6341610052305657, "flos": 17712579772800.0, "grad_norm": 2.7654363602436454, "language_loss": 0.73907351, "learning_rate": 1.2470647264689776e-06, "loss": 0.76104963, "num_input_tokens_seen": 113555810, "step": 5274, "time_per_iteration": 2.687229871749878 }, { "auxiliary_loss_clip": 0.01170343, "auxiliary_loss_mlp": 0.01023561, "balance_loss_clip": 0.85482359, "balance_loss_mlp": 1.01622379, "epoch": 0.6342812481212048, "flos": 23587026583680.0, "grad_norm": 3.0584560667929654, "language_loss": 0.70797724, "learning_rate": 1.2463431210731282e-06, "loss": 0.72991627, "num_input_tokens_seen": 113575395, "step": 5275, "time_per_iteration": 3.639247417449951 }, { "auxiliary_loss_clip": 0.01172834, "auxiliary_loss_mlp": 0.01027824, "balance_loss_clip": 0.8923896, "balance_loss_mlp": 1.01986074, "epoch": 0.634401491011844, "flos": 17821927751040.0, "grad_norm": 2.272663159482606, "language_loss": 0.75869226, "learning_rate": 1.2456216300064289e-06, "loss": 0.7806989, "num_input_tokens_seen": 113592945, "step": 5276, "time_per_iteration": 2.7968199253082275 }, { "auxiliary_loss_clip": 0.01159417, "auxiliary_loss_mlp": 0.01029334, "balance_loss_clip": 0.96963936, "balance_loss_mlp": 1.0215168, "epoch": 0.634521733902483, "flos": 21358158825600.0, "grad_norm": 1.642842211994211, "language_loss": 0.78543973, "learning_rate": 1.244900253378328e-06, "loss": 0.80732727, "num_input_tokens_seen": 113613000, "step": 5277, "time_per_iteration": 2.65165638923645 }, { "auxiliary_loss_clip": 0.01170302, "auxiliary_loss_mlp": 0.01024347, "balance_loss_clip": 0.78434467, "balance_loss_mlp": 1.01730454, "epoch": 0.6346419767931221, "flos": 16545052103040.0, "grad_norm": 1.991415126471734, "language_loss": 0.69467485, "learning_rate": 1.2441789912982583e-06, "loss": 0.7166214, "num_input_tokens_seen": 113630085, "step": 5278, "time_per_iteration": 2.9461796283721924 }, { "auxiliary_loss_clip": 0.01175246, "auxiliary_loss_mlp": 0.01028736, "balance_loss_clip": 1.01320982, "balance_loss_mlp": 1.02107716, "epoch": 0.6347622196837612, "flos": 24350989973760.0, "grad_norm": 2.1910060725591345, "language_loss": 0.64779472, "learning_rate": 1.2434578438756346e-06, "loss": 0.66983449, "num_input_tokens_seen": 113650515, "step": 5279, "time_per_iteration": 3.135246992111206 }, { "auxiliary_loss_clip": 0.0117323, "auxiliary_loss_mlp": 0.01023873, "balance_loss_clip": 1.00982499, "balance_loss_mlp": 1.01663733, "epoch": 0.6348824625744003, "flos": 64523178195840.0, "grad_norm": 1.726098699883536, "language_loss": 0.77618879, "learning_rate": 1.242736811219855e-06, "loss": 0.79815984, "num_input_tokens_seen": 113676475, "step": 5280, "time_per_iteration": 2.972799301147461 }, { "auxiliary_loss_clip": 0.01164334, "auxiliary_loss_mlp": 0.01026699, "balance_loss_clip": 1.01121497, "balance_loss_mlp": 1.01996946, "epoch": 0.6350027054650393, "flos": 28622133313920.0, "grad_norm": 1.631944674111674, "language_loss": 0.8199991, "learning_rate": 1.2420158934402988e-06, "loss": 0.84190941, "num_input_tokens_seen": 113697090, "step": 5281, "time_per_iteration": 2.656184434890747 }, { "auxiliary_loss_clip": 0.01150599, "auxiliary_loss_mlp": 0.01027701, "balance_loss_clip": 0.9298147, "balance_loss_mlp": 1.02017868, "epoch": 0.6351229483556785, "flos": 23002544476800.0, "grad_norm": 1.82087144086173, "language_loss": 0.84740937, "learning_rate": 1.2412950906463286e-06, "loss": 0.86919236, "num_input_tokens_seen": 113714395, "step": 5282, "time_per_iteration": 2.6699166297912598 }, { "auxiliary_loss_clip": 0.01167076, "auxiliary_loss_mlp": 0.010252, "balance_loss_clip": 0.89770532, "balance_loss_mlp": 1.01841426, "epoch": 0.6352431912463176, "flos": 21939300967680.0, "grad_norm": 1.7219408704713295, "language_loss": 0.8971591, "learning_rate": 1.2405744029472902e-06, "loss": 0.91908193, "num_input_tokens_seen": 113733880, "step": 5283, "time_per_iteration": 3.039466142654419 }, { "auxiliary_loss_clip": 0.01164564, "auxiliary_loss_mlp": 0.01030523, "balance_loss_clip": 0.97072721, "balance_loss_mlp": 1.02322161, "epoch": 0.6353634341369566, "flos": 13735257684480.0, "grad_norm": 1.7445606968100924, "language_loss": 0.76218456, "learning_rate": 1.2398538304525108e-06, "loss": 0.78413546, "num_input_tokens_seen": 113752505, "step": 5284, "time_per_iteration": 2.7353928089141846 }, { "auxiliary_loss_clip": 0.01166878, "auxiliary_loss_mlp": 0.01024078, "balance_loss_clip": 0.93555206, "balance_loss_mlp": 1.01575172, "epoch": 0.6354836770275958, "flos": 19316170552320.0, "grad_norm": 2.0846155286770576, "language_loss": 0.75803983, "learning_rate": 1.2391333732713016e-06, "loss": 0.77994943, "num_input_tokens_seen": 113770310, "step": 5285, "time_per_iteration": 2.701449394226074 }, { "auxiliary_loss_clip": 0.01166646, "auxiliary_loss_mlp": 0.0102914, "balance_loss_clip": 0.93252504, "balance_loss_mlp": 1.02155817, "epoch": 0.6356039199182348, "flos": 21613375935360.0, "grad_norm": 1.9961746414118517, "language_loss": 0.78195083, "learning_rate": 1.2384130315129543e-06, "loss": 0.80390871, "num_input_tokens_seen": 113788635, "step": 5286, "time_per_iteration": 2.641934871673584 }, { "auxiliary_loss_clip": 0.01165728, "auxiliary_loss_mlp": 0.01034667, "balance_loss_clip": 0.74051744, "balance_loss_mlp": 1.02701092, "epoch": 0.6357241628088739, "flos": 18111978074880.0, "grad_norm": 2.079254828649695, "language_loss": 0.73604536, "learning_rate": 1.2376928052867447e-06, "loss": 0.75804931, "num_input_tokens_seen": 113807755, "step": 5287, "time_per_iteration": 3.1143980026245117 }, { "auxiliary_loss_clip": 0.01170031, "auxiliary_loss_mlp": 0.0103192, "balance_loss_clip": 0.97392869, "balance_loss_mlp": 1.0245235, "epoch": 0.6358444056995131, "flos": 24935256599040.0, "grad_norm": 1.8728475591216966, "language_loss": 0.77502632, "learning_rate": 1.2369726947019299e-06, "loss": 0.79704589, "num_input_tokens_seen": 113828230, "step": 5288, "time_per_iteration": 3.367049217224121 }, { "auxiliary_loss_clip": 0.01169271, "auxiliary_loss_mlp": 0.01021429, "balance_loss_clip": 1.0098238, "balance_loss_mlp": 1.01458669, "epoch": 0.6359646485901521, "flos": 23293348986240.0, "grad_norm": 2.1651256824612353, "language_loss": 0.67072189, "learning_rate": 1.2362526998677511e-06, "loss": 0.69262886, "num_input_tokens_seen": 113844595, "step": 5289, "time_per_iteration": 2.857494831085205 }, { "auxiliary_loss_clip": 0.01174846, "auxiliary_loss_mlp": 0.01025507, "balance_loss_clip": 0.97358274, "balance_loss_mlp": 1.01870036, "epoch": 0.6360848914807912, "flos": 20887442069760.0, "grad_norm": 1.752323874759085, "language_loss": 0.84563243, "learning_rate": 1.2355328208934301e-06, "loss": 0.86763597, "num_input_tokens_seen": 113863470, "step": 5290, "time_per_iteration": 2.7376649379730225 }, { "auxiliary_loss_clip": 0.01169578, "auxiliary_loss_mlp": 0.01122434, "balance_loss_clip": 1.00952733, "balance_loss_mlp": 0.0, "epoch": 0.6362051343714303, "flos": 18479775386880.0, "grad_norm": 2.183752413257876, "language_loss": 0.7238555, "learning_rate": 1.2348130578881728e-06, "loss": 0.74677563, "num_input_tokens_seen": 113881690, "step": 5291, "time_per_iteration": 2.6464691162109375 }, { "auxiliary_loss_clip": 0.01176742, "auxiliary_loss_mlp": 0.01022379, "balance_loss_clip": 1.0510788, "balance_loss_mlp": 1.01479101, "epoch": 0.6363253772620694, "flos": 24389594115840.0, "grad_norm": 2.338211267898916, "language_loss": 0.75972116, "learning_rate": 1.2340934109611664e-06, "loss": 0.78171241, "num_input_tokens_seen": 113902450, "step": 5292, "time_per_iteration": 2.6657612323760986 }, { "auxiliary_loss_clip": 0.01179346, "auxiliary_loss_mlp": 0.01023713, "balance_loss_clip": 0.97603422, "balance_loss_mlp": 1.01539469, "epoch": 0.6364456201527084, "flos": 25958243940480.0, "grad_norm": 2.6368979635633996, "language_loss": 0.6878202, "learning_rate": 1.2333738802215798e-06, "loss": 0.70985079, "num_input_tokens_seen": 113922670, "step": 5293, "time_per_iteration": 3.7463462352752686 }, { "auxiliary_loss_clip": 0.01160106, "auxiliary_loss_mlp": 0.01019593, "balance_loss_clip": 0.89356959, "balance_loss_mlp": 1.0123446, "epoch": 0.6365658630433476, "flos": 20740711011840.0, "grad_norm": 2.1852823828504047, "language_loss": 0.80799526, "learning_rate": 1.2326544657785668e-06, "loss": 0.82979226, "num_input_tokens_seen": 113942360, "step": 5294, "time_per_iteration": 2.8339481353759766 }, { "auxiliary_loss_clip": 0.0115833, "auxiliary_loss_mlp": 0.01029827, "balance_loss_clip": 0.93337172, "balance_loss_mlp": 1.02277851, "epoch": 0.6366861059339867, "flos": 21434146047360.0, "grad_norm": 2.3571907032384476, "language_loss": 0.7458955, "learning_rate": 1.2319351677412608e-06, "loss": 0.76777703, "num_input_tokens_seen": 113959405, "step": 5295, "time_per_iteration": 2.7719979286193848 }, { "auxiliary_loss_clip": 0.01180037, "auxiliary_loss_mlp": 0.01026313, "balance_loss_clip": 0.93794858, "balance_loss_mlp": 1.01897311, "epoch": 0.6368063488246257, "flos": 22267093507200.0, "grad_norm": 1.7514855323650538, "language_loss": 0.74098849, "learning_rate": 1.2312159862187796e-06, "loss": 0.76305199, "num_input_tokens_seen": 113977815, "step": 5296, "time_per_iteration": 2.6814632415771484 }, { "auxiliary_loss_clip": 0.01176705, "auxiliary_loss_mlp": 0.01027823, "balance_loss_clip": 1.05122066, "balance_loss_mlp": 1.02038419, "epoch": 0.6369265917152649, "flos": 22420719976320.0, "grad_norm": 1.828896359861146, "language_loss": 0.76523817, "learning_rate": 1.2304969213202217e-06, "loss": 0.78728342, "num_input_tokens_seen": 113999075, "step": 5297, "time_per_iteration": 2.6479105949401855 }, { "auxiliary_loss_clip": 0.01163415, "auxiliary_loss_mlp": 0.01024805, "balance_loss_clip": 0.97041112, "balance_loss_mlp": 1.01757538, "epoch": 0.6370468346059039, "flos": 24718176754560.0, "grad_norm": 2.1904700826821104, "language_loss": 0.79085886, "learning_rate": 1.2297779731546692e-06, "loss": 0.8127411, "num_input_tokens_seen": 114018170, "step": 5298, "time_per_iteration": 2.694195032119751 }, { "auxiliary_loss_clip": 0.01168129, "auxiliary_loss_mlp": 0.01028847, "balance_loss_clip": 0.97519553, "balance_loss_mlp": 1.02163744, "epoch": 0.637167077496543, "flos": 25296589463040.0, "grad_norm": 1.9255413011725402, "language_loss": 0.77840859, "learning_rate": 1.2290591418311853e-06, "loss": 0.80037832, "num_input_tokens_seen": 114035565, "step": 5299, "time_per_iteration": 3.5993220806121826 }, { "auxiliary_loss_clip": 0.01176584, "auxiliary_loss_mlp": 0.01028228, "balance_loss_clip": 1.01639867, "balance_loss_mlp": 1.0205152, "epoch": 0.637287320387182, "flos": 27671110871040.0, "grad_norm": 1.5145651755903164, "language_loss": 0.71907401, "learning_rate": 1.2283404274588172e-06, "loss": 0.74112213, "num_input_tokens_seen": 114054510, "step": 5300, "time_per_iteration": 3.66827392578125 }, { "auxiliary_loss_clip": 0.01068602, "auxiliary_loss_mlp": 0.01002575, "balance_loss_clip": 0.78835177, "balance_loss_mlp": 1.00081086, "epoch": 0.6374075632778212, "flos": 63173406873600.0, "grad_norm": 0.7519861666163631, "language_loss": 0.52853268, "learning_rate": 1.227621830146592e-06, "loss": 0.5492444, "num_input_tokens_seen": 114109875, "step": 5301, "time_per_iteration": 3.4559614658355713 }, { "auxiliary_loss_clip": 0.01173871, "auxiliary_loss_mlp": 0.01027594, "balance_loss_clip": 0.93575126, "balance_loss_mlp": 1.02040017, "epoch": 0.6375278061684603, "flos": 25558127366400.0, "grad_norm": 1.70583762845816, "language_loss": 0.79227245, "learning_rate": 1.2269033500035217e-06, "loss": 0.81428713, "num_input_tokens_seen": 114130010, "step": 5302, "time_per_iteration": 3.4161670207977295 }, { "auxiliary_loss_clip": 0.01173627, "auxiliary_loss_mlp": 0.01028968, "balance_loss_clip": 0.93789661, "balance_loss_mlp": 1.02194667, "epoch": 0.6376480490590993, "flos": 25666362023040.0, "grad_norm": 1.8321537320692867, "language_loss": 0.73146749, "learning_rate": 1.2261849871385988e-06, "loss": 0.75349343, "num_input_tokens_seen": 114151115, "step": 5303, "time_per_iteration": 2.7296841144561768 }, { "auxiliary_loss_clip": 0.01172889, "auxiliary_loss_mlp": 0.01026267, "balance_loss_clip": 1.04982376, "balance_loss_mlp": 1.01881921, "epoch": 0.6377682919497385, "flos": 31537684350720.0, "grad_norm": 3.2581558878710757, "language_loss": 0.62468338, "learning_rate": 1.2254667416607972e-06, "loss": 0.64667499, "num_input_tokens_seen": 114172715, "step": 5304, "time_per_iteration": 2.6289923191070557 }, { "auxiliary_loss_clip": 0.01172949, "auxiliary_loss_mlp": 0.01025754, "balance_loss_clip": 1.01325762, "balance_loss_mlp": 1.01898313, "epoch": 0.6378885348403776, "flos": 23039209284480.0, "grad_norm": 1.6912114610887485, "language_loss": 0.83145887, "learning_rate": 1.2247486136790756e-06, "loss": 0.85344589, "num_input_tokens_seen": 114192195, "step": 5305, "time_per_iteration": 2.753183126449585 }, { "auxiliary_loss_clip": 0.01176762, "auxiliary_loss_mlp": 0.01032812, "balance_loss_clip": 1.01454663, "balance_loss_mlp": 1.02549243, "epoch": 0.6380087777310166, "flos": 18697070712960.0, "grad_norm": 2.2817518202979543, "language_loss": 0.8088032, "learning_rate": 1.2240306033023726e-06, "loss": 0.830899, "num_input_tokens_seen": 114210020, "step": 5306, "time_per_iteration": 2.6374611854553223 }, { "auxiliary_loss_clip": 0.01168991, "auxiliary_loss_mlp": 0.01023864, "balance_loss_clip": 0.93140465, "balance_loss_mlp": 1.01643765, "epoch": 0.6381290206216558, "flos": 23331558078720.0, "grad_norm": 1.7989070056112886, "language_loss": 0.72120649, "learning_rate": 1.223312710639611e-06, "loss": 0.74313504, "num_input_tokens_seen": 114228740, "step": 5307, "time_per_iteration": 2.7119176387786865 }, { "auxiliary_loss_clip": 0.01167183, "auxiliary_loss_mlp": 0.01028922, "balance_loss_clip": 0.97229135, "balance_loss_mlp": 1.02131641, "epoch": 0.6382492635122948, "flos": 18880466578560.0, "grad_norm": 2.1984979076979716, "language_loss": 0.86607242, "learning_rate": 1.2225949357996928e-06, "loss": 0.88803351, "num_input_tokens_seen": 114246865, "step": 5308, "time_per_iteration": 2.6325979232788086 }, { "auxiliary_loss_clip": 0.01170996, "auxiliary_loss_mlp": 0.01030644, "balance_loss_clip": 1.0150156, "balance_loss_mlp": 1.02356327, "epoch": 0.6383695064029339, "flos": 27819134818560.0, "grad_norm": 1.462617358201191, "language_loss": 0.80190521, "learning_rate": 1.221877278891505e-06, "loss": 0.82392156, "num_input_tokens_seen": 114266120, "step": 5309, "time_per_iteration": 2.6940865516662598 }, { "auxiliary_loss_clip": 0.0117863, "auxiliary_loss_mlp": 0.01024963, "balance_loss_clip": 1.01312828, "balance_loss_mlp": 1.01757169, "epoch": 0.638489749293573, "flos": 26395635853440.0, "grad_norm": 2.0172921729190345, "language_loss": 0.71166444, "learning_rate": 1.221159740023915e-06, "loss": 0.73370034, "num_input_tokens_seen": 114285950, "step": 5310, "time_per_iteration": 2.662877082824707 }, { "auxiliary_loss_clip": 0.01178243, "auxiliary_loss_mlp": 0.01122838, "balance_loss_clip": 0.93641031, "balance_loss_mlp": 0.0, "epoch": 0.6386099921842121, "flos": 23988328306560.0, "grad_norm": 1.7944117049639337, "language_loss": 0.72497129, "learning_rate": 1.2204423193057735e-06, "loss": 0.74798208, "num_input_tokens_seen": 114304780, "step": 5311, "time_per_iteration": 2.7366411685943604 }, { "auxiliary_loss_clip": 0.01077108, "auxiliary_loss_mlp": 0.01002989, "balance_loss_clip": 0.94153893, "balance_loss_mlp": 1.00139177, "epoch": 0.6387302350748512, "flos": 71731169337600.0, "grad_norm": 0.9697718305704279, "language_loss": 0.63432181, "learning_rate": 1.2197250168459122e-06, "loss": 0.65512288, "num_input_tokens_seen": 114361180, "step": 5312, "time_per_iteration": 3.2715649604797363 }, { "auxiliary_loss_clip": 0.01174587, "auxiliary_loss_mlp": 0.0102641, "balance_loss_clip": 1.01258373, "balance_loss_mlp": 1.01916218, "epoch": 0.6388504779654903, "flos": 14535778141440.0, "grad_norm": 1.7636614395080068, "language_loss": 0.74356896, "learning_rate": 1.2190078327531454e-06, "loss": 0.76557899, "num_input_tokens_seen": 114377425, "step": 5313, "time_per_iteration": 2.598201274871826 }, { "auxiliary_loss_clip": 0.01171917, "auxiliary_loss_mlp": 0.01025796, "balance_loss_clip": 1.01069903, "balance_loss_mlp": 1.01884651, "epoch": 0.6389707208561294, "flos": 22346133384960.0, "grad_norm": 1.4713076522822617, "language_loss": 0.72599483, "learning_rate": 1.2182907671362697e-06, "loss": 0.74797201, "num_input_tokens_seen": 114398120, "step": 5314, "time_per_iteration": 2.6697285175323486 }, { "auxiliary_loss_clip": 0.01171154, "auxiliary_loss_mlp": 0.010304, "balance_loss_clip": 1.01374209, "balance_loss_mlp": 1.02314281, "epoch": 0.6390909637467684, "flos": 19426883247360.0, "grad_norm": 2.286316253600412, "language_loss": 0.78612429, "learning_rate": 1.2175738201040626e-06, "loss": 0.8081398, "num_input_tokens_seen": 114415160, "step": 5315, "time_per_iteration": 2.6210954189300537 }, { "auxiliary_loss_clip": 0.01170462, "auxiliary_loss_mlp": 0.0102655, "balance_loss_clip": 1.01123035, "balance_loss_mlp": 1.01917148, "epoch": 0.6392112066374076, "flos": 24090852700800.0, "grad_norm": 1.7096987208354546, "language_loss": 0.78567779, "learning_rate": 1.2168569917652855e-06, "loss": 0.80764788, "num_input_tokens_seen": 114435015, "step": 5316, "time_per_iteration": 2.7607998847961426 }, { "auxiliary_loss_clip": 0.01174973, "auxiliary_loss_mlp": 0.0102565, "balance_loss_clip": 1.01379848, "balance_loss_mlp": 1.01815498, "epoch": 0.6393314495280467, "flos": 26795141896320.0, "grad_norm": 1.4710086635831854, "language_loss": 0.63655674, "learning_rate": 1.2161402822286797e-06, "loss": 0.65856302, "num_input_tokens_seen": 114455700, "step": 5317, "time_per_iteration": 2.619511842727661 }, { "auxiliary_loss_clip": 0.01169728, "auxiliary_loss_mlp": 0.01029856, "balance_loss_clip": 0.93574882, "balance_loss_mlp": 1.02239382, "epoch": 0.6394516924186857, "flos": 20260692633600.0, "grad_norm": 1.9010091193896534, "language_loss": 0.78964543, "learning_rate": 1.2154236916029703e-06, "loss": 0.81164122, "num_input_tokens_seen": 114473675, "step": 5318, "time_per_iteration": 2.6764628887176514 }, { "auxiliary_loss_clip": 0.01166252, "auxiliary_loss_mlp": 0.01024927, "balance_loss_clip": 0.89204061, "balance_loss_mlp": 1.01701713, "epoch": 0.6395719353093249, "flos": 18368847210240.0, "grad_norm": 2.2140629032736645, "language_loss": 0.73505247, "learning_rate": 1.2147072199968627e-06, "loss": 0.75696427, "num_input_tokens_seen": 114492310, "step": 5319, "time_per_iteration": 2.7351558208465576 }, { "auxiliary_loss_clip": 0.0116885, "auxiliary_loss_mlp": 0.01026112, "balance_loss_clip": 1.01201165, "balance_loss_mlp": 1.01928473, "epoch": 0.6396921781999639, "flos": 17566315591680.0, "grad_norm": 2.021506926951658, "language_loss": 0.71442801, "learning_rate": 1.2139908675190454e-06, "loss": 0.7363776, "num_input_tokens_seen": 114511520, "step": 5320, "time_per_iteration": 3.5400912761688232 }, { "auxiliary_loss_clip": 0.01153693, "auxiliary_loss_mlp": 0.01026258, "balance_loss_clip": 0.85501337, "balance_loss_mlp": 1.01831818, "epoch": 0.639812421090603, "flos": 21251252972160.0, "grad_norm": 2.0961163708460946, "language_loss": 0.74707109, "learning_rate": 1.2132746342781883e-06, "loss": 0.76887059, "num_input_tokens_seen": 114532680, "step": 5321, "time_per_iteration": 2.821512222290039 }, { "auxiliary_loss_clip": 0.01173143, "auxiliary_loss_mlp": 0.01021186, "balance_loss_clip": 1.05051422, "balance_loss_mlp": 1.01407862, "epoch": 0.6399326639812422, "flos": 11180967684480.0, "grad_norm": 2.618796676609036, "language_loss": 0.79918414, "learning_rate": 1.2125585203829442e-06, "loss": 0.82112747, "num_input_tokens_seen": 114548320, "step": 5322, "time_per_iteration": 2.594169855117798 }, { "auxiliary_loss_clip": 0.0116089, "auxiliary_loss_mlp": 0.01030137, "balance_loss_clip": 0.93762702, "balance_loss_mlp": 1.02287686, "epoch": 0.6400529068718812, "flos": 23911048195200.0, "grad_norm": 1.9920107462070997, "language_loss": 0.74301034, "learning_rate": 1.211842525941946e-06, "loss": 0.76492059, "num_input_tokens_seen": 114568115, "step": 5323, "time_per_iteration": 2.7292633056640625 }, { "auxiliary_loss_clip": 0.01164415, "auxiliary_loss_mlp": 0.01021906, "balance_loss_clip": 0.89774317, "balance_loss_mlp": 1.01483703, "epoch": 0.6401731497625203, "flos": 44018724890880.0, "grad_norm": 1.748774626408074, "language_loss": 0.78908515, "learning_rate": 1.2111266510638105e-06, "loss": 0.81094837, "num_input_tokens_seen": 114591040, "step": 5324, "time_per_iteration": 2.8810391426086426 }, { "auxiliary_loss_clip": 0.0115863, "auxiliary_loss_mlp": 0.01026398, "balance_loss_clip": 0.85602498, "balance_loss_mlp": 1.01934052, "epoch": 0.6402933926531594, "flos": 20662209838080.0, "grad_norm": 1.703466761064731, "language_loss": 0.80208248, "learning_rate": 1.2104108958571346e-06, "loss": 0.82393283, "num_input_tokens_seen": 114609310, "step": 5325, "time_per_iteration": 3.766639232635498 }, { "auxiliary_loss_clip": 0.01169469, "auxiliary_loss_mlp": 0.01029679, "balance_loss_clip": 1.01450467, "balance_loss_mlp": 1.02271152, "epoch": 0.6404136355437985, "flos": 24863327614080.0, "grad_norm": 1.464951393432187, "language_loss": 0.75878406, "learning_rate": 1.2096952604304975e-06, "loss": 0.78077543, "num_input_tokens_seen": 114629740, "step": 5326, "time_per_iteration": 2.6960084438323975 }, { "auxiliary_loss_clip": 0.01174647, "auxiliary_loss_mlp": 0.01022556, "balance_loss_clip": 1.01150155, "balance_loss_mlp": 1.01526952, "epoch": 0.6405338784344375, "flos": 40479548901120.0, "grad_norm": 2.1604936374626917, "language_loss": 0.70632219, "learning_rate": 1.2089797448924616e-06, "loss": 0.72829419, "num_input_tokens_seen": 114653615, "step": 5327, "time_per_iteration": 3.67322039604187 }, { "auxiliary_loss_clip": 0.0117078, "auxiliary_loss_mlp": 0.01028314, "balance_loss_clip": 0.89388859, "balance_loss_mlp": 1.02081573, "epoch": 0.6406541213250767, "flos": 20886041439360.0, "grad_norm": 1.9465282230449399, "language_loss": 0.65767032, "learning_rate": 1.2082643493515692e-06, "loss": 0.67966127, "num_input_tokens_seen": 114671935, "step": 5328, "time_per_iteration": 2.704739809036255 }, { "auxiliary_loss_clip": 0.01173425, "auxiliary_loss_mlp": 0.01025983, "balance_loss_clip": 1.01347625, "balance_loss_mlp": 1.01898575, "epoch": 0.6407743642157158, "flos": 23295970679040.0, "grad_norm": 1.7219787618707616, "language_loss": 0.81555575, "learning_rate": 1.207549073916346e-06, "loss": 0.83754981, "num_input_tokens_seen": 114692870, "step": 5329, "time_per_iteration": 2.7030014991760254 }, { "auxiliary_loss_clip": 0.01159562, "auxiliary_loss_mlp": 0.01027212, "balance_loss_clip": 0.97231036, "balance_loss_mlp": 1.01966023, "epoch": 0.6408946071063548, "flos": 15012636122880.0, "grad_norm": 2.0491827831783676, "language_loss": 0.77976072, "learning_rate": 1.2068339186952976e-06, "loss": 0.80162847, "num_input_tokens_seen": 114710410, "step": 5330, "time_per_iteration": 2.608365058898926 }, { "auxiliary_loss_clip": 0.01175031, "auxiliary_loss_mlp": 0.01026943, "balance_loss_clip": 1.01262772, "balance_loss_mlp": 1.01970744, "epoch": 0.6410148499969939, "flos": 22528595496960.0, "grad_norm": 1.6795811759567023, "language_loss": 0.73034906, "learning_rate": 1.2061188837969136e-06, "loss": 0.75236881, "num_input_tokens_seen": 114730020, "step": 5331, "time_per_iteration": 2.6843931674957275 }, { "auxiliary_loss_clip": 0.01163104, "auxiliary_loss_mlp": 0.01024084, "balance_loss_clip": 0.93219912, "balance_loss_mlp": 1.01668751, "epoch": 0.641135092887633, "flos": 12422004537600.0, "grad_norm": 2.3883471949256347, "language_loss": 0.8411665, "learning_rate": 1.2054039693296631e-06, "loss": 0.86303836, "num_input_tokens_seen": 114748015, "step": 5332, "time_per_iteration": 2.6993846893310547 }, { "auxiliary_loss_clip": 0.01162034, "auxiliary_loss_mlp": 0.0102485, "balance_loss_clip": 0.93157923, "balance_loss_mlp": 1.01797724, "epoch": 0.6412553357782721, "flos": 22127329687680.0, "grad_norm": 1.989056097207865, "language_loss": 0.81306517, "learning_rate": 1.2046891754019992e-06, "loss": 0.834934, "num_input_tokens_seen": 114768625, "step": 5333, "time_per_iteration": 2.724975109100342 }, { "auxiliary_loss_clip": 0.01175489, "auxiliary_loss_mlp": 0.01025053, "balance_loss_clip": 1.01425457, "balance_loss_mlp": 1.01836205, "epoch": 0.6413755786689112, "flos": 15888605097600.0, "grad_norm": 2.180003331411685, "language_loss": 0.82554221, "learning_rate": 1.2039745021223548e-06, "loss": 0.84754759, "num_input_tokens_seen": 114786045, "step": 5334, "time_per_iteration": 2.6166269779205322 }, { "auxiliary_loss_clip": 0.01074964, "auxiliary_loss_mlp": 0.01000416, "balance_loss_clip": 0.86699879, "balance_loss_mlp": 0.99871176, "epoch": 0.6414958215595503, "flos": 68039159955840.0, "grad_norm": 0.789367582932735, "language_loss": 0.5711844, "learning_rate": 1.2032599495991456e-06, "loss": 0.59193814, "num_input_tokens_seen": 114850785, "step": 5335, "time_per_iteration": 3.4260647296905518 }, { "auxiliary_loss_clip": 0.01173121, "auxiliary_loss_mlp": 0.01026385, "balance_loss_clip": 1.01409364, "balance_loss_mlp": 1.01873136, "epoch": 0.6416160644501894, "flos": 44091300320640.0, "grad_norm": 1.5616119772057493, "language_loss": 0.69429857, "learning_rate": 1.2025455179407685e-06, "loss": 0.71629363, "num_input_tokens_seen": 114871945, "step": 5336, "time_per_iteration": 2.807809829711914 }, { "auxiliary_loss_clip": 0.01169991, "auxiliary_loss_mlp": 0.0112291, "balance_loss_clip": 1.012218, "balance_loss_mlp": 0.0, "epoch": 0.6417363073408284, "flos": 20959837931520.0, "grad_norm": 2.041301780002135, "language_loss": 0.74326921, "learning_rate": 1.2018312072556022e-06, "loss": 0.76619822, "num_input_tokens_seen": 114890445, "step": 5337, "time_per_iteration": 2.607635021209717 }, { "auxiliary_loss_clip": 0.01171269, "auxiliary_loss_mlp": 0.01122342, "balance_loss_clip": 1.05019498, "balance_loss_mlp": 0.0, "epoch": 0.6418565502314676, "flos": 22455122227200.0, "grad_norm": 1.7305588301728394, "language_loss": 0.74379784, "learning_rate": 1.2011170176520077e-06, "loss": 0.76673388, "num_input_tokens_seen": 114911360, "step": 5338, "time_per_iteration": 2.591027021408081 }, { "auxiliary_loss_clip": 0.01144999, "auxiliary_loss_mlp": 0.01022832, "balance_loss_clip": 0.85457349, "balance_loss_mlp": 1.0153904, "epoch": 0.6419767931221066, "flos": 25045502417280.0, "grad_norm": 1.4872347643103332, "language_loss": 0.81226087, "learning_rate": 1.2004029492383256e-06, "loss": 0.83393919, "num_input_tokens_seen": 114932700, "step": 5339, "time_per_iteration": 2.7830872535705566 }, { "auxiliary_loss_clip": 0.0117203, "auxiliary_loss_mlp": 0.01024719, "balance_loss_clip": 1.01430631, "balance_loss_mlp": 1.01739335, "epoch": 0.6420970360127457, "flos": 19463691709440.0, "grad_norm": 2.7058585282239354, "language_loss": 0.73688561, "learning_rate": 1.1996890021228814e-06, "loss": 0.75885308, "num_input_tokens_seen": 114949475, "step": 5340, "time_per_iteration": 2.6135146617889404 }, { "auxiliary_loss_clip": 0.0116285, "auxiliary_loss_mlp": 0.01022602, "balance_loss_clip": 0.96955335, "balance_loss_mlp": 1.01528275, "epoch": 0.6422172789033849, "flos": 40406147458560.0, "grad_norm": 1.5370648282170822, "language_loss": 0.69732845, "learning_rate": 1.1989751764139785e-06, "loss": 0.71918309, "num_input_tokens_seen": 114973125, "step": 5341, "time_per_iteration": 2.8435423374176025 }, { "auxiliary_loss_clip": 0.01160043, "auxiliary_loss_mlp": 0.01020277, "balance_loss_clip": 0.89045489, "balance_loss_mlp": 1.0130167, "epoch": 0.6423375217940239, "flos": 27672870637440.0, "grad_norm": 1.5196626323023479, "language_loss": 0.83012921, "learning_rate": 1.1982614722199044e-06, "loss": 0.85193235, "num_input_tokens_seen": 114994300, "step": 5342, "time_per_iteration": 2.817350149154663 }, { "auxiliary_loss_clip": 0.01175286, "auxiliary_loss_mlp": 0.01025247, "balance_loss_clip": 0.97274041, "balance_loss_mlp": 1.01843417, "epoch": 0.642457764684663, "flos": 18369242259840.0, "grad_norm": 1.9991402743002882, "language_loss": 0.7806946, "learning_rate": 1.1975478896489276e-06, "loss": 0.80269992, "num_input_tokens_seen": 115012135, "step": 5343, "time_per_iteration": 2.6220901012420654 }, { "auxiliary_loss_clip": 0.01171207, "auxiliary_loss_mlp": 0.0102452, "balance_loss_clip": 1.04956341, "balance_loss_mlp": 1.01732922, "epoch": 0.6425780075753021, "flos": 19750509809280.0, "grad_norm": 1.8488503270185528, "language_loss": 0.76312071, "learning_rate": 1.1968344288092981e-06, "loss": 0.78507793, "num_input_tokens_seen": 115028715, "step": 5344, "time_per_iteration": 2.6460368633270264 }, { "auxiliary_loss_clip": 0.01172622, "auxiliary_loss_mlp": 0.01122101, "balance_loss_clip": 1.01242352, "balance_loss_mlp": 0.0, "epoch": 0.6426982504659412, "flos": 20558536208640.0, "grad_norm": 1.7914632580014003, "language_loss": 0.64842969, "learning_rate": 1.1961210898092468e-06, "loss": 0.67137688, "num_input_tokens_seen": 115047665, "step": 5345, "time_per_iteration": 3.972600221633911 }, { "auxiliary_loss_clip": 0.01177638, "auxiliary_loss_mlp": 0.0102891, "balance_loss_clip": 0.97465223, "balance_loss_mlp": 1.02101874, "epoch": 0.6428184933565803, "flos": 17851984456320.0, "grad_norm": 1.9883516326514679, "language_loss": 0.79304183, "learning_rate": 1.1954078727569874e-06, "loss": 0.81510729, "num_input_tokens_seen": 115064965, "step": 5346, "time_per_iteration": 2.6789145469665527 }, { "auxiliary_loss_clip": 0.01174933, "auxiliary_loss_mlp": 0.01122695, "balance_loss_clip": 0.93496084, "balance_loss_mlp": 0.0, "epoch": 0.6429387362472194, "flos": 22456953820800.0, "grad_norm": 1.6115616784176594, "language_loss": 0.78202116, "learning_rate": 1.1946947777607141e-06, "loss": 0.8049975, "num_input_tokens_seen": 115086100, "step": 5347, "time_per_iteration": 2.836571216583252 }, { "auxiliary_loss_clip": 0.01160973, "auxiliary_loss_mlp": 0.01026356, "balance_loss_clip": 0.89627701, "balance_loss_mlp": 1.0186615, "epoch": 0.6430589791378585, "flos": 24752579005440.0, "grad_norm": 1.9754615269087485, "language_loss": 0.80313772, "learning_rate": 1.1939818049286024e-06, "loss": 0.82501107, "num_input_tokens_seen": 115104260, "step": 5348, "time_per_iteration": 2.710754156112671 }, { "auxiliary_loss_clip": 0.01155004, "auxiliary_loss_mlp": 0.01025919, "balance_loss_clip": 0.85730469, "balance_loss_mlp": 1.0185461, "epoch": 0.6431792220284975, "flos": 24901249397760.0, "grad_norm": 1.8901667642126954, "language_loss": 0.75408113, "learning_rate": 1.1932689543688101e-06, "loss": 0.77589035, "num_input_tokens_seen": 115125365, "step": 5349, "time_per_iteration": 2.8413853645324707 }, { "auxiliary_loss_clip": 0.01172329, "auxiliary_loss_mlp": 0.0102453, "balance_loss_clip": 0.97617412, "balance_loss_mlp": 1.01730013, "epoch": 0.6432994649191367, "flos": 21032305620480.0, "grad_norm": 1.7659184806061539, "language_loss": 0.72877645, "learning_rate": 1.1925562261894756e-06, "loss": 0.75074506, "num_input_tokens_seen": 115144445, "step": 5350, "time_per_iteration": 2.6575794219970703 }, { "auxiliary_loss_clip": 0.01162279, "auxiliary_loss_mlp": 0.01025475, "balance_loss_clip": 0.97140509, "balance_loss_mlp": 1.01857603, "epoch": 0.6434197078097758, "flos": 30884433655680.0, "grad_norm": 1.7137976122132756, "language_loss": 0.77211607, "learning_rate": 1.1918436204987207e-06, "loss": 0.79399359, "num_input_tokens_seen": 115166305, "step": 5351, "time_per_iteration": 4.484054803848267 }, { "auxiliary_loss_clip": 0.01172157, "auxiliary_loss_mlp": 0.01027347, "balance_loss_clip": 1.01469731, "balance_loss_mlp": 1.01989079, "epoch": 0.6435399507004148, "flos": 15012492468480.0, "grad_norm": 1.9606652290082365, "language_loss": 0.81627595, "learning_rate": 1.191131137404645e-06, "loss": 0.83827102, "num_input_tokens_seen": 115183045, "step": 5352, "time_per_iteration": 3.7850754261016846 }, { "auxiliary_loss_clip": 0.01154517, "auxiliary_loss_mlp": 0.01026723, "balance_loss_clip": 0.93249869, "balance_loss_mlp": 1.01915574, "epoch": 0.643660193591054, "flos": 19901981462400.0, "grad_norm": 2.138940487978071, "language_loss": 0.77066267, "learning_rate": 1.190418777015333e-06, "loss": 0.79247504, "num_input_tokens_seen": 115201955, "step": 5353, "time_per_iteration": 2.692643404006958 }, { "auxiliary_loss_clip": 0.01169806, "auxiliary_loss_mlp": 0.01023472, "balance_loss_clip": 0.97287947, "balance_loss_mlp": 1.01660585, "epoch": 0.643780436481693, "flos": 24133622820480.0, "grad_norm": 1.4477100761217385, "language_loss": 0.73401022, "learning_rate": 1.1897065394388487e-06, "loss": 0.755943, "num_input_tokens_seen": 115222395, "step": 5354, "time_per_iteration": 2.6916863918304443 }, { "auxiliary_loss_clip": 0.0117551, "auxiliary_loss_mlp": 0.01028983, "balance_loss_clip": 0.97956258, "balance_loss_mlp": 1.02200913, "epoch": 0.6439006793723321, "flos": 23148808657920.0, "grad_norm": 1.6288902011990776, "language_loss": 0.76552987, "learning_rate": 1.1889944247832385e-06, "loss": 0.78757483, "num_input_tokens_seen": 115242635, "step": 5355, "time_per_iteration": 2.707145929336548 }, { "auxiliary_loss_clip": 0.01175154, "auxiliary_loss_mlp": 0.01026913, "balance_loss_clip": 1.01158476, "balance_loss_mlp": 1.01911652, "epoch": 0.6440209222629713, "flos": 23617909301760.0, "grad_norm": 1.9114648110926435, "language_loss": 0.71070468, "learning_rate": 1.1882824331565283e-06, "loss": 0.73272538, "num_input_tokens_seen": 115262095, "step": 5356, "time_per_iteration": 2.665707588195801 }, { "auxiliary_loss_clip": 0.01160338, "auxiliary_loss_mlp": 0.01029812, "balance_loss_clip": 0.93130946, "balance_loss_mlp": 1.02294302, "epoch": 0.6441411651536103, "flos": 16544872535040.0, "grad_norm": 2.39048139470583, "language_loss": 0.89293444, "learning_rate": 1.1875705646667287e-06, "loss": 0.91483593, "num_input_tokens_seen": 115279985, "step": 5357, "time_per_iteration": 2.7989859580993652 }, { "auxiliary_loss_clip": 0.01166132, "auxiliary_loss_mlp": 0.01030281, "balance_loss_clip": 1.00883222, "balance_loss_mlp": 1.02325988, "epoch": 0.6442614080442494, "flos": 25410965345280.0, "grad_norm": 2.328654029418337, "language_loss": 0.75061095, "learning_rate": 1.1868588194218282e-06, "loss": 0.77257508, "num_input_tokens_seen": 115300365, "step": 5358, "time_per_iteration": 2.621924638748169 }, { "auxiliary_loss_clip": 0.01174246, "auxiliary_loss_mlp": 0.01024684, "balance_loss_clip": 0.97204149, "balance_loss_mlp": 1.01722193, "epoch": 0.6443816509348885, "flos": 28294017552000.0, "grad_norm": 1.6164608724093215, "language_loss": 0.74285334, "learning_rate": 1.1861471975297979e-06, "loss": 0.76484263, "num_input_tokens_seen": 115322060, "step": 5359, "time_per_iteration": 2.8059422969818115 }, { "auxiliary_loss_clip": 0.01165729, "auxiliary_loss_mlp": 0.01025919, "balance_loss_clip": 0.93632799, "balance_loss_mlp": 1.01852202, "epoch": 0.6445018938255276, "flos": 36690075964800.0, "grad_norm": 1.7764208769822185, "language_loss": 0.70653301, "learning_rate": 1.185435699098591e-06, "loss": 0.72844952, "num_input_tokens_seen": 115348255, "step": 5360, "time_per_iteration": 2.8363513946533203 }, { "auxiliary_loss_clip": 0.01172194, "auxiliary_loss_mlp": 0.0103234, "balance_loss_clip": 0.97131598, "balance_loss_mlp": 1.02468407, "epoch": 0.6446221367161666, "flos": 14501411804160.0, "grad_norm": 2.2406636652825767, "language_loss": 0.78230166, "learning_rate": 1.1847243242361403e-06, "loss": 0.80434692, "num_input_tokens_seen": 115366845, "step": 5361, "time_per_iteration": 2.6602602005004883 }, { "auxiliary_loss_clip": 0.0117555, "auxiliary_loss_mlp": 0.01024475, "balance_loss_clip": 0.97481847, "balance_loss_mlp": 1.0168519, "epoch": 0.6447423796068057, "flos": 24609367480320.0, "grad_norm": 1.8495930499580457, "language_loss": 0.77953458, "learning_rate": 1.1840130730503624e-06, "loss": 0.80153489, "num_input_tokens_seen": 115388125, "step": 5362, "time_per_iteration": 2.699272871017456 }, { "auxiliary_loss_clip": 0.01172055, "auxiliary_loss_mlp": 0.0102825, "balance_loss_clip": 1.04938591, "balance_loss_mlp": 1.02102566, "epoch": 0.6448626224974449, "flos": 25047298097280.0, "grad_norm": 1.7541442647660197, "language_loss": 0.74803424, "learning_rate": 1.1833019456491518e-06, "loss": 0.77003729, "num_input_tokens_seen": 115409655, "step": 5363, "time_per_iteration": 2.65234637260437 }, { "auxiliary_loss_clip": 0.01171, "auxiliary_loss_mlp": 0.01027157, "balance_loss_clip": 1.01205528, "balance_loss_mlp": 1.02023435, "epoch": 0.6449828653880839, "flos": 22530355263360.0, "grad_norm": 2.1920096342878055, "language_loss": 0.78711337, "learning_rate": 1.1825909421403871e-06, "loss": 0.80909491, "num_input_tokens_seen": 115428750, "step": 5364, "time_per_iteration": 2.6178135871887207 }, { "auxiliary_loss_clip": 0.01173853, "auxiliary_loss_mlp": 0.01027802, "balance_loss_clip": 1.01225662, "balance_loss_mlp": 1.02037513, "epoch": 0.645103108278723, "flos": 25695736369920.0, "grad_norm": 1.732985098240545, "language_loss": 0.76161379, "learning_rate": 1.181880062631926e-06, "loss": 0.78363031, "num_input_tokens_seen": 115448085, "step": 5365, "time_per_iteration": 2.645230770111084 }, { "auxiliary_loss_clip": 0.01164304, "auxiliary_loss_mlp": 0.01030436, "balance_loss_clip": 0.97373009, "balance_loss_mlp": 1.02285099, "epoch": 0.6452233511693621, "flos": 27450331925760.0, "grad_norm": 3.7522438839079713, "language_loss": 0.84691793, "learning_rate": 1.1811693072316093e-06, "loss": 0.86886537, "num_input_tokens_seen": 115465765, "step": 5366, "time_per_iteration": 2.6699676513671875 }, { "auxiliary_loss_clip": 0.01171054, "auxiliary_loss_mlp": 0.01122782, "balance_loss_clip": 1.04819489, "balance_loss_mlp": 0.0, "epoch": 0.6453435940600012, "flos": 19208618254080.0, "grad_norm": 2.7213659149192395, "language_loss": 0.84231418, "learning_rate": 1.1804586760472574e-06, "loss": 0.86525249, "num_input_tokens_seen": 115482230, "step": 5367, "time_per_iteration": 2.5926156044006348 }, { "auxiliary_loss_clip": 0.01165505, "auxiliary_loss_mlp": 0.01026586, "balance_loss_clip": 0.93299109, "balance_loss_mlp": 1.0189743, "epoch": 0.6454638369506402, "flos": 25737680476800.0, "grad_norm": 2.3480320726095227, "language_loss": 0.80223572, "learning_rate": 1.1797481691866736e-06, "loss": 0.82415664, "num_input_tokens_seen": 115499455, "step": 5368, "time_per_iteration": 2.6987009048461914 }, { "auxiliary_loss_clip": 0.01161997, "auxiliary_loss_mlp": 0.01029309, "balance_loss_clip": 0.97332072, "balance_loss_mlp": 1.02222157, "epoch": 0.6455840798412794, "flos": 20989176364800.0, "grad_norm": 1.9538810816725891, "language_loss": 0.83021021, "learning_rate": 1.1790377867576393e-06, "loss": 0.85212326, "num_input_tokens_seen": 115517205, "step": 5369, "time_per_iteration": 2.6432275772094727 }, { "auxiliary_loss_clip": 0.01171218, "auxiliary_loss_mlp": 0.01024409, "balance_loss_clip": 0.97094524, "balance_loss_mlp": 1.01724112, "epoch": 0.6457043227319185, "flos": 26067556005120.0, "grad_norm": 1.7183113266479226, "language_loss": 0.76457924, "learning_rate": 1.1783275288679203e-06, "loss": 0.7865355, "num_input_tokens_seen": 115534370, "step": 5370, "time_per_iteration": 2.7040369510650635 }, { "auxiliary_loss_clip": 0.01071474, "auxiliary_loss_mlp": 0.01000385, "balance_loss_clip": 0.97593951, "balance_loss_mlp": 0.99883479, "epoch": 0.6458245656225575, "flos": 60370831088640.0, "grad_norm": 0.8669054650016347, "language_loss": 0.57184005, "learning_rate": 1.177617395625262e-06, "loss": 0.59255862, "num_input_tokens_seen": 115592345, "step": 5371, "time_per_iteration": 3.1978421211242676 }, { "auxiliary_loss_clip": 0.01172088, "auxiliary_loss_mlp": 0.01023475, "balance_loss_clip": 1.01240826, "balance_loss_mlp": 1.0163672, "epoch": 0.6459448085131967, "flos": 23076771932160.0, "grad_norm": 1.9243187556197952, "language_loss": 0.75593156, "learning_rate": 1.1769073871373908e-06, "loss": 0.77788717, "num_input_tokens_seen": 115612550, "step": 5372, "time_per_iteration": 3.5313098430633545 }, { "auxiliary_loss_clip": 0.0116346, "auxiliary_loss_mlp": 0.01024618, "balance_loss_clip": 0.93144441, "balance_loss_mlp": 1.01754308, "epoch": 0.6460650514038357, "flos": 22598190097920.0, "grad_norm": 1.709349058233637, "language_loss": 0.83885849, "learning_rate": 1.176197503512015e-06, "loss": 0.86073935, "num_input_tokens_seen": 115632265, "step": 5373, "time_per_iteration": 2.667088747024536 }, { "auxiliary_loss_clip": 0.01163039, "auxiliary_loss_mlp": 0.01026616, "balance_loss_clip": 0.9706918, "balance_loss_mlp": 1.01923668, "epoch": 0.6461852942944748, "flos": 20266726118400.0, "grad_norm": 2.2166920859778028, "language_loss": 0.8269515, "learning_rate": 1.1754877448568223e-06, "loss": 0.84884799, "num_input_tokens_seen": 115651720, "step": 5374, "time_per_iteration": 2.6366162300109863 }, { "auxiliary_loss_clip": 0.01165605, "auxiliary_loss_mlp": 0.01025108, "balance_loss_clip": 0.97120607, "balance_loss_mlp": 1.01751149, "epoch": 0.646305537185114, "flos": 23367109564800.0, "grad_norm": 3.701291022801738, "language_loss": 0.89994472, "learning_rate": 1.1747781112794837e-06, "loss": 0.92185187, "num_input_tokens_seen": 115668215, "step": 5375, "time_per_iteration": 2.6864845752716064 }, { "auxiliary_loss_clip": 0.01164248, "auxiliary_loss_mlp": 0.01025348, "balance_loss_clip": 0.93434513, "balance_loss_mlp": 1.01870501, "epoch": 0.646425780075753, "flos": 24277480790400.0, "grad_norm": 1.5628534430321226, "language_loss": 0.83363229, "learning_rate": 1.1740686028876487e-06, "loss": 0.85552824, "num_input_tokens_seen": 115687080, "step": 5376, "time_per_iteration": 2.6890430450439453 }, { "auxiliary_loss_clip": 0.01168021, "auxiliary_loss_mlp": 0.01024303, "balance_loss_clip": 1.01204062, "balance_loss_mlp": 1.01694143, "epoch": 0.6465460229663921, "flos": 20813968800000.0, "grad_norm": 3.7472597917312567, "language_loss": 0.74825382, "learning_rate": 1.1733592197889507e-06, "loss": 0.77017701, "num_input_tokens_seen": 115703990, "step": 5377, "time_per_iteration": 5.40869402885437 }, { "auxiliary_loss_clip": 0.01167686, "auxiliary_loss_mlp": 0.01027745, "balance_loss_clip": 1.01196742, "balance_loss_mlp": 1.02100396, "epoch": 0.6466662658570312, "flos": 22853299466880.0, "grad_norm": 1.72147066533808, "language_loss": 0.72620404, "learning_rate": 1.1726499620910014e-06, "loss": 0.74815834, "num_input_tokens_seen": 115724270, "step": 5378, "time_per_iteration": 2.6605114936828613 }, { "auxiliary_loss_clip": 0.01170199, "auxiliary_loss_mlp": 0.01025372, "balance_loss_clip": 1.01276934, "balance_loss_mlp": 1.01742077, "epoch": 0.6467865087476703, "flos": 15304553953920.0, "grad_norm": 1.8829294258082276, "language_loss": 0.77036643, "learning_rate": 1.1719408299013955e-06, "loss": 0.79232216, "num_input_tokens_seen": 115742995, "step": 5379, "time_per_iteration": 2.6338560581207275 }, { "auxiliary_loss_clip": 0.01172081, "auxiliary_loss_mlp": 0.01023845, "balance_loss_clip": 1.05067587, "balance_loss_mlp": 1.01656175, "epoch": 0.6469067516383094, "flos": 19573650218880.0, "grad_norm": 2.958535830421578, "language_loss": 0.75485456, "learning_rate": 1.1712318233277067e-06, "loss": 0.77681386, "num_input_tokens_seen": 115762015, "step": 5380, "time_per_iteration": 2.588498830795288 }, { "auxiliary_loss_clip": 0.01071189, "auxiliary_loss_mlp": 0.01002844, "balance_loss_clip": 0.97636008, "balance_loss_mlp": 1.00135982, "epoch": 0.6470269945289485, "flos": 65098002522240.0, "grad_norm": 0.8099738513056588, "language_loss": 0.57914639, "learning_rate": 1.1705229424774916e-06, "loss": 0.59988666, "num_input_tokens_seen": 115816285, "step": 5381, "time_per_iteration": 3.0682859420776367 }, { "auxiliary_loss_clip": 0.01164338, "auxiliary_loss_mlp": 0.01024971, "balance_loss_clip": 0.97095931, "balance_loss_mlp": 1.01734757, "epoch": 0.6471472374195876, "flos": 30696943639680.0, "grad_norm": 1.931432126549577, "language_loss": 0.64570892, "learning_rate": 1.1698141874582867e-06, "loss": 0.66760206, "num_input_tokens_seen": 115837330, "step": 5382, "time_per_iteration": 2.76190447807312 }, { "auxiliary_loss_clip": 0.01169108, "auxiliary_loss_mlp": 0.01024986, "balance_loss_clip": 1.04956269, "balance_loss_mlp": 1.01805067, "epoch": 0.6472674803102266, "flos": 20521835487360.0, "grad_norm": 1.7319262884734488, "language_loss": 0.72305685, "learning_rate": 1.169105558377609e-06, "loss": 0.7449978, "num_input_tokens_seen": 115857420, "step": 5383, "time_per_iteration": 2.5645852088928223 }, { "auxiliary_loss_clip": 0.01169303, "auxiliary_loss_mlp": 0.01123077, "balance_loss_clip": 0.90105391, "balance_loss_mlp": 0.0, "epoch": 0.6473877232008658, "flos": 24715447320960.0, "grad_norm": 1.6130949397794678, "language_loss": 0.78190351, "learning_rate": 1.1683970553429587e-06, "loss": 0.80482733, "num_input_tokens_seen": 115878875, "step": 5384, "time_per_iteration": 2.8011410236358643 }, { "auxiliary_loss_clip": 0.01175017, "auxiliary_loss_mlp": 0.0102829, "balance_loss_clip": 0.93730378, "balance_loss_mlp": 1.02027607, "epoch": 0.6475079660915048, "flos": 15885552441600.0, "grad_norm": 1.905083071702668, "language_loss": 0.82007283, "learning_rate": 1.1676886784618128e-06, "loss": 0.84210587, "num_input_tokens_seen": 115895540, "step": 5385, "time_per_iteration": 2.6721320152282715 }, { "auxiliary_loss_clip": 0.01171938, "auxiliary_loss_mlp": 0.01026801, "balance_loss_clip": 1.01246691, "balance_loss_mlp": 1.01926064, "epoch": 0.6476282089821439, "flos": 17381590922880.0, "grad_norm": 2.3245621836177923, "language_loss": 0.83895522, "learning_rate": 1.1669804278416332e-06, "loss": 0.86094266, "num_input_tokens_seen": 115910265, "step": 5386, "time_per_iteration": 2.6136181354522705 }, { "auxiliary_loss_clip": 0.01176372, "auxiliary_loss_mlp": 0.01025058, "balance_loss_clip": 0.97470969, "balance_loss_mlp": 1.01751828, "epoch": 0.6477484518727831, "flos": 20194078861440.0, "grad_norm": 1.7146071352464882, "language_loss": 0.7120111, "learning_rate": 1.1662723035898602e-06, "loss": 0.73402536, "num_input_tokens_seen": 115930025, "step": 5387, "time_per_iteration": 2.656660318374634 }, { "auxiliary_loss_clip": 0.01170671, "auxiliary_loss_mlp": 0.01025276, "balance_loss_clip": 1.01139069, "balance_loss_mlp": 1.01783454, "epoch": 0.6478686947634221, "flos": 25410426641280.0, "grad_norm": 1.7636708727270223, "language_loss": 0.81978655, "learning_rate": 1.165564305813915e-06, "loss": 0.84174597, "num_input_tokens_seen": 115949025, "step": 5388, "time_per_iteration": 2.6563332080841064 }, { "auxiliary_loss_clip": 0.01172377, "auxiliary_loss_mlp": 0.0102429, "balance_loss_clip": 1.01277101, "balance_loss_mlp": 1.01761413, "epoch": 0.6479889376540612, "flos": 20083581648000.0, "grad_norm": 1.7411181446654405, "language_loss": 0.81127375, "learning_rate": 1.1648564346212019e-06, "loss": 0.83324045, "num_input_tokens_seen": 115968145, "step": 5389, "time_per_iteration": 2.7495532035827637 }, { "auxiliary_loss_clip": 0.01166599, "auxiliary_loss_mlp": 0.01024906, "balance_loss_clip": 1.0120585, "balance_loss_mlp": 1.01770282, "epoch": 0.6481091805447003, "flos": 26758082039040.0, "grad_norm": 1.9067666020655958, "language_loss": 0.761944, "learning_rate": 1.164148690119104e-06, "loss": 0.78385907, "num_input_tokens_seen": 115989425, "step": 5390, "time_per_iteration": 2.7334561347961426 }, { "auxiliary_loss_clip": 0.01169492, "auxiliary_loss_mlp": 0.0102315, "balance_loss_clip": 1.05019855, "balance_loss_mlp": 1.01607823, "epoch": 0.6482294234353394, "flos": 23952094462080.0, "grad_norm": 1.6548610631226042, "language_loss": 0.7425189, "learning_rate": 1.163441072414985e-06, "loss": 0.7644453, "num_input_tokens_seen": 116009630, "step": 5391, "time_per_iteration": 2.654085636138916 }, { "auxiliary_loss_clip": 0.0117126, "auxiliary_loss_mlp": 0.01024813, "balance_loss_clip": 1.01281428, "balance_loss_mlp": 1.0171566, "epoch": 0.6483496663259785, "flos": 26209833776640.0, "grad_norm": 1.700686648592255, "language_loss": 0.69946039, "learning_rate": 1.16273358161619e-06, "loss": 0.72142112, "num_input_tokens_seen": 116029965, "step": 5392, "time_per_iteration": 2.6607913970947266 }, { "auxiliary_loss_clip": 0.0117939, "auxiliary_loss_mlp": 0.01028718, "balance_loss_clip": 0.97517753, "balance_loss_mlp": 1.02086473, "epoch": 0.6484699092166175, "flos": 20922239370240.0, "grad_norm": 2.2130973910011753, "language_loss": 0.83155149, "learning_rate": 1.1620262178300446e-06, "loss": 0.85363257, "num_input_tokens_seen": 116048580, "step": 5393, "time_per_iteration": 2.7471306324005127 }, { "auxiliary_loss_clip": 0.01167418, "auxiliary_loss_mlp": 0.01024928, "balance_loss_clip": 0.93244857, "balance_loss_mlp": 1.01777804, "epoch": 0.6485901521072567, "flos": 33072865678080.0, "grad_norm": 1.861240135038781, "language_loss": 0.75867915, "learning_rate": 1.1613189811638563e-06, "loss": 0.78060257, "num_input_tokens_seen": 116070305, "step": 5394, "time_per_iteration": 2.7757740020751953 }, { "auxiliary_loss_clip": 0.01171602, "auxiliary_loss_mlp": 0.01027751, "balance_loss_clip": 1.01338577, "balance_loss_mlp": 1.02090812, "epoch": 0.6487103949978957, "flos": 22274060745600.0, "grad_norm": 1.5090208242317293, "language_loss": 0.77870524, "learning_rate": 1.1606118717249117e-06, "loss": 0.80069882, "num_input_tokens_seen": 116090405, "step": 5395, "time_per_iteration": 2.675257444381714 }, { "auxiliary_loss_clip": 0.0117387, "auxiliary_loss_mlp": 0.01027677, "balance_loss_clip": 1.04857254, "balance_loss_mlp": 1.02061105, "epoch": 0.6488306378885348, "flos": 22930400010240.0, "grad_norm": 1.9222090968023184, "language_loss": 0.67674923, "learning_rate": 1.1599048896204787e-06, "loss": 0.69876468, "num_input_tokens_seen": 116110285, "step": 5396, "time_per_iteration": 2.590062141418457 }, { "auxiliary_loss_clip": 0.01173679, "auxiliary_loss_mlp": 0.01026325, "balance_loss_clip": 0.93570518, "balance_loss_mlp": 1.01869571, "epoch": 0.648950880779174, "flos": 20376110010240.0, "grad_norm": 1.6304594314171683, "language_loss": 0.80888712, "learning_rate": 1.1591980349578061e-06, "loss": 0.83088714, "num_input_tokens_seen": 116128955, "step": 5397, "time_per_iteration": 2.6795167922973633 }, { "auxiliary_loss_clip": 0.01071933, "auxiliary_loss_mlp": 0.0099885, "balance_loss_clip": 0.90002048, "balance_loss_mlp": 0.99721634, "epoch": 0.649071123669813, "flos": 59930889310080.0, "grad_norm": 0.734842157648244, "language_loss": 0.54313278, "learning_rate": 1.158491307844123e-06, "loss": 0.56384057, "num_input_tokens_seen": 116188875, "step": 5398, "time_per_iteration": 4.12629508972168 }, { "auxiliary_loss_clip": 0.01169496, "auxiliary_loss_mlp": 0.01028972, "balance_loss_clip": 0.97568393, "balance_loss_mlp": 1.02170622, "epoch": 0.6491913665604521, "flos": 20446566537600.0, "grad_norm": 1.5599952667506403, "language_loss": 0.83724809, "learning_rate": 1.1577847083866387e-06, "loss": 0.85923278, "num_input_tokens_seen": 116207910, "step": 5399, "time_per_iteration": 2.621541738510132 }, { "auxiliary_loss_clip": 0.01158317, "auxiliary_loss_mlp": 0.01022385, "balance_loss_clip": 0.97149444, "balance_loss_mlp": 1.01504445, "epoch": 0.6493116094510912, "flos": 16946820702720.0, "grad_norm": 1.7399624693541775, "language_loss": 0.7224853, "learning_rate": 1.1570782366925453e-06, "loss": 0.74429238, "num_input_tokens_seen": 116226425, "step": 5400, "time_per_iteration": 2.6269524097442627 }, { "auxiliary_loss_clip": 0.01170327, "auxiliary_loss_mlp": 0.01025611, "balance_loss_clip": 0.97125435, "balance_loss_mlp": 1.01861978, "epoch": 0.6494318523417303, "flos": 18802935072000.0, "grad_norm": 2.600858546168585, "language_loss": 0.75317204, "learning_rate": 1.1563718928690132e-06, "loss": 0.7751314, "num_input_tokens_seen": 116243860, "step": 5401, "time_per_iteration": 2.661386013031006 }, { "auxiliary_loss_clip": 0.01168753, "auxiliary_loss_mlp": 0.01029213, "balance_loss_clip": 0.93725282, "balance_loss_mlp": 1.02164936, "epoch": 0.6495520952323693, "flos": 18982847318400.0, "grad_norm": 2.0333430276615503, "language_loss": 0.70777428, "learning_rate": 1.1556656770231942e-06, "loss": 0.72975397, "num_input_tokens_seen": 116260055, "step": 5402, "time_per_iteration": 3.58663010597229 }, { "auxiliary_loss_clip": 0.01169815, "auxiliary_loss_mlp": 0.01023832, "balance_loss_clip": 1.00984645, "balance_loss_mlp": 1.01703095, "epoch": 0.6496723381230085, "flos": 22745388032640.0, "grad_norm": 1.5387074944256782, "language_loss": 0.76168478, "learning_rate": 1.1549595892622207e-06, "loss": 0.78362119, "num_input_tokens_seen": 116278825, "step": 5403, "time_per_iteration": 3.6257457733154297 }, { "auxiliary_loss_clip": 0.01076157, "auxiliary_loss_mlp": 0.01003647, "balance_loss_clip": 0.87023103, "balance_loss_mlp": 1.00188243, "epoch": 0.6497925810136476, "flos": 62145283887360.0, "grad_norm": 0.822714934231164, "language_loss": 0.59001201, "learning_rate": 1.1542536296932047e-06, "loss": 0.61081004, "num_input_tokens_seen": 116342360, "step": 5404, "time_per_iteration": 3.2988667488098145 }, { "auxiliary_loss_clip": 0.01174148, "auxiliary_loss_mlp": 0.01027513, "balance_loss_clip": 0.93382663, "balance_loss_mlp": 1.01968122, "epoch": 0.6499128239042866, "flos": 20156731695360.0, "grad_norm": 1.6991513217744783, "language_loss": 0.70150685, "learning_rate": 1.1535477984232414e-06, "loss": 0.72352338, "num_input_tokens_seen": 116362235, "step": 5405, "time_per_iteration": 2.7289695739746094 }, { "auxiliary_loss_clip": 0.01166679, "auxiliary_loss_mlp": 0.01026803, "balance_loss_clip": 0.89117229, "balance_loss_mlp": 1.01987398, "epoch": 0.6500330667949258, "flos": 24462420940800.0, "grad_norm": 1.7266006880741216, "language_loss": 0.76740956, "learning_rate": 1.152842095559404e-06, "loss": 0.78934443, "num_input_tokens_seen": 116382895, "step": 5406, "time_per_iteration": 2.8237202167510986 }, { "auxiliary_loss_clip": 0.01174808, "auxiliary_loss_mlp": 0.01025918, "balance_loss_clip": 0.97260678, "balance_loss_mlp": 1.01858652, "epoch": 0.6501533096855648, "flos": 25477399549440.0, "grad_norm": 1.5989325873821627, "language_loss": 0.76592481, "learning_rate": 1.1521365212087474e-06, "loss": 0.78793204, "num_input_tokens_seen": 116402880, "step": 5407, "time_per_iteration": 2.7505762577056885 }, { "auxiliary_loss_clip": 0.01169722, "auxiliary_loss_mlp": 0.01023333, "balance_loss_clip": 1.01037264, "balance_loss_mlp": 1.01595998, "epoch": 0.6502735525762039, "flos": 44819245347840.0, "grad_norm": 1.6368894238808485, "language_loss": 0.70384163, "learning_rate": 1.1514310754783062e-06, "loss": 0.72577214, "num_input_tokens_seen": 116425830, "step": 5408, "time_per_iteration": 2.8815131187438965 }, { "auxiliary_loss_clip": 0.01173261, "auxiliary_loss_mlp": 0.01025448, "balance_loss_clip": 0.97383124, "balance_loss_mlp": 1.01848626, "epoch": 0.6503937954668431, "flos": 28658546726400.0, "grad_norm": 1.7914137970714992, "language_loss": 0.73424363, "learning_rate": 1.1507257584750964e-06, "loss": 0.75623071, "num_input_tokens_seen": 116446010, "step": 5409, "time_per_iteration": 2.6992411613464355 }, { "auxiliary_loss_clip": 0.01176711, "auxiliary_loss_mlp": 0.01025212, "balance_loss_clip": 1.0537765, "balance_loss_mlp": 1.01771998, "epoch": 0.6505140383574821, "flos": 20922562592640.0, "grad_norm": 2.4674384485150767, "language_loss": 0.77513826, "learning_rate": 1.150020570306113e-06, "loss": 0.79715753, "num_input_tokens_seen": 116465150, "step": 5410, "time_per_iteration": 2.654125452041626 }, { "auxiliary_loss_clip": 0.01161635, "auxiliary_loss_mlp": 0.01026785, "balance_loss_clip": 0.97068387, "balance_loss_mlp": 1.01903582, "epoch": 0.6506342812481212, "flos": 20595236929920.0, "grad_norm": 1.6184152499475353, "language_loss": 0.7481603, "learning_rate": 1.1493155110783338e-06, "loss": 0.77004451, "num_input_tokens_seen": 116483675, "step": 5411, "time_per_iteration": 2.719010353088379 }, { "auxiliary_loss_clip": 0.01171167, "auxiliary_loss_mlp": 0.01025003, "balance_loss_clip": 1.01183128, "balance_loss_mlp": 1.01750481, "epoch": 0.6507545241387603, "flos": 30226478279040.0, "grad_norm": 1.9809244455480681, "language_loss": 0.70966434, "learning_rate": 1.1486105808987155e-06, "loss": 0.73162603, "num_input_tokens_seen": 116505165, "step": 5412, "time_per_iteration": 2.684224843978882 }, { "auxiliary_loss_clip": 0.01170268, "auxiliary_loss_mlp": 0.01026859, "balance_loss_clip": 1.01125956, "balance_loss_mlp": 1.01922703, "epoch": 0.6508747670293994, "flos": 17128241320320.0, "grad_norm": 1.820281038041891, "language_loss": 0.80902672, "learning_rate": 1.1479057798741947e-06, "loss": 0.83099806, "num_input_tokens_seen": 116523220, "step": 5413, "time_per_iteration": 2.6858296394348145 }, { "auxiliary_loss_clip": 0.01081573, "auxiliary_loss_mlp": 0.01001156, "balance_loss_clip": 0.95110208, "balance_loss_mlp": 0.99932063, "epoch": 0.6509950099200384, "flos": 68559826573440.0, "grad_norm": 0.7844471320688458, "language_loss": 0.53410184, "learning_rate": 1.14720110811169e-06, "loss": 0.55492914, "num_input_tokens_seen": 116580450, "step": 5414, "time_per_iteration": 3.2464888095855713 }, { "auxiliary_loss_clip": 0.01175895, "auxiliary_loss_mlp": 0.01026344, "balance_loss_clip": 1.01317632, "balance_loss_mlp": 1.01823211, "epoch": 0.6511152528106776, "flos": 22347462188160.0, "grad_norm": 2.121032520841048, "language_loss": 0.76657391, "learning_rate": 1.146496565718098e-06, "loss": 0.78859627, "num_input_tokens_seen": 116601020, "step": 5415, "time_per_iteration": 2.6792054176330566 }, { "auxiliary_loss_clip": 0.0117256, "auxiliary_loss_mlp": 0.01025147, "balance_loss_clip": 0.97519225, "balance_loss_mlp": 1.01716626, "epoch": 0.6512354957013167, "flos": 20522158709760.0, "grad_norm": 1.8628304895343542, "language_loss": 0.75643957, "learning_rate": 1.1457921528002996e-06, "loss": 0.77841669, "num_input_tokens_seen": 116619455, "step": 5416, "time_per_iteration": 2.7274653911590576 }, { "auxiliary_loss_clip": 0.01172649, "auxiliary_loss_mlp": 0.01123015, "balance_loss_clip": 1.05006719, "balance_loss_mlp": 0.0, "epoch": 0.6513557385919557, "flos": 32337342881280.0, "grad_norm": 2.212839920416578, "language_loss": 0.72585452, "learning_rate": 1.1450878694651522e-06, "loss": 0.74881113, "num_input_tokens_seen": 116640020, "step": 5417, "time_per_iteration": 2.7712299823760986 }, { "auxiliary_loss_clip": 0.01164734, "auxiliary_loss_mlp": 0.01029049, "balance_loss_clip": 0.89241958, "balance_loss_mlp": 1.02144909, "epoch": 0.6514759814825949, "flos": 12093206417280.0, "grad_norm": 2.1505943651086152, "language_loss": 0.63629967, "learning_rate": 1.1443837158194954e-06, "loss": 0.65823746, "num_input_tokens_seen": 116655165, "step": 5418, "time_per_iteration": 2.698920726776123 }, { "auxiliary_loss_clip": 0.01173617, "auxiliary_loss_mlp": 0.01029197, "balance_loss_clip": 0.93853295, "balance_loss_mlp": 1.02155626, "epoch": 0.651596224373234, "flos": 22526907557760.0, "grad_norm": 1.5777454138239402, "language_loss": 0.74325407, "learning_rate": 1.1436796919701484e-06, "loss": 0.76528227, "num_input_tokens_seen": 116673880, "step": 5419, "time_per_iteration": 2.7440147399902344 }, { "auxiliary_loss_clip": 0.01172067, "auxiliary_loss_mlp": 0.01026945, "balance_loss_clip": 0.97453368, "balance_loss_mlp": 1.01938152, "epoch": 0.651716467263873, "flos": 27818955250560.0, "grad_norm": 3.759883226061594, "language_loss": 0.6159035, "learning_rate": 1.1429757980239115e-06, "loss": 0.63789356, "num_input_tokens_seen": 116694305, "step": 5420, "time_per_iteration": 2.8082034587860107 }, { "auxiliary_loss_clip": 0.01175794, "auxiliary_loss_mlp": 0.01027061, "balance_loss_clip": 1.05023837, "balance_loss_mlp": 1.01860023, "epoch": 0.6518367101545122, "flos": 24316300414080.0, "grad_norm": 3.1664911005173155, "language_loss": 0.8215462, "learning_rate": 1.1422720340875636e-06, "loss": 0.84357476, "num_input_tokens_seen": 116713055, "step": 5421, "time_per_iteration": 2.718379259109497 }, { "auxiliary_loss_clip": 0.01175316, "auxiliary_loss_mlp": 0.01026102, "balance_loss_clip": 1.0111022, "balance_loss_mlp": 1.01857364, "epoch": 0.6519569530451512, "flos": 20011939971840.0, "grad_norm": 2.0688503454860303, "language_loss": 0.79422206, "learning_rate": 1.1415684002678671e-06, "loss": 0.81623626, "num_input_tokens_seen": 116731815, "step": 5422, "time_per_iteration": 2.650423526763916 }, { "auxiliary_loss_clip": 0.01168752, "auxiliary_loss_mlp": 0.01031082, "balance_loss_clip": 0.97128487, "balance_loss_mlp": 1.02318406, "epoch": 0.6520771959357903, "flos": 21576064682880.0, "grad_norm": 2.255501636088057, "language_loss": 0.77711785, "learning_rate": 1.1408648966715617e-06, "loss": 0.79911613, "num_input_tokens_seen": 116749335, "step": 5423, "time_per_iteration": 2.669645071029663 }, { "auxiliary_loss_clip": 0.01166483, "auxiliary_loss_mlp": 0.01031413, "balance_loss_clip": 0.96929228, "balance_loss_mlp": 1.02383161, "epoch": 0.6521974388264293, "flos": 22711021695360.0, "grad_norm": 1.7333664028914717, "language_loss": 0.72466511, "learning_rate": 1.1401615234053683e-06, "loss": 0.74664408, "num_input_tokens_seen": 116768155, "step": 5424, "time_per_iteration": 3.613328456878662 }, { "auxiliary_loss_clip": 0.01169892, "auxiliary_loss_mlp": 0.01027476, "balance_loss_clip": 0.97132349, "balance_loss_mlp": 1.0196383, "epoch": 0.6523176817170685, "flos": 23002939526400.0, "grad_norm": 4.694282963657878, "language_loss": 0.75948644, "learning_rate": 1.1394582805759885e-06, "loss": 0.78146005, "num_input_tokens_seen": 116787435, "step": 5425, "time_per_iteration": 2.6465823650360107 }, { "auxiliary_loss_clip": 0.01171286, "auxiliary_loss_mlp": 0.01026163, "balance_loss_clip": 1.01326847, "balance_loss_mlp": 1.0186646, "epoch": 0.6524379246077076, "flos": 21688249835520.0, "grad_norm": 1.7229041256363633, "language_loss": 0.75578576, "learning_rate": 1.1387551682901022e-06, "loss": 0.77776027, "num_input_tokens_seen": 116808040, "step": 5426, "time_per_iteration": 2.6876542568206787 }, { "auxiliary_loss_clip": 0.01164735, "auxiliary_loss_mlp": 0.01029467, "balance_loss_clip": 0.93506938, "balance_loss_mlp": 1.02164674, "epoch": 0.6525581674983466, "flos": 19390936711680.0, "grad_norm": 2.3068984920026554, "language_loss": 0.70626497, "learning_rate": 1.138052186654373e-06, "loss": 0.72820699, "num_input_tokens_seen": 116825510, "step": 5427, "time_per_iteration": 2.6847572326660156 }, { "auxiliary_loss_clip": 0.0117137, "auxiliary_loss_mlp": 0.01027799, "balance_loss_clip": 0.9735142, "balance_loss_mlp": 1.02013946, "epoch": 0.6526784103889858, "flos": 17165444832000.0, "grad_norm": 2.3144654608798882, "language_loss": 0.88333666, "learning_rate": 1.1373493357754417e-06, "loss": 0.90532827, "num_input_tokens_seen": 116844415, "step": 5428, "time_per_iteration": 3.538104772567749 }, { "auxiliary_loss_clip": 0.01170404, "auxiliary_loss_mlp": 0.01021793, "balance_loss_clip": 1.04758286, "balance_loss_mlp": 1.01482511, "epoch": 0.6527986532796248, "flos": 18989168112000.0, "grad_norm": 1.6219092129847557, "language_loss": 0.7716009, "learning_rate": 1.1366466157599303e-06, "loss": 0.79352289, "num_input_tokens_seen": 116863690, "step": 5429, "time_per_iteration": 3.5684525966644287 }, { "auxiliary_loss_clip": 0.0116064, "auxiliary_loss_mlp": 0.01122401, "balance_loss_clip": 0.8940503, "balance_loss_mlp": 0.0, "epoch": 0.6529188961702639, "flos": 14238581011200.0, "grad_norm": 1.8724381648478776, "language_loss": 0.75993896, "learning_rate": 1.1359440267144412e-06, "loss": 0.78276938, "num_input_tokens_seen": 116881145, "step": 5430, "time_per_iteration": 3.6893129348754883 }, { "auxiliary_loss_clip": 0.01172579, "auxiliary_loss_mlp": 0.0102534, "balance_loss_clip": 1.01134753, "balance_loss_mlp": 1.0181334, "epoch": 0.653039139060903, "flos": 36682929158400.0, "grad_norm": 2.1726918366851464, "language_loss": 0.74216145, "learning_rate": 1.1352415687455556e-06, "loss": 0.76414061, "num_input_tokens_seen": 116902405, "step": 5431, "time_per_iteration": 2.749373197555542 }, { "auxiliary_loss_clip": 0.01172134, "auxiliary_loss_mlp": 0.01023112, "balance_loss_clip": 1.01331747, "balance_loss_mlp": 1.01551843, "epoch": 0.6531593819515421, "flos": 25376275785600.0, "grad_norm": 2.3527962412280474, "language_loss": 0.64047641, "learning_rate": 1.1345392419598362e-06, "loss": 0.66242886, "num_input_tokens_seen": 116921285, "step": 5432, "time_per_iteration": 2.743912696838379 }, { "auxiliary_loss_clip": 0.01162615, "auxiliary_loss_mlp": 0.01024411, "balance_loss_clip": 1.00903893, "balance_loss_mlp": 1.01709127, "epoch": 0.6532796248421812, "flos": 21178533888000.0, "grad_norm": 1.5341395801518165, "language_loss": 0.71898687, "learning_rate": 1.1338370464638263e-06, "loss": 0.74085718, "num_input_tokens_seen": 116940685, "step": 5433, "time_per_iteration": 2.7438321113586426 }, { "auxiliary_loss_clip": 0.01171161, "auxiliary_loss_mlp": 0.01028828, "balance_loss_clip": 1.04896879, "balance_loss_mlp": 1.02147293, "epoch": 0.6533998677328203, "flos": 17675950878720.0, "grad_norm": 2.3163780882456986, "language_loss": 0.63580596, "learning_rate": 1.1331349823640474e-06, "loss": 0.65780592, "num_input_tokens_seen": 116958115, "step": 5434, "time_per_iteration": 2.592410087585449 }, { "auxiliary_loss_clip": 0.01170835, "auxiliary_loss_mlp": 0.01121481, "balance_loss_clip": 1.01076102, "balance_loss_mlp": 0.0, "epoch": 0.6535201106234594, "flos": 28400384701440.0, "grad_norm": 5.200143532158661, "language_loss": 0.77789015, "learning_rate": 1.132433049767003e-06, "loss": 0.80081332, "num_input_tokens_seen": 116976030, "step": 5435, "time_per_iteration": 2.659801959991455 }, { "auxiliary_loss_clip": 0.01167624, "auxiliary_loss_mlp": 0.01024258, "balance_loss_clip": 0.9739086, "balance_loss_mlp": 1.01716161, "epoch": 0.6536403535140984, "flos": 23586667447680.0, "grad_norm": 1.5920776750864367, "language_loss": 0.81334436, "learning_rate": 1.1317312487791748e-06, "loss": 0.83526319, "num_input_tokens_seen": 116997680, "step": 5436, "time_per_iteration": 2.789611339569092 }, { "auxiliary_loss_clip": 0.01162349, "auxiliary_loss_mlp": 0.01024395, "balance_loss_clip": 1.00827265, "balance_loss_mlp": 1.0171175, "epoch": 0.6537605964047376, "flos": 21579476474880.0, "grad_norm": 1.7158931574813165, "language_loss": 0.72859466, "learning_rate": 1.1310295795070253e-06, "loss": 0.75046206, "num_input_tokens_seen": 117017620, "step": 5437, "time_per_iteration": 2.714845895767212 }, { "auxiliary_loss_clip": 0.01169537, "auxiliary_loss_mlp": 0.01022183, "balance_loss_clip": 0.89476693, "balance_loss_mlp": 1.01562035, "epoch": 0.6538808392953767, "flos": 26833997433600.0, "grad_norm": 1.7267656110135898, "language_loss": 0.80849636, "learning_rate": 1.1303280420569982e-06, "loss": 0.83041358, "num_input_tokens_seen": 117039505, "step": 5438, "time_per_iteration": 2.874546527862549 }, { "auxiliary_loss_clip": 0.01161399, "auxiliary_loss_mlp": 0.01024151, "balance_loss_clip": 1.00867724, "balance_loss_mlp": 1.01699829, "epoch": 0.6540010821860157, "flos": 30738241301760.0, "grad_norm": 1.5966530970160002, "language_loss": 0.7737838, "learning_rate": 1.1296266365355158e-06, "loss": 0.79563928, "num_input_tokens_seen": 117062890, "step": 5439, "time_per_iteration": 2.7047407627105713 }, { "auxiliary_loss_clip": 0.01172977, "auxiliary_loss_mlp": 0.01025879, "balance_loss_clip": 0.936818, "balance_loss_mlp": 1.01818681, "epoch": 0.6541213250766549, "flos": 26907147480960.0, "grad_norm": 4.553297918317413, "language_loss": 0.73619497, "learning_rate": 1.1289253630489806e-06, "loss": 0.7581836, "num_input_tokens_seen": 117083940, "step": 5440, "time_per_iteration": 2.768402099609375 }, { "auxiliary_loss_clip": 0.01174996, "auxiliary_loss_mlp": 0.01027711, "balance_loss_clip": 1.01064467, "balance_loss_mlp": 1.01960158, "epoch": 0.6542415679672939, "flos": 19172384409600.0, "grad_norm": 1.9864677545543021, "language_loss": 0.72377557, "learning_rate": 1.1282242217037753e-06, "loss": 0.74580264, "num_input_tokens_seen": 117101440, "step": 5441, "time_per_iteration": 2.601083517074585 }, { "auxiliary_loss_clip": 0.011587, "auxiliary_loss_mlp": 0.01021952, "balance_loss_clip": 0.89169735, "balance_loss_mlp": 1.01359558, "epoch": 0.654361810857933, "flos": 48173517100800.0, "grad_norm": 2.4052434622854943, "language_loss": 0.6180191, "learning_rate": 1.127523212606262e-06, "loss": 0.6398257, "num_input_tokens_seen": 117124265, "step": 5442, "time_per_iteration": 2.949490547180176 }, { "auxiliary_loss_clip": 0.01170013, "auxiliary_loss_mlp": 0.01025531, "balance_loss_clip": 1.01196527, "balance_loss_mlp": 1.01818752, "epoch": 0.6544820537485722, "flos": 26943165843840.0, "grad_norm": 1.4551131729603048, "language_loss": 0.73027098, "learning_rate": 1.1268223358627835e-06, "loss": 0.75222647, "num_input_tokens_seen": 117146755, "step": 5443, "time_per_iteration": 2.6650590896606445 }, { "auxiliary_loss_clip": 0.0117615, "auxiliary_loss_mlp": 0.01024106, "balance_loss_clip": 1.05178738, "balance_loss_mlp": 1.01694155, "epoch": 0.6546022966392112, "flos": 20886328748160.0, "grad_norm": 3.2447186742071046, "language_loss": 0.71927053, "learning_rate": 1.126121591579663e-06, "loss": 0.74127311, "num_input_tokens_seen": 117165960, "step": 5444, "time_per_iteration": 2.6049468517303467 }, { "auxiliary_loss_clip": 0.0116741, "auxiliary_loss_mlp": 0.01025099, "balance_loss_clip": 1.01141834, "balance_loss_mlp": 1.01859355, "epoch": 0.6547225395298503, "flos": 24936693143040.0, "grad_norm": 1.6278413575168253, "language_loss": 0.69000423, "learning_rate": 1.1254209798632018e-06, "loss": 0.71192932, "num_input_tokens_seen": 117186980, "step": 5445, "time_per_iteration": 2.6421163082122803 }, { "auxiliary_loss_clip": 0.01152986, "auxiliary_loss_mlp": 0.01028734, "balance_loss_clip": 0.85535467, "balance_loss_mlp": 1.02155495, "epoch": 0.6548427824204894, "flos": 22565942663040.0, "grad_norm": 1.6876251259031023, "language_loss": 0.84749514, "learning_rate": 1.124720500819683e-06, "loss": 0.86931235, "num_input_tokens_seen": 117205135, "step": 5446, "time_per_iteration": 2.695808172225952 }, { "auxiliary_loss_clip": 0.01172812, "auxiliary_loss_mlp": 0.01025136, "balance_loss_clip": 1.05009508, "balance_loss_mlp": 1.01739895, "epoch": 0.6549630253111285, "flos": 18442500048000.0, "grad_norm": 4.240540402238748, "language_loss": 0.82277608, "learning_rate": 1.1240201545553682e-06, "loss": 0.84475553, "num_input_tokens_seen": 117222935, "step": 5447, "time_per_iteration": 2.592836618423462 }, { "auxiliary_loss_clip": 0.01165718, "auxiliary_loss_mlp": 0.01023312, "balance_loss_clip": 0.9342376, "balance_loss_mlp": 1.0160135, "epoch": 0.6550832682017675, "flos": 25187313312000.0, "grad_norm": 1.6642340251331496, "language_loss": 0.72801626, "learning_rate": 1.1233199411764987e-06, "loss": 0.74990654, "num_input_tokens_seen": 117242370, "step": 5448, "time_per_iteration": 2.779064178466797 }, { "auxiliary_loss_clip": 0.01151715, "auxiliary_loss_mlp": 0.01023745, "balance_loss_clip": 0.93124288, "balance_loss_mlp": 1.01677752, "epoch": 0.6552035110924067, "flos": 22748153379840.0, "grad_norm": 1.8519080208710812, "language_loss": 0.69087791, "learning_rate": 1.1226198607892978e-06, "loss": 0.71263254, "num_input_tokens_seen": 117262930, "step": 5449, "time_per_iteration": 2.8404152393341064 }, { "auxiliary_loss_clip": 0.01168767, "auxiliary_loss_mlp": 0.0103015, "balance_loss_clip": 0.8977015, "balance_loss_mlp": 1.02316463, "epoch": 0.6553237539830458, "flos": 21799178012160.0, "grad_norm": 3.236198399780482, "language_loss": 0.80013311, "learning_rate": 1.1219199134999664e-06, "loss": 0.82212228, "num_input_tokens_seen": 117281430, "step": 5450, "time_per_iteration": 3.6185688972473145 }, { "auxiliary_loss_clip": 0.01172608, "auxiliary_loss_mlp": 0.01028431, "balance_loss_clip": 0.97213024, "balance_loss_mlp": 1.02119517, "epoch": 0.6554439968736848, "flos": 20887226588160.0, "grad_norm": 2.361278453348682, "language_loss": 0.78601974, "learning_rate": 1.1212200994146863e-06, "loss": 0.80803013, "num_input_tokens_seen": 117299185, "step": 5451, "time_per_iteration": 2.6563563346862793 }, { "auxiliary_loss_clip": 0.01161225, "auxiliary_loss_mlp": 0.01023123, "balance_loss_clip": 0.92937356, "balance_loss_mlp": 1.01562476, "epoch": 0.655564239764324, "flos": 16139045698560.0, "grad_norm": 8.771617093052908, "language_loss": 0.75729686, "learning_rate": 1.120520418639618e-06, "loss": 0.77914035, "num_input_tokens_seen": 117317720, "step": 5452, "time_per_iteration": 2.7170538902282715 }, { "auxiliary_loss_clip": 0.01170858, "auxiliary_loss_mlp": 0.0102827, "balance_loss_clip": 1.01331115, "balance_loss_mlp": 1.02107263, "epoch": 0.655684482654963, "flos": 29570354496000.0, "grad_norm": 2.144535741140699, "language_loss": 0.83252937, "learning_rate": 1.119820871280903e-06, "loss": 0.85452068, "num_input_tokens_seen": 117338795, "step": 5453, "time_per_iteration": 2.6907451152801514 }, { "auxiliary_loss_clip": 0.01166866, "auxiliary_loss_mlp": 0.01030566, "balance_loss_clip": 1.01024759, "balance_loss_mlp": 1.02370524, "epoch": 0.6558047255456021, "flos": 29789409588480.0, "grad_norm": 2.4096572755206567, "language_loss": 0.73737061, "learning_rate": 1.1191214574446614e-06, "loss": 0.75934494, "num_input_tokens_seen": 117359040, "step": 5454, "time_per_iteration": 3.5594398975372314 }, { "auxiliary_loss_clip": 0.01158207, "auxiliary_loss_mlp": 0.0102656, "balance_loss_clip": 0.96911561, "balance_loss_mlp": 1.01920223, "epoch": 0.6559249684362413, "flos": 29059166090880.0, "grad_norm": 1.41636433195908, "language_loss": 0.79919517, "learning_rate": 1.118422177236995e-06, "loss": 0.82104284, "num_input_tokens_seen": 117380865, "step": 5455, "time_per_iteration": 3.671154260635376 }, { "auxiliary_loss_clip": 0.01168392, "auxiliary_loss_mlp": 0.01030445, "balance_loss_clip": 0.97011065, "balance_loss_mlp": 1.02305436, "epoch": 0.6560452113268803, "flos": 20225464369920.0, "grad_norm": 1.8897705142658854, "language_loss": 0.8560608, "learning_rate": 1.1177230307639835e-06, "loss": 0.87804925, "num_input_tokens_seen": 117398405, "step": 5456, "time_per_iteration": 3.6555635929107666 }, { "auxiliary_loss_clip": 0.01160613, "auxiliary_loss_mlp": 0.01026694, "balance_loss_clip": 0.93299127, "balance_loss_mlp": 1.01928496, "epoch": 0.6561654542175194, "flos": 25045538330880.0, "grad_norm": 1.5711904083456711, "language_loss": 0.78495526, "learning_rate": 1.1170240181316865e-06, "loss": 0.80682832, "num_input_tokens_seen": 117419850, "step": 5457, "time_per_iteration": 2.7447638511657715 }, { "auxiliary_loss_clip": 0.0116238, "auxiliary_loss_mlp": 0.01025372, "balance_loss_clip": 0.93044239, "balance_loss_mlp": 1.01806998, "epoch": 0.6562856971081584, "flos": 22856711258880.0, "grad_norm": 4.732942397776752, "language_loss": 0.79523325, "learning_rate": 1.1163251394461442e-06, "loss": 0.81711078, "num_input_tokens_seen": 117438330, "step": 5458, "time_per_iteration": 2.800708293914795 }, { "auxiliary_loss_clip": 0.01165584, "auxiliary_loss_mlp": 0.01027664, "balance_loss_clip": 1.01159263, "balance_loss_mlp": 1.02010632, "epoch": 0.6564059399987976, "flos": 18872565586560.0, "grad_norm": 1.823036859416767, "language_loss": 0.82000399, "learning_rate": 1.1156263948133746e-06, "loss": 0.84193653, "num_input_tokens_seen": 117454985, "step": 5459, "time_per_iteration": 2.628899097442627 }, { "auxiliary_loss_clip": 0.01156886, "auxiliary_loss_mlp": 0.01123048, "balance_loss_clip": 0.89421797, "balance_loss_mlp": 0.0, "epoch": 0.6565261828894366, "flos": 25484187219840.0, "grad_norm": 1.8353560528756376, "language_loss": 0.77656829, "learning_rate": 1.1149277843393787e-06, "loss": 0.79936761, "num_input_tokens_seen": 117476145, "step": 5460, "time_per_iteration": 2.7805004119873047 }, { "auxiliary_loss_clip": 0.01157936, "auxiliary_loss_mlp": 0.01122649, "balance_loss_clip": 0.85163355, "balance_loss_mlp": 0.0, "epoch": 0.6566464257800757, "flos": 19683500987520.0, "grad_norm": 3.723133693756934, "language_loss": 0.634758, "learning_rate": 1.1142293081301342e-06, "loss": 0.65756381, "num_input_tokens_seen": 117494025, "step": 5461, "time_per_iteration": 2.814318895339966 }, { "auxiliary_loss_clip": 0.01163371, "auxiliary_loss_mlp": 0.01026674, "balance_loss_clip": 0.97260499, "balance_loss_mlp": 1.01958382, "epoch": 0.6567666686707149, "flos": 23514127931520.0, "grad_norm": 1.6371097862012314, "language_loss": 0.67939532, "learning_rate": 1.1135309662915995e-06, "loss": 0.70129573, "num_input_tokens_seen": 117514190, "step": 5462, "time_per_iteration": 2.7947373390197754 }, { "auxiliary_loss_clip": 0.01171074, "auxiliary_loss_mlp": 0.01030371, "balance_loss_clip": 0.89360243, "balance_loss_mlp": 1.02296829, "epoch": 0.6568869115613539, "flos": 32781342896640.0, "grad_norm": 1.9222371982762194, "language_loss": 0.60547948, "learning_rate": 1.112832758929712e-06, "loss": 0.62749398, "num_input_tokens_seen": 117536800, "step": 5463, "time_per_iteration": 2.8291492462158203 }, { "auxiliary_loss_clip": 0.01172087, "auxiliary_loss_mlp": 0.01026098, "balance_loss_clip": 1.01321197, "balance_loss_mlp": 1.0179913, "epoch": 0.657007154451993, "flos": 18442428220800.0, "grad_norm": 1.5747591577434619, "language_loss": 0.74925864, "learning_rate": 1.11213468615039e-06, "loss": 0.77124053, "num_input_tokens_seen": 117556230, "step": 5464, "time_per_iteration": 2.6677582263946533 }, { "auxiliary_loss_clip": 0.01164396, "auxiliary_loss_mlp": 0.01027336, "balance_loss_clip": 0.85660404, "balance_loss_mlp": 1.02001321, "epoch": 0.6571273973426321, "flos": 25156717902720.0, "grad_norm": 1.6210798405446418, "language_loss": 0.7532959, "learning_rate": 1.1114367480595292e-06, "loss": 0.77521324, "num_input_tokens_seen": 117577310, "step": 5465, "time_per_iteration": 2.763575315475464 }, { "auxiliary_loss_clip": 0.01162778, "auxiliary_loss_mlp": 0.01030131, "balance_loss_clip": 0.85566258, "balance_loss_mlp": 1.02178669, "epoch": 0.6572476402332712, "flos": 17529830352000.0, "grad_norm": 2.0774038935286687, "language_loss": 0.8118794, "learning_rate": 1.1107389447630086e-06, "loss": 0.83380842, "num_input_tokens_seen": 117596010, "step": 5466, "time_per_iteration": 2.721489667892456 }, { "auxiliary_loss_clip": 0.01159639, "auxiliary_loss_mlp": 0.01122433, "balance_loss_clip": 0.96886182, "balance_loss_mlp": 0.0, "epoch": 0.6573678831239103, "flos": 17014260487680.0, "grad_norm": 2.2458450780519015, "language_loss": 0.78149426, "learning_rate": 1.1100412763666818e-06, "loss": 0.80431497, "num_input_tokens_seen": 117611270, "step": 5467, "time_per_iteration": 2.6435601711273193 }, { "auxiliary_loss_clip": 0.01171437, "auxiliary_loss_mlp": 0.01021382, "balance_loss_clip": 0.97354758, "balance_loss_mlp": 1.01443172, "epoch": 0.6574881260145494, "flos": 23910078528000.0, "grad_norm": 2.2918644667670307, "language_loss": 0.79866117, "learning_rate": 1.1093437429763865e-06, "loss": 0.82058936, "num_input_tokens_seen": 117631535, "step": 5468, "time_per_iteration": 2.7809267044067383 }, { "auxiliary_loss_clip": 0.01170664, "auxiliary_loss_mlp": 0.0102378, "balance_loss_clip": 1.01143789, "balance_loss_mlp": 1.01683605, "epoch": 0.6576083689051885, "flos": 11218458504960.0, "grad_norm": 1.9812293207641156, "language_loss": 0.7333287, "learning_rate": 1.1086463446979361e-06, "loss": 0.7552731, "num_input_tokens_seen": 117649885, "step": 5469, "time_per_iteration": 2.729140043258667 }, { "auxiliary_loss_clip": 0.01174605, "auxiliary_loss_mlp": 0.01026, "balance_loss_clip": 1.01506972, "balance_loss_mlp": 1.01856434, "epoch": 0.6577286117958275, "flos": 22455553190400.0, "grad_norm": 1.8234052718864648, "language_loss": 0.77414691, "learning_rate": 1.1079490816371277e-06, "loss": 0.79615301, "num_input_tokens_seen": 117669650, "step": 5470, "time_per_iteration": 2.715773344039917 }, { "auxiliary_loss_clip": 0.011706, "auxiliary_loss_mlp": 0.01122822, "balance_loss_clip": 1.01081157, "balance_loss_mlp": 0.0, "epoch": 0.6578488546864667, "flos": 21872184405120.0, "grad_norm": 2.4928954094323332, "language_loss": 0.74485099, "learning_rate": 1.1072519538997352e-06, "loss": 0.76778519, "num_input_tokens_seen": 117688790, "step": 5471, "time_per_iteration": 2.624626874923706 }, { "auxiliary_loss_clip": 0.0116774, "auxiliary_loss_mlp": 0.01025118, "balance_loss_clip": 0.96930456, "balance_loss_mlp": 1.01736951, "epoch": 0.6579690975771058, "flos": 23543753673600.0, "grad_norm": 1.5599429484523513, "language_loss": 0.82530314, "learning_rate": 1.1065549615915095e-06, "loss": 0.84723175, "num_input_tokens_seen": 117708620, "step": 5472, "time_per_iteration": 2.736445188522339 }, { "auxiliary_loss_clip": 0.01171689, "auxiliary_loss_mlp": 0.01027444, "balance_loss_clip": 1.01483083, "balance_loss_mlp": 1.02007389, "epoch": 0.6580893404677448, "flos": 32743995730560.0, "grad_norm": 2.4923062250001555, "language_loss": 0.78029835, "learning_rate": 1.105858104818187e-06, "loss": 0.80228972, "num_input_tokens_seen": 117729775, "step": 5473, "time_per_iteration": 2.7767653465270996 }, { "auxiliary_loss_clip": 0.01175618, "auxiliary_loss_mlp": 0.01025837, "balance_loss_clip": 1.01266575, "balance_loss_mlp": 1.0180999, "epoch": 0.658209583358384, "flos": 15888138220800.0, "grad_norm": 2.262425090850409, "language_loss": 0.7496053, "learning_rate": 1.105161383685478e-06, "loss": 0.7716198, "num_input_tokens_seen": 117746160, "step": 5474, "time_per_iteration": 2.6251273155212402 }, { "auxiliary_loss_clip": 0.01075145, "auxiliary_loss_mlp": 0.01002555, "balance_loss_clip": 0.90500712, "balance_loss_mlp": 1.00099373, "epoch": 0.658329826249023, "flos": 62695902447360.0, "grad_norm": 0.7362821671904001, "language_loss": 0.56359351, "learning_rate": 1.1044647982990771e-06, "loss": 0.58437049, "num_input_tokens_seen": 117808045, "step": 5475, "time_per_iteration": 3.243414878845215 }, { "auxiliary_loss_clip": 0.01169626, "auxiliary_loss_mlp": 0.01025512, "balance_loss_clip": 0.97356945, "balance_loss_mlp": 1.01803184, "epoch": 0.6584500691396621, "flos": 31722624501120.0, "grad_norm": 2.5936556568351357, "language_loss": 0.64465404, "learning_rate": 1.1037683487646536e-06, "loss": 0.66660547, "num_input_tokens_seen": 117828330, "step": 5476, "time_per_iteration": 3.6925859451293945 }, { "auxiliary_loss_clip": 0.01165571, "auxiliary_loss_mlp": 0.01122241, "balance_loss_clip": 0.97438526, "balance_loss_mlp": 0.0, "epoch": 0.6585703120303013, "flos": 18406086635520.0, "grad_norm": 2.8744864807431387, "language_loss": 0.7710492, "learning_rate": 1.1030720351878583e-06, "loss": 0.79392731, "num_input_tokens_seen": 117846450, "step": 5477, "time_per_iteration": 2.704146146774292 }, { "auxiliary_loss_clip": 0.01074514, "auxiliary_loss_mlp": 0.01001261, "balance_loss_clip": 0.93917781, "balance_loss_mlp": 0.99960417, "epoch": 0.6586905549209403, "flos": 58309880434560.0, "grad_norm": 0.8078255454666585, "language_loss": 0.57616782, "learning_rate": 1.102375857674323e-06, "loss": 0.59692562, "num_input_tokens_seen": 117908365, "step": 5478, "time_per_iteration": 3.211615562438965 }, { "auxiliary_loss_clip": 0.0116715, "auxiliary_loss_mlp": 0.01025203, "balance_loss_clip": 0.9714613, "balance_loss_mlp": 1.01851869, "epoch": 0.6588107978115794, "flos": 22782627457920.0, "grad_norm": 2.7557185149989136, "language_loss": 0.90181273, "learning_rate": 1.1016798163296561e-06, "loss": 0.92373627, "num_input_tokens_seen": 117927565, "step": 5479, "time_per_iteration": 2.662710428237915 }, { "auxiliary_loss_clip": 0.01171532, "auxiliary_loss_mlp": 0.01027539, "balance_loss_clip": 1.01091468, "balance_loss_mlp": 1.02014184, "epoch": 0.6589310407022185, "flos": 20667525050880.0, "grad_norm": 1.7509368163201349, "language_loss": 0.65980059, "learning_rate": 1.1009839112594471e-06, "loss": 0.68179131, "num_input_tokens_seen": 117945590, "step": 5480, "time_per_iteration": 2.624279737472534 }, { "auxiliary_loss_clip": 0.01169696, "auxiliary_loss_mlp": 0.01024497, "balance_loss_clip": 1.01003373, "balance_loss_mlp": 1.01731181, "epoch": 0.6590512835928576, "flos": 25630595055360.0, "grad_norm": 3.649729069956956, "language_loss": 0.71595085, "learning_rate": 1.1002881425692638e-06, "loss": 0.73789281, "num_input_tokens_seen": 117966020, "step": 5481, "time_per_iteration": 3.5824289321899414 }, { "auxiliary_loss_clip": 0.01162844, "auxiliary_loss_mlp": 0.01025772, "balance_loss_clip": 1.00919211, "balance_loss_mlp": 1.01860738, "epoch": 0.6591715264834966, "flos": 23726108044800.0, "grad_norm": 1.6336070102893552, "language_loss": 0.75242692, "learning_rate": 1.0995925103646532e-06, "loss": 0.77431309, "num_input_tokens_seen": 117984620, "step": 5482, "time_per_iteration": 3.574777841567993 }, { "auxiliary_loss_clip": 0.0116057, "auxiliary_loss_mlp": 0.01021147, "balance_loss_clip": 0.93547773, "balance_loss_mlp": 1.01398563, "epoch": 0.6592917693741358, "flos": 35773850822400.0, "grad_norm": 1.4514286375761905, "language_loss": 0.6666612, "learning_rate": 1.0988970147511437e-06, "loss": 0.68847841, "num_input_tokens_seen": 118006500, "step": 5483, "time_per_iteration": 2.8610143661499023 }, { "auxiliary_loss_clip": 0.01169497, "auxiliary_loss_mlp": 0.01029216, "balance_loss_clip": 0.97536761, "balance_loss_mlp": 1.02220345, "epoch": 0.6594120122647749, "flos": 21396834794880.0, "grad_norm": 1.7228843723068927, "language_loss": 0.80281329, "learning_rate": 1.0982016558342405e-06, "loss": 0.82480049, "num_input_tokens_seen": 118025470, "step": 5484, "time_per_iteration": 2.6668789386749268 }, { "auxiliary_loss_clip": 0.01173775, "auxiliary_loss_mlp": 0.01030111, "balance_loss_clip": 1.05125737, "balance_loss_mlp": 1.02285409, "epoch": 0.6595322551554139, "flos": 19351829779200.0, "grad_norm": 1.8065051021690302, "language_loss": 0.71387911, "learning_rate": 1.0975064337194291e-06, "loss": 0.73591805, "num_input_tokens_seen": 118043515, "step": 5485, "time_per_iteration": 2.5935442447662354 }, { "auxiliary_loss_clip": 0.01162835, "auxiliary_loss_mlp": 0.0102865, "balance_loss_clip": 0.93459308, "balance_loss_mlp": 1.0214169, "epoch": 0.6596524980460531, "flos": 16837113588480.0, "grad_norm": 1.3954347619554253, "language_loss": 0.69886178, "learning_rate": 1.0968113485121743e-06, "loss": 0.72077668, "num_input_tokens_seen": 118063105, "step": 5486, "time_per_iteration": 2.696943998336792 }, { "auxiliary_loss_clip": 0.01168249, "auxiliary_loss_mlp": 0.01122912, "balance_loss_clip": 1.00862408, "balance_loss_mlp": 0.0, "epoch": 0.6597727409366921, "flos": 21798567480960.0, "grad_norm": 2.3846343320167698, "language_loss": 0.80036151, "learning_rate": 1.0961164003179185e-06, "loss": 0.82327318, "num_input_tokens_seen": 118081615, "step": 5487, "time_per_iteration": 2.657942533493042 }, { "auxiliary_loss_clip": 0.0116315, "auxiliary_loss_mlp": 0.01026861, "balance_loss_clip": 0.93232775, "balance_loss_mlp": 1.02005696, "epoch": 0.6598929838273312, "flos": 23730704985600.0, "grad_norm": 1.8945220998461982, "language_loss": 0.84039664, "learning_rate": 1.0954215892420884e-06, "loss": 0.86229682, "num_input_tokens_seen": 118102315, "step": 5488, "time_per_iteration": 2.670311689376831 }, { "auxiliary_loss_clip": 0.01172189, "auxiliary_loss_mlp": 0.01030328, "balance_loss_clip": 0.93523967, "balance_loss_mlp": 1.02316391, "epoch": 0.6600132267179702, "flos": 19974520978560.0, "grad_norm": 1.788584841129075, "language_loss": 0.70364487, "learning_rate": 1.094726915390082e-06, "loss": 0.72567004, "num_input_tokens_seen": 118120650, "step": 5489, "time_per_iteration": 2.6806628704071045 }, { "auxiliary_loss_clip": 0.01168774, "auxiliary_loss_mlp": 0.01022979, "balance_loss_clip": 1.01098228, "balance_loss_mlp": 1.01539147, "epoch": 0.6601334696086094, "flos": 22342649765760.0, "grad_norm": 1.652619581519308, "language_loss": 0.69708741, "learning_rate": 1.0940323788672836e-06, "loss": 0.71900499, "num_input_tokens_seen": 118139825, "step": 5490, "time_per_iteration": 2.6353063583374023 }, { "auxiliary_loss_clip": 0.01162606, "auxiliary_loss_mlp": 0.01024057, "balance_loss_clip": 1.00967503, "balance_loss_mlp": 1.01728833, "epoch": 0.6602537124992485, "flos": 25703098657920.0, "grad_norm": 1.5380634281003978, "language_loss": 0.73595053, "learning_rate": 1.0933379797790522e-06, "loss": 0.75781721, "num_input_tokens_seen": 118159240, "step": 5491, "time_per_iteration": 2.7478110790252686 }, { "auxiliary_loss_clip": 0.01172158, "auxiliary_loss_mlp": 0.01031823, "balance_loss_clip": 1.05131137, "balance_loss_mlp": 1.02497697, "epoch": 0.6603739553898875, "flos": 25848572739840.0, "grad_norm": 2.8699345181803744, "language_loss": 0.71535963, "learning_rate": 1.0926437182307293e-06, "loss": 0.73739946, "num_input_tokens_seen": 118178050, "step": 5492, "time_per_iteration": 2.682427167892456 }, { "auxiliary_loss_clip": 0.01171682, "auxiliary_loss_mlp": 0.0102592, "balance_loss_clip": 0.9711622, "balance_loss_mlp": 1.01850224, "epoch": 0.6604941982805267, "flos": 24570296461440.0, "grad_norm": 6.790427825880287, "language_loss": 0.77899647, "learning_rate": 1.0919495943276338e-06, "loss": 0.80097246, "num_input_tokens_seen": 118199070, "step": 5493, "time_per_iteration": 2.6949641704559326 }, { "auxiliary_loss_clip": 0.01165519, "auxiliary_loss_mlp": 0.01025122, "balance_loss_clip": 0.92976969, "balance_loss_mlp": 1.01729941, "epoch": 0.6606144411711657, "flos": 13261775581440.0, "grad_norm": 2.1212495397420983, "language_loss": 0.76159376, "learning_rate": 1.0912556081750611e-06, "loss": 0.78350019, "num_input_tokens_seen": 118217000, "step": 5494, "time_per_iteration": 2.70078706741333 }, { "auxiliary_loss_clip": 0.01164807, "auxiliary_loss_mlp": 0.01023469, "balance_loss_clip": 0.97342253, "balance_loss_mlp": 1.0164088, "epoch": 0.6607346840618048, "flos": 25155281358720.0, "grad_norm": 1.876970635067376, "language_loss": 0.76245922, "learning_rate": 1.0905617598782909e-06, "loss": 0.78434199, "num_input_tokens_seen": 118237205, "step": 5495, "time_per_iteration": 2.7756707668304443 }, { "auxiliary_loss_clip": 0.01156087, "auxiliary_loss_mlp": 0.01027311, "balance_loss_clip": 0.89288664, "balance_loss_mlp": 1.02050376, "epoch": 0.660854926952444, "flos": 17638029095040.0, "grad_norm": 1.8961908312313003, "language_loss": 0.81254238, "learning_rate": 1.0898680495425775e-06, "loss": 0.83437634, "num_input_tokens_seen": 118255495, "step": 5496, "time_per_iteration": 2.745086908340454 }, { "auxiliary_loss_clip": 0.01172378, "auxiliary_loss_mlp": 0.01025878, "balance_loss_clip": 0.97410762, "balance_loss_mlp": 1.01896405, "epoch": 0.660975169843083, "flos": 16836000266880.0, "grad_norm": 1.6151522981272095, "language_loss": 0.80511117, "learning_rate": 1.0891744772731594e-06, "loss": 0.82709372, "num_input_tokens_seen": 118273310, "step": 5497, "time_per_iteration": 2.6245737075805664 }, { "auxiliary_loss_clip": 0.01170845, "auxiliary_loss_mlp": 0.01023026, "balance_loss_clip": 1.00975347, "balance_loss_mlp": 1.0160464, "epoch": 0.6610954127337221, "flos": 26870410846080.0, "grad_norm": 1.6863744551354816, "language_loss": 0.66079241, "learning_rate": 1.088481043175248e-06, "loss": 0.68273109, "num_input_tokens_seen": 118293880, "step": 5498, "time_per_iteration": 2.691035032272339 }, { "auxiliary_loss_clip": 0.0115584, "auxiliary_loss_mlp": 0.01026482, "balance_loss_clip": 0.96864069, "balance_loss_mlp": 1.01918066, "epoch": 0.6612156556243612, "flos": 26465697331200.0, "grad_norm": 1.8372424082322736, "language_loss": 0.75591338, "learning_rate": 1.0877877473540368e-06, "loss": 0.77773666, "num_input_tokens_seen": 118314465, "step": 5499, "time_per_iteration": 2.7053589820861816 }, { "auxiliary_loss_clip": 0.01174504, "auxiliary_loss_mlp": 0.01022447, "balance_loss_clip": 1.0510751, "balance_loss_mlp": 1.01526523, "epoch": 0.6613358985150003, "flos": 19791915212160.0, "grad_norm": 2.4509549026026627, "language_loss": 0.72424304, "learning_rate": 1.0870945899147002e-06, "loss": 0.74621254, "num_input_tokens_seen": 118331110, "step": 5500, "time_per_iteration": 2.6537134647369385 }, { "auxiliary_loss_clip": 0.01169708, "auxiliary_loss_mlp": 0.01021993, "balance_loss_clip": 1.01265836, "balance_loss_mlp": 1.01498318, "epoch": 0.6614561414056394, "flos": 26831627136000.0, "grad_norm": 2.281487765749376, "language_loss": 0.75923306, "learning_rate": 1.0864015709623879e-06, "loss": 0.78115004, "num_input_tokens_seen": 118351980, "step": 5501, "time_per_iteration": 2.6664586067199707 }, { "auxiliary_loss_clip": 0.0117568, "auxiliary_loss_mlp": 0.01025712, "balance_loss_clip": 1.01196027, "balance_loss_mlp": 1.01796985, "epoch": 0.6615763842962785, "flos": 22894597128960.0, "grad_norm": 2.1879889980777776, "language_loss": 0.79991257, "learning_rate": 1.0857086906022313e-06, "loss": 0.82192647, "num_input_tokens_seen": 118370315, "step": 5502, "time_per_iteration": 3.649228096008301 }, { "auxiliary_loss_clip": 0.01156922, "auxiliary_loss_mlp": 0.01029829, "balance_loss_clip": 0.85974109, "balance_loss_mlp": 1.02200294, "epoch": 0.6616966271869176, "flos": 24790321221120.0, "grad_norm": 1.9749594537173842, "language_loss": 0.72897112, "learning_rate": 1.0850159489393388e-06, "loss": 0.75083864, "num_input_tokens_seen": 118389575, "step": 5503, "time_per_iteration": 2.8653557300567627 }, { "auxiliary_loss_clip": 0.01155756, "auxiliary_loss_mlp": 0.01022392, "balance_loss_clip": 0.92814445, "balance_loss_mlp": 1.01513243, "epoch": 0.6618168700775566, "flos": 17202109639680.0, "grad_norm": 2.519820566319009, "language_loss": 0.82015687, "learning_rate": 1.0843233460787992e-06, "loss": 0.84193832, "num_input_tokens_seen": 118406790, "step": 5504, "time_per_iteration": 2.6548051834106445 }, { "auxiliary_loss_clip": 0.01159161, "auxiliary_loss_mlp": 0.01029312, "balance_loss_clip": 0.93599147, "balance_loss_mlp": 1.02206373, "epoch": 0.6619371129681958, "flos": 25447091448960.0, "grad_norm": 2.119951475799084, "language_loss": 0.77922559, "learning_rate": 1.0836308821256805e-06, "loss": 0.80111033, "num_input_tokens_seen": 118427590, "step": 5505, "time_per_iteration": 2.756286382675171 }, { "auxiliary_loss_clip": 0.01165946, "auxiliary_loss_mlp": 0.01024813, "balance_loss_clip": 1.01112247, "balance_loss_mlp": 1.01825619, "epoch": 0.6620573558588349, "flos": 18040444139520.0, "grad_norm": 1.9844046974557492, "language_loss": 0.77670014, "learning_rate": 1.0829385571850282e-06, "loss": 0.79860771, "num_input_tokens_seen": 118444570, "step": 5506, "time_per_iteration": 3.5151138305664062 }, { "auxiliary_loss_clip": 0.01176075, "auxiliary_loss_mlp": 0.0102448, "balance_loss_clip": 1.04891801, "balance_loss_mlp": 1.01669586, "epoch": 0.6621775987494739, "flos": 17785586165760.0, "grad_norm": 2.77729544680056, "language_loss": 0.8390305, "learning_rate": 1.0822463713618679e-06, "loss": 0.861036, "num_input_tokens_seen": 118461425, "step": 5507, "time_per_iteration": 2.6576344966888428 }, { "auxiliary_loss_clip": 0.01168788, "auxiliary_loss_mlp": 0.01027178, "balance_loss_clip": 0.93381178, "balance_loss_mlp": 1.01940262, "epoch": 0.6622978416401131, "flos": 17492590926720.0, "grad_norm": 2.0168847703312665, "language_loss": 0.84796488, "learning_rate": 1.0815543247612034e-06, "loss": 0.86992455, "num_input_tokens_seen": 118478495, "step": 5508, "time_per_iteration": 3.6765296459198 }, { "auxiliary_loss_clip": 0.01164687, "auxiliary_loss_mlp": 0.01024188, "balance_loss_clip": 0.96819615, "balance_loss_mlp": 1.01674342, "epoch": 0.6624180845307521, "flos": 21648352803840.0, "grad_norm": 1.787050707299595, "language_loss": 0.82980382, "learning_rate": 1.0808624174880168e-06, "loss": 0.85169256, "num_input_tokens_seen": 118499145, "step": 5509, "time_per_iteration": 2.6686313152313232 }, { "auxiliary_loss_clip": 0.01168504, "auxiliary_loss_mlp": 0.0102812, "balance_loss_clip": 1.04985094, "balance_loss_mlp": 1.02130723, "epoch": 0.6625383274213912, "flos": 23805902108160.0, "grad_norm": 1.5902506531651441, "language_loss": 0.79900503, "learning_rate": 1.080170649647272e-06, "loss": 0.82097131, "num_input_tokens_seen": 118518950, "step": 5510, "time_per_iteration": 2.6949710845947266 }, { "auxiliary_loss_clip": 0.01166975, "auxiliary_loss_mlp": 0.0102622, "balance_loss_clip": 1.04737353, "balance_loss_mlp": 1.01880503, "epoch": 0.6626585703120303, "flos": 33262941473280.0, "grad_norm": 1.538351183093456, "language_loss": 0.67199177, "learning_rate": 1.0794790213439068e-06, "loss": 0.69392371, "num_input_tokens_seen": 118545850, "step": 5511, "time_per_iteration": 2.726693630218506 }, { "auxiliary_loss_clip": 0.011668, "auxiliary_loss_mlp": 0.01028596, "balance_loss_clip": 0.8946166, "balance_loss_mlp": 1.02044189, "epoch": 0.6627788132026694, "flos": 22085780630400.0, "grad_norm": 2.102580590482813, "language_loss": 0.78652394, "learning_rate": 1.078787532682843e-06, "loss": 0.80847794, "num_input_tokens_seen": 118563325, "step": 5512, "time_per_iteration": 2.733553886413574 }, { "auxiliary_loss_clip": 0.01169029, "auxiliary_loss_mlp": 0.010271, "balance_loss_clip": 1.01138139, "balance_loss_mlp": 1.01979566, "epoch": 0.6628990560933085, "flos": 36173608260480.0, "grad_norm": 2.2146198927769305, "language_loss": 0.76211923, "learning_rate": 1.0780961837689773e-06, "loss": 0.78408051, "num_input_tokens_seen": 118582835, "step": 5513, "time_per_iteration": 2.7775566577911377 }, { "auxiliary_loss_clip": 0.01157569, "auxiliary_loss_mlp": 0.01026065, "balance_loss_clip": 0.97159815, "balance_loss_mlp": 1.01919246, "epoch": 0.6630192989839476, "flos": 18513567106560.0, "grad_norm": 1.57467045000349, "language_loss": 0.69895995, "learning_rate": 1.0774049747071883e-06, "loss": 0.72079629, "num_input_tokens_seen": 118600715, "step": 5514, "time_per_iteration": 2.681152105331421 }, { "auxiliary_loss_clip": 0.01163374, "auxiliary_loss_mlp": 0.01023801, "balance_loss_clip": 0.89607096, "balance_loss_mlp": 1.01688647, "epoch": 0.6631395418745867, "flos": 35809510049280.0, "grad_norm": 1.5271188941180547, "language_loss": 0.68103814, "learning_rate": 1.076713905602332e-06, "loss": 0.70290995, "num_input_tokens_seen": 118621290, "step": 5515, "time_per_iteration": 2.894343376159668 }, { "auxiliary_loss_clip": 0.0117413, "auxiliary_loss_mlp": 0.01025509, "balance_loss_clip": 1.01319456, "balance_loss_mlp": 1.01798069, "epoch": 0.6632597847652257, "flos": 20047742853120.0, "grad_norm": 1.6351845804452196, "language_loss": 0.80798703, "learning_rate": 1.07602297655924e-06, "loss": 0.82998347, "num_input_tokens_seen": 118639610, "step": 5516, "time_per_iteration": 2.6187775135040283 }, { "auxiliary_loss_clip": 0.01172807, "auxiliary_loss_mlp": 0.01025805, "balance_loss_clip": 1.05208826, "balance_loss_mlp": 1.0186466, "epoch": 0.6633800276558649, "flos": 21214480423680.0, "grad_norm": 1.7912085983892048, "language_loss": 0.8109045, "learning_rate": 1.0753321876827292e-06, "loss": 0.83289057, "num_input_tokens_seen": 118658895, "step": 5517, "time_per_iteration": 2.603976249694824 }, { "auxiliary_loss_clip": 0.01171306, "auxiliary_loss_mlp": 0.010257, "balance_loss_clip": 1.04850018, "balance_loss_mlp": 1.01827955, "epoch": 0.663500270546504, "flos": 23987753688960.0, "grad_norm": 2.0153294951808536, "language_loss": 0.73941553, "learning_rate": 1.0746415390775893e-06, "loss": 0.76138562, "num_input_tokens_seen": 118677025, "step": 5518, "time_per_iteration": 2.6443839073181152 }, { "auxiliary_loss_clip": 0.01170406, "auxiliary_loss_mlp": 0.01026091, "balance_loss_clip": 1.05126476, "balance_loss_mlp": 1.01956105, "epoch": 0.663620513437143, "flos": 17932389050880.0, "grad_norm": 1.8034793355372294, "language_loss": 0.76481223, "learning_rate": 1.0739510308485939e-06, "loss": 0.78677726, "num_input_tokens_seen": 118694240, "step": 5519, "time_per_iteration": 2.598902702331543 }, { "auxiliary_loss_clip": 0.01081175, "auxiliary_loss_mlp": 0.0100062, "balance_loss_clip": 0.90315348, "balance_loss_mlp": 0.99899834, "epoch": 0.6637407563277821, "flos": 57840241086720.0, "grad_norm": 0.8947496431932915, "language_loss": 0.62603676, "learning_rate": 1.07326066310049e-06, "loss": 0.64685476, "num_input_tokens_seen": 118758365, "step": 5520, "time_per_iteration": 3.2685961723327637 }, { "auxiliary_loss_clip": 0.01161874, "auxiliary_loss_mlp": 0.01033971, "balance_loss_clip": 0.9333235, "balance_loss_mlp": 1.02598405, "epoch": 0.6638609992184212, "flos": 27306007079040.0, "grad_norm": 1.6826346931574603, "language_loss": 0.79502082, "learning_rate": 1.0725704359380059e-06, "loss": 0.81697929, "num_input_tokens_seen": 118778220, "step": 5521, "time_per_iteration": 2.79559588432312 }, { "auxiliary_loss_clip": 0.01171363, "auxiliary_loss_mlp": 0.01027107, "balance_loss_clip": 1.04960322, "balance_loss_mlp": 1.02001989, "epoch": 0.6639812421090603, "flos": 18624854419200.0, "grad_norm": 2.209470752537504, "language_loss": 0.7216692, "learning_rate": 1.0718803494658497e-06, "loss": 0.74365389, "num_input_tokens_seen": 118797110, "step": 5522, "time_per_iteration": 2.526806592941284 }, { "auxiliary_loss_clip": 0.0115664, "auxiliary_loss_mlp": 0.01032436, "balance_loss_clip": 0.78088462, "balance_loss_mlp": 1.02520013, "epoch": 0.6641014849996993, "flos": 15924479806080.0, "grad_norm": 2.1659066186572984, "language_loss": 0.83791208, "learning_rate": 1.071190403788707e-06, "loss": 0.85980284, "num_input_tokens_seen": 118812415, "step": 5523, "time_per_iteration": 2.951721668243408 }, { "auxiliary_loss_clip": 0.01176189, "auxiliary_loss_mlp": 0.01031749, "balance_loss_clip": 0.93731403, "balance_loss_mlp": 1.02377963, "epoch": 0.6642217278903385, "flos": 26505486622080.0, "grad_norm": 1.843415359871783, "language_loss": 0.75694132, "learning_rate": 1.0705005990112415e-06, "loss": 0.77902073, "num_input_tokens_seen": 118832195, "step": 5524, "time_per_iteration": 3.1554882526397705 }, { "auxiliary_loss_clip": 0.01153393, "auxiliary_loss_mlp": 0.01028808, "balance_loss_clip": 0.89622962, "balance_loss_mlp": 1.02154565, "epoch": 0.6643419707809776, "flos": 15377308951680.0, "grad_norm": 3.813020521304631, "language_loss": 0.7511487, "learning_rate": 1.0698109352380957e-06, "loss": 0.7729708, "num_input_tokens_seen": 118849795, "step": 5525, "time_per_iteration": 2.7216289043426514 }, { "auxiliary_loss_clip": 0.01169446, "auxiliary_loss_mlp": 0.01024609, "balance_loss_clip": 1.04867363, "balance_loss_mlp": 1.01673281, "epoch": 0.6644622136716166, "flos": 25117610970240.0, "grad_norm": 1.6913094004047218, "language_loss": 0.78127861, "learning_rate": 1.0691214125738909e-06, "loss": 0.8032192, "num_input_tokens_seen": 118870000, "step": 5526, "time_per_iteration": 2.6581881046295166 }, { "auxiliary_loss_clip": 0.01067389, "auxiliary_loss_mlp": 0.01003135, "balance_loss_clip": 1.01316774, "balance_loss_mlp": 1.00154936, "epoch": 0.6645824565622558, "flos": 66201717680640.0, "grad_norm": 0.7833568137438208, "language_loss": 0.57536948, "learning_rate": 1.0684320311232287e-06, "loss": 0.5960747, "num_input_tokens_seen": 118932905, "step": 5527, "time_per_iteration": 3.2367868423461914 }, { "auxiliary_loss_clip": 0.01163909, "auxiliary_loss_mlp": 0.01030768, "balance_loss_clip": 0.97219074, "balance_loss_mlp": 1.02353144, "epoch": 0.6647026994528948, "flos": 25082131311360.0, "grad_norm": 1.8515062569373477, "language_loss": 0.80936849, "learning_rate": 1.0677427909906865e-06, "loss": 0.83131528, "num_input_tokens_seen": 118953355, "step": 5528, "time_per_iteration": 4.054861783981323 }, { "auxiliary_loss_clip": 0.01174335, "auxiliary_loss_mlp": 0.01030326, "balance_loss_clip": 1.04999411, "balance_loss_mlp": 1.02281857, "epoch": 0.6648229423435339, "flos": 18222187979520.0, "grad_norm": 1.826015212458188, "language_loss": 0.72225964, "learning_rate": 1.0670536922808216e-06, "loss": 0.74430621, "num_input_tokens_seen": 118973480, "step": 5529, "time_per_iteration": 2.662010908126831 }, { "auxiliary_loss_clip": 0.01169787, "auxiliary_loss_mlp": 0.01024694, "balance_loss_clip": 0.97363126, "balance_loss_mlp": 1.0174551, "epoch": 0.6649431852341731, "flos": 18296882311680.0, "grad_norm": 2.673591327647134, "language_loss": 0.71990186, "learning_rate": 1.06636473509817e-06, "loss": 0.74184668, "num_input_tokens_seen": 118989860, "step": 5530, "time_per_iteration": 2.721069812774658 }, { "auxiliary_loss_clip": 0.01163253, "auxiliary_loss_mlp": 0.0112235, "balance_loss_clip": 0.97051406, "balance_loss_mlp": 0.0, "epoch": 0.6650634281248121, "flos": 17019575700480.0, "grad_norm": 2.177736258580397, "language_loss": 0.80843103, "learning_rate": 1.0656759195472447e-06, "loss": 0.83128703, "num_input_tokens_seen": 119007150, "step": 5531, "time_per_iteration": 2.6777658462524414 }, { "auxiliary_loss_clip": 0.01069175, "auxiliary_loss_mlp": 0.0100286, "balance_loss_clip": 0.93823355, "balance_loss_mlp": 1.0012387, "epoch": 0.6651836710154512, "flos": 69294810666240.0, "grad_norm": 0.7801330953608385, "language_loss": 0.59804618, "learning_rate": 1.0649872457325414e-06, "loss": 0.61876655, "num_input_tokens_seen": 119068435, "step": 5532, "time_per_iteration": 3.1752676963806152 }, { "auxiliary_loss_clip": 0.01070544, "auxiliary_loss_mlp": 0.01001481, "balance_loss_clip": 0.97540426, "balance_loss_mlp": 0.99991912, "epoch": 0.6653039139060903, "flos": 66883444882560.0, "grad_norm": 0.8484827413959484, "language_loss": 0.55121368, "learning_rate": 1.0642987137585278e-06, "loss": 0.57193398, "num_input_tokens_seen": 119127960, "step": 5533, "time_per_iteration": 4.908107042312622 }, { "auxiliary_loss_clip": 0.01164324, "auxiliary_loss_mlp": 0.01030865, "balance_loss_clip": 0.97157204, "balance_loss_mlp": 1.02400434, "epoch": 0.6654241567967294, "flos": 21470056669440.0, "grad_norm": 1.8054039856748192, "language_loss": 0.82345927, "learning_rate": 1.0636103237296561e-06, "loss": 0.84541118, "num_input_tokens_seen": 119146885, "step": 5534, "time_per_iteration": 3.592411756515503 }, { "auxiliary_loss_clip": 0.0116676, "auxiliary_loss_mlp": 0.01030535, "balance_loss_clip": 1.01297593, "balance_loss_mlp": 1.02365649, "epoch": 0.6655443996873684, "flos": 25119514391040.0, "grad_norm": 1.6778860350670148, "language_loss": 0.83788109, "learning_rate": 1.062922075750353e-06, "loss": 0.85985404, "num_input_tokens_seen": 119166900, "step": 5535, "time_per_iteration": 2.703653573989868 }, { "auxiliary_loss_clip": 0.0116896, "auxiliary_loss_mlp": 0.01024073, "balance_loss_clip": 0.93337202, "balance_loss_mlp": 1.01694691, "epoch": 0.6656646425780076, "flos": 17457326749440.0, "grad_norm": 1.8916490928800682, "language_loss": 0.717116, "learning_rate": 1.0622339699250267e-06, "loss": 0.73904634, "num_input_tokens_seen": 119184820, "step": 5536, "time_per_iteration": 2.718641757965088 }, { "auxiliary_loss_clip": 0.01169071, "auxiliary_loss_mlp": 0.01027409, "balance_loss_clip": 0.93450075, "balance_loss_mlp": 1.01967812, "epoch": 0.6657848854686467, "flos": 23434190213760.0, "grad_norm": 1.7572829449581397, "language_loss": 0.7924931, "learning_rate": 1.0615460063580624e-06, "loss": 0.81445789, "num_input_tokens_seen": 119203295, "step": 5537, "time_per_iteration": 2.7631404399871826 }, { "auxiliary_loss_clip": 0.01171906, "auxiliary_loss_mlp": 0.01023534, "balance_loss_clip": 0.97201729, "balance_loss_mlp": 1.01617265, "epoch": 0.6659051283592857, "flos": 11509909459200.0, "grad_norm": 2.2499056882879587, "language_loss": 0.72833699, "learning_rate": 1.060858185153821e-06, "loss": 0.75029135, "num_input_tokens_seen": 119221395, "step": 5538, "time_per_iteration": 2.6424973011016846 }, { "auxiliary_loss_clip": 0.01175147, "auxiliary_loss_mlp": 0.01032238, "balance_loss_clip": 0.97403479, "balance_loss_mlp": 1.02446222, "epoch": 0.6660253712499249, "flos": 20594554571520.0, "grad_norm": 3.8920034859792723, "language_loss": 0.76001859, "learning_rate": 1.0601705064166474e-06, "loss": 0.78209245, "num_input_tokens_seen": 119239790, "step": 5539, "time_per_iteration": 2.64426589012146 }, { "auxiliary_loss_clip": 0.01166553, "auxiliary_loss_mlp": 0.01026566, "balance_loss_clip": 0.97435558, "balance_loss_mlp": 1.01909173, "epoch": 0.666145614140564, "flos": 21251504367360.0, "grad_norm": 2.1809572545263927, "language_loss": 0.73477513, "learning_rate": 1.0594829702508596e-06, "loss": 0.75670636, "num_input_tokens_seen": 119257505, "step": 5540, "time_per_iteration": 2.6729629039764404 }, { "auxiliary_loss_clip": 0.01167396, "auxiliary_loss_mlp": 0.01025627, "balance_loss_clip": 0.93330956, "balance_loss_mlp": 1.01818788, "epoch": 0.666265857031203, "flos": 33726188200320.0, "grad_norm": 1.6753232690860456, "language_loss": 0.54790139, "learning_rate": 1.0587955767607592e-06, "loss": 0.56983161, "num_input_tokens_seen": 119279365, "step": 5541, "time_per_iteration": 2.8143582344055176 }, { "auxiliary_loss_clip": 0.0116843, "auxiliary_loss_mlp": 0.01025894, "balance_loss_clip": 1.04810798, "balance_loss_mlp": 1.01840794, "epoch": 0.6663860999218422, "flos": 17456644391040.0, "grad_norm": 3.768236909649301, "language_loss": 0.77294779, "learning_rate": 1.0581083260506206e-06, "loss": 0.794891, "num_input_tokens_seen": 119296150, "step": 5542, "time_per_iteration": 2.550318956375122 }, { "auxiliary_loss_clip": 0.01164937, "auxiliary_loss_mlp": 0.01020315, "balance_loss_clip": 0.97061384, "balance_loss_mlp": 1.0134958, "epoch": 0.6665063428124812, "flos": 17676740977920.0, "grad_norm": 2.1462453326011652, "language_loss": 0.76427829, "learning_rate": 1.0574212182246993e-06, "loss": 0.78613085, "num_input_tokens_seen": 119314845, "step": 5543, "time_per_iteration": 2.7217888832092285 }, { "auxiliary_loss_clip": 0.01176672, "auxiliary_loss_mlp": 0.01025952, "balance_loss_clip": 0.97356784, "balance_loss_mlp": 1.01850128, "epoch": 0.6666265857031203, "flos": 27673265687040.0, "grad_norm": 2.308251893507947, "language_loss": 0.75714135, "learning_rate": 1.0567342533872303e-06, "loss": 0.77916765, "num_input_tokens_seen": 119334875, "step": 5544, "time_per_iteration": 2.6727700233459473 }, { "auxiliary_loss_clip": 0.01170128, "auxiliary_loss_mlp": 0.01022252, "balance_loss_clip": 0.97325128, "balance_loss_mlp": 1.0150938, "epoch": 0.6667468285937594, "flos": 25046831220480.0, "grad_norm": 1.7583166105514565, "language_loss": 0.811526, "learning_rate": 1.0560474316424255e-06, "loss": 0.83344978, "num_input_tokens_seen": 119354635, "step": 5545, "time_per_iteration": 2.764101028442383 }, { "auxiliary_loss_clip": 0.01167541, "auxiliary_loss_mlp": 0.01023788, "balance_loss_clip": 0.9708994, "balance_loss_mlp": 1.01603913, "epoch": 0.6668670714843985, "flos": 22780472641920.0, "grad_norm": 2.427738166402404, "language_loss": 0.73657238, "learning_rate": 1.0553607530944746e-06, "loss": 0.75848567, "num_input_tokens_seen": 119372690, "step": 5546, "time_per_iteration": 2.689955949783325 }, { "auxiliary_loss_clip": 0.01165713, "auxiliary_loss_mlp": 0.01023589, "balance_loss_clip": 0.9312259, "balance_loss_mlp": 1.0161978, "epoch": 0.6669873143750376, "flos": 22163886754560.0, "grad_norm": 2.404747968499722, "language_loss": 0.89493465, "learning_rate": 1.0546742178475463e-06, "loss": 0.91682762, "num_input_tokens_seen": 119391685, "step": 5547, "time_per_iteration": 2.7242870330810547 }, { "auxiliary_loss_clip": 0.011728, "auxiliary_loss_mlp": 0.01025774, "balance_loss_clip": 0.89746904, "balance_loss_mlp": 1.01912844, "epoch": 0.6671075572656767, "flos": 20514832335360.0, "grad_norm": 1.8910006870120095, "language_loss": 0.86812031, "learning_rate": 1.0539878260057868e-06, "loss": 0.89010602, "num_input_tokens_seen": 119410725, "step": 5548, "time_per_iteration": 2.7492945194244385 }, { "auxiliary_loss_clip": 0.0117619, "auxiliary_loss_mlp": 0.01034592, "balance_loss_clip": 1.0147078, "balance_loss_mlp": 1.02734447, "epoch": 0.6672278001563158, "flos": 17931203902080.0, "grad_norm": 2.550445253564321, "language_loss": 0.68595684, "learning_rate": 1.0533015776733226e-06, "loss": 0.70806468, "num_input_tokens_seen": 119426875, "step": 5549, "time_per_iteration": 2.6241390705108643 }, { "auxiliary_loss_clip": 0.0116869, "auxiliary_loss_mlp": 0.01028143, "balance_loss_clip": 0.97381818, "balance_loss_mlp": 1.02113056, "epoch": 0.6673480430469548, "flos": 22342146975360.0, "grad_norm": 2.0847795877837383, "language_loss": 0.78696644, "learning_rate": 1.0526154729542566e-06, "loss": 0.80893481, "num_input_tokens_seen": 119446935, "step": 5550, "time_per_iteration": 2.697467088699341 }, { "auxiliary_loss_clip": 0.01167251, "auxiliary_loss_mlp": 0.01026414, "balance_loss_clip": 0.93505895, "balance_loss_mlp": 1.01859355, "epoch": 0.6674682859375939, "flos": 20703830722560.0, "grad_norm": 2.417330642498015, "language_loss": 0.79814726, "learning_rate": 1.0519295119526699e-06, "loss": 0.82008398, "num_input_tokens_seen": 119463240, "step": 5551, "time_per_iteration": 2.6920058727264404 }, { "auxiliary_loss_clip": 0.01170013, "auxiliary_loss_mlp": 0.01027546, "balance_loss_clip": 0.97159219, "balance_loss_mlp": 1.02026868, "epoch": 0.667588528828233, "flos": 26206673379840.0, "grad_norm": 1.694938251766929, "language_loss": 0.83006507, "learning_rate": 1.0512436947726227e-06, "loss": 0.85204065, "num_input_tokens_seen": 119484655, "step": 5552, "time_per_iteration": 2.763140916824341 }, { "auxiliary_loss_clip": 0.01166712, "auxiliary_loss_mlp": 0.01029951, "balance_loss_clip": 0.93296945, "balance_loss_mlp": 1.02184522, "epoch": 0.6677087717188721, "flos": 23071025756160.0, "grad_norm": 3.137263666489519, "language_loss": 0.64882898, "learning_rate": 1.0505580215181517e-06, "loss": 0.67079568, "num_input_tokens_seen": 119502895, "step": 5553, "time_per_iteration": 3.655857801437378 }, { "auxiliary_loss_clip": 0.01067562, "auxiliary_loss_mlp": 0.01001029, "balance_loss_clip": 0.90437651, "balance_loss_mlp": 0.99939537, "epoch": 0.6678290146095112, "flos": 70941315219840.0, "grad_norm": 0.7869493416507688, "language_loss": 0.56658947, "learning_rate": 1.0498724922932753e-06, "loss": 0.58727539, "num_input_tokens_seen": 119561010, "step": 5554, "time_per_iteration": 3.2186062335968018 }, { "auxiliary_loss_clip": 0.01175423, "auxiliary_loss_mlp": 0.01023388, "balance_loss_clip": 1.05076575, "balance_loss_mlp": 1.01570463, "epoch": 0.6679492575001503, "flos": 18661088263680.0, "grad_norm": 2.4575705766417504, "language_loss": 0.86095178, "learning_rate": 1.0491871072019851e-06, "loss": 0.88293993, "num_input_tokens_seen": 119578900, "step": 5555, "time_per_iteration": 2.6474530696868896 }, { "auxiliary_loss_clip": 0.01168935, "auxiliary_loss_mlp": 0.01024995, "balance_loss_clip": 0.9307133, "balance_loss_mlp": 1.01765728, "epoch": 0.6680695003907894, "flos": 29711985822720.0, "grad_norm": 1.6390322184385862, "language_loss": 0.63797587, "learning_rate": 1.0485018663482555e-06, "loss": 0.65991515, "num_input_tokens_seen": 119598920, "step": 5556, "time_per_iteration": 2.8130764961242676 }, { "auxiliary_loss_clip": 0.01164957, "auxiliary_loss_mlp": 0.01027136, "balance_loss_clip": 1.01053107, "balance_loss_mlp": 1.01937246, "epoch": 0.6681897432814284, "flos": 28218964083840.0, "grad_norm": 2.225610527255174, "language_loss": 0.70940375, "learning_rate": 1.0478167698360354e-06, "loss": 0.73132473, "num_input_tokens_seen": 119618220, "step": 5557, "time_per_iteration": 2.7333643436431885 }, { "auxiliary_loss_clip": 0.01162422, "auxiliary_loss_mlp": 0.010246, "balance_loss_clip": 1.00940847, "balance_loss_mlp": 1.01710224, "epoch": 0.6683099861720676, "flos": 25046543911680.0, "grad_norm": 2.265719597527753, "language_loss": 0.69647741, "learning_rate": 1.0471318177692556e-06, "loss": 0.71834761, "num_input_tokens_seen": 119638520, "step": 5558, "time_per_iteration": 3.5638058185577393 }, { "auxiliary_loss_clip": 0.01167122, "auxiliary_loss_mlp": 0.01025907, "balance_loss_clip": 0.8952046, "balance_loss_mlp": 1.01888549, "epoch": 0.6684302290627067, "flos": 22996977868800.0, "grad_norm": 2.4753913004937385, "language_loss": 0.75593054, "learning_rate": 1.046447010251821e-06, "loss": 0.77786076, "num_input_tokens_seen": 119655850, "step": 5559, "time_per_iteration": 3.7062594890594482 }, { "auxiliary_loss_clip": 0.01169547, "auxiliary_loss_mlp": 0.01029035, "balance_loss_clip": 0.97579044, "balance_loss_mlp": 1.02165639, "epoch": 0.6685504719533457, "flos": 26573824247040.0, "grad_norm": 1.6391912423068404, "language_loss": 0.75644964, "learning_rate": 1.0457623473876157e-06, "loss": 0.77843541, "num_input_tokens_seen": 119675355, "step": 5560, "time_per_iteration": 3.626051187515259 }, { "auxiliary_loss_clip": 0.01168725, "auxiliary_loss_mlp": 0.01025623, "balance_loss_clip": 1.04863942, "balance_loss_mlp": 1.01881897, "epoch": 0.6686707148439849, "flos": 28986087870720.0, "grad_norm": 1.8675116864301728, "language_loss": 0.71148443, "learning_rate": 1.0450778292805046e-06, "loss": 0.73342788, "num_input_tokens_seen": 119695340, "step": 5561, "time_per_iteration": 2.658820629119873 }, { "auxiliary_loss_clip": 0.01171125, "auxiliary_loss_mlp": 0.01025788, "balance_loss_clip": 1.00890827, "balance_loss_mlp": 1.01836169, "epoch": 0.6687909577346239, "flos": 23623152687360.0, "grad_norm": 1.6842336441236878, "language_loss": 0.78785729, "learning_rate": 1.0443934560343267e-06, "loss": 0.80982643, "num_input_tokens_seen": 119716750, "step": 5562, "time_per_iteration": 2.6468942165374756 }, { "auxiliary_loss_clip": 0.01155583, "auxiliary_loss_mlp": 0.01024026, "balance_loss_clip": 0.93305671, "balance_loss_mlp": 1.01734436, "epoch": 0.668911200625263, "flos": 23148593176320.0, "grad_norm": 3.497903582799157, "language_loss": 0.78210628, "learning_rate": 1.0437092277529034e-06, "loss": 0.80390239, "num_input_tokens_seen": 119736005, "step": 5563, "time_per_iteration": 2.680879592895508 }, { "auxiliary_loss_clip": 0.01165127, "auxiliary_loss_mlp": 0.01025825, "balance_loss_clip": 0.97143143, "balance_loss_mlp": 1.01892543, "epoch": 0.6690314435159022, "flos": 18551919853440.0, "grad_norm": 1.963001589039381, "language_loss": 0.73459387, "learning_rate": 1.0430251445400292e-06, "loss": 0.75650334, "num_input_tokens_seen": 119754050, "step": 5564, "time_per_iteration": 2.5736477375030518 }, { "auxiliary_loss_clip": 0.01165359, "auxiliary_loss_mlp": 0.01024218, "balance_loss_clip": 0.78360432, "balance_loss_mlp": 1.01709199, "epoch": 0.6691516864065412, "flos": 31759540704000.0, "grad_norm": 1.916627635670406, "language_loss": 0.62626076, "learning_rate": 1.0423412064994787e-06, "loss": 0.64815652, "num_input_tokens_seen": 119774820, "step": 5565, "time_per_iteration": 3.0087730884552 }, { "auxiliary_loss_clip": 0.01165427, "auxiliary_loss_mlp": 0.01022638, "balance_loss_clip": 0.93136173, "balance_loss_mlp": 1.01558101, "epoch": 0.6692719292971803, "flos": 34933864296960.0, "grad_norm": 2.0415180663644747, "language_loss": 0.73997581, "learning_rate": 1.0416574137350064e-06, "loss": 0.76185644, "num_input_tokens_seen": 119795525, "step": 5566, "time_per_iteration": 2.963127613067627 }, { "auxiliary_loss_clip": 0.01162735, "auxiliary_loss_mlp": 0.01028754, "balance_loss_clip": 1.00976062, "balance_loss_mlp": 1.02136314, "epoch": 0.6693921721878194, "flos": 20449188230400.0, "grad_norm": 2.4542968597138515, "language_loss": 0.80876446, "learning_rate": 1.0409737663503428e-06, "loss": 0.8306793, "num_input_tokens_seen": 119813905, "step": 5567, "time_per_iteration": 2.682868003845215 }, { "auxiliary_loss_clip": 0.01165433, "auxiliary_loss_mlp": 0.01023041, "balance_loss_clip": 1.00899887, "balance_loss_mlp": 1.01498568, "epoch": 0.6695124150784585, "flos": 16614538963200.0, "grad_norm": 1.8271939902514522, "language_loss": 0.82718801, "learning_rate": 1.040290264449196e-06, "loss": 0.84907281, "num_input_tokens_seen": 119832010, "step": 5568, "time_per_iteration": 2.6088340282440186 }, { "auxiliary_loss_clip": 0.01166068, "auxiliary_loss_mlp": 0.0102314, "balance_loss_clip": 1.01233613, "balance_loss_mlp": 1.01622248, "epoch": 0.6696326579690975, "flos": 26652145852800.0, "grad_norm": 2.549959347011034, "language_loss": 0.64071977, "learning_rate": 1.0396069081352532e-06, "loss": 0.66261184, "num_input_tokens_seen": 119851165, "step": 5569, "time_per_iteration": 2.6262435913085938 }, { "auxiliary_loss_clip": 0.01067239, "auxiliary_loss_mlp": 0.01002693, "balance_loss_clip": 1.0130235, "balance_loss_mlp": 1.00111961, "epoch": 0.6697529008597367, "flos": 66964603662720.0, "grad_norm": 0.7737345147976794, "language_loss": 0.56055963, "learning_rate": 1.0389236975121782e-06, "loss": 0.58125889, "num_input_tokens_seen": 119906015, "step": 5570, "time_per_iteration": 3.0660955905914307 }, { "auxiliary_loss_clip": 0.01174504, "auxiliary_loss_mlp": 0.01029005, "balance_loss_clip": 1.05088496, "balance_loss_mlp": 1.02152801, "epoch": 0.6698731437503758, "flos": 20886939279360.0, "grad_norm": 4.945522214162882, "language_loss": 0.71434522, "learning_rate": 1.0382406326836147e-06, "loss": 0.73638022, "num_input_tokens_seen": 119925160, "step": 5571, "time_per_iteration": 2.58292555809021 }, { "auxiliary_loss_clip": 0.01176499, "auxiliary_loss_mlp": 0.01030272, "balance_loss_clip": 1.01265597, "balance_loss_mlp": 1.02225494, "epoch": 0.6699933866410148, "flos": 20409470766720.0, "grad_norm": 2.6316632368111836, "language_loss": 0.75806463, "learning_rate": 1.0375577137531828e-06, "loss": 0.78013235, "num_input_tokens_seen": 119943720, "step": 5572, "time_per_iteration": 2.604980230331421 }, { "auxiliary_loss_clip": 0.01171769, "auxiliary_loss_mlp": 0.01023074, "balance_loss_clip": 0.97260273, "balance_loss_mlp": 1.01609123, "epoch": 0.670113629531654, "flos": 29023075900800.0, "grad_norm": 1.523957665666801, "language_loss": 0.7214964, "learning_rate": 1.0368749408244802e-06, "loss": 0.7434448, "num_input_tokens_seen": 119966640, "step": 5573, "time_per_iteration": 2.7008001804351807 }, { "auxiliary_loss_clip": 0.01162768, "auxiliary_loss_mlp": 0.01024309, "balance_loss_clip": 1.01112366, "balance_loss_mlp": 1.01741314, "epoch": 0.670233872422293, "flos": 19791699730560.0, "grad_norm": 1.7589687623400208, "language_loss": 0.78748339, "learning_rate": 1.0361923140010836e-06, "loss": 0.80935419, "num_input_tokens_seen": 119985125, "step": 5574, "time_per_iteration": 2.6079936027526855 }, { "auxiliary_loss_clip": 0.01174248, "auxiliary_loss_mlp": 0.01027883, "balance_loss_clip": 1.01075625, "balance_loss_mlp": 1.02010143, "epoch": 0.6703541153129321, "flos": 24243689070720.0, "grad_norm": 3.0065981273578277, "language_loss": 0.63205075, "learning_rate": 1.0355098333865455e-06, "loss": 0.65407205, "num_input_tokens_seen": 120004355, "step": 5575, "time_per_iteration": 2.62214994430542 }, { "auxiliary_loss_clip": 0.01170337, "auxiliary_loss_mlp": 0.01028925, "balance_loss_clip": 1.01467645, "balance_loss_mlp": 1.021415, "epoch": 0.6704743582035713, "flos": 26688523351680.0, "grad_norm": 1.7163970652772538, "language_loss": 0.69590789, "learning_rate": 1.0348274990844006e-06, "loss": 0.71790051, "num_input_tokens_seen": 120027115, "step": 5576, "time_per_iteration": 2.693668842315674 }, { "auxiliary_loss_clip": 0.01169293, "auxiliary_loss_mlp": 0.01026403, "balance_loss_clip": 1.01291871, "balance_loss_mlp": 1.01979232, "epoch": 0.6705946010942103, "flos": 23514379326720.0, "grad_norm": 1.7455595037368377, "language_loss": 0.72466016, "learning_rate": 1.034145311198155e-06, "loss": 0.74661714, "num_input_tokens_seen": 120047130, "step": 5577, "time_per_iteration": 2.638317108154297 }, { "auxiliary_loss_clip": 0.01167978, "auxiliary_loss_mlp": 0.01026267, "balance_loss_clip": 1.04828131, "balance_loss_mlp": 1.01983225, "epoch": 0.6707148439848494, "flos": 24061011477120.0, "grad_norm": 1.8133485496168862, "language_loss": 0.63793421, "learning_rate": 1.0334632698312989e-06, "loss": 0.65987659, "num_input_tokens_seen": 120067925, "step": 5578, "time_per_iteration": 2.6983537673950195 }, { "auxiliary_loss_clip": 0.01158612, "auxiliary_loss_mlp": 0.0102826, "balance_loss_clip": 0.97031826, "balance_loss_mlp": 1.02066898, "epoch": 0.6708350868754885, "flos": 22528667324160.0, "grad_norm": 2.031382067049503, "language_loss": 0.75181556, "learning_rate": 1.032781375087295e-06, "loss": 0.77368426, "num_input_tokens_seen": 120087825, "step": 5579, "time_per_iteration": 2.7136716842651367 }, { "auxiliary_loss_clip": 0.01173841, "auxiliary_loss_mlp": 0.01026085, "balance_loss_clip": 0.97560585, "balance_loss_mlp": 1.01840782, "epoch": 0.6709553297661276, "flos": 25227749047680.0, "grad_norm": 1.5002724895206827, "language_loss": 0.67610097, "learning_rate": 1.0320996270695891e-06, "loss": 0.69810027, "num_input_tokens_seen": 120108895, "step": 5580, "time_per_iteration": 3.600037097930908 }, { "auxiliary_loss_clip": 0.01164466, "auxiliary_loss_mlp": 0.01027097, "balance_loss_clip": 0.93095851, "balance_loss_mlp": 1.01994467, "epoch": 0.6710755726567667, "flos": 20448757267200.0, "grad_norm": 1.856350082965629, "language_loss": 0.7322551, "learning_rate": 1.0314180258815998e-06, "loss": 0.75417072, "num_input_tokens_seen": 120127535, "step": 5581, "time_per_iteration": 2.749659538269043 }, { "auxiliary_loss_clip": 0.01153681, "auxiliary_loss_mlp": 0.0102642, "balance_loss_clip": 0.92961276, "balance_loss_mlp": 1.01993227, "epoch": 0.6711958155474057, "flos": 25995411538560.0, "grad_norm": 2.043330905424155, "language_loss": 0.7434299, "learning_rate": 1.0307365716267247e-06, "loss": 0.76523095, "num_input_tokens_seen": 120147980, "step": 5582, "time_per_iteration": 2.7207605838775635 }, { "auxiliary_loss_clip": 0.01168625, "auxiliary_loss_mlp": 0.01025483, "balance_loss_clip": 1.01153505, "balance_loss_mlp": 1.01794291, "epoch": 0.6713160584380449, "flos": 19937712516480.0, "grad_norm": 2.108282071058105, "language_loss": 0.77988183, "learning_rate": 1.0300552644083423e-06, "loss": 0.8018229, "num_input_tokens_seen": 120166905, "step": 5583, "time_per_iteration": 2.714796543121338 }, { "auxiliary_loss_clip": 0.01175098, "auxiliary_loss_mlp": 0.01025223, "balance_loss_clip": 0.93640339, "balance_loss_mlp": 1.01768911, "epoch": 0.6714363013286839, "flos": 18223373128320.0, "grad_norm": 2.329657537333104, "language_loss": 0.72085339, "learning_rate": 1.0293741043298036e-06, "loss": 0.74285656, "num_input_tokens_seen": 120185255, "step": 5584, "time_per_iteration": 2.716700315475464 }, { "auxiliary_loss_clip": 0.01176567, "auxiliary_loss_mlp": 0.01031627, "balance_loss_clip": 0.94214356, "balance_loss_mlp": 1.0238843, "epoch": 0.671556544219323, "flos": 25812374808960.0, "grad_norm": 1.8929129101400208, "language_loss": 0.71533972, "learning_rate": 1.0286930914944436e-06, "loss": 0.73742175, "num_input_tokens_seen": 120205070, "step": 5585, "time_per_iteration": 5.065800189971924 }, { "auxiliary_loss_clip": 0.01170069, "auxiliary_loss_mlp": 0.01028042, "balance_loss_clip": 1.04696047, "balance_loss_mlp": 1.02022815, "epoch": 0.6716767871099621, "flos": 15850431918720.0, "grad_norm": 2.7539899781872355, "language_loss": 0.77057827, "learning_rate": 1.0280122260055684e-06, "loss": 0.79255939, "num_input_tokens_seen": 120220780, "step": 5586, "time_per_iteration": 3.5618488788604736 }, { "auxiliary_loss_clip": 0.01173923, "auxiliary_loss_mlp": 0.01018568, "balance_loss_clip": 1.05169261, "balance_loss_mlp": 1.01122475, "epoch": 0.6717970300006012, "flos": 19756112330880.0, "grad_norm": 2.660438601676744, "language_loss": 0.82035637, "learning_rate": 1.0273315079664652e-06, "loss": 0.84228122, "num_input_tokens_seen": 120238735, "step": 5587, "time_per_iteration": 2.564166307449341 }, { "auxiliary_loss_clip": 0.0117205, "auxiliary_loss_mlp": 0.01023787, "balance_loss_clip": 1.01076794, "balance_loss_mlp": 1.01687264, "epoch": 0.6719172728912403, "flos": 25485049146240.0, "grad_norm": 2.698481782284807, "language_loss": 0.74608493, "learning_rate": 1.0266509374803992e-06, "loss": 0.76804328, "num_input_tokens_seen": 120259895, "step": 5588, "time_per_iteration": 2.644487142562866 }, { "auxiliary_loss_clip": 0.01173305, "auxiliary_loss_mlp": 0.01122663, "balance_loss_clip": 1.04992986, "balance_loss_mlp": 0.0, "epoch": 0.6720375157818794, "flos": 15880344969600.0, "grad_norm": 2.440766291091994, "language_loss": 0.84323573, "learning_rate": 1.0259705146506123e-06, "loss": 0.86619544, "num_input_tokens_seen": 120274790, "step": 5589, "time_per_iteration": 2.5704774856567383 }, { "auxiliary_loss_clip": 0.01174201, "auxiliary_loss_mlp": 0.01024036, "balance_loss_clip": 1.01222813, "balance_loss_mlp": 1.01691949, "epoch": 0.6721577586725185, "flos": 32010843231360.0, "grad_norm": 1.966950688182901, "language_loss": 0.77583337, "learning_rate": 1.025290239580324e-06, "loss": 0.79781574, "num_input_tokens_seen": 120295460, "step": 5590, "time_per_iteration": 2.8413171768188477 }, { "auxiliary_loss_clip": 0.01162675, "auxiliary_loss_mlp": 0.01026397, "balance_loss_clip": 0.89276063, "balance_loss_mlp": 1.01875246, "epoch": 0.6722780015631575, "flos": 20737873837440.0, "grad_norm": 1.6548630633492856, "language_loss": 0.75443077, "learning_rate": 1.0246101123727313e-06, "loss": 0.77632147, "num_input_tokens_seen": 120314440, "step": 5591, "time_per_iteration": 2.7431836128234863 }, { "auxiliary_loss_clip": 0.01168994, "auxiliary_loss_mlp": 0.01022935, "balance_loss_clip": 1.01003027, "balance_loss_mlp": 1.01644969, "epoch": 0.6723982444537967, "flos": 16909617191040.0, "grad_norm": 2.0079595301198525, "language_loss": 0.78433609, "learning_rate": 1.0239301331310085e-06, "loss": 0.80625534, "num_input_tokens_seen": 120332060, "step": 5592, "time_per_iteration": 2.5968751907348633 }, { "auxiliary_loss_clip": 0.01166694, "auxiliary_loss_mlp": 0.01026032, "balance_loss_clip": 1.01025355, "balance_loss_mlp": 1.01910567, "epoch": 0.6725184873444358, "flos": 20667812359680.0, "grad_norm": 1.6230270305987595, "language_loss": 0.88327885, "learning_rate": 1.0232503019583088e-06, "loss": 0.90520608, "num_input_tokens_seen": 120351670, "step": 5593, "time_per_iteration": 2.677717685699463 }, { "auxiliary_loss_clip": 0.01166128, "auxiliary_loss_mlp": 0.01030533, "balance_loss_clip": 1.0112803, "balance_loss_mlp": 1.02350855, "epoch": 0.6726387302350748, "flos": 23727616416000.0, "grad_norm": 2.004652453659147, "language_loss": 0.69583356, "learning_rate": 1.0225706189577619e-06, "loss": 0.71780014, "num_input_tokens_seen": 120370195, "step": 5594, "time_per_iteration": 2.6670029163360596 }, { "auxiliary_loss_clip": 0.01170025, "auxiliary_loss_mlp": 0.01025615, "balance_loss_clip": 1.00981724, "balance_loss_mlp": 1.01791453, "epoch": 0.672758973125714, "flos": 15188274650880.0, "grad_norm": 2.011961271409599, "language_loss": 0.74690199, "learning_rate": 1.021891084232475e-06, "loss": 0.76885843, "num_input_tokens_seen": 120388130, "step": 5595, "time_per_iteration": 2.7872517108917236 }, { "auxiliary_loss_clip": 0.01169233, "auxiliary_loss_mlp": 0.01025013, "balance_loss_clip": 1.00979769, "balance_loss_mlp": 1.01839685, "epoch": 0.672879216016353, "flos": 18077252601600.0, "grad_norm": 2.2742203305600834, "language_loss": 0.79823643, "learning_rate": 1.0212116978855325e-06, "loss": 0.82017887, "num_input_tokens_seen": 120406145, "step": 5596, "time_per_iteration": 2.6525237560272217 }, { "auxiliary_loss_clip": 0.01163252, "auxiliary_loss_mlp": 0.01025334, "balance_loss_clip": 0.93246746, "balance_loss_mlp": 1.01781154, "epoch": 0.6729994589069921, "flos": 23476349802240.0, "grad_norm": 2.2944807786320593, "language_loss": 0.78569186, "learning_rate": 1.020532460019997e-06, "loss": 0.80757761, "num_input_tokens_seen": 120425395, "step": 5597, "time_per_iteration": 2.7829432487487793 }, { "auxiliary_loss_clip": 0.01170724, "auxiliary_loss_mlp": 0.01025205, "balance_loss_clip": 0.81971085, "balance_loss_mlp": 1.0175966, "epoch": 0.6731197017976313, "flos": 26322018929280.0, "grad_norm": 1.7350096084025706, "language_loss": 0.70779735, "learning_rate": 1.0198533707389096e-06, "loss": 0.72975659, "num_input_tokens_seen": 120446270, "step": 5598, "time_per_iteration": 2.9193215370178223 }, { "auxiliary_loss_clip": 0.01165731, "auxiliary_loss_mlp": 0.01122703, "balance_loss_clip": 1.01077092, "balance_loss_mlp": 0.0, "epoch": 0.6732399446882703, "flos": 21616428591360.0, "grad_norm": 1.6904891265534132, "language_loss": 0.73540616, "learning_rate": 1.0191744301452853e-06, "loss": 0.75829053, "num_input_tokens_seen": 120465570, "step": 5599, "time_per_iteration": 3.0810165405273438 }, { "auxiliary_loss_clip": 0.01168623, "auxiliary_loss_mlp": 0.0102431, "balance_loss_clip": 1.04789305, "balance_loss_mlp": 1.01708615, "epoch": 0.6733601875789094, "flos": 25880173729920.0, "grad_norm": 1.596986932201288, "language_loss": 0.7040844, "learning_rate": 1.0184956383421208e-06, "loss": 0.72601378, "num_input_tokens_seen": 120484220, "step": 5600, "time_per_iteration": 2.7160420417785645 }, { "auxiliary_loss_clip": 0.01172794, "auxiliary_loss_mlp": 0.01027537, "balance_loss_clip": 1.01195431, "balance_loss_mlp": 1.02066147, "epoch": 0.6734804304695485, "flos": 22929573997440.0, "grad_norm": 2.7540142384907917, "language_loss": 0.65140307, "learning_rate": 1.017816995432387e-06, "loss": 0.67340636, "num_input_tokens_seen": 120503320, "step": 5601, "time_per_iteration": 2.7826123237609863 }, { "auxiliary_loss_clip": 0.01169497, "auxiliary_loss_mlp": 0.01026714, "balance_loss_clip": 0.97360063, "balance_loss_mlp": 1.01935911, "epoch": 0.6736006733601876, "flos": 18697968552960.0, "grad_norm": 1.8527087445229178, "language_loss": 0.74258101, "learning_rate": 1.0171385015190353e-06, "loss": 0.76454312, "num_input_tokens_seen": 120523180, "step": 5602, "time_per_iteration": 2.6931209564208984 }, { "auxiliary_loss_clip": 0.01166698, "auxiliary_loss_mlp": 0.01122757, "balance_loss_clip": 0.97528207, "balance_loss_mlp": 0.0, "epoch": 0.6737209162508266, "flos": 19427745173760.0, "grad_norm": 1.920579024467232, "language_loss": 0.73254132, "learning_rate": 1.0164601567049908e-06, "loss": 0.75543588, "num_input_tokens_seen": 120541710, "step": 5603, "time_per_iteration": 2.6096713542938232 }, { "auxiliary_loss_clip": 0.01166636, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 0.97278845, "balance_loss_mlp": 1.0236783, "epoch": 0.6738411591414658, "flos": 20158060498560.0, "grad_norm": 1.8295906120720167, "language_loss": 0.80272591, "learning_rate": 1.015781961093158e-06, "loss": 0.82469839, "num_input_tokens_seen": 120561030, "step": 5604, "time_per_iteration": 2.6913812160491943 }, { "auxiliary_loss_clip": 0.01172215, "auxiliary_loss_mlp": 0.01026886, "balance_loss_clip": 0.97179788, "balance_loss_mlp": 1.01998067, "epoch": 0.6739614020321049, "flos": 21653847584640.0, "grad_norm": 1.5390560132150037, "language_loss": 0.77242267, "learning_rate": 1.0151039147864197e-06, "loss": 0.79441369, "num_input_tokens_seen": 120581005, "step": 5605, "time_per_iteration": 4.009011268615723 }, { "auxiliary_loss_clip": 0.01164263, "auxiliary_loss_mlp": 0.01030264, "balance_loss_clip": 0.82402188, "balance_loss_mlp": 1.02269983, "epoch": 0.6740816449227439, "flos": 19171702051200.0, "grad_norm": 3.582391477923609, "language_loss": 0.65569842, "learning_rate": 1.0144260178876336e-06, "loss": 0.67764366, "num_input_tokens_seen": 120600350, "step": 5606, "time_per_iteration": 2.824582576751709 }, { "auxiliary_loss_clip": 0.01174396, "auxiliary_loss_mlp": 0.01023856, "balance_loss_clip": 0.97348696, "balance_loss_mlp": 1.01693892, "epoch": 0.6742018878133831, "flos": 21097015971840.0, "grad_norm": 3.8162064878892017, "language_loss": 0.67093498, "learning_rate": 1.0137482704996388e-06, "loss": 0.69291747, "num_input_tokens_seen": 120614700, "step": 5607, "time_per_iteration": 2.689446449279785 }, { "auxiliary_loss_clip": 0.01171846, "auxiliary_loss_mlp": 0.01030111, "balance_loss_clip": 0.93552297, "balance_loss_mlp": 1.02244616, "epoch": 0.6743221307040221, "flos": 23549966726400.0, "grad_norm": 1.904122001832646, "language_loss": 0.78787041, "learning_rate": 1.0130706727252461e-06, "loss": 0.80989003, "num_input_tokens_seen": 120631755, "step": 5608, "time_per_iteration": 2.758591413497925 }, { "auxiliary_loss_clip": 0.01174834, "auxiliary_loss_mlp": 0.010247, "balance_loss_clip": 0.93657029, "balance_loss_mlp": 1.01766682, "epoch": 0.6744423735946612, "flos": 16249542912000.0, "grad_norm": 2.250993433735692, "language_loss": 0.67705584, "learning_rate": 1.0123932246672468e-06, "loss": 0.69905114, "num_input_tokens_seen": 120645900, "step": 5609, "time_per_iteration": 2.711188316345215 }, { "auxiliary_loss_clip": 0.01070852, "auxiliary_loss_mlp": 0.01115982, "balance_loss_clip": 0.86186337, "balance_loss_mlp": 0.0, "epoch": 0.6745626164853004, "flos": 57843257829120.0, "grad_norm": 0.7529036303924393, "language_loss": 0.55826175, "learning_rate": 1.0117159264284114e-06, "loss": 0.5801301, "num_input_tokens_seen": 120709070, "step": 5610, "time_per_iteration": 3.288889169692993 }, { "auxiliary_loss_clip": 0.01169313, "auxiliary_loss_mlp": 0.01021675, "balance_loss_clip": 0.97314227, "balance_loss_mlp": 1.01432872, "epoch": 0.6746828593759394, "flos": 20485027025280.0, "grad_norm": 1.878371774100965, "language_loss": 0.76999062, "learning_rate": 1.0110387781114837e-06, "loss": 0.7919004, "num_input_tokens_seen": 120727685, "step": 5611, "time_per_iteration": 4.431893825531006 }, { "auxiliary_loss_clip": 0.01170442, "auxiliary_loss_mlp": 0.01022849, "balance_loss_clip": 1.05010641, "balance_loss_mlp": 1.01514196, "epoch": 0.6748031022665785, "flos": 19208223204480.0, "grad_norm": 2.5955382611898603, "language_loss": 0.774822, "learning_rate": 1.0103617798191872e-06, "loss": 0.79675484, "num_input_tokens_seen": 120747160, "step": 5612, "time_per_iteration": 3.569495916366577 }, { "auxiliary_loss_clip": 0.01166145, "auxiliary_loss_mlp": 0.01031405, "balance_loss_clip": 0.97426558, "balance_loss_mlp": 1.02393603, "epoch": 0.6749233451572175, "flos": 15195026407680.0, "grad_norm": 2.026987188461399, "language_loss": 0.82505465, "learning_rate": 1.0096849316542217e-06, "loss": 0.8470301, "num_input_tokens_seen": 120763710, "step": 5613, "time_per_iteration": 2.613504648208618 }, { "auxiliary_loss_clip": 0.01145857, "auxiliary_loss_mlp": 0.01026186, "balance_loss_clip": 0.81501549, "balance_loss_mlp": 1.01861036, "epoch": 0.6750435880478567, "flos": 26499489050880.0, "grad_norm": 1.8590624164288236, "language_loss": 0.74720961, "learning_rate": 1.0090082337192643e-06, "loss": 0.76893002, "num_input_tokens_seen": 120783355, "step": 5614, "time_per_iteration": 2.8600378036499023 }, { "auxiliary_loss_clip": 0.01152838, "auxiliary_loss_mlp": 0.01027984, "balance_loss_clip": 0.85076618, "balance_loss_mlp": 1.02077198, "epoch": 0.6751638309384957, "flos": 23404313076480.0, "grad_norm": 4.07319428323177, "language_loss": 0.78068614, "learning_rate": 1.0083316861169705e-06, "loss": 0.80249435, "num_input_tokens_seen": 120802090, "step": 5615, "time_per_iteration": 2.8443751335144043 }, { "auxiliary_loss_clip": 0.01172676, "auxiliary_loss_mlp": 0.01024602, "balance_loss_clip": 0.93361658, "balance_loss_mlp": 1.01612616, "epoch": 0.6752840738291348, "flos": 23441408847360.0, "grad_norm": 2.1989188589040816, "language_loss": 0.71867955, "learning_rate": 1.0076552889499713e-06, "loss": 0.74065232, "num_input_tokens_seen": 120822855, "step": 5616, "time_per_iteration": 2.743403673171997 }, { "auxiliary_loss_clip": 0.01168392, "auxiliary_loss_mlp": 0.01027705, "balance_loss_clip": 1.01277804, "balance_loss_mlp": 1.02085936, "epoch": 0.675404316719774, "flos": 30335826257280.0, "grad_norm": 1.9847235348334233, "language_loss": 0.73896283, "learning_rate": 1.006979042320876e-06, "loss": 0.7609238, "num_input_tokens_seen": 120843070, "step": 5617, "time_per_iteration": 2.7661936283111572 }, { "auxiliary_loss_clip": 0.01162106, "auxiliary_loss_mlp": 0.01029826, "balance_loss_clip": 0.97019291, "balance_loss_mlp": 1.0223577, "epoch": 0.675524559610413, "flos": 23622613983360.0, "grad_norm": 2.035974448678337, "language_loss": 0.62690818, "learning_rate": 1.0063029463322702e-06, "loss": 0.64882743, "num_input_tokens_seen": 120863345, "step": 5618, "time_per_iteration": 2.7271385192871094 }, { "auxiliary_loss_clip": 0.01160641, "auxiliary_loss_mlp": 0.01122493, "balance_loss_clip": 0.89377481, "balance_loss_mlp": 0.0, "epoch": 0.6756448025010521, "flos": 21248631279360.0, "grad_norm": 2.186908743616244, "language_loss": 0.753802, "learning_rate": 1.0056270010867164e-06, "loss": 0.77663338, "num_input_tokens_seen": 120880915, "step": 5619, "time_per_iteration": 2.7433953285217285 }, { "auxiliary_loss_clip": 0.011703, "auxiliary_loss_mlp": 0.01025363, "balance_loss_clip": 0.97044599, "balance_loss_mlp": 1.01772749, "epoch": 0.6757650453916912, "flos": 21646521210240.0, "grad_norm": 2.56203817178192, "language_loss": 0.78287977, "learning_rate": 1.004951206686758e-06, "loss": 0.80483639, "num_input_tokens_seen": 120899190, "step": 5620, "time_per_iteration": 2.6492655277252197 }, { "auxiliary_loss_clip": 0.01162969, "auxiliary_loss_mlp": 0.01028893, "balance_loss_clip": 1.00923002, "balance_loss_mlp": 1.02116835, "epoch": 0.6758852882823303, "flos": 21795658479360.0, "grad_norm": 1.7861839619353226, "language_loss": 0.71355957, "learning_rate": 1.0042755632349087e-06, "loss": 0.73547816, "num_input_tokens_seen": 120916080, "step": 5621, "time_per_iteration": 2.6849591732025146 }, { "auxiliary_loss_clip": 0.01167173, "auxiliary_loss_mlp": 0.01029276, "balance_loss_clip": 0.93477273, "balance_loss_mlp": 1.02184296, "epoch": 0.6760055311729694, "flos": 27088783580160.0, "grad_norm": 2.013261649342481, "language_loss": 0.62338996, "learning_rate": 1.0036000708336653e-06, "loss": 0.64535445, "num_input_tokens_seen": 120935210, "step": 5622, "time_per_iteration": 2.7785449028015137 }, { "auxiliary_loss_clip": 0.01172407, "auxiliary_loss_mlp": 0.0103142, "balance_loss_clip": 0.97345626, "balance_loss_mlp": 1.02399623, "epoch": 0.6761257740636085, "flos": 17999792922240.0, "grad_norm": 2.2577887751550603, "language_loss": 0.79424095, "learning_rate": 1.0029247295854984e-06, "loss": 0.81627923, "num_input_tokens_seen": 120951830, "step": 5623, "time_per_iteration": 2.7091352939605713 }, { "auxiliary_loss_clip": 0.01173478, "auxiliary_loss_mlp": 0.01032063, "balance_loss_clip": 0.93602359, "balance_loss_mlp": 1.02492571, "epoch": 0.6762460169542476, "flos": 15121912273920.0, "grad_norm": 1.715683474359595, "language_loss": 0.71597636, "learning_rate": 1.0022495395928588e-06, "loss": 0.73803174, "num_input_tokens_seen": 120970310, "step": 5624, "time_per_iteration": 2.750312089920044 }, { "auxiliary_loss_clip": 0.01064988, "auxiliary_loss_mlp": 0.01002122, "balance_loss_clip": 1.0112021, "balance_loss_mlp": 1.00048876, "epoch": 0.6763662598448866, "flos": 67886970030720.0, "grad_norm": 0.7941691167042878, "language_loss": 0.62375724, "learning_rate": 1.0015745009581697e-06, "loss": 0.64442831, "num_input_tokens_seen": 121031915, "step": 5625, "time_per_iteration": 3.25520658493042 }, { "auxiliary_loss_clip": 0.01173563, "auxiliary_loss_mlp": 0.01026506, "balance_loss_clip": 1.01504183, "balance_loss_mlp": 1.0192219, "epoch": 0.6764865027355258, "flos": 20631829910400.0, "grad_norm": 1.8386399311957164, "language_loss": 0.66937912, "learning_rate": 1.0008996137838343e-06, "loss": 0.69137979, "num_input_tokens_seen": 121050890, "step": 5626, "time_per_iteration": 2.649766683578491 }, { "auxiliary_loss_clip": 0.01176445, "auxiliary_loss_mlp": 0.01027774, "balance_loss_clip": 1.04966629, "balance_loss_mlp": 1.0196023, "epoch": 0.6766067456261649, "flos": 21215809226880.0, "grad_norm": 1.9841884551136904, "language_loss": 0.79673028, "learning_rate": 1.000224878172234e-06, "loss": 0.81877244, "num_input_tokens_seen": 121070015, "step": 5627, "time_per_iteration": 2.6607861518859863 }, { "auxiliary_loss_clip": 0.01172392, "auxiliary_loss_mlp": 0.01022048, "balance_loss_clip": 1.0116719, "balance_loss_mlp": 1.01484144, "epoch": 0.6767269885168039, "flos": 19938251220480.0, "grad_norm": 1.9998268636434067, "language_loss": 0.72944009, "learning_rate": 9.99550294225724e-07, "loss": 0.7513845, "num_input_tokens_seen": 121089170, "step": 5628, "time_per_iteration": 2.6604793071746826 }, { "auxiliary_loss_clip": 0.01169219, "auxiliary_loss_mlp": 0.01032182, "balance_loss_clip": 0.89397204, "balance_loss_mlp": 1.02449632, "epoch": 0.6768472314074431, "flos": 20814076540800.0, "grad_norm": 1.9580734528840984, "language_loss": 0.72569036, "learning_rate": 9.988758620466402e-07, "loss": 0.74770439, "num_input_tokens_seen": 121108040, "step": 5629, "time_per_iteration": 2.711390733718872 }, { "auxiliary_loss_clip": 0.01169561, "auxiliary_loss_mlp": 0.01027405, "balance_loss_clip": 0.85679674, "balance_loss_mlp": 1.02060974, "epoch": 0.6769674742980821, "flos": 23186012169600.0, "grad_norm": 1.4408927493891752, "language_loss": 0.76030707, "learning_rate": 9.982015817372917e-07, "loss": 0.78227675, "num_input_tokens_seen": 121128480, "step": 5630, "time_per_iteration": 2.8167552947998047 }, { "auxiliary_loss_clip": 0.01161622, "auxiliary_loss_mlp": 0.01028525, "balance_loss_clip": 0.89243841, "balance_loss_mlp": 1.0208714, "epoch": 0.6770877171887212, "flos": 24242934885120.0, "grad_norm": 1.8739460583281569, "language_loss": 0.82058489, "learning_rate": 9.975274533999657e-07, "loss": 0.84248632, "num_input_tokens_seen": 121148010, "step": 5631, "time_per_iteration": 3.78566575050354 }, { "auxiliary_loss_clip": 0.0117096, "auxiliary_loss_mlp": 0.0102418, "balance_loss_clip": 1.04803765, "balance_loss_mlp": 1.0163238, "epoch": 0.6772079600793603, "flos": 18141567903360.0, "grad_norm": 2.42301461340185, "language_loss": 0.8423388, "learning_rate": 9.96853477136929e-07, "loss": 0.86429018, "num_input_tokens_seen": 121162755, "step": 5632, "time_per_iteration": 2.6560187339782715 }, { "auxiliary_loss_clip": 0.01157217, "auxiliary_loss_mlp": 0.010302, "balance_loss_clip": 0.92988414, "balance_loss_mlp": 1.02246881, "epoch": 0.6773282029699994, "flos": 22452069571200.0, "grad_norm": 1.9742853453634266, "language_loss": 0.75579369, "learning_rate": 9.96179653050422e-07, "loss": 0.77766788, "num_input_tokens_seen": 121182915, "step": 5633, "time_per_iteration": 2.7428526878356934 }, { "auxiliary_loss_clip": 0.01157466, "auxiliary_loss_mlp": 0.01023163, "balance_loss_clip": 0.93199211, "balance_loss_mlp": 1.01563215, "epoch": 0.6774484458606385, "flos": 18693730748160.0, "grad_norm": 2.0817218197950487, "language_loss": 0.73989373, "learning_rate": 9.955059812426635e-07, "loss": 0.76170003, "num_input_tokens_seen": 121200445, "step": 5634, "time_per_iteration": 2.753983497619629 }, { "auxiliary_loss_clip": 0.01172287, "auxiliary_loss_mlp": 0.0102336, "balance_loss_clip": 1.05079722, "balance_loss_mlp": 1.01648188, "epoch": 0.6775686887512776, "flos": 25994046821760.0, "grad_norm": 2.552279912227689, "language_loss": 0.8270396, "learning_rate": 9.948324618158493e-07, "loss": 0.8489961, "num_input_tokens_seen": 121220785, "step": 5635, "time_per_iteration": 2.637824296951294 }, { "auxiliary_loss_clip": 0.0116909, "auxiliary_loss_mlp": 0.01027505, "balance_loss_clip": 1.00824475, "balance_loss_mlp": 1.02016747, "epoch": 0.6776889316419167, "flos": 13587987922560.0, "grad_norm": 2.231658529873091, "language_loss": 0.77496445, "learning_rate": 9.941590948721502e-07, "loss": 0.79693043, "num_input_tokens_seen": 121237985, "step": 5636, "time_per_iteration": 3.51859450340271 }, { "auxiliary_loss_clip": 0.01161899, "auxiliary_loss_mlp": 0.0102102, "balance_loss_clip": 0.97239602, "balance_loss_mlp": 1.01382565, "epoch": 0.6778091745325557, "flos": 27601121220480.0, "grad_norm": 1.7092363144492346, "language_loss": 0.7635746, "learning_rate": 9.934858805137188e-07, "loss": 0.78540379, "num_input_tokens_seen": 121258635, "step": 5637, "time_per_iteration": 3.747009515762329 }, { "auxiliary_loss_clip": 0.0116625, "auxiliary_loss_mlp": 0.01024963, "balance_loss_clip": 1.01127815, "balance_loss_mlp": 1.01805472, "epoch": 0.6779294174231949, "flos": 18734058743040.0, "grad_norm": 7.6350717216944615, "language_loss": 0.80390704, "learning_rate": 9.92812818842677e-07, "loss": 0.82581925, "num_input_tokens_seen": 121277810, "step": 5638, "time_per_iteration": 3.6035659313201904 }, { "auxiliary_loss_clip": 0.01169576, "auxiliary_loss_mlp": 0.01031601, "balance_loss_clip": 1.01154661, "balance_loss_mlp": 1.02434683, "epoch": 0.678049660313834, "flos": 45873797765760.0, "grad_norm": 1.6301122502483503, "language_loss": 0.64513677, "learning_rate": 9.921399099611306e-07, "loss": 0.66714853, "num_input_tokens_seen": 121298975, "step": 5639, "time_per_iteration": 2.8734097480773926 }, { "auxiliary_loss_clip": 0.01168539, "auxiliary_loss_mlp": 0.0102498, "balance_loss_clip": 0.97160566, "balance_loss_mlp": 1.01808333, "epoch": 0.678169903204473, "flos": 19974556892160.0, "grad_norm": 2.0839169569069673, "language_loss": 0.69272655, "learning_rate": 9.914671539711588e-07, "loss": 0.71466178, "num_input_tokens_seen": 121318495, "step": 5640, "time_per_iteration": 2.6640939712524414 }, { "auxiliary_loss_clip": 0.01165953, "auxiliary_loss_mlp": 0.01123117, "balance_loss_clip": 0.78170496, "balance_loss_mlp": 0.0, "epoch": 0.6782901460951122, "flos": 21395613732480.0, "grad_norm": 4.607776806713373, "language_loss": 0.78445661, "learning_rate": 9.90794550974817e-07, "loss": 0.8073473, "num_input_tokens_seen": 121338890, "step": 5641, "time_per_iteration": 2.9818103313446045 }, { "auxiliary_loss_clip": 0.01163121, "auxiliary_loss_mlp": 0.01029328, "balance_loss_clip": 0.93465424, "balance_loss_mlp": 1.02171659, "epoch": 0.6784103889857512, "flos": 21434002392960.0, "grad_norm": 2.149886572731091, "language_loss": 0.81241554, "learning_rate": 9.901221010741407e-07, "loss": 0.83434004, "num_input_tokens_seen": 121358210, "step": 5642, "time_per_iteration": 3.4884440898895264 }, { "auxiliary_loss_clip": 0.01172338, "auxiliary_loss_mlp": 0.01029729, "balance_loss_clip": 1.01121402, "balance_loss_mlp": 1.02248979, "epoch": 0.6785306318763903, "flos": 32671923091200.0, "grad_norm": 1.7867383118866773, "language_loss": 0.74513018, "learning_rate": 9.894498043711375e-07, "loss": 0.76715088, "num_input_tokens_seen": 121379955, "step": 5643, "time_per_iteration": 2.79034161567688 }, { "auxiliary_loss_clip": 0.01164513, "auxiliary_loss_mlp": 0.01022684, "balance_loss_clip": 0.97069633, "balance_loss_mlp": 1.015517, "epoch": 0.6786508747670293, "flos": 25632139340160.0, "grad_norm": 1.9556092053341714, "language_loss": 0.69520903, "learning_rate": 9.887776609677962e-07, "loss": 0.71708101, "num_input_tokens_seen": 121401325, "step": 5644, "time_per_iteration": 2.7453815937042236 }, { "auxiliary_loss_clip": 0.011538, "auxiliary_loss_mlp": 0.01026594, "balance_loss_clip": 0.92968225, "balance_loss_mlp": 1.01913738, "epoch": 0.6787711176576685, "flos": 19171881619200.0, "grad_norm": 1.6185271372067218, "language_loss": 0.72078145, "learning_rate": 9.88105670966079e-07, "loss": 0.74258536, "num_input_tokens_seen": 121419785, "step": 5645, "time_per_iteration": 2.716728448867798 }, { "auxiliary_loss_clip": 0.01153871, "auxiliary_loss_mlp": 0.01028599, "balance_loss_clip": 0.89615571, "balance_loss_mlp": 1.02108884, "epoch": 0.6788913605483076, "flos": 13985159581440.0, "grad_norm": 1.783948804590144, "language_loss": 0.78969872, "learning_rate": 9.874338344679283e-07, "loss": 0.81152338, "num_input_tokens_seen": 121435630, "step": 5646, "time_per_iteration": 2.8618764877319336 }, { "auxiliary_loss_clip": 0.01168683, "auxiliary_loss_mlp": 0.01023357, "balance_loss_clip": 1.04977918, "balance_loss_mlp": 1.01616311, "epoch": 0.6790116034389466, "flos": 22017586659840.0, "grad_norm": 1.8340813721520082, "language_loss": 0.7446245, "learning_rate": 9.86762151575259e-07, "loss": 0.76654482, "num_input_tokens_seen": 121455625, "step": 5647, "time_per_iteration": 2.6497247219085693 }, { "auxiliary_loss_clip": 0.01170315, "auxiliary_loss_mlp": 0.01122103, "balance_loss_clip": 0.89817512, "balance_loss_mlp": 0.0, "epoch": 0.6791318463295858, "flos": 20922454851840.0, "grad_norm": 1.6646444159863294, "language_loss": 0.80386019, "learning_rate": 9.860906223899651e-07, "loss": 0.82678431, "num_input_tokens_seen": 121475020, "step": 5648, "time_per_iteration": 2.7790586948394775 }, { "auxiliary_loss_clip": 0.01172238, "auxiliary_loss_mlp": 0.01026357, "balance_loss_clip": 0.97141761, "balance_loss_mlp": 1.01932657, "epoch": 0.6792520892202248, "flos": 28512749422080.0, "grad_norm": 1.6015180122888437, "language_loss": 0.75703299, "learning_rate": 9.854192470139184e-07, "loss": 0.77901894, "num_input_tokens_seen": 121496500, "step": 5649, "time_per_iteration": 2.701676845550537 }, { "auxiliary_loss_clip": 0.01169138, "auxiliary_loss_mlp": 0.01028346, "balance_loss_clip": 0.9752143, "balance_loss_mlp": 1.02148223, "epoch": 0.6793723321108639, "flos": 20011904058240.0, "grad_norm": 2.770286259325702, "language_loss": 0.71724266, "learning_rate": 9.847480255489645e-07, "loss": 0.73921752, "num_input_tokens_seen": 121515525, "step": 5650, "time_per_iteration": 2.685312032699585 }, { "auxiliary_loss_clip": 0.01172754, "auxiliary_loss_mlp": 0.01026494, "balance_loss_clip": 0.97183514, "balance_loss_mlp": 1.01866174, "epoch": 0.6794925750015031, "flos": 26649488246400.0, "grad_norm": 1.6209848438253913, "language_loss": 0.68994021, "learning_rate": 9.840769580969295e-07, "loss": 0.71193278, "num_input_tokens_seen": 121535965, "step": 5651, "time_per_iteration": 2.860853433609009 }, { "auxiliary_loss_clip": 0.01159585, "auxiliary_loss_mlp": 0.01027759, "balance_loss_clip": 1.00837493, "balance_loss_mlp": 1.02004623, "epoch": 0.6796128178921421, "flos": 21580374314880.0, "grad_norm": 1.7856862696730422, "language_loss": 0.79621798, "learning_rate": 9.834060447596114e-07, "loss": 0.81809139, "num_input_tokens_seen": 121555235, "step": 5652, "time_per_iteration": 2.6197242736816406 }, { "auxiliary_loss_clip": 0.01168404, "auxiliary_loss_mlp": 0.01026203, "balance_loss_clip": 1.00921321, "balance_loss_mlp": 1.01850224, "epoch": 0.6797330607827812, "flos": 22492002516480.0, "grad_norm": 1.9762641618081038, "language_loss": 0.77778184, "learning_rate": 9.827352856387868e-07, "loss": 0.79972792, "num_input_tokens_seen": 121574945, "step": 5653, "time_per_iteration": 2.6298670768737793 }, { "auxiliary_loss_clip": 0.0106985, "auxiliary_loss_mlp": 0.01003715, "balance_loss_clip": 0.86422467, "balance_loss_mlp": 1.00192726, "epoch": 0.6798533036734203, "flos": 66306648286080.0, "grad_norm": 0.7763273066117955, "language_loss": 0.64278716, "learning_rate": 9.820646808362118e-07, "loss": 0.66352284, "num_input_tokens_seen": 121641200, "step": 5654, "time_per_iteration": 3.399152994155884 }, { "auxiliary_loss_clip": 0.01163356, "auxiliary_loss_mlp": 0.01022654, "balance_loss_clip": 0.97231877, "balance_loss_mlp": 1.01604056, "epoch": 0.6799735465640594, "flos": 16180163792640.0, "grad_norm": 3.4110806643419367, "language_loss": 0.73340219, "learning_rate": 9.813942304536154e-07, "loss": 0.75526226, "num_input_tokens_seen": 121659170, "step": 5655, "time_per_iteration": 2.6342203617095947 }, { "auxiliary_loss_clip": 0.01169229, "auxiliary_loss_mlp": 0.01022481, "balance_loss_clip": 0.97202861, "balance_loss_mlp": 1.01512313, "epoch": 0.6800937894546984, "flos": 22125749489280.0, "grad_norm": 1.9262015584175645, "language_loss": 0.63676572, "learning_rate": 9.807239345927043e-07, "loss": 0.65868282, "num_input_tokens_seen": 121679180, "step": 5656, "time_per_iteration": 2.726402521133423 }, { "auxiliary_loss_clip": 0.01171655, "auxiliary_loss_mlp": 0.01021376, "balance_loss_clip": 0.97086471, "balance_loss_mlp": 1.01418781, "epoch": 0.6802140323453376, "flos": 31612953300480.0, "grad_norm": 2.033279171575098, "language_loss": 0.72460526, "learning_rate": 9.80053793355162e-07, "loss": 0.74653554, "num_input_tokens_seen": 121697875, "step": 5657, "time_per_iteration": 3.734870433807373 }, { "auxiliary_loss_clip": 0.01161891, "auxiliary_loss_mlp": 0.01025104, "balance_loss_clip": 0.89475054, "balance_loss_mlp": 1.01755786, "epoch": 0.6803342752359767, "flos": 17712938908800.0, "grad_norm": 1.981066104911819, "language_loss": 0.74687165, "learning_rate": 9.793838068426472e-07, "loss": 0.76874161, "num_input_tokens_seen": 121715570, "step": 5658, "time_per_iteration": 2.6650161743164062 }, { "auxiliary_loss_clip": 0.01169487, "auxiliary_loss_mlp": 0.0102716, "balance_loss_clip": 1.04838073, "balance_loss_mlp": 1.02020717, "epoch": 0.6804545181266157, "flos": 11326800902400.0, "grad_norm": 2.1161386779564304, "language_loss": 0.61328065, "learning_rate": 9.78713975156799e-07, "loss": 0.63524711, "num_input_tokens_seen": 121731435, "step": 5659, "time_per_iteration": 2.6187477111816406 }, { "auxiliary_loss_clip": 0.01170113, "auxiliary_loss_mlp": 0.0103342, "balance_loss_clip": 0.93704432, "balance_loss_mlp": 1.02579987, "epoch": 0.6805747610172549, "flos": 29350976181120.0, "grad_norm": 1.8804740309071766, "language_loss": 0.71334946, "learning_rate": 9.780442983992273e-07, "loss": 0.73538476, "num_input_tokens_seen": 121749950, "step": 5660, "time_per_iteration": 2.752934455871582 }, { "auxiliary_loss_clip": 0.01162938, "auxiliary_loss_mlp": 0.01026204, "balance_loss_clip": 0.97238445, "balance_loss_mlp": 1.0190959, "epoch": 0.680695003907894, "flos": 37631868612480.0, "grad_norm": 1.646135833218756, "language_loss": 0.71740401, "learning_rate": 9.773747766715238e-07, "loss": 0.73929542, "num_input_tokens_seen": 121770770, "step": 5661, "time_per_iteration": 2.81315279006958 }, { "auxiliary_loss_clip": 0.01170215, "auxiliary_loss_mlp": 0.01027415, "balance_loss_clip": 0.97142017, "balance_loss_mlp": 1.02028346, "epoch": 0.680815246798533, "flos": 22127365601280.0, "grad_norm": 1.701229674930598, "language_loss": 0.80382133, "learning_rate": 9.767054100752536e-07, "loss": 0.82579768, "num_input_tokens_seen": 121790720, "step": 5662, "time_per_iteration": 2.677824020385742 }, { "auxiliary_loss_clip": 0.01171311, "auxiliary_loss_mlp": 0.01023614, "balance_loss_clip": 0.93562198, "balance_loss_mlp": 1.01671791, "epoch": 0.6809354896891722, "flos": 17201822330880.0, "grad_norm": 1.9901152542192784, "language_loss": 0.81818771, "learning_rate": 9.760361987119584e-07, "loss": 0.84013689, "num_input_tokens_seen": 121808455, "step": 5663, "time_per_iteration": 3.6109368801116943 }, { "auxiliary_loss_clip": 0.0116477, "auxiliary_loss_mlp": 0.01024158, "balance_loss_clip": 0.97208792, "balance_loss_mlp": 1.01688051, "epoch": 0.6810557325798112, "flos": 12458166554880.0, "grad_norm": 3.335231407741031, "language_loss": 0.67571008, "learning_rate": 9.753671426831592e-07, "loss": 0.69759929, "num_input_tokens_seen": 121824470, "step": 5664, "time_per_iteration": 3.5523483753204346 }, { "auxiliary_loss_clip": 0.01158599, "auxiliary_loss_mlp": 0.01026004, "balance_loss_clip": 1.00642347, "balance_loss_mlp": 1.01864839, "epoch": 0.6811759754704503, "flos": 22156165330560.0, "grad_norm": 1.9473945711620657, "language_loss": 0.79466164, "learning_rate": 9.746982420903483e-07, "loss": 0.8165077, "num_input_tokens_seen": 121842665, "step": 5665, "time_per_iteration": 2.657966136932373 }, { "auxiliary_loss_clip": 0.01169279, "auxiliary_loss_mlp": 0.01024135, "balance_loss_clip": 1.01258945, "balance_loss_mlp": 1.01780462, "epoch": 0.6812962183610894, "flos": 17525377065600.0, "grad_norm": 2.036906537045561, "language_loss": 0.74693274, "learning_rate": 9.740294970349993e-07, "loss": 0.76886678, "num_input_tokens_seen": 121859080, "step": 5666, "time_per_iteration": 2.610193967819214 }, { "auxiliary_loss_clip": 0.0106906, "auxiliary_loss_mlp": 0.01001937, "balance_loss_clip": 0.93648714, "balance_loss_mlp": 1.0003041, "epoch": 0.6814164612517285, "flos": 60274480855680.0, "grad_norm": 0.8892427107516558, "language_loss": 0.60984886, "learning_rate": 9.733609076185594e-07, "loss": 0.63055885, "num_input_tokens_seen": 121915485, "step": 5667, "time_per_iteration": 3.1546268463134766 }, { "auxiliary_loss_clip": 0.01172768, "auxiliary_loss_mlp": 0.01026519, "balance_loss_clip": 1.01388061, "balance_loss_mlp": 1.01881766, "epoch": 0.6815367041423676, "flos": 19317750750720.0, "grad_norm": 2.2646465038586605, "language_loss": 0.83534312, "learning_rate": 9.72692473942455e-07, "loss": 0.85733604, "num_input_tokens_seen": 121932710, "step": 5668, "time_per_iteration": 2.5655906200408936 }, { "auxiliary_loss_clip": 0.01172894, "auxiliary_loss_mlp": 0.01028194, "balance_loss_clip": 0.89981669, "balance_loss_mlp": 1.02057648, "epoch": 0.6816569470330067, "flos": 22161696024960.0, "grad_norm": 1.46528575213011, "language_loss": 0.77539623, "learning_rate": 9.720241961080849e-07, "loss": 0.79740703, "num_input_tokens_seen": 121952025, "step": 5669, "time_per_iteration": 2.7123289108276367 }, { "auxiliary_loss_clip": 0.01167537, "auxiliary_loss_mlp": 0.01027321, "balance_loss_clip": 1.04638195, "balance_loss_mlp": 1.02013886, "epoch": 0.6817771899236458, "flos": 41463501137280.0, "grad_norm": 1.890333123041085, "language_loss": 0.73359251, "learning_rate": 9.713560742168259e-07, "loss": 0.75554115, "num_input_tokens_seen": 121974650, "step": 5670, "time_per_iteration": 2.766113519668579 }, { "auxiliary_loss_clip": 0.01163206, "auxiliary_loss_mlp": 0.01026144, "balance_loss_clip": 0.93290311, "balance_loss_mlp": 1.01850319, "epoch": 0.6818974328142848, "flos": 21106138026240.0, "grad_norm": 3.090200910966046, "language_loss": 0.71581727, "learning_rate": 9.706881083700333e-07, "loss": 0.73771071, "num_input_tokens_seen": 121994335, "step": 5671, "time_per_iteration": 2.701481819152832 }, { "auxiliary_loss_clip": 0.01164168, "auxiliary_loss_mlp": 0.01024337, "balance_loss_clip": 0.86091858, "balance_loss_mlp": 1.01722062, "epoch": 0.682017675704924, "flos": 20441897769600.0, "grad_norm": 1.925268549922729, "language_loss": 0.82768464, "learning_rate": 9.700202986690357e-07, "loss": 0.84956968, "num_input_tokens_seen": 122012635, "step": 5672, "time_per_iteration": 2.7278621196746826 }, { "auxiliary_loss_clip": 0.01164693, "auxiliary_loss_mlp": 0.01122798, "balance_loss_clip": 1.00862098, "balance_loss_mlp": 0.0, "epoch": 0.682137918595563, "flos": 20044438801920.0, "grad_norm": 2.12747691017428, "language_loss": 0.66559947, "learning_rate": 9.693526452151413e-07, "loss": 0.68847442, "num_input_tokens_seen": 122031685, "step": 5673, "time_per_iteration": 2.6452839374542236 }, { "auxiliary_loss_clip": 0.01171923, "auxiliary_loss_mlp": 0.01027946, "balance_loss_clip": 0.93374604, "balance_loss_mlp": 1.02021551, "epoch": 0.6822581614862021, "flos": 31684559063040.0, "grad_norm": 1.7534520254675976, "language_loss": 0.75541687, "learning_rate": 9.686851481096305e-07, "loss": 0.77741551, "num_input_tokens_seen": 122052995, "step": 5674, "time_per_iteration": 2.7362258434295654 }, { "auxiliary_loss_clip": 0.01164323, "auxiliary_loss_mlp": 0.01024756, "balance_loss_clip": 0.8574183, "balance_loss_mlp": 1.01718044, "epoch": 0.6823784043768413, "flos": 23477570864640.0, "grad_norm": 1.9319822361127235, "language_loss": 0.71868598, "learning_rate": 9.68017807453762e-07, "loss": 0.74057674, "num_input_tokens_seen": 122071740, "step": 5675, "time_per_iteration": 2.7819225788116455 }, { "auxiliary_loss_clip": 0.01171943, "auxiliary_loss_mlp": 0.0112228, "balance_loss_clip": 0.97498691, "balance_loss_mlp": 0.0, "epoch": 0.6824986472674803, "flos": 14137134024960.0, "grad_norm": 1.761421148854184, "language_loss": 0.73617792, "learning_rate": 9.673506233487721e-07, "loss": 0.75912011, "num_input_tokens_seen": 122089705, "step": 5676, "time_per_iteration": 2.6875076293945312 }, { "auxiliary_loss_clip": 0.01167317, "auxiliary_loss_mlp": 0.01121702, "balance_loss_clip": 0.97114223, "balance_loss_mlp": 0.0, "epoch": 0.6826188901581194, "flos": 21504997624320.0, "grad_norm": 1.7386191609173802, "language_loss": 0.86436558, "learning_rate": 9.666835958958717e-07, "loss": 0.88725573, "num_input_tokens_seen": 122109025, "step": 5677, "time_per_iteration": 2.7231693267822266 }, { "auxiliary_loss_clip": 0.01169503, "auxiliary_loss_mlp": 0.01032844, "balance_loss_clip": 1.0494523, "balance_loss_mlp": 1.02560186, "epoch": 0.6827391330487584, "flos": 20810126044800.0, "grad_norm": 2.2375422019648363, "language_loss": 0.80207342, "learning_rate": 9.660167251962484e-07, "loss": 0.82409692, "num_input_tokens_seen": 122127385, "step": 5678, "time_per_iteration": 2.655046224594116 }, { "auxiliary_loss_clip": 0.01167597, "auxiliary_loss_mlp": 0.01029322, "balance_loss_clip": 0.93200755, "balance_loss_mlp": 1.02249479, "epoch": 0.6828593759393976, "flos": 21688788539520.0, "grad_norm": 1.6108298463616646, "language_loss": 0.77431166, "learning_rate": 9.653500113510654e-07, "loss": 0.79628086, "num_input_tokens_seen": 122146500, "step": 5679, "time_per_iteration": 2.690657377243042 }, { "auxiliary_loss_clip": 0.01160108, "auxiliary_loss_mlp": 0.0101962, "balance_loss_clip": 0.96938497, "balance_loss_mlp": 1.01229215, "epoch": 0.6829796188300367, "flos": 25337707557120.0, "grad_norm": 2.328794375436072, "language_loss": 0.66859907, "learning_rate": 9.646834544614627e-07, "loss": 0.69039631, "num_input_tokens_seen": 122167000, "step": 5680, "time_per_iteration": 2.72124981880188 }, { "auxiliary_loss_clip": 0.01160618, "auxiliary_loss_mlp": 0.01023914, "balance_loss_clip": 0.97307074, "balance_loss_mlp": 1.01664186, "epoch": 0.6830998617206757, "flos": 20704800389760.0, "grad_norm": 2.079206698687657, "language_loss": 0.7641291, "learning_rate": 9.64017054628558e-07, "loss": 0.78597438, "num_input_tokens_seen": 122185825, "step": 5681, "time_per_iteration": 2.684319496154785 }, { "auxiliary_loss_clip": 0.01165788, "auxiliary_loss_mlp": 0.01022622, "balance_loss_clip": 0.89320838, "balance_loss_mlp": 1.01559794, "epoch": 0.6832201046113149, "flos": 21726638496000.0, "grad_norm": 6.114959794603954, "language_loss": 0.78827691, "learning_rate": 9.63350811953441e-07, "loss": 0.81016105, "num_input_tokens_seen": 122206200, "step": 5682, "time_per_iteration": 2.796278953552246 }, { "auxiliary_loss_clip": 0.01166239, "auxiliary_loss_mlp": 0.01024794, "balance_loss_clip": 0.93358994, "balance_loss_mlp": 1.01747417, "epoch": 0.6833403475019539, "flos": 19536554448000.0, "grad_norm": 1.9717724440883153, "language_loss": 0.70304692, "learning_rate": 9.626847265371826e-07, "loss": 0.72495723, "num_input_tokens_seen": 122225520, "step": 5683, "time_per_iteration": 3.7383270263671875 }, { "auxiliary_loss_clip": 0.01158753, "auxiliary_loss_mlp": 0.0102457, "balance_loss_clip": 0.9702211, "balance_loss_mlp": 1.01765013, "epoch": 0.683460590392593, "flos": 19352153001600.0, "grad_norm": 12.005205583506507, "language_loss": 0.78907949, "learning_rate": 9.620187984808262e-07, "loss": 0.81091273, "num_input_tokens_seen": 122244320, "step": 5684, "time_per_iteration": 2.668665647506714 }, { "auxiliary_loss_clip": 0.01170982, "auxiliary_loss_mlp": 0.01122566, "balance_loss_clip": 0.97444516, "balance_loss_mlp": 0.0, "epoch": 0.6835808332832322, "flos": 23288500650240.0, "grad_norm": 1.8704651115664788, "language_loss": 0.8578155, "learning_rate": 9.613530278853919e-07, "loss": 0.88075101, "num_input_tokens_seen": 122264295, "step": 5685, "time_per_iteration": 2.7378733158111572 }, { "auxiliary_loss_clip": 0.01170832, "auxiliary_loss_mlp": 0.01023226, "balance_loss_clip": 1.01185656, "balance_loss_mlp": 1.01565671, "epoch": 0.6837010761738712, "flos": 21653416621440.0, "grad_norm": 26.21147181818955, "language_loss": 0.74025989, "learning_rate": 9.60687414851879e-07, "loss": 0.76220047, "num_input_tokens_seen": 122285300, "step": 5686, "time_per_iteration": 2.625521421432495 }, { "auxiliary_loss_clip": 0.01172513, "auxiliary_loss_mlp": 0.01024304, "balance_loss_clip": 0.97480989, "balance_loss_mlp": 1.01665354, "epoch": 0.6838213190645103, "flos": 17566387418880.0, "grad_norm": 2.2432732975157976, "language_loss": 0.76657844, "learning_rate": 9.600219594812575e-07, "loss": 0.78854656, "num_input_tokens_seen": 122303240, "step": 5687, "time_per_iteration": 2.677467107772827 }, { "auxiliary_loss_clip": 0.0116893, "auxiliary_loss_mlp": 0.01028547, "balance_loss_clip": 1.04943967, "balance_loss_mlp": 1.02135551, "epoch": 0.6839415619551494, "flos": 23112538899840.0, "grad_norm": 1.5633382636058244, "language_loss": 0.72672129, "learning_rate": 9.593566618744786e-07, "loss": 0.74869609, "num_input_tokens_seen": 122323390, "step": 5688, "time_per_iteration": 3.4619925022125244 }, { "auxiliary_loss_clip": 0.01168691, "auxiliary_loss_mlp": 0.01023243, "balance_loss_clip": 1.04875338, "balance_loss_mlp": 1.01619148, "epoch": 0.6840618048457885, "flos": 22127868391680.0, "grad_norm": 1.7870486009265363, "language_loss": 0.73804379, "learning_rate": 9.58691522132466e-07, "loss": 0.7599631, "num_input_tokens_seen": 122342200, "step": 5689, "time_per_iteration": 3.5681838989257812 }, { "auxiliary_loss_clip": 0.01173893, "auxiliary_loss_mlp": 0.01021776, "balance_loss_clip": 0.97550303, "balance_loss_mlp": 1.01409924, "epoch": 0.6841820477364275, "flos": 22015898720640.0, "grad_norm": 2.0277274732511215, "language_loss": 0.84717095, "learning_rate": 9.58026540356123e-07, "loss": 0.86912763, "num_input_tokens_seen": 122360465, "step": 5690, "time_per_iteration": 2.6742212772369385 }, { "auxiliary_loss_clip": 0.01169144, "auxiliary_loss_mlp": 0.01027585, "balance_loss_clip": 1.00976682, "balance_loss_mlp": 1.02023327, "epoch": 0.6843022906270667, "flos": 24900531125760.0, "grad_norm": 1.6293353318030808, "language_loss": 0.86733997, "learning_rate": 9.573617166463246e-07, "loss": 0.88930726, "num_input_tokens_seen": 122381680, "step": 5691, "time_per_iteration": 3.7582955360412598 }, { "auxiliary_loss_clip": 0.0116828, "auxiliary_loss_mlp": 0.01027139, "balance_loss_clip": 0.97071612, "balance_loss_mlp": 1.0196588, "epoch": 0.6844225335177058, "flos": 19969924037760.0, "grad_norm": 1.765440173020377, "language_loss": 0.60055315, "learning_rate": 9.56697051103924e-07, "loss": 0.62250733, "num_input_tokens_seen": 122399120, "step": 5692, "time_per_iteration": 2.643012523651123 }, { "auxiliary_loss_clip": 0.01164814, "auxiliary_loss_mlp": 0.0103247, "balance_loss_clip": 0.97114694, "balance_loss_mlp": 1.02577937, "epoch": 0.6845427764083448, "flos": 25883334126720.0, "grad_norm": 1.8427055331437334, "language_loss": 0.80780947, "learning_rate": 9.560325438297522e-07, "loss": 0.82978237, "num_input_tokens_seen": 122417430, "step": 5693, "time_per_iteration": 2.683169364929199 }, { "auxiliary_loss_clip": 0.01171115, "auxiliary_loss_mlp": 0.01028829, "balance_loss_clip": 0.97569716, "balance_loss_mlp": 1.02167034, "epoch": 0.684663019298984, "flos": 18880143356160.0, "grad_norm": 1.8034823091787358, "language_loss": 0.86817098, "learning_rate": 9.553681949246127e-07, "loss": 0.8901704, "num_input_tokens_seen": 122435055, "step": 5694, "time_per_iteration": 2.697247266769409 }, { "auxiliary_loss_clip": 0.01173425, "auxiliary_loss_mlp": 0.01023309, "balance_loss_clip": 0.93528545, "balance_loss_mlp": 1.01597142, "epoch": 0.684783262189623, "flos": 54193725302400.0, "grad_norm": 1.9149572688514118, "language_loss": 0.75431573, "learning_rate": 9.547040044892886e-07, "loss": 0.77628309, "num_input_tokens_seen": 122462570, "step": 5695, "time_per_iteration": 3.005363702774048 }, { "auxiliary_loss_clip": 0.01068765, "auxiliary_loss_mlp": 0.01002346, "balance_loss_clip": 0.97404838, "balance_loss_mlp": 1.00079656, "epoch": 0.6849035050802621, "flos": 63970264143360.0, "grad_norm": 0.8936855233646073, "language_loss": 0.60206068, "learning_rate": 9.540399726245354e-07, "loss": 0.62277174, "num_input_tokens_seen": 122519275, "step": 5696, "time_per_iteration": 3.0834691524505615 }, { "auxiliary_loss_clip": 0.01161852, "auxiliary_loss_mlp": 0.01029672, "balance_loss_clip": 0.97013462, "balance_loss_mlp": 1.022156, "epoch": 0.6850237479709013, "flos": 25224121774080.0, "grad_norm": 5.736248161745113, "language_loss": 0.68939614, "learning_rate": 9.533760994310859e-07, "loss": 0.71131134, "num_input_tokens_seen": 122539675, "step": 5697, "time_per_iteration": 2.747832775115967 }, { "auxiliary_loss_clip": 0.01172655, "auxiliary_loss_mlp": 0.01024903, "balance_loss_clip": 1.04982018, "balance_loss_mlp": 1.01721954, "epoch": 0.6851439908615403, "flos": 19354128249600.0, "grad_norm": 1.863545371536794, "language_loss": 0.75057042, "learning_rate": 9.527123850096508e-07, "loss": 0.77254593, "num_input_tokens_seen": 122558035, "step": 5698, "time_per_iteration": 2.585859537124634 }, { "auxiliary_loss_clip": 0.01171505, "auxiliary_loss_mlp": 0.01023792, "balance_loss_clip": 1.00936556, "balance_loss_mlp": 1.01662505, "epoch": 0.6852642337521794, "flos": 23182133500800.0, "grad_norm": 1.8027704031056198, "language_loss": 0.72109282, "learning_rate": 9.520488294609142e-07, "loss": 0.74304581, "num_input_tokens_seen": 122576815, "step": 5699, "time_per_iteration": 2.71332049369812 }, { "auxiliary_loss_clip": 0.01071827, "auxiliary_loss_mlp": 0.01000683, "balance_loss_clip": 0.86212826, "balance_loss_mlp": 0.99905014, "epoch": 0.6853844766428185, "flos": 62647206583680.0, "grad_norm": 0.8722156875458177, "language_loss": 0.5387423, "learning_rate": 9.513854328855368e-07, "loss": 0.55946738, "num_input_tokens_seen": 122634690, "step": 5700, "time_per_iteration": 3.22176194190979 }, { "auxiliary_loss_clip": 0.01166081, "auxiliary_loss_mlp": 0.01023732, "balance_loss_clip": 1.0475167, "balance_loss_mlp": 1.01666248, "epoch": 0.6855047195334576, "flos": 23437242869760.0, "grad_norm": 1.7902930048958947, "language_loss": 0.81092465, "learning_rate": 9.507221953841558e-07, "loss": 0.83282274, "num_input_tokens_seen": 122652320, "step": 5701, "time_per_iteration": 2.9334218502044678 }, { "auxiliary_loss_clip": 0.01170734, "auxiliary_loss_mlp": 0.0102597, "balance_loss_clip": 1.01264882, "balance_loss_mlp": 1.01853752, "epoch": 0.6856249624240967, "flos": 20664831530880.0, "grad_norm": 1.5288632838361256, "language_loss": 0.77975583, "learning_rate": 9.500591170573824e-07, "loss": 0.80172288, "num_input_tokens_seen": 122672340, "step": 5702, "time_per_iteration": 2.6849265098571777 }, { "auxiliary_loss_clip": 0.01159264, "auxiliary_loss_mlp": 0.01024637, "balance_loss_clip": 0.89495695, "balance_loss_mlp": 1.01728487, "epoch": 0.6857452053147358, "flos": 17087302794240.0, "grad_norm": 1.9749492682988228, "language_loss": 0.74300039, "learning_rate": 9.493961980058078e-07, "loss": 0.76483941, "num_input_tokens_seen": 122689935, "step": 5703, "time_per_iteration": 2.7145891189575195 }, { "auxiliary_loss_clip": 0.01146129, "auxiliary_loss_mlp": 0.0102185, "balance_loss_clip": 0.85401022, "balance_loss_mlp": 1.01496005, "epoch": 0.6858654482053749, "flos": 30847266057600.0, "grad_norm": 1.8595927569596427, "language_loss": 0.6777401, "learning_rate": 9.48733438329993e-07, "loss": 0.69941998, "num_input_tokens_seen": 122710200, "step": 5704, "time_per_iteration": 2.8325469493865967 }, { "auxiliary_loss_clip": 0.01170573, "auxiliary_loss_mlp": 0.0112243, "balance_loss_clip": 1.05184472, "balance_loss_mlp": 0.0, "epoch": 0.6859856910960139, "flos": 28877314510080.0, "grad_norm": 1.5793574244879, "language_loss": 0.74397683, "learning_rate": 9.480708381304807e-07, "loss": 0.76690686, "num_input_tokens_seen": 122731495, "step": 5705, "time_per_iteration": 2.6157374382019043 }, { "auxiliary_loss_clip": 0.01165101, "auxiliary_loss_mlp": 0.01028902, "balance_loss_clip": 0.90043849, "balance_loss_mlp": 1.02093923, "epoch": 0.6861059339866531, "flos": 19354523299200.0, "grad_norm": 2.22656331979498, "language_loss": 0.83387029, "learning_rate": 9.474083975077858e-07, "loss": 0.85581034, "num_input_tokens_seen": 122748620, "step": 5706, "time_per_iteration": 2.7490711212158203 }, { "auxiliary_loss_clip": 0.01159853, "auxiliary_loss_mlp": 0.01028176, "balance_loss_clip": 1.00816798, "balance_loss_mlp": 1.02068412, "epoch": 0.6862261768772921, "flos": 22199976944640.0, "grad_norm": 2.24392780102091, "language_loss": 0.80395216, "learning_rate": 9.467461165623994e-07, "loss": 0.82583249, "num_input_tokens_seen": 122767670, "step": 5707, "time_per_iteration": 2.694457530975342 }, { "auxiliary_loss_clip": 0.01169302, "auxiliary_loss_mlp": 0.01025922, "balance_loss_clip": 1.00917244, "balance_loss_mlp": 1.01892447, "epoch": 0.6863464197679312, "flos": 26285677344000.0, "grad_norm": 2.005451332282776, "language_loss": 0.79433179, "learning_rate": 9.46083995394791e-07, "loss": 0.816284, "num_input_tokens_seen": 122785480, "step": 5708, "time_per_iteration": 2.676502227783203 }, { "auxiliary_loss_clip": 0.01167221, "auxiliary_loss_mlp": 0.01122364, "balance_loss_clip": 1.00970268, "balance_loss_mlp": 0.0, "epoch": 0.6864666626585703, "flos": 37815228564480.0, "grad_norm": 2.0501751050764097, "language_loss": 0.63876891, "learning_rate": 9.454220341054012e-07, "loss": 0.66166472, "num_input_tokens_seen": 122810265, "step": 5709, "time_per_iteration": 2.8017098903656006 }, { "auxiliary_loss_clip": 0.01165165, "auxiliary_loss_mlp": 0.01024916, "balance_loss_clip": 0.93454814, "balance_loss_mlp": 1.01723349, "epoch": 0.6865869055492094, "flos": 19391152193280.0, "grad_norm": 2.0113066319937722, "language_loss": 0.81009305, "learning_rate": 9.447602327946512e-07, "loss": 0.83199382, "num_input_tokens_seen": 122828905, "step": 5710, "time_per_iteration": 3.695596218109131 }, { "auxiliary_loss_clip": 0.0116127, "auxiliary_loss_mlp": 0.0102251, "balance_loss_clip": 0.96843386, "balance_loss_mlp": 1.01502991, "epoch": 0.6867071484398485, "flos": 20375966355840.0, "grad_norm": 1.804858232154143, "language_loss": 0.761724, "learning_rate": 9.440985915629338e-07, "loss": 0.78356177, "num_input_tokens_seen": 122846235, "step": 5711, "time_per_iteration": 2.7079105377197266 }, { "auxiliary_loss_clip": 0.01169318, "auxiliary_loss_mlp": 0.0102607, "balance_loss_clip": 1.0508213, "balance_loss_mlp": 1.01930451, "epoch": 0.6868273913304875, "flos": 15889143801600.0, "grad_norm": 1.8859810350697823, "language_loss": 0.72783035, "learning_rate": 9.434371105106223e-07, "loss": 0.74978417, "num_input_tokens_seen": 122863835, "step": 5712, "time_per_iteration": 2.606036424636841 }, { "auxiliary_loss_clip": 0.01161952, "auxiliary_loss_mlp": 0.01025208, "balance_loss_clip": 0.93129969, "balance_loss_mlp": 1.01784134, "epoch": 0.6869476342211267, "flos": 24462492768000.0, "grad_norm": 1.814467482135841, "language_loss": 0.70588779, "learning_rate": 9.427757897380602e-07, "loss": 0.72775942, "num_input_tokens_seen": 122883235, "step": 5713, "time_per_iteration": 2.7422268390655518 }, { "auxiliary_loss_clip": 0.01163963, "auxiliary_loss_mlp": 0.01023466, "balance_loss_clip": 0.93556166, "balance_loss_mlp": 1.01634288, "epoch": 0.6870678771117658, "flos": 18442571875200.0, "grad_norm": 2.110151686011257, "language_loss": 0.85399419, "learning_rate": 9.421146293455695e-07, "loss": 0.87586844, "num_input_tokens_seen": 122898975, "step": 5714, "time_per_iteration": 3.5479886531829834 }, { "auxiliary_loss_clip": 0.01162039, "auxiliary_loss_mlp": 0.01026845, "balance_loss_clip": 0.97016191, "balance_loss_mlp": 1.01932633, "epoch": 0.6871881200024048, "flos": 22200371994240.0, "grad_norm": 1.733217632454185, "language_loss": 0.68206221, "learning_rate": 9.414536294334489e-07, "loss": 0.703951, "num_input_tokens_seen": 122918995, "step": 5715, "time_per_iteration": 3.67118501663208 }, { "auxiliary_loss_clip": 0.01164909, "auxiliary_loss_mlp": 0.01022834, "balance_loss_clip": 0.96739846, "balance_loss_mlp": 1.01558018, "epoch": 0.687308362893044, "flos": 22127724737280.0, "grad_norm": 1.8048116703514037, "language_loss": 0.69536269, "learning_rate": 9.407927901019708e-07, "loss": 0.71724021, "num_input_tokens_seen": 122938125, "step": 5716, "time_per_iteration": 2.700701951980591 }, { "auxiliary_loss_clip": 0.01167151, "auxiliary_loss_mlp": 0.01020048, "balance_loss_clip": 1.00865102, "balance_loss_mlp": 1.0128572, "epoch": 0.687428605783683, "flos": 25040546340480.0, "grad_norm": 2.0009251053413397, "language_loss": 0.76992285, "learning_rate": 9.401321114513854e-07, "loss": 0.79179484, "num_input_tokens_seen": 122957020, "step": 5717, "time_per_iteration": 3.588959217071533 }, { "auxiliary_loss_clip": 0.01172111, "auxiliary_loss_mlp": 0.01024176, "balance_loss_clip": 1.05124831, "balance_loss_mlp": 1.01661217, "epoch": 0.6875488486743221, "flos": 23770063313280.0, "grad_norm": 1.4627062214653765, "language_loss": 0.75201952, "learning_rate": 9.394715935819155e-07, "loss": 0.77398235, "num_input_tokens_seen": 122977410, "step": 5718, "time_per_iteration": 2.699378490447998 }, { "auxiliary_loss_clip": 0.01169876, "auxiliary_loss_mlp": 0.01026415, "balance_loss_clip": 1.00855184, "balance_loss_mlp": 1.01905668, "epoch": 0.6876690915649613, "flos": 25516937445120.0, "grad_norm": 1.9513912575807437, "language_loss": 0.62624812, "learning_rate": 9.388112365937608e-07, "loss": 0.64821106, "num_input_tokens_seen": 122996875, "step": 5719, "time_per_iteration": 2.6811249256134033 }, { "auxiliary_loss_clip": 0.01164912, "auxiliary_loss_mlp": 0.0102889, "balance_loss_clip": 0.93412459, "balance_loss_mlp": 1.02113581, "epoch": 0.6877893344556003, "flos": 19427996568960.0, "grad_norm": 2.049727221985915, "language_loss": 0.82609189, "learning_rate": 9.381510405870985e-07, "loss": 0.84802991, "num_input_tokens_seen": 123015890, "step": 5720, "time_per_iteration": 2.688426971435547 }, { "auxiliary_loss_clip": 0.0116528, "auxiliary_loss_mlp": 0.01028093, "balance_loss_clip": 1.0097717, "balance_loss_mlp": 1.02064788, "epoch": 0.6879095773462394, "flos": 18661303745280.0, "grad_norm": 2.0377221403983263, "language_loss": 0.77630901, "learning_rate": 9.374910056620791e-07, "loss": 0.79824275, "num_input_tokens_seen": 123034955, "step": 5721, "time_per_iteration": 2.654358148574829 }, { "auxiliary_loss_clip": 0.0117033, "auxiliary_loss_mlp": 0.01025856, "balance_loss_clip": 1.01187444, "balance_loss_mlp": 1.01845336, "epoch": 0.6880298202368785, "flos": 20883132437760.0, "grad_norm": 1.732093918846895, "language_loss": 0.8101939, "learning_rate": 9.368311319188293e-07, "loss": 0.8321557, "num_input_tokens_seen": 123052770, "step": 5722, "time_per_iteration": 2.6006462574005127 }, { "auxiliary_loss_clip": 0.01162735, "auxiliary_loss_mlp": 0.010241, "balance_loss_clip": 0.93183291, "balance_loss_mlp": 1.01708746, "epoch": 0.6881500631275176, "flos": 30153292318080.0, "grad_norm": 1.7519494432369214, "language_loss": 0.78907704, "learning_rate": 9.361714194574515e-07, "loss": 0.81094539, "num_input_tokens_seen": 123075105, "step": 5723, "time_per_iteration": 2.763079881668091 }, { "auxiliary_loss_clip": 0.01064952, "auxiliary_loss_mlp": 0.01002247, "balance_loss_clip": 1.01174664, "balance_loss_mlp": 1.00062561, "epoch": 0.6882703060181566, "flos": 66181537215360.0, "grad_norm": 0.7554197944433955, "language_loss": 0.58339846, "learning_rate": 9.355118683780228e-07, "loss": 0.60407043, "num_input_tokens_seen": 123145175, "step": 5724, "time_per_iteration": 3.2642672061920166 }, { "auxiliary_loss_clip": 0.01167868, "auxiliary_loss_mlp": 0.01027761, "balance_loss_clip": 1.0472033, "balance_loss_mlp": 1.02042401, "epoch": 0.6883905489087958, "flos": 18214646123520.0, "grad_norm": 2.0751797869437483, "language_loss": 0.78922772, "learning_rate": 9.348524787805987e-07, "loss": 0.81118399, "num_input_tokens_seen": 123160365, "step": 5725, "time_per_iteration": 2.5308218002319336 }, { "auxiliary_loss_clip": 0.01166397, "auxiliary_loss_mlp": 0.01023703, "balance_loss_clip": 0.93204826, "balance_loss_mlp": 1.01661587, "epoch": 0.6885107917994349, "flos": 14056262553600.0, "grad_norm": 2.916334987349192, "language_loss": 0.85487068, "learning_rate": 9.341932507652053e-07, "loss": 0.87677169, "num_input_tokens_seen": 123174855, "step": 5726, "time_per_iteration": 2.652102470397949 }, { "auxiliary_loss_clip": 0.01160635, "auxiliary_loss_mlp": 0.01029762, "balance_loss_clip": 0.96733153, "balance_loss_mlp": 1.02218676, "epoch": 0.6886310346900739, "flos": 28690722334080.0, "grad_norm": 1.8341740923302696, "language_loss": 0.78577006, "learning_rate": 9.335341844318489e-07, "loss": 0.80767399, "num_input_tokens_seen": 123194995, "step": 5727, "time_per_iteration": 2.6979880332946777 }, { "auxiliary_loss_clip": 0.01165441, "auxiliary_loss_mlp": 0.01028874, "balance_loss_clip": 0.97133374, "balance_loss_mlp": 1.02178144, "epoch": 0.6887512775807131, "flos": 24535319592960.0, "grad_norm": 1.7900098885975293, "language_loss": 0.73300624, "learning_rate": 9.328752798805091e-07, "loss": 0.75494945, "num_input_tokens_seen": 123213465, "step": 5728, "time_per_iteration": 2.678314685821533 }, { "auxiliary_loss_clip": 0.01167035, "auxiliary_loss_mlp": 0.01028087, "balance_loss_clip": 1.00987482, "balance_loss_mlp": 1.02074993, "epoch": 0.6888715204713521, "flos": 22414363269120.0, "grad_norm": 2.1298943967505783, "language_loss": 0.7585398, "learning_rate": 9.322165372111399e-07, "loss": 0.78049099, "num_input_tokens_seen": 123231610, "step": 5729, "time_per_iteration": 2.688575506210327 }, { "auxiliary_loss_clip": 0.01164742, "auxiliary_loss_mlp": 0.01029221, "balance_loss_clip": 0.93577504, "balance_loss_mlp": 1.02211618, "epoch": 0.6889917633619912, "flos": 22054323294720.0, "grad_norm": 2.013938665170355, "language_loss": 0.7567144, "learning_rate": 9.315579565236747e-07, "loss": 0.77865404, "num_input_tokens_seen": 123250715, "step": 5730, "time_per_iteration": 2.68019437789917 }, { "auxiliary_loss_clip": 0.01165838, "auxiliary_loss_mlp": 0.01029603, "balance_loss_clip": 0.97482693, "balance_loss_mlp": 1.02178836, "epoch": 0.6891120062526304, "flos": 23949724164480.0, "grad_norm": 1.901523947995716, "language_loss": 0.74131054, "learning_rate": 9.308995379180162e-07, "loss": 0.76326495, "num_input_tokens_seen": 123270270, "step": 5731, "time_per_iteration": 2.6990363597869873 }, { "auxiliary_loss_clip": 0.01068144, "auxiliary_loss_mlp": 0.01000631, "balance_loss_clip": 0.97374415, "balance_loss_mlp": 0.99895042, "epoch": 0.6892322491432694, "flos": 64117354337280.0, "grad_norm": 0.7471715401008999, "language_loss": 0.59536934, "learning_rate": 9.302412814940488e-07, "loss": 0.61605716, "num_input_tokens_seen": 123333045, "step": 5732, "time_per_iteration": 3.2300100326538086 }, { "auxiliary_loss_clip": 0.01161155, "auxiliary_loss_mlp": 0.01029799, "balance_loss_clip": 0.96812975, "balance_loss_mlp": 1.02242005, "epoch": 0.6893524920339085, "flos": 23002436736000.0, "grad_norm": 2.0320157216357018, "language_loss": 0.70947063, "learning_rate": 9.295831873516276e-07, "loss": 0.73138022, "num_input_tokens_seen": 123352320, "step": 5733, "time_per_iteration": 2.6501693725585938 }, { "auxiliary_loss_clip": 0.01169512, "auxiliary_loss_mlp": 0.0102774, "balance_loss_clip": 1.05043125, "balance_loss_mlp": 1.0202837, "epoch": 0.6894727349245476, "flos": 21396260177280.0, "grad_norm": 1.545518236840743, "language_loss": 0.75956267, "learning_rate": 9.289252555905873e-07, "loss": 0.78153527, "num_input_tokens_seen": 123372400, "step": 5734, "time_per_iteration": 2.647714376449585 }, { "auxiliary_loss_clip": 0.01169755, "auxiliary_loss_mlp": 0.01029432, "balance_loss_clip": 1.01381874, "balance_loss_mlp": 1.02174044, "epoch": 0.6895929778151867, "flos": 19865316654720.0, "grad_norm": 2.1506242850049837, "language_loss": 0.76122266, "learning_rate": 9.282674863107334e-07, "loss": 0.78321457, "num_input_tokens_seen": 123390215, "step": 5735, "time_per_iteration": 2.659026861190796 }, { "auxiliary_loss_clip": 0.0116683, "auxiliary_loss_mlp": 0.01024703, "balance_loss_clip": 1.01036382, "balance_loss_mlp": 1.01770568, "epoch": 0.6897132207058257, "flos": 18179166464640.0, "grad_norm": 2.2654741074960305, "language_loss": 0.75715971, "learning_rate": 9.276098796118488e-07, "loss": 0.77907515, "num_input_tokens_seen": 123406700, "step": 5736, "time_per_iteration": 3.6192262172698975 }, { "auxiliary_loss_clip": 0.01167774, "auxiliary_loss_mlp": 0.01022519, "balance_loss_clip": 0.97241354, "balance_loss_mlp": 1.01560521, "epoch": 0.6898334635964649, "flos": 32561641359360.0, "grad_norm": 1.8255600878364793, "language_loss": 0.66227281, "learning_rate": 9.269524355936938e-07, "loss": 0.68417573, "num_input_tokens_seen": 123429880, "step": 5737, "time_per_iteration": 2.8254430294036865 }, { "auxiliary_loss_clip": 0.01158241, "auxiliary_loss_mlp": 0.01026011, "balance_loss_clip": 0.9690659, "balance_loss_mlp": 1.01898921, "epoch": 0.689953706487104, "flos": 22819004956800.0, "grad_norm": 1.5973954250596196, "language_loss": 0.8471874, "learning_rate": 9.262951543560002e-07, "loss": 0.86902994, "num_input_tokens_seen": 123449105, "step": 5738, "time_per_iteration": 2.6871860027313232 }, { "auxiliary_loss_clip": 0.011693, "auxiliary_loss_mlp": 0.01035168, "balance_loss_clip": 0.97469413, "balance_loss_mlp": 1.02722859, "epoch": 0.690073949377743, "flos": 18515362786560.0, "grad_norm": 2.2518391862965537, "language_loss": 0.86422455, "learning_rate": 9.256380359984795e-07, "loss": 0.88626921, "num_input_tokens_seen": 123466215, "step": 5739, "time_per_iteration": 2.622488498687744 }, { "auxiliary_loss_clip": 0.01168534, "auxiliary_loss_mlp": 0.01031239, "balance_loss_clip": 0.89160669, "balance_loss_mlp": 1.02390778, "epoch": 0.6901941922683821, "flos": 34857194716800.0, "grad_norm": 2.21405812876121, "language_loss": 0.74764299, "learning_rate": 9.249810806208139e-07, "loss": 0.76964074, "num_input_tokens_seen": 123485480, "step": 5740, "time_per_iteration": 2.8017852306365967 }, { "auxiliary_loss_clip": 0.01158456, "auxiliary_loss_mlp": 0.01122244, "balance_loss_clip": 0.89033401, "balance_loss_mlp": 0.0, "epoch": 0.6903144351590212, "flos": 16253672976000.0, "grad_norm": 2.1567563113353185, "language_loss": 0.80178624, "learning_rate": 9.243242883226627e-07, "loss": 0.82459331, "num_input_tokens_seen": 123504575, "step": 5741, "time_per_iteration": 4.518073081970215 }, { "auxiliary_loss_clip": 0.01170131, "auxiliary_loss_mlp": 0.01026745, "balance_loss_clip": 1.00800061, "balance_loss_mlp": 1.01931238, "epoch": 0.6904346780496603, "flos": 28035137255040.0, "grad_norm": 1.6950261027578366, "language_loss": 0.69893676, "learning_rate": 9.236676592036628e-07, "loss": 0.72090554, "num_input_tokens_seen": 123524250, "step": 5742, "time_per_iteration": 2.706909656524658 }, { "auxiliary_loss_clip": 0.01164274, "auxiliary_loss_mlp": 0.01028178, "balance_loss_clip": 0.97457379, "balance_loss_mlp": 1.02124333, "epoch": 0.6905549209402994, "flos": 23624266008960.0, "grad_norm": 1.9443066022455595, "language_loss": 0.73375028, "learning_rate": 9.230111933634228e-07, "loss": 0.75567484, "num_input_tokens_seen": 123545845, "step": 5743, "time_per_iteration": 3.650038480758667 }, { "auxiliary_loss_clip": 0.01171799, "auxiliary_loss_mlp": 0.01028291, "balance_loss_clip": 1.01206374, "balance_loss_mlp": 1.02100432, "epoch": 0.6906751638309385, "flos": 23114945111040.0, "grad_norm": 1.4348826826703034, "language_loss": 0.80726641, "learning_rate": 9.223548909015288e-07, "loss": 0.82926732, "num_input_tokens_seen": 123567535, "step": 5744, "time_per_iteration": 2.6579971313476562 }, { "auxiliary_loss_clip": 0.01153815, "auxiliary_loss_mlp": 0.01024099, "balance_loss_clip": 0.8929615, "balance_loss_mlp": 1.01717281, "epoch": 0.6907954067215776, "flos": 27305468375040.0, "grad_norm": 1.8038794941928031, "language_loss": 0.72437739, "learning_rate": 9.216987519175407e-07, "loss": 0.74615657, "num_input_tokens_seen": 123587710, "step": 5745, "time_per_iteration": 2.7397866249084473 }, { "auxiliary_loss_clip": 0.0116635, "auxiliary_loss_mlp": 0.01027021, "balance_loss_clip": 1.01236522, "balance_loss_mlp": 1.02002931, "epoch": 0.6909156496122166, "flos": 21689399070720.0, "grad_norm": 1.540358123768213, "language_loss": 0.68370956, "learning_rate": 9.210427765109942e-07, "loss": 0.7056433, "num_input_tokens_seen": 123607385, "step": 5746, "time_per_iteration": 2.6176626682281494 }, { "auxiliary_loss_clip": 0.01169972, "auxiliary_loss_mlp": 0.0102966, "balance_loss_clip": 0.97139692, "balance_loss_mlp": 1.02173257, "epoch": 0.6910358925028558, "flos": 22561453463040.0, "grad_norm": 1.8317359467019647, "language_loss": 0.81247914, "learning_rate": 9.20386964781402e-07, "loss": 0.83447552, "num_input_tokens_seen": 123625405, "step": 5747, "time_per_iteration": 2.622565269470215 }, { "auxiliary_loss_clip": 0.01161529, "auxiliary_loss_mlp": 0.01028779, "balance_loss_clip": 0.97003269, "balance_loss_mlp": 1.02182627, "epoch": 0.6911561353934949, "flos": 22054107813120.0, "grad_norm": 1.8822539619852994, "language_loss": 0.84411317, "learning_rate": 9.197313168282472e-07, "loss": 0.86601621, "num_input_tokens_seen": 123642850, "step": 5748, "time_per_iteration": 2.6796607971191406 }, { "auxiliary_loss_clip": 0.01159153, "auxiliary_loss_mlp": 0.01025221, "balance_loss_clip": 1.0063622, "balance_loss_mlp": 1.01786017, "epoch": 0.6912763782841339, "flos": 24206557386240.0, "grad_norm": 1.9148862507791835, "language_loss": 0.72425449, "learning_rate": 9.190758327509935e-07, "loss": 0.74609828, "num_input_tokens_seen": 123661595, "step": 5749, "time_per_iteration": 2.722623825073242 }, { "auxiliary_loss_clip": 0.01079216, "auxiliary_loss_mlp": 0.01115764, "balance_loss_clip": 0.86301875, "balance_loss_mlp": 0.0, "epoch": 0.6913966211747731, "flos": 52329641091840.0, "grad_norm": 0.935515931459557, "language_loss": 0.64517975, "learning_rate": 9.184205126490767e-07, "loss": 0.66712952, "num_input_tokens_seen": 123710490, "step": 5750, "time_per_iteration": 3.097550868988037 }, { "auxiliary_loss_clip": 0.01071957, "auxiliary_loss_mlp": 0.01115979, "balance_loss_clip": 0.90029734, "balance_loss_mlp": 0.0, "epoch": 0.6915168640654121, "flos": 66741274851840.0, "grad_norm": 1.085767660830572, "language_loss": 0.59686995, "learning_rate": 9.177653566219075e-07, "loss": 0.61874932, "num_input_tokens_seen": 123765215, "step": 5751, "time_per_iteration": 3.204941987991333 }, { "auxiliary_loss_clip": 0.01169979, "auxiliary_loss_mlp": 0.0102647, "balance_loss_clip": 0.93277866, "balance_loss_mlp": 1.01884067, "epoch": 0.6916371069560512, "flos": 18296523175680.0, "grad_norm": 2.115970853759081, "language_loss": 0.76202112, "learning_rate": 9.171103647688744e-07, "loss": 0.78398561, "num_input_tokens_seen": 123783955, "step": 5752, "time_per_iteration": 2.6718366146087646 }, { "auxiliary_loss_clip": 0.01152532, "auxiliary_loss_mlp": 0.01026106, "balance_loss_clip": 0.81634372, "balance_loss_mlp": 1.01968026, "epoch": 0.6917573498466904, "flos": 19645794685440.0, "grad_norm": 1.7476926373665855, "language_loss": 0.6925385, "learning_rate": 9.164555371893367e-07, "loss": 0.71432495, "num_input_tokens_seen": 123803885, "step": 5753, "time_per_iteration": 2.8082377910614014 }, { "auxiliary_loss_clip": 0.01167493, "auxiliary_loss_mlp": 0.01122138, "balance_loss_clip": 1.01102257, "balance_loss_mlp": 0.0, "epoch": 0.6918775927373294, "flos": 14210319985920.0, "grad_norm": 3.163929320701825, "language_loss": 0.75091851, "learning_rate": 9.158008739826333e-07, "loss": 0.7738148, "num_input_tokens_seen": 123821485, "step": 5754, "time_per_iteration": 2.6720197200775146 }, { "auxiliary_loss_clip": 0.01165385, "auxiliary_loss_mlp": 0.01022751, "balance_loss_clip": 0.97351193, "balance_loss_mlp": 1.01506829, "epoch": 0.6919978356279685, "flos": 23985455218560.0, "grad_norm": 1.65918527190369, "language_loss": 0.86577874, "learning_rate": 9.151463752480744e-07, "loss": 0.88766015, "num_input_tokens_seen": 123840215, "step": 5755, "time_per_iteration": 2.7206859588623047 }, { "auxiliary_loss_clip": 0.01156508, "auxiliary_loss_mlp": 0.01025076, "balance_loss_clip": 0.93168652, "balance_loss_mlp": 1.01793492, "epoch": 0.6921180785186076, "flos": 23622937205760.0, "grad_norm": 1.3455103394379122, "language_loss": 0.80104947, "learning_rate": 9.144920410849493e-07, "loss": 0.82286531, "num_input_tokens_seen": 123861450, "step": 5756, "time_per_iteration": 2.7872672080993652 }, { "auxiliary_loss_clip": 0.01169131, "auxiliary_loss_mlp": 0.0102323, "balance_loss_clip": 0.97125596, "balance_loss_mlp": 1.01586282, "epoch": 0.6922383214092467, "flos": 21142623265920.0, "grad_norm": 1.6683973118821034, "language_loss": 0.80448782, "learning_rate": 9.138378715925176e-07, "loss": 0.82641137, "num_input_tokens_seen": 123880545, "step": 5757, "time_per_iteration": 2.643049955368042 }, { "auxiliary_loss_clip": 0.01154858, "auxiliary_loss_mlp": 0.01028299, "balance_loss_clip": 0.96881723, "balance_loss_mlp": 1.02159631, "epoch": 0.6923585642998857, "flos": 21470667200640.0, "grad_norm": 1.6141949863631502, "language_loss": 0.81188482, "learning_rate": 9.131838668700167e-07, "loss": 0.83371639, "num_input_tokens_seen": 123900615, "step": 5758, "time_per_iteration": 2.7132556438446045 }, { "auxiliary_loss_clip": 0.01162715, "auxiliary_loss_mlp": 0.0102852, "balance_loss_clip": 0.93058866, "balance_loss_mlp": 1.02094436, "epoch": 0.6924788071905249, "flos": 21105204272640.0, "grad_norm": 1.781402946058838, "language_loss": 0.86670911, "learning_rate": 9.125300270166598e-07, "loss": 0.88862145, "num_input_tokens_seen": 123921220, "step": 5759, "time_per_iteration": 2.754638910293579 }, { "auxiliary_loss_clip": 0.01171114, "auxiliary_loss_mlp": 0.0102041, "balance_loss_clip": 0.93082356, "balance_loss_mlp": 1.01288223, "epoch": 0.692599050081164, "flos": 26250018117120.0, "grad_norm": 1.7003678910995266, "language_loss": 0.8575213, "learning_rate": 9.118763521316324e-07, "loss": 0.87943649, "num_input_tokens_seen": 123941795, "step": 5760, "time_per_iteration": 2.7330310344696045 }, { "auxiliary_loss_clip": 0.01169444, "auxiliary_loss_mlp": 0.01122406, "balance_loss_clip": 1.04794955, "balance_loss_mlp": 0.0, "epoch": 0.692719292971803, "flos": 20885215426560.0, "grad_norm": 1.6623229351723907, "language_loss": 0.76116157, "learning_rate": 9.112228423140987e-07, "loss": 0.78408003, "num_input_tokens_seen": 123960715, "step": 5761, "time_per_iteration": 2.628164529800415 }, { "auxiliary_loss_clip": 0.01171819, "auxiliary_loss_mlp": 0.0102798, "balance_loss_clip": 0.97316211, "balance_loss_mlp": 1.0203743, "epoch": 0.6928395358624422, "flos": 25921938268800.0, "grad_norm": 3.205451284228985, "language_loss": 0.85874933, "learning_rate": 9.105694976631932e-07, "loss": 0.88074732, "num_input_tokens_seen": 123978625, "step": 5762, "time_per_iteration": 3.603203535079956 }, { "auxiliary_loss_clip": 0.01169553, "auxiliary_loss_mlp": 0.01031134, "balance_loss_clip": 1.01260889, "balance_loss_mlp": 1.0233736, "epoch": 0.6929597787530812, "flos": 23586559706880.0, "grad_norm": 2.156911418951469, "language_loss": 0.72357726, "learning_rate": 9.099163182780283e-07, "loss": 0.74558413, "num_input_tokens_seen": 123996780, "step": 5763, "time_per_iteration": 2.6548314094543457 }, { "auxiliary_loss_clip": 0.01164587, "auxiliary_loss_mlp": 0.01028793, "balance_loss_clip": 0.97209811, "balance_loss_mlp": 1.0214442, "epoch": 0.6930800216437203, "flos": 18255656476800.0, "grad_norm": 2.718179929792089, "language_loss": 0.49382079, "learning_rate": 9.092633042576916e-07, "loss": 0.51575458, "num_input_tokens_seen": 124014045, "step": 5764, "time_per_iteration": 2.665391445159912 }, { "auxiliary_loss_clip": 0.01162757, "auxiliary_loss_mlp": 0.01026746, "balance_loss_clip": 0.97311831, "balance_loss_mlp": 1.01932573, "epoch": 0.6932002645343595, "flos": 29168621809920.0, "grad_norm": 1.8767111379614996, "language_loss": 0.56224871, "learning_rate": 9.086104557012446e-07, "loss": 0.5841437, "num_input_tokens_seen": 124034615, "step": 5765, "time_per_iteration": 2.78193998336792 }, { "auxiliary_loss_clip": 0.01155633, "auxiliary_loss_mlp": 0.01027819, "balance_loss_clip": 1.00699091, "balance_loss_mlp": 1.02074385, "epoch": 0.6933205074249985, "flos": 23842746483840.0, "grad_norm": 4.524345279938104, "language_loss": 0.65744621, "learning_rate": 9.079577727077239e-07, "loss": 0.67928076, "num_input_tokens_seen": 124053445, "step": 5766, "time_per_iteration": 3.629969596862793 }, { "auxiliary_loss_clip": 0.01167206, "auxiliary_loss_mlp": 0.01028722, "balance_loss_clip": 1.01038361, "balance_loss_mlp": 1.02120018, "epoch": 0.6934407503156376, "flos": 24166696268160.0, "grad_norm": 2.7982657737578167, "language_loss": 0.71644723, "learning_rate": 9.073052553761404e-07, "loss": 0.73840654, "num_input_tokens_seen": 124072810, "step": 5767, "time_per_iteration": 3.621274471282959 }, { "auxiliary_loss_clip": 0.01162769, "auxiliary_loss_mlp": 0.01022813, "balance_loss_clip": 0.89429075, "balance_loss_mlp": 1.01514196, "epoch": 0.6935609932062767, "flos": 20631327120000.0, "grad_norm": 1.5667734707791454, "language_loss": 0.78149533, "learning_rate": 9.066529038054805e-07, "loss": 0.80335116, "num_input_tokens_seen": 124092875, "step": 5768, "time_per_iteration": 2.782222270965576 }, { "auxiliary_loss_clip": 0.01163483, "auxiliary_loss_mlp": 0.01026465, "balance_loss_clip": 0.97110963, "balance_loss_mlp": 1.01958346, "epoch": 0.6936812360969158, "flos": 18254184019200.0, "grad_norm": 1.6279344757304595, "language_loss": 0.738657, "learning_rate": 9.060007180947071e-07, "loss": 0.76055646, "num_input_tokens_seen": 124110930, "step": 5769, "time_per_iteration": 3.5173304080963135 }, { "auxiliary_loss_clip": 0.01163179, "auxiliary_loss_mlp": 0.0102772, "balance_loss_clip": 0.89208353, "balance_loss_mlp": 1.02001929, "epoch": 0.6938014789875548, "flos": 31317336368640.0, "grad_norm": 1.8418705431568707, "language_loss": 0.73517239, "learning_rate": 9.053486983427534e-07, "loss": 0.75708139, "num_input_tokens_seen": 124132180, "step": 5770, "time_per_iteration": 2.796205520629883 }, { "auxiliary_loss_clip": 0.01168286, "auxiliary_loss_mlp": 0.0102194, "balance_loss_clip": 0.97037923, "balance_loss_mlp": 1.01481676, "epoch": 0.6939217218781939, "flos": 17528429721600.0, "grad_norm": 1.9318826922813153, "language_loss": 0.70618647, "learning_rate": 9.046968446485326e-07, "loss": 0.72808874, "num_input_tokens_seen": 124150585, "step": 5771, "time_per_iteration": 2.6403627395629883 }, { "auxiliary_loss_clip": 0.01171151, "auxiliary_loss_mlp": 0.01024248, "balance_loss_clip": 1.0108273, "balance_loss_mlp": 1.01685739, "epoch": 0.6940419647688331, "flos": 18551776199040.0, "grad_norm": 17.73530751890689, "language_loss": 0.70238048, "learning_rate": 9.040451571109295e-07, "loss": 0.72433448, "num_input_tokens_seen": 124166205, "step": 5772, "time_per_iteration": 2.6496918201446533 }, { "auxiliary_loss_clip": 0.01083252, "auxiliary_loss_mlp": 0.01001678, "balance_loss_clip": 0.91284895, "balance_loss_mlp": 1.00000942, "epoch": 0.6941622076594721, "flos": 66926286829440.0, "grad_norm": 0.9015032307224569, "language_loss": 0.60503775, "learning_rate": 9.033936358288042e-07, "loss": 0.6258871, "num_input_tokens_seen": 124219940, "step": 5773, "time_per_iteration": 3.191448926925659 }, { "auxiliary_loss_clip": 0.01171471, "auxiliary_loss_mlp": 0.01024489, "balance_loss_clip": 1.04944515, "balance_loss_mlp": 1.0174675, "epoch": 0.6942824505501112, "flos": 26578062051840.0, "grad_norm": 1.6568114863693415, "language_loss": 0.82396358, "learning_rate": 9.027422809009937e-07, "loss": 0.84592319, "num_input_tokens_seen": 124239885, "step": 5774, "time_per_iteration": 2.631803035736084 }, { "auxiliary_loss_clip": 0.01168247, "auxiliary_loss_mlp": 0.01025798, "balance_loss_clip": 1.00834, "balance_loss_mlp": 1.01868737, "epoch": 0.6944026934407503, "flos": 21248308056960.0, "grad_norm": 1.8226103306728676, "language_loss": 0.82967556, "learning_rate": 9.020910924263054e-07, "loss": 0.85161602, "num_input_tokens_seen": 124258410, "step": 5775, "time_per_iteration": 2.6637682914733887 }, { "auxiliary_loss_clip": 0.01079536, "auxiliary_loss_mlp": 0.01000025, "balance_loss_clip": 0.91104621, "balance_loss_mlp": 0.99817717, "epoch": 0.6945229363313894, "flos": 70677191537280.0, "grad_norm": 2.8580921840581617, "language_loss": 0.58195055, "learning_rate": 9.014400705035261e-07, "loss": 0.60274613, "num_input_tokens_seen": 124315315, "step": 5776, "time_per_iteration": 3.2942259311676025 }, { "auxiliary_loss_clip": 0.01169969, "auxiliary_loss_mlp": 0.01029194, "balance_loss_clip": 1.05072892, "balance_loss_mlp": 1.02242041, "epoch": 0.6946431792220285, "flos": 18952934267520.0, "grad_norm": 5.2473460841419195, "language_loss": 0.7667321, "learning_rate": 9.00789215231414e-07, "loss": 0.78872371, "num_input_tokens_seen": 124333710, "step": 5777, "time_per_iteration": 2.548375368118286 }, { "auxiliary_loss_clip": 0.01161362, "auxiliary_loss_mlp": 0.01122315, "balance_loss_clip": 0.92980063, "balance_loss_mlp": 0.0, "epoch": 0.6947634221126676, "flos": 20338834671360.0, "grad_norm": 1.6665534705568081, "language_loss": 0.81944513, "learning_rate": 9.001385267087056e-07, "loss": 0.84228194, "num_input_tokens_seen": 124352855, "step": 5778, "time_per_iteration": 2.725569725036621 }, { "auxiliary_loss_clip": 0.01170523, "auxiliary_loss_mlp": 0.01024475, "balance_loss_clip": 1.01176834, "balance_loss_mlp": 1.01738775, "epoch": 0.6948836650033067, "flos": 21833723917440.0, "grad_norm": 1.5461537581313347, "language_loss": 0.70893013, "learning_rate": 8.994880050341072e-07, "loss": 0.73088014, "num_input_tokens_seen": 124372960, "step": 5779, "time_per_iteration": 2.6854207515716553 }, { "auxiliary_loss_clip": 0.01158039, "auxiliary_loss_mlp": 0.01022706, "balance_loss_clip": 0.97055382, "balance_loss_mlp": 1.01618862, "epoch": 0.6950039078939457, "flos": 23657519024640.0, "grad_norm": 1.737144704576542, "language_loss": 0.77645373, "learning_rate": 8.988376503063026e-07, "loss": 0.79826123, "num_input_tokens_seen": 124394220, "step": 5780, "time_per_iteration": 2.6836540699005127 }, { "auxiliary_loss_clip": 0.01170306, "auxiliary_loss_mlp": 0.01022165, "balance_loss_clip": 0.89456713, "balance_loss_mlp": 1.01413012, "epoch": 0.6951241507845849, "flos": 21792462168960.0, "grad_norm": 2.0374895514347733, "language_loss": 0.81659919, "learning_rate": 8.981874626239521e-07, "loss": 0.83852386, "num_input_tokens_seen": 124412795, "step": 5781, "time_per_iteration": 2.7543423175811768 }, { "auxiliary_loss_clip": 0.01170351, "auxiliary_loss_mlp": 0.01031195, "balance_loss_clip": 1.01194143, "balance_loss_mlp": 1.02409005, "epoch": 0.695244393675224, "flos": 14647568244480.0, "grad_norm": 1.8965912775378395, "language_loss": 0.88344693, "learning_rate": 8.975374420856872e-07, "loss": 0.90546238, "num_input_tokens_seen": 124429690, "step": 5782, "time_per_iteration": 2.5746066570281982 }, { "auxiliary_loss_clip": 0.01152172, "auxiliary_loss_mlp": 0.01022168, "balance_loss_clip": 0.92901933, "balance_loss_mlp": 1.0149498, "epoch": 0.695364636565863, "flos": 16873203778560.0, "grad_norm": 2.030769897992797, "language_loss": 0.72377169, "learning_rate": 8.968875887901157e-07, "loss": 0.74551511, "num_input_tokens_seen": 124447070, "step": 5783, "time_per_iteration": 2.7537941932678223 }, { "auxiliary_loss_clip": 0.01162227, "auxiliary_loss_mlp": 0.0102971, "balance_loss_clip": 0.96833622, "balance_loss_mlp": 1.02222323, "epoch": 0.6954848794565022, "flos": 19354523299200.0, "grad_norm": 3.8461147798198385, "language_loss": 0.62606966, "learning_rate": 8.9623790283582e-07, "loss": 0.64798903, "num_input_tokens_seen": 124464950, "step": 5784, "time_per_iteration": 2.7603492736816406 }, { "auxiliary_loss_clip": 0.01165833, "auxiliary_loss_mlp": 0.0102796, "balance_loss_clip": 0.93409336, "balance_loss_mlp": 1.02059317, "epoch": 0.6956051223471412, "flos": 18990209606400.0, "grad_norm": 2.2890415108303355, "language_loss": 0.76118684, "learning_rate": 8.955883843213561e-07, "loss": 0.7831248, "num_input_tokens_seen": 124483965, "step": 5785, "time_per_iteration": 2.791839838027954 }, { "auxiliary_loss_clip": 0.01176618, "auxiliary_loss_mlp": 0.01030246, "balance_loss_clip": 1.01173341, "balance_loss_mlp": 1.02232456, "epoch": 0.6957253652377803, "flos": 16107229226880.0, "grad_norm": 1.9148200888894609, "language_loss": 0.86928153, "learning_rate": 8.949390333452569e-07, "loss": 0.89135015, "num_input_tokens_seen": 124501910, "step": 5786, "time_per_iteration": 2.6560676097869873 }, { "auxiliary_loss_clip": 0.01168752, "auxiliary_loss_mlp": 0.01027171, "balance_loss_clip": 1.0498184, "balance_loss_mlp": 1.01977396, "epoch": 0.6958456081284194, "flos": 29388646569600.0, "grad_norm": 1.6398022930382, "language_loss": 0.67612123, "learning_rate": 8.942898500060279e-07, "loss": 0.69808042, "num_input_tokens_seen": 124521625, "step": 5787, "time_per_iteration": 2.739746570587158 }, { "auxiliary_loss_clip": 0.01174642, "auxiliary_loss_mlp": 0.01030007, "balance_loss_clip": 0.89817584, "balance_loss_mlp": 1.02235413, "epoch": 0.6959658510190585, "flos": 25154850395520.0, "grad_norm": 2.176317518770801, "language_loss": 0.71693343, "learning_rate": 8.936408344021493e-07, "loss": 0.73897994, "num_input_tokens_seen": 124538540, "step": 5788, "time_per_iteration": 2.8136672973632812 }, { "auxiliary_loss_clip": 0.01178947, "auxiliary_loss_mlp": 0.0102846, "balance_loss_clip": 0.97593451, "balance_loss_mlp": 1.02037799, "epoch": 0.6960860939096976, "flos": 42814388759040.0, "grad_norm": 2.199827348974353, "language_loss": 0.71069831, "learning_rate": 8.929919866320765e-07, "loss": 0.73277235, "num_input_tokens_seen": 124559355, "step": 5789, "time_per_iteration": 3.766467332839966 }, { "auxiliary_loss_clip": 0.01172134, "auxiliary_loss_mlp": 0.01122769, "balance_loss_clip": 0.9337396, "balance_loss_mlp": 0.0, "epoch": 0.6962063368003367, "flos": 17566566986880.0, "grad_norm": 1.9383174891003077, "language_loss": 0.81269586, "learning_rate": 8.923433067942385e-07, "loss": 0.8356449, "num_input_tokens_seen": 124577920, "step": 5790, "time_per_iteration": 2.741273880004883 }, { "auxiliary_loss_clip": 0.01174624, "auxiliary_loss_mlp": 0.01026557, "balance_loss_clip": 0.93472064, "balance_loss_mlp": 1.01930022, "epoch": 0.6963265796909758, "flos": 21251648021760.0, "grad_norm": 1.7537287527627907, "language_loss": 0.68463165, "learning_rate": 8.916947949870417e-07, "loss": 0.70664346, "num_input_tokens_seen": 124597585, "step": 5791, "time_per_iteration": 2.763558864593506 }, { "auxiliary_loss_clip": 0.01070248, "auxiliary_loss_mlp": 0.01000521, "balance_loss_clip": 0.97541308, "balance_loss_mlp": 0.99879247, "epoch": 0.6964468225816148, "flos": 68828295801600.0, "grad_norm": 0.7597895531532591, "language_loss": 0.58185899, "learning_rate": 8.910464513088615e-07, "loss": 0.60256672, "num_input_tokens_seen": 124661625, "step": 5792, "time_per_iteration": 3.3001089096069336 }, { "auxiliary_loss_clip": 0.01160858, "auxiliary_loss_mlp": 0.01029753, "balance_loss_clip": 0.97204649, "balance_loss_mlp": 1.0225116, "epoch": 0.696567065472254, "flos": 18950887192320.0, "grad_norm": 2.0747931515641063, "language_loss": 0.78238159, "learning_rate": 8.903982758580542e-07, "loss": 0.80428767, "num_input_tokens_seen": 124680565, "step": 5793, "time_per_iteration": 4.530453443527222 }, { "auxiliary_loss_clip": 0.01167013, "auxiliary_loss_mlp": 0.01028806, "balance_loss_clip": 0.97367752, "balance_loss_mlp": 1.02117622, "epoch": 0.696687308362893, "flos": 22856675345280.0, "grad_norm": 1.8621928269454882, "language_loss": 0.79955316, "learning_rate": 8.897502687329457e-07, "loss": 0.82151139, "num_input_tokens_seen": 124700365, "step": 5794, "time_per_iteration": 2.7226130962371826 }, { "auxiliary_loss_clip": 0.01159157, "auxiliary_loss_mlp": 0.01037476, "balance_loss_clip": 0.93162566, "balance_loss_mlp": 1.02972698, "epoch": 0.6968075512535321, "flos": 24972926987520.0, "grad_norm": 2.6003106346660814, "language_loss": 0.79831773, "learning_rate": 8.891024300318382e-07, "loss": 0.82028407, "num_input_tokens_seen": 124718935, "step": 5795, "time_per_iteration": 3.6237845420837402 }, { "auxiliary_loss_clip": 0.0115275, "auxiliary_loss_mlp": 0.01027373, "balance_loss_clip": 0.92927933, "balance_loss_mlp": 1.02033997, "epoch": 0.6969277941441713, "flos": 21030438113280.0, "grad_norm": 1.520117594898483, "language_loss": 0.76024461, "learning_rate": 8.884547598530103e-07, "loss": 0.78204584, "num_input_tokens_seen": 124739505, "step": 5796, "time_per_iteration": 2.7762324810028076 }, { "auxiliary_loss_clip": 0.01150907, "auxiliary_loss_mlp": 0.01023569, "balance_loss_clip": 0.81727463, "balance_loss_mlp": 1.01605892, "epoch": 0.6970480370348103, "flos": 21579404647680.0, "grad_norm": 1.7098208243517568, "language_loss": 0.75063616, "learning_rate": 8.8780725829471e-07, "loss": 0.77238095, "num_input_tokens_seen": 124757410, "step": 5797, "time_per_iteration": 2.867347240447998 }, { "auxiliary_loss_clip": 0.01170461, "auxiliary_loss_mlp": 0.01024616, "balance_loss_clip": 1.04901505, "balance_loss_mlp": 1.0171144, "epoch": 0.6971682799254494, "flos": 22419175691520.0, "grad_norm": 2.3830241896635163, "language_loss": 0.77910483, "learning_rate": 8.87159925455165e-07, "loss": 0.80105567, "num_input_tokens_seen": 124777240, "step": 5798, "time_per_iteration": 2.745037078857422 }, { "auxiliary_loss_clip": 0.01160978, "auxiliary_loss_mlp": 0.01026451, "balance_loss_clip": 0.93307996, "balance_loss_mlp": 1.01879787, "epoch": 0.6972885228160886, "flos": 20005834659840.0, "grad_norm": 1.9099272398684675, "language_loss": 0.73262143, "learning_rate": 8.865127614325738e-07, "loss": 0.75449574, "num_input_tokens_seen": 124795670, "step": 5799, "time_per_iteration": 2.777757167816162 }, { "auxiliary_loss_clip": 0.01157832, "auxiliary_loss_mlp": 0.010227, "balance_loss_clip": 0.96898454, "balance_loss_mlp": 1.01456153, "epoch": 0.6974087657067276, "flos": 37853437656960.0, "grad_norm": 2.0680393750721486, "language_loss": 0.66943479, "learning_rate": 8.85865766325113e-07, "loss": 0.69124019, "num_input_tokens_seen": 124819600, "step": 5800, "time_per_iteration": 2.910466194152832 }, { "auxiliary_loss_clip": 0.01163135, "auxiliary_loss_mlp": 0.01027061, "balance_loss_clip": 0.97177434, "balance_loss_mlp": 1.01942575, "epoch": 0.6975290085973667, "flos": 29489267543040.0, "grad_norm": 2.52782304837913, "language_loss": 0.71801198, "learning_rate": 8.852189402309287e-07, "loss": 0.739914, "num_input_tokens_seen": 124838785, "step": 5801, "time_per_iteration": 2.773172378540039 }, { "auxiliary_loss_clip": 0.01170713, "auxiliary_loss_mlp": 0.010245, "balance_loss_clip": 1.01205015, "balance_loss_mlp": 1.01666152, "epoch": 0.6976492514880057, "flos": 12895630295040.0, "grad_norm": 2.0785482694533877, "language_loss": 0.74388576, "learning_rate": 8.845722832481441e-07, "loss": 0.76583791, "num_input_tokens_seen": 124854215, "step": 5802, "time_per_iteration": 2.6580841541290283 }, { "auxiliary_loss_clip": 0.01166035, "auxiliary_loss_mlp": 0.01024265, "balance_loss_clip": 1.00955307, "balance_loss_mlp": 1.01755643, "epoch": 0.6977694943786449, "flos": 24352929308160.0, "grad_norm": 1.7076088174835873, "language_loss": 0.77163363, "learning_rate": 8.83925795474858e-07, "loss": 0.7935366, "num_input_tokens_seen": 124874340, "step": 5803, "time_per_iteration": 2.6902177333831787 }, { "auxiliary_loss_clip": 0.0116518, "auxiliary_loss_mlp": 0.0102674, "balance_loss_clip": 0.93618518, "balance_loss_mlp": 1.01888371, "epoch": 0.6978897372692839, "flos": 29898470257920.0, "grad_norm": 2.7145642919564086, "language_loss": 0.59161079, "learning_rate": 8.832794770091414e-07, "loss": 0.61352998, "num_input_tokens_seen": 124895175, "step": 5804, "time_per_iteration": 2.8156886100769043 }, { "auxiliary_loss_clip": 0.01169868, "auxiliary_loss_mlp": 0.01027277, "balance_loss_clip": 0.96981394, "balance_loss_mlp": 1.02016664, "epoch": 0.698009980159923, "flos": 21761579450880.0, "grad_norm": 2.363636002484077, "language_loss": 0.8257488, "learning_rate": 8.826333279490401e-07, "loss": 0.84772027, "num_input_tokens_seen": 124915810, "step": 5805, "time_per_iteration": 2.757575511932373 }, { "auxiliary_loss_clip": 0.01169163, "auxiliary_loss_mlp": 0.01029139, "balance_loss_clip": 0.97297752, "balance_loss_mlp": 1.02207565, "epoch": 0.6981302230505622, "flos": 19857164267520.0, "grad_norm": 2.0629370217447667, "language_loss": 0.67814845, "learning_rate": 8.819873483925748e-07, "loss": 0.70013148, "num_input_tokens_seen": 124932930, "step": 5806, "time_per_iteration": 2.713216543197632 }, { "auxiliary_loss_clip": 0.01171267, "auxiliary_loss_mlp": 0.01122796, "balance_loss_clip": 0.93530321, "balance_loss_mlp": 0.0, "epoch": 0.6982504659412012, "flos": 22198648141440.0, "grad_norm": 1.9972853845031369, "language_loss": 0.74555284, "learning_rate": 8.81341538437739e-07, "loss": 0.76849341, "num_input_tokens_seen": 124951220, "step": 5807, "time_per_iteration": 2.7209255695343018 }, { "auxiliary_loss_clip": 0.01171657, "auxiliary_loss_mlp": 0.01025086, "balance_loss_clip": 0.97138011, "balance_loss_mlp": 1.01763594, "epoch": 0.6983707088318403, "flos": 35588479708800.0, "grad_norm": 1.6671184229087985, "language_loss": 0.67863899, "learning_rate": 8.80695898182503e-07, "loss": 0.70060635, "num_input_tokens_seen": 124972200, "step": 5808, "time_per_iteration": 2.875032424926758 }, { "auxiliary_loss_clip": 0.01074601, "auxiliary_loss_mlp": 0.01001743, "balance_loss_clip": 0.98565173, "balance_loss_mlp": 0.99999088, "epoch": 0.6984909517224794, "flos": 65440052760960.0, "grad_norm": 0.811355767350827, "language_loss": 0.65106207, "learning_rate": 8.800504277248093e-07, "loss": 0.67182553, "num_input_tokens_seen": 125036950, "step": 5809, "time_per_iteration": 3.2126681804656982 }, { "auxiliary_loss_clip": 0.01167624, "auxiliary_loss_mlp": 0.01122289, "balance_loss_clip": 0.94031549, "balance_loss_mlp": 0.0, "epoch": 0.6986111946131185, "flos": 18546927863040.0, "grad_norm": 1.5895219152372364, "language_loss": 0.75173891, "learning_rate": 8.794051271625753e-07, "loss": 0.77463806, "num_input_tokens_seen": 125054585, "step": 5810, "time_per_iteration": 2.7716829776763916 }, { "auxiliary_loss_clip": 0.01167558, "auxiliary_loss_mlp": 0.01023828, "balance_loss_clip": 0.97204322, "balance_loss_mlp": 1.01686358, "epoch": 0.6987314375037575, "flos": 23039173370880.0, "grad_norm": 1.6951292649894523, "language_loss": 0.83407617, "learning_rate": 8.787599965936925e-07, "loss": 0.85598999, "num_input_tokens_seen": 125075515, "step": 5811, "time_per_iteration": 2.7188656330108643 }, { "auxiliary_loss_clip": 0.01160378, "auxiliary_loss_mlp": 0.0102708, "balance_loss_clip": 0.9366039, "balance_loss_mlp": 1.02034199, "epoch": 0.6988516803943967, "flos": 38400393029760.0, "grad_norm": 1.7522480113784797, "language_loss": 0.71798229, "learning_rate": 8.781150361160261e-07, "loss": 0.73985684, "num_input_tokens_seen": 125097425, "step": 5812, "time_per_iteration": 2.8444008827209473 }, { "auxiliary_loss_clip": 0.01170412, "auxiliary_loss_mlp": 0.01026729, "balance_loss_clip": 0.93542862, "balance_loss_mlp": 1.01922214, "epoch": 0.6989719232850358, "flos": 24096993926400.0, "grad_norm": 1.8800549691615456, "language_loss": 0.73894268, "learning_rate": 8.774702458274181e-07, "loss": 0.76091409, "num_input_tokens_seen": 125117830, "step": 5813, "time_per_iteration": 2.798762321472168 }, { "auxiliary_loss_clip": 0.01168078, "auxiliary_loss_mlp": 0.01022876, "balance_loss_clip": 1.01013362, "balance_loss_mlp": 1.01527643, "epoch": 0.6990921661756748, "flos": 14866838818560.0, "grad_norm": 2.5349937290727658, "language_loss": 0.7038402, "learning_rate": 8.768256258256799e-07, "loss": 0.72574973, "num_input_tokens_seen": 125134455, "step": 5814, "time_per_iteration": 4.11737585067749 }, { "auxiliary_loss_clip": 0.01170391, "auxiliary_loss_mlp": 0.01025199, "balance_loss_clip": 1.01056349, "balance_loss_mlp": 1.01791525, "epoch": 0.699212409066314, "flos": 20193719725440.0, "grad_norm": 1.6195112153074525, "language_loss": 0.73484409, "learning_rate": 8.76181176208602e-07, "loss": 0.7568, "num_input_tokens_seen": 125152555, "step": 5815, "time_per_iteration": 2.6721978187561035 }, { "auxiliary_loss_clip": 0.01149546, "auxiliary_loss_mlp": 0.01029821, "balance_loss_clip": 0.89112687, "balance_loss_mlp": 1.02199805, "epoch": 0.699332651956953, "flos": 19427888828160.0, "grad_norm": 1.7457195823933112, "language_loss": 0.73380363, "learning_rate": 8.755368970739461e-07, "loss": 0.75559723, "num_input_tokens_seen": 125171915, "step": 5816, "time_per_iteration": 2.737694263458252 }, { "auxiliary_loss_clip": 0.01172157, "auxiliary_loss_mlp": 0.01024396, "balance_loss_clip": 0.93268204, "balance_loss_mlp": 1.01683259, "epoch": 0.6994528948475921, "flos": 16143714466560.0, "grad_norm": 3.901674117379048, "language_loss": 0.61451477, "learning_rate": 8.748927885194479e-07, "loss": 0.63648027, "num_input_tokens_seen": 125190220, "step": 5817, "time_per_iteration": 2.7029805183410645 }, { "auxiliary_loss_clip": 0.01066312, "auxiliary_loss_mlp": 0.01000251, "balance_loss_clip": 0.90096968, "balance_loss_mlp": 0.99852288, "epoch": 0.6995731377382313, "flos": 64952420699520.0, "grad_norm": 0.801965599563055, "language_loss": 0.57528311, "learning_rate": 8.742488506428209e-07, "loss": 0.5959487, "num_input_tokens_seen": 125249310, "step": 5818, "time_per_iteration": 3.2499446868896484 }, { "auxiliary_loss_clip": 0.01170468, "auxiliary_loss_mlp": 0.01122555, "balance_loss_clip": 0.97265673, "balance_loss_mlp": 0.0, "epoch": 0.6996933806288703, "flos": 24900136076160.0, "grad_norm": 1.7481489100682177, "language_loss": 0.78652358, "learning_rate": 8.736050835417466e-07, "loss": 0.80945385, "num_input_tokens_seen": 125269350, "step": 5819, "time_per_iteration": 4.515204429626465 }, { "auxiliary_loss_clip": 0.01173505, "auxiliary_loss_mlp": 0.01025435, "balance_loss_clip": 1.0118897, "balance_loss_mlp": 1.01790738, "epoch": 0.6998136235195094, "flos": 20777806782720.0, "grad_norm": 1.8354023965089976, "language_loss": 0.61604917, "learning_rate": 8.729614873138862e-07, "loss": 0.63803852, "num_input_tokens_seen": 125286985, "step": 5820, "time_per_iteration": 2.73683762550354 }, { "auxiliary_loss_clip": 0.01170337, "auxiliary_loss_mlp": 0.01029806, "balance_loss_clip": 0.89782929, "balance_loss_mlp": 1.02198601, "epoch": 0.6999338664101485, "flos": 23733470332800.0, "grad_norm": 1.862798685177662, "language_loss": 0.77835375, "learning_rate": 8.723180620568716e-07, "loss": 0.8003552, "num_input_tokens_seen": 125306240, "step": 5821, "time_per_iteration": 3.6836063861846924 }, { "auxiliary_loss_clip": 0.01169629, "auxiliary_loss_mlp": 0.01025182, "balance_loss_clip": 0.97197092, "balance_loss_mlp": 1.01804686, "epoch": 0.7000541093007876, "flos": 19864598382720.0, "grad_norm": 1.7487339534153457, "language_loss": 0.84920943, "learning_rate": 8.716748078683116e-07, "loss": 0.87115753, "num_input_tokens_seen": 125323015, "step": 5822, "time_per_iteration": 2.6017565727233887 }, { "auxiliary_loss_clip": 0.01152376, "auxiliary_loss_mlp": 0.01029924, "balance_loss_clip": 0.81707525, "balance_loss_mlp": 1.02237773, "epoch": 0.7001743521914267, "flos": 29679056029440.0, "grad_norm": 1.9190281423341033, "language_loss": 0.68570423, "learning_rate": 8.710317248457855e-07, "loss": 0.70752722, "num_input_tokens_seen": 125342630, "step": 5823, "time_per_iteration": 2.9503934383392334 }, { "auxiliary_loss_clip": 0.01165152, "auxiliary_loss_mlp": 0.01029411, "balance_loss_clip": 0.97391212, "balance_loss_mlp": 1.02188039, "epoch": 0.7002945950820658, "flos": 27489762080640.0, "grad_norm": 1.6368960755240827, "language_loss": 0.71832603, "learning_rate": 8.703888130868482e-07, "loss": 0.74027169, "num_input_tokens_seen": 125364480, "step": 5824, "time_per_iteration": 3.331721782684326 }, { "auxiliary_loss_clip": 0.01163638, "auxiliary_loss_mlp": 0.01025649, "balance_loss_clip": 0.93283111, "balance_loss_mlp": 1.01816869, "epoch": 0.7004148379727049, "flos": 22158463800960.0, "grad_norm": 1.9659074623360773, "language_loss": 0.82335365, "learning_rate": 8.697460726890307e-07, "loss": 0.84524655, "num_input_tokens_seen": 125381625, "step": 5825, "time_per_iteration": 2.7123372554779053 }, { "auxiliary_loss_clip": 0.01162374, "auxiliary_loss_mlp": 0.0112258, "balance_loss_clip": 0.93123996, "balance_loss_mlp": 0.0, "epoch": 0.7005350808633439, "flos": 19423758764160.0, "grad_norm": 1.8773877304124826, "language_loss": 0.90312028, "learning_rate": 8.691035037498354e-07, "loss": 0.92596984, "num_input_tokens_seen": 125397615, "step": 5826, "time_per_iteration": 2.771224021911621 }, { "auxiliary_loss_clip": 0.01160985, "auxiliary_loss_mlp": 0.01028152, "balance_loss_clip": 0.96926868, "balance_loss_mlp": 1.02042139, "epoch": 0.7006553237539831, "flos": 23476708938240.0, "grad_norm": 1.6345749514895238, "language_loss": 0.72145152, "learning_rate": 8.684611063667391e-07, "loss": 0.74334288, "num_input_tokens_seen": 125418080, "step": 5827, "time_per_iteration": 2.8455045223236084 }, { "auxiliary_loss_clip": 0.0116493, "auxiliary_loss_mlp": 0.01027177, "balance_loss_clip": 1.00774372, "balance_loss_mlp": 1.02013183, "epoch": 0.7007755666446221, "flos": 31212872640000.0, "grad_norm": 2.0730178927887164, "language_loss": 0.77163625, "learning_rate": 8.678188806371935e-07, "loss": 0.79355729, "num_input_tokens_seen": 125440115, "step": 5828, "time_per_iteration": 2.8178703784942627 }, { "auxiliary_loss_clip": 0.01167635, "auxiliary_loss_mlp": 0.01022386, "balance_loss_clip": 1.01034176, "balance_loss_mlp": 1.01597297, "epoch": 0.7008958095352612, "flos": 18149899858560.0, "grad_norm": 1.682317274905178, "language_loss": 0.84911281, "learning_rate": 8.671768266586228e-07, "loss": 0.87101305, "num_input_tokens_seen": 125458240, "step": 5829, "time_per_iteration": 2.588503122329712 }, { "auxiliary_loss_clip": 0.01161599, "auxiliary_loss_mlp": 0.01028323, "balance_loss_clip": 0.93325675, "balance_loss_mlp": 1.02115238, "epoch": 0.7010160524259004, "flos": 27452307173760.0, "grad_norm": 1.6679339379203766, "language_loss": 0.77927679, "learning_rate": 8.665349445284275e-07, "loss": 0.80117595, "num_input_tokens_seen": 125477980, "step": 5830, "time_per_iteration": 2.732787609100342 }, { "auxiliary_loss_clip": 0.01169834, "auxiliary_loss_mlp": 0.01022603, "balance_loss_clip": 0.93547869, "balance_loss_mlp": 1.01541758, "epoch": 0.7011362953165394, "flos": 23842064125440.0, "grad_norm": 1.4799506256888135, "language_loss": 0.80734372, "learning_rate": 8.658932343439799e-07, "loss": 0.8292681, "num_input_tokens_seen": 125497765, "step": 5831, "time_per_iteration": 2.7898004055023193 }, { "auxiliary_loss_clip": 0.01169983, "auxiliary_loss_mlp": 0.01022711, "balance_loss_clip": 1.04910672, "balance_loss_mlp": 1.01545763, "epoch": 0.7012565382071785, "flos": 24823430582400.0, "grad_norm": 1.7871907515671308, "language_loss": 0.77689743, "learning_rate": 8.65251696202627e-07, "loss": 0.79882437, "num_input_tokens_seen": 125514145, "step": 5832, "time_per_iteration": 2.6556529998779297 }, { "auxiliary_loss_clip": 0.0116974, "auxiliary_loss_mlp": 0.01022553, "balance_loss_clip": 0.93553305, "balance_loss_mlp": 1.01487637, "epoch": 0.7013767810978175, "flos": 21397445326080.0, "grad_norm": 1.8843622766123442, "language_loss": 0.87118983, "learning_rate": 8.646103302016896e-07, "loss": 0.89311278, "num_input_tokens_seen": 125533115, "step": 5833, "time_per_iteration": 2.760502815246582 }, { "auxiliary_loss_clip": 0.01175317, "auxiliary_loss_mlp": 0.01025816, "balance_loss_clip": 0.89584708, "balance_loss_mlp": 1.0183115, "epoch": 0.7014970239884567, "flos": 16687150306560.0, "grad_norm": 1.9032098374654483, "language_loss": 0.88395107, "learning_rate": 8.639691364384614e-07, "loss": 0.90596241, "num_input_tokens_seen": 125550740, "step": 5834, "time_per_iteration": 2.765763282775879 }, { "auxiliary_loss_clip": 0.01173517, "auxiliary_loss_mlp": 0.01026762, "balance_loss_clip": 0.97461677, "balance_loss_mlp": 1.01983309, "epoch": 0.7016172668790958, "flos": 12568268718720.0, "grad_norm": 1.7992916936425494, "language_loss": 0.72819865, "learning_rate": 8.633281150102136e-07, "loss": 0.75020146, "num_input_tokens_seen": 125567590, "step": 5835, "time_per_iteration": 2.677875280380249 }, { "auxiliary_loss_clip": 0.01166354, "auxiliary_loss_mlp": 0.01024944, "balance_loss_clip": 0.9732703, "balance_loss_mlp": 1.01760364, "epoch": 0.7017375097697348, "flos": 17452729808640.0, "grad_norm": 2.2926157496203396, "language_loss": 0.68215829, "learning_rate": 8.626872660141855e-07, "loss": 0.70407128, "num_input_tokens_seen": 125585500, "step": 5836, "time_per_iteration": 2.668498992919922 }, { "auxiliary_loss_clip": 0.01167466, "auxiliary_loss_mlp": 0.01025112, "balance_loss_clip": 0.89759851, "balance_loss_mlp": 1.01740849, "epoch": 0.701857752660374, "flos": 18513028402560.0, "grad_norm": 1.655924117856182, "language_loss": 0.74697387, "learning_rate": 8.620465895475957e-07, "loss": 0.76889968, "num_input_tokens_seen": 125603720, "step": 5837, "time_per_iteration": 2.759160041809082 }, { "auxiliary_loss_clip": 0.01158664, "auxiliary_loss_mlp": 0.01029924, "balance_loss_clip": 0.89558375, "balance_loss_mlp": 1.02312589, "epoch": 0.701977995551013, "flos": 24425971614720.0, "grad_norm": 1.4744829970807902, "language_loss": 0.74882215, "learning_rate": 8.614060857076333e-07, "loss": 0.77070808, "num_input_tokens_seen": 125624390, "step": 5838, "time_per_iteration": 2.78658390045166 }, { "auxiliary_loss_clip": 0.01163226, "auxiliary_loss_mlp": 0.0103021, "balance_loss_clip": 0.97224551, "balance_loss_mlp": 1.02273524, "epoch": 0.7020982384416521, "flos": 23002759958400.0, "grad_norm": 1.814517599962057, "language_loss": 0.7466563, "learning_rate": 8.60765754591462e-07, "loss": 0.76859069, "num_input_tokens_seen": 125644085, "step": 5839, "time_per_iteration": 2.699347734451294 }, { "auxiliary_loss_clip": 0.01168429, "auxiliary_loss_mlp": 0.01027473, "balance_loss_clip": 1.04841423, "balance_loss_mlp": 1.02016568, "epoch": 0.7022184813322913, "flos": 20449080489600.0, "grad_norm": 1.9765285149551843, "language_loss": 0.72504735, "learning_rate": 8.601255962962211e-07, "loss": 0.74700642, "num_input_tokens_seen": 125663095, "step": 5840, "time_per_iteration": 3.9878687858581543 }, { "auxiliary_loss_clip": 0.01175972, "auxiliary_loss_mlp": 0.01029181, "balance_loss_clip": 1.01099098, "balance_loss_mlp": 1.02064013, "epoch": 0.7023387242229303, "flos": 19790514581760.0, "grad_norm": 2.660889943664633, "language_loss": 0.72362399, "learning_rate": 8.594856109190194e-07, "loss": 0.7456755, "num_input_tokens_seen": 125680125, "step": 5841, "time_per_iteration": 2.657158136367798 }, { "auxiliary_loss_clip": 0.01168875, "auxiliary_loss_mlp": 0.01021403, "balance_loss_clip": 1.0492568, "balance_loss_mlp": 1.01376784, "epoch": 0.7024589671135694, "flos": 33259278286080.0, "grad_norm": 1.5350143052113252, "language_loss": 0.68727589, "learning_rate": 8.588457985569446e-07, "loss": 0.70917863, "num_input_tokens_seen": 125703035, "step": 5842, "time_per_iteration": 2.795463800430298 }, { "auxiliary_loss_clip": 0.01172559, "auxiliary_loss_mlp": 0.01026578, "balance_loss_clip": 1.05072331, "balance_loss_mlp": 1.01881146, "epoch": 0.7025792100042085, "flos": 19098982967040.0, "grad_norm": 1.9425069010725218, "language_loss": 0.71787226, "learning_rate": 8.582061593070542e-07, "loss": 0.73986369, "num_input_tokens_seen": 125723765, "step": 5843, "time_per_iteration": 2.663937568664551 }, { "auxiliary_loss_clip": 0.011697, "auxiliary_loss_mlp": 0.01122585, "balance_loss_clip": 1.04805422, "balance_loss_mlp": 0.0, "epoch": 0.7026994528948476, "flos": 18952611045120.0, "grad_norm": 2.2892567492948794, "language_loss": 0.76663792, "learning_rate": 8.57566693266383e-07, "loss": 0.78956079, "num_input_tokens_seen": 125741455, "step": 5844, "time_per_iteration": 2.6661643981933594 }, { "auxiliary_loss_clip": 0.01173622, "auxiliary_loss_mlp": 0.01122681, "balance_loss_clip": 0.97240007, "balance_loss_mlp": 0.0, "epoch": 0.7028196957854866, "flos": 19536662188800.0, "grad_norm": 2.025909740540363, "language_loss": 0.6916818, "learning_rate": 8.569274005319354e-07, "loss": 0.71464479, "num_input_tokens_seen": 125759855, "step": 5845, "time_per_iteration": 3.676208019256592 }, { "auxiliary_loss_clip": 0.01161919, "auxiliary_loss_mlp": 0.01029424, "balance_loss_clip": 1.00926948, "balance_loss_mlp": 1.02227151, "epoch": 0.7029399386761258, "flos": 20845318394880.0, "grad_norm": 2.0777219134332983, "language_loss": 0.79602611, "learning_rate": 8.562882812006913e-07, "loss": 0.81793952, "num_input_tokens_seen": 125777345, "step": 5846, "time_per_iteration": 2.7439322471618652 }, { "auxiliary_loss_clip": 0.01168916, "auxiliary_loss_mlp": 0.01027376, "balance_loss_clip": 1.04806137, "balance_loss_mlp": 1.02032495, "epoch": 0.7030601815667649, "flos": 22055005653120.0, "grad_norm": 1.6506348115523695, "language_loss": 0.7749809, "learning_rate": 8.556493353696066e-07, "loss": 0.79694378, "num_input_tokens_seen": 125796345, "step": 5847, "time_per_iteration": 3.607912063598633 }, { "auxiliary_loss_clip": 0.01171957, "auxiliary_loss_mlp": 0.01123121, "balance_loss_clip": 1.01078486, "balance_loss_mlp": 0.0, "epoch": 0.7031804244574039, "flos": 27198742089600.0, "grad_norm": 2.145643394939609, "language_loss": 0.6807611, "learning_rate": 8.550105631356077e-07, "loss": 0.70371193, "num_input_tokens_seen": 125816070, "step": 5848, "time_per_iteration": 2.7561984062194824 }, { "auxiliary_loss_clip": 0.01159937, "auxiliary_loss_mlp": 0.01032268, "balance_loss_clip": 0.93168652, "balance_loss_mlp": 1.02496636, "epoch": 0.7033006673480431, "flos": 22379853277440.0, "grad_norm": 2.689521198285791, "language_loss": 0.77462268, "learning_rate": 8.543719645955961e-07, "loss": 0.79654479, "num_input_tokens_seen": 125834400, "step": 5849, "time_per_iteration": 2.801934242248535 }, { "auxiliary_loss_clip": 0.01170315, "auxiliary_loss_mlp": 0.01024681, "balance_loss_clip": 0.97205377, "balance_loss_mlp": 1.01751339, "epoch": 0.7034209102386821, "flos": 24715986024960.0, "grad_norm": 1.5685028175794336, "language_loss": 0.74441385, "learning_rate": 8.537335398464467e-07, "loss": 0.7663638, "num_input_tokens_seen": 125854720, "step": 5850, "time_per_iteration": 2.7331936359405518 }, { "auxiliary_loss_clip": 0.01163315, "auxiliary_loss_mlp": 0.01024949, "balance_loss_clip": 0.96890205, "balance_loss_mlp": 1.01731932, "epoch": 0.7035411531293212, "flos": 22556174163840.0, "grad_norm": 3.034012393970634, "language_loss": 0.8506121, "learning_rate": 8.53095288985007e-07, "loss": 0.87249476, "num_input_tokens_seen": 125868455, "step": 5851, "time_per_iteration": 2.683784008026123 }, { "auxiliary_loss_clip": 0.0116797, "auxiliary_loss_mlp": 0.01025281, "balance_loss_clip": 1.04944825, "balance_loss_mlp": 1.01839399, "epoch": 0.7036613960199604, "flos": 22674967418880.0, "grad_norm": 1.5584607435082374, "language_loss": 0.82241917, "learning_rate": 8.524572121081009e-07, "loss": 0.84435165, "num_input_tokens_seen": 125888555, "step": 5852, "time_per_iteration": 2.6760189533233643 }, { "auxiliary_loss_clip": 0.01169736, "auxiliary_loss_mlp": 0.01027885, "balance_loss_clip": 1.00907433, "balance_loss_mlp": 1.02002954, "epoch": 0.7037816389105994, "flos": 22492146170880.0, "grad_norm": 2.5070392909777155, "language_loss": 0.62180853, "learning_rate": 8.518193093125232e-07, "loss": 0.6437847, "num_input_tokens_seen": 125907610, "step": 5853, "time_per_iteration": 2.724057912826538 }, { "auxiliary_loss_clip": 0.01173519, "auxiliary_loss_mlp": 0.01023035, "balance_loss_clip": 0.9730041, "balance_loss_mlp": 1.01629925, "epoch": 0.7039018818012385, "flos": 27087490690560.0, "grad_norm": 2.187639015542998, "language_loss": 0.81110644, "learning_rate": 8.511815806950436e-07, "loss": 0.83307195, "num_input_tokens_seen": 125928640, "step": 5854, "time_per_iteration": 2.745480537414551 }, { "auxiliary_loss_clip": 0.01166421, "auxiliary_loss_mlp": 0.01025856, "balance_loss_clip": 1.00844026, "balance_loss_mlp": 1.01886404, "epoch": 0.7040221246918776, "flos": 17749819198080.0, "grad_norm": 1.5790654727108413, "language_loss": 0.78095275, "learning_rate": 8.505440263524044e-07, "loss": 0.80287552, "num_input_tokens_seen": 125947485, "step": 5855, "time_per_iteration": 2.649782180786133 }, { "auxiliary_loss_clip": 0.01172057, "auxiliary_loss_mlp": 0.0103223, "balance_loss_clip": 1.01091373, "balance_loss_mlp": 1.02399886, "epoch": 0.7041423675825167, "flos": 16279851012480.0, "grad_norm": 3.513001711586992, "language_loss": 0.87948304, "learning_rate": 8.49906646381322e-07, "loss": 0.90152586, "num_input_tokens_seen": 125960320, "step": 5856, "time_per_iteration": 2.731992483139038 }, { "auxiliary_loss_clip": 0.01171112, "auxiliary_loss_mlp": 0.01027517, "balance_loss_clip": 0.93617135, "balance_loss_mlp": 1.02030754, "epoch": 0.7042626104731557, "flos": 25483181639040.0, "grad_norm": 5.462495598555303, "language_loss": 0.71908879, "learning_rate": 8.492694408784884e-07, "loss": 0.7410751, "num_input_tokens_seen": 125980575, "step": 5857, "time_per_iteration": 2.7184319496154785 }, { "auxiliary_loss_clip": 0.01170583, "auxiliary_loss_mlp": 0.01028016, "balance_loss_clip": 1.01018918, "balance_loss_mlp": 1.02091742, "epoch": 0.7043828533637949, "flos": 17857622891520.0, "grad_norm": 2.7448626316167584, "language_loss": 0.62382698, "learning_rate": 8.486324099405642e-07, "loss": 0.64581299, "num_input_tokens_seen": 125997420, "step": 5858, "time_per_iteration": 2.738820791244507 }, { "auxiliary_loss_clip": 0.01168005, "auxiliary_loss_mlp": 0.01020864, "balance_loss_clip": 1.00951457, "balance_loss_mlp": 1.01408362, "epoch": 0.704503096254434, "flos": 29494259533440.0, "grad_norm": 1.6976928373206681, "language_loss": 0.74524295, "learning_rate": 8.479955536641887e-07, "loss": 0.76713163, "num_input_tokens_seen": 126018915, "step": 5859, "time_per_iteration": 2.7081236839294434 }, { "auxiliary_loss_clip": 0.01154504, "auxiliary_loss_mlp": 0.0102543, "balance_loss_clip": 0.96614182, "balance_loss_mlp": 1.01820576, "epoch": 0.704623339145073, "flos": 30920739327360.0, "grad_norm": 1.7037546602567757, "language_loss": 0.65944481, "learning_rate": 8.473588721459716e-07, "loss": 0.68124413, "num_input_tokens_seen": 126038825, "step": 5860, "time_per_iteration": 2.7918732166290283 }, { "auxiliary_loss_clip": 0.01171397, "auxiliary_loss_mlp": 0.01027558, "balance_loss_clip": 1.01385212, "balance_loss_mlp": 1.01957726, "epoch": 0.7047435820357122, "flos": 23914747296000.0, "grad_norm": 1.7689855845615325, "language_loss": 0.70280927, "learning_rate": 8.467223654824967e-07, "loss": 0.7247988, "num_input_tokens_seen": 126058280, "step": 5861, "time_per_iteration": 2.755159378051758 }, { "auxiliary_loss_clip": 0.01161524, "auxiliary_loss_mlp": 0.01023924, "balance_loss_clip": 1.00952506, "balance_loss_mlp": 1.01643753, "epoch": 0.7048638249263512, "flos": 46494010926720.0, "grad_norm": 1.9771743005349534, "language_loss": 0.62758785, "learning_rate": 8.460860337703233e-07, "loss": 0.64944232, "num_input_tokens_seen": 126078885, "step": 5862, "time_per_iteration": 2.914186954498291 }, { "auxiliary_loss_clip": 0.01152196, "auxiliary_loss_mlp": 0.01028004, "balance_loss_clip": 0.93055141, "balance_loss_mlp": 1.01999962, "epoch": 0.7049840678169903, "flos": 21689219502720.0, "grad_norm": 1.7873809210324887, "language_loss": 0.70680749, "learning_rate": 8.454498771059797e-07, "loss": 0.72860944, "num_input_tokens_seen": 126098260, "step": 5863, "time_per_iteration": 2.7802789211273193 }, { "auxiliary_loss_clip": 0.01157827, "auxiliary_loss_mlp": 0.01027278, "balance_loss_clip": 0.89454395, "balance_loss_mlp": 1.01942801, "epoch": 0.7051043107076294, "flos": 18405081054720.0, "grad_norm": 2.3382282968513888, "language_loss": 0.83238232, "learning_rate": 8.448138955859725e-07, "loss": 0.85423338, "num_input_tokens_seen": 126114845, "step": 5864, "time_per_iteration": 2.7772727012634277 }, { "auxiliary_loss_clip": 0.01167061, "auxiliary_loss_mlp": 0.01026764, "balance_loss_clip": 0.97271764, "balance_loss_mlp": 1.01923609, "epoch": 0.7052245535982685, "flos": 19319043640320.0, "grad_norm": 1.8747326301563458, "language_loss": 0.8991375, "learning_rate": 8.44178089306778e-07, "loss": 0.9210757, "num_input_tokens_seen": 126132780, "step": 5865, "time_per_iteration": 2.7690422534942627 }, { "auxiliary_loss_clip": 0.0117075, "auxiliary_loss_mlp": 0.01024325, "balance_loss_clip": 1.05060518, "balance_loss_mlp": 1.01716614, "epoch": 0.7053447964889076, "flos": 19062138591360.0, "grad_norm": 1.7284915461247061, "language_loss": 0.76736367, "learning_rate": 8.4354245836485e-07, "loss": 0.78931445, "num_input_tokens_seen": 126151225, "step": 5866, "time_per_iteration": 3.7203803062438965 }, { "auxiliary_loss_clip": 0.01170686, "auxiliary_loss_mlp": 0.0102229, "balance_loss_clip": 0.93576336, "balance_loss_mlp": 1.01486301, "epoch": 0.7054650393795466, "flos": 27379228953600.0, "grad_norm": 1.5769430957799797, "language_loss": 0.72298801, "learning_rate": 8.429070028566108e-07, "loss": 0.74491775, "num_input_tokens_seen": 126172535, "step": 5867, "time_per_iteration": 2.747521162033081 }, { "auxiliary_loss_clip": 0.01167942, "auxiliary_loss_mlp": 0.01021699, "balance_loss_clip": 1.01117539, "balance_loss_mlp": 1.01451683, "epoch": 0.7055852822701858, "flos": 16102201322880.0, "grad_norm": 1.9814822203087878, "language_loss": 0.75056559, "learning_rate": 8.422717228784586e-07, "loss": 0.77246201, "num_input_tokens_seen": 126189410, "step": 5868, "time_per_iteration": 2.655562162399292 }, { "auxiliary_loss_clip": 0.01166281, "auxiliary_loss_mlp": 0.01032236, "balance_loss_clip": 0.89784455, "balance_loss_mlp": 1.0243355, "epoch": 0.7057055251608249, "flos": 11692299744000.0, "grad_norm": 1.8119451016267085, "language_loss": 0.69461071, "learning_rate": 8.416366185267663e-07, "loss": 0.71659595, "num_input_tokens_seen": 126206910, "step": 5869, "time_per_iteration": 2.7394304275512695 }, { "auxiliary_loss_clip": 0.01166566, "auxiliary_loss_mlp": 0.01030418, "balance_loss_clip": 1.00844479, "balance_loss_mlp": 1.02340865, "epoch": 0.7058257680514639, "flos": 22711560399360.0, "grad_norm": 1.8208760136819842, "language_loss": 0.7771672, "learning_rate": 8.410016898978778e-07, "loss": 0.799137, "num_input_tokens_seen": 126224385, "step": 5870, "time_per_iteration": 2.7038917541503906 }, { "auxiliary_loss_clip": 0.01169462, "auxiliary_loss_mlp": 0.01027369, "balance_loss_clip": 0.89885515, "balance_loss_mlp": 1.02020729, "epoch": 0.7059460109421031, "flos": 17529543043200.0, "grad_norm": 1.7163092813389371, "language_loss": 0.78767419, "learning_rate": 8.403669370881115e-07, "loss": 0.80964243, "num_input_tokens_seen": 126243120, "step": 5871, "time_per_iteration": 4.672168493270874 }, { "auxiliary_loss_clip": 0.01169904, "auxiliary_loss_mlp": 0.01024563, "balance_loss_clip": 1.04912031, "balance_loss_mlp": 1.01708305, "epoch": 0.7060662538327421, "flos": 23544687427200.0, "grad_norm": 1.6528001220865525, "language_loss": 0.78484732, "learning_rate": 8.397323601937587e-07, "loss": 0.80679196, "num_input_tokens_seen": 126263020, "step": 5872, "time_per_iteration": 3.650085687637329 }, { "auxiliary_loss_clip": 0.01159257, "auxiliary_loss_mlp": 0.01022376, "balance_loss_clip": 0.93258941, "balance_loss_mlp": 1.01487219, "epoch": 0.7061864967233812, "flos": 30260736875520.0, "grad_norm": 1.8773690637748153, "language_loss": 0.76953304, "learning_rate": 8.390979593110838e-07, "loss": 0.79134935, "num_input_tokens_seen": 126285150, "step": 5873, "time_per_iteration": 2.8758318424224854 }, { "auxiliary_loss_clip": 0.01173529, "auxiliary_loss_mlp": 0.01024795, "balance_loss_clip": 0.97420192, "balance_loss_mlp": 1.01710284, "epoch": 0.7063067396140204, "flos": 20701460424960.0, "grad_norm": 1.9733324500359564, "language_loss": 0.81428021, "learning_rate": 8.384637345363262e-07, "loss": 0.83626348, "num_input_tokens_seen": 126304340, "step": 5874, "time_per_iteration": 2.8122901916503906 }, { "auxiliary_loss_clip": 0.01156187, "auxiliary_loss_mlp": 0.01023408, "balance_loss_clip": 0.96803391, "balance_loss_mlp": 1.01585603, "epoch": 0.7064269825046594, "flos": 32266168081920.0, "grad_norm": 1.5921270360915583, "language_loss": 0.76295853, "learning_rate": 8.378296859656964e-07, "loss": 0.78475451, "num_input_tokens_seen": 126325495, "step": 5875, "time_per_iteration": 2.8518550395965576 }, { "auxiliary_loss_clip": 0.01167808, "auxiliary_loss_mlp": 0.01024218, "balance_loss_clip": 0.97254449, "balance_loss_mlp": 1.01750088, "epoch": 0.7065472253952985, "flos": 30227124723840.0, "grad_norm": 1.9402891358145757, "language_loss": 0.67830265, "learning_rate": 8.371958136953792e-07, "loss": 0.70022297, "num_input_tokens_seen": 126345525, "step": 5876, "time_per_iteration": 2.7928221225738525 }, { "auxiliary_loss_clip": 0.0116971, "auxiliary_loss_mlp": 0.01029124, "balance_loss_clip": 0.93204033, "balance_loss_mlp": 1.02125025, "epoch": 0.7066674682859376, "flos": 16216720859520.0, "grad_norm": 2.787149205195021, "language_loss": 0.66106212, "learning_rate": 8.365621178215326e-07, "loss": 0.68305045, "num_input_tokens_seen": 126361995, "step": 5877, "time_per_iteration": 2.7997825145721436 }, { "auxiliary_loss_clip": 0.01160861, "auxiliary_loss_mlp": 0.01025295, "balance_loss_clip": 1.00888634, "balance_loss_mlp": 1.01818752, "epoch": 0.7067877111765767, "flos": 14830461319680.0, "grad_norm": 2.068225916488796, "language_loss": 0.75185335, "learning_rate": 8.359285984402871e-07, "loss": 0.7737149, "num_input_tokens_seen": 126379260, "step": 5878, "time_per_iteration": 2.7097907066345215 }, { "auxiliary_loss_clip": 0.01164892, "auxiliary_loss_mlp": 0.0102409, "balance_loss_clip": 0.97353703, "balance_loss_mlp": 1.01693201, "epoch": 0.7069079540672157, "flos": 25440196037760.0, "grad_norm": 2.5819366612550354, "language_loss": 0.73939043, "learning_rate": 8.352952556477489e-07, "loss": 0.7612803, "num_input_tokens_seen": 126397170, "step": 5879, "time_per_iteration": 2.749863624572754 }, { "auxiliary_loss_clip": 0.01168385, "auxiliary_loss_mlp": 0.01025369, "balance_loss_clip": 1.0112499, "balance_loss_mlp": 1.01796067, "epoch": 0.7070281969578549, "flos": 24607751368320.0, "grad_norm": 2.1965779570774635, "language_loss": 0.76598442, "learning_rate": 8.34662089539993e-07, "loss": 0.78792197, "num_input_tokens_seen": 126416680, "step": 5880, "time_per_iteration": 2.715283155441284 }, { "auxiliary_loss_clip": 0.01166557, "auxiliary_loss_mlp": 0.01025075, "balance_loss_clip": 1.04908776, "balance_loss_mlp": 1.01767862, "epoch": 0.707148439848494, "flos": 26724469887360.0, "grad_norm": 1.9627057136179273, "language_loss": 0.79463816, "learning_rate": 8.340291002130722e-07, "loss": 0.81655443, "num_input_tokens_seen": 126435870, "step": 5881, "time_per_iteration": 2.6889004707336426 }, { "auxiliary_loss_clip": 0.01172816, "auxiliary_loss_mlp": 0.01032169, "balance_loss_clip": 1.05099821, "balance_loss_mlp": 1.02456975, "epoch": 0.707268682739133, "flos": 15085750256640.0, "grad_norm": 2.7678832981017703, "language_loss": 0.79959613, "learning_rate": 8.3339628776301e-07, "loss": 0.82164598, "num_input_tokens_seen": 126454010, "step": 5882, "time_per_iteration": 2.6766233444213867 }, { "auxiliary_loss_clip": 0.01168886, "auxiliary_loss_mlp": 0.01026418, "balance_loss_clip": 1.04801941, "balance_loss_mlp": 1.01945066, "epoch": 0.7073889256297722, "flos": 34313148345600.0, "grad_norm": 1.7956837842529743, "language_loss": 0.56847012, "learning_rate": 8.327636522858033e-07, "loss": 0.59042317, "num_input_tokens_seen": 126473615, "step": 5883, "time_per_iteration": 2.727919816970825 }, { "auxiliary_loss_clip": 0.01170345, "auxiliary_loss_mlp": 0.01027141, "balance_loss_clip": 0.89943302, "balance_loss_mlp": 1.01983929, "epoch": 0.7075091685204112, "flos": 20083940784000.0, "grad_norm": 2.106641083275422, "language_loss": 0.76990438, "learning_rate": 8.321311938774225e-07, "loss": 0.7918793, "num_input_tokens_seen": 126492705, "step": 5884, "time_per_iteration": 2.8625173568725586 }, { "auxiliary_loss_clip": 0.01172681, "auxiliary_loss_mlp": 0.01024734, "balance_loss_clip": 1.05019808, "balance_loss_mlp": 1.01736689, "epoch": 0.7076294114110503, "flos": 20777124424320.0, "grad_norm": 1.8860054068556782, "language_loss": 0.79151118, "learning_rate": 8.314989126338104e-07, "loss": 0.81348538, "num_input_tokens_seen": 126512715, "step": 5885, "time_per_iteration": 2.6450023651123047 }, { "auxiliary_loss_clip": 0.01170374, "auxiliary_loss_mlp": 0.01024391, "balance_loss_clip": 1.00994885, "balance_loss_mlp": 1.01694012, "epoch": 0.7077496543016895, "flos": 17967689141760.0, "grad_norm": 1.7072570542408736, "language_loss": 0.84631586, "learning_rate": 8.308668086508847e-07, "loss": 0.86826354, "num_input_tokens_seen": 126530795, "step": 5886, "time_per_iteration": 2.693148374557495 }, { "auxiliary_loss_clip": 0.01165508, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 0.93038464, "balance_loss_mlp": 1.02513742, "epoch": 0.7078698971923285, "flos": 45478098564480.0, "grad_norm": 1.777636620277453, "language_loss": 0.73564231, "learning_rate": 8.302348820245342e-07, "loss": 0.75762409, "num_input_tokens_seen": 126553360, "step": 5887, "time_per_iteration": 3.0466885566711426 }, { "auxiliary_loss_clip": 0.01165383, "auxiliary_loss_mlp": 0.01026421, "balance_loss_clip": 0.93341577, "balance_loss_mlp": 1.0188396, "epoch": 0.7079901400829676, "flos": 26943704547840.0, "grad_norm": 2.4865692974504, "language_loss": 0.70355713, "learning_rate": 8.296031328506232e-07, "loss": 0.72547519, "num_input_tokens_seen": 126573110, "step": 5888, "time_per_iteration": 2.8327605724334717 }, { "auxiliary_loss_clip": 0.0116938, "auxiliary_loss_mlp": 0.01024313, "balance_loss_clip": 0.97227466, "balance_loss_mlp": 1.01736569, "epoch": 0.7081103829736067, "flos": 24423206267520.0, "grad_norm": 3.936021355987275, "language_loss": 0.75813252, "learning_rate": 8.289715612249857e-07, "loss": 0.78006947, "num_input_tokens_seen": 126593725, "step": 5889, "time_per_iteration": 2.855226755142212 }, { "auxiliary_loss_clip": 0.01164775, "auxiliary_loss_mlp": 0.01032372, "balance_loss_clip": 0.97274947, "balance_loss_mlp": 1.02507937, "epoch": 0.7082306258642458, "flos": 18543300589440.0, "grad_norm": 3.6867074097101873, "language_loss": 0.77710003, "learning_rate": 8.283401672434305e-07, "loss": 0.79907155, "num_input_tokens_seen": 126608950, "step": 5890, "time_per_iteration": 2.7521238327026367 }, { "auxiliary_loss_clip": 0.01162844, "auxiliary_loss_mlp": 0.01022354, "balance_loss_clip": 0.97263342, "balance_loss_mlp": 1.01532984, "epoch": 0.7083508687548848, "flos": 23477534951040.0, "grad_norm": 1.9647517364055966, "language_loss": 0.70688111, "learning_rate": 8.277089510017412e-07, "loss": 0.72873306, "num_input_tokens_seen": 126629755, "step": 5891, "time_per_iteration": 2.751574754714966 }, { "auxiliary_loss_clip": 0.01164283, "auxiliary_loss_mlp": 0.01021689, "balance_loss_clip": 0.97258705, "balance_loss_mlp": 1.01471519, "epoch": 0.708471111645524, "flos": 22419463000320.0, "grad_norm": 1.6867550395269963, "language_loss": 0.8206774, "learning_rate": 8.270779125956719e-07, "loss": 0.84253711, "num_input_tokens_seen": 126650135, "step": 5892, "time_per_iteration": 2.7152600288391113 }, { "auxiliary_loss_clip": 0.01158889, "auxiliary_loss_mlp": 0.01023621, "balance_loss_clip": 0.8944056, "balance_loss_mlp": 1.01677811, "epoch": 0.7085913545361631, "flos": 20922885815040.0, "grad_norm": 2.2925130182669378, "language_loss": 0.80161989, "learning_rate": 8.264470521209505e-07, "loss": 0.82344496, "num_input_tokens_seen": 126668500, "step": 5893, "time_per_iteration": 3.7003579139709473 }, { "auxiliary_loss_clip": 0.01159279, "auxiliary_loss_mlp": 0.01021991, "balance_loss_clip": 1.00783098, "balance_loss_mlp": 1.01413536, "epoch": 0.7087115974268021, "flos": 15012384727680.0, "grad_norm": 2.238386290509437, "language_loss": 0.7626195, "learning_rate": 8.258163696732785e-07, "loss": 0.78443217, "num_input_tokens_seen": 126686090, "step": 5894, "time_per_iteration": 2.7143948078155518 }, { "auxiliary_loss_clip": 0.01161542, "auxiliary_loss_mlp": 0.01028797, "balance_loss_clip": 1.00918794, "balance_loss_mlp": 1.02174616, "epoch": 0.7088318403174413, "flos": 21539040739200.0, "grad_norm": 2.3026742970753484, "language_loss": 0.76600701, "learning_rate": 8.251858653483288e-07, "loss": 0.7879104, "num_input_tokens_seen": 126704255, "step": 5895, "time_per_iteration": 2.698866128921509 }, { "auxiliary_loss_clip": 0.01171388, "auxiliary_loss_mlp": 0.01026093, "balance_loss_clip": 1.01229513, "balance_loss_mlp": 1.01856542, "epoch": 0.7089520832080803, "flos": 15516785462400.0, "grad_norm": 1.9779526216852799, "language_loss": 0.85917175, "learning_rate": 8.245555392417501e-07, "loss": 0.88114661, "num_input_tokens_seen": 126718910, "step": 5896, "time_per_iteration": 2.7037458419799805 }, { "auxiliary_loss_clip": 0.01149715, "auxiliary_loss_mlp": 0.01031144, "balance_loss_clip": 0.89284486, "balance_loss_mlp": 1.02376807, "epoch": 0.7090723260987194, "flos": 20412667077120.0, "grad_norm": 1.8150780997889286, "language_loss": 0.78504092, "learning_rate": 8.239253914491613e-07, "loss": 0.80684948, "num_input_tokens_seen": 126737235, "step": 5897, "time_per_iteration": 3.76906681060791 }, { "auxiliary_loss_clip": 0.01159659, "auxiliary_loss_mlp": 0.01024622, "balance_loss_clip": 0.93358111, "balance_loss_mlp": 1.01726139, "epoch": 0.7091925689893585, "flos": 25668337271040.0, "grad_norm": 1.7563958787941838, "language_loss": 0.75390697, "learning_rate": 8.232954220661556e-07, "loss": 0.7757498, "num_input_tokens_seen": 126759970, "step": 5898, "time_per_iteration": 4.671457052230835 }, { "auxiliary_loss_clip": 0.0116903, "auxiliary_loss_mlp": 0.0102716, "balance_loss_clip": 1.05017185, "balance_loss_mlp": 1.01993871, "epoch": 0.7093128118799976, "flos": 24206629213440.0, "grad_norm": 4.41895663099938, "language_loss": 0.70071769, "learning_rate": 8.226656311882989e-07, "loss": 0.72267962, "num_input_tokens_seen": 126779280, "step": 5899, "time_per_iteration": 2.6851987838745117 }, { "auxiliary_loss_clip": 0.01166992, "auxiliary_loss_mlp": 0.01027836, "balance_loss_clip": 1.0128969, "balance_loss_mlp": 1.02103841, "epoch": 0.7094330547706367, "flos": 16646786398080.0, "grad_norm": 2.0509747945477668, "language_loss": 0.76894599, "learning_rate": 8.22036018911129e-07, "loss": 0.79089427, "num_input_tokens_seen": 126797310, "step": 5900, "time_per_iteration": 2.6576967239379883 }, { "auxiliary_loss_clip": 0.0117305, "auxiliary_loss_mlp": 0.01027098, "balance_loss_clip": 1.05007744, "balance_loss_mlp": 1.01937938, "epoch": 0.7095532976612757, "flos": 16283370545280.0, "grad_norm": 2.1041691476585913, "language_loss": 0.80788714, "learning_rate": 8.214065853301599e-07, "loss": 0.82988864, "num_input_tokens_seen": 126812840, "step": 5901, "time_per_iteration": 2.711019515991211 }, { "auxiliary_loss_clip": 0.0106903, "auxiliary_loss_mlp": 0.01004316, "balance_loss_clip": 0.97459078, "balance_loss_mlp": 1.00263524, "epoch": 0.7096735405519149, "flos": 70722080559360.0, "grad_norm": 0.8109409424543004, "language_loss": 0.58277357, "learning_rate": 8.207773305408734e-07, "loss": 0.60350704, "num_input_tokens_seen": 126880060, "step": 5902, "time_per_iteration": 3.345893383026123 }, { "auxiliary_loss_clip": 0.01175111, "auxiliary_loss_mlp": 0.0102851, "balance_loss_clip": 0.89612919, "balance_loss_mlp": 1.02054691, "epoch": 0.709793783442554, "flos": 23621500661760.0, "grad_norm": 1.9017442482434663, "language_loss": 0.79850614, "learning_rate": 8.201482546387288e-07, "loss": 0.82054234, "num_input_tokens_seen": 126899535, "step": 5903, "time_per_iteration": 2.8722829818725586 }, { "auxiliary_loss_clip": 0.01168158, "auxiliary_loss_mlp": 0.01027196, "balance_loss_clip": 1.01143551, "balance_loss_mlp": 1.01995397, "epoch": 0.709914026333193, "flos": 25993472204160.0, "grad_norm": 1.6628508409681853, "language_loss": 0.91776419, "learning_rate": 8.195193577191553e-07, "loss": 0.93971777, "num_input_tokens_seen": 126921365, "step": 5904, "time_per_iteration": 2.8095815181732178 }, { "auxiliary_loss_clip": 0.01175331, "auxiliary_loss_mlp": 0.01122573, "balance_loss_clip": 0.97210944, "balance_loss_mlp": 0.0, "epoch": 0.7100342692238322, "flos": 24861531934080.0, "grad_norm": 1.6354240542548903, "language_loss": 0.84379637, "learning_rate": 8.188906398775579e-07, "loss": 0.86677539, "num_input_tokens_seen": 126941910, "step": 5905, "time_per_iteration": 2.7434375286102295 }, { "auxiliary_loss_clip": 0.01171718, "auxiliary_loss_mlp": 0.01123059, "balance_loss_clip": 1.04918671, "balance_loss_mlp": 0.0, "epoch": 0.7101545121144712, "flos": 24932203943040.0, "grad_norm": 1.9598862441927833, "language_loss": 0.69029748, "learning_rate": 8.18262101209311e-07, "loss": 0.71324527, "num_input_tokens_seen": 126961120, "step": 5906, "time_per_iteration": 2.718404531478882 }, { "auxiliary_loss_clip": 0.01171817, "auxiliary_loss_mlp": 0.01030102, "balance_loss_clip": 1.01005805, "balance_loss_mlp": 1.02260995, "epoch": 0.7102747550051103, "flos": 23768842250880.0, "grad_norm": 1.7250504098096784, "language_loss": 0.6994487, "learning_rate": 8.176337418097626e-07, "loss": 0.72146791, "num_input_tokens_seen": 126981590, "step": 5907, "time_per_iteration": 2.703331470489502 }, { "auxiliary_loss_clip": 0.01169385, "auxiliary_loss_mlp": 0.01122571, "balance_loss_clip": 1.01267445, "balance_loss_mlp": 0.0, "epoch": 0.7103949978957494, "flos": 15303907509120.0, "grad_norm": 2.014023357231085, "language_loss": 0.80068433, "learning_rate": 8.170055617742364e-07, "loss": 0.82360387, "num_input_tokens_seen": 126998870, "step": 5908, "time_per_iteration": 2.71642804145813 }, { "auxiliary_loss_clip": 0.01159669, "auxiliary_loss_mlp": 0.01028576, "balance_loss_clip": 0.97219861, "balance_loss_mlp": 1.02107811, "epoch": 0.7105152407863885, "flos": 22638805401600.0, "grad_norm": 1.9065550777482276, "language_loss": 0.70956564, "learning_rate": 8.163775611980252e-07, "loss": 0.73144817, "num_input_tokens_seen": 127017980, "step": 5909, "time_per_iteration": 2.7525885105133057 }, { "auxiliary_loss_clip": 0.01170122, "auxiliary_loss_mlp": 0.01025858, "balance_loss_clip": 0.97550964, "balance_loss_mlp": 1.01869941, "epoch": 0.7106354836770276, "flos": 17238594879360.0, "grad_norm": 1.8334565250291326, "language_loss": 0.7867704, "learning_rate": 8.157497401763982e-07, "loss": 0.80873019, "num_input_tokens_seen": 127035645, "step": 5910, "time_per_iteration": 2.739002227783203 }, { "auxiliary_loss_clip": 0.01167184, "auxiliary_loss_mlp": 0.01025605, "balance_loss_clip": 1.0106194, "balance_loss_mlp": 1.01858354, "epoch": 0.7107557265676667, "flos": 20193647898240.0, "grad_norm": 1.6302350768898426, "language_loss": 0.77894545, "learning_rate": 8.151220988045935e-07, "loss": 0.80087334, "num_input_tokens_seen": 127054900, "step": 5911, "time_per_iteration": 2.676457405090332 }, { "auxiliary_loss_clip": 0.01169323, "auxiliary_loss_mlp": 0.01024128, "balance_loss_clip": 1.01094282, "balance_loss_mlp": 1.01718378, "epoch": 0.7108759694583058, "flos": 21507080613120.0, "grad_norm": 1.5887572220798925, "language_loss": 0.82508272, "learning_rate": 8.144946371778234e-07, "loss": 0.84701723, "num_input_tokens_seen": 127075010, "step": 5912, "time_per_iteration": 2.6660804748535156 }, { "auxiliary_loss_clip": 0.01168925, "auxiliary_loss_mlp": 0.01123138, "balance_loss_clip": 0.9740265, "balance_loss_mlp": 0.0, "epoch": 0.7109962123489448, "flos": 24061909317120.0, "grad_norm": 1.8767358192251002, "language_loss": 0.78490686, "learning_rate": 8.138673553912751e-07, "loss": 0.80782753, "num_input_tokens_seen": 127095570, "step": 5913, "time_per_iteration": 2.7508363723754883 }, { "auxiliary_loss_clip": 0.01164749, "auxiliary_loss_mlp": 0.01027062, "balance_loss_clip": 0.89629799, "balance_loss_mlp": 1.01880634, "epoch": 0.711116455239584, "flos": 30480474326400.0, "grad_norm": 2.1323323666921614, "language_loss": 0.56836021, "learning_rate": 8.132402535401059e-07, "loss": 0.59027833, "num_input_tokens_seen": 127116825, "step": 5914, "time_per_iteration": 2.845510721206665 }, { "auxiliary_loss_clip": 0.01169804, "auxiliary_loss_mlp": 0.01030273, "balance_loss_clip": 1.01310921, "balance_loss_mlp": 1.02296507, "epoch": 0.711236698130223, "flos": 25045610158080.0, "grad_norm": 1.7169024637788088, "language_loss": 0.74214137, "learning_rate": 8.126133317194465e-07, "loss": 0.76414216, "num_input_tokens_seen": 127137015, "step": 5915, "time_per_iteration": 2.6878719329833984 }, { "auxiliary_loss_clip": 0.01173771, "auxiliary_loss_mlp": 0.01024293, "balance_loss_clip": 0.85560143, "balance_loss_mlp": 1.01697314, "epoch": 0.7113569410208621, "flos": 24206701040640.0, "grad_norm": 1.6827469666721449, "language_loss": 0.74168283, "learning_rate": 8.11986590024401e-07, "loss": 0.76366341, "num_input_tokens_seen": 127156755, "step": 5916, "time_per_iteration": 2.8942880630493164 }, { "auxiliary_loss_clip": 0.01176612, "auxiliary_loss_mlp": 0.01026973, "balance_loss_clip": 0.9759202, "balance_loss_mlp": 1.01905787, "epoch": 0.7114771839115013, "flos": 35439306526080.0, "grad_norm": 1.7075327381142746, "language_loss": 0.68934882, "learning_rate": 8.113600285500442e-07, "loss": 0.71138465, "num_input_tokens_seen": 127176965, "step": 5917, "time_per_iteration": 2.82381010055542 }, { "auxiliary_loss_clip": 0.01170004, "auxiliary_loss_mlp": 0.01023591, "balance_loss_clip": 1.04866219, "balance_loss_mlp": 1.01667058, "epoch": 0.7115974268021403, "flos": 21099458096640.0, "grad_norm": 1.7748102385705673, "language_loss": 0.74281323, "learning_rate": 8.107336473914268e-07, "loss": 0.76474917, "num_input_tokens_seen": 127195595, "step": 5918, "time_per_iteration": 2.6052701473236084 }, { "auxiliary_loss_clip": 0.01068863, "auxiliary_loss_mlp": 0.01002015, "balance_loss_clip": 0.9378944, "balance_loss_mlp": 1.00022686, "epoch": 0.7117176696927794, "flos": 56752866616320.0, "grad_norm": 0.7752514441473022, "language_loss": 0.5575887, "learning_rate": 8.101074466435694e-07, "loss": 0.57829744, "num_input_tokens_seen": 127255070, "step": 5919, "time_per_iteration": 4.2026286125183105 }, { "auxiliary_loss_clip": 0.01160409, "auxiliary_loss_mlp": 0.01025311, "balance_loss_clip": 1.0082109, "balance_loss_mlp": 1.01772332, "epoch": 0.7118379125834186, "flos": 15925269905280.0, "grad_norm": 1.5823477582079888, "language_loss": 0.67645508, "learning_rate": 8.094814264014662e-07, "loss": 0.69831228, "num_input_tokens_seen": 127273825, "step": 5920, "time_per_iteration": 2.6529088020324707 }, { "auxiliary_loss_clip": 0.01172127, "auxiliary_loss_mlp": 0.01025419, "balance_loss_clip": 1.0475657, "balance_loss_mlp": 1.01780748, "epoch": 0.7119581554740576, "flos": 20193360589440.0, "grad_norm": 1.9692109054014901, "language_loss": 0.81208956, "learning_rate": 8.088555867600844e-07, "loss": 0.83406496, "num_input_tokens_seen": 127289990, "step": 5921, "time_per_iteration": 2.5994441509246826 }, { "auxiliary_loss_clip": 0.01160366, "auxiliary_loss_mlp": 0.01024029, "balance_loss_clip": 0.93278289, "balance_loss_mlp": 1.01726127, "epoch": 0.7120783983646967, "flos": 34715383822080.0, "grad_norm": 1.9054456173444487, "language_loss": 0.60513484, "learning_rate": 8.08229927814362e-07, "loss": 0.62697881, "num_input_tokens_seen": 127312880, "step": 5922, "time_per_iteration": 2.8194217681884766 }, { "auxiliary_loss_clip": 0.01163209, "auxiliary_loss_mlp": 0.01028028, "balance_loss_clip": 0.93251055, "balance_loss_mlp": 1.02101588, "epoch": 0.7121986412553358, "flos": 26359114700160.0, "grad_norm": 1.5965959803290617, "language_loss": 0.64829457, "learning_rate": 8.076044496592134e-07, "loss": 0.67020702, "num_input_tokens_seen": 127334730, "step": 5923, "time_per_iteration": 3.7803969383239746 }, { "auxiliary_loss_clip": 0.01165834, "auxiliary_loss_mlp": 0.01021134, "balance_loss_clip": 0.97311211, "balance_loss_mlp": 1.01396918, "epoch": 0.7123188841459749, "flos": 11145344371200.0, "grad_norm": 2.3117311586740414, "language_loss": 0.77444577, "learning_rate": 8.069791523895204e-07, "loss": 0.79631543, "num_input_tokens_seen": 127351180, "step": 5924, "time_per_iteration": 3.5756607055664062 }, { "auxiliary_loss_clip": 0.01153825, "auxiliary_loss_mlp": 0.0102548, "balance_loss_clip": 0.93092233, "balance_loss_mlp": 1.01813114, "epoch": 0.7124391270366139, "flos": 20811670329600.0, "grad_norm": 2.2254767943387552, "language_loss": 0.77244496, "learning_rate": 8.063540361001422e-07, "loss": 0.79423809, "num_input_tokens_seen": 127369750, "step": 5925, "time_per_iteration": 3.6369428634643555 }, { "auxiliary_loss_clip": 0.01159222, "auxiliary_loss_mlp": 0.01030127, "balance_loss_clip": 0.932473, "balance_loss_mlp": 1.02259922, "epoch": 0.7125593699272531, "flos": 17603734584960.0, "grad_norm": 2.1132131437987884, "language_loss": 0.79266846, "learning_rate": 8.057291008859069e-07, "loss": 0.81456196, "num_input_tokens_seen": 127387910, "step": 5926, "time_per_iteration": 2.7362940311431885 }, { "auxiliary_loss_clip": 0.01164135, "auxiliary_loss_mlp": 0.01024666, "balance_loss_clip": 1.00918627, "balance_loss_mlp": 1.01773095, "epoch": 0.7126796128178922, "flos": 28654057526400.0, "grad_norm": 1.830903629551356, "language_loss": 0.68592489, "learning_rate": 8.051043468416187e-07, "loss": 0.70781291, "num_input_tokens_seen": 127409160, "step": 5927, "time_per_iteration": 2.714681625366211 }, { "auxiliary_loss_clip": 0.01168816, "auxiliary_loss_mlp": 0.01025691, "balance_loss_clip": 1.05068278, "balance_loss_mlp": 1.01813889, "epoch": 0.7127998557085312, "flos": 16034438315520.0, "grad_norm": 2.4408012104584835, "language_loss": 0.82094377, "learning_rate": 8.044797740620506e-07, "loss": 0.84288883, "num_input_tokens_seen": 127427765, "step": 5928, "time_per_iteration": 2.6488146781921387 }, { "auxiliary_loss_clip": 0.01157474, "auxiliary_loss_mlp": 0.01028566, "balance_loss_clip": 0.89526492, "balance_loss_mlp": 1.02155697, "epoch": 0.7129200985991703, "flos": 23403271582080.0, "grad_norm": 3.1356667880553863, "language_loss": 0.78692281, "learning_rate": 8.038553826419494e-07, "loss": 0.80878317, "num_input_tokens_seen": 127446475, "step": 5929, "time_per_iteration": 2.8464202880859375 }, { "auxiliary_loss_clip": 0.01167875, "auxiliary_loss_mlp": 0.01023502, "balance_loss_clip": 1.04730272, "balance_loss_mlp": 1.0160929, "epoch": 0.7130403414898094, "flos": 21397445326080.0, "grad_norm": 1.7058054767902606, "language_loss": 0.80758494, "learning_rate": 8.032311726760364e-07, "loss": 0.82949871, "num_input_tokens_seen": 127467695, "step": 5930, "time_per_iteration": 2.711348533630371 }, { "auxiliary_loss_clip": 0.01154256, "auxiliary_loss_mlp": 0.01021857, "balance_loss_clip": 0.93120718, "balance_loss_mlp": 1.01407254, "epoch": 0.7131605843804485, "flos": 74739045306240.0, "grad_norm": 1.6630356142812075, "language_loss": 0.68754327, "learning_rate": 8.026071442590022e-07, "loss": 0.70930439, "num_input_tokens_seen": 127494590, "step": 5931, "time_per_iteration": 3.2046408653259277 }, { "auxiliary_loss_clip": 0.01168478, "auxiliary_loss_mlp": 0.01025318, "balance_loss_clip": 1.01260567, "balance_loss_mlp": 1.01816809, "epoch": 0.7132808272710875, "flos": 18368739469440.0, "grad_norm": 1.911178849658274, "language_loss": 0.80404222, "learning_rate": 8.019832974855134e-07, "loss": 0.82598019, "num_input_tokens_seen": 127512550, "step": 5932, "time_per_iteration": 2.7065742015838623 }, { "auxiliary_loss_clip": 0.01164533, "auxiliary_loss_mlp": 0.01022781, "balance_loss_clip": 0.93417442, "balance_loss_mlp": 1.01553297, "epoch": 0.7134010701617267, "flos": 23253380127360.0, "grad_norm": 2.292368359581844, "language_loss": 0.82582188, "learning_rate": 8.013596324502052e-07, "loss": 0.84769505, "num_input_tokens_seen": 127531015, "step": 5933, "time_per_iteration": 2.7575085163116455 }, { "auxiliary_loss_clip": 0.01163989, "auxiliary_loss_mlp": 0.01020373, "balance_loss_clip": 1.01283288, "balance_loss_mlp": 1.01353335, "epoch": 0.7135213130523658, "flos": 23653137565440.0, "grad_norm": 1.74484032963186, "language_loss": 0.78520823, "learning_rate": 8.007361492476872e-07, "loss": 0.8070519, "num_input_tokens_seen": 127550340, "step": 5934, "time_per_iteration": 2.7539103031158447 }, { "auxiliary_loss_clip": 0.01173271, "auxiliary_loss_mlp": 0.01025204, "balance_loss_clip": 0.93179661, "balance_loss_mlp": 1.01788449, "epoch": 0.7136415559430048, "flos": 24790644443520.0, "grad_norm": 1.398344750094204, "language_loss": 0.78635561, "learning_rate": 8.001128479725426e-07, "loss": 0.80834037, "num_input_tokens_seen": 127572245, "step": 5935, "time_per_iteration": 2.8056774139404297 }, { "auxiliary_loss_clip": 0.01156721, "auxiliary_loss_mlp": 0.01034073, "balance_loss_clip": 0.89376771, "balance_loss_mlp": 1.02676558, "epoch": 0.713761798833644, "flos": 18296954138880.0, "grad_norm": 1.506721878765777, "language_loss": 0.80682135, "learning_rate": 7.994897287193248e-07, "loss": 0.82872933, "num_input_tokens_seen": 127591625, "step": 5936, "time_per_iteration": 2.8417317867279053 }, { "auxiliary_loss_clip": 0.01171155, "auxiliary_loss_mlp": 0.01020909, "balance_loss_clip": 1.01057935, "balance_loss_mlp": 1.01373887, "epoch": 0.713882041724283, "flos": 15558262692480.0, "grad_norm": 2.26752691718704, "language_loss": 0.8347832, "learning_rate": 7.988667915825605e-07, "loss": 0.85670388, "num_input_tokens_seen": 127608690, "step": 5937, "time_per_iteration": 2.7215418815612793 }, { "auxiliary_loss_clip": 0.01166566, "auxiliary_loss_mlp": 0.01021873, "balance_loss_clip": 0.97219211, "balance_loss_mlp": 1.01429772, "epoch": 0.7140022846149221, "flos": 24061011477120.0, "grad_norm": 2.1813577587380877, "language_loss": 0.75630665, "learning_rate": 7.982440366567491e-07, "loss": 0.77819109, "num_input_tokens_seen": 127627180, "step": 5938, "time_per_iteration": 2.7013943195343018 }, { "auxiliary_loss_clip": 0.01160037, "auxiliary_loss_mlp": 0.01024154, "balance_loss_clip": 1.00845695, "balance_loss_mlp": 1.01771402, "epoch": 0.7141225275055613, "flos": 27891710248320.0, "grad_norm": 1.5197872155756567, "language_loss": 0.75230145, "learning_rate": 7.97621464036361e-07, "loss": 0.7741434, "num_input_tokens_seen": 127648940, "step": 5939, "time_per_iteration": 2.719865322113037 }, { "auxiliary_loss_clip": 0.01173227, "auxiliary_loss_mlp": 0.01020544, "balance_loss_clip": 1.0125277, "balance_loss_mlp": 1.01308155, "epoch": 0.7142427703962003, "flos": 19682603147520.0, "grad_norm": 1.5287423981354444, "language_loss": 0.67951965, "learning_rate": 7.969990738158417e-07, "loss": 0.70145732, "num_input_tokens_seen": 127667350, "step": 5940, "time_per_iteration": 2.653977394104004 }, { "auxiliary_loss_clip": 0.01170814, "auxiliary_loss_mlp": 0.01027235, "balance_loss_clip": 1.01324689, "balance_loss_mlp": 1.02010918, "epoch": 0.7143630132868394, "flos": 21032377447680.0, "grad_norm": 1.9175818729574465, "language_loss": 0.85229999, "learning_rate": 7.963768660896062e-07, "loss": 0.87428045, "num_input_tokens_seen": 127685760, "step": 5941, "time_per_iteration": 2.784510850906372 }, { "auxiliary_loss_clip": 0.0117063, "auxiliary_loss_mlp": 0.01026546, "balance_loss_clip": 1.01076412, "balance_loss_mlp": 1.01963758, "epoch": 0.7144832561774785, "flos": 24129923719680.0, "grad_norm": 2.0245623226238405, "language_loss": 0.82278752, "learning_rate": 7.957548409520432e-07, "loss": 0.84475929, "num_input_tokens_seen": 127704985, "step": 5942, "time_per_iteration": 2.7111363410949707 }, { "auxiliary_loss_clip": 0.01163902, "auxiliary_loss_mlp": 0.01024327, "balance_loss_clip": 0.93138313, "balance_loss_mlp": 1.0171628, "epoch": 0.7146034990681176, "flos": 16325817442560.0, "grad_norm": 1.8856742014989611, "language_loss": 0.83878368, "learning_rate": 7.951329984975135e-07, "loss": 0.86066592, "num_input_tokens_seen": 127721925, "step": 5943, "time_per_iteration": 2.68186616897583 }, { "auxiliary_loss_clip": 0.01075304, "auxiliary_loss_mlp": 0.01001639, "balance_loss_clip": 0.89916396, "balance_loss_mlp": 0.99991095, "epoch": 0.7147237419587567, "flos": 69627164232960.0, "grad_norm": 0.7134818883717962, "language_loss": 0.54319298, "learning_rate": 7.94511338820349e-07, "loss": 0.5639624, "num_input_tokens_seen": 127784230, "step": 5944, "time_per_iteration": 3.3168623447418213 }, { "auxiliary_loss_clip": 0.01166158, "auxiliary_loss_mlp": 0.01123273, "balance_loss_clip": 0.97269589, "balance_loss_mlp": 0.0, "epoch": 0.7148439848493958, "flos": 22266806198400.0, "grad_norm": 2.3602561117818377, "language_loss": 0.7823168, "learning_rate": 7.938898620148575e-07, "loss": 0.80521113, "num_input_tokens_seen": 127801990, "step": 5945, "time_per_iteration": 2.7495954036712646 }, { "auxiliary_loss_clip": 0.01166963, "auxiliary_loss_mlp": 0.0102777, "balance_loss_clip": 0.97248161, "balance_loss_mlp": 1.02039671, "epoch": 0.7149642277400349, "flos": 17931383470080.0, "grad_norm": 1.750084817748359, "language_loss": 0.70769453, "learning_rate": 7.932685681753135e-07, "loss": 0.72964185, "num_input_tokens_seen": 127819270, "step": 5946, "time_per_iteration": 3.615046977996826 }, { "auxiliary_loss_clip": 0.01166714, "auxiliary_loss_mlp": 0.01025664, "balance_loss_clip": 1.05023694, "balance_loss_mlp": 1.01854682, "epoch": 0.7150844706306739, "flos": 31681937370240.0, "grad_norm": 1.9003213676499648, "language_loss": 0.62898803, "learning_rate": 7.92647457395969e-07, "loss": 0.65091181, "num_input_tokens_seen": 127841095, "step": 5947, "time_per_iteration": 2.748526096343994 }, { "auxiliary_loss_clip": 0.01166481, "auxiliary_loss_mlp": 0.01026008, "balance_loss_clip": 0.85298073, "balance_loss_mlp": 1.0179224, "epoch": 0.7152047135213131, "flos": 10926217451520.0, "grad_norm": 2.36992648927277, "language_loss": 0.74051678, "learning_rate": 7.920265297710444e-07, "loss": 0.76244164, "num_input_tokens_seen": 127858485, "step": 5948, "time_per_iteration": 2.803330659866333 }, { "auxiliary_loss_clip": 0.01170628, "auxiliary_loss_mlp": 0.01027329, "balance_loss_clip": 1.01069927, "balance_loss_mlp": 1.01985478, "epoch": 0.7153249564119522, "flos": 20995640812800.0, "grad_norm": 1.7650515614235138, "language_loss": 0.73289055, "learning_rate": 7.914057853947363e-07, "loss": 0.75487012, "num_input_tokens_seen": 127877665, "step": 5949, "time_per_iteration": 2.661832094192505 }, { "auxiliary_loss_clip": 0.01164049, "auxiliary_loss_mlp": 0.01029781, "balance_loss_clip": 0.93339187, "balance_loss_mlp": 1.02234876, "epoch": 0.7154451993025912, "flos": 24243114453120.0, "grad_norm": 2.068368503800279, "language_loss": 0.6269511, "learning_rate": 7.907852243612089e-07, "loss": 0.64888936, "num_input_tokens_seen": 127898070, "step": 5950, "time_per_iteration": 4.575345277786255 }, { "auxiliary_loss_clip": 0.01162358, "auxiliary_loss_mlp": 0.01026232, "balance_loss_clip": 0.97084928, "balance_loss_mlp": 1.01912117, "epoch": 0.7155654421932304, "flos": 23330947547520.0, "grad_norm": 1.6995014280199843, "language_loss": 0.72105473, "learning_rate": 7.901648467646009e-07, "loss": 0.74294066, "num_input_tokens_seen": 127917010, "step": 5951, "time_per_iteration": 3.615713119506836 }, { "auxiliary_loss_clip": 0.01170261, "auxiliary_loss_mlp": 0.01027067, "balance_loss_clip": 1.04847443, "balance_loss_mlp": 1.01909244, "epoch": 0.7156856850838694, "flos": 22711883621760.0, "grad_norm": 2.6465682542545057, "language_loss": 0.7237041, "learning_rate": 7.895446526990244e-07, "loss": 0.74567747, "num_input_tokens_seen": 127937025, "step": 5952, "time_per_iteration": 2.643151044845581 }, { "auxiliary_loss_clip": 0.01171731, "auxiliary_loss_mlp": 0.010248, "balance_loss_clip": 0.89419955, "balance_loss_mlp": 1.01700962, "epoch": 0.7158059279745085, "flos": 19865424395520.0, "grad_norm": 1.5422631297139948, "language_loss": 0.75588274, "learning_rate": 7.889246422585609e-07, "loss": 0.77784801, "num_input_tokens_seen": 127956410, "step": 5953, "time_per_iteration": 2.713472843170166 }, { "auxiliary_loss_clip": 0.01170493, "auxiliary_loss_mlp": 0.01026989, "balance_loss_clip": 1.04969382, "balance_loss_mlp": 1.01958609, "epoch": 0.7159261708651476, "flos": 24134772055680.0, "grad_norm": 3.115269906363137, "language_loss": 0.73624718, "learning_rate": 7.883048155372675e-07, "loss": 0.75822198, "num_input_tokens_seen": 127974925, "step": 5954, "time_per_iteration": 2.6487700939178467 }, { "auxiliary_loss_clip": 0.01173544, "auxiliary_loss_mlp": 0.01024122, "balance_loss_clip": 0.97357553, "balance_loss_mlp": 1.01691556, "epoch": 0.7160464137557867, "flos": 16983198201600.0, "grad_norm": 2.2874799635141407, "language_loss": 0.71286714, "learning_rate": 7.876851726291698e-07, "loss": 0.73484385, "num_input_tokens_seen": 127993225, "step": 5955, "time_per_iteration": 2.593489408493042 }, { "auxiliary_loss_clip": 0.01169221, "auxiliary_loss_mlp": 0.01030014, "balance_loss_clip": 0.93255359, "balance_loss_mlp": 1.0223906, "epoch": 0.7161666566464258, "flos": 25228251838080.0, "grad_norm": 1.7005645013740507, "language_loss": 0.78346884, "learning_rate": 7.870657136282666e-07, "loss": 0.80546117, "num_input_tokens_seen": 128012085, "step": 5956, "time_per_iteration": 2.7173421382904053 }, { "auxiliary_loss_clip": 0.01160118, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.00864506, "balance_loss_mlp": 1.02374744, "epoch": 0.7162868995370649, "flos": 26468390851200.0, "grad_norm": 1.4903908026683685, "language_loss": 0.81969571, "learning_rate": 7.86446438628531e-07, "loss": 0.84160757, "num_input_tokens_seen": 128033155, "step": 5957, "time_per_iteration": 2.679100513458252 }, { "auxiliary_loss_clip": 0.01064471, "auxiliary_loss_mlp": 0.0100319, "balance_loss_clip": 1.01142502, "balance_loss_mlp": 1.00150955, "epoch": 0.716407142427704, "flos": 69998912040960.0, "grad_norm": 0.7674527856937771, "language_loss": 0.56927282, "learning_rate": 7.858273477239059e-07, "loss": 0.58994943, "num_input_tokens_seen": 128101575, "step": 5958, "time_per_iteration": 3.2694151401519775 }, { "auxiliary_loss_clip": 0.01150708, "auxiliary_loss_mlp": 0.01025255, "balance_loss_clip": 0.89402497, "balance_loss_mlp": 1.01785851, "epoch": 0.716527385318343, "flos": 20740459616640.0, "grad_norm": 1.7307242685740476, "language_loss": 0.71501195, "learning_rate": 7.852084410083067e-07, "loss": 0.73677164, "num_input_tokens_seen": 128120395, "step": 5959, "time_per_iteration": 2.788404703140259 }, { "auxiliary_loss_clip": 0.01161346, "auxiliary_loss_mlp": 0.01024067, "balance_loss_clip": 0.97177684, "balance_loss_mlp": 1.01768637, "epoch": 0.7166476282089821, "flos": 25371966153600.0, "grad_norm": 1.7258040522047131, "language_loss": 0.63797563, "learning_rate": 7.84589718575621e-07, "loss": 0.65982974, "num_input_tokens_seen": 128140840, "step": 5960, "time_per_iteration": 2.7616562843322754 }, { "auxiliary_loss_clip": 0.01165188, "auxiliary_loss_mlp": 0.01024918, "balance_loss_clip": 0.96771103, "balance_loss_mlp": 1.01793206, "epoch": 0.7167678710996213, "flos": 24133730561280.0, "grad_norm": 1.8577517841191349, "language_loss": 0.68982649, "learning_rate": 7.83971180519708e-07, "loss": 0.71172762, "num_input_tokens_seen": 128159695, "step": 5961, "time_per_iteration": 2.731534957885742 }, { "auxiliary_loss_clip": 0.01171678, "auxiliary_loss_mlp": 0.01026617, "balance_loss_clip": 1.05057716, "balance_loss_mlp": 1.01935995, "epoch": 0.7168881139902603, "flos": 30226586019840.0, "grad_norm": 1.9054867289327506, "language_loss": 0.75520051, "learning_rate": 7.833528269344008e-07, "loss": 0.77718353, "num_input_tokens_seen": 128179600, "step": 5962, "time_per_iteration": 2.648725986480713 }, { "auxiliary_loss_clip": 0.01165206, "auxiliary_loss_mlp": 0.01031672, "balance_loss_clip": 0.93458378, "balance_loss_mlp": 1.02400696, "epoch": 0.7170083568808994, "flos": 14606414236800.0, "grad_norm": 2.158352637637883, "language_loss": 0.77439654, "learning_rate": 7.827346579135023e-07, "loss": 0.79636526, "num_input_tokens_seen": 128196940, "step": 5963, "time_per_iteration": 2.74403977394104 }, { "auxiliary_loss_clip": 0.01161819, "auxiliary_loss_mlp": 0.010259, "balance_loss_clip": 0.97034121, "balance_loss_mlp": 1.01818991, "epoch": 0.7171285997715385, "flos": 23331091201920.0, "grad_norm": 2.3491469915688494, "language_loss": 0.82899415, "learning_rate": 7.821166735507885e-07, "loss": 0.85087132, "num_input_tokens_seen": 128215970, "step": 5964, "time_per_iteration": 2.694885015487671 }, { "auxiliary_loss_clip": 0.01170835, "auxiliary_loss_mlp": 0.01023663, "balance_loss_clip": 1.0503006, "balance_loss_mlp": 1.01664138, "epoch": 0.7172488426621776, "flos": 16543543731840.0, "grad_norm": 1.6191863174410788, "language_loss": 0.68897313, "learning_rate": 7.81498873940007e-07, "loss": 0.71091813, "num_input_tokens_seen": 128233185, "step": 5965, "time_per_iteration": 2.6705281734466553 }, { "auxiliary_loss_clip": 0.01170875, "auxiliary_loss_mlp": 0.01022898, "balance_loss_clip": 1.00702214, "balance_loss_mlp": 1.01548934, "epoch": 0.7173690855528166, "flos": 26541612725760.0, "grad_norm": 1.9573976680866985, "language_loss": 0.77341789, "learning_rate": 7.808812591748768e-07, "loss": 0.79535556, "num_input_tokens_seen": 128253565, "step": 5966, "time_per_iteration": 2.6678662300109863 }, { "auxiliary_loss_clip": 0.01161199, "auxiliary_loss_mlp": 0.01025835, "balance_loss_clip": 0.93327028, "balance_loss_mlp": 1.01765108, "epoch": 0.7174893284434558, "flos": 22784099915520.0, "grad_norm": 5.441016320482679, "language_loss": 0.65402806, "learning_rate": 7.802638293490915e-07, "loss": 0.67589837, "num_input_tokens_seen": 128273210, "step": 5967, "time_per_iteration": 2.699707269668579 }, { "auxiliary_loss_clip": 0.01167604, "auxiliary_loss_mlp": 0.01022888, "balance_loss_clip": 0.97116297, "balance_loss_mlp": 1.01629901, "epoch": 0.7176095713340949, "flos": 23293564467840.0, "grad_norm": 1.5894874863636954, "language_loss": 0.77104801, "learning_rate": 7.796465845563123e-07, "loss": 0.7929529, "num_input_tokens_seen": 128292085, "step": 5968, "time_per_iteration": 2.7509653568267822 }, { "auxiliary_loss_clip": 0.01161576, "auxiliary_loss_mlp": 0.01122447, "balance_loss_clip": 0.9710685, "balance_loss_mlp": 0.0, "epoch": 0.7177298142247339, "flos": 25591631777280.0, "grad_norm": 1.7541450220167043, "language_loss": 0.79392421, "learning_rate": 7.790295248901766e-07, "loss": 0.81676435, "num_input_tokens_seen": 128313215, "step": 5969, "time_per_iteration": 2.731595993041992 }, { "auxiliary_loss_clip": 0.01164118, "auxiliary_loss_mlp": 0.01022662, "balance_loss_clip": 1.0074439, "balance_loss_mlp": 1.01579845, "epoch": 0.7178500571153731, "flos": 31652778504960.0, "grad_norm": 2.729360460139407, "language_loss": 0.62436879, "learning_rate": 7.784126504442902e-07, "loss": 0.64623654, "num_input_tokens_seen": 128336445, "step": 5970, "time_per_iteration": 2.751990795135498 }, { "auxiliary_loss_clip": 0.0115774, "auxiliary_loss_mlp": 0.01025766, "balance_loss_clip": 0.93396223, "balance_loss_mlp": 1.01825595, "epoch": 0.7179703000060121, "flos": 19427242383360.0, "grad_norm": 1.375604346058042, "language_loss": 0.67778313, "learning_rate": 7.777959613122351e-07, "loss": 0.69961816, "num_input_tokens_seen": 128356270, "step": 5971, "time_per_iteration": 2.7335317134857178 }, { "auxiliary_loss_clip": 0.01160288, "auxiliary_loss_mlp": 0.01028313, "balance_loss_clip": 0.9723897, "balance_loss_mlp": 1.02174461, "epoch": 0.7180905428966512, "flos": 28839249072000.0, "grad_norm": 1.8386052092834513, "language_loss": 0.77489895, "learning_rate": 7.771794575875604e-07, "loss": 0.796785, "num_input_tokens_seen": 128378140, "step": 5972, "time_per_iteration": 3.7282068729400635 }, { "auxiliary_loss_clip": 0.01168227, "auxiliary_loss_mlp": 0.01030424, "balance_loss_clip": 1.01079524, "balance_loss_mlp": 1.02314377, "epoch": 0.7182107857872904, "flos": 20047563285120.0, "grad_norm": 2.7380550344136587, "language_loss": 0.77744782, "learning_rate": 7.765631393637888e-07, "loss": 0.7994343, "num_input_tokens_seen": 128396335, "step": 5973, "time_per_iteration": 2.6499505043029785 }, { "auxiliary_loss_clip": 0.01160916, "auxiliary_loss_mlp": 0.01024478, "balance_loss_clip": 1.00717735, "balance_loss_mlp": 1.01714313, "epoch": 0.7183310286779294, "flos": 22747686503040.0, "grad_norm": 2.967518930171413, "language_loss": 0.4844487, "learning_rate": 7.75947006734417e-07, "loss": 0.50630265, "num_input_tokens_seen": 128414115, "step": 5974, "time_per_iteration": 2.722121238708496 }, { "auxiliary_loss_clip": 0.01168932, "auxiliary_loss_mlp": 0.0102662, "balance_loss_clip": 1.04773045, "balance_loss_mlp": 1.01939344, "epoch": 0.7184512715685685, "flos": 17158262112000.0, "grad_norm": 2.615681143278034, "language_loss": 0.82718396, "learning_rate": 7.753310597929101e-07, "loss": 0.84913945, "num_input_tokens_seen": 128430755, "step": 5975, "time_per_iteration": 3.6207034587860107 }, { "auxiliary_loss_clip": 0.01063992, "auxiliary_loss_mlp": 0.01002336, "balance_loss_clip": 1.01107001, "balance_loss_mlp": 1.00071454, "epoch": 0.7185715144592076, "flos": 65509611448320.0, "grad_norm": 0.7579113342173114, "language_loss": 0.55088472, "learning_rate": 7.747152986327095e-07, "loss": 0.57154799, "num_input_tokens_seen": 128491300, "step": 5976, "time_per_iteration": 3.9474401473999023 }, { "auxiliary_loss_clip": 0.01168287, "auxiliary_loss_mlp": 0.01028009, "balance_loss_clip": 0.89485931, "balance_loss_mlp": 1.02077281, "epoch": 0.7186917573498467, "flos": 16180522928640.0, "grad_norm": 1.9604808810351924, "language_loss": 0.68114436, "learning_rate": 7.740997233472228e-07, "loss": 0.7031073, "num_input_tokens_seen": 128508920, "step": 5977, "time_per_iteration": 3.6220667362213135 }, { "auxiliary_loss_clip": 0.01165064, "auxiliary_loss_mlp": 0.01021572, "balance_loss_clip": 0.97035265, "balance_loss_mlp": 1.01474094, "epoch": 0.7188120002404857, "flos": 29242274647680.0, "grad_norm": 1.9222076596784017, "language_loss": 0.70890605, "learning_rate": 7.734843340298329e-07, "loss": 0.73077244, "num_input_tokens_seen": 128528745, "step": 5978, "time_per_iteration": 2.7332510948181152 }, { "auxiliary_loss_clip": 0.01172177, "auxiliary_loss_mlp": 0.01026785, "balance_loss_clip": 0.97124046, "balance_loss_mlp": 1.01925099, "epoch": 0.7189322431311249, "flos": 33401161008000.0, "grad_norm": 2.224996591295898, "language_loss": 0.74996555, "learning_rate": 7.72869130773895e-07, "loss": 0.77195513, "num_input_tokens_seen": 128549345, "step": 5979, "time_per_iteration": 2.782536506652832 }, { "auxiliary_loss_clip": 0.01067289, "auxiliary_loss_mlp": 0.0100211, "balance_loss_clip": 0.973508, "balance_loss_mlp": 1.00041699, "epoch": 0.719052486021764, "flos": 61351263792000.0, "grad_norm": 0.8178059965715253, "language_loss": 0.59364504, "learning_rate": 7.722541136727343e-07, "loss": 0.61433899, "num_input_tokens_seen": 128605360, "step": 5980, "time_per_iteration": 3.075922966003418 }, { "auxiliary_loss_clip": 0.01167205, "auxiliary_loss_mlp": 0.01023005, "balance_loss_clip": 1.00996256, "balance_loss_mlp": 1.01544738, "epoch": 0.719172728912403, "flos": 15596795007360.0, "grad_norm": 1.9171738794588047, "language_loss": 0.80183601, "learning_rate": 7.716392828196483e-07, "loss": 0.8237381, "num_input_tokens_seen": 128623160, "step": 5981, "time_per_iteration": 2.6321747303009033 }, { "auxiliary_loss_clip": 0.01168232, "auxiliary_loss_mlp": 0.01032765, "balance_loss_clip": 1.01094186, "balance_loss_mlp": 1.02521324, "epoch": 0.7192929718030422, "flos": 15553162961280.0, "grad_norm": 12.33746865964735, "language_loss": 0.77087903, "learning_rate": 7.710246383079064e-07, "loss": 0.792889, "num_input_tokens_seen": 128638545, "step": 5982, "time_per_iteration": 2.6050546169281006 }, { "auxiliary_loss_clip": 0.0116545, "auxiliary_loss_mlp": 0.01027642, "balance_loss_clip": 0.96901375, "balance_loss_mlp": 1.02057242, "epoch": 0.7194132146936812, "flos": 21862487733120.0, "grad_norm": 2.4933195995171693, "language_loss": 0.92384493, "learning_rate": 7.704101802307492e-07, "loss": 0.94577587, "num_input_tokens_seen": 128650845, "step": 5983, "time_per_iteration": 2.6627719402313232 }, { "auxiliary_loss_clip": 0.01154336, "auxiliary_loss_mlp": 0.01030017, "balance_loss_clip": 0.9313767, "balance_loss_mlp": 1.02226257, "epoch": 0.7195334575843203, "flos": 27338900958720.0, "grad_norm": 2.0267394482219334, "language_loss": 0.87286311, "learning_rate": 7.697959086813912e-07, "loss": 0.89470667, "num_input_tokens_seen": 128667010, "step": 5984, "time_per_iteration": 2.7594470977783203 }, { "auxiliary_loss_clip": 0.01152696, "auxiliary_loss_mlp": 0.01022862, "balance_loss_clip": 0.92934668, "balance_loss_mlp": 1.01607013, "epoch": 0.7196537004749595, "flos": 18770615809920.0, "grad_norm": 1.508026135950106, "language_loss": 0.79777324, "learning_rate": 7.691818237530145e-07, "loss": 0.81952882, "num_input_tokens_seen": 128685870, "step": 5985, "time_per_iteration": 2.681849479675293 }, { "auxiliary_loss_clip": 0.01175096, "auxiliary_loss_mlp": 0.01025133, "balance_loss_clip": 0.89475691, "balance_loss_mlp": 1.01818967, "epoch": 0.7197739433655985, "flos": 24531009960960.0, "grad_norm": 1.6868219143426835, "language_loss": 0.77604818, "learning_rate": 7.685679255387774e-07, "loss": 0.79805052, "num_input_tokens_seen": 128704185, "step": 5986, "time_per_iteration": 2.737743377685547 }, { "auxiliary_loss_clip": 0.0116604, "auxiliary_loss_mlp": 0.01025458, "balance_loss_clip": 0.97452676, "balance_loss_mlp": 1.01844001, "epoch": 0.7198941862562376, "flos": 18040587793920.0, "grad_norm": 1.7992371205862039, "language_loss": 0.76866382, "learning_rate": 7.679542141318065e-07, "loss": 0.79057878, "num_input_tokens_seen": 128721290, "step": 5987, "time_per_iteration": 2.65386962890625 }, { "auxiliary_loss_clip": 0.01150166, "auxiliary_loss_mlp": 0.01026342, "balance_loss_clip": 0.96778989, "balance_loss_mlp": 1.0192852, "epoch": 0.7200144291468767, "flos": 29022393542400.0, "grad_norm": 2.0901184783078373, "language_loss": 0.75892651, "learning_rate": 7.673406896252013e-07, "loss": 0.78069162, "num_input_tokens_seen": 128742665, "step": 5988, "time_per_iteration": 2.7222819328308105 }, { "auxiliary_loss_clip": 0.01160616, "auxiliary_loss_mlp": 0.01026773, "balance_loss_clip": 0.93176353, "balance_loss_mlp": 1.01934958, "epoch": 0.7201346720375158, "flos": 25374264624000.0, "grad_norm": 1.512944503423628, "language_loss": 0.78188258, "learning_rate": 7.667273521120347e-07, "loss": 0.80375648, "num_input_tokens_seen": 128762225, "step": 5989, "time_per_iteration": 2.7240378856658936 }, { "auxiliary_loss_clip": 0.0116969, "auxiliary_loss_mlp": 0.01022069, "balance_loss_clip": 0.93466645, "balance_loss_mlp": 1.01470172, "epoch": 0.7202549149281549, "flos": 14355614499840.0, "grad_norm": 1.932607883357991, "language_loss": 0.7937845, "learning_rate": 7.661142016853468e-07, "loss": 0.81570208, "num_input_tokens_seen": 128779585, "step": 5990, "time_per_iteration": 2.6483469009399414 }, { "auxiliary_loss_clip": 0.01163871, "auxiliary_loss_mlp": 0.0102279, "balance_loss_clip": 0.89604676, "balance_loss_mlp": 1.01567364, "epoch": 0.7203751578187939, "flos": 23001682550400.0, "grad_norm": 2.0726915182496066, "language_loss": 0.74918747, "learning_rate": 7.655012384381543e-07, "loss": 0.77105409, "num_input_tokens_seen": 128799070, "step": 5991, "time_per_iteration": 2.7471606731414795 }, { "auxiliary_loss_clip": 0.01165388, "auxiliary_loss_mlp": 0.01031233, "balance_loss_clip": 0.97498369, "balance_loss_mlp": 1.02363932, "epoch": 0.7204954007094331, "flos": 23692424065920.0, "grad_norm": 1.8523360567322353, "language_loss": 0.81833923, "learning_rate": 7.648884624634415e-07, "loss": 0.84030545, "num_input_tokens_seen": 128817620, "step": 5992, "time_per_iteration": 2.6175625324249268 }, { "auxiliary_loss_clip": 0.01166159, "auxiliary_loss_mlp": 0.01023199, "balance_loss_clip": 1.0110482, "balance_loss_mlp": 1.01661873, "epoch": 0.7206156436000721, "flos": 16253026531200.0, "grad_norm": 2.0099038253897525, "language_loss": 0.88931823, "learning_rate": 7.642758738541683e-07, "loss": 0.91121179, "num_input_tokens_seen": 128834200, "step": 5993, "time_per_iteration": 2.6505024433135986 }, { "auxiliary_loss_clip": 0.01067485, "auxiliary_loss_mlp": 0.01001045, "balance_loss_clip": 0.97430515, "balance_loss_mlp": 0.9994716, "epoch": 0.7207358864907112, "flos": 54377806504320.0, "grad_norm": 1.1429555503756774, "language_loss": 0.60759544, "learning_rate": 7.636634727032621e-07, "loss": 0.62828076, "num_input_tokens_seen": 128891305, "step": 5994, "time_per_iteration": 3.086188554763794 }, { "auxiliary_loss_clip": 0.01162682, "auxiliary_loss_mlp": 0.01030834, "balance_loss_clip": 0.92932862, "balance_loss_mlp": 1.02278161, "epoch": 0.7208561293813504, "flos": 19135540033920.0, "grad_norm": 2.7773817193574954, "language_loss": 0.78744984, "learning_rate": 7.630512591036231e-07, "loss": 0.809385, "num_input_tokens_seen": 128910615, "step": 5995, "time_per_iteration": 2.7159907817840576 }, { "auxiliary_loss_clip": 0.01168945, "auxiliary_loss_mlp": 0.0102354, "balance_loss_clip": 1.01085103, "balance_loss_mlp": 1.01659584, "epoch": 0.7209763722719894, "flos": 17748526308480.0, "grad_norm": 2.05838174990164, "language_loss": 0.64894676, "learning_rate": 7.624392331481255e-07, "loss": 0.67087162, "num_input_tokens_seen": 128928270, "step": 5996, "time_per_iteration": 2.72501802444458 }, { "auxiliary_loss_clip": 0.01066717, "auxiliary_loss_mlp": 0.01002022, "balance_loss_clip": 0.97403347, "balance_loss_mlp": 1.00037718, "epoch": 0.7210966151626285, "flos": 66819488716800.0, "grad_norm": 0.7470037645214519, "language_loss": 0.51844645, "learning_rate": 7.618273949296115e-07, "loss": 0.53913391, "num_input_tokens_seen": 128987780, "step": 5997, "time_per_iteration": 3.105548620223999 }, { "auxiliary_loss_clip": 0.01159884, "auxiliary_loss_mlp": 0.01023424, "balance_loss_clip": 0.96989691, "balance_loss_mlp": 1.01618791, "epoch": 0.7212168580532676, "flos": 21141869080320.0, "grad_norm": 1.9189862839858003, "language_loss": 0.68786263, "learning_rate": 7.612157445408987e-07, "loss": 0.7096957, "num_input_tokens_seen": 129005590, "step": 5998, "time_per_iteration": 3.699991226196289 }, { "auxiliary_loss_clip": 0.01174877, "auxiliary_loss_mlp": 0.0102481, "balance_loss_clip": 0.97563142, "balance_loss_mlp": 1.01711774, "epoch": 0.7213371009439067, "flos": 22345738335360.0, "grad_norm": 2.0294466539336717, "language_loss": 0.74324226, "learning_rate": 7.606042820747716e-07, "loss": 0.76523912, "num_input_tokens_seen": 129021995, "step": 5999, "time_per_iteration": 2.663874864578247 }, { "auxiliary_loss_clip": 0.0117397, "auxiliary_loss_mlp": 0.01028065, "balance_loss_clip": 0.9751876, "balance_loss_mlp": 1.02068281, "epoch": 0.7214573438345457, "flos": 18515901490560.0, "grad_norm": 1.6439204689366957, "language_loss": 0.85042328, "learning_rate": 7.599930076239889e-07, "loss": 0.87244362, "num_input_tokens_seen": 129039280, "step": 6000, "time_per_iteration": 2.66341233253479 }, { "auxiliary_loss_clip": 0.01169446, "auxiliary_loss_mlp": 0.01122818, "balance_loss_clip": 0.89663142, "balance_loss_mlp": 0.0, "epoch": 0.7215775867251849, "flos": 35736108606720.0, "grad_norm": 2.751365446721865, "language_loss": 0.70482469, "learning_rate": 7.593819212812818e-07, "loss": 0.72774726, "num_input_tokens_seen": 129060860, "step": 6001, "time_per_iteration": 3.831052541732788 }, { "auxiliary_loss_clip": 0.01169777, "auxiliary_loss_mlp": 0.01024753, "balance_loss_clip": 1.01108789, "balance_loss_mlp": 1.01773763, "epoch": 0.721697829615824, "flos": 20372410909440.0, "grad_norm": 1.7790812195834995, "language_loss": 0.71632826, "learning_rate": 7.587710231393508e-07, "loss": 0.73827362, "num_input_tokens_seen": 129079215, "step": 6002, "time_per_iteration": 3.595609188079834 }, { "auxiliary_loss_clip": 0.01148436, "auxiliary_loss_mlp": 0.01024265, "balance_loss_clip": 0.81562257, "balance_loss_mlp": 1.01735711, "epoch": 0.721818072506463, "flos": 20229809915520.0, "grad_norm": 2.0405494389312655, "language_loss": 0.83836722, "learning_rate": 7.581603132908685e-07, "loss": 0.86009425, "num_input_tokens_seen": 129097185, "step": 6003, "time_per_iteration": 2.936506748199463 }, { "auxiliary_loss_clip": 0.01157333, "auxiliary_loss_mlp": 0.01024995, "balance_loss_clip": 0.93151039, "balance_loss_mlp": 1.01729417, "epoch": 0.7219383153971022, "flos": 18186887888640.0, "grad_norm": 1.8425974359463073, "language_loss": 0.78273606, "learning_rate": 7.575497918284795e-07, "loss": 0.80455935, "num_input_tokens_seen": 129114730, "step": 6004, "time_per_iteration": 3.9169740676879883 }, { "auxiliary_loss_clip": 0.01173087, "auxiliary_loss_mlp": 0.01027385, "balance_loss_clip": 1.04932404, "balance_loss_mlp": 1.01980913, "epoch": 0.7220585582877412, "flos": 17342124854400.0, "grad_norm": 2.0390480189828306, "language_loss": 0.7426911, "learning_rate": 7.569394588447984e-07, "loss": 0.76469582, "num_input_tokens_seen": 129131745, "step": 6005, "time_per_iteration": 2.5745086669921875 }, { "auxiliary_loss_clip": 0.011569, "auxiliary_loss_mlp": 0.01020546, "balance_loss_clip": 1.00702655, "balance_loss_mlp": 1.013906, "epoch": 0.7221788011783803, "flos": 16976338704000.0, "grad_norm": 2.700240856593312, "language_loss": 0.77773726, "learning_rate": 7.563293144324146e-07, "loss": 0.79951167, "num_input_tokens_seen": 129147295, "step": 6006, "time_per_iteration": 2.664926528930664 }, { "auxiliary_loss_clip": 0.01167642, "auxiliary_loss_mlp": 0.01026654, "balance_loss_clip": 1.0497632, "balance_loss_mlp": 1.01997495, "epoch": 0.7222990440690195, "flos": 26286359702400.0, "grad_norm": 2.009101331150661, "language_loss": 0.79998517, "learning_rate": 7.557193586838834e-07, "loss": 0.82192814, "num_input_tokens_seen": 129162660, "step": 6007, "time_per_iteration": 2.624457359313965 }, { "auxiliary_loss_clip": 0.01170322, "auxiliary_loss_mlp": 0.01021704, "balance_loss_clip": 0.97044498, "balance_loss_mlp": 1.01489162, "epoch": 0.7224192869596585, "flos": 17601687509760.0, "grad_norm": 2.993130302174788, "language_loss": 0.70075834, "learning_rate": 7.551095916917371e-07, "loss": 0.72267854, "num_input_tokens_seen": 129179990, "step": 6008, "time_per_iteration": 2.6223392486572266 }, { "auxiliary_loss_clip": 0.01176844, "auxiliary_loss_mlp": 0.01031456, "balance_loss_clip": 0.93331563, "balance_loss_mlp": 1.02310538, "epoch": 0.7225395298502976, "flos": 12932331016320.0, "grad_norm": 3.3362264907761174, "language_loss": 0.67142278, "learning_rate": 7.545000135484758e-07, "loss": 0.69350576, "num_input_tokens_seen": 129197425, "step": 6009, "time_per_iteration": 2.675055742263794 }, { "auxiliary_loss_clip": 0.01168943, "auxiliary_loss_mlp": 0.01122771, "balance_loss_clip": 1.04860306, "balance_loss_mlp": 0.0, "epoch": 0.7226597727409367, "flos": 29643899592960.0, "grad_norm": 2.2102934009845576, "language_loss": 0.62744182, "learning_rate": 7.538906243465714e-07, "loss": 0.65035897, "num_input_tokens_seen": 129217560, "step": 6010, "time_per_iteration": 2.638899803161621 }, { "auxiliary_loss_clip": 0.01169925, "auxiliary_loss_mlp": 0.01026822, "balance_loss_clip": 1.04981208, "balance_loss_mlp": 1.01966977, "epoch": 0.7227800156315758, "flos": 13771635183360.0, "grad_norm": 2.2526476574369494, "language_loss": 0.78784657, "learning_rate": 7.5328142417847e-07, "loss": 0.8098141, "num_input_tokens_seen": 129234325, "step": 6011, "time_per_iteration": 2.579324960708618 }, { "auxiliary_loss_clip": 0.01162491, "auxiliary_loss_mlp": 0.01024437, "balance_loss_clip": 1.00694966, "balance_loss_mlp": 1.01750183, "epoch": 0.7229002585222148, "flos": 20301882554880.0, "grad_norm": 1.6510658916158942, "language_loss": 0.69174808, "learning_rate": 7.526724131365838e-07, "loss": 0.71361738, "num_input_tokens_seen": 129255280, "step": 6012, "time_per_iteration": 2.60844349861145 }, { "auxiliary_loss_clip": 0.0117087, "auxiliary_loss_mlp": 0.01031018, "balance_loss_clip": 0.97622246, "balance_loss_mlp": 1.02273929, "epoch": 0.723020501412854, "flos": 16581250033920.0, "grad_norm": 1.8563142945198647, "language_loss": 0.70203584, "learning_rate": 7.520635913133017e-07, "loss": 0.72405469, "num_input_tokens_seen": 129273910, "step": 6013, "time_per_iteration": 2.684690475463867 }, { "auxiliary_loss_clip": 0.01173528, "auxiliary_loss_mlp": 0.01025846, "balance_loss_clip": 1.00985897, "balance_loss_mlp": 1.01776958, "epoch": 0.7231407443034931, "flos": 28548300908160.0, "grad_norm": 1.756533577385092, "language_loss": 0.82491338, "learning_rate": 7.514549588009798e-07, "loss": 0.84690714, "num_input_tokens_seen": 129294785, "step": 6014, "time_per_iteration": 2.6555182933807373 }, { "auxiliary_loss_clip": 0.01170047, "auxiliary_loss_mlp": 0.0102707, "balance_loss_clip": 0.97328126, "balance_loss_mlp": 1.01952434, "epoch": 0.7232609871941321, "flos": 30008536508160.0, "grad_norm": 39.89607507396739, "language_loss": 0.70682108, "learning_rate": 7.508465156919492e-07, "loss": 0.72879219, "num_input_tokens_seen": 129318295, "step": 6015, "time_per_iteration": 2.7861523628234863 }, { "auxiliary_loss_clip": 0.01166268, "auxiliary_loss_mlp": 0.01022738, "balance_loss_clip": 0.97083384, "balance_loss_mlp": 1.01535296, "epoch": 0.7233812300847713, "flos": 16654005031680.0, "grad_norm": 2.884961164328366, "language_loss": 0.61503285, "learning_rate": 7.502382620785083e-07, "loss": 0.6369229, "num_input_tokens_seen": 129334845, "step": 6016, "time_per_iteration": 2.633899211883545 }, { "auxiliary_loss_clip": 0.01067349, "auxiliary_loss_mlp": 0.01000716, "balance_loss_clip": 0.90104449, "balance_loss_mlp": 0.99914211, "epoch": 0.7235014729754103, "flos": 67258784050560.0, "grad_norm": 0.8490035114805109, "language_loss": 0.6258328, "learning_rate": 7.496301980529289e-07, "loss": 0.64651346, "num_input_tokens_seen": 129398055, "step": 6017, "time_per_iteration": 3.3796374797821045 }, { "auxiliary_loss_clip": 0.01168012, "auxiliary_loss_mlp": 0.01027394, "balance_loss_clip": 1.04770637, "balance_loss_mlp": 1.02057195, "epoch": 0.7236217158660494, "flos": 26943237671040.0, "grad_norm": 1.8421484660248946, "language_loss": 0.74267566, "learning_rate": 7.490223237074547e-07, "loss": 0.76462972, "num_input_tokens_seen": 129417765, "step": 6018, "time_per_iteration": 2.8123090267181396 }, { "auxiliary_loss_clip": 0.0116167, "auxiliary_loss_mlp": 0.01026352, "balance_loss_clip": 0.93062413, "balance_loss_mlp": 1.01906228, "epoch": 0.7237419587566886, "flos": 29423372042880.0, "grad_norm": 1.9898719350929364, "language_loss": 0.65924209, "learning_rate": 7.484146391342989e-07, "loss": 0.6811223, "num_input_tokens_seen": 129437560, "step": 6019, "time_per_iteration": 2.717527389526367 }, { "auxiliary_loss_clip": 0.01161098, "auxiliary_loss_mlp": 0.01030926, "balance_loss_clip": 0.97071433, "balance_loss_mlp": 1.02317178, "epoch": 0.7238622016473276, "flos": 17821496787840.0, "grad_norm": 7.569128756046113, "language_loss": 0.56654215, "learning_rate": 7.478071444256484e-07, "loss": 0.58846247, "num_input_tokens_seen": 129455320, "step": 6020, "time_per_iteration": 2.6879334449768066 }, { "auxiliary_loss_clip": 0.01175534, "auxiliary_loss_mlp": 0.01024852, "balance_loss_clip": 0.93405116, "balance_loss_mlp": 1.01762235, "epoch": 0.7239824445379667, "flos": 25739117020800.0, "grad_norm": 2.086039644443325, "language_loss": 0.79194432, "learning_rate": 7.471998396736579e-07, "loss": 0.81394815, "num_input_tokens_seen": 129475700, "step": 6021, "time_per_iteration": 2.713975667953491 }, { "auxiliary_loss_clip": 0.01167787, "auxiliary_loss_mlp": 0.010232, "balance_loss_clip": 0.9353326, "balance_loss_mlp": 1.01607728, "epoch": 0.7241026874286057, "flos": 23148916398720.0, "grad_norm": 1.6432225615358553, "language_loss": 0.76122284, "learning_rate": 7.465927249704549e-07, "loss": 0.78313273, "num_input_tokens_seen": 129493585, "step": 6022, "time_per_iteration": 2.73264479637146 }, { "auxiliary_loss_clip": 0.01166812, "auxiliary_loss_mlp": 0.01027431, "balance_loss_clip": 1.00954342, "balance_loss_mlp": 1.02092183, "epoch": 0.7242229303192449, "flos": 20266905686400.0, "grad_norm": 1.6126657808148752, "language_loss": 0.77318394, "learning_rate": 7.459858004081398e-07, "loss": 0.79512638, "num_input_tokens_seen": 129511555, "step": 6023, "time_per_iteration": 2.613672971725464 }, { "auxiliary_loss_clip": 0.01060711, "auxiliary_loss_mlp": 0.01003402, "balance_loss_clip": 0.89924037, "balance_loss_mlp": 1.00166178, "epoch": 0.724343173209884, "flos": 62311659684480.0, "grad_norm": 0.6578113169706394, "language_loss": 0.58036065, "learning_rate": 7.453790660787815e-07, "loss": 0.6010018, "num_input_tokens_seen": 129579650, "step": 6024, "time_per_iteration": 4.272123098373413 }, { "auxiliary_loss_clip": 0.01170096, "auxiliary_loss_mlp": 0.01041215, "balance_loss_clip": 0.9729563, "balance_loss_mlp": 1.03347182, "epoch": 0.724463416100523, "flos": 35006403813120.0, "grad_norm": 2.082836240916339, "language_loss": 0.63554978, "learning_rate": 7.447725220744214e-07, "loss": 0.65766287, "num_input_tokens_seen": 129601895, "step": 6025, "time_per_iteration": 2.7535018920898438 }, { "auxiliary_loss_clip": 0.01170149, "auxiliary_loss_mlp": 0.0102735, "balance_loss_clip": 1.04842949, "balance_loss_mlp": 1.01944613, "epoch": 0.7245836589911622, "flos": 21871968923520.0, "grad_norm": 1.991350267421015, "language_loss": 0.77155054, "learning_rate": 7.441661684870717e-07, "loss": 0.79352558, "num_input_tokens_seen": 129622150, "step": 6026, "time_per_iteration": 2.63454532623291 }, { "auxiliary_loss_clip": 0.01171197, "auxiliary_loss_mlp": 0.01021634, "balance_loss_clip": 1.04991293, "balance_loss_mlp": 1.01445723, "epoch": 0.7247039018818012, "flos": 23006494972800.0, "grad_norm": 3.139264498811876, "language_loss": 0.81561929, "learning_rate": 7.435600054087152e-07, "loss": 0.83754754, "num_input_tokens_seen": 129644315, "step": 6027, "time_per_iteration": 2.692244529724121 }, { "auxiliary_loss_clip": 0.0117125, "auxiliary_loss_mlp": 0.01031435, "balance_loss_clip": 1.04986405, "balance_loss_mlp": 1.02363849, "epoch": 0.7248241447724403, "flos": 31722588587520.0, "grad_norm": 2.007149800148861, "language_loss": 0.74365646, "learning_rate": 7.42954032931308e-07, "loss": 0.76568329, "num_input_tokens_seen": 129665355, "step": 6028, "time_per_iteration": 4.504699945449829 }, { "auxiliary_loss_clip": 0.01167866, "auxiliary_loss_mlp": 0.01025439, "balance_loss_clip": 0.97231948, "balance_loss_mlp": 1.01860845, "epoch": 0.7249443876630794, "flos": 34896984007680.0, "grad_norm": 1.7932265210962153, "language_loss": 0.74735171, "learning_rate": 7.423482511467733e-07, "loss": 0.76928478, "num_input_tokens_seen": 129686125, "step": 6029, "time_per_iteration": 2.7881267070770264 }, { "auxiliary_loss_clip": 0.0114982, "auxiliary_loss_mlp": 0.0102859, "balance_loss_clip": 0.85518634, "balance_loss_mlp": 1.02125907, "epoch": 0.7250646305537185, "flos": 26359294268160.0, "grad_norm": 3.290810310578613, "language_loss": 0.64276814, "learning_rate": 7.417426601470099e-07, "loss": 0.66455221, "num_input_tokens_seen": 129706485, "step": 6030, "time_per_iteration": 3.8020474910736084 }, { "auxiliary_loss_clip": 0.01170335, "auxiliary_loss_mlp": 0.01026978, "balance_loss_clip": 1.00993848, "balance_loss_mlp": 1.01902676, "epoch": 0.7251848734443576, "flos": 30081614728320.0, "grad_norm": 2.2478257302069315, "language_loss": 0.78599113, "learning_rate": 7.411372600238841e-07, "loss": 0.80796432, "num_input_tokens_seen": 129727100, "step": 6031, "time_per_iteration": 2.7174370288848877 }, { "auxiliary_loss_clip": 0.01169293, "auxiliary_loss_mlp": 0.0102672, "balance_loss_clip": 1.04795432, "balance_loss_mlp": 1.01987123, "epoch": 0.7253051163349967, "flos": 17785262943360.0, "grad_norm": 1.9371584873492362, "language_loss": 0.73863184, "learning_rate": 7.405320508692346e-07, "loss": 0.76059198, "num_input_tokens_seen": 129745840, "step": 6032, "time_per_iteration": 2.630156993865967 }, { "auxiliary_loss_clip": 0.01166171, "auxiliary_loss_mlp": 0.01020966, "balance_loss_clip": 1.04941022, "balance_loss_mlp": 1.01418042, "epoch": 0.7254253592256358, "flos": 12641346938880.0, "grad_norm": 2.548764480790678, "language_loss": 0.7553407, "learning_rate": 7.399270327748727e-07, "loss": 0.77721208, "num_input_tokens_seen": 129763500, "step": 6033, "time_per_iteration": 2.6325976848602295 }, { "auxiliary_loss_clip": 0.01168157, "auxiliary_loss_mlp": 0.0112205, "balance_loss_clip": 0.93393254, "balance_loss_mlp": 0.0, "epoch": 0.7255456021162748, "flos": 27199208966400.0, "grad_norm": 1.6576658391345664, "language_loss": 0.74099928, "learning_rate": 7.39322205832577e-07, "loss": 0.76390135, "num_input_tokens_seen": 129784390, "step": 6034, "time_per_iteration": 2.7096660137176514 }, { "auxiliary_loss_clip": 0.01160175, "auxiliary_loss_mlp": 0.0102582, "balance_loss_clip": 0.97073054, "balance_loss_mlp": 1.01881373, "epoch": 0.725665845006914, "flos": 21288205088640.0, "grad_norm": 1.966428204889125, "language_loss": 0.81088638, "learning_rate": 7.387175701341009e-07, "loss": 0.83274633, "num_input_tokens_seen": 129803060, "step": 6035, "time_per_iteration": 2.7209115028381348 }, { "auxiliary_loss_clip": 0.011676, "auxiliary_loss_mlp": 0.01030081, "balance_loss_clip": 1.00917327, "balance_loss_mlp": 1.02270794, "epoch": 0.7257860878975531, "flos": 16033684129920.0, "grad_norm": 2.1111548981822916, "language_loss": 0.72504413, "learning_rate": 7.381131257711659e-07, "loss": 0.7470209, "num_input_tokens_seen": 129820165, "step": 6036, "time_per_iteration": 2.5723042488098145 }, { "auxiliary_loss_clip": 0.01167873, "auxiliary_loss_mlp": 0.01024159, "balance_loss_clip": 0.97667623, "balance_loss_mlp": 1.01729226, "epoch": 0.7259063307881921, "flos": 12129943052160.0, "grad_norm": 1.7941830223224111, "language_loss": 0.83633274, "learning_rate": 7.375088728354677e-07, "loss": 0.858253, "num_input_tokens_seen": 129835195, "step": 6037, "time_per_iteration": 2.6353025436401367 }, { "auxiliary_loss_clip": 0.0116717, "auxiliary_loss_mlp": 0.01029425, "balance_loss_clip": 0.93186831, "balance_loss_mlp": 1.02223325, "epoch": 0.7260265736788313, "flos": 30443845432320.0, "grad_norm": 1.4220665977843678, "language_loss": 0.67397034, "learning_rate": 7.369048114186691e-07, "loss": 0.69593626, "num_input_tokens_seen": 129856240, "step": 6038, "time_per_iteration": 2.7359561920166016 }, { "auxiliary_loss_clip": 0.01174404, "auxiliary_loss_mlp": 0.01121719, "balance_loss_clip": 0.93490326, "balance_loss_mlp": 0.0, "epoch": 0.7261468165694703, "flos": 21142264129920.0, "grad_norm": 1.6532059043279839, "language_loss": 0.83190441, "learning_rate": 7.363009416124055e-07, "loss": 0.85486567, "num_input_tokens_seen": 129875565, "step": 6039, "time_per_iteration": 2.770930290222168 }, { "auxiliary_loss_clip": 0.01170527, "auxiliary_loss_mlp": 0.01026089, "balance_loss_clip": 0.93586296, "balance_loss_mlp": 1.01865673, "epoch": 0.7262670594601094, "flos": 22306308180480.0, "grad_norm": 2.2154646618305036, "language_loss": 0.62643582, "learning_rate": 7.356972635082852e-07, "loss": 0.64840198, "num_input_tokens_seen": 129894420, "step": 6040, "time_per_iteration": 2.73699688911438 }, { "auxiliary_loss_clip": 0.0116801, "auxiliary_loss_mlp": 0.01025615, "balance_loss_clip": 0.89996493, "balance_loss_mlp": 1.01797926, "epoch": 0.7263873023507486, "flos": 25335049950720.0, "grad_norm": 2.1340543469765434, "language_loss": 0.7526083, "learning_rate": 7.35093777197884e-07, "loss": 0.77454454, "num_input_tokens_seen": 129914490, "step": 6041, "time_per_iteration": 2.7565829753875732 }, { "auxiliary_loss_clip": 0.01166469, "auxiliary_loss_mlp": 0.01021275, "balance_loss_clip": 0.97304165, "balance_loss_mlp": 1.01394975, "epoch": 0.7265075452413876, "flos": 23878621192320.0, "grad_norm": 2.5111778652724324, "language_loss": 0.85325956, "learning_rate": 7.344904827727525e-07, "loss": 0.87513703, "num_input_tokens_seen": 129931670, "step": 6042, "time_per_iteration": 2.6684770584106445 }, { "auxiliary_loss_clip": 0.01163371, "auxiliary_loss_mlp": 0.01026174, "balance_loss_clip": 0.93111396, "balance_loss_mlp": 1.01883376, "epoch": 0.7266277881320267, "flos": 28724549967360.0, "grad_norm": 2.2800610078324746, "language_loss": 0.7321682, "learning_rate": 7.338873803244076e-07, "loss": 0.75406361, "num_input_tokens_seen": 129946905, "step": 6043, "time_per_iteration": 2.723339796066284 }, { "auxiliary_loss_clip": 0.01161069, "auxiliary_loss_mlp": 0.01023366, "balance_loss_clip": 0.97041094, "balance_loss_mlp": 1.01676524, "epoch": 0.7267480310226658, "flos": 24863507182080.0, "grad_norm": 1.6629028460841695, "language_loss": 0.80590099, "learning_rate": 7.332844699443401e-07, "loss": 0.82774538, "num_input_tokens_seen": 129965505, "step": 6044, "time_per_iteration": 2.699894905090332 }, { "auxiliary_loss_clip": 0.01157939, "auxiliary_loss_mlp": 0.01027247, "balance_loss_clip": 0.89483762, "balance_loss_mlp": 1.02052665, "epoch": 0.7268682739133049, "flos": 27198490694400.0, "grad_norm": 1.8688957462196218, "language_loss": 0.75357831, "learning_rate": 7.326817517240121e-07, "loss": 0.7754302, "num_input_tokens_seen": 129987210, "step": 6045, "time_per_iteration": 2.839139461517334 }, { "auxiliary_loss_clip": 0.01167163, "auxiliary_loss_mlp": 0.01122032, "balance_loss_clip": 1.00887346, "balance_loss_mlp": 0.0, "epoch": 0.7269885168039439, "flos": 33508138688640.0, "grad_norm": 1.7464122530970434, "language_loss": 0.83195508, "learning_rate": 7.320792257548545e-07, "loss": 0.85484707, "num_input_tokens_seen": 130008385, "step": 6046, "time_per_iteration": 2.758955955505371 }, { "auxiliary_loss_clip": 0.01171289, "auxiliary_loss_mlp": 0.01024936, "balance_loss_clip": 0.97165871, "balance_loss_mlp": 1.01763141, "epoch": 0.7271087596945831, "flos": 24313750548480.0, "grad_norm": 2.3021402742701182, "language_loss": 0.76061934, "learning_rate": 7.314768921282704e-07, "loss": 0.78258163, "num_input_tokens_seen": 130029040, "step": 6047, "time_per_iteration": 2.709059000015259 }, { "auxiliary_loss_clip": 0.01171062, "auxiliary_loss_mlp": 0.01025458, "balance_loss_clip": 1.01025033, "balance_loss_mlp": 1.01820695, "epoch": 0.7272290025852222, "flos": 23805147922560.0, "grad_norm": 2.3791925065779878, "language_loss": 0.71758258, "learning_rate": 7.30874750935633e-07, "loss": 0.73954785, "num_input_tokens_seen": 130048725, "step": 6048, "time_per_iteration": 2.6499342918395996 }, { "auxiliary_loss_clip": 0.01167151, "auxiliary_loss_mlp": 0.01025825, "balance_loss_clip": 0.93526292, "balance_loss_mlp": 1.01889563, "epoch": 0.7273492454758612, "flos": 16720367408640.0, "grad_norm": 1.7587512685523554, "language_loss": 0.7873919, "learning_rate": 7.30272802268286e-07, "loss": 0.80932164, "num_input_tokens_seen": 130065720, "step": 6049, "time_per_iteration": 3.6068596839904785 }, { "auxiliary_loss_clip": 0.01138624, "auxiliary_loss_mlp": 0.01024253, "balance_loss_clip": 0.85515398, "balance_loss_mlp": 1.01756215, "epoch": 0.7274694883665004, "flos": 28031330413440.0, "grad_norm": 1.6772272976519391, "language_loss": 0.76199389, "learning_rate": 7.29671046217547e-07, "loss": 0.78362262, "num_input_tokens_seen": 130084830, "step": 6050, "time_per_iteration": 2.780548334121704 }, { "auxiliary_loss_clip": 0.01165073, "auxiliary_loss_mlp": 0.01026268, "balance_loss_clip": 0.93262494, "balance_loss_mlp": 1.01890647, "epoch": 0.7275897312571394, "flos": 30372706546560.0, "grad_norm": 1.6677657578171343, "language_loss": 0.81851375, "learning_rate": 7.290694828746988e-07, "loss": 0.84042716, "num_input_tokens_seen": 130104495, "step": 6051, "time_per_iteration": 2.8313333988189697 }, { "auxiliary_loss_clip": 0.01167085, "auxiliary_loss_mlp": 0.01023486, "balance_loss_clip": 0.9309361, "balance_loss_mlp": 1.01641655, "epoch": 0.7277099741477785, "flos": 19204775498880.0, "grad_norm": 2.035965235422022, "language_loss": 0.85563135, "learning_rate": 7.284681123310004e-07, "loss": 0.87753713, "num_input_tokens_seen": 130123210, "step": 6052, "time_per_iteration": 2.677197217941284 }, { "auxiliary_loss_clip": 0.01168108, "auxiliary_loss_mlp": 0.01025342, "balance_loss_clip": 1.0110662, "balance_loss_mlp": 1.01757574, "epoch": 0.7278302170384175, "flos": 20667884186880.0, "grad_norm": 1.8470907327375574, "language_loss": 0.7953527, "learning_rate": 7.27866934677678e-07, "loss": 0.81728721, "num_input_tokens_seen": 130142880, "step": 6053, "time_per_iteration": 2.6642167568206787 }, { "auxiliary_loss_clip": 0.01164962, "auxiliary_loss_mlp": 0.01026997, "balance_loss_clip": 0.89559525, "balance_loss_mlp": 1.02049398, "epoch": 0.7279504599290567, "flos": 19093200877440.0, "grad_norm": 1.6291558912236208, "language_loss": 0.78178793, "learning_rate": 7.272659500059297e-07, "loss": 0.80370754, "num_input_tokens_seen": 130160220, "step": 6054, "time_per_iteration": 3.637587547302246 }, { "auxiliary_loss_clip": 0.01162218, "auxiliary_loss_mlp": 0.01026602, "balance_loss_clip": 1.01034749, "balance_loss_mlp": 1.01890373, "epoch": 0.7280707028196958, "flos": 19062174504960.0, "grad_norm": 2.107885679957906, "language_loss": 0.80068672, "learning_rate": 7.266651584069264e-07, "loss": 0.82257491, "num_input_tokens_seen": 130177885, "step": 6055, "time_per_iteration": 2.641528367996216 }, { "auxiliary_loss_clip": 0.01174313, "auxiliary_loss_mlp": 0.01027158, "balance_loss_clip": 1.01455951, "balance_loss_mlp": 1.02027655, "epoch": 0.7281909457103348, "flos": 37196308293120.0, "grad_norm": 1.7222961882854737, "language_loss": 0.56920528, "learning_rate": 7.260645599718045e-07, "loss": 0.59122002, "num_input_tokens_seen": 130204240, "step": 6056, "time_per_iteration": 3.65535306930542 }, { "auxiliary_loss_clip": 0.01169042, "auxiliary_loss_mlp": 0.01026946, "balance_loss_clip": 0.97372508, "balance_loss_mlp": 1.01886976, "epoch": 0.728311188600974, "flos": 20667094087680.0, "grad_norm": 2.554209653008748, "language_loss": 0.67029464, "learning_rate": 7.254641547916767e-07, "loss": 0.69225454, "num_input_tokens_seen": 130221735, "step": 6057, "time_per_iteration": 2.6538498401641846 }, { "auxiliary_loss_clip": 0.01169054, "auxiliary_loss_mlp": 0.01027063, "balance_loss_clip": 1.04915786, "balance_loss_mlp": 1.01976442, "epoch": 0.728431431491613, "flos": 28840685616000.0, "grad_norm": 1.640587555740948, "language_loss": 0.68846458, "learning_rate": 7.248639429576226e-07, "loss": 0.71042573, "num_input_tokens_seen": 130241190, "step": 6058, "time_per_iteration": 2.678819417953491 }, { "auxiliary_loss_clip": 0.01169407, "auxiliary_loss_mlp": 0.01027851, "balance_loss_clip": 1.0102644, "balance_loss_mlp": 1.02089214, "epoch": 0.7285516743822521, "flos": 25991856092160.0, "grad_norm": 1.769448411754872, "language_loss": 0.72201788, "learning_rate": 7.242639245606959e-07, "loss": 0.74399042, "num_input_tokens_seen": 130260980, "step": 6059, "time_per_iteration": 2.6846542358398438 }, { "auxiliary_loss_clip": 0.01171574, "auxiliary_loss_mlp": 0.01029995, "balance_loss_clip": 0.97114253, "balance_loss_mlp": 1.02249658, "epoch": 0.7286719172728913, "flos": 16399721675520.0, "grad_norm": 4.513796967763238, "language_loss": 0.82332379, "learning_rate": 7.236640996919168e-07, "loss": 0.84533954, "num_input_tokens_seen": 130280025, "step": 6060, "time_per_iteration": 2.66152024269104 }, { "auxiliary_loss_clip": 0.01171058, "auxiliary_loss_mlp": 0.01023221, "balance_loss_clip": 1.01100397, "balance_loss_mlp": 1.01622987, "epoch": 0.7287921601635303, "flos": 22018161277440.0, "grad_norm": 1.5857881770893578, "language_loss": 0.70374483, "learning_rate": 7.230644684422782e-07, "loss": 0.72568762, "num_input_tokens_seen": 130300255, "step": 6061, "time_per_iteration": 2.6479439735412598 }, { "auxiliary_loss_clip": 0.01162624, "auxiliary_loss_mlp": 0.01023998, "balance_loss_clip": 0.9331997, "balance_loss_mlp": 1.01628864, "epoch": 0.7289124030541694, "flos": 24600927784320.0, "grad_norm": 1.7916918894339633, "language_loss": 0.81966704, "learning_rate": 7.224650309027451e-07, "loss": 0.8415333, "num_input_tokens_seen": 130320005, "step": 6062, "time_per_iteration": 2.8443591594696045 }, { "auxiliary_loss_clip": 0.01170954, "auxiliary_loss_mlp": 0.01024864, "balance_loss_clip": 1.01177728, "balance_loss_mlp": 1.01833427, "epoch": 0.7290326459448085, "flos": 21393638484480.0, "grad_norm": 3.6730258778756864, "language_loss": 0.68626463, "learning_rate": 7.218657871642506e-07, "loss": 0.70822287, "num_input_tokens_seen": 130338810, "step": 6063, "time_per_iteration": 2.675081968307495 }, { "auxiliary_loss_clip": 0.01172429, "auxiliary_loss_mlp": 0.01024997, "balance_loss_clip": 1.0496726, "balance_loss_mlp": 1.01768398, "epoch": 0.7291528888354476, "flos": 18587686821120.0, "grad_norm": 1.9472473253774718, "language_loss": 0.61737907, "learning_rate": 7.212667373177012e-07, "loss": 0.63935333, "num_input_tokens_seen": 130353805, "step": 6064, "time_per_iteration": 2.581887722015381 }, { "auxiliary_loss_clip": 0.01163821, "auxiliary_loss_mlp": 0.0102736, "balance_loss_clip": 0.93260086, "balance_loss_mlp": 1.02050257, "epoch": 0.7292731317260867, "flos": 18951066760320.0, "grad_norm": 3.5170414591784906, "language_loss": 0.75320458, "learning_rate": 7.206678814539704e-07, "loss": 0.77511632, "num_input_tokens_seen": 130372105, "step": 6065, "time_per_iteration": 2.697286605834961 }, { "auxiliary_loss_clip": 0.0117549, "auxiliary_loss_mlp": 0.01025932, "balance_loss_clip": 0.89682627, "balance_loss_mlp": 1.01900899, "epoch": 0.7293933746167258, "flos": 21067569797760.0, "grad_norm": 1.5341427643215397, "language_loss": 0.72894323, "learning_rate": 7.20069219663904e-07, "loss": 0.75095737, "num_input_tokens_seen": 130391990, "step": 6066, "time_per_iteration": 2.7489545345306396 }, { "auxiliary_loss_clip": 0.01171126, "auxiliary_loss_mlp": 0.01028986, "balance_loss_clip": 1.00930476, "balance_loss_mlp": 1.02179742, "epoch": 0.7295136175073649, "flos": 22453326547200.0, "grad_norm": 1.580088798784451, "language_loss": 0.79930085, "learning_rate": 7.1947075203832e-07, "loss": 0.82130194, "num_input_tokens_seen": 130411970, "step": 6067, "time_per_iteration": 2.633277177810669 }, { "auxiliary_loss_clip": 0.01063292, "auxiliary_loss_mlp": 0.01003662, "balance_loss_clip": 1.01038051, "balance_loss_mlp": 1.0020411, "epoch": 0.7296338603980039, "flos": 56125506648960.0, "grad_norm": 0.862206761206234, "language_loss": 0.60204744, "learning_rate": 7.188724786680049e-07, "loss": 0.62271702, "num_input_tokens_seen": 130472440, "step": 6068, "time_per_iteration": 3.197417974472046 }, { "auxiliary_loss_clip": 0.01163411, "auxiliary_loss_mlp": 0.01025311, "balance_loss_clip": 0.97007096, "balance_loss_mlp": 1.01827788, "epoch": 0.7297541032886431, "flos": 25228287751680.0, "grad_norm": 1.6388433094675248, "language_loss": 0.7558794, "learning_rate": 7.182743996437162e-07, "loss": 0.77776664, "num_input_tokens_seen": 130491975, "step": 6069, "time_per_iteration": 2.717078685760498 }, { "auxiliary_loss_clip": 0.01172081, "auxiliary_loss_mlp": 0.01026993, "balance_loss_clip": 0.93291378, "balance_loss_mlp": 1.01906562, "epoch": 0.7298743461792822, "flos": 26467600752000.0, "grad_norm": 1.754695751155953, "language_loss": 0.68672049, "learning_rate": 7.176765150561819e-07, "loss": 0.70871127, "num_input_tokens_seen": 130510580, "step": 6070, "time_per_iteration": 2.713456153869629 }, { "auxiliary_loss_clip": 0.01169477, "auxiliary_loss_mlp": 0.01021369, "balance_loss_clip": 1.04704499, "balance_loss_mlp": 1.013901, "epoch": 0.7299945890699212, "flos": 19569053278080.0, "grad_norm": 2.221681840857095, "language_loss": 0.79631877, "learning_rate": 7.170788249961002e-07, "loss": 0.81822717, "num_input_tokens_seen": 130529090, "step": 6071, "time_per_iteration": 2.5815236568450928 }, { "auxiliary_loss_clip": 0.01168605, "auxiliary_loss_mlp": 0.0102878, "balance_loss_clip": 1.04833138, "balance_loss_mlp": 1.02214646, "epoch": 0.7301148319605604, "flos": 22928963466240.0, "grad_norm": 2.1499872659105925, "language_loss": 0.88035035, "learning_rate": 7.164813295541418e-07, "loss": 0.90232426, "num_input_tokens_seen": 130548655, "step": 6072, "time_per_iteration": 2.569432497024536 }, { "auxiliary_loss_clip": 0.0116703, "auxiliary_loss_mlp": 0.0102222, "balance_loss_clip": 0.97033763, "balance_loss_mlp": 1.0148344, "epoch": 0.7302350748511994, "flos": 25369703596800.0, "grad_norm": 1.604988166701141, "language_loss": 0.70315784, "learning_rate": 7.15884028820944e-07, "loss": 0.72505039, "num_input_tokens_seen": 130567710, "step": 6073, "time_per_iteration": 2.7270846366882324 }, { "auxiliary_loss_clip": 0.01159263, "auxiliary_loss_mlp": 0.01023583, "balance_loss_clip": 0.93094969, "balance_loss_mlp": 1.01652324, "epoch": 0.7303553177418385, "flos": 27819170732160.0, "grad_norm": 2.5889183588125593, "language_loss": 0.60822302, "learning_rate": 7.152869228871185e-07, "loss": 0.63005149, "num_input_tokens_seen": 130590195, "step": 6074, "time_per_iteration": 2.8824820518493652 }, { "auxiliary_loss_clip": 0.01162374, "auxiliary_loss_mlp": 0.01027493, "balance_loss_clip": 0.97173977, "balance_loss_mlp": 1.02017379, "epoch": 0.7304755606324776, "flos": 24426510318720.0, "grad_norm": 1.7502654147763814, "language_loss": 0.71807051, "learning_rate": 7.146900118432457e-07, "loss": 0.73996913, "num_input_tokens_seen": 130609940, "step": 6075, "time_per_iteration": 3.5997045040130615 }, { "auxiliary_loss_clip": 0.01155168, "auxiliary_loss_mlp": 0.01029498, "balance_loss_clip": 0.81239796, "balance_loss_mlp": 1.02257514, "epoch": 0.7305958035231167, "flos": 23840483927040.0, "grad_norm": 1.5936689610335135, "language_loss": 0.85662019, "learning_rate": 7.140932957798753e-07, "loss": 0.87846684, "num_input_tokens_seen": 130628380, "step": 6076, "time_per_iteration": 3.0266430377960205 }, { "auxiliary_loss_clip": 0.01169034, "auxiliary_loss_mlp": 0.01027439, "balance_loss_clip": 0.97082007, "balance_loss_mlp": 1.01997614, "epoch": 0.7307160464137558, "flos": 16726939597440.0, "grad_norm": 1.8051199020790916, "language_loss": 0.71424925, "learning_rate": 7.134967747875309e-07, "loss": 0.73621398, "num_input_tokens_seen": 130646590, "step": 6077, "time_per_iteration": 3.058976888656616 }, { "auxiliary_loss_clip": 0.0116158, "auxiliary_loss_mlp": 0.01027629, "balance_loss_clip": 1.00868464, "balance_loss_mlp": 1.02088153, "epoch": 0.7308362893043949, "flos": 21798280172160.0, "grad_norm": 1.8801537510506532, "language_loss": 0.81839085, "learning_rate": 7.129004489567014e-07, "loss": 0.84028298, "num_input_tokens_seen": 130664070, "step": 6078, "time_per_iteration": 2.666015863418579 }, { "auxiliary_loss_clip": 0.01169653, "auxiliary_loss_mlp": 0.01024296, "balance_loss_clip": 0.93287343, "balance_loss_mlp": 1.01732194, "epoch": 0.730956532195034, "flos": 10707377840640.0, "grad_norm": 2.2035345742630494, "language_loss": 0.77846062, "learning_rate": 7.123043183778512e-07, "loss": 0.80040008, "num_input_tokens_seen": 130681400, "step": 6079, "time_per_iteration": 3.9749510288238525 }, { "auxiliary_loss_clip": 0.01172862, "auxiliary_loss_mlp": 0.01032955, "balance_loss_clip": 0.93448889, "balance_loss_mlp": 1.02489972, "epoch": 0.731076775085673, "flos": 19791987039360.0, "grad_norm": 1.5698079028688885, "language_loss": 0.65159547, "learning_rate": 7.117083831414114e-07, "loss": 0.67365366, "num_input_tokens_seen": 130700675, "step": 6080, "time_per_iteration": 3.581599235534668 }, { "auxiliary_loss_clip": 0.01166755, "auxiliary_loss_mlp": 0.01025177, "balance_loss_clip": 1.04872704, "balance_loss_mlp": 1.01824474, "epoch": 0.7311970179763122, "flos": 20447033414400.0, "grad_norm": 1.7398388431109248, "language_loss": 0.69398546, "learning_rate": 7.11112643337787e-07, "loss": 0.71590477, "num_input_tokens_seen": 130719720, "step": 6081, "time_per_iteration": 2.612736940383911 }, { "auxiliary_loss_clip": 0.0117136, "auxiliary_loss_mlp": 0.01023722, "balance_loss_clip": 0.97436821, "balance_loss_mlp": 1.01622915, "epoch": 0.7313172608669513, "flos": 18513818501760.0, "grad_norm": 2.519392867465125, "language_loss": 0.76206094, "learning_rate": 7.10517099057349e-07, "loss": 0.78401172, "num_input_tokens_seen": 130736670, "step": 6082, "time_per_iteration": 3.6062991619110107 }, { "auxiliary_loss_clip": 0.01168509, "auxiliary_loss_mlp": 0.01027449, "balance_loss_clip": 0.97123021, "balance_loss_mlp": 1.02011216, "epoch": 0.7314375037575903, "flos": 16180738410240.0, "grad_norm": 2.8364509001636207, "language_loss": 0.61416662, "learning_rate": 7.099217503904411e-07, "loss": 0.63612628, "num_input_tokens_seen": 130754525, "step": 6083, "time_per_iteration": 2.652986526489258 }, { "auxiliary_loss_clip": 0.01171112, "auxiliary_loss_mlp": 0.01022554, "balance_loss_clip": 0.97310889, "balance_loss_mlp": 1.0158242, "epoch": 0.7315577466482295, "flos": 17967940536960.0, "grad_norm": 1.766024051407861, "language_loss": 0.89577323, "learning_rate": 7.093265974273788e-07, "loss": 0.91770983, "num_input_tokens_seen": 130772420, "step": 6084, "time_per_iteration": 2.7179834842681885 }, { "auxiliary_loss_clip": 0.01167358, "auxiliary_loss_mlp": 0.01024061, "balance_loss_clip": 1.00760388, "balance_loss_mlp": 1.01742125, "epoch": 0.7316779895388685, "flos": 18405440190720.0, "grad_norm": 1.6598962564248552, "language_loss": 0.71982104, "learning_rate": 7.087316402584447e-07, "loss": 0.74173522, "num_input_tokens_seen": 130791245, "step": 6085, "time_per_iteration": 2.620810031890869 }, { "auxiliary_loss_clip": 0.01170355, "auxiliary_loss_mlp": 0.01027098, "balance_loss_clip": 1.04984784, "balance_loss_mlp": 1.02014542, "epoch": 0.7317982324295076, "flos": 17928294900480.0, "grad_norm": 1.7259327623831213, "language_loss": 0.86216772, "learning_rate": 7.081368789738953e-07, "loss": 0.88414228, "num_input_tokens_seen": 130808445, "step": 6086, "time_per_iteration": 2.6278812885284424 }, { "auxiliary_loss_clip": 0.01157738, "auxiliary_loss_mlp": 0.01029126, "balance_loss_clip": 0.96734583, "balance_loss_mlp": 1.02217913, "epoch": 0.7319184753201466, "flos": 27229840289280.0, "grad_norm": 3.5639191518689723, "language_loss": 0.77761114, "learning_rate": 7.075423136639537e-07, "loss": 0.79947972, "num_input_tokens_seen": 130827700, "step": 6087, "time_per_iteration": 2.769777297973633 }, { "auxiliary_loss_clip": 0.0115858, "auxiliary_loss_mlp": 0.01025635, "balance_loss_clip": 0.93136919, "balance_loss_mlp": 1.01767206, "epoch": 0.7320387182107858, "flos": 37448544574080.0, "grad_norm": 2.4261504083691308, "language_loss": 0.74822974, "learning_rate": 7.069479444188149e-07, "loss": 0.77007186, "num_input_tokens_seen": 130848290, "step": 6088, "time_per_iteration": 2.809373378753662 }, { "auxiliary_loss_clip": 0.01157433, "auxiliary_loss_mlp": 0.01024994, "balance_loss_clip": 0.96994001, "balance_loss_mlp": 1.01788652, "epoch": 0.7321589611014249, "flos": 17859023521920.0, "grad_norm": 1.9996893360988823, "language_loss": 0.82517308, "learning_rate": 7.063537713286453e-07, "loss": 0.84699732, "num_input_tokens_seen": 130865970, "step": 6089, "time_per_iteration": 2.6016438007354736 }, { "auxiliary_loss_clip": 0.01172013, "auxiliary_loss_mlp": 0.01025112, "balance_loss_clip": 0.97121668, "balance_loss_mlp": 1.01776838, "epoch": 0.7322792039920639, "flos": 26100593539200.0, "grad_norm": 1.7679179105612366, "language_loss": 0.80866003, "learning_rate": 7.057597944835803e-07, "loss": 0.83063126, "num_input_tokens_seen": 130885245, "step": 6090, "time_per_iteration": 2.770411491394043 }, { "auxiliary_loss_clip": 0.01167062, "auxiliary_loss_mlp": 0.01026962, "balance_loss_clip": 0.93016315, "balance_loss_mlp": 1.020051, "epoch": 0.7323994468827031, "flos": 25369093065600.0, "grad_norm": 1.6446660946624412, "language_loss": 0.74573696, "learning_rate": 7.051660139737253e-07, "loss": 0.76767719, "num_input_tokens_seen": 130903465, "step": 6091, "time_per_iteration": 2.66302490234375 }, { "auxiliary_loss_clip": 0.01166867, "auxiliary_loss_mlp": 0.01122497, "balance_loss_clip": 1.01102984, "balance_loss_mlp": 0.0, "epoch": 0.7325196897733421, "flos": 26907075653760.0, "grad_norm": 1.8429998781418155, "language_loss": 0.76520216, "learning_rate": 7.045724298891565e-07, "loss": 0.78809577, "num_input_tokens_seen": 130922935, "step": 6092, "time_per_iteration": 2.6629695892333984 }, { "auxiliary_loss_clip": 0.01167213, "auxiliary_loss_mlp": 0.01024385, "balance_loss_clip": 1.01214838, "balance_loss_mlp": 1.01678574, "epoch": 0.7326399326639812, "flos": 25775781828480.0, "grad_norm": 1.9887281353594088, "language_loss": 0.68976521, "learning_rate": 7.039790423199192e-07, "loss": 0.71168119, "num_input_tokens_seen": 130942575, "step": 6093, "time_per_iteration": 2.7103476524353027 }, { "auxiliary_loss_clip": 0.01170575, "auxiliary_loss_mlp": 0.01025118, "balance_loss_clip": 0.97305834, "balance_loss_mlp": 1.01768577, "epoch": 0.7327601755546204, "flos": 21032269706880.0, "grad_norm": 2.1180710734503307, "language_loss": 0.77719653, "learning_rate": 7.033858513560322e-07, "loss": 0.79915345, "num_input_tokens_seen": 130958870, "step": 6094, "time_per_iteration": 2.63236141204834 }, { "auxiliary_loss_clip": 0.01171703, "auxiliary_loss_mlp": 0.01023445, "balance_loss_clip": 1.01270914, "balance_loss_mlp": 1.01615286, "epoch": 0.7328804184452594, "flos": 16289224462080.0, "grad_norm": 3.9086473368845294, "language_loss": 0.76344866, "learning_rate": 7.027928570874794e-07, "loss": 0.78540015, "num_input_tokens_seen": 130977060, "step": 6095, "time_per_iteration": 2.603902578353882 }, { "auxiliary_loss_clip": 0.01166925, "auxiliary_loss_mlp": 0.01021543, "balance_loss_clip": 1.04825175, "balance_loss_mlp": 1.0147686, "epoch": 0.7330006613358985, "flos": 17858233422720.0, "grad_norm": 1.9107756498221382, "language_loss": 0.85434079, "learning_rate": 7.022000596042194e-07, "loss": 0.87622541, "num_input_tokens_seen": 130994160, "step": 6096, "time_per_iteration": 2.573903799057007 }, { "auxiliary_loss_clip": 0.01162889, "auxiliary_loss_mlp": 0.01023957, "balance_loss_clip": 0.92924237, "balance_loss_mlp": 1.01684022, "epoch": 0.7331209042265376, "flos": 22492074343680.0, "grad_norm": 2.2947012553314177, "language_loss": 0.81654602, "learning_rate": 7.016074589961784e-07, "loss": 0.83841443, "num_input_tokens_seen": 131012725, "step": 6097, "time_per_iteration": 2.709505796432495 }, { "auxiliary_loss_clip": 0.01165686, "auxiliary_loss_mlp": 0.01028735, "balance_loss_clip": 0.97296512, "balance_loss_mlp": 1.02200532, "epoch": 0.7332411471171767, "flos": 33072757937280.0, "grad_norm": 1.6735393558039082, "language_loss": 0.67340082, "learning_rate": 7.01015055353253e-07, "loss": 0.69534504, "num_input_tokens_seen": 131035150, "step": 6098, "time_per_iteration": 2.7725701332092285 }, { "auxiliary_loss_clip": 0.01153215, "auxiliary_loss_mlp": 0.01033993, "balance_loss_clip": 0.8947888, "balance_loss_mlp": 1.02616715, "epoch": 0.7333613900078157, "flos": 22743017735040.0, "grad_norm": 1.669353679742519, "language_loss": 0.78054631, "learning_rate": 7.004228487653123e-07, "loss": 0.80241841, "num_input_tokens_seen": 131055955, "step": 6099, "time_per_iteration": 2.7648751735687256 }, { "auxiliary_loss_clip": 0.01159364, "auxiliary_loss_mlp": 0.0102477, "balance_loss_clip": 0.92848909, "balance_loss_mlp": 1.01757574, "epoch": 0.7334816328984549, "flos": 22346133384960.0, "grad_norm": 1.6451645008632407, "language_loss": 0.78213352, "learning_rate": 6.998308393221906e-07, "loss": 0.80397487, "num_input_tokens_seen": 131074360, "step": 6100, "time_per_iteration": 2.6822903156280518 }, { "auxiliary_loss_clip": 0.01168748, "auxiliary_loss_mlp": 0.01022389, "balance_loss_clip": 0.93546486, "balance_loss_mlp": 1.0155139, "epoch": 0.733601875789094, "flos": 20736149984640.0, "grad_norm": 2.6518496821640767, "language_loss": 0.71019197, "learning_rate": 6.992390271136977e-07, "loss": 0.73210335, "num_input_tokens_seen": 131090070, "step": 6101, "time_per_iteration": 3.6434872150421143 }, { "auxiliary_loss_clip": 0.01159991, "auxiliary_loss_mlp": 0.01024717, "balance_loss_clip": 1.00939655, "balance_loss_mlp": 1.01791942, "epoch": 0.733722118679733, "flos": 22564362464640.0, "grad_norm": 1.9391392617461072, "language_loss": 0.85683423, "learning_rate": 6.986474122296094e-07, "loss": 0.8786813, "num_input_tokens_seen": 131109185, "step": 6102, "time_per_iteration": 2.6505954265594482 }, { "auxiliary_loss_clip": 0.01171677, "auxiliary_loss_mlp": 0.01026119, "balance_loss_clip": 1.04949927, "balance_loss_mlp": 1.01835859, "epoch": 0.7338423615703722, "flos": 20084192179200.0, "grad_norm": 1.856408333513536, "language_loss": 0.72299093, "learning_rate": 6.980559947596751e-07, "loss": 0.74496889, "num_input_tokens_seen": 131127725, "step": 6103, "time_per_iteration": 2.57419490814209 }, { "auxiliary_loss_clip": 0.01162572, "auxiliary_loss_mlp": 0.01023334, "balance_loss_clip": 0.894068, "balance_loss_mlp": 1.01656926, "epoch": 0.7339626044610112, "flos": 21687675217920.0, "grad_norm": 1.9691145741919895, "language_loss": 0.75689489, "learning_rate": 6.974647747936109e-07, "loss": 0.778754, "num_input_tokens_seen": 131146110, "step": 6104, "time_per_iteration": 2.7622077465057373 }, { "auxiliary_loss_clip": 0.01169688, "auxiliary_loss_mlp": 0.01122273, "balance_loss_clip": 1.04949224, "balance_loss_mlp": 0.0, "epoch": 0.7340828473516503, "flos": 15268248282240.0, "grad_norm": 1.8624588089465899, "language_loss": 0.82385445, "learning_rate": 6.968737524211039e-07, "loss": 0.8467741, "num_input_tokens_seen": 131162920, "step": 6105, "time_per_iteration": 3.475412607192993 }, { "auxiliary_loss_clip": 0.01167423, "auxiliary_loss_mlp": 0.01024687, "balance_loss_clip": 1.01092553, "balance_loss_mlp": 1.01731944, "epoch": 0.7342030902422895, "flos": 22930112701440.0, "grad_norm": 2.26607066294627, "language_loss": 0.80081356, "learning_rate": 6.962829277318132e-07, "loss": 0.82273465, "num_input_tokens_seen": 131182515, "step": 6106, "time_per_iteration": 3.5931954383850098 }, { "auxiliary_loss_clip": 0.01168071, "auxiliary_loss_mlp": 0.01027126, "balance_loss_clip": 1.01056671, "balance_loss_mlp": 1.02040219, "epoch": 0.7343233331329285, "flos": 25847890381440.0, "grad_norm": 1.951561728586461, "language_loss": 0.83575368, "learning_rate": 6.956923008153652e-07, "loss": 0.85770571, "num_input_tokens_seen": 131202280, "step": 6107, "time_per_iteration": 2.631028413772583 }, { "auxiliary_loss_clip": 0.01168745, "auxiliary_loss_mlp": 0.01024732, "balance_loss_clip": 1.00935507, "balance_loss_mlp": 1.01782405, "epoch": 0.7344435760235676, "flos": 18478985287680.0, "grad_norm": 2.008962794131081, "language_loss": 0.84596169, "learning_rate": 6.951018717613593e-07, "loss": 0.8678965, "num_input_tokens_seen": 131221295, "step": 6108, "time_per_iteration": 3.5795249938964844 }, { "auxiliary_loss_clip": 0.01166162, "auxiliary_loss_mlp": 0.01026227, "balance_loss_clip": 1.01038003, "balance_loss_mlp": 1.0190115, "epoch": 0.7345638189142067, "flos": 17640040256640.0, "grad_norm": 1.9146542500653199, "language_loss": 0.78307998, "learning_rate": 6.945116406593614e-07, "loss": 0.80500388, "num_input_tokens_seen": 131240150, "step": 6109, "time_per_iteration": 2.6702630519866943 }, { "auxiliary_loss_clip": 0.0116874, "auxiliary_loss_mlp": 0.01022097, "balance_loss_clip": 0.89670789, "balance_loss_mlp": 1.01524282, "epoch": 0.7346840618048458, "flos": 20260225756800.0, "grad_norm": 2.05980419626718, "language_loss": 0.74064565, "learning_rate": 6.939216075989089e-07, "loss": 0.76255405, "num_input_tokens_seen": 131258080, "step": 6110, "time_per_iteration": 2.8144500255584717 }, { "auxiliary_loss_clip": 0.01162771, "auxiliary_loss_mlp": 0.01027063, "balance_loss_clip": 0.97054231, "balance_loss_mlp": 1.02008343, "epoch": 0.7348043046954849, "flos": 29023183641600.0, "grad_norm": 1.5480998905685954, "language_loss": 0.65695053, "learning_rate": 6.933317726695109e-07, "loss": 0.67884886, "num_input_tokens_seen": 131279310, "step": 6111, "time_per_iteration": 2.759544610977173 }, { "auxiliary_loss_clip": 0.01165504, "auxiliary_loss_mlp": 0.01025702, "balance_loss_clip": 0.93464828, "balance_loss_mlp": 1.01882935, "epoch": 0.734924547586124, "flos": 17931203902080.0, "grad_norm": 2.6622261389536024, "language_loss": 0.80020392, "learning_rate": 6.92742135960644e-07, "loss": 0.82211596, "num_input_tokens_seen": 131297010, "step": 6112, "time_per_iteration": 2.77209210395813 }, { "auxiliary_loss_clip": 0.01067276, "auxiliary_loss_mlp": 0.01004825, "balance_loss_clip": 0.9738338, "balance_loss_mlp": 1.00320339, "epoch": 0.7350447904767631, "flos": 63588319850880.0, "grad_norm": 0.8230380357023008, "language_loss": 0.55696952, "learning_rate": 6.921526975617556e-07, "loss": 0.57769048, "num_input_tokens_seen": 131356470, "step": 6113, "time_per_iteration": 3.245089292526245 }, { "auxiliary_loss_clip": 0.0116806, "auxiliary_loss_mlp": 0.01025028, "balance_loss_clip": 0.97000754, "balance_loss_mlp": 1.01792336, "epoch": 0.7351650333674021, "flos": 21580015178880.0, "grad_norm": 1.7739860243954082, "language_loss": 0.75268686, "learning_rate": 6.915634575622631e-07, "loss": 0.77461773, "num_input_tokens_seen": 131374985, "step": 6114, "time_per_iteration": 2.6758265495300293 }, { "auxiliary_loss_clip": 0.01165978, "auxiliary_loss_mlp": 0.01028006, "balance_loss_clip": 1.0478363, "balance_loss_mlp": 1.02111614, "epoch": 0.7352852762580413, "flos": 18186349184640.0, "grad_norm": 1.6832002587222736, "language_loss": 0.70866865, "learning_rate": 6.909744160515532e-07, "loss": 0.73060846, "num_input_tokens_seen": 131393125, "step": 6115, "time_per_iteration": 2.584362268447876 }, { "auxiliary_loss_clip": 0.01162645, "auxiliary_loss_mlp": 0.01027235, "balance_loss_clip": 0.97077096, "balance_loss_mlp": 1.02014506, "epoch": 0.7354055191486804, "flos": 38910073063680.0, "grad_norm": 1.7812504650415037, "language_loss": 0.69391561, "learning_rate": 6.903855731189849e-07, "loss": 0.71581447, "num_input_tokens_seen": 131415760, "step": 6116, "time_per_iteration": 2.9021215438842773 }, { "auxiliary_loss_clip": 0.01176248, "auxiliary_loss_mlp": 0.01020334, "balance_loss_clip": 0.97412455, "balance_loss_mlp": 1.01222181, "epoch": 0.7355257620393194, "flos": 16289978647680.0, "grad_norm": 2.0518955951540114, "language_loss": 0.81634498, "learning_rate": 6.897969288538825e-07, "loss": 0.83831084, "num_input_tokens_seen": 131433705, "step": 6117, "time_per_iteration": 2.719414710998535 }, { "auxiliary_loss_clip": 0.01161489, "auxiliary_loss_mlp": 0.0102916, "balance_loss_clip": 0.97073066, "balance_loss_mlp": 1.02237999, "epoch": 0.7356460049299585, "flos": 18114240631680.0, "grad_norm": 1.6023095489972476, "language_loss": 0.81541294, "learning_rate": 6.892084833455452e-07, "loss": 0.83731949, "num_input_tokens_seen": 131453275, "step": 6118, "time_per_iteration": 2.6929712295532227 }, { "auxiliary_loss_clip": 0.01163338, "auxiliary_loss_mlp": 0.01024215, "balance_loss_clip": 1.00889397, "balance_loss_mlp": 1.01745248, "epoch": 0.7357662478205976, "flos": 21325193118720.0, "grad_norm": 1.414981776699237, "language_loss": 0.83597171, "learning_rate": 6.886202366832384e-07, "loss": 0.85784733, "num_input_tokens_seen": 131474960, "step": 6119, "time_per_iteration": 2.63899564743042 }, { "auxiliary_loss_clip": 0.01165731, "auxiliary_loss_mlp": 0.01025233, "balance_loss_clip": 0.89570057, "balance_loss_mlp": 1.01812828, "epoch": 0.7358864907112367, "flos": 14246841139200.0, "grad_norm": 2.1721276738499276, "language_loss": 0.73747623, "learning_rate": 6.880321889561987e-07, "loss": 0.75938594, "num_input_tokens_seen": 131492935, "step": 6120, "time_per_iteration": 2.72147536277771 }, { "auxiliary_loss_clip": 0.01157185, "auxiliary_loss_mlp": 0.01026484, "balance_loss_clip": 0.93214858, "balance_loss_mlp": 1.01858044, "epoch": 0.7360067336018757, "flos": 22309684058880.0, "grad_norm": 2.7850490025820416, "language_loss": 0.65425915, "learning_rate": 6.874443402536338e-07, "loss": 0.67609584, "num_input_tokens_seen": 131512025, "step": 6121, "time_per_iteration": 2.686843156814575 }, { "auxiliary_loss_clip": 0.01172424, "auxiliary_loss_mlp": 0.01023334, "balance_loss_clip": 0.97402745, "balance_loss_mlp": 1.01578236, "epoch": 0.7361269764925149, "flos": 25554607833600.0, "grad_norm": 1.540805114909941, "language_loss": 0.80414045, "learning_rate": 6.868566906647177e-07, "loss": 0.82609808, "num_input_tokens_seen": 131532975, "step": 6122, "time_per_iteration": 2.7037429809570312 }, { "auxiliary_loss_clip": 0.01168026, "auxiliary_loss_mlp": 0.01029712, "balance_loss_clip": 1.01020074, "balance_loss_mlp": 1.02207077, "epoch": 0.736247219383154, "flos": 20376505059840.0, "grad_norm": 2.2200653230471055, "language_loss": 0.83440137, "learning_rate": 6.862692402785984e-07, "loss": 0.85637873, "num_input_tokens_seen": 131553225, "step": 6123, "time_per_iteration": 2.681079149246216 }, { "auxiliary_loss_clip": 0.0108254, "auxiliary_loss_mlp": 0.01000486, "balance_loss_clip": 0.91460752, "balance_loss_mlp": 0.99872202, "epoch": 0.736367462273793, "flos": 70339525735680.0, "grad_norm": 0.6837157235760004, "language_loss": 0.49621826, "learning_rate": 6.856819891843899e-07, "loss": 0.51704854, "num_input_tokens_seen": 131617930, "step": 6124, "time_per_iteration": 3.428037643432617 }, { "auxiliary_loss_clip": 0.01164551, "auxiliary_loss_mlp": 0.0102228, "balance_loss_clip": 0.85834384, "balance_loss_mlp": 1.01468015, "epoch": 0.7364877051644322, "flos": 22412711243520.0, "grad_norm": 1.869122991605269, "language_loss": 0.71968865, "learning_rate": 6.8509493747118e-07, "loss": 0.741557, "num_input_tokens_seen": 131636740, "step": 6125, "time_per_iteration": 2.8172593116760254 }, { "auxiliary_loss_clip": 0.01171795, "auxiliary_loss_mlp": 0.01027332, "balance_loss_clip": 1.05072665, "balance_loss_mlp": 1.02045059, "epoch": 0.7366079480550712, "flos": 12130266274560.0, "grad_norm": 2.1434488149965705, "language_loss": 0.88239866, "learning_rate": 6.845080852280221e-07, "loss": 0.90438992, "num_input_tokens_seen": 131653810, "step": 6126, "time_per_iteration": 2.995695114135742 }, { "auxiliary_loss_clip": 0.01165406, "auxiliary_loss_mlp": 0.01024633, "balance_loss_clip": 0.93188691, "balance_loss_mlp": 1.01804626, "epoch": 0.7367281909457103, "flos": 15049336844160.0, "grad_norm": 1.6143880129196404, "language_loss": 0.74409837, "learning_rate": 6.839214325439409e-07, "loss": 0.76599872, "num_input_tokens_seen": 131671505, "step": 6127, "time_per_iteration": 3.5115628242492676 }, { "auxiliary_loss_clip": 0.01163749, "auxiliary_loss_mlp": 0.01025482, "balance_loss_clip": 0.97494304, "balance_loss_mlp": 1.01868963, "epoch": 0.7368484338363495, "flos": 23510752053120.0, "grad_norm": 1.8142404141276343, "language_loss": 0.71878338, "learning_rate": 6.833349795079327e-07, "loss": 0.74067575, "num_input_tokens_seen": 131690615, "step": 6128, "time_per_iteration": 2.7625370025634766 }, { "auxiliary_loss_clip": 0.01164959, "auxiliary_loss_mlp": 0.0102609, "balance_loss_clip": 0.93418717, "balance_loss_mlp": 1.01877391, "epoch": 0.7369686767269885, "flos": 27417833095680.0, "grad_norm": 1.7206305437124898, "language_loss": 0.68538928, "learning_rate": 6.827487262089613e-07, "loss": 0.70729971, "num_input_tokens_seen": 131711120, "step": 6129, "time_per_iteration": 2.732185125350952 }, { "auxiliary_loss_clip": 0.01063427, "auxiliary_loss_mlp": 0.01002404, "balance_loss_clip": 0.93563545, "balance_loss_mlp": 1.00071132, "epoch": 0.7370889196176276, "flos": 70293343824000.0, "grad_norm": 0.9596675388848459, "language_loss": 0.56813359, "learning_rate": 6.821626727359606e-07, "loss": 0.58879191, "num_input_tokens_seen": 131776680, "step": 6130, "time_per_iteration": 3.322251796722412 }, { "auxiliary_loss_clip": 0.01168475, "auxiliary_loss_mlp": 0.01028619, "balance_loss_clip": 0.975986, "balance_loss_mlp": 1.0204885, "epoch": 0.7372091625082667, "flos": 18040839189120.0, "grad_norm": 2.230171267996136, "language_loss": 0.77289867, "learning_rate": 6.815768191778348e-07, "loss": 0.79486954, "num_input_tokens_seen": 131794760, "step": 6131, "time_per_iteration": 3.7444915771484375 }, { "auxiliary_loss_clip": 0.01160627, "auxiliary_loss_mlp": 0.01024232, "balance_loss_clip": 1.00795388, "balance_loss_mlp": 1.0171299, "epoch": 0.7373294053989058, "flos": 33726331854720.0, "grad_norm": 1.7217557232560954, "language_loss": 0.72844744, "learning_rate": 6.809911656234569e-07, "loss": 0.75029606, "num_input_tokens_seen": 131816735, "step": 6132, "time_per_iteration": 3.6195614337921143 }, { "auxiliary_loss_clip": 0.01166564, "auxiliary_loss_mlp": 0.01029088, "balance_loss_clip": 0.93152708, "balance_loss_mlp": 1.02197695, "epoch": 0.7374496482895448, "flos": 21506326427520.0, "grad_norm": 2.4822379584571816, "language_loss": 0.78396606, "learning_rate": 6.804057121616707e-07, "loss": 0.80592251, "num_input_tokens_seen": 131834940, "step": 6133, "time_per_iteration": 2.7006921768188477 }, { "auxiliary_loss_clip": 0.01169895, "auxiliary_loss_mlp": 0.01026742, "balance_loss_clip": 1.01015425, "balance_loss_mlp": 1.01905882, "epoch": 0.737569891180184, "flos": 24936908624640.0, "grad_norm": 1.8379066023896298, "language_loss": 0.71785706, "learning_rate": 6.798204588812888e-07, "loss": 0.73982346, "num_input_tokens_seen": 131854355, "step": 6134, "time_per_iteration": 3.575958013534546 }, { "auxiliary_loss_clip": 0.01149313, "auxiliary_loss_mlp": 0.01122555, "balance_loss_clip": 0.85397136, "balance_loss_mlp": 0.0, "epoch": 0.7376901340708231, "flos": 20664544222080.0, "grad_norm": 1.6383942505293292, "language_loss": 0.75755298, "learning_rate": 6.792354058710937e-07, "loss": 0.78027165, "num_input_tokens_seen": 131871825, "step": 6135, "time_per_iteration": 2.815039873123169 }, { "auxiliary_loss_clip": 0.01162385, "auxiliary_loss_mlp": 0.01023549, "balance_loss_clip": 1.04665256, "balance_loss_mlp": 1.0170399, "epoch": 0.7378103769614621, "flos": 23805794367360.0, "grad_norm": 1.8377371701805054, "language_loss": 0.65660334, "learning_rate": 6.786505532198374e-07, "loss": 0.67846268, "num_input_tokens_seen": 131890770, "step": 6136, "time_per_iteration": 2.6383023262023926 }, { "auxiliary_loss_clip": 0.011677, "auxiliary_loss_mlp": 0.0103079, "balance_loss_clip": 1.04746413, "balance_loss_mlp": 1.02282727, "epoch": 0.7379306198521013, "flos": 22237216369920.0, "grad_norm": 1.6286177910948578, "language_loss": 0.85206056, "learning_rate": 6.780659010162411e-07, "loss": 0.87404549, "num_input_tokens_seen": 131909720, "step": 6137, "time_per_iteration": 2.593885660171509 }, { "auxiliary_loss_clip": 0.0117231, "auxiliary_loss_mlp": 0.01024512, "balance_loss_clip": 0.93691242, "balance_loss_mlp": 1.01779509, "epoch": 0.7380508627427403, "flos": 14903108576640.0, "grad_norm": 1.6336485903650038, "language_loss": 0.83221173, "learning_rate": 6.774814493489975e-07, "loss": 0.85417998, "num_input_tokens_seen": 131927395, "step": 6138, "time_per_iteration": 2.718355655670166 }, { "auxiliary_loss_clip": 0.0116235, "auxiliary_loss_mlp": 0.01021622, "balance_loss_clip": 1.00880098, "balance_loss_mlp": 1.01519632, "epoch": 0.7381711056333794, "flos": 21685843624320.0, "grad_norm": 1.6270422691171067, "language_loss": 0.66173309, "learning_rate": 6.768971983067655e-07, "loss": 0.68357283, "num_input_tokens_seen": 131947725, "step": 6139, "time_per_iteration": 2.6259047985076904 }, { "auxiliary_loss_clip": 0.01063324, "auxiliary_loss_mlp": 0.01003477, "balance_loss_clip": 1.01040363, "balance_loss_mlp": 1.00190377, "epoch": 0.7382913485240186, "flos": 52404263596800.0, "grad_norm": 1.21937876737063, "language_loss": 0.67847002, "learning_rate": 6.763131479781772e-07, "loss": 0.69913805, "num_input_tokens_seen": 131997485, "step": 6140, "time_per_iteration": 3.011660099029541 }, { "auxiliary_loss_clip": 0.01158836, "auxiliary_loss_mlp": 0.01022333, "balance_loss_clip": 0.97234547, "balance_loss_mlp": 1.01551425, "epoch": 0.7384115914146576, "flos": 21798818876160.0, "grad_norm": 1.8851762898704747, "language_loss": 0.75876689, "learning_rate": 6.757292984518316e-07, "loss": 0.78057861, "num_input_tokens_seen": 132016885, "step": 6141, "time_per_iteration": 2.690715789794922 }, { "auxiliary_loss_clip": 0.01067776, "auxiliary_loss_mlp": 0.01002058, "balance_loss_clip": 0.97489661, "balance_loss_mlp": 1.00047302, "epoch": 0.7385318343052967, "flos": 61494331662720.0, "grad_norm": 0.7422208074566918, "language_loss": 0.56463599, "learning_rate": 6.751456498162981e-07, "loss": 0.58533442, "num_input_tokens_seen": 132075920, "step": 6142, "time_per_iteration": 3.0959746837615967 }, { "auxiliary_loss_clip": 0.01163368, "auxiliary_loss_mlp": 0.01024483, "balance_loss_clip": 1.0067327, "balance_loss_mlp": 1.0177598, "epoch": 0.7386520771959358, "flos": 17013757697280.0, "grad_norm": 1.8361782769474089, "language_loss": 0.85136926, "learning_rate": 6.745622021601174e-07, "loss": 0.87324774, "num_input_tokens_seen": 132092945, "step": 6143, "time_per_iteration": 2.6695306301116943 }, { "auxiliary_loss_clip": 0.01165834, "auxiliary_loss_mlp": 0.0102266, "balance_loss_clip": 0.93178445, "balance_loss_mlp": 1.0155549, "epoch": 0.7387723200865749, "flos": 18770759464320.0, "grad_norm": 1.9157808523326956, "language_loss": 0.69396603, "learning_rate": 6.739789555717954e-07, "loss": 0.71585101, "num_input_tokens_seen": 132109920, "step": 6144, "time_per_iteration": 2.7274863719940186 }, { "auxiliary_loss_clip": 0.01167719, "auxiliary_loss_mlp": 0.01024118, "balance_loss_clip": 1.04803598, "balance_loss_mlp": 1.01740336, "epoch": 0.738892562977214, "flos": 22525542840960.0, "grad_norm": 2.0568954607560443, "language_loss": 0.77170408, "learning_rate": 6.733959101398124e-07, "loss": 0.79362249, "num_input_tokens_seen": 132128050, "step": 6145, "time_per_iteration": 2.6576554775238037 }, { "auxiliary_loss_clip": 0.01163053, "auxiliary_loss_mlp": 0.01027529, "balance_loss_clip": 0.97049344, "balance_loss_mlp": 1.02082682, "epoch": 0.7390128058678531, "flos": 21501478091520.0, "grad_norm": 2.4124370433303737, "language_loss": 0.81402707, "learning_rate": 6.728130659526143e-07, "loss": 0.83593285, "num_input_tokens_seen": 132145860, "step": 6146, "time_per_iteration": 2.692084550857544 }, { "auxiliary_loss_clip": 0.01168924, "auxiliary_loss_mlp": 0.01023949, "balance_loss_clip": 0.97274852, "balance_loss_mlp": 1.01669538, "epoch": 0.7391330487584922, "flos": 25776176878080.0, "grad_norm": 2.1642840333851705, "language_loss": 0.70782042, "learning_rate": 6.7223042309862e-07, "loss": 0.7297492, "num_input_tokens_seen": 132166060, "step": 6147, "time_per_iteration": 2.7064127922058105 }, { "auxiliary_loss_clip": 0.01162322, "auxiliary_loss_mlp": 0.01020744, "balance_loss_clip": 1.00814641, "balance_loss_mlp": 1.01343989, "epoch": 0.7392532916491312, "flos": 28366736636160.0, "grad_norm": 1.9396684732344072, "language_loss": 0.7402516, "learning_rate": 6.716479816662144e-07, "loss": 0.76208228, "num_input_tokens_seen": 132187790, "step": 6148, "time_per_iteration": 2.682731866836548 }, { "auxiliary_loss_clip": 0.01169014, "auxiliary_loss_mlp": 0.0103073, "balance_loss_clip": 0.97068763, "balance_loss_mlp": 1.02354491, "epoch": 0.7393735345397703, "flos": 23585877348480.0, "grad_norm": 1.853263914336021, "language_loss": 0.73238635, "learning_rate": 6.710657417437531e-07, "loss": 0.7543838, "num_input_tokens_seen": 132207495, "step": 6149, "time_per_iteration": 2.744798421859741 }, { "auxiliary_loss_clip": 0.01166357, "auxiliary_loss_mlp": 0.01027081, "balance_loss_clip": 0.97145832, "balance_loss_mlp": 1.0204227, "epoch": 0.7394937774304094, "flos": 19974772373760.0, "grad_norm": 2.4653408911671355, "language_loss": 0.79966152, "learning_rate": 6.704837034195628e-07, "loss": 0.82159591, "num_input_tokens_seen": 132225960, "step": 6150, "time_per_iteration": 2.642831802368164 }, { "auxiliary_loss_clip": 0.0115842, "auxiliary_loss_mlp": 0.01030314, "balance_loss_clip": 1.00866055, "balance_loss_mlp": 1.02315891, "epoch": 0.7396140203210485, "flos": 23478037741440.0, "grad_norm": 2.1180891724067004, "language_loss": 0.84771603, "learning_rate": 6.699018667819376e-07, "loss": 0.8696034, "num_input_tokens_seen": 132245360, "step": 6151, "time_per_iteration": 2.694859504699707 }, { "auxiliary_loss_clip": 0.01161551, "auxiliary_loss_mlp": 0.01028605, "balance_loss_clip": 1.00866771, "balance_loss_mlp": 1.02067471, "epoch": 0.7397342632116876, "flos": 25555433846400.0, "grad_norm": 1.5340970447865012, "language_loss": 0.72729015, "learning_rate": 6.693202319191415e-07, "loss": 0.74919164, "num_input_tokens_seen": 132267095, "step": 6152, "time_per_iteration": 2.6800050735473633 }, { "auxiliary_loss_clip": 0.01168001, "auxiliary_loss_mlp": 0.01032157, "balance_loss_clip": 1.05068648, "balance_loss_mlp": 1.02515614, "epoch": 0.7398545061023267, "flos": 24755021130240.0, "grad_norm": 1.6936884632718527, "language_loss": 0.74596763, "learning_rate": 6.687387989194084e-07, "loss": 0.76796925, "num_input_tokens_seen": 132286610, "step": 6153, "time_per_iteration": 3.5063555240631104 }, { "auxiliary_loss_clip": 0.01164117, "auxiliary_loss_mlp": 0.01026638, "balance_loss_clip": 0.97361779, "balance_loss_mlp": 1.01970601, "epoch": 0.7399747489929658, "flos": 16508602776960.0, "grad_norm": 1.9232689236137521, "language_loss": 0.79533905, "learning_rate": 6.681575678709404e-07, "loss": 0.81724656, "num_input_tokens_seen": 132305300, "step": 6154, "time_per_iteration": 2.6762590408325195 }, { "auxiliary_loss_clip": 0.01166742, "auxiliary_loss_mlp": 0.01026468, "balance_loss_clip": 1.00993729, "balance_loss_mlp": 1.0195694, "epoch": 0.7400949918836048, "flos": 24097065753600.0, "grad_norm": 2.010851984329852, "language_loss": 0.70516449, "learning_rate": 6.67576538861911e-07, "loss": 0.72709656, "num_input_tokens_seen": 132323875, "step": 6155, "time_per_iteration": 2.610081195831299 }, { "auxiliary_loss_clip": 0.01161885, "auxiliary_loss_mlp": 0.01021065, "balance_loss_clip": 0.9703151, "balance_loss_mlp": 1.01435399, "epoch": 0.740215234774244, "flos": 21802517976960.0, "grad_norm": 1.4802200120002522, "language_loss": 0.82225919, "learning_rate": 6.669957119804612e-07, "loss": 0.84408867, "num_input_tokens_seen": 132345510, "step": 6156, "time_per_iteration": 2.698878288269043 }, { "auxiliary_loss_clip": 0.01172361, "auxiliary_loss_mlp": 0.01019026, "balance_loss_clip": 0.97030413, "balance_loss_mlp": 1.01152444, "epoch": 0.7403354776648831, "flos": 18733196816640.0, "grad_norm": 3.1983074497222344, "language_loss": 0.72324759, "learning_rate": 6.66415087314702e-07, "loss": 0.74516147, "num_input_tokens_seen": 132360465, "step": 6157, "time_per_iteration": 3.5795304775238037 }, { "auxiliary_loss_clip": 0.01165587, "auxiliary_loss_mlp": 0.01025601, "balance_loss_clip": 0.96907341, "balance_loss_mlp": 1.01924443, "epoch": 0.7404557205555221, "flos": 16909581277440.0, "grad_norm": 2.3086744408043325, "language_loss": 0.73002321, "learning_rate": 6.65834664952714e-07, "loss": 0.75193506, "num_input_tokens_seen": 132377915, "step": 6158, "time_per_iteration": 3.4847850799560547 }, { "auxiliary_loss_clip": 0.01163769, "auxiliary_loss_mlp": 0.01021764, "balance_loss_clip": 0.93005741, "balance_loss_mlp": 1.01531446, "epoch": 0.7405759634461613, "flos": 21214408596480.0, "grad_norm": 1.5299940507318315, "language_loss": 0.75844264, "learning_rate": 6.652544449825457e-07, "loss": 0.78029799, "num_input_tokens_seen": 132398170, "step": 6159, "time_per_iteration": 2.728302240371704 }, { "auxiliary_loss_clip": 0.01173595, "auxiliary_loss_mlp": 0.01026595, "balance_loss_clip": 0.97135437, "balance_loss_mlp": 1.01952004, "epoch": 0.7406962063368003, "flos": 20480106862080.0, "grad_norm": 1.6278239716917, "language_loss": 0.76379788, "learning_rate": 6.646744274922182e-07, "loss": 0.7857998, "num_input_tokens_seen": 132416615, "step": 6160, "time_per_iteration": 3.5740838050842285 }, { "auxiliary_loss_clip": 0.0116427, "auxiliary_loss_mlp": 0.01027522, "balance_loss_clip": 0.96895826, "balance_loss_mlp": 1.02039671, "epoch": 0.7408164492274394, "flos": 19791915212160.0, "grad_norm": 9.138084926426746, "language_loss": 0.75875431, "learning_rate": 6.640946125697171e-07, "loss": 0.78067225, "num_input_tokens_seen": 132434145, "step": 6161, "time_per_iteration": 2.6749520301818848 }, { "auxiliary_loss_clip": 0.01167717, "auxiliary_loss_mlp": 0.01025936, "balance_loss_clip": 1.00795841, "balance_loss_mlp": 1.01851833, "epoch": 0.7409366921180786, "flos": 29204855654400.0, "grad_norm": 2.095903663586103, "language_loss": 0.75387985, "learning_rate": 6.635150003030017e-07, "loss": 0.77581638, "num_input_tokens_seen": 132452670, "step": 6162, "time_per_iteration": 2.672379493713379 }, { "auxiliary_loss_clip": 0.01163152, "auxiliary_loss_mlp": 0.01027034, "balance_loss_clip": 0.89209306, "balance_loss_mlp": 1.0197922, "epoch": 0.7410569350087176, "flos": 22930004960640.0, "grad_norm": 2.345147455211459, "language_loss": 0.85794044, "learning_rate": 6.629355907799981e-07, "loss": 0.87984228, "num_input_tokens_seen": 132472475, "step": 6163, "time_per_iteration": 2.7379651069641113 }, { "auxiliary_loss_clip": 0.01168667, "auxiliary_loss_mlp": 0.01028737, "balance_loss_clip": 1.00909352, "balance_loss_mlp": 1.02122068, "epoch": 0.7411771778993567, "flos": 30440397726720.0, "grad_norm": 1.9092164255475486, "language_loss": 0.68976581, "learning_rate": 6.623563840886015e-07, "loss": 0.7117399, "num_input_tokens_seen": 132493400, "step": 6164, "time_per_iteration": 2.693866014480591 }, { "auxiliary_loss_clip": 0.01160351, "auxiliary_loss_mlp": 0.010239, "balance_loss_clip": 1.0069294, "balance_loss_mlp": 1.0173974, "epoch": 0.7412974207899958, "flos": 20522050968960.0, "grad_norm": 1.579703520937219, "language_loss": 0.69695044, "learning_rate": 6.617773803166795e-07, "loss": 0.71879292, "num_input_tokens_seen": 132511725, "step": 6165, "time_per_iteration": 2.65807843208313 }, { "auxiliary_loss_clip": 0.01169583, "auxiliary_loss_mlp": 0.01123116, "balance_loss_clip": 0.9724232, "balance_loss_mlp": 0.0, "epoch": 0.7414176636806349, "flos": 22090700793600.0, "grad_norm": 3.2331822222322684, "language_loss": 0.82194078, "learning_rate": 6.611985795520634e-07, "loss": 0.84486777, "num_input_tokens_seen": 132530270, "step": 6166, "time_per_iteration": 2.6998610496520996 }, { "auxiliary_loss_clip": 0.01175457, "auxiliary_loss_mlp": 0.01028641, "balance_loss_clip": 0.93493885, "balance_loss_mlp": 1.02085102, "epoch": 0.7415379065712739, "flos": 25155245445120.0, "grad_norm": 2.5333383256226725, "language_loss": 0.77645075, "learning_rate": 6.606199818825588e-07, "loss": 0.79849178, "num_input_tokens_seen": 132550725, "step": 6167, "time_per_iteration": 2.7152202129364014 }, { "auxiliary_loss_clip": 0.01167928, "auxiliary_loss_mlp": 0.01030931, "balance_loss_clip": 0.96816015, "balance_loss_mlp": 1.02370667, "epoch": 0.7416581494619131, "flos": 16871731320960.0, "grad_norm": 1.8868751664829704, "language_loss": 0.81698221, "learning_rate": 6.600415873959377e-07, "loss": 0.83897078, "num_input_tokens_seen": 132568600, "step": 6168, "time_per_iteration": 2.7446658611297607 }, { "auxiliary_loss_clip": 0.01161924, "auxiliary_loss_mlp": 0.01122083, "balance_loss_clip": 0.85581923, "balance_loss_mlp": 0.0, "epoch": 0.7417783923525522, "flos": 28438881102720.0, "grad_norm": 1.9364528251360784, "language_loss": 0.64587569, "learning_rate": 6.594633961799437e-07, "loss": 0.66871578, "num_input_tokens_seen": 132587640, "step": 6169, "time_per_iteration": 2.828670024871826 }, { "auxiliary_loss_clip": 0.0117289, "auxiliary_loss_mlp": 0.01026581, "balance_loss_clip": 0.93401462, "balance_loss_mlp": 1.01927686, "epoch": 0.7418986352431912, "flos": 20084299920000.0, "grad_norm": 1.547993456622944, "language_loss": 0.8146143, "learning_rate": 6.588854083222857e-07, "loss": 0.83660901, "num_input_tokens_seen": 132607075, "step": 6170, "time_per_iteration": 2.7291746139526367 }, { "auxiliary_loss_clip": 0.01170117, "auxiliary_loss_mlp": 0.01024571, "balance_loss_clip": 0.97216326, "balance_loss_mlp": 1.01695323, "epoch": 0.7420188781338304, "flos": 18259571059200.0, "grad_norm": 2.2248979435499234, "language_loss": 0.80648279, "learning_rate": 6.583076239106444e-07, "loss": 0.82842964, "num_input_tokens_seen": 132625580, "step": 6171, "time_per_iteration": 2.748835802078247 }, { "auxiliary_loss_clip": 0.0117129, "auxiliary_loss_mlp": 0.01022401, "balance_loss_clip": 0.97113931, "balance_loss_mlp": 1.01491427, "epoch": 0.7421391210244694, "flos": 13771994319360.0, "grad_norm": 6.576434311991205, "language_loss": 0.7516712, "learning_rate": 6.577300430326707e-07, "loss": 0.77360809, "num_input_tokens_seen": 132640525, "step": 6172, "time_per_iteration": 2.7035574913024902 }, { "auxiliary_loss_clip": 0.01161866, "auxiliary_loss_mlp": 0.0102204, "balance_loss_clip": 0.93378186, "balance_loss_mlp": 1.01503658, "epoch": 0.7422593639151085, "flos": 15961683317760.0, "grad_norm": 2.306189732970349, "language_loss": 0.72167337, "learning_rate": 6.571526657759821e-07, "loss": 0.74351245, "num_input_tokens_seen": 132656265, "step": 6173, "time_per_iteration": 2.6859192848205566 }, { "auxiliary_loss_clip": 0.01157555, "auxiliary_loss_mlp": 0.01023401, "balance_loss_clip": 1.00565004, "balance_loss_mlp": 1.01659143, "epoch": 0.7423796068057477, "flos": 30114400867200.0, "grad_norm": 1.5737589753669832, "language_loss": 0.71080434, "learning_rate": 6.565754922281663e-07, "loss": 0.73261392, "num_input_tokens_seen": 132678510, "step": 6174, "time_per_iteration": 2.7008285522460938 }, { "auxiliary_loss_clip": 0.01161109, "auxiliary_loss_mlp": 0.01023036, "balance_loss_clip": 0.96811724, "balance_loss_mlp": 1.01579404, "epoch": 0.7424998496963867, "flos": 20521907314560.0, "grad_norm": 1.7872178149622389, "language_loss": 0.78243166, "learning_rate": 6.559985224767801e-07, "loss": 0.80427307, "num_input_tokens_seen": 132696385, "step": 6175, "time_per_iteration": 2.6156487464904785 }, { "auxiliary_loss_clip": 0.011713, "auxiliary_loss_mlp": 0.01023939, "balance_loss_clip": 0.93432033, "balance_loss_mlp": 1.01705432, "epoch": 0.7426200925870258, "flos": 21871573873920.0, "grad_norm": 2.442105850186843, "language_loss": 0.75531763, "learning_rate": 6.55421756609349e-07, "loss": 0.77726996, "num_input_tokens_seen": 132714640, "step": 6176, "time_per_iteration": 2.7446305751800537 }, { "auxiliary_loss_clip": 0.01167331, "auxiliary_loss_mlp": 0.01029933, "balance_loss_clip": 1.0119884, "balance_loss_mlp": 1.02239048, "epoch": 0.7427403354776649, "flos": 26432049265920.0, "grad_norm": 1.7694075115171133, "language_loss": 0.78568983, "learning_rate": 6.54845194713369e-07, "loss": 0.80766249, "num_input_tokens_seen": 132735590, "step": 6177, "time_per_iteration": 2.688480854034424 }, { "auxiliary_loss_clip": 0.01160754, "auxiliary_loss_mlp": 0.01027364, "balance_loss_clip": 1.00915575, "balance_loss_mlp": 1.02037215, "epoch": 0.742860578368304, "flos": 19898390102400.0, "grad_norm": 1.929614902855729, "language_loss": 0.79737502, "learning_rate": 6.542688368763034e-07, "loss": 0.81925625, "num_input_tokens_seen": 132753995, "step": 6178, "time_per_iteration": 2.628587007522583 }, { "auxiliary_loss_clip": 0.01163387, "auxiliary_loss_mlp": 0.01024757, "balance_loss_clip": 1.01020133, "balance_loss_mlp": 1.01797414, "epoch": 0.742980821258943, "flos": 24827201510400.0, "grad_norm": 1.583104105760063, "language_loss": 0.7695998, "learning_rate": 6.536926831855854e-07, "loss": 0.79148126, "num_input_tokens_seen": 132773160, "step": 6179, "time_per_iteration": 3.557494878768921 }, { "auxiliary_loss_clip": 0.01161582, "auxiliary_loss_mlp": 0.01024224, "balance_loss_clip": 0.97200125, "balance_loss_mlp": 1.01744699, "epoch": 0.7431010641495821, "flos": 25228646887680.0, "grad_norm": 2.390093888177149, "language_loss": 0.73339248, "learning_rate": 6.531167337286165e-07, "loss": 0.75525045, "num_input_tokens_seen": 132793180, "step": 6180, "time_per_iteration": 2.6819956302642822 }, { "auxiliary_loss_clip": 0.0116332, "auxiliary_loss_mlp": 0.01022403, "balance_loss_clip": 0.971434, "balance_loss_mlp": 1.01599872, "epoch": 0.7432213070402213, "flos": 21762369550080.0, "grad_norm": 1.393954668526962, "language_loss": 0.79511148, "learning_rate": 6.52540988592768e-07, "loss": 0.8169688, "num_input_tokens_seen": 132814200, "step": 6181, "time_per_iteration": 2.686934471130371 }, { "auxiliary_loss_clip": 0.01167504, "auxiliary_loss_mlp": 0.01025832, "balance_loss_clip": 0.97096717, "balance_loss_mlp": 1.01903105, "epoch": 0.7433415499308603, "flos": 14793832425600.0, "grad_norm": 1.9730534198418257, "language_loss": 0.8324827, "learning_rate": 6.519654478653814e-07, "loss": 0.85441601, "num_input_tokens_seen": 132832565, "step": 6182, "time_per_iteration": 2.6666321754455566 }, { "auxiliary_loss_clip": 0.01072282, "auxiliary_loss_mlp": 0.01000518, "balance_loss_clip": 0.93748438, "balance_loss_mlp": 0.99870569, "epoch": 0.7434617928214994, "flos": 67155577297920.0, "grad_norm": 0.7411716958981042, "language_loss": 0.56125093, "learning_rate": 6.51390111633763e-07, "loss": 0.58197892, "num_input_tokens_seen": 132897840, "step": 6183, "time_per_iteration": 4.170970916748047 }, { "auxiliary_loss_clip": 0.01161055, "auxiliary_loss_mlp": 0.0102264, "balance_loss_clip": 0.85514885, "balance_loss_mlp": 1.01570511, "epoch": 0.7435820357121385, "flos": 27377576928000.0, "grad_norm": 3.0958321074483557, "language_loss": 0.76188886, "learning_rate": 6.508149799851932e-07, "loss": 0.78372586, "num_input_tokens_seen": 132919505, "step": 6184, "time_per_iteration": 2.8347702026367188 }, { "auxiliary_loss_clip": 0.01162441, "auxiliary_loss_mlp": 0.01025496, "balance_loss_clip": 0.97118378, "balance_loss_mlp": 1.01846826, "epoch": 0.7437022786027776, "flos": 23987645948160.0, "grad_norm": 1.7800399936595284, "language_loss": 0.61192757, "learning_rate": 6.502400530069183e-07, "loss": 0.63380694, "num_input_tokens_seen": 132939390, "step": 6185, "time_per_iteration": 3.630631685256958 }, { "auxiliary_loss_clip": 0.01164667, "auxiliary_loss_mlp": 0.01028459, "balance_loss_clip": 0.93299437, "balance_loss_mlp": 1.0205884, "epoch": 0.7438225214934167, "flos": 21866761451520.0, "grad_norm": 1.6964526386881988, "language_loss": 0.68297446, "learning_rate": 6.496653307861535e-07, "loss": 0.70490569, "num_input_tokens_seen": 132960060, "step": 6186, "time_per_iteration": 2.718451976776123 }, { "auxiliary_loss_clip": 0.01171601, "auxiliary_loss_mlp": 0.01028565, "balance_loss_clip": 1.00860274, "balance_loss_mlp": 1.02146578, "epoch": 0.7439427643840558, "flos": 20230097224320.0, "grad_norm": 1.7353813465537344, "language_loss": 0.6574645, "learning_rate": 6.490908134100857e-07, "loss": 0.67946613, "num_input_tokens_seen": 132978525, "step": 6187, "time_per_iteration": 2.6427536010742188 }, { "auxiliary_loss_clip": 0.01170381, "auxiliary_loss_mlp": 0.01021023, "balance_loss_clip": 1.00899541, "balance_loss_mlp": 1.01412106, "epoch": 0.7440630072746949, "flos": 20849915335680.0, "grad_norm": 2.073061422444826, "language_loss": 0.69489336, "learning_rate": 6.48516500965866e-07, "loss": 0.71680743, "num_input_tokens_seen": 132998460, "step": 6188, "time_per_iteration": 2.6187357902526855 }, { "auxiliary_loss_clip": 0.01170571, "auxiliary_loss_mlp": 0.01028487, "balance_loss_clip": 1.00848472, "balance_loss_mlp": 1.02126288, "epoch": 0.7441832501653339, "flos": 26503762769280.0, "grad_norm": 2.0622248769270044, "language_loss": 0.81465399, "learning_rate": 6.479423935406192e-07, "loss": 0.83664453, "num_input_tokens_seen": 133018445, "step": 6189, "time_per_iteration": 2.6951143741607666 }, { "auxiliary_loss_clip": 0.01061691, "auxiliary_loss_mlp": 0.01005176, "balance_loss_clip": 0.93614584, "balance_loss_mlp": 1.00367379, "epoch": 0.7443034930559731, "flos": 68602848088320.0, "grad_norm": 0.820956838326652, "language_loss": 0.62007618, "learning_rate": 6.473684912214357e-07, "loss": 0.64074486, "num_input_tokens_seen": 133082005, "step": 6190, "time_per_iteration": 3.326580286026001 }, { "auxiliary_loss_clip": 0.01167118, "auxiliary_loss_mlp": 0.01025134, "balance_loss_clip": 1.01094913, "balance_loss_mlp": 1.01807714, "epoch": 0.7444237359466122, "flos": 18654982951680.0, "grad_norm": 2.0315784629572753, "language_loss": 0.69702351, "learning_rate": 6.467947940953778e-07, "loss": 0.71894604, "num_input_tokens_seen": 133100530, "step": 6191, "time_per_iteration": 2.6353323459625244 }, { "auxiliary_loss_clip": 0.01163067, "auxiliary_loss_mlp": 0.01023718, "balance_loss_clip": 0.97034329, "balance_loss_mlp": 1.01720941, "epoch": 0.7445439788372512, "flos": 22817604326400.0, "grad_norm": 1.8352351794439536, "language_loss": 0.72477031, "learning_rate": 6.462213022494732e-07, "loss": 0.74663818, "num_input_tokens_seen": 133119775, "step": 6192, "time_per_iteration": 2.6558969020843506 }, { "auxiliary_loss_clip": 0.01068188, "auxiliary_loss_mlp": 0.01001167, "balance_loss_clip": 0.97388166, "balance_loss_mlp": 0.99955809, "epoch": 0.7446642217278904, "flos": 67045690615680.0, "grad_norm": 0.7795546180626605, "language_loss": 0.61085325, "learning_rate": 6.456480157707201e-07, "loss": 0.6315468, "num_input_tokens_seen": 133184550, "step": 6193, "time_per_iteration": 3.156362533569336 }, { "auxiliary_loss_clip": 0.01153814, "auxiliary_loss_mlp": 0.01022993, "balance_loss_clip": 0.93021196, "balance_loss_mlp": 1.01589739, "epoch": 0.7447844646185294, "flos": 17417465631360.0, "grad_norm": 1.8170139052768077, "language_loss": 0.84814858, "learning_rate": 6.450749347460866e-07, "loss": 0.86991668, "num_input_tokens_seen": 133201525, "step": 6194, "time_per_iteration": 2.7334983348846436 }, { "auxiliary_loss_clip": 0.01169743, "auxiliary_loss_mlp": 0.01025201, "balance_loss_clip": 1.04903948, "balance_loss_mlp": 1.01765776, "epoch": 0.7449047075091685, "flos": 26615876094720.0, "grad_norm": 1.7387485494983403, "language_loss": 0.78921723, "learning_rate": 6.445020592625083e-07, "loss": 0.8111667, "num_input_tokens_seen": 133222175, "step": 6195, "time_per_iteration": 2.6530768871307373 }, { "auxiliary_loss_clip": 0.01165416, "auxiliary_loss_mlp": 0.01025831, "balance_loss_clip": 1.04711974, "balance_loss_mlp": 1.01905417, "epoch": 0.7450249503998077, "flos": 14170458867840.0, "grad_norm": 2.465520945436477, "language_loss": 0.79990935, "learning_rate": 6.4392938940689e-07, "loss": 0.82182181, "num_input_tokens_seen": 133237590, "step": 6196, "time_per_iteration": 2.5989184379577637 }, { "auxiliary_loss_clip": 0.01156929, "auxiliary_loss_mlp": 0.01122797, "balance_loss_clip": 0.89354134, "balance_loss_mlp": 0.0, "epoch": 0.7451451932904467, "flos": 19606687752960.0, "grad_norm": 2.52534927573421, "language_loss": 0.71548325, "learning_rate": 6.433569252661049e-07, "loss": 0.73828053, "num_input_tokens_seen": 133255590, "step": 6197, "time_per_iteration": 2.7455480098724365 }, { "auxiliary_loss_clip": 0.01155453, "auxiliary_loss_mlp": 0.01025509, "balance_loss_clip": 0.93091583, "balance_loss_mlp": 1.01922929, "epoch": 0.7452654361810858, "flos": 12495405980160.0, "grad_norm": 1.8428465720108593, "language_loss": 0.71337295, "learning_rate": 6.427846669269952e-07, "loss": 0.73518258, "num_input_tokens_seen": 133273210, "step": 6198, "time_per_iteration": 2.697756290435791 }, { "auxiliary_loss_clip": 0.01171151, "auxiliary_loss_mlp": 0.0102644, "balance_loss_clip": 1.05121911, "balance_loss_mlp": 1.01980352, "epoch": 0.7453856790717249, "flos": 22127329687680.0, "grad_norm": 1.9679518759197403, "language_loss": 0.82279021, "learning_rate": 6.422126144763729e-07, "loss": 0.84476608, "num_input_tokens_seen": 133292600, "step": 6199, "time_per_iteration": 2.6269376277923584 }, { "auxiliary_loss_clip": 0.01159608, "auxiliary_loss_mlp": 0.01122889, "balance_loss_clip": 0.92986596, "balance_loss_mlp": 0.0, "epoch": 0.745505921962364, "flos": 20010682995840.0, "grad_norm": 2.18788593387655, "language_loss": 0.76658511, "learning_rate": 6.416407680010174e-07, "loss": 0.78941011, "num_input_tokens_seen": 133306960, "step": 6200, "time_per_iteration": 2.7823903560638428 }, { "auxiliary_loss_clip": 0.01179434, "auxiliary_loss_mlp": 0.01029022, "balance_loss_clip": 0.89691222, "balance_loss_mlp": 1.02175021, "epoch": 0.745626164853003, "flos": 24677884673280.0, "grad_norm": 2.031269667282568, "language_loss": 0.81127858, "learning_rate": 6.410691275876774e-07, "loss": 0.83336306, "num_input_tokens_seen": 133326380, "step": 6201, "time_per_iteration": 2.7369227409362793 }, { "auxiliary_loss_clip": 0.01175532, "auxiliary_loss_mlp": 0.01029268, "balance_loss_clip": 0.97309375, "balance_loss_mlp": 1.02176952, "epoch": 0.7457464077436422, "flos": 14538830797440.0, "grad_norm": 2.6429454255930698, "language_loss": 0.76447153, "learning_rate": 6.404976933230704e-07, "loss": 0.78651953, "num_input_tokens_seen": 133342900, "step": 6202, "time_per_iteration": 2.6901795864105225 }, { "auxiliary_loss_clip": 0.01169845, "auxiliary_loss_mlp": 0.01025015, "balance_loss_clip": 0.97130704, "balance_loss_mlp": 1.01771367, "epoch": 0.7458666506342813, "flos": 34021194600960.0, "grad_norm": 1.9984939830232356, "language_loss": 0.72444832, "learning_rate": 6.399264652938813e-07, "loss": 0.7463969, "num_input_tokens_seen": 133363805, "step": 6203, "time_per_iteration": 2.7669506072998047 }, { "auxiliary_loss_clip": 0.01163109, "auxiliary_loss_mlp": 0.0102877, "balance_loss_clip": 0.97072452, "balance_loss_mlp": 1.02188563, "epoch": 0.7459868935249203, "flos": 24279025075200.0, "grad_norm": 1.8062776195677566, "language_loss": 0.74932498, "learning_rate": 6.393554435867679e-07, "loss": 0.77124369, "num_input_tokens_seen": 133384655, "step": 6204, "time_per_iteration": 2.723005533218384 }, { "auxiliary_loss_clip": 0.01157873, "auxiliary_loss_mlp": 0.01023037, "balance_loss_clip": 0.93037558, "balance_loss_mlp": 1.01563394, "epoch": 0.7461071364155595, "flos": 21908777385600.0, "grad_norm": 2.0988520743212282, "language_loss": 0.83620346, "learning_rate": 6.387846282883502e-07, "loss": 0.85801256, "num_input_tokens_seen": 133401185, "step": 6205, "time_per_iteration": 3.668250322341919 }, { "auxiliary_loss_clip": 0.01168025, "auxiliary_loss_mlp": 0.01021842, "balance_loss_clip": 1.04880643, "balance_loss_mlp": 1.01502001, "epoch": 0.7462273793061985, "flos": 22889712879360.0, "grad_norm": 1.9398632161052807, "language_loss": 0.76821017, "learning_rate": 6.38214019485223e-07, "loss": 0.7901088, "num_input_tokens_seen": 133420010, "step": 6206, "time_per_iteration": 2.5816712379455566 }, { "auxiliary_loss_clip": 0.01154335, "auxiliary_loss_mlp": 0.01025157, "balance_loss_clip": 0.85330981, "balance_loss_mlp": 1.0177573, "epoch": 0.7463476221968376, "flos": 19968451580160.0, "grad_norm": 1.8208081024151939, "language_loss": 0.71439797, "learning_rate": 6.376436172639461e-07, "loss": 0.73619294, "num_input_tokens_seen": 133437855, "step": 6207, "time_per_iteration": 2.8015496730804443 }, { "auxiliary_loss_clip": 0.01167513, "auxiliary_loss_mlp": 0.01029335, "balance_loss_clip": 0.81934023, "balance_loss_mlp": 1.02175951, "epoch": 0.7464678650874768, "flos": 16836610798080.0, "grad_norm": 1.998100647694979, "language_loss": 0.64877552, "learning_rate": 6.370734217110487e-07, "loss": 0.670744, "num_input_tokens_seen": 133456600, "step": 6208, "time_per_iteration": 2.7681431770324707 }, { "auxiliary_loss_clip": 0.01174565, "auxiliary_loss_mlp": 0.01028505, "balance_loss_clip": 0.97725439, "balance_loss_mlp": 1.02042818, "epoch": 0.7465881079781158, "flos": 48100869843840.0, "grad_norm": 1.5260398790855778, "language_loss": 0.64301598, "learning_rate": 6.36503432913031e-07, "loss": 0.66504669, "num_input_tokens_seen": 133479745, "step": 6209, "time_per_iteration": 3.865255117416382 }, { "auxiliary_loss_clip": 0.01167636, "auxiliary_loss_mlp": 0.01025484, "balance_loss_clip": 1.01027632, "balance_loss_mlp": 1.01820004, "epoch": 0.7467083508687549, "flos": 19677359761920.0, "grad_norm": 2.7486408435464105, "language_loss": 0.69040704, "learning_rate": 6.359336509563569e-07, "loss": 0.71233821, "num_input_tokens_seen": 133495765, "step": 6210, "time_per_iteration": 2.6618881225585938 }, { "auxiliary_loss_clip": 0.01152618, "auxiliary_loss_mlp": 0.01027735, "balance_loss_clip": 0.93272954, "balance_loss_mlp": 1.02027822, "epoch": 0.7468285937593939, "flos": 17895436934400.0, "grad_norm": 1.6985888299093053, "language_loss": 0.80291706, "learning_rate": 6.353640759274641e-07, "loss": 0.82472062, "num_input_tokens_seen": 133514655, "step": 6211, "time_per_iteration": 3.6050167083740234 }, { "auxiliary_loss_clip": 0.0116247, "auxiliary_loss_mlp": 0.01022999, "balance_loss_clip": 1.00658488, "balance_loss_mlp": 1.01592422, "epoch": 0.7469488366500331, "flos": 23141446369920.0, "grad_norm": 3.9578539963081094, "language_loss": 0.74832761, "learning_rate": 6.347947079127556e-07, "loss": 0.77018225, "num_input_tokens_seen": 133532555, "step": 6212, "time_per_iteration": 2.6884853839874268 }, { "auxiliary_loss_clip": 0.01159187, "auxiliary_loss_mlp": 0.01028703, "balance_loss_clip": 0.97060877, "balance_loss_mlp": 1.02157116, "epoch": 0.7470690795406721, "flos": 16690849407360.0, "grad_norm": 2.1263267032403146, "language_loss": 0.76973987, "learning_rate": 6.342255469986053e-07, "loss": 0.79161882, "num_input_tokens_seen": 133551300, "step": 6213, "time_per_iteration": 2.668391704559326 }, { "auxiliary_loss_clip": 0.01167288, "auxiliary_loss_mlp": 0.01023631, "balance_loss_clip": 1.04858565, "balance_loss_mlp": 1.01683652, "epoch": 0.7471893224313112, "flos": 25192700352000.0, "grad_norm": 2.122286649202501, "language_loss": 0.76322603, "learning_rate": 6.336565932713533e-07, "loss": 0.78513527, "num_input_tokens_seen": 133570725, "step": 6214, "time_per_iteration": 2.697758197784424 }, { "auxiliary_loss_clip": 0.01169164, "auxiliary_loss_mlp": 0.0102592, "balance_loss_clip": 0.97593999, "balance_loss_mlp": 1.01845157, "epoch": 0.7473095653219504, "flos": 22526225199360.0, "grad_norm": 1.8647662892298509, "language_loss": 0.77482504, "learning_rate": 6.330878468173088e-07, "loss": 0.79677588, "num_input_tokens_seen": 133590790, "step": 6215, "time_per_iteration": 2.686018466949463 }, { "auxiliary_loss_clip": 0.01158882, "auxiliary_loss_mlp": 0.01025095, "balance_loss_clip": 1.00829852, "balance_loss_mlp": 1.01817179, "epoch": 0.7474298082125894, "flos": 18113989236480.0, "grad_norm": 1.536989146752822, "language_loss": 0.73175621, "learning_rate": 6.32519307722752e-07, "loss": 0.75359595, "num_input_tokens_seen": 133608685, "step": 6216, "time_per_iteration": 2.70660662651062 }, { "auxiliary_loss_clip": 0.01082657, "auxiliary_loss_mlp": 0.01000842, "balance_loss_clip": 0.91338211, "balance_loss_mlp": 0.99902958, "epoch": 0.7475500511032285, "flos": 62086535193600.0, "grad_norm": 0.8340547015158464, "language_loss": 0.55015856, "learning_rate": 6.31950976073929e-07, "loss": 0.57099354, "num_input_tokens_seen": 133662775, "step": 6217, "time_per_iteration": 3.2129435539245605 }, { "auxiliary_loss_clip": 0.01162205, "auxiliary_loss_mlp": 0.01030174, "balance_loss_clip": 0.8965385, "balance_loss_mlp": 1.02288175, "epoch": 0.7476702939938676, "flos": 17785586165760.0, "grad_norm": 2.2155370838604886, "language_loss": 0.80981565, "learning_rate": 6.31382851957055e-07, "loss": 0.83173943, "num_input_tokens_seen": 133679595, "step": 6218, "time_per_iteration": 2.695000410079956 }, { "auxiliary_loss_clip": 0.01161196, "auxiliary_loss_mlp": 0.01122178, "balance_loss_clip": 0.93440074, "balance_loss_mlp": 0.0, "epoch": 0.7477905368845067, "flos": 27927944092800.0, "grad_norm": 2.0095648123742302, "language_loss": 0.71525902, "learning_rate": 6.308149354583143e-07, "loss": 0.73809278, "num_input_tokens_seen": 133699000, "step": 6219, "time_per_iteration": 2.774130344390869 }, { "auxiliary_loss_clip": 0.01172107, "auxiliary_loss_mlp": 0.01027607, "balance_loss_clip": 1.01128829, "balance_loss_mlp": 1.02022469, "epoch": 0.7479107797751458, "flos": 26870374932480.0, "grad_norm": 1.7777227356774268, "language_loss": 0.81793553, "learning_rate": 6.302472266638586e-07, "loss": 0.83993268, "num_input_tokens_seen": 133719540, "step": 6220, "time_per_iteration": 2.6635100841522217 }, { "auxiliary_loss_clip": 0.01176584, "auxiliary_loss_mlp": 0.01035369, "balance_loss_clip": 1.05149722, "balance_loss_mlp": 1.02744722, "epoch": 0.7480310226657849, "flos": 33943375785600.0, "grad_norm": 2.2488770822622866, "language_loss": 0.70033896, "learning_rate": 6.296797256598101e-07, "loss": 0.72245848, "num_input_tokens_seen": 133741020, "step": 6221, "time_per_iteration": 2.7339258193969727 }, { "auxiliary_loss_clip": 0.01154532, "auxiliary_loss_mlp": 0.01030434, "balance_loss_clip": 0.93122947, "balance_loss_mlp": 1.02314115, "epoch": 0.748151265556424, "flos": 24826555065600.0, "grad_norm": 1.6966805971671244, "language_loss": 0.81552249, "learning_rate": 6.291124325322576e-07, "loss": 0.83737212, "num_input_tokens_seen": 133761145, "step": 6222, "time_per_iteration": 2.691976547241211 }, { "auxiliary_loss_clip": 0.0117242, "auxiliary_loss_mlp": 0.01029175, "balance_loss_clip": 0.97352171, "balance_loss_mlp": 1.02251744, "epoch": 0.748271508447063, "flos": 38399351535360.0, "grad_norm": 1.6600458766294686, "language_loss": 0.6250807, "learning_rate": 6.285453473672595e-07, "loss": 0.64709669, "num_input_tokens_seen": 133783715, "step": 6223, "time_per_iteration": 2.793504476547241 }, { "auxiliary_loss_clip": 0.01167164, "auxiliary_loss_mlp": 0.01021576, "balance_loss_clip": 1.04716372, "balance_loss_mlp": 1.0146296, "epoch": 0.7483917513377022, "flos": 21541842000000.0, "grad_norm": 2.04525475400459, "language_loss": 0.75137782, "learning_rate": 6.279784702508415e-07, "loss": 0.77326524, "num_input_tokens_seen": 133804465, "step": 6224, "time_per_iteration": 2.615560293197632 }, { "auxiliary_loss_clip": 0.01075192, "auxiliary_loss_mlp": 0.01004353, "balance_loss_clip": 0.89919889, "balance_loss_mlp": 1.00267208, "epoch": 0.7485119942283412, "flos": 62314532772480.0, "grad_norm": 0.7925608282809694, "language_loss": 0.58605999, "learning_rate": 6.274118012689979e-07, "loss": 0.60685539, "num_input_tokens_seen": 133866365, "step": 6225, "time_per_iteration": 3.34809947013855 }, { "auxiliary_loss_clip": 0.01156199, "auxiliary_loss_mlp": 0.01025723, "balance_loss_clip": 0.96915317, "balance_loss_mlp": 1.01898193, "epoch": 0.7486322371189803, "flos": 29937613104000.0, "grad_norm": 1.417366871430993, "language_loss": 0.68307471, "learning_rate": 6.268453405076943e-07, "loss": 0.70489395, "num_input_tokens_seen": 133888760, "step": 6226, "time_per_iteration": 2.7659945487976074 }, { "auxiliary_loss_clip": 0.01165146, "auxiliary_loss_mlp": 0.01023664, "balance_loss_clip": 0.97140998, "balance_loss_mlp": 1.01683605, "epoch": 0.7487524800096195, "flos": 18949414734720.0, "grad_norm": 2.3960918920804724, "language_loss": 0.8226186, "learning_rate": 6.262790880528592e-07, "loss": 0.84450674, "num_input_tokens_seen": 133906380, "step": 6227, "time_per_iteration": 2.757293701171875 }, { "auxiliary_loss_clip": 0.01172018, "auxiliary_loss_mlp": 0.01022925, "balance_loss_clip": 0.93050599, "balance_loss_mlp": 1.01557565, "epoch": 0.7488727229002585, "flos": 18697393935360.0, "grad_norm": 2.429236493032102, "language_loss": 0.79713655, "learning_rate": 6.257130439903951e-07, "loss": 0.81908602, "num_input_tokens_seen": 133922875, "step": 6228, "time_per_iteration": 2.694286584854126 }, { "auxiliary_loss_clip": 0.01171462, "auxiliary_loss_mlp": 0.01023514, "balance_loss_clip": 1.05099893, "balance_loss_mlp": 1.01673102, "epoch": 0.7489929657908976, "flos": 23623368168960.0, "grad_norm": 1.9229153987363328, "language_loss": 0.81156933, "learning_rate": 6.251472084061695e-07, "loss": 0.8335191, "num_input_tokens_seen": 133941795, "step": 6229, "time_per_iteration": 2.641557455062866 }, { "auxiliary_loss_clip": 0.0116439, "auxiliary_loss_mlp": 0.01026608, "balance_loss_clip": 1.00910878, "balance_loss_mlp": 1.01981592, "epoch": 0.7491132086815367, "flos": 20551533056640.0, "grad_norm": 1.9453484554501685, "language_loss": 0.89136481, "learning_rate": 6.245815813860191e-07, "loss": 0.91327477, "num_input_tokens_seen": 133957305, "step": 6230, "time_per_iteration": 2.667766571044922 }, { "auxiliary_loss_clip": 0.01171564, "auxiliary_loss_mlp": 0.01024458, "balance_loss_clip": 1.04894078, "balance_loss_mlp": 1.01667619, "epoch": 0.7492334515721758, "flos": 23003011353600.0, "grad_norm": 2.620676653309956, "language_loss": 0.70235562, "learning_rate": 6.240161630157495e-07, "loss": 0.72431588, "num_input_tokens_seen": 133976660, "step": 6231, "time_per_iteration": 3.4663212299346924 }, { "auxiliary_loss_clip": 0.01168173, "auxiliary_loss_mlp": 0.01025147, "balance_loss_clip": 1.04680252, "balance_loss_mlp": 1.01731443, "epoch": 0.7493536944628149, "flos": 16398823835520.0, "grad_norm": 3.1698578003948525, "language_loss": 0.69904572, "learning_rate": 6.23450953381133e-07, "loss": 0.72097892, "num_input_tokens_seen": 133994750, "step": 6232, "time_per_iteration": 2.605949640274048 }, { "auxiliary_loss_clip": 0.01157786, "auxiliary_loss_mlp": 0.0102531, "balance_loss_clip": 0.96996009, "balance_loss_mlp": 1.01842844, "epoch": 0.749473937353454, "flos": 15338561155200.0, "grad_norm": 1.971532283967838, "language_loss": 0.68016803, "learning_rate": 6.228859525679131e-07, "loss": 0.70199895, "num_input_tokens_seen": 134009165, "step": 6233, "time_per_iteration": 2.676250696182251 }, { "auxiliary_loss_clip": 0.01167328, "auxiliary_loss_mlp": 0.01021382, "balance_loss_clip": 1.00962079, "balance_loss_mlp": 1.01477432, "epoch": 0.7495941802440931, "flos": 18951138587520.0, "grad_norm": 2.2040427960682725, "language_loss": 0.79869539, "learning_rate": 6.223211606617986e-07, "loss": 0.82058251, "num_input_tokens_seen": 134027585, "step": 6234, "time_per_iteration": 2.773226499557495 }, { "auxiliary_loss_clip": 0.01165557, "auxiliary_loss_mlp": 0.0102476, "balance_loss_clip": 1.01239276, "balance_loss_mlp": 1.01850486, "epoch": 0.7497144231347321, "flos": 22492469393280.0, "grad_norm": 1.753680853444614, "language_loss": 0.83837008, "learning_rate": 6.217565777484701e-07, "loss": 0.8602733, "num_input_tokens_seen": 134046680, "step": 6235, "time_per_iteration": 3.56089448928833 }, { "auxiliary_loss_clip": 0.01164085, "auxiliary_loss_mlp": 0.01122101, "balance_loss_clip": 0.97261274, "balance_loss_mlp": 0.0, "epoch": 0.7498346660253713, "flos": 24243509502720.0, "grad_norm": 1.826576486647448, "language_loss": 0.80132937, "learning_rate": 6.211922039135722e-07, "loss": 0.82419121, "num_input_tokens_seen": 134066825, "step": 6236, "time_per_iteration": 2.759669542312622 }, { "auxiliary_loss_clip": 0.01168724, "auxiliary_loss_mlp": 0.01026194, "balance_loss_clip": 1.04881227, "balance_loss_mlp": 1.01920223, "epoch": 0.7499549089160104, "flos": 24387080163840.0, "grad_norm": 1.692583213526655, "language_loss": 0.81071889, "learning_rate": 6.206280392427201e-07, "loss": 0.83266807, "num_input_tokens_seen": 134086410, "step": 6237, "time_per_iteration": 3.5582456588745117 }, { "auxiliary_loss_clip": 0.01157149, "auxiliary_loss_mlp": 0.01026458, "balance_loss_clip": 1.00646126, "balance_loss_mlp": 1.01984155, "epoch": 0.7500751518066494, "flos": 34057320704640.0, "grad_norm": 1.51368885116395, "language_loss": 0.7391969, "learning_rate": 6.200640838214983e-07, "loss": 0.761033, "num_input_tokens_seen": 134109185, "step": 6238, "time_per_iteration": 2.8561441898345947 }, { "auxiliary_loss_clip": 0.01165504, "auxiliary_loss_mlp": 0.01029074, "balance_loss_clip": 1.04757333, "balance_loss_mlp": 1.02211785, "epoch": 0.7501953946972886, "flos": 18843586289280.0, "grad_norm": 2.1263647850678713, "language_loss": 0.66668785, "learning_rate": 6.195003377354578e-07, "loss": 0.68863362, "num_input_tokens_seen": 134128455, "step": 6239, "time_per_iteration": 2.62213134765625 }, { "auxiliary_loss_clip": 0.01158483, "auxiliary_loss_mlp": 0.01031978, "balance_loss_clip": 1.00456989, "balance_loss_mlp": 1.02478957, "epoch": 0.7503156375879276, "flos": 20257675891200.0, "grad_norm": 2.2498722394846977, "language_loss": 0.73120737, "learning_rate": 6.189368010701183e-07, "loss": 0.75311196, "num_input_tokens_seen": 134145515, "step": 6240, "time_per_iteration": 2.6561989784240723 }, { "auxiliary_loss_clip": 0.01171956, "auxiliary_loss_mlp": 0.01021595, "balance_loss_clip": 1.00901341, "balance_loss_mlp": 1.01494896, "epoch": 0.7504358804785667, "flos": 13480040574720.0, "grad_norm": 1.88807715175308, "language_loss": 0.76895911, "learning_rate": 6.183734739109683e-07, "loss": 0.79089463, "num_input_tokens_seen": 134163335, "step": 6241, "time_per_iteration": 2.6492528915405273 }, { "auxiliary_loss_clip": 0.01175311, "auxiliary_loss_mlp": 0.01027629, "balance_loss_clip": 1.01182973, "balance_loss_mlp": 1.01986897, "epoch": 0.7505561233692057, "flos": 29461042431360.0, "grad_norm": 2.042689631333684, "language_loss": 0.68758869, "learning_rate": 6.178103563434629e-07, "loss": 0.70961815, "num_input_tokens_seen": 134182335, "step": 6242, "time_per_iteration": 2.790853500366211 }, { "auxiliary_loss_clip": 0.01168519, "auxiliary_loss_mlp": 0.01022512, "balance_loss_clip": 1.0488987, "balance_loss_mlp": 1.01547027, "epoch": 0.7506763662598449, "flos": 20302457172480.0, "grad_norm": 1.9173437882851636, "language_loss": 0.83792275, "learning_rate": 6.172474484530283e-07, "loss": 0.85983312, "num_input_tokens_seen": 134201070, "step": 6243, "time_per_iteration": 2.5876197814941406 }, { "auxiliary_loss_clip": 0.01150113, "auxiliary_loss_mlp": 0.01029537, "balance_loss_clip": 0.96605456, "balance_loss_mlp": 1.021878, "epoch": 0.750796609150484, "flos": 37230961939200.0, "grad_norm": 1.6263011755623462, "language_loss": 0.75985128, "learning_rate": 6.166847503250563e-07, "loss": 0.7816478, "num_input_tokens_seen": 134223310, "step": 6244, "time_per_iteration": 2.8076512813568115 }, { "auxiliary_loss_clip": 0.01164105, "auxiliary_loss_mlp": 0.01026098, "balance_loss_clip": 0.97162163, "balance_loss_mlp": 1.01962507, "epoch": 0.750916852041123, "flos": 19609417186560.0, "grad_norm": 2.088237360631757, "language_loss": 0.78823125, "learning_rate": 6.161222620449078e-07, "loss": 0.81013328, "num_input_tokens_seen": 134242085, "step": 6245, "time_per_iteration": 2.7240779399871826 }, { "auxiliary_loss_clip": 0.01168545, "auxiliary_loss_mlp": 0.01025249, "balance_loss_clip": 0.93392229, "balance_loss_mlp": 1.01837087, "epoch": 0.7510370949317622, "flos": 25112690807040.0, "grad_norm": 2.1330679735076217, "language_loss": 0.80009407, "learning_rate": 6.155599836979117e-07, "loss": 0.82203197, "num_input_tokens_seen": 134260770, "step": 6246, "time_per_iteration": 2.781064748764038 }, { "auxiliary_loss_clip": 0.01161599, "auxiliary_loss_mlp": 0.01028136, "balance_loss_clip": 0.89363968, "balance_loss_mlp": 1.02042866, "epoch": 0.7511573378224012, "flos": 19062282245760.0, "grad_norm": 1.9874740497266743, "language_loss": 0.81518251, "learning_rate": 6.149979153693649e-07, "loss": 0.83707988, "num_input_tokens_seen": 134278025, "step": 6247, "time_per_iteration": 2.730647325515747 }, { "auxiliary_loss_clip": 0.01161758, "auxiliary_loss_mlp": 0.01028839, "balance_loss_clip": 1.00844634, "balance_loss_mlp": 1.0217247, "epoch": 0.7512775807130403, "flos": 19937676602880.0, "grad_norm": 2.1071942755616333, "language_loss": 0.76655769, "learning_rate": 6.144360571445343e-07, "loss": 0.78846371, "num_input_tokens_seen": 134297170, "step": 6248, "time_per_iteration": 2.6795809268951416 }, { "auxiliary_loss_clip": 0.01164557, "auxiliary_loss_mlp": 0.01026494, "balance_loss_clip": 1.01086354, "balance_loss_mlp": 1.01941609, "epoch": 0.7513978236036795, "flos": 20739920912640.0, "grad_norm": 1.7206884801982045, "language_loss": 0.79797971, "learning_rate": 6.138744091086509e-07, "loss": 0.81989014, "num_input_tokens_seen": 134316755, "step": 6249, "time_per_iteration": 2.7646076679229736 }, { "auxiliary_loss_clip": 0.01168955, "auxiliary_loss_mlp": 0.01027926, "balance_loss_clip": 0.93560439, "balance_loss_mlp": 1.02051425, "epoch": 0.7515180664943185, "flos": 27563163523200.0, "grad_norm": 2.2417896519041705, "language_loss": 0.72490978, "learning_rate": 6.133129713469183e-07, "loss": 0.74687862, "num_input_tokens_seen": 134335960, "step": 6250, "time_per_iteration": 2.850757122039795 }, { "auxiliary_loss_clip": 0.011722, "auxiliary_loss_mlp": 0.01027575, "balance_loss_clip": 0.93149173, "balance_loss_mlp": 1.01973712, "epoch": 0.7516383093849576, "flos": 33803181002880.0, "grad_norm": 1.7345087509705648, "language_loss": 0.64363599, "learning_rate": 6.127517439445053e-07, "loss": 0.6656338, "num_input_tokens_seen": 134356805, "step": 6251, "time_per_iteration": 2.885523557662964 }, { "auxiliary_loss_clip": 0.01156792, "auxiliary_loss_mlp": 0.01021524, "balance_loss_clip": 0.89500362, "balance_loss_mlp": 1.01492023, "epoch": 0.7517585522755967, "flos": 29746172592000.0, "grad_norm": 1.9552688936207596, "language_loss": 0.82334423, "learning_rate": 6.121907269865498e-07, "loss": 0.84512746, "num_input_tokens_seen": 134376295, "step": 6252, "time_per_iteration": 2.798861026763916 }, { "auxiliary_loss_clip": 0.01069863, "auxiliary_loss_mlp": 0.01001555, "balance_loss_clip": 0.90123272, "balance_loss_mlp": 1.00001705, "epoch": 0.7518787951662358, "flos": 69807974319360.0, "grad_norm": 0.9528635368987325, "language_loss": 0.67382592, "learning_rate": 6.116299205581577e-07, "loss": 0.69454014, "num_input_tokens_seen": 134431125, "step": 6253, "time_per_iteration": 3.209299087524414 }, { "auxiliary_loss_clip": 0.01171871, "auxiliary_loss_mlp": 0.01026554, "balance_loss_clip": 1.04893994, "balance_loss_mlp": 1.01874328, "epoch": 0.7519990380568748, "flos": 34203225749760.0, "grad_norm": 1.9097787641897401, "language_loss": 0.68586147, "learning_rate": 6.110693247444018e-07, "loss": 0.70784569, "num_input_tokens_seen": 134452960, "step": 6254, "time_per_iteration": 2.7186498641967773 }, { "auxiliary_loss_clip": 0.01153462, "auxiliary_loss_mlp": 0.0102789, "balance_loss_clip": 0.93115789, "balance_loss_mlp": 1.02132177, "epoch": 0.752119280947514, "flos": 21725704742400.0, "grad_norm": 2.952035760921811, "language_loss": 0.82426786, "learning_rate": 6.105089396303258e-07, "loss": 0.84608144, "num_input_tokens_seen": 134471350, "step": 6255, "time_per_iteration": 2.6761748790740967 }, { "auxiliary_loss_clip": 0.01166324, "auxiliary_loss_mlp": 0.010245, "balance_loss_clip": 0.97175539, "balance_loss_mlp": 1.01684642, "epoch": 0.7522395238381531, "flos": 32742774668160.0, "grad_norm": 2.0610224777608956, "language_loss": 0.75952768, "learning_rate": 6.099487653009383e-07, "loss": 0.78143585, "num_input_tokens_seen": 134490695, "step": 6256, "time_per_iteration": 2.7789249420166016 }, { "auxiliary_loss_clip": 0.01162113, "auxiliary_loss_mlp": 0.01028083, "balance_loss_clip": 1.00752366, "balance_loss_mlp": 1.02134168, "epoch": 0.7523597667287921, "flos": 23476026579840.0, "grad_norm": 1.8862901096097484, "language_loss": 0.83182871, "learning_rate": 6.093888018412192e-07, "loss": 0.85373068, "num_input_tokens_seen": 134506885, "step": 6257, "time_per_iteration": 3.4175968170166016 }, { "auxiliary_loss_clip": 0.01066834, "auxiliary_loss_mlp": 0.01002066, "balance_loss_clip": 0.97321016, "balance_loss_mlp": 1.00048089, "epoch": 0.7524800096194313, "flos": 67346730501120.0, "grad_norm": 0.8761124220792527, "language_loss": 0.54680616, "learning_rate": 6.088290493361125e-07, "loss": 0.56749517, "num_input_tokens_seen": 134571770, "step": 6258, "time_per_iteration": 3.293315887451172 }, { "auxiliary_loss_clip": 0.01154168, "auxiliary_loss_mlp": 0.01027223, "balance_loss_clip": 0.89328516, "balance_loss_mlp": 1.02042222, "epoch": 0.7526002525100703, "flos": 13006055681280.0, "grad_norm": 2.124042433471504, "language_loss": 0.71438384, "learning_rate": 6.082695078705322e-07, "loss": 0.73619777, "num_input_tokens_seen": 134589250, "step": 6259, "time_per_iteration": 2.5717625617980957 }, { "auxiliary_loss_clip": 0.01158651, "auxiliary_loss_mlp": 0.01029338, "balance_loss_clip": 1.00808108, "balance_loss_mlp": 1.02167249, "epoch": 0.7527204954007094, "flos": 21397229844480.0, "grad_norm": 12.712136759664002, "language_loss": 0.6874432, "learning_rate": 6.077101775293618e-07, "loss": 0.70932305, "num_input_tokens_seen": 134608075, "step": 6260, "time_per_iteration": 2.5361087322235107 }, { "auxiliary_loss_clip": 0.01167254, "auxiliary_loss_mlp": 0.0102556, "balance_loss_clip": 1.00943649, "balance_loss_mlp": 1.01809144, "epoch": 0.7528407382913486, "flos": 18947188091520.0, "grad_norm": 2.377023779312224, "language_loss": 0.8305366, "learning_rate": 6.071510583974504e-07, "loss": 0.8524648, "num_input_tokens_seen": 134623260, "step": 6261, "time_per_iteration": 3.362377405166626 }, { "auxiliary_loss_clip": 0.01167025, "auxiliary_loss_mlp": 0.01026837, "balance_loss_clip": 1.04689217, "balance_loss_mlp": 1.01982749, "epoch": 0.7529609811819876, "flos": 15231798956160.0, "grad_norm": 1.961375145301958, "language_loss": 0.72229642, "learning_rate": 6.065921505596161e-07, "loss": 0.74423504, "num_input_tokens_seen": 134641540, "step": 6262, "time_per_iteration": 3.818164348602295 }, { "auxiliary_loss_clip": 0.0116242, "auxiliary_loss_mlp": 0.01027167, "balance_loss_clip": 0.93441969, "balance_loss_mlp": 1.02010381, "epoch": 0.7530812240726267, "flos": 19354487385600.0, "grad_norm": 1.5629920687486925, "language_loss": 0.77334183, "learning_rate": 6.060334541006445e-07, "loss": 0.79523772, "num_input_tokens_seen": 134660035, "step": 6263, "time_per_iteration": 3.797006607055664 }, { "auxiliary_loss_clip": 0.01163721, "auxiliary_loss_mlp": 0.01022374, "balance_loss_clip": 0.92927134, "balance_loss_mlp": 1.01561522, "epoch": 0.7532014669632658, "flos": 27748247328000.0, "grad_norm": 1.4635684377088232, "language_loss": 0.68960321, "learning_rate": 6.05474969105289e-07, "loss": 0.71146417, "num_input_tokens_seen": 134683025, "step": 6264, "time_per_iteration": 2.813478946685791 }, { "auxiliary_loss_clip": 0.0116834, "auxiliary_loss_mlp": 0.01027837, "balance_loss_clip": 1.01099205, "balance_loss_mlp": 1.02019012, "epoch": 0.7533217098539049, "flos": 14137421333760.0, "grad_norm": 3.1839700953956274, "language_loss": 0.73706305, "learning_rate": 6.049166956582725e-07, "loss": 0.75902474, "num_input_tokens_seen": 134701290, "step": 6265, "time_per_iteration": 2.61019229888916 }, { "auxiliary_loss_clip": 0.01161703, "auxiliary_loss_mlp": 0.01025802, "balance_loss_clip": 1.00901341, "balance_loss_mlp": 1.01907265, "epoch": 0.753441952744544, "flos": 26429068437120.0, "grad_norm": 32.8506760739276, "language_loss": 0.87340355, "learning_rate": 6.043586338442841e-07, "loss": 0.89527857, "num_input_tokens_seen": 134720345, "step": 6266, "time_per_iteration": 2.701042652130127 }, { "auxiliary_loss_clip": 0.01164338, "auxiliary_loss_mlp": 0.01020521, "balance_loss_clip": 1.04808283, "balance_loss_mlp": 1.01399708, "epoch": 0.7535621956351831, "flos": 23878621192320.0, "grad_norm": 1.4935016126811886, "language_loss": 0.73143971, "learning_rate": 6.038007837479815e-07, "loss": 0.75328827, "num_input_tokens_seen": 134741450, "step": 6267, "time_per_iteration": 2.6508142948150635 }, { "auxiliary_loss_clip": 0.01164598, "auxiliary_loss_mlp": 0.0102639, "balance_loss_clip": 1.01007533, "balance_loss_mlp": 1.01986003, "epoch": 0.7536824385258222, "flos": 21795873960960.0, "grad_norm": 2.009750180893604, "language_loss": 0.63972306, "learning_rate": 6.032431454539897e-07, "loss": 0.66163296, "num_input_tokens_seen": 134760295, "step": 6268, "time_per_iteration": 2.7136685848236084 }, { "auxiliary_loss_clip": 0.01166261, "auxiliary_loss_mlp": 0.01024768, "balance_loss_clip": 0.93476057, "balance_loss_mlp": 1.01810455, "epoch": 0.7538026814164612, "flos": 28911644933760.0, "grad_norm": 1.6564261848320587, "language_loss": 0.81543183, "learning_rate": 6.026857190469014e-07, "loss": 0.83734214, "num_input_tokens_seen": 134782050, "step": 6269, "time_per_iteration": 2.7571351528167725 }, { "auxiliary_loss_clip": 0.01164251, "auxiliary_loss_mlp": 0.01026358, "balance_loss_clip": 0.97006261, "balance_loss_mlp": 1.01928902, "epoch": 0.7539229243071004, "flos": 21104701482240.0, "grad_norm": 1.748321616967585, "language_loss": 0.74198151, "learning_rate": 6.0212850461128e-07, "loss": 0.76388764, "num_input_tokens_seen": 134801170, "step": 6270, "time_per_iteration": 2.699960947036743 }, { "auxiliary_loss_clip": 0.01165124, "auxiliary_loss_mlp": 0.01025721, "balance_loss_clip": 0.96843863, "balance_loss_mlp": 1.01777577, "epoch": 0.7540431671977395, "flos": 15158469340800.0, "grad_norm": 2.097803172931682, "language_loss": 0.74599999, "learning_rate": 6.015715022316516e-07, "loss": 0.76790839, "num_input_tokens_seen": 134819150, "step": 6271, "time_per_iteration": 2.7183456420898438 }, { "auxiliary_loss_clip": 0.01163656, "auxiliary_loss_mlp": 0.01026391, "balance_loss_clip": 0.8931725, "balance_loss_mlp": 1.01898539, "epoch": 0.7541634100883785, "flos": 18770579896320.0, "grad_norm": 2.564190376633595, "language_loss": 0.77309632, "learning_rate": 6.010147119925154e-07, "loss": 0.79499674, "num_input_tokens_seen": 134836905, "step": 6272, "time_per_iteration": 2.7580528259277344 }, { "auxiliary_loss_clip": 0.01154023, "auxiliary_loss_mlp": 0.01023054, "balance_loss_clip": 0.93092924, "balance_loss_mlp": 1.01569247, "epoch": 0.7542836529790176, "flos": 20594770053120.0, "grad_norm": 2.171822485696211, "language_loss": 0.66398489, "learning_rate": 6.004581339783348e-07, "loss": 0.68575561, "num_input_tokens_seen": 134855225, "step": 6273, "time_per_iteration": 2.7531731128692627 }, { "auxiliary_loss_clip": 0.0117311, "auxiliary_loss_mlp": 0.01028797, "balance_loss_clip": 1.01152039, "balance_loss_mlp": 1.02096462, "epoch": 0.7544038958696567, "flos": 19095104298240.0, "grad_norm": 2.256236199875772, "language_loss": 0.68558389, "learning_rate": 5.999017682735425e-07, "loss": 0.70760298, "num_input_tokens_seen": 134871615, "step": 6274, "time_per_iteration": 2.639767646789551 }, { "auxiliary_loss_clip": 0.01170761, "auxiliary_loss_mlp": 0.01026247, "balance_loss_clip": 0.85730475, "balance_loss_mlp": 1.01930606, "epoch": 0.7545241387602958, "flos": 31723306859520.0, "grad_norm": 1.7642411708773105, "language_loss": 0.66056383, "learning_rate": 5.993456149625387e-07, "loss": 0.68253392, "num_input_tokens_seen": 134892765, "step": 6275, "time_per_iteration": 2.8509438037872314 }, { "auxiliary_loss_clip": 0.0115424, "auxiliary_loss_mlp": 0.01027668, "balance_loss_clip": 0.93180591, "balance_loss_mlp": 1.02108455, "epoch": 0.7546443816509348, "flos": 20296495514880.0, "grad_norm": 1.7300286964358735, "language_loss": 0.82413983, "learning_rate": 5.987896741296909e-07, "loss": 0.84595895, "num_input_tokens_seen": 134910505, "step": 6276, "time_per_iteration": 2.7121660709381104 }, { "auxiliary_loss_clip": 0.01168227, "auxiliary_loss_mlp": 0.01029671, "balance_loss_clip": 0.97513664, "balance_loss_mlp": 1.02271533, "epoch": 0.754764624541574, "flos": 23696159080320.0, "grad_norm": 2.1133581131812353, "language_loss": 0.78422064, "learning_rate": 5.982339458593361e-07, "loss": 0.80619967, "num_input_tokens_seen": 134930445, "step": 6277, "time_per_iteration": 2.756455183029175 }, { "auxiliary_loss_clip": 0.0116099, "auxiliary_loss_mlp": 0.01121873, "balance_loss_clip": 1.00855494, "balance_loss_mlp": 0.0, "epoch": 0.7548848674322131, "flos": 25337204766720.0, "grad_norm": 1.5006660128799587, "language_loss": 0.83904743, "learning_rate": 5.976784302357767e-07, "loss": 0.86187607, "num_input_tokens_seen": 134951010, "step": 6278, "time_per_iteration": 2.6836225986480713 }, { "auxiliary_loss_clip": 0.01167578, "auxiliary_loss_mlp": 0.0102775, "balance_loss_clip": 1.01021266, "balance_loss_mlp": 1.02056134, "epoch": 0.7550051103228521, "flos": 19573147428480.0, "grad_norm": 1.7871443068558222, "language_loss": 0.73580384, "learning_rate": 5.971231273432855e-07, "loss": 0.75775719, "num_input_tokens_seen": 134970495, "step": 6279, "time_per_iteration": 2.68068528175354 }, { "auxiliary_loss_clip": 0.01067149, "auxiliary_loss_mlp": 0.01001098, "balance_loss_clip": 0.97429717, "balance_loss_mlp": 0.99952489, "epoch": 0.7551253532134913, "flos": 64150068648960.0, "grad_norm": 0.8197101894690632, "language_loss": 0.54586512, "learning_rate": 5.965680372661e-07, "loss": 0.56654757, "num_input_tokens_seen": 135028060, "step": 6280, "time_per_iteration": 3.139392852783203 }, { "auxiliary_loss_clip": 0.01162307, "auxiliary_loss_mlp": 0.01020388, "balance_loss_clip": 0.97106695, "balance_loss_mlp": 1.01394749, "epoch": 0.7552455961041303, "flos": 26067986968320.0, "grad_norm": 1.620628677697643, "language_loss": 0.56044269, "learning_rate": 5.960131600884266e-07, "loss": 0.58226955, "num_input_tokens_seen": 135047330, "step": 6281, "time_per_iteration": 2.6869664192199707 }, { "auxiliary_loss_clip": 0.01163281, "auxiliary_loss_mlp": 0.01022425, "balance_loss_clip": 0.93086326, "balance_loss_mlp": 1.01570749, "epoch": 0.7553658389947694, "flos": 24498223822080.0, "grad_norm": 2.0263100394162548, "language_loss": 0.75874233, "learning_rate": 5.954584958944413e-07, "loss": 0.78059936, "num_input_tokens_seen": 135065995, "step": 6282, "time_per_iteration": 2.7482519149780273 }, { "auxiliary_loss_clip": 0.01164663, "auxiliary_loss_mlp": 0.01122299, "balance_loss_clip": 0.93063951, "balance_loss_mlp": 0.0, "epoch": 0.7554860818854086, "flos": 21799465320960.0, "grad_norm": 1.9805379571892927, "language_loss": 0.82065779, "learning_rate": 5.949040447682854e-07, "loss": 0.84352744, "num_input_tokens_seen": 135085820, "step": 6283, "time_per_iteration": 3.7325103282928467 }, { "auxiliary_loss_clip": 0.01170251, "auxiliary_loss_mlp": 0.01035648, "balance_loss_clip": 0.97122037, "balance_loss_mlp": 1.02835846, "epoch": 0.7556063247760476, "flos": 16362123114240.0, "grad_norm": 2.0469556637308273, "language_loss": 0.68242943, "learning_rate": 5.943498067940686e-07, "loss": 0.70448852, "num_input_tokens_seen": 135102845, "step": 6284, "time_per_iteration": 2.692479133605957 }, { "auxiliary_loss_clip": 0.01164956, "auxiliary_loss_mlp": 0.0102754, "balance_loss_clip": 0.97788417, "balance_loss_mlp": 1.02075076, "epoch": 0.7557265676666867, "flos": 27235155502080.0, "grad_norm": 1.8570578551925665, "language_loss": 0.81820798, "learning_rate": 5.937957820558686e-07, "loss": 0.84013295, "num_input_tokens_seen": 135122190, "step": 6285, "time_per_iteration": 2.7327451705932617 }, { "auxiliary_loss_clip": 0.01071236, "auxiliary_loss_mlp": 0.01002038, "balance_loss_clip": 0.93577421, "balance_loss_mlp": 1.00047636, "epoch": 0.7558468105573258, "flos": 62189131415040.0, "grad_norm": 0.856473262750231, "language_loss": 0.65463841, "learning_rate": 5.932419706377296e-07, "loss": 0.67537117, "num_input_tokens_seen": 135180495, "step": 6286, "time_per_iteration": 3.2015902996063232 }, { "auxiliary_loss_clip": 0.01161376, "auxiliary_loss_mlp": 0.01026683, "balance_loss_clip": 0.93594193, "balance_loss_mlp": 1.01976943, "epoch": 0.7559670534479649, "flos": 33249078823680.0, "grad_norm": 1.787230057644885, "language_loss": 0.74133706, "learning_rate": 5.92688372623666e-07, "loss": 0.76321775, "num_input_tokens_seen": 135199200, "step": 6287, "time_per_iteration": 4.828469514846802 }, { "auxiliary_loss_clip": 0.0116675, "auxiliary_loss_mlp": 0.01027125, "balance_loss_clip": 1.00807345, "balance_loss_mlp": 1.02026999, "epoch": 0.7560872963386039, "flos": 14064379027200.0, "grad_norm": 2.347463105521853, "language_loss": 0.73864019, "learning_rate": 5.921349880976574e-07, "loss": 0.76057887, "num_input_tokens_seen": 135217035, "step": 6288, "time_per_iteration": 2.7172930240631104 }, { "auxiliary_loss_clip": 0.01166408, "auxiliary_loss_mlp": 0.0112262, "balance_loss_clip": 0.9691726, "balance_loss_mlp": 0.0, "epoch": 0.7562075392292431, "flos": 20412307941120.0, "grad_norm": 1.7602363684525824, "language_loss": 0.81815016, "learning_rate": 5.915818171436515e-07, "loss": 0.84104049, "num_input_tokens_seen": 135236370, "step": 6289, "time_per_iteration": 3.629103660583496 }, { "auxiliary_loss_clip": 0.01160116, "auxiliary_loss_mlp": 0.01023639, "balance_loss_clip": 0.96638632, "balance_loss_mlp": 1.0166142, "epoch": 0.7563277821198822, "flos": 20376792368640.0, "grad_norm": 1.9087911798527954, "language_loss": 0.74582374, "learning_rate": 5.910288598455642e-07, "loss": 0.76766127, "num_input_tokens_seen": 135255720, "step": 6290, "time_per_iteration": 2.699247360229492 }, { "auxiliary_loss_clip": 0.01171128, "auxiliary_loss_mlp": 0.01027935, "balance_loss_clip": 1.00904882, "balance_loss_mlp": 1.02083325, "epoch": 0.7564480250105212, "flos": 18588261438720.0, "grad_norm": 2.2126959459728304, "language_loss": 0.74696946, "learning_rate": 5.90476116287278e-07, "loss": 0.76896012, "num_input_tokens_seen": 135273320, "step": 6291, "time_per_iteration": 2.673424243927002 }, { "auxiliary_loss_clip": 0.01166947, "auxiliary_loss_mlp": 0.01026135, "balance_loss_clip": 0.97257274, "balance_loss_mlp": 1.01937938, "epoch": 0.7565682679011604, "flos": 21215521918080.0, "grad_norm": 1.6397505897296867, "language_loss": 0.67704266, "learning_rate": 5.899235865526456e-07, "loss": 0.69897354, "num_input_tokens_seen": 135292615, "step": 6292, "time_per_iteration": 2.6646945476531982 }, { "auxiliary_loss_clip": 0.01154364, "auxiliary_loss_mlp": 0.01020847, "balance_loss_clip": 0.93088412, "balance_loss_mlp": 1.0139271, "epoch": 0.7566885107917994, "flos": 20449008662400.0, "grad_norm": 1.7605859651569598, "language_loss": 0.82274091, "learning_rate": 5.893712707254825e-07, "loss": 0.84449297, "num_input_tokens_seen": 135310075, "step": 6293, "time_per_iteration": 2.7117371559143066 }, { "auxiliary_loss_clip": 0.01156834, "auxiliary_loss_mlp": 0.01028397, "balance_loss_clip": 0.89160383, "balance_loss_mlp": 1.02134585, "epoch": 0.7568087536824385, "flos": 19025832919680.0, "grad_norm": 2.3954307070507057, "language_loss": 0.65988886, "learning_rate": 5.888191688895769e-07, "loss": 0.68174124, "num_input_tokens_seen": 135327335, "step": 6294, "time_per_iteration": 2.742088556289673 }, { "auxiliary_loss_clip": 0.01167492, "auxiliary_loss_mlp": 0.01020859, "balance_loss_clip": 1.0465734, "balance_loss_mlp": 1.0133791, "epoch": 0.7569289965730777, "flos": 15225442248960.0, "grad_norm": 2.48250278948384, "language_loss": 0.61862516, "learning_rate": 5.882672811286813e-07, "loss": 0.64050865, "num_input_tokens_seen": 135343615, "step": 6295, "time_per_iteration": 2.5855462551116943 }, { "auxiliary_loss_clip": 0.01168301, "auxiliary_loss_mlp": 0.0102336, "balance_loss_clip": 1.04677999, "balance_loss_mlp": 1.01573634, "epoch": 0.7570492394637167, "flos": 20769367086720.0, "grad_norm": 1.9308329445845653, "language_loss": 0.69413823, "learning_rate": 5.877156075265166e-07, "loss": 0.71605486, "num_input_tokens_seen": 135359880, "step": 6296, "time_per_iteration": 2.6512553691864014 }, { "auxiliary_loss_clip": 0.01162476, "auxiliary_loss_mlp": 0.01023231, "balance_loss_clip": 0.9694792, "balance_loss_mlp": 1.01577103, "epoch": 0.7571694823543558, "flos": 15664091137920.0, "grad_norm": 2.6091634735703115, "language_loss": 0.69325805, "learning_rate": 5.871641481667715e-07, "loss": 0.71511507, "num_input_tokens_seen": 135374325, "step": 6297, "time_per_iteration": 2.6822242736816406 }, { "auxiliary_loss_clip": 0.01167622, "auxiliary_loss_mlp": 0.01025316, "balance_loss_clip": 0.89530921, "balance_loss_mlp": 1.01755536, "epoch": 0.7572897252449949, "flos": 25409241492480.0, "grad_norm": 1.7974205820491729, "language_loss": 0.84272206, "learning_rate": 5.866129031331011e-07, "loss": 0.86465144, "num_input_tokens_seen": 135393980, "step": 6298, "time_per_iteration": 2.738312244415283 }, { "auxiliary_loss_clip": 0.01165235, "auxiliary_loss_mlp": 0.01027375, "balance_loss_clip": 0.96943998, "balance_loss_mlp": 1.02042866, "epoch": 0.757409968135634, "flos": 24279348297600.0, "grad_norm": 2.924910570823401, "language_loss": 0.8325792, "learning_rate": 5.8606187250913e-07, "loss": 0.8545053, "num_input_tokens_seen": 135412030, "step": 6299, "time_per_iteration": 2.709172487258911 }, { "auxiliary_loss_clip": 0.01166951, "auxiliary_loss_mlp": 0.01122406, "balance_loss_clip": 1.01268041, "balance_loss_mlp": 0.0, "epoch": 0.757530211026273, "flos": 24133766474880.0, "grad_norm": 1.9066721480189366, "language_loss": 0.84199423, "learning_rate": 5.855110563784482e-07, "loss": 0.86488777, "num_input_tokens_seen": 135430565, "step": 6300, "time_per_iteration": 2.7296714782714844 }, { "auxiliary_loss_clip": 0.01157424, "auxiliary_loss_mlp": 0.01122335, "balance_loss_clip": 1.00678778, "balance_loss_mlp": 0.0, "epoch": 0.7576504539169122, "flos": 23951807153280.0, "grad_norm": 1.5977918396999367, "language_loss": 0.64067245, "learning_rate": 5.849604548246156e-07, "loss": 0.66347003, "num_input_tokens_seen": 135451675, "step": 6301, "time_per_iteration": 2.7142081260681152 }, { "auxiliary_loss_clip": 0.01173116, "auxiliary_loss_mlp": 0.01122236, "balance_loss_clip": 0.97450602, "balance_loss_mlp": 0.0, "epoch": 0.7577706968075513, "flos": 21251360712960.0, "grad_norm": 1.8317223684755528, "language_loss": 0.80121082, "learning_rate": 5.844100679311565e-07, "loss": 0.82416433, "num_input_tokens_seen": 135470635, "step": 6302, "time_per_iteration": 2.702597141265869 }, { "auxiliary_loss_clip": 0.01165829, "auxiliary_loss_mlp": 0.01022366, "balance_loss_clip": 0.97423095, "balance_loss_mlp": 1.01479352, "epoch": 0.7578909396981903, "flos": 18296595002880.0, "grad_norm": 2.5376364453853717, "language_loss": 0.76348388, "learning_rate": 5.838598957815637e-07, "loss": 0.78536576, "num_input_tokens_seen": 135487865, "step": 6303, "time_per_iteration": 2.678269147872925 }, { "auxiliary_loss_clip": 0.01161268, "auxiliary_loss_mlp": 0.01023172, "balance_loss_clip": 0.97146189, "balance_loss_mlp": 1.01592398, "epoch": 0.7580111825888295, "flos": 25373869574400.0, "grad_norm": 1.4768018981390352, "language_loss": 0.85411417, "learning_rate": 5.833099384592996e-07, "loss": 0.87595856, "num_input_tokens_seen": 135508440, "step": 6304, "time_per_iteration": 2.6847870349884033 }, { "auxiliary_loss_clip": 0.01161806, "auxiliary_loss_mlp": 0.01023525, "balance_loss_clip": 0.97153682, "balance_loss_mlp": 1.01640224, "epoch": 0.7581314254794685, "flos": 23768662682880.0, "grad_norm": 2.3658065232633807, "language_loss": 0.7161392, "learning_rate": 5.827601960477913e-07, "loss": 0.73799247, "num_input_tokens_seen": 135526365, "step": 6305, "time_per_iteration": 2.6959228515625 }, { "auxiliary_loss_clip": 0.01163184, "auxiliary_loss_mlp": 0.0102438, "balance_loss_clip": 1.00747061, "balance_loss_mlp": 1.01780224, "epoch": 0.7582516683701076, "flos": 22054610603520.0, "grad_norm": 1.8726115414360138, "language_loss": 0.70439649, "learning_rate": 5.822106686304344e-07, "loss": 0.72627211, "num_input_tokens_seen": 135545655, "step": 6306, "time_per_iteration": 2.647007703781128 }, { "auxiliary_loss_clip": 0.01166788, "auxiliary_loss_mlp": 0.01025009, "balance_loss_clip": 0.93275529, "balance_loss_mlp": 1.0178566, "epoch": 0.7583719112607467, "flos": 31649725848960.0, "grad_norm": 6.711575898052772, "language_loss": 0.57950294, "learning_rate": 5.816613562905919e-07, "loss": 0.60142094, "num_input_tokens_seen": 135566840, "step": 6307, "time_per_iteration": 2.757601022720337 }, { "auxiliary_loss_clip": 0.0116354, "auxiliary_loss_mlp": 0.01026512, "balance_loss_clip": 0.93552935, "balance_loss_mlp": 1.01965165, "epoch": 0.7584921541513858, "flos": 33068376478080.0, "grad_norm": 1.5742085204782619, "language_loss": 0.69945598, "learning_rate": 5.811122591115933e-07, "loss": 0.72135651, "num_input_tokens_seen": 135587825, "step": 6308, "time_per_iteration": 2.810659408569336 }, { "auxiliary_loss_clip": 0.01166372, "auxiliary_loss_mlp": 0.01027288, "balance_loss_clip": 0.93672454, "balance_loss_mlp": 1.01958704, "epoch": 0.7586123970420249, "flos": 23326350606720.0, "grad_norm": 2.0703420995163686, "language_loss": 0.71494585, "learning_rate": 5.805633771767376e-07, "loss": 0.73688245, "num_input_tokens_seen": 135605220, "step": 6309, "time_per_iteration": 3.535447120666504 }, { "auxiliary_loss_clip": 0.01164115, "auxiliary_loss_mlp": 0.01026755, "balance_loss_clip": 0.97217733, "balance_loss_mlp": 1.01988554, "epoch": 0.7587326399326639, "flos": 18334229477760.0, "grad_norm": 1.942780423773983, "language_loss": 0.77779138, "learning_rate": 5.800147105692888e-07, "loss": 0.79970002, "num_input_tokens_seen": 135624795, "step": 6310, "time_per_iteration": 2.7168731689453125 }, { "auxiliary_loss_clip": 0.01165655, "auxiliary_loss_mlp": 0.01024472, "balance_loss_clip": 1.00704682, "balance_loss_mlp": 1.0172689, "epoch": 0.7588528828233031, "flos": 17275080119040.0, "grad_norm": 4.184339659904399, "language_loss": 0.79286313, "learning_rate": 5.794662593724795e-07, "loss": 0.81476438, "num_input_tokens_seen": 135643800, "step": 6311, "time_per_iteration": 2.616260051727295 }, { "auxiliary_loss_clip": 0.01170995, "auxiliary_loss_mlp": 0.01024237, "balance_loss_clip": 1.05009031, "balance_loss_mlp": 1.01715016, "epoch": 0.7589731257139422, "flos": 17713621267200.0, "grad_norm": 1.8532087109082713, "language_loss": 0.74885416, "learning_rate": 5.789180236695091e-07, "loss": 0.77080649, "num_input_tokens_seen": 135660655, "step": 6312, "time_per_iteration": 2.6063597202301025 }, { "auxiliary_loss_clip": 0.01161438, "auxiliary_loss_mlp": 0.0102873, "balance_loss_clip": 1.00993824, "balance_loss_mlp": 1.0221169, "epoch": 0.7590933686045812, "flos": 15961072786560.0, "grad_norm": 1.8566214055808288, "language_loss": 0.8512156, "learning_rate": 5.78370003543544e-07, "loss": 0.87311733, "num_input_tokens_seen": 135679410, "step": 6313, "time_per_iteration": 4.489595413208008 }, { "auxiliary_loss_clip": 0.01166699, "auxiliary_loss_mlp": 0.01122144, "balance_loss_clip": 1.00937784, "balance_loss_mlp": 0.0, "epoch": 0.7592136114952204, "flos": 21068072588160.0, "grad_norm": 1.8136679480100069, "language_loss": 0.83867449, "learning_rate": 5.778221990777203e-07, "loss": 0.86156291, "num_input_tokens_seen": 135697150, "step": 6314, "time_per_iteration": 2.6676385402679443 }, { "auxiliary_loss_clip": 0.01165275, "auxiliary_loss_mlp": 0.01026251, "balance_loss_clip": 0.97241658, "balance_loss_mlp": 1.01889825, "epoch": 0.7593338543858594, "flos": 25297666871040.0, "grad_norm": 2.1140338484308736, "language_loss": 0.82629329, "learning_rate": 5.772746103551372e-07, "loss": 0.84820855, "num_input_tokens_seen": 135712545, "step": 6315, "time_per_iteration": 3.6742310523986816 }, { "auxiliary_loss_clip": 0.01162862, "auxiliary_loss_mlp": 0.01027551, "balance_loss_clip": 0.97219783, "balance_loss_mlp": 1.02046454, "epoch": 0.7594540972764985, "flos": 31832367528960.0, "grad_norm": 2.2872894888511985, "language_loss": 0.71443266, "learning_rate": 5.767272374588648e-07, "loss": 0.73633683, "num_input_tokens_seen": 135733950, "step": 6316, "time_per_iteration": 2.7600085735321045 }, { "auxiliary_loss_clip": 0.01166143, "auxiliary_loss_mlp": 0.01024382, "balance_loss_clip": 1.01155066, "balance_loss_mlp": 1.01710975, "epoch": 0.7595743401671377, "flos": 37597250880000.0, "grad_norm": 1.5766310750347878, "language_loss": 0.77978581, "learning_rate": 5.76180080471939e-07, "loss": 0.80169106, "num_input_tokens_seen": 135757120, "step": 6317, "time_per_iteration": 2.784630298614502 }, { "auxiliary_loss_clip": 0.01172827, "auxiliary_loss_mlp": 0.01029463, "balance_loss_clip": 1.04901052, "balance_loss_mlp": 1.02154183, "epoch": 0.7596945830577767, "flos": 18287724343680.0, "grad_norm": 1.9187304143781108, "language_loss": 0.72081387, "learning_rate": 5.756331394773631e-07, "loss": 0.74283683, "num_input_tokens_seen": 135773335, "step": 6318, "time_per_iteration": 2.5744423866271973 }, { "auxiliary_loss_clip": 0.01161651, "auxiliary_loss_mlp": 0.01122776, "balance_loss_clip": 0.85539579, "balance_loss_mlp": 0.0, "epoch": 0.7598148259484158, "flos": 22233122219520.0, "grad_norm": 1.7287041249213297, "language_loss": 0.76407152, "learning_rate": 5.750864145581071e-07, "loss": 0.78691584, "num_input_tokens_seen": 135792555, "step": 6319, "time_per_iteration": 2.7410731315612793 }, { "auxiliary_loss_clip": 0.01169052, "auxiliary_loss_mlp": 0.01021758, "balance_loss_clip": 1.05005181, "balance_loss_mlp": 1.01536798, "epoch": 0.7599350688390549, "flos": 27161718145920.0, "grad_norm": 2.0788228477119284, "language_loss": 0.85658044, "learning_rate": 5.745399057971085e-07, "loss": 0.87848854, "num_input_tokens_seen": 135813690, "step": 6320, "time_per_iteration": 2.6987102031707764 }, { "auxiliary_loss_clip": 0.01170127, "auxiliary_loss_mlp": 0.010262, "balance_loss_clip": 1.00977039, "balance_loss_mlp": 1.01885033, "epoch": 0.760055311729694, "flos": 15560704817280.0, "grad_norm": 2.095471168404814, "language_loss": 0.75225592, "learning_rate": 5.739936132772738e-07, "loss": 0.77421916, "num_input_tokens_seen": 135832255, "step": 6321, "time_per_iteration": 2.690639019012451 }, { "auxiliary_loss_clip": 0.01165845, "auxiliary_loss_mlp": 0.01022545, "balance_loss_clip": 1.04703164, "balance_loss_mlp": 1.015553, "epoch": 0.760175554620333, "flos": 25155496840320.0, "grad_norm": 2.23292453994803, "language_loss": 0.74155003, "learning_rate": 5.734475370814733e-07, "loss": 0.76343393, "num_input_tokens_seen": 135851935, "step": 6322, "time_per_iteration": 2.65523624420166 }, { "auxiliary_loss_clip": 0.01167294, "auxiliary_loss_mlp": 0.01026409, "balance_loss_clip": 1.0080421, "balance_loss_mlp": 1.01943791, "epoch": 0.7602957975109722, "flos": 24353791234560.0, "grad_norm": 1.5012999815794386, "language_loss": 0.78463078, "learning_rate": 5.729016772925483e-07, "loss": 0.80656779, "num_input_tokens_seen": 135873510, "step": 6323, "time_per_iteration": 2.6803030967712402 }, { "auxiliary_loss_clip": 0.01161381, "auxiliary_loss_mlp": 0.01025229, "balance_loss_clip": 0.89640266, "balance_loss_mlp": 1.017349, "epoch": 0.7604160404016113, "flos": 25192664438400.0, "grad_norm": 1.7143278452751798, "language_loss": 0.70637852, "learning_rate": 5.723560339933038e-07, "loss": 0.7282446, "num_input_tokens_seen": 135893845, "step": 6324, "time_per_iteration": 2.7829792499542236 }, { "auxiliary_loss_clip": 0.01163925, "auxiliary_loss_mlp": 0.01122128, "balance_loss_clip": 1.0081811, "balance_loss_mlp": 0.0, "epoch": 0.7605362832922503, "flos": 29861841363840.0, "grad_norm": 1.9600887506761555, "language_loss": 0.65226406, "learning_rate": 5.71810607266513e-07, "loss": 0.67512453, "num_input_tokens_seen": 135912430, "step": 6325, "time_per_iteration": 2.683136224746704 }, { "auxiliary_loss_clip": 0.01166379, "auxiliary_loss_mlp": 0.01029316, "balance_loss_clip": 1.00854588, "balance_loss_mlp": 1.02206206, "epoch": 0.7606565261828895, "flos": 13917935278080.0, "grad_norm": 1.7365362390369363, "language_loss": 0.60453331, "learning_rate": 5.712653971949184e-07, "loss": 0.62649024, "num_input_tokens_seen": 135930550, "step": 6326, "time_per_iteration": 2.6197686195373535 }, { "auxiliary_loss_clip": 0.01159622, "auxiliary_loss_mlp": 0.01023251, "balance_loss_clip": 1.00818181, "balance_loss_mlp": 1.01649785, "epoch": 0.7607767690735285, "flos": 18551273408640.0, "grad_norm": 2.410888369074166, "language_loss": 0.75504482, "learning_rate": 5.707204038612268e-07, "loss": 0.77687359, "num_input_tokens_seen": 135947980, "step": 6327, "time_per_iteration": 2.6027894020080566 }, { "auxiliary_loss_clip": 0.01175987, "auxiliary_loss_mlp": 0.01029963, "balance_loss_clip": 0.97713161, "balance_loss_mlp": 1.02201176, "epoch": 0.7608970119641676, "flos": 20922993555840.0, "grad_norm": 2.354896488946492, "language_loss": 0.74461496, "learning_rate": 5.701756273481138e-07, "loss": 0.76667452, "num_input_tokens_seen": 135965400, "step": 6328, "time_per_iteration": 2.6600725650787354 }, { "auxiliary_loss_clip": 0.01168679, "auxiliary_loss_mlp": 0.01020669, "balance_loss_clip": 0.97042745, "balance_loss_mlp": 1.01339388, "epoch": 0.7610172548548068, "flos": 23807302738560.0, "grad_norm": 1.5594937834755855, "language_loss": 0.7392202, "learning_rate": 5.696310677382212e-07, "loss": 0.7611137, "num_input_tokens_seen": 135986795, "step": 6329, "time_per_iteration": 2.7028796672821045 }, { "auxiliary_loss_clip": 0.01072074, "auxiliary_loss_mlp": 0.01002613, "balance_loss_clip": 0.90251535, "balance_loss_mlp": 1.00088429, "epoch": 0.7611374977454458, "flos": 66496580426880.0, "grad_norm": 0.8843111768204993, "language_loss": 0.61803126, "learning_rate": 5.690867251141576e-07, "loss": 0.63877809, "num_input_tokens_seen": 136053450, "step": 6330, "time_per_iteration": 3.4014196395874023 }, { "auxiliary_loss_clip": 0.01174771, "auxiliary_loss_mlp": 0.01024468, "balance_loss_clip": 1.00991392, "balance_loss_mlp": 1.0175004, "epoch": 0.7612577406360849, "flos": 15633136592640.0, "grad_norm": 2.5308904847884484, "language_loss": 0.91311967, "learning_rate": 5.685425995585013e-07, "loss": 0.93511212, "num_input_tokens_seen": 136071375, "step": 6331, "time_per_iteration": 2.6178364753723145 }, { "auxiliary_loss_clip": 0.01070886, "auxiliary_loss_mlp": 0.01004089, "balance_loss_clip": 0.93697178, "balance_loss_mlp": 1.00251567, "epoch": 0.761377983526724, "flos": 60526253237760.0, "grad_norm": 0.7608266315359651, "language_loss": 0.59102547, "learning_rate": 5.679986911537935e-07, "loss": 0.61177522, "num_input_tokens_seen": 136138905, "step": 6332, "time_per_iteration": 3.364859104156494 }, { "auxiliary_loss_clip": 0.01152625, "auxiliary_loss_mlp": 0.0102369, "balance_loss_clip": 0.89384937, "balance_loss_mlp": 1.01664746, "epoch": 0.7614982264173631, "flos": 35772522019200.0, "grad_norm": 1.7276267152950329, "language_loss": 0.67739189, "learning_rate": 5.674549999825462e-07, "loss": 0.69915509, "num_input_tokens_seen": 136161720, "step": 6333, "time_per_iteration": 2.8255536556243896 }, { "auxiliary_loss_clip": 0.01066751, "auxiliary_loss_mlp": 0.01004225, "balance_loss_clip": 0.97341168, "balance_loss_mlp": 1.00260341, "epoch": 0.7616184693080021, "flos": 67925502345600.0, "grad_norm": 0.9347153019979894, "language_loss": 0.71472502, "learning_rate": 5.669115261272363e-07, "loss": 0.73543477, "num_input_tokens_seen": 136222040, "step": 6334, "time_per_iteration": 3.194619655609131 }, { "auxiliary_loss_clip": 0.0116705, "auxiliary_loss_mlp": 0.01023045, "balance_loss_clip": 1.01045096, "balance_loss_mlp": 1.01568961, "epoch": 0.7617387121986413, "flos": 20521979141760.0, "grad_norm": 2.417656064675074, "language_loss": 0.7302326, "learning_rate": 5.663682696703081e-07, "loss": 0.75213349, "num_input_tokens_seen": 136240305, "step": 6335, "time_per_iteration": 3.5092782974243164 }, { "auxiliary_loss_clip": 0.01165215, "auxiliary_loss_mlp": 0.01022784, "balance_loss_clip": 1.04811025, "balance_loss_mlp": 1.01613832, "epoch": 0.7618589550892804, "flos": 18624495283200.0, "grad_norm": 1.7972405206125586, "language_loss": 0.82052827, "learning_rate": 5.658252306941746e-07, "loss": 0.84240824, "num_input_tokens_seen": 136259625, "step": 6336, "time_per_iteration": 2.6271026134490967 }, { "auxiliary_loss_clip": 0.01167542, "auxiliary_loss_mlp": 0.01028283, "balance_loss_clip": 0.89662457, "balance_loss_mlp": 1.02076364, "epoch": 0.7619791979799194, "flos": 17453735389440.0, "grad_norm": 2.3725055273108753, "language_loss": 0.75710154, "learning_rate": 5.65282409281212e-07, "loss": 0.77905983, "num_input_tokens_seen": 136277090, "step": 6337, "time_per_iteration": 2.7362277507781982 }, { "auxiliary_loss_clip": 0.01162213, "auxiliary_loss_mlp": 0.01032063, "balance_loss_clip": 0.97159237, "balance_loss_mlp": 1.02439761, "epoch": 0.7620994408705585, "flos": 14137421333760.0, "grad_norm": 2.06003849508506, "language_loss": 0.69999063, "learning_rate": 5.64739805513768e-07, "loss": 0.72193336, "num_input_tokens_seen": 136294635, "step": 6338, "time_per_iteration": 2.7106306552886963 }, { "auxiliary_loss_clip": 0.0106525, "auxiliary_loss_mlp": 0.01115733, "balance_loss_clip": 0.97562194, "balance_loss_mlp": 0.0, "epoch": 0.7622196837611976, "flos": 70708792527360.0, "grad_norm": 0.7886209509160019, "language_loss": 0.55763149, "learning_rate": 5.641974194741541e-07, "loss": 0.57944131, "num_input_tokens_seen": 136350320, "step": 6339, "time_per_iteration": 5.1099629402160645 }, { "auxiliary_loss_clip": 0.01072663, "auxiliary_loss_mlp": 0.01001221, "balance_loss_clip": 0.94889683, "balance_loss_mlp": 0.99951631, "epoch": 0.7623399266518367, "flos": 60684150447360.0, "grad_norm": 0.7864342202578544, "language_loss": 0.63745749, "learning_rate": 5.636552512446502e-07, "loss": 0.65819633, "num_input_tokens_seen": 136411375, "step": 6340, "time_per_iteration": 3.1666619777679443 }, { "auxiliary_loss_clip": 0.01164712, "auxiliary_loss_mlp": 0.01025451, "balance_loss_clip": 1.01103187, "balance_loss_mlp": 1.01866829, "epoch": 0.7624601695424758, "flos": 26468893641600.0, "grad_norm": 3.4525680261426763, "language_loss": 0.77750444, "learning_rate": 5.631133009075027e-07, "loss": 0.79940611, "num_input_tokens_seen": 136430560, "step": 6341, "time_per_iteration": 3.618424892425537 }, { "auxiliary_loss_clip": 0.01169215, "auxiliary_loss_mlp": 0.01122059, "balance_loss_clip": 1.01159966, "balance_loss_mlp": 0.0, "epoch": 0.7625804124331149, "flos": 19135755515520.0, "grad_norm": 1.789974980350881, "language_loss": 0.68675935, "learning_rate": 5.625715685449242e-07, "loss": 0.70967209, "num_input_tokens_seen": 136448665, "step": 6342, "time_per_iteration": 2.674933433532715 }, { "auxiliary_loss_clip": 0.01169597, "auxiliary_loss_mlp": 0.01025435, "balance_loss_clip": 0.94063902, "balance_loss_mlp": 1.0188396, "epoch": 0.762700655323754, "flos": 26213101914240.0, "grad_norm": 1.5746433872022618, "language_loss": 0.71460056, "learning_rate": 5.620300542390966e-07, "loss": 0.73655087, "num_input_tokens_seen": 136469710, "step": 6343, "time_per_iteration": 2.7912862300872803 }, { "auxiliary_loss_clip": 0.01158996, "auxiliary_loss_mlp": 0.01028292, "balance_loss_clip": 0.96832848, "balance_loss_mlp": 1.02177453, "epoch": 0.762820898214393, "flos": 22382582711040.0, "grad_norm": 1.8017653533595235, "language_loss": 0.85097808, "learning_rate": 5.614887580721659e-07, "loss": 0.87285089, "num_input_tokens_seen": 136489855, "step": 6344, "time_per_iteration": 2.7051279544830322 }, { "auxiliary_loss_clip": 0.01160954, "auxiliary_loss_mlp": 0.0103014, "balance_loss_clip": 0.93685907, "balance_loss_mlp": 1.02290368, "epoch": 0.7629411411050322, "flos": 15700504550400.0, "grad_norm": 2.2777217390690576, "language_loss": 0.7394886, "learning_rate": 5.609476801262481e-07, "loss": 0.76139963, "num_input_tokens_seen": 136504715, "step": 6345, "time_per_iteration": 2.7034213542938232 }, { "auxiliary_loss_clip": 0.01161166, "auxiliary_loss_mlp": 0.01030299, "balance_loss_clip": 0.93319744, "balance_loss_mlp": 1.0229615, "epoch": 0.7630613839956712, "flos": 13770342293760.0, "grad_norm": 4.776309930939368, "language_loss": 0.6378175, "learning_rate": 5.604068204834223e-07, "loss": 0.6597321, "num_input_tokens_seen": 136521610, "step": 6346, "time_per_iteration": 2.692176342010498 }, { "auxiliary_loss_clip": 0.01162311, "auxiliary_loss_mlp": 0.01122737, "balance_loss_clip": 0.89658308, "balance_loss_mlp": 0.0, "epoch": 0.7631816268863103, "flos": 14569569861120.0, "grad_norm": 1.972031429754075, "language_loss": 0.76549065, "learning_rate": 5.598661792257367e-07, "loss": 0.78834116, "num_input_tokens_seen": 136538655, "step": 6347, "time_per_iteration": 2.736286163330078 }, { "auxiliary_loss_clip": 0.01164356, "auxiliary_loss_mlp": 0.01020119, "balance_loss_clip": 1.00875306, "balance_loss_mlp": 1.01352632, "epoch": 0.7633018697769495, "flos": 19062210418560.0, "grad_norm": 2.565219807043183, "language_loss": 0.76205719, "learning_rate": 5.593257564352071e-07, "loss": 0.78390193, "num_input_tokens_seen": 136557095, "step": 6348, "time_per_iteration": 2.669297218322754 }, { "auxiliary_loss_clip": 0.01164587, "auxiliary_loss_mlp": 0.01027095, "balance_loss_clip": 1.00849402, "balance_loss_mlp": 1.02025557, "epoch": 0.7634221126675885, "flos": 22052958577920.0, "grad_norm": 1.45029202075361, "language_loss": 0.75531733, "learning_rate": 5.58785552193815e-07, "loss": 0.77723408, "num_input_tokens_seen": 136577340, "step": 6349, "time_per_iteration": 2.722832441329956 }, { "auxiliary_loss_clip": 0.01167804, "auxiliary_loss_mlp": 0.01028109, "balance_loss_clip": 1.04832935, "balance_loss_mlp": 1.02122474, "epoch": 0.7635423555582276, "flos": 29382720825600.0, "grad_norm": 1.8717850402654224, "language_loss": 0.7589916, "learning_rate": 5.582455665835086e-07, "loss": 0.78095067, "num_input_tokens_seen": 136597635, "step": 6350, "time_per_iteration": 2.6507341861724854 }, { "auxiliary_loss_clip": 0.01172185, "auxiliary_loss_mlp": 0.01031107, "balance_loss_clip": 0.96886635, "balance_loss_mlp": 1.02342677, "epoch": 0.7636625984488667, "flos": 17784903807360.0, "grad_norm": 2.7493340681736815, "language_loss": 0.72994411, "learning_rate": 5.577057996862036e-07, "loss": 0.75197703, "num_input_tokens_seen": 136615260, "step": 6351, "time_per_iteration": 2.6536848545074463 }, { "auxiliary_loss_clip": 0.01164351, "auxiliary_loss_mlp": 0.01025996, "balance_loss_clip": 1.04718733, "balance_loss_mlp": 1.01924562, "epoch": 0.7637828413395058, "flos": 23734583654400.0, "grad_norm": 2.1042877756838574, "language_loss": 0.75950921, "learning_rate": 5.571662515837814e-07, "loss": 0.78141272, "num_input_tokens_seen": 136637220, "step": 6352, "time_per_iteration": 2.6180477142333984 }, { "auxiliary_loss_clip": 0.01165498, "auxiliary_loss_mlp": 0.01021235, "balance_loss_clip": 0.97210896, "balance_loss_mlp": 1.01414156, "epoch": 0.7639030842301449, "flos": 36283279461120.0, "grad_norm": 1.5996175596172086, "language_loss": 0.8390764, "learning_rate": 5.566269223580926e-07, "loss": 0.86094373, "num_input_tokens_seen": 136658930, "step": 6353, "time_per_iteration": 2.795624256134033 }, { "auxiliary_loss_clip": 0.01169337, "auxiliary_loss_mlp": 0.01021017, "balance_loss_clip": 1.01037669, "balance_loss_mlp": 1.01436543, "epoch": 0.764023327120784, "flos": 28878104609280.0, "grad_norm": 1.9446880053841271, "language_loss": 0.75255799, "learning_rate": 5.560878120909511e-07, "loss": 0.77446151, "num_input_tokens_seen": 136681530, "step": 6354, "time_per_iteration": 2.6830506324768066 }, { "auxiliary_loss_clip": 0.01067987, "auxiliary_loss_mlp": 0.01001223, "balance_loss_clip": 0.97394979, "balance_loss_mlp": 0.99957764, "epoch": 0.7641435700114231, "flos": 64789711067520.0, "grad_norm": 0.9701609046699711, "language_loss": 0.58595675, "learning_rate": 5.55548920864141e-07, "loss": 0.6066488, "num_input_tokens_seen": 136742185, "step": 6355, "time_per_iteration": 3.234095811843872 }, { "auxiliary_loss_clip": 0.01166129, "auxiliary_loss_mlp": 0.01027979, "balance_loss_clip": 1.01129961, "balance_loss_mlp": 1.02142835, "epoch": 0.7642638129020621, "flos": 16835784785280.0, "grad_norm": 2.369423712376231, "language_loss": 0.77841121, "learning_rate": 5.550102487594113e-07, "loss": 0.80035233, "num_input_tokens_seen": 136760855, "step": 6356, "time_per_iteration": 2.587348222732544 }, { "auxiliary_loss_clip": 0.01164479, "auxiliary_loss_mlp": 0.01121991, "balance_loss_clip": 0.89256859, "balance_loss_mlp": 0.0, "epoch": 0.7643840557927013, "flos": 30408940391040.0, "grad_norm": 1.613092114026478, "language_loss": 0.71385944, "learning_rate": 5.54471795858477e-07, "loss": 0.7367242, "num_input_tokens_seen": 136780925, "step": 6357, "time_per_iteration": 2.8248953819274902 }, { "auxiliary_loss_clip": 0.01159071, "auxiliary_loss_mlp": 0.01023094, "balance_loss_clip": 0.92776167, "balance_loss_mlp": 1.01598048, "epoch": 0.7645042986833404, "flos": 16983234115200.0, "grad_norm": 6.21826765184467, "language_loss": 0.82820964, "learning_rate": 5.539335622430235e-07, "loss": 0.85003132, "num_input_tokens_seen": 136799545, "step": 6358, "time_per_iteration": 2.687966823577881 }, { "auxiliary_loss_clip": 0.01157907, "auxiliary_loss_mlp": 0.01024419, "balance_loss_clip": 1.00669813, "balance_loss_mlp": 1.01734662, "epoch": 0.7646245415739794, "flos": 17311493531520.0, "grad_norm": 2.2223588363444855, "language_loss": 0.7482751, "learning_rate": 5.533955479946975e-07, "loss": 0.77009833, "num_input_tokens_seen": 136818325, "step": 6359, "time_per_iteration": 2.609250545501709 }, { "auxiliary_loss_clip": 0.01077024, "auxiliary_loss_mlp": 0.01116713, "balance_loss_clip": 0.91174567, "balance_loss_mlp": 0.0, "epoch": 0.7647447844646186, "flos": 70402332666240.0, "grad_norm": 0.8571628504954478, "language_loss": 0.65784937, "learning_rate": 5.528577531951173e-07, "loss": 0.67978674, "num_input_tokens_seen": 136878730, "step": 6360, "time_per_iteration": 3.277566432952881 }, { "auxiliary_loss_clip": 0.01168917, "auxiliary_loss_mlp": 0.01028309, "balance_loss_clip": 0.97253239, "balance_loss_mlp": 1.02141547, "epoch": 0.7648650273552576, "flos": 17675914965120.0, "grad_norm": 1.923612452563062, "language_loss": 0.73897165, "learning_rate": 5.523201779258653e-07, "loss": 0.76094389, "num_input_tokens_seen": 136897705, "step": 6361, "time_per_iteration": 3.367898941040039 }, { "auxiliary_loss_clip": 0.01166341, "auxiliary_loss_mlp": 0.01028568, "balance_loss_clip": 1.04552436, "balance_loss_mlp": 1.02104604, "epoch": 0.7649852702458967, "flos": 22162019247360.0, "grad_norm": 1.9358938102162082, "language_loss": 0.83948481, "learning_rate": 5.517828222684912e-07, "loss": 0.86143392, "num_input_tokens_seen": 136918360, "step": 6362, "time_per_iteration": 2.6432831287384033 }, { "auxiliary_loss_clip": 0.01067859, "auxiliary_loss_mlp": 0.01001705, "balance_loss_clip": 0.93789601, "balance_loss_mlp": 1.00010777, "epoch": 0.7651055131365359, "flos": 69848338227840.0, "grad_norm": 0.7676709602721373, "language_loss": 0.59092468, "learning_rate": 5.512456863045117e-07, "loss": 0.61162031, "num_input_tokens_seen": 136979050, "step": 6363, "time_per_iteration": 3.2407758235931396 }, { "auxiliary_loss_clip": 0.01166735, "auxiliary_loss_mlp": 0.01024798, "balance_loss_clip": 1.04590511, "balance_loss_mlp": 1.01763391, "epoch": 0.7652257560271749, "flos": 19464014931840.0, "grad_norm": 2.3492687632647438, "language_loss": 0.73993623, "learning_rate": 5.507087701154089e-07, "loss": 0.76185149, "num_input_tokens_seen": 136998970, "step": 6364, "time_per_iteration": 2.607318639755249 }, { "auxiliary_loss_clip": 0.01163759, "auxiliary_loss_mlp": 0.01024951, "balance_loss_clip": 0.89410633, "balance_loss_mlp": 1.01779866, "epoch": 0.765345998917814, "flos": 15961108700160.0, "grad_norm": 2.5714318605776136, "language_loss": 0.7542218, "learning_rate": 5.50172073782634e-07, "loss": 0.77610886, "num_input_tokens_seen": 137016950, "step": 6365, "time_per_iteration": 3.6676547527313232 }, { "auxiliary_loss_clip": 0.01163896, "auxiliary_loss_mlp": 0.01026191, "balance_loss_clip": 0.93494892, "balance_loss_mlp": 1.01948202, "epoch": 0.7654662418084531, "flos": 23659853408640.0, "grad_norm": 1.652421696223589, "language_loss": 0.87705588, "learning_rate": 5.496355973876023e-07, "loss": 0.89895672, "num_input_tokens_seen": 137036205, "step": 6366, "time_per_iteration": 2.7024242877960205 }, { "auxiliary_loss_clip": 0.01159283, "auxiliary_loss_mlp": 0.01123012, "balance_loss_clip": 0.93059504, "balance_loss_mlp": 0.0, "epoch": 0.7655864846990922, "flos": 41463608878080.0, "grad_norm": 1.775180072994, "language_loss": 0.70918822, "learning_rate": 5.490993410116984e-07, "loss": 0.73201114, "num_input_tokens_seen": 137059195, "step": 6367, "time_per_iteration": 3.808515787124634 }, { "auxiliary_loss_clip": 0.01158017, "auxiliary_loss_mlp": 0.01023762, "balance_loss_clip": 0.93250591, "balance_loss_mlp": 1.0170114, "epoch": 0.7657067275897312, "flos": 43142684088960.0, "grad_norm": 1.8144624052602079, "language_loss": 0.69466418, "learning_rate": 5.485633047362704e-07, "loss": 0.71648192, "num_input_tokens_seen": 137081200, "step": 6368, "time_per_iteration": 2.9050776958465576 }, { "auxiliary_loss_clip": 0.01172852, "auxiliary_loss_mlp": 0.01028071, "balance_loss_clip": 1.05160546, "balance_loss_mlp": 1.02076626, "epoch": 0.7658269704803703, "flos": 17311780840320.0, "grad_norm": 2.811239645265645, "language_loss": 0.78758973, "learning_rate": 5.480274886426341e-07, "loss": 0.80959892, "num_input_tokens_seen": 137097840, "step": 6369, "time_per_iteration": 2.5776050090789795 }, { "auxiliary_loss_clip": 0.0116473, "auxiliary_loss_mlp": 0.01028015, "balance_loss_clip": 1.01197743, "balance_loss_mlp": 1.0208149, "epoch": 0.7659472133710095, "flos": 12568160977920.0, "grad_norm": 1.893381400909145, "language_loss": 0.77729136, "learning_rate": 5.474918928120744e-07, "loss": 0.79921877, "num_input_tokens_seen": 137114335, "step": 6370, "time_per_iteration": 2.641101121902466 }, { "auxiliary_loss_clip": 0.01163991, "auxiliary_loss_mlp": 0.01025529, "balance_loss_clip": 1.00918007, "balance_loss_mlp": 1.01872218, "epoch": 0.7660674562616485, "flos": 22707430335360.0, "grad_norm": 1.6715184205192455, "language_loss": 0.87197661, "learning_rate": 5.469565173258392e-07, "loss": 0.89387178, "num_input_tokens_seen": 137132850, "step": 6371, "time_per_iteration": 2.701861619949341 }, { "auxiliary_loss_clip": 0.011717, "auxiliary_loss_mlp": 0.01028215, "balance_loss_clip": 1.04833198, "balance_loss_mlp": 1.02038574, "epoch": 0.7661876991522876, "flos": 17056455989760.0, "grad_norm": 2.8439497439966477, "language_loss": 0.64164877, "learning_rate": 5.464213622651454e-07, "loss": 0.66364789, "num_input_tokens_seen": 137150665, "step": 6372, "time_per_iteration": 2.5955357551574707 }, { "auxiliary_loss_clip": 0.01170669, "auxiliary_loss_mlp": 0.01028446, "balance_loss_clip": 0.93462199, "balance_loss_mlp": 1.02082229, "epoch": 0.7663079420429267, "flos": 20084228092800.0, "grad_norm": 1.7377336207097733, "language_loss": 0.83993804, "learning_rate": 5.458864277111753e-07, "loss": 0.86192918, "num_input_tokens_seen": 137168500, "step": 6373, "time_per_iteration": 2.6394190788269043 }, { "auxiliary_loss_clip": 0.01158121, "auxiliary_loss_mlp": 0.01121897, "balance_loss_clip": 0.97028613, "balance_loss_mlp": 0.0, "epoch": 0.7664281849335658, "flos": 12677473042560.0, "grad_norm": 2.2726338530271897, "language_loss": 0.69140655, "learning_rate": 5.453517137450769e-07, "loss": 0.7142067, "num_input_tokens_seen": 137185075, "step": 6374, "time_per_iteration": 2.6650030612945557 }, { "auxiliary_loss_clip": 0.01168906, "auxiliary_loss_mlp": 0.0103009, "balance_loss_clip": 1.0126332, "balance_loss_mlp": 1.02253771, "epoch": 0.7665484278242048, "flos": 22345271458560.0, "grad_norm": 2.297009560042903, "language_loss": 0.75898409, "learning_rate": 5.448172204479684e-07, "loss": 0.78097403, "num_input_tokens_seen": 137204355, "step": 6375, "time_per_iteration": 2.6424689292907715 }, { "auxiliary_loss_clip": 0.01165418, "auxiliary_loss_mlp": 0.01022124, "balance_loss_clip": 1.04735923, "balance_loss_mlp": 1.01504862, "epoch": 0.766668670714844, "flos": 23617909301760.0, "grad_norm": 1.6563527607760844, "language_loss": 0.74541461, "learning_rate": 5.442829479009294e-07, "loss": 0.76729006, "num_input_tokens_seen": 137223135, "step": 6376, "time_per_iteration": 2.654327869415283 }, { "auxiliary_loss_clip": 0.01176287, "auxiliary_loss_mlp": 0.01027146, "balance_loss_clip": 1.01141381, "balance_loss_mlp": 1.01998115, "epoch": 0.7667889136054831, "flos": 19427134642560.0, "grad_norm": 1.7574249181062396, "language_loss": 0.71752375, "learning_rate": 5.437488961850103e-07, "loss": 0.7395581, "num_input_tokens_seen": 137242935, "step": 6377, "time_per_iteration": 2.7054309844970703 }, { "auxiliary_loss_clip": 0.01163405, "auxiliary_loss_mlp": 0.01023319, "balance_loss_clip": 0.89679098, "balance_loss_mlp": 1.01702809, "epoch": 0.7669091564961221, "flos": 26866352609280.0, "grad_norm": 1.6575482279838731, "language_loss": 0.75310266, "learning_rate": 5.432150653812258e-07, "loss": 0.77496994, "num_input_tokens_seen": 137262970, "step": 6378, "time_per_iteration": 2.8197832107543945 }, { "auxiliary_loss_clip": 0.01162688, "auxiliary_loss_mlp": 0.01027171, "balance_loss_clip": 1.00979352, "balance_loss_mlp": 1.01985431, "epoch": 0.7670293993867613, "flos": 12385303816320.0, "grad_norm": 1.9795894576058317, "language_loss": 0.82734382, "learning_rate": 5.42681455570557e-07, "loss": 0.84924239, "num_input_tokens_seen": 137279500, "step": 6379, "time_per_iteration": 2.639616012573242 }, { "auxiliary_loss_clip": 0.01163422, "auxiliary_loss_mlp": 0.01025426, "balance_loss_clip": 1.04611039, "balance_loss_mlp": 1.01809156, "epoch": 0.7671496422774003, "flos": 21762944167680.0, "grad_norm": 1.550166190886845, "language_loss": 0.64569801, "learning_rate": 5.42148066833954e-07, "loss": 0.66758645, "num_input_tokens_seen": 137298745, "step": 6380, "time_per_iteration": 2.6063129901885986 }, { "auxiliary_loss_clip": 0.01167841, "auxiliary_loss_mlp": 0.01021842, "balance_loss_clip": 1.04927254, "balance_loss_mlp": 1.01495194, "epoch": 0.7672698851680394, "flos": 21069221823360.0, "grad_norm": 4.141537924364451, "language_loss": 0.75786269, "learning_rate": 5.416148992523289e-07, "loss": 0.77975953, "num_input_tokens_seen": 137317320, "step": 6381, "time_per_iteration": 2.6222329139709473 }, { "auxiliary_loss_clip": 0.01159672, "auxiliary_loss_mlp": 0.01028599, "balance_loss_clip": 0.81874913, "balance_loss_mlp": 1.02178669, "epoch": 0.7673901280586786, "flos": 16976697840000.0, "grad_norm": 2.1324336645682025, "language_loss": 0.78813255, "learning_rate": 5.410819529065644e-07, "loss": 0.8100152, "num_input_tokens_seen": 137335275, "step": 6382, "time_per_iteration": 2.7770752906799316 }, { "auxiliary_loss_clip": 0.01162023, "auxiliary_loss_mlp": 0.0102399, "balance_loss_clip": 0.89209402, "balance_loss_mlp": 1.01748419, "epoch": 0.7675103709493176, "flos": 29242669697280.0, "grad_norm": 2.0369162668341927, "language_loss": 0.65243661, "learning_rate": 5.405492278775079e-07, "loss": 0.6742968, "num_input_tokens_seen": 137355055, "step": 6383, "time_per_iteration": 2.8126046657562256 }, { "auxiliary_loss_clip": 0.0116349, "auxiliary_loss_mlp": 0.01032816, "balance_loss_clip": 0.96891832, "balance_loss_mlp": 1.02537155, "epoch": 0.7676306138399567, "flos": 29023004073600.0, "grad_norm": 2.3446800838111233, "language_loss": 0.79645824, "learning_rate": 5.400167242459732e-07, "loss": 0.8184213, "num_input_tokens_seen": 137374015, "step": 6384, "time_per_iteration": 2.7086167335510254 }, { "auxiliary_loss_clip": 0.01161936, "auxiliary_loss_mlp": 0.0102922, "balance_loss_clip": 1.00863361, "balance_loss_mlp": 1.02210307, "epoch": 0.7677508567305958, "flos": 22565116650240.0, "grad_norm": 1.5687694754535377, "language_loss": 0.80529213, "learning_rate": 5.394844420927405e-07, "loss": 0.82720363, "num_input_tokens_seen": 137393625, "step": 6385, "time_per_iteration": 2.6824207305908203 }, { "auxiliary_loss_clip": 0.0116638, "auxiliary_loss_mlp": 0.01028906, "balance_loss_clip": 1.04818273, "balance_loss_mlp": 1.02185512, "epoch": 0.7678710996212349, "flos": 25411432222080.0, "grad_norm": 2.4899223698824913, "language_loss": 0.72947431, "learning_rate": 5.389523814985562e-07, "loss": 0.75142717, "num_input_tokens_seen": 137413045, "step": 6386, "time_per_iteration": 2.658154010772705 }, { "auxiliary_loss_clip": 0.01160407, "auxiliary_loss_mlp": 0.01026894, "balance_loss_clip": 0.89370024, "balance_loss_mlp": 1.01924372, "epoch": 0.767991342511874, "flos": 26756825063040.0, "grad_norm": 2.006374905069691, "language_loss": 0.76042736, "learning_rate": 5.384205425441344e-07, "loss": 0.78230041, "num_input_tokens_seen": 137433955, "step": 6387, "time_per_iteration": 3.617708921432495 }, { "auxiliary_loss_clip": 0.01163236, "auxiliary_loss_mlp": 0.01026414, "balance_loss_clip": 0.96994078, "balance_loss_mlp": 1.01928806, "epoch": 0.7681115854025131, "flos": 26359509749760.0, "grad_norm": 1.6509078476854464, "language_loss": 0.84186774, "learning_rate": 5.378889253101537e-07, "loss": 0.86376429, "num_input_tokens_seen": 137454510, "step": 6388, "time_per_iteration": 2.7696480751037598 }, { "auxiliary_loss_clip": 0.01163914, "auxiliary_loss_mlp": 0.01026889, "balance_loss_clip": 1.00664926, "balance_loss_mlp": 1.02033603, "epoch": 0.7682318282931522, "flos": 23257043314560.0, "grad_norm": 2.0371660745269753, "language_loss": 0.81044555, "learning_rate": 5.373575298772617e-07, "loss": 0.83235359, "num_input_tokens_seen": 137473630, "step": 6389, "time_per_iteration": 2.7246975898742676 }, { "auxiliary_loss_clip": 0.01067115, "auxiliary_loss_mlp": 0.01002113, "balance_loss_clip": 0.97332478, "balance_loss_mlp": 1.00040841, "epoch": 0.7683520711837912, "flos": 70072457137920.0, "grad_norm": 0.7680656777950574, "language_loss": 0.61355591, "learning_rate": 5.368263563260689e-07, "loss": 0.6342482, "num_input_tokens_seen": 137538765, "step": 6390, "time_per_iteration": 3.294503688812256 }, { "auxiliary_loss_clip": 0.01163143, "auxiliary_loss_mlp": 0.01026421, "balance_loss_clip": 1.00732899, "balance_loss_mlp": 1.01930118, "epoch": 0.7684723140744304, "flos": 18624890332800.0, "grad_norm": 1.5091937981208767, "language_loss": 0.64170974, "learning_rate": 5.362954047371537e-07, "loss": 0.66360539, "num_input_tokens_seen": 137557875, "step": 6391, "time_per_iteration": 4.5413923263549805 }, { "auxiliary_loss_clip": 0.01167807, "auxiliary_loss_mlp": 0.01027831, "balance_loss_clip": 0.93944502, "balance_loss_mlp": 1.02041936, "epoch": 0.7685925569650695, "flos": 27452989532160.0, "grad_norm": 2.0424899979425692, "language_loss": 0.72007871, "learning_rate": 5.357646751910627e-07, "loss": 0.74203509, "num_input_tokens_seen": 137579055, "step": 6392, "time_per_iteration": 2.730485200881958 }, { "auxiliary_loss_clip": 0.0116061, "auxiliary_loss_mlp": 0.0102815, "balance_loss_clip": 0.96967191, "balance_loss_mlp": 1.0202229, "epoch": 0.7687127998557085, "flos": 24535714642560.0, "grad_norm": 2.0842946703807868, "language_loss": 0.79321533, "learning_rate": 5.352341677683061e-07, "loss": 0.81510293, "num_input_tokens_seen": 137600355, "step": 6393, "time_per_iteration": 3.637333393096924 }, { "auxiliary_loss_clip": 0.01173481, "auxiliary_loss_mlp": 0.01024722, "balance_loss_clip": 0.93518519, "balance_loss_mlp": 1.0179956, "epoch": 0.7688330427463477, "flos": 25155963717120.0, "grad_norm": 2.4372750889143884, "language_loss": 0.7883271, "learning_rate": 5.347038825493617e-07, "loss": 0.81030917, "num_input_tokens_seen": 137621885, "step": 6394, "time_per_iteration": 2.7304561138153076 }, { "auxiliary_loss_clip": 0.01162484, "auxiliary_loss_mlp": 0.01024247, "balance_loss_clip": 0.97396892, "balance_loss_mlp": 1.01763058, "epoch": 0.7689532856369867, "flos": 21211284113280.0, "grad_norm": 2.9219349898505715, "language_loss": 0.68203551, "learning_rate": 5.341738196146732e-07, "loss": 0.70390284, "num_input_tokens_seen": 137640230, "step": 6395, "time_per_iteration": 2.6578257083892822 }, { "auxiliary_loss_clip": 0.01161095, "auxiliary_loss_mlp": 0.01027006, "balance_loss_clip": 1.00799227, "balance_loss_mlp": 1.02010655, "epoch": 0.7690735285276258, "flos": 25119083427840.0, "grad_norm": 2.076901001376128, "language_loss": 0.7358669, "learning_rate": 5.336439790446503e-07, "loss": 0.75774789, "num_input_tokens_seen": 137659330, "step": 6396, "time_per_iteration": 2.711379289627075 }, { "auxiliary_loss_clip": 0.01158496, "auxiliary_loss_mlp": 0.01025514, "balance_loss_clip": 0.93068802, "balance_loss_mlp": 1.01858187, "epoch": 0.769193771418265, "flos": 54744020640000.0, "grad_norm": 2.7828252245166456, "language_loss": 0.62491351, "learning_rate": 5.331143609196711e-07, "loss": 0.64675361, "num_input_tokens_seen": 137683145, "step": 6397, "time_per_iteration": 2.988229513168335 }, { "auxiliary_loss_clip": 0.01164974, "auxiliary_loss_mlp": 0.01027267, "balance_loss_clip": 1.00984526, "balance_loss_mlp": 1.01984656, "epoch": 0.769314014308904, "flos": 37341890115840.0, "grad_norm": 1.706061282599936, "language_loss": 0.76927316, "learning_rate": 5.325849653200758e-07, "loss": 0.79119551, "num_input_tokens_seen": 137707095, "step": 6398, "time_per_iteration": 2.8059864044189453 }, { "auxiliary_loss_clip": 0.01168409, "auxiliary_loss_mlp": 0.01025238, "balance_loss_clip": 1.04853296, "balance_loss_mlp": 1.01814234, "epoch": 0.7694342571995431, "flos": 20631686256000.0, "grad_norm": 1.664941980573841, "language_loss": 0.76091516, "learning_rate": 5.32055792326175e-07, "loss": 0.7828517, "num_input_tokens_seen": 137725520, "step": 6399, "time_per_iteration": 2.5981361865997314 }, { "auxiliary_loss_clip": 0.0116917, "auxiliary_loss_mlp": 0.01025692, "balance_loss_clip": 0.97360075, "balance_loss_mlp": 1.01868248, "epoch": 0.7695545000901821, "flos": 24207706621440.0, "grad_norm": 1.9696129434296714, "language_loss": 0.72601783, "learning_rate": 5.315268420182437e-07, "loss": 0.74796641, "num_input_tokens_seen": 137744195, "step": 6400, "time_per_iteration": 2.7443227767944336 }, { "auxiliary_loss_clip": 0.0116855, "auxiliary_loss_mlp": 0.01122349, "balance_loss_clip": 0.93290073, "balance_loss_mlp": 0.0, "epoch": 0.7696747429808213, "flos": 28001273708160.0, "grad_norm": 1.9371121897689967, "language_loss": 0.76618564, "learning_rate": 5.309981144765221e-07, "loss": 0.78909457, "num_input_tokens_seen": 137764340, "step": 6401, "time_per_iteration": 2.7693262100219727 }, { "auxiliary_loss_clip": 0.01167523, "auxiliary_loss_mlp": 0.01021815, "balance_loss_clip": 0.89340186, "balance_loss_mlp": 1.01524305, "epoch": 0.7697949858714603, "flos": 11509550323200.0, "grad_norm": 2.0745018943924203, "language_loss": 0.75114697, "learning_rate": 5.304696097812196e-07, "loss": 0.77304035, "num_input_tokens_seen": 137780940, "step": 6402, "time_per_iteration": 2.853425979614258 }, { "auxiliary_loss_clip": 0.01163649, "auxiliary_loss_mlp": 0.01029971, "balance_loss_clip": 0.97069693, "balance_loss_mlp": 1.02204382, "epoch": 0.7699152287620994, "flos": 26688271956480.0, "grad_norm": 3.1110095478433113, "language_loss": 0.60619164, "learning_rate": 5.299413280125078e-07, "loss": 0.62812787, "num_input_tokens_seen": 137799250, "step": 6403, "time_per_iteration": 2.737097978591919 }, { "auxiliary_loss_clip": 0.01161835, "auxiliary_loss_mlp": 0.01021289, "balance_loss_clip": 0.96871316, "balance_loss_mlp": 1.01373732, "epoch": 0.7700354716527386, "flos": 16544944362240.0, "grad_norm": 1.8143217906316793, "language_loss": 0.72639477, "learning_rate": 5.294132692505284e-07, "loss": 0.74822599, "num_input_tokens_seen": 137817660, "step": 6404, "time_per_iteration": 2.673717975616455 }, { "auxiliary_loss_clip": 0.01151721, "auxiliary_loss_mlp": 0.01025567, "balance_loss_clip": 0.89218426, "balance_loss_mlp": 1.0190438, "epoch": 0.7701557145433776, "flos": 19242733196160.0, "grad_norm": 2.314842937277097, "language_loss": 0.79123205, "learning_rate": 5.288854335753861e-07, "loss": 0.81300497, "num_input_tokens_seen": 137835920, "step": 6405, "time_per_iteration": 2.7979423999786377 }, { "auxiliary_loss_clip": 0.01167653, "auxiliary_loss_mlp": 0.01028768, "balance_loss_clip": 1.00822949, "balance_loss_mlp": 1.02173424, "epoch": 0.7702759574340167, "flos": 31685744211840.0, "grad_norm": 1.5585092329573733, "language_loss": 0.75582534, "learning_rate": 5.283578210671551e-07, "loss": 0.77778947, "num_input_tokens_seen": 137858160, "step": 6406, "time_per_iteration": 2.696974992752075 }, { "auxiliary_loss_clip": 0.01168904, "auxiliary_loss_mlp": 0.0102601, "balance_loss_clip": 0.97137749, "balance_loss_mlp": 1.01917946, "epoch": 0.7703962003246558, "flos": 16800089644800.0, "grad_norm": 1.9517951107238831, "language_loss": 0.76699781, "learning_rate": 5.278304318058719e-07, "loss": 0.78894693, "num_input_tokens_seen": 137876015, "step": 6407, "time_per_iteration": 2.6465322971343994 }, { "auxiliary_loss_clip": 0.01160568, "auxiliary_loss_mlp": 0.01031432, "balance_loss_clip": 0.85468006, "balance_loss_mlp": 1.02423787, "epoch": 0.7705164432152949, "flos": 35736072693120.0, "grad_norm": 1.8687075501573778, "language_loss": 0.79257274, "learning_rate": 5.273032658715411e-07, "loss": 0.8144927, "num_input_tokens_seen": 137898825, "step": 6408, "time_per_iteration": 2.8572874069213867 }, { "auxiliary_loss_clip": 0.01158294, "auxiliary_loss_mlp": 0.01021951, "balance_loss_clip": 0.89399219, "balance_loss_mlp": 1.01443172, "epoch": 0.7706366861059339, "flos": 23365960329600.0, "grad_norm": 1.9259166161560086, "language_loss": 0.7669313, "learning_rate": 5.267763233441347e-07, "loss": 0.78873372, "num_input_tokens_seen": 137919455, "step": 6409, "time_per_iteration": 2.7706105709075928 }, { "auxiliary_loss_clip": 0.01169711, "auxiliary_loss_mlp": 0.01024533, "balance_loss_clip": 1.01091361, "balance_loss_mlp": 1.01642036, "epoch": 0.7707569289965731, "flos": 22929897219840.0, "grad_norm": 2.111821493619589, "language_loss": 0.69493908, "learning_rate": 5.26249604303588e-07, "loss": 0.71688145, "num_input_tokens_seen": 137937960, "step": 6410, "time_per_iteration": 2.6463592052459717 }, { "auxiliary_loss_clip": 0.01170551, "auxiliary_loss_mlp": 0.01025039, "balance_loss_clip": 1.05022931, "balance_loss_mlp": 1.01799083, "epoch": 0.7708771718872122, "flos": 17420661941760.0, "grad_norm": 2.0838366765712673, "language_loss": 0.77980363, "learning_rate": 5.257231088298057e-07, "loss": 0.80175948, "num_input_tokens_seen": 137956370, "step": 6411, "time_per_iteration": 2.6383895874023438 }, { "auxiliary_loss_clip": 0.01068359, "auxiliary_loss_mlp": 0.01001908, "balance_loss_clip": 0.89970112, "balance_loss_mlp": 1.00032222, "epoch": 0.7709974147778512, "flos": 72241316248320.0, "grad_norm": 0.7993506117842045, "language_loss": 0.54015666, "learning_rate": 5.25196837002655e-07, "loss": 0.56085932, "num_input_tokens_seen": 138016080, "step": 6412, "time_per_iteration": 3.328259229660034 }, { "auxiliary_loss_clip": 0.01161286, "auxiliary_loss_mlp": 0.01025471, "balance_loss_clip": 0.97078854, "balance_loss_mlp": 1.0181396, "epoch": 0.7711176576684904, "flos": 39859694876160.0, "grad_norm": 2.051208514224757, "language_loss": 0.68460125, "learning_rate": 5.24670788901971e-07, "loss": 0.70646882, "num_input_tokens_seen": 138039170, "step": 6413, "time_per_iteration": 3.6977059841156006 }, { "auxiliary_loss_clip": 0.01162156, "auxiliary_loss_mlp": 0.01031757, "balance_loss_clip": 0.97105718, "balance_loss_mlp": 1.02372789, "epoch": 0.7712379005591294, "flos": 36976391274240.0, "grad_norm": 2.1998990602495256, "language_loss": 0.68444157, "learning_rate": 5.241449646075557e-07, "loss": 0.70638061, "num_input_tokens_seen": 138062395, "step": 6414, "time_per_iteration": 2.897272825241089 }, { "auxiliary_loss_clip": 0.01174689, "auxiliary_loss_mlp": 0.01025509, "balance_loss_clip": 1.01082802, "balance_loss_mlp": 1.0175401, "epoch": 0.7713581434497685, "flos": 22776773541120.0, "grad_norm": 2.118560755475185, "language_loss": 0.72117567, "learning_rate": 5.236193641991762e-07, "loss": 0.74317765, "num_input_tokens_seen": 138080325, "step": 6415, "time_per_iteration": 2.6967110633850098 }, { "auxiliary_loss_clip": 0.01160877, "auxiliary_loss_mlp": 0.01025086, "balance_loss_clip": 0.97108763, "balance_loss_mlp": 1.01776671, "epoch": 0.7714783863404077, "flos": 24097460803200.0, "grad_norm": 1.7897099228917768, "language_loss": 0.69878483, "learning_rate": 5.23093987756565e-07, "loss": 0.72064447, "num_input_tokens_seen": 138099020, "step": 6416, "time_per_iteration": 2.715240478515625 }, { "auxiliary_loss_clip": 0.01170667, "auxiliary_loss_mlp": 0.01029551, "balance_loss_clip": 0.93148565, "balance_loss_mlp": 1.02175426, "epoch": 0.7715986292310467, "flos": 21063655215360.0, "grad_norm": 1.7323419382470833, "language_loss": 0.75480402, "learning_rate": 5.225688353594217e-07, "loss": 0.77680618, "num_input_tokens_seen": 138118650, "step": 6417, "time_per_iteration": 3.7294983863830566 }, { "auxiliary_loss_clip": 0.01174034, "auxiliary_loss_mlp": 0.01122625, "balance_loss_clip": 0.97422576, "balance_loss_mlp": 0.0, "epoch": 0.7717188721216858, "flos": 20594877793920.0, "grad_norm": 6.042672055956829, "language_loss": 0.77616549, "learning_rate": 5.220439070874108e-07, "loss": 0.79913211, "num_input_tokens_seen": 138137890, "step": 6418, "time_per_iteration": 3.644202947616577 }, { "auxiliary_loss_clip": 0.01166567, "auxiliary_loss_mlp": 0.01027891, "balance_loss_clip": 1.01252437, "balance_loss_mlp": 1.02094996, "epoch": 0.7718391150123249, "flos": 26250951870720.0, "grad_norm": 1.8461642139149685, "language_loss": 0.71001768, "learning_rate": 5.215192030201652e-07, "loss": 0.73196232, "num_input_tokens_seen": 138158880, "step": 6419, "time_per_iteration": 2.667611837387085 }, { "auxiliary_loss_clip": 0.01148796, "auxiliary_loss_mlp": 0.01029326, "balance_loss_clip": 0.92940634, "balance_loss_mlp": 1.02205694, "epoch": 0.771959357902964, "flos": 22049762267520.0, "grad_norm": 1.9905346009446552, "language_loss": 0.86286932, "learning_rate": 5.209947232372798e-07, "loss": 0.88465053, "num_input_tokens_seen": 138176370, "step": 6420, "time_per_iteration": 2.7120118141174316 }, { "auxiliary_loss_clip": 0.01168834, "auxiliary_loss_mlp": 0.01122687, "balance_loss_clip": 1.0097239, "balance_loss_mlp": 0.0, "epoch": 0.772079600793603, "flos": 30446000248320.0, "grad_norm": 2.056133831739459, "language_loss": 0.81216919, "learning_rate": 5.204704678183196e-07, "loss": 0.83508432, "num_input_tokens_seen": 138195105, "step": 6421, "time_per_iteration": 2.688159942626953 }, { "auxiliary_loss_clip": 0.01168807, "auxiliary_loss_mlp": 0.01024691, "balance_loss_clip": 1.0488075, "balance_loss_mlp": 1.01703215, "epoch": 0.7721998436842422, "flos": 12969857750400.0, "grad_norm": 2.5904415820754503, "language_loss": 0.84925967, "learning_rate": 5.19946436842813e-07, "loss": 0.87119466, "num_input_tokens_seen": 138212235, "step": 6422, "time_per_iteration": 2.592663049697876 }, { "auxiliary_loss_clip": 0.0116645, "auxiliary_loss_mlp": 0.01020202, "balance_loss_clip": 0.93501806, "balance_loss_mlp": 1.01365435, "epoch": 0.7723200865748813, "flos": 32635509678720.0, "grad_norm": 1.6837865215070953, "language_loss": 0.68433523, "learning_rate": 5.194226303902546e-07, "loss": 0.70620179, "num_input_tokens_seen": 138231970, "step": 6423, "time_per_iteration": 2.804802417755127 }, { "auxiliary_loss_clip": 0.01160127, "auxiliary_loss_mlp": 0.01025482, "balance_loss_clip": 0.96990019, "balance_loss_mlp": 1.01847553, "epoch": 0.7724403294655203, "flos": 21105707063040.0, "grad_norm": 2.3219536669219956, "language_loss": 0.70544225, "learning_rate": 5.188990485401072e-07, "loss": 0.72729826, "num_input_tokens_seen": 138251175, "step": 6424, "time_per_iteration": 2.659514904022217 }, { "auxiliary_loss_clip": 0.01165138, "auxiliary_loss_mlp": 0.01027707, "balance_loss_clip": 1.00928545, "balance_loss_mlp": 1.02045286, "epoch": 0.7725605723561595, "flos": 22090736707200.0, "grad_norm": 1.6711382772082766, "language_loss": 0.85993457, "learning_rate": 5.183756913717954e-07, "loss": 0.881863, "num_input_tokens_seen": 138270950, "step": 6425, "time_per_iteration": 2.6284520626068115 }, { "auxiliary_loss_clip": 0.01162764, "auxiliary_loss_mlp": 0.01025189, "balance_loss_clip": 0.97205424, "balance_loss_mlp": 1.01855445, "epoch": 0.7726808152467985, "flos": 34495610457600.0, "grad_norm": 2.6379842836454594, "language_loss": 0.73526746, "learning_rate": 5.178525589647136e-07, "loss": 0.75714695, "num_input_tokens_seen": 138292590, "step": 6426, "time_per_iteration": 2.8466813564300537 }, { "auxiliary_loss_clip": 0.01170005, "auxiliary_loss_mlp": 0.01023355, "balance_loss_clip": 0.97098076, "balance_loss_mlp": 1.01654816, "epoch": 0.7728010581374376, "flos": 22306344094080.0, "grad_norm": 2.1649711091054216, "language_loss": 0.78768194, "learning_rate": 5.173296513982197e-07, "loss": 0.80961549, "num_input_tokens_seen": 138311115, "step": 6427, "time_per_iteration": 2.666083335876465 }, { "auxiliary_loss_clip": 0.01175612, "auxiliary_loss_mlp": 0.01026651, "balance_loss_clip": 0.93581879, "balance_loss_mlp": 1.01965094, "epoch": 0.7729213010280768, "flos": 27126453968640.0, "grad_norm": 1.9987132724788281, "language_loss": 0.65245187, "learning_rate": 5.168069687516398e-07, "loss": 0.67447448, "num_input_tokens_seen": 138330885, "step": 6428, "time_per_iteration": 2.7505552768707275 }, { "auxiliary_loss_clip": 0.01167673, "auxiliary_loss_mlp": 0.01027846, "balance_loss_clip": 0.97408664, "balance_loss_mlp": 1.02067566, "epoch": 0.7730415439187158, "flos": 18150223080960.0, "grad_norm": 1.9157987233407947, "language_loss": 0.7175917, "learning_rate": 5.16284511104263e-07, "loss": 0.7395469, "num_input_tokens_seen": 138350020, "step": 6429, "time_per_iteration": 2.6765220165252686 }, { "auxiliary_loss_clip": 0.01166553, "auxiliary_loss_mlp": 0.01021502, "balance_loss_clip": 0.97356373, "balance_loss_mlp": 1.0142628, "epoch": 0.7731617868093549, "flos": 11947480940160.0, "grad_norm": 2.3985692965420005, "language_loss": 0.80851257, "learning_rate": 5.157622785353457e-07, "loss": 0.83039308, "num_input_tokens_seen": 138368135, "step": 6430, "time_per_iteration": 2.668623447418213 }, { "auxiliary_loss_clip": 0.01067308, "auxiliary_loss_mlp": 0.01002078, "balance_loss_clip": 0.97368777, "balance_loss_mlp": 1.00049257, "epoch": 0.7732820296999939, "flos": 64201027069440.0, "grad_norm": 0.6699865725425013, "language_loss": 0.6043365, "learning_rate": 5.152402711241113e-07, "loss": 0.6250304, "num_input_tokens_seen": 138436040, "step": 6431, "time_per_iteration": 3.2846436500549316 }, { "auxiliary_loss_clip": 0.01156978, "auxiliary_loss_mlp": 0.01028455, "balance_loss_clip": 0.93008983, "balance_loss_mlp": 1.02139235, "epoch": 0.7734022725906331, "flos": 25302191984640.0, "grad_norm": 1.5878806639125402, "language_loss": 0.82856381, "learning_rate": 5.147184889497465e-07, "loss": 0.85041809, "num_input_tokens_seen": 138455510, "step": 6432, "time_per_iteration": 2.7551541328430176 }, { "auxiliary_loss_clip": 0.01153033, "auxiliary_loss_mlp": 0.0102984, "balance_loss_clip": 0.93157125, "balance_loss_mlp": 1.02257395, "epoch": 0.7735225154812722, "flos": 17347440067200.0, "grad_norm": 1.9825399942094628, "language_loss": 0.80109024, "learning_rate": 5.141969320914072e-07, "loss": 0.82291895, "num_input_tokens_seen": 138473015, "step": 6433, "time_per_iteration": 2.6833760738372803 }, { "auxiliary_loss_clip": 0.01169878, "auxiliary_loss_mlp": 0.010236, "balance_loss_clip": 1.04754126, "balance_loss_mlp": 1.01600671, "epoch": 0.7736427583719112, "flos": 32630086725120.0, "grad_norm": 23.048967046997927, "language_loss": 0.6190511, "learning_rate": 5.136756006282113e-07, "loss": 0.64098591, "num_input_tokens_seen": 138491680, "step": 6434, "time_per_iteration": 2.7041988372802734 }, { "auxiliary_loss_clip": 0.0117233, "auxiliary_loss_mlp": 0.01025423, "balance_loss_clip": 1.05061698, "balance_loss_mlp": 1.0181694, "epoch": 0.7737630012625504, "flos": 19860073269120.0, "grad_norm": 2.169005904424347, "language_loss": 0.85159463, "learning_rate": 5.131544946392446e-07, "loss": 0.87357217, "num_input_tokens_seen": 138506960, "step": 6435, "time_per_iteration": 2.623309373855591 }, { "auxiliary_loss_clip": 0.01170678, "auxiliary_loss_mlp": 0.01032585, "balance_loss_clip": 0.97657698, "balance_loss_mlp": 1.02517939, "epoch": 0.7738832441531894, "flos": 36022639397760.0, "grad_norm": 1.9471249410674045, "language_loss": 0.6398133, "learning_rate": 5.126336142035592e-07, "loss": 0.66184592, "num_input_tokens_seen": 138526995, "step": 6436, "time_per_iteration": 2.792675256729126 }, { "auxiliary_loss_clip": 0.0116565, "auxiliary_loss_mlp": 0.01024952, "balance_loss_clip": 0.97112411, "balance_loss_mlp": 1.01799321, "epoch": 0.7740034870438285, "flos": 13405274415360.0, "grad_norm": 2.6358747228493726, "language_loss": 0.71890926, "learning_rate": 5.121129594001721e-07, "loss": 0.74081534, "num_input_tokens_seen": 138541260, "step": 6437, "time_per_iteration": 2.7451555728912354 }, { "auxiliary_loss_clip": 0.01165531, "auxiliary_loss_mlp": 0.01029114, "balance_loss_clip": 1.01128983, "balance_loss_mlp": 1.02169299, "epoch": 0.7741237299344677, "flos": 22086714384000.0, "grad_norm": 1.513601850614429, "language_loss": 0.81406271, "learning_rate": 5.115925303080661e-07, "loss": 0.83600914, "num_input_tokens_seen": 138560970, "step": 6438, "time_per_iteration": 3.4770474433898926 }, { "auxiliary_loss_clip": 0.01167958, "auxiliary_loss_mlp": 0.01020388, "balance_loss_clip": 0.97224236, "balance_loss_mlp": 1.01351893, "epoch": 0.7742439728251067, "flos": 19864777950720.0, "grad_norm": 2.2452184061482234, "language_loss": 0.79021436, "learning_rate": 5.110723270061899e-07, "loss": 0.81209785, "num_input_tokens_seen": 138577460, "step": 6439, "time_per_iteration": 2.6371371746063232 }, { "auxiliary_loss_clip": 0.0116653, "auxiliary_loss_mlp": 0.0102618, "balance_loss_clip": 1.04862738, "balance_loss_mlp": 1.01887798, "epoch": 0.7743642157157458, "flos": 16690167048960.0, "grad_norm": 1.8493939180328842, "language_loss": 0.7951858, "learning_rate": 5.105523495734572e-07, "loss": 0.81711286, "num_input_tokens_seen": 138594860, "step": 6440, "time_per_iteration": 2.5854599475860596 }, { "auxiliary_loss_clip": 0.01166618, "auxiliary_loss_mlp": 0.01024701, "balance_loss_clip": 1.04680753, "balance_loss_mlp": 1.01773286, "epoch": 0.7744844586063849, "flos": 20304360593280.0, "grad_norm": 1.7066383528030817, "language_loss": 0.74891937, "learning_rate": 5.100325980887499e-07, "loss": 0.77083254, "num_input_tokens_seen": 138614785, "step": 6441, "time_per_iteration": 2.5681397914886475 }, { "auxiliary_loss_clip": 0.01175631, "auxiliary_loss_mlp": 0.01026312, "balance_loss_clip": 0.97304344, "balance_loss_mlp": 1.0194788, "epoch": 0.774604701497024, "flos": 22966705681920.0, "grad_norm": 1.6691269735905094, "language_loss": 0.83432281, "learning_rate": 5.095130726309116e-07, "loss": 0.85634226, "num_input_tokens_seen": 138634960, "step": 6442, "time_per_iteration": 2.6823177337646484 }, { "auxiliary_loss_clip": 0.01063137, "auxiliary_loss_mlp": 0.01003876, "balance_loss_clip": 1.01085138, "balance_loss_mlp": 1.00229073, "epoch": 0.774724944387663, "flos": 60288523073280.0, "grad_norm": 0.9128533550415872, "language_loss": 0.59051824, "learning_rate": 5.089937732787559e-07, "loss": 0.61118841, "num_input_tokens_seen": 138699520, "step": 6443, "time_per_iteration": 5.059353351593018 }, { "auxiliary_loss_clip": 0.01163189, "auxiliary_loss_mlp": 0.01033948, "balance_loss_clip": 0.93188846, "balance_loss_mlp": 1.02607393, "epoch": 0.7748451872783022, "flos": 26761026954240.0, "grad_norm": 2.3426576920228848, "language_loss": 0.66262072, "learning_rate": 5.084747001110592e-07, "loss": 0.68459213, "num_input_tokens_seen": 138719145, "step": 6444, "time_per_iteration": 2.8166403770446777 }, { "auxiliary_loss_clip": 0.01168177, "auxiliary_loss_mlp": 0.0112248, "balance_loss_clip": 1.01446891, "balance_loss_mlp": 0.0, "epoch": 0.7749654301689413, "flos": 30338627518080.0, "grad_norm": 1.7083442253452765, "language_loss": 0.70629299, "learning_rate": 5.07955853206564e-07, "loss": 0.72919959, "num_input_tokens_seen": 138743850, "step": 6445, "time_per_iteration": 3.689237594604492 }, { "auxiliary_loss_clip": 0.0117296, "auxiliary_loss_mlp": 0.01024474, "balance_loss_clip": 1.01179063, "balance_loss_mlp": 1.01729774, "epoch": 0.7750856730595803, "flos": 43179851687040.0, "grad_norm": 1.5509772229276666, "language_loss": 0.70462787, "learning_rate": 5.074372326439807e-07, "loss": 0.7266022, "num_input_tokens_seen": 138766860, "step": 6446, "time_per_iteration": 2.8391501903533936 }, { "auxiliary_loss_clip": 0.01167986, "auxiliary_loss_mlp": 0.01025099, "balance_loss_clip": 0.93489885, "balance_loss_mlp": 1.01837862, "epoch": 0.7752059159502195, "flos": 17640040256640.0, "grad_norm": 2.0834964250013943, "language_loss": 0.73520315, "learning_rate": 5.069188385019814e-07, "loss": 0.75713396, "num_input_tokens_seen": 138784560, "step": 6447, "time_per_iteration": 2.7319891452789307 }, { "auxiliary_loss_clip": 0.01169382, "auxiliary_loss_mlp": 0.01028261, "balance_loss_clip": 0.89279288, "balance_loss_mlp": 1.02121282, "epoch": 0.7753261588408585, "flos": 12677688524160.0, "grad_norm": 3.9263814210376498, "language_loss": 0.61110312, "learning_rate": 5.064006708592077e-07, "loss": 0.63307953, "num_input_tokens_seen": 138800805, "step": 6448, "time_per_iteration": 2.7290165424346924 }, { "auxiliary_loss_clip": 0.01161312, "auxiliary_loss_mlp": 0.01026122, "balance_loss_clip": 0.97200644, "balance_loss_mlp": 1.01913595, "epoch": 0.7754464017314976, "flos": 16690741666560.0, "grad_norm": 2.117201321988177, "language_loss": 0.75357866, "learning_rate": 5.058827297942641e-07, "loss": 0.77545303, "num_input_tokens_seen": 138815910, "step": 6449, "time_per_iteration": 2.6097700595855713 }, { "auxiliary_loss_clip": 0.01173813, "auxiliary_loss_mlp": 0.01024607, "balance_loss_clip": 0.97253889, "balance_loss_mlp": 1.01766014, "epoch": 0.7755666446221368, "flos": 19718944732800.0, "grad_norm": 1.9202742005302949, "language_loss": 0.75120193, "learning_rate": 5.053650153857237e-07, "loss": 0.77318621, "num_input_tokens_seen": 138834920, "step": 6450, "time_per_iteration": 2.760012626647949 }, { "auxiliary_loss_clip": 0.01169519, "auxiliary_loss_mlp": 0.01028429, "balance_loss_clip": 1.01236236, "balance_loss_mlp": 1.02098489, "epoch": 0.7756868875127758, "flos": 18693623007360.0, "grad_norm": 1.4968864576376248, "language_loss": 0.69498718, "learning_rate": 5.048475277121214e-07, "loss": 0.71696663, "num_input_tokens_seen": 138852135, "step": 6451, "time_per_iteration": 2.6442952156066895 }, { "auxiliary_loss_clip": 0.01168993, "auxiliary_loss_mlp": 0.01020001, "balance_loss_clip": 1.0101757, "balance_loss_mlp": 1.01283681, "epoch": 0.7758071304034149, "flos": 28404191543040.0, "grad_norm": 1.729373645587011, "language_loss": 0.77368641, "learning_rate": 5.043302668519598e-07, "loss": 0.79557633, "num_input_tokens_seen": 138871470, "step": 6452, "time_per_iteration": 2.7386043071746826 }, { "auxiliary_loss_clip": 0.0116861, "auxiliary_loss_mlp": 0.01029104, "balance_loss_clip": 1.00933456, "balance_loss_mlp": 1.02184069, "epoch": 0.775927373294054, "flos": 20595344670720.0, "grad_norm": 1.8144222111361823, "language_loss": 0.71872717, "learning_rate": 5.038132328837079e-07, "loss": 0.7407043, "num_input_tokens_seen": 138889860, "step": 6453, "time_per_iteration": 2.634258270263672 }, { "auxiliary_loss_clip": 0.01166951, "auxiliary_loss_mlp": 0.0102211, "balance_loss_clip": 1.01050067, "balance_loss_mlp": 1.01547289, "epoch": 0.7760476161846931, "flos": 22526368853760.0, "grad_norm": 1.8715466506140643, "language_loss": 0.73674095, "learning_rate": 5.032964258857993e-07, "loss": 0.75863159, "num_input_tokens_seen": 138909955, "step": 6454, "time_per_iteration": 2.655517578125 }, { "auxiliary_loss_clip": 0.01163996, "auxiliary_loss_mlp": 0.01027384, "balance_loss_clip": 1.00668907, "balance_loss_mlp": 1.02051783, "epoch": 0.7761678590753321, "flos": 48651488403840.0, "grad_norm": 1.6269144637800101, "language_loss": 0.68346643, "learning_rate": 5.027798459366329e-07, "loss": 0.70538026, "num_input_tokens_seen": 138935320, "step": 6455, "time_per_iteration": 2.8426523208618164 }, { "auxiliary_loss_clip": 0.01170802, "auxiliary_loss_mlp": 0.01023249, "balance_loss_clip": 1.01090336, "balance_loss_mlp": 1.01577473, "epoch": 0.7762881019659713, "flos": 26177047637760.0, "grad_norm": 1.3869975107423111, "language_loss": 0.63432842, "learning_rate": 5.02263493114573e-07, "loss": 0.65626889, "num_input_tokens_seen": 138957115, "step": 6456, "time_per_iteration": 2.725027084350586 }, { "auxiliary_loss_clip": 0.01166805, "auxiliary_loss_mlp": 0.01025984, "balance_loss_clip": 1.04735112, "balance_loss_mlp": 1.01860523, "epoch": 0.7764083448566104, "flos": 20588341518720.0, "grad_norm": 2.2358871857164173, "language_loss": 0.77009964, "learning_rate": 5.017473674979502e-07, "loss": 0.79202759, "num_input_tokens_seen": 138973140, "step": 6457, "time_per_iteration": 2.615661859512329 }, { "auxiliary_loss_clip": 0.01063026, "auxiliary_loss_mlp": 0.01004269, "balance_loss_clip": 0.90003258, "balance_loss_mlp": 1.00270736, "epoch": 0.7765285877472494, "flos": 67293078560640.0, "grad_norm": 0.7425770504794058, "language_loss": 0.58376503, "learning_rate": 5.01231469165061e-07, "loss": 0.60443795, "num_input_tokens_seen": 139028965, "step": 6458, "time_per_iteration": 3.2331857681274414 }, { "auxiliary_loss_clip": 0.01067068, "auxiliary_loss_mlp": 0.01001431, "balance_loss_clip": 0.97451174, "balance_loss_mlp": 0.99982196, "epoch": 0.7766488306378886, "flos": 61344476121600.0, "grad_norm": 0.8334262249357043, "language_loss": 0.56950748, "learning_rate": 5.007157981941663e-07, "loss": 0.59019244, "num_input_tokens_seen": 139094325, "step": 6459, "time_per_iteration": 3.3812825679779053 }, { "auxiliary_loss_clip": 0.01070208, "auxiliary_loss_mlp": 0.01002139, "balance_loss_clip": 0.93735218, "balance_loss_mlp": 1.00045824, "epoch": 0.7767690735285276, "flos": 62946199393920.0, "grad_norm": 0.8785054242929401, "language_loss": 0.67474991, "learning_rate": 5.002003546634928e-07, "loss": 0.69547337, "num_input_tokens_seen": 139150425, "step": 6460, "time_per_iteration": 3.150148630142212 }, { "auxiliary_loss_clip": 0.0116574, "auxiliary_loss_mlp": 0.01026351, "balance_loss_clip": 0.89865887, "balance_loss_mlp": 1.01910853, "epoch": 0.7768893164191667, "flos": 20886400575360.0, "grad_norm": 1.6759777074055857, "language_loss": 0.75964278, "learning_rate": 4.996851386512331e-07, "loss": 0.78156364, "num_input_tokens_seen": 139169130, "step": 6461, "time_per_iteration": 2.75607967376709 }, { "auxiliary_loss_clip": 0.01166737, "auxiliary_loss_mlp": 0.01027784, "balance_loss_clip": 0.97327387, "balance_loss_mlp": 1.0202651, "epoch": 0.7770095593098058, "flos": 20704584908160.0, "grad_norm": 1.6171870421763277, "language_loss": 0.82982409, "learning_rate": 4.991701502355444e-07, "loss": 0.85176927, "num_input_tokens_seen": 139189595, "step": 6462, "time_per_iteration": 2.659743309020996 }, { "auxiliary_loss_clip": 0.01170489, "auxiliary_loss_mlp": 0.0102466, "balance_loss_clip": 1.00954235, "balance_loss_mlp": 1.01806736, "epoch": 0.7771298022004449, "flos": 24717709877760.0, "grad_norm": 1.52750254133146, "language_loss": 0.75847471, "learning_rate": 4.986553894945518e-07, "loss": 0.78042614, "num_input_tokens_seen": 139210805, "step": 6463, "time_per_iteration": 2.7322983741760254 }, { "auxiliary_loss_clip": 0.01166694, "auxiliary_loss_mlp": 0.01027938, "balance_loss_clip": 0.89366901, "balance_loss_mlp": 1.02104163, "epoch": 0.777250045091084, "flos": 25009232659200.0, "grad_norm": 2.0256366382631685, "language_loss": 0.85755682, "learning_rate": 4.981408565063416e-07, "loss": 0.87950313, "num_input_tokens_seen": 139230750, "step": 6464, "time_per_iteration": 3.6028270721435547 }, { "auxiliary_loss_clip": 0.01168482, "auxiliary_loss_mlp": 0.01025861, "balance_loss_clip": 1.04772377, "balance_loss_mlp": 1.01873231, "epoch": 0.777370287981723, "flos": 20119887319680.0, "grad_norm": 1.8173767371101195, "language_loss": 0.76321316, "learning_rate": 4.976265513489701e-07, "loss": 0.78515661, "num_input_tokens_seen": 139250720, "step": 6465, "time_per_iteration": 2.567706823348999 }, { "auxiliary_loss_clip": 0.01165491, "auxiliary_loss_mlp": 0.01020763, "balance_loss_clip": 1.0080421, "balance_loss_mlp": 1.01322317, "epoch": 0.7774905308723622, "flos": 21718809331200.0, "grad_norm": 1.8897508221722343, "language_loss": 0.80242836, "learning_rate": 4.971124741004562e-07, "loss": 0.82429087, "num_input_tokens_seen": 139269720, "step": 6466, "time_per_iteration": 2.657541036605835 }, { "auxiliary_loss_clip": 0.01166514, "auxiliary_loss_mlp": 0.01026779, "balance_loss_clip": 1.01008058, "balance_loss_mlp": 1.01979887, "epoch": 0.7776107737630013, "flos": 16034115093120.0, "grad_norm": 1.7403856939907842, "language_loss": 0.76450539, "learning_rate": 4.965986248387846e-07, "loss": 0.78643835, "num_input_tokens_seen": 139288035, "step": 6467, "time_per_iteration": 2.6254844665527344 }, { "auxiliary_loss_clip": 0.01168115, "auxiliary_loss_mlp": 0.01024717, "balance_loss_clip": 0.97026783, "balance_loss_mlp": 1.01733184, "epoch": 0.7777310166536403, "flos": 24790895838720.0, "grad_norm": 1.5858929034279727, "language_loss": 0.76955485, "learning_rate": 4.960850036419073e-07, "loss": 0.79148316, "num_input_tokens_seen": 139307135, "step": 6468, "time_per_iteration": 3.712921142578125 }, { "auxiliary_loss_clip": 0.01162198, "auxiliary_loss_mlp": 0.0103154, "balance_loss_clip": 0.97186333, "balance_loss_mlp": 1.02415526, "epoch": 0.7778512595442795, "flos": 17272530253440.0, "grad_norm": 2.5281673903634263, "language_loss": 0.78462124, "learning_rate": 4.955716105877378e-07, "loss": 0.80655861, "num_input_tokens_seen": 139325905, "step": 6469, "time_per_iteration": 2.6823861598968506 }, { "auxiliary_loss_clip": 0.01169399, "auxiliary_loss_mlp": 0.01123037, "balance_loss_clip": 1.0093255, "balance_loss_mlp": 0.0, "epoch": 0.7779715024349185, "flos": 17748418567680.0, "grad_norm": 2.4645841063433087, "language_loss": 0.83214051, "learning_rate": 4.950584457541598e-07, "loss": 0.85506487, "num_input_tokens_seen": 139344370, "step": 6470, "time_per_iteration": 2.6647825241088867 }, { "auxiliary_loss_clip": 0.01167256, "auxiliary_loss_mlp": 0.01027693, "balance_loss_clip": 1.00852084, "balance_loss_mlp": 1.02092457, "epoch": 0.7780917453255576, "flos": 24316875031680.0, "grad_norm": 4.4597298823101275, "language_loss": 0.81876242, "learning_rate": 4.945455092190183e-07, "loss": 0.84071195, "num_input_tokens_seen": 139365625, "step": 6471, "time_per_iteration": 3.6190550327301025 }, { "auxiliary_loss_clip": 0.01063559, "auxiliary_loss_mlp": 0.01000178, "balance_loss_clip": 1.01117706, "balance_loss_mlp": 0.9985804, "epoch": 0.7782119882161967, "flos": 56364601530240.0, "grad_norm": 0.6853268323935653, "language_loss": 0.56026375, "learning_rate": 4.940328010601271e-07, "loss": 0.58090115, "num_input_tokens_seen": 139430540, "step": 6472, "time_per_iteration": 3.195819139480591 }, { "auxiliary_loss_clip": 0.01175001, "auxiliary_loss_mlp": 0.01035506, "balance_loss_clip": 0.97585511, "balance_loss_mlp": 1.02782273, "epoch": 0.7783322311068358, "flos": 46789986994560.0, "grad_norm": 1.7446409211884957, "language_loss": 0.76470298, "learning_rate": 4.935203213552621e-07, "loss": 0.78680801, "num_input_tokens_seen": 139454280, "step": 6473, "time_per_iteration": 2.882431745529175 }, { "auxiliary_loss_clip": 0.01170074, "auxiliary_loss_mlp": 0.01029698, "balance_loss_clip": 0.97336954, "balance_loss_mlp": 1.02226508, "epoch": 0.7784524739974749, "flos": 19057864872960.0, "grad_norm": 1.9164518167824611, "language_loss": 0.66704929, "learning_rate": 4.930080701821662e-07, "loss": 0.68904698, "num_input_tokens_seen": 139471745, "step": 6474, "time_per_iteration": 2.6748838424682617 }, { "auxiliary_loss_clip": 0.01168456, "auxiliary_loss_mlp": 0.01019501, "balance_loss_clip": 0.97287369, "balance_loss_mlp": 1.0122745, "epoch": 0.778572716888114, "flos": 24791111320320.0, "grad_norm": 1.875961745516027, "language_loss": 0.76621413, "learning_rate": 4.92496047618548e-07, "loss": 0.78809369, "num_input_tokens_seen": 139491505, "step": 6475, "time_per_iteration": 2.7093865871429443 }, { "auxiliary_loss_clip": 0.01169748, "auxiliary_loss_mlp": 0.01024784, "balance_loss_clip": 1.01249218, "balance_loss_mlp": 1.01780438, "epoch": 0.7786929597787531, "flos": 20078086867200.0, "grad_norm": 4.736358257934801, "language_loss": 0.77654493, "learning_rate": 4.919842537420811e-07, "loss": 0.79849017, "num_input_tokens_seen": 139508620, "step": 6476, "time_per_iteration": 2.680497884750366 }, { "auxiliary_loss_clip": 0.01166664, "auxiliary_loss_mlp": 0.01027847, "balance_loss_clip": 0.97374129, "balance_loss_mlp": 1.0207808, "epoch": 0.7788132026693921, "flos": 21872220318720.0, "grad_norm": 1.5433337032190435, "language_loss": 0.79472256, "learning_rate": 4.91472688630404e-07, "loss": 0.81666768, "num_input_tokens_seen": 139529360, "step": 6477, "time_per_iteration": 2.631200075149536 }, { "auxiliary_loss_clip": 0.01167883, "auxiliary_loss_mlp": 0.01030044, "balance_loss_clip": 1.05001163, "balance_loss_mlp": 1.02324271, "epoch": 0.7789334455600313, "flos": 11181937351680.0, "grad_norm": 1.6528059325843887, "language_loss": 0.74069858, "learning_rate": 4.909613523611202e-07, "loss": 0.76267785, "num_input_tokens_seen": 139546240, "step": 6478, "time_per_iteration": 2.621912956237793 }, { "auxiliary_loss_clip": 0.01157839, "auxiliary_loss_mlp": 0.01122825, "balance_loss_clip": 0.89284283, "balance_loss_mlp": 0.0, "epoch": 0.7790536884506704, "flos": 28695427015680.0, "grad_norm": 1.7014889047120425, "language_loss": 0.74317509, "learning_rate": 4.904502450117991e-07, "loss": 0.76598167, "num_input_tokens_seen": 139567200, "step": 6479, "time_per_iteration": 2.756920337677002 }, { "auxiliary_loss_clip": 0.01165589, "auxiliary_loss_mlp": 0.01024596, "balance_loss_clip": 0.97464669, "balance_loss_mlp": 1.01703548, "epoch": 0.7791739313413094, "flos": 11072302064640.0, "grad_norm": 2.468713498361715, "language_loss": 0.72102755, "learning_rate": 4.899393666599762e-07, "loss": 0.74292934, "num_input_tokens_seen": 139583775, "step": 6480, "time_per_iteration": 2.6464109420776367 }, { "auxiliary_loss_clip": 0.01166993, "auxiliary_loss_mlp": 0.01027492, "balance_loss_clip": 1.04703617, "balance_loss_mlp": 1.02008295, "epoch": 0.7792941742319486, "flos": 14679276975360.0, "grad_norm": 2.140152260603949, "language_loss": 0.727521, "learning_rate": 4.894287173831506e-07, "loss": 0.74946582, "num_input_tokens_seen": 139599735, "step": 6481, "time_per_iteration": 2.617608070373535 }, { "auxiliary_loss_clip": 0.01164819, "auxiliary_loss_mlp": 0.01021928, "balance_loss_clip": 0.9690817, "balance_loss_mlp": 1.01466179, "epoch": 0.7794144171225876, "flos": 23258874908160.0, "grad_norm": 2.9839987472821488, "language_loss": 0.84888971, "learning_rate": 4.889182972587877e-07, "loss": 0.87075722, "num_input_tokens_seen": 139619030, "step": 6482, "time_per_iteration": 2.63505482673645 }, { "auxiliary_loss_clip": 0.01173034, "auxiliary_loss_mlp": 0.01030027, "balance_loss_clip": 0.93461752, "balance_loss_mlp": 1.02313352, "epoch": 0.7795346600132267, "flos": 21507080613120.0, "grad_norm": 1.8141353388549655, "language_loss": 0.66162854, "learning_rate": 4.884081063643177e-07, "loss": 0.68365914, "num_input_tokens_seen": 139637690, "step": 6483, "time_per_iteration": 2.6926522254943848 }, { "auxiliary_loss_clip": 0.01059739, "auxiliary_loss_mlp": 0.01001219, "balance_loss_clip": 0.93654859, "balance_loss_mlp": 0.99966925, "epoch": 0.7796549029038659, "flos": 70052273694720.0, "grad_norm": 0.8437080040120781, "language_loss": 0.52634478, "learning_rate": 4.878981447771353e-07, "loss": 0.54695433, "num_input_tokens_seen": 139692070, "step": 6484, "time_per_iteration": 3.227926015853882 }, { "auxiliary_loss_clip": 0.01156109, "auxiliary_loss_mlp": 0.01024093, "balance_loss_clip": 0.93138379, "balance_loss_mlp": 1.01661229, "epoch": 0.7797751457945049, "flos": 23989405714560.0, "grad_norm": 1.492684864648409, "language_loss": 0.73418248, "learning_rate": 4.873884125746035e-07, "loss": 0.75598449, "num_input_tokens_seen": 139713745, "step": 6485, "time_per_iteration": 2.7281696796417236 }, { "auxiliary_loss_clip": 0.01160506, "auxiliary_loss_mlp": 0.01025611, "balance_loss_clip": 0.97062075, "balance_loss_mlp": 1.01855421, "epoch": 0.779895388685144, "flos": 22674751937280.0, "grad_norm": 2.837704408860677, "language_loss": 0.72045934, "learning_rate": 4.868789098340456e-07, "loss": 0.74232054, "num_input_tokens_seen": 139731650, "step": 6486, "time_per_iteration": 2.663090467453003 }, { "auxiliary_loss_clip": 0.01163222, "auxiliary_loss_mlp": 0.01025464, "balance_loss_clip": 0.9326368, "balance_loss_mlp": 1.01790595, "epoch": 0.7800156315757831, "flos": 23768698596480.0, "grad_norm": 3.2213214986631065, "language_loss": 0.73141003, "learning_rate": 4.863696366327543e-07, "loss": 0.75329691, "num_input_tokens_seen": 139750820, "step": 6487, "time_per_iteration": 2.7505998611450195 }, { "auxiliary_loss_clip": 0.01165757, "auxiliary_loss_mlp": 0.0102931, "balance_loss_clip": 1.00790191, "balance_loss_mlp": 1.02191305, "epoch": 0.7801358744664222, "flos": 26429714881920.0, "grad_norm": 2.0468413539173618, "language_loss": 0.78028506, "learning_rate": 4.85860593047986e-07, "loss": 0.80223572, "num_input_tokens_seen": 139770885, "step": 6488, "time_per_iteration": 2.6446640491485596 }, { "auxiliary_loss_clip": 0.01153665, "auxiliary_loss_mlp": 0.01023871, "balance_loss_clip": 0.92729783, "balance_loss_mlp": 1.01700807, "epoch": 0.7802561173570612, "flos": 26322162583680.0, "grad_norm": 1.5704600697756521, "language_loss": 0.74561125, "learning_rate": 4.853517791569613e-07, "loss": 0.76738662, "num_input_tokens_seen": 139793065, "step": 6489, "time_per_iteration": 2.7883107662200928 }, { "auxiliary_loss_clip": 0.01170564, "auxiliary_loss_mlp": 0.01122956, "balance_loss_clip": 0.97181875, "balance_loss_mlp": 0.0, "epoch": 0.7803763602477004, "flos": 40333751596800.0, "grad_norm": 2.0019814840909165, "language_loss": 0.65980577, "learning_rate": 4.848431950368684e-07, "loss": 0.68274093, "num_input_tokens_seen": 139815625, "step": 6490, "time_per_iteration": 3.6363368034362793 }, { "auxiliary_loss_clip": 0.01062869, "auxiliary_loss_mlp": 0.01115883, "balance_loss_clip": 1.01068151, "balance_loss_mlp": 0.0, "epoch": 0.7804966031383395, "flos": 67001448038400.0, "grad_norm": 0.7205507838329649, "language_loss": 0.55791086, "learning_rate": 4.843348407648569e-07, "loss": 0.57969838, "num_input_tokens_seen": 139876905, "step": 6491, "time_per_iteration": 3.168738603591919 }, { "auxiliary_loss_clip": 0.01165679, "auxiliary_loss_mlp": 0.01031591, "balance_loss_clip": 1.00551319, "balance_loss_mlp": 1.02433133, "epoch": 0.7806168460289785, "flos": 17740733057280.0, "grad_norm": 2.2457711002339296, "language_loss": 0.83017588, "learning_rate": 4.838267164180457e-07, "loss": 0.85214853, "num_input_tokens_seen": 139892575, "step": 6492, "time_per_iteration": 2.5912086963653564 }, { "auxiliary_loss_clip": 0.0117108, "auxiliary_loss_mlp": 0.01025952, "balance_loss_clip": 1.04842019, "balance_loss_mlp": 1.01825452, "epoch": 0.7807370889196176, "flos": 23946240545280.0, "grad_norm": 3.4201516171631687, "language_loss": 0.83969522, "learning_rate": 4.833188220735156e-07, "loss": 0.86166555, "num_input_tokens_seen": 139912245, "step": 6493, "time_per_iteration": 2.5602259635925293 }, { "auxiliary_loss_clip": 0.01166611, "auxiliary_loss_mlp": 0.01024983, "balance_loss_clip": 1.00926661, "balance_loss_mlp": 1.01796472, "epoch": 0.7808573318102567, "flos": 18989024457600.0, "grad_norm": 4.9894547281845085, "language_loss": 0.74777883, "learning_rate": 4.828111578083152e-07, "loss": 0.76969481, "num_input_tokens_seen": 139929150, "step": 6494, "time_per_iteration": 3.4705746173858643 }, { "auxiliary_loss_clip": 0.01163982, "auxiliary_loss_mlp": 0.01026084, "balance_loss_clip": 0.97289711, "balance_loss_mlp": 1.01823378, "epoch": 0.7809775747008958, "flos": 23980750536960.0, "grad_norm": 2.7225467072069063, "language_loss": 0.81339598, "learning_rate": 4.823037236994556e-07, "loss": 0.83529663, "num_input_tokens_seen": 139947315, "step": 6495, "time_per_iteration": 3.511343240737915 }, { "auxiliary_loss_clip": 0.01067122, "auxiliary_loss_mlp": 0.01002752, "balance_loss_clip": 0.97422671, "balance_loss_mlp": 1.0011189, "epoch": 0.7810978175915348, "flos": 68535875180160.0, "grad_norm": 0.7170297600729217, "language_loss": 0.56323862, "learning_rate": 4.817965198239136e-07, "loss": 0.58393735, "num_input_tokens_seen": 140013775, "step": 6496, "time_per_iteration": 3.209611654281616 }, { "auxiliary_loss_clip": 0.0116079, "auxiliary_loss_mlp": 0.01026708, "balance_loss_clip": 0.9310447, "balance_loss_mlp": 1.01929605, "epoch": 0.781218060482174, "flos": 19642131498240.0, "grad_norm": 1.9351719491497859, "language_loss": 0.74454331, "learning_rate": 4.812895462586331e-07, "loss": 0.76641822, "num_input_tokens_seen": 140031600, "step": 6497, "time_per_iteration": 3.5997562408447266 }, { "auxiliary_loss_clip": 0.01167894, "auxiliary_loss_mlp": 0.01027367, "balance_loss_clip": 0.93379319, "balance_loss_mlp": 1.02049506, "epoch": 0.7813383033728131, "flos": 25627865621760.0, "grad_norm": 1.6630961596565128, "language_loss": 0.81896895, "learning_rate": 4.807828030805207e-07, "loss": 0.84092152, "num_input_tokens_seen": 140050590, "step": 6498, "time_per_iteration": 2.681600332260132 }, { "auxiliary_loss_clip": 0.01166306, "auxiliary_loss_mlp": 0.01029509, "balance_loss_clip": 1.01072955, "balance_loss_mlp": 1.02258873, "epoch": 0.7814585462634521, "flos": 20485924865280.0, "grad_norm": 1.7394988516087966, "language_loss": 0.680089, "learning_rate": 4.802762903664495e-07, "loss": 0.70204711, "num_input_tokens_seen": 140069770, "step": 6499, "time_per_iteration": 2.797266960144043 }, { "auxiliary_loss_clip": 0.01171668, "auxiliary_loss_mlp": 0.01029142, "balance_loss_clip": 0.97383642, "balance_loss_mlp": 1.02161062, "epoch": 0.7815787891540913, "flos": 22304297018880.0, "grad_norm": 3.415807245712423, "language_loss": 0.73650444, "learning_rate": 4.797700081932565e-07, "loss": 0.75851256, "num_input_tokens_seen": 140087635, "step": 6500, "time_per_iteration": 2.683889865875244 }, { "auxiliary_loss_clip": 0.01151927, "auxiliary_loss_mlp": 0.01026989, "balance_loss_clip": 0.85313904, "balance_loss_mlp": 1.01980042, "epoch": 0.7816990320447303, "flos": 22600668136320.0, "grad_norm": 2.1203321391723726, "language_loss": 0.81796646, "learning_rate": 4.792639566377442e-07, "loss": 0.83975565, "num_input_tokens_seen": 140105045, "step": 6501, "time_per_iteration": 2.754768133163452 }, { "auxiliary_loss_clip": 0.0115727, "auxiliary_loss_mlp": 0.01024655, "balance_loss_clip": 1.00625205, "balance_loss_mlp": 1.01752377, "epoch": 0.7818192749353694, "flos": 24935974871040.0, "grad_norm": 3.0060174452759494, "language_loss": 0.77383792, "learning_rate": 4.78758135776681e-07, "loss": 0.79565716, "num_input_tokens_seen": 140124900, "step": 6502, "time_per_iteration": 2.7347018718719482 }, { "auxiliary_loss_clip": 0.01167518, "auxiliary_loss_mlp": 0.01025944, "balance_loss_clip": 0.9725793, "balance_loss_mlp": 1.01902735, "epoch": 0.7819395178260086, "flos": 23733039369600.0, "grad_norm": 1.932354136533387, "language_loss": 0.78932297, "learning_rate": 4.782525456867989e-07, "loss": 0.8112576, "num_input_tokens_seen": 140143755, "step": 6503, "time_per_iteration": 2.6270720958709717 }, { "auxiliary_loss_clip": 0.01169298, "auxiliary_loss_mlp": 0.01028082, "balance_loss_clip": 0.93479812, "balance_loss_mlp": 1.02016091, "epoch": 0.7820597607166476, "flos": 23221671396480.0, "grad_norm": 1.5344604406080777, "language_loss": 0.8322978, "learning_rate": 4.777471864447959e-07, "loss": 0.85427165, "num_input_tokens_seen": 140164495, "step": 6504, "time_per_iteration": 2.669914484024048 }, { "auxiliary_loss_clip": 0.01165763, "auxiliary_loss_mlp": 0.01031002, "balance_loss_clip": 0.97042567, "balance_loss_mlp": 1.0237509, "epoch": 0.7821800036072867, "flos": 22309540404480.0, "grad_norm": 1.8848464007767622, "language_loss": 0.80895531, "learning_rate": 4.772420581273344e-07, "loss": 0.83092296, "num_input_tokens_seen": 140181980, "step": 6505, "time_per_iteration": 2.6672046184539795 }, { "auxiliary_loss_clip": 0.01164096, "auxiliary_loss_mlp": 0.01022178, "balance_loss_clip": 1.00974154, "balance_loss_mlp": 1.01523387, "epoch": 0.7823002464979258, "flos": 21544176384000.0, "grad_norm": 2.0595409564116314, "language_loss": 0.76296794, "learning_rate": 4.7673716081104134e-07, "loss": 0.78483069, "num_input_tokens_seen": 140202155, "step": 6506, "time_per_iteration": 2.614588499069214 }, { "auxiliary_loss_clip": 0.01166711, "auxiliary_loss_mlp": 0.01023218, "balance_loss_clip": 1.01171768, "balance_loss_mlp": 1.01593447, "epoch": 0.7824204893885649, "flos": 24535642815360.0, "grad_norm": 1.7846939786879765, "language_loss": 0.84549814, "learning_rate": 4.762324945725109e-07, "loss": 0.86739743, "num_input_tokens_seen": 140221600, "step": 6507, "time_per_iteration": 2.6299800872802734 }, { "auxiliary_loss_clip": 0.01165373, "auxiliary_loss_mlp": 0.01028636, "balance_loss_clip": 0.97492558, "balance_loss_mlp": 1.02188921, "epoch": 0.782540732279204, "flos": 27415211402880.0, "grad_norm": 1.8763596374425757, "language_loss": 0.76023293, "learning_rate": 4.7572805948829844e-07, "loss": 0.78217304, "num_input_tokens_seen": 140241860, "step": 6508, "time_per_iteration": 2.700995922088623 }, { "auxiliary_loss_clip": 0.01167413, "auxiliary_loss_mlp": 0.01021736, "balance_loss_clip": 0.89411378, "balance_loss_mlp": 1.01475644, "epoch": 0.7826609751698431, "flos": 24353216616960.0, "grad_norm": 1.8696132135115888, "language_loss": 0.71092904, "learning_rate": 4.7522385563492795e-07, "loss": 0.73282051, "num_input_tokens_seen": 140262160, "step": 6509, "time_per_iteration": 2.694211959838867 }, { "auxiliary_loss_clip": 0.01168176, "auxiliary_loss_mlp": 0.01027501, "balance_loss_clip": 0.93438351, "balance_loss_mlp": 1.02044344, "epoch": 0.7827812180604822, "flos": 23988543788160.0, "grad_norm": 1.8839008243692594, "language_loss": 0.70191216, "learning_rate": 4.747198830888863e-07, "loss": 0.72386891, "num_input_tokens_seen": 140282030, "step": 6510, "time_per_iteration": 2.7111666202545166 }, { "auxiliary_loss_clip": 0.01159861, "auxiliary_loss_mlp": 0.01026362, "balance_loss_clip": 0.97012967, "balance_loss_mlp": 1.01889384, "epoch": 0.7829014609511212, "flos": 27454318335360.0, "grad_norm": 1.980545554206769, "language_loss": 0.68461263, "learning_rate": 4.742161419266251e-07, "loss": 0.7064749, "num_input_tokens_seen": 140301190, "step": 6511, "time_per_iteration": 2.647355318069458 }, { "auxiliary_loss_clip": 0.011707, "auxiliary_loss_mlp": 0.0102857, "balance_loss_clip": 1.01065981, "balance_loss_mlp": 1.02088737, "epoch": 0.7830217038417604, "flos": 29204532432000.0, "grad_norm": 2.654847172004722, "language_loss": 0.64908683, "learning_rate": 4.7371263222456304e-07, "loss": 0.67107952, "num_input_tokens_seen": 140318510, "step": 6512, "time_per_iteration": 2.6315083503723145 }, { "auxiliary_loss_clip": 0.01063668, "auxiliary_loss_mlp": 0.01001415, "balance_loss_clip": 0.97446978, "balance_loss_mlp": 0.99981767, "epoch": 0.7831419467323995, "flos": 60950895822720.0, "grad_norm": 0.7958560994952903, "language_loss": 0.61450505, "learning_rate": 4.7320935405908004e-07, "loss": 0.63515592, "num_input_tokens_seen": 140379380, "step": 6513, "time_per_iteration": 3.20462965965271 }, { "auxiliary_loss_clip": 0.01172317, "auxiliary_loss_mlp": 0.01029431, "balance_loss_clip": 1.04889596, "balance_loss_mlp": 1.02165246, "epoch": 0.7832621896230385, "flos": 19682531320320.0, "grad_norm": 2.0367489277274538, "language_loss": 0.84144241, "learning_rate": 4.7270630750652475e-07, "loss": 0.86345989, "num_input_tokens_seen": 140395335, "step": 6514, "time_per_iteration": 2.628105640411377 }, { "auxiliary_loss_clip": 0.01166515, "auxiliary_loss_mlp": 0.01025795, "balance_loss_clip": 1.00929773, "balance_loss_mlp": 1.0187676, "epoch": 0.7833824325136777, "flos": 25009232659200.0, "grad_norm": 2.177292129338143, "language_loss": 0.80490577, "learning_rate": 4.7220349264320746e-07, "loss": 0.82682884, "num_input_tokens_seen": 140414420, "step": 6515, "time_per_iteration": 2.673638343811035 }, { "auxiliary_loss_clip": 0.01064865, "auxiliary_loss_mlp": 0.01001332, "balance_loss_clip": 0.97357696, "balance_loss_mlp": 0.99976999, "epoch": 0.7835026754043167, "flos": 68800142517120.0, "grad_norm": 0.7377237117200126, "language_loss": 0.54994142, "learning_rate": 4.71700909545407e-07, "loss": 0.57060343, "num_input_tokens_seen": 140477365, "step": 6516, "time_per_iteration": 4.030500650405884 }, { "auxiliary_loss_clip": 0.01168947, "auxiliary_loss_mlp": 0.01020667, "balance_loss_clip": 1.01008821, "balance_loss_mlp": 1.01375318, "epoch": 0.7836229182949558, "flos": 19864598382720.0, "grad_norm": 1.8521307260486977, "language_loss": 0.76935965, "learning_rate": 4.711985582893627e-07, "loss": 0.79125583, "num_input_tokens_seen": 140495885, "step": 6517, "time_per_iteration": 2.604717493057251 }, { "auxiliary_loss_clip": 0.0116457, "auxiliary_loss_mlp": 0.01026127, "balance_loss_clip": 0.89351547, "balance_loss_mlp": 1.01906085, "epoch": 0.783743161185595, "flos": 22965843755520.0, "grad_norm": 1.6312257295202097, "language_loss": 0.71756423, "learning_rate": 4.706964389512811e-07, "loss": 0.7394712, "num_input_tokens_seen": 140515920, "step": 6518, "time_per_iteration": 2.762772560119629 }, { "auxiliary_loss_clip": 0.01167537, "auxiliary_loss_mlp": 0.01028524, "balance_loss_clip": 1.0491581, "balance_loss_mlp": 1.0213412, "epoch": 0.783863404076234, "flos": 12458489777280.0, "grad_norm": 1.7729744010009862, "language_loss": 0.87258315, "learning_rate": 4.701945516073345e-07, "loss": 0.89454377, "num_input_tokens_seen": 140533395, "step": 6519, "time_per_iteration": 2.5636658668518066 }, { "auxiliary_loss_clip": 0.01161596, "auxiliary_loss_mlp": 0.01027081, "balance_loss_clip": 0.93365884, "balance_loss_mlp": 1.0202446, "epoch": 0.7839836469668731, "flos": 24243940465920.0, "grad_norm": 1.8455507685401986, "language_loss": 0.75082731, "learning_rate": 4.696928963336577e-07, "loss": 0.77271414, "num_input_tokens_seen": 140552825, "step": 6520, "time_per_iteration": 2.7096426486968994 }, { "auxiliary_loss_clip": 0.01063432, "auxiliary_loss_mlp": 0.01001881, "balance_loss_clip": 0.97411048, "balance_loss_mlp": 1.00034285, "epoch": 0.7841038898575122, "flos": 62121978938880.0, "grad_norm": 0.8580538256086414, "language_loss": 0.61049145, "learning_rate": 4.6919147320635224e-07, "loss": 0.63114458, "num_input_tokens_seen": 140615535, "step": 6521, "time_per_iteration": 5.053264379501343 }, { "auxiliary_loss_clip": 0.01167573, "auxiliary_loss_mlp": 0.0102897, "balance_loss_clip": 1.00946903, "balance_loss_mlp": 1.02165079, "epoch": 0.7842241327481513, "flos": 20193899293440.0, "grad_norm": 2.4864392162336477, "language_loss": 0.73514283, "learning_rate": 4.6869028230148286e-07, "loss": 0.75710827, "num_input_tokens_seen": 140633330, "step": 6522, "time_per_iteration": 2.6090571880340576 }, { "auxiliary_loss_clip": 0.01154968, "auxiliary_loss_mlp": 0.01024979, "balance_loss_clip": 0.9284839, "balance_loss_mlp": 1.01761222, "epoch": 0.7843443756387903, "flos": 28074531496320.0, "grad_norm": 2.5884377983454527, "language_loss": 0.59663969, "learning_rate": 4.6818932369507957e-07, "loss": 0.6184392, "num_input_tokens_seen": 140652830, "step": 6523, "time_per_iteration": 3.6616551876068115 }, { "auxiliary_loss_clip": 0.01166543, "auxiliary_loss_mlp": 0.01028326, "balance_loss_clip": 1.01132095, "balance_loss_mlp": 1.02079773, "epoch": 0.7844646185294295, "flos": 21323397438720.0, "grad_norm": 2.4206864085310085, "language_loss": 0.89291912, "learning_rate": 4.676885974631386e-07, "loss": 0.91486788, "num_input_tokens_seen": 140671190, "step": 6524, "time_per_iteration": 2.6465742588043213 }, { "auxiliary_loss_clip": 0.011694, "auxiliary_loss_mlp": 0.01030237, "balance_loss_clip": 1.01121545, "balance_loss_mlp": 1.02297688, "epoch": 0.7845848614200686, "flos": 23656585271040.0, "grad_norm": 1.7825656392486395, "language_loss": 0.81204081, "learning_rate": 4.67188103681619e-07, "loss": 0.83403713, "num_input_tokens_seen": 140690975, "step": 6525, "time_per_iteration": 2.6330630779266357 }, { "auxiliary_loss_clip": 0.01168527, "auxiliary_loss_mlp": 0.0112285, "balance_loss_clip": 1.01277065, "balance_loss_mlp": 0.0, "epoch": 0.7847051043107076, "flos": 23402194174080.0, "grad_norm": 2.1054141436386518, "language_loss": 0.6910423, "learning_rate": 4.666878424264453e-07, "loss": 0.71395612, "num_input_tokens_seen": 140710930, "step": 6526, "time_per_iteration": 2.5620663166046143 }, { "auxiliary_loss_clip": 0.01154546, "auxiliary_loss_mlp": 0.01029597, "balance_loss_clip": 0.97031862, "balance_loss_mlp": 1.02314758, "epoch": 0.7848253472013467, "flos": 19022277473280.0, "grad_norm": 1.6649523598318046, "language_loss": 0.73768955, "learning_rate": 4.661878137735069e-07, "loss": 0.75953102, "num_input_tokens_seen": 140729120, "step": 6527, "time_per_iteration": 2.6027300357818604 }, { "auxiliary_loss_clip": 0.01166198, "auxiliary_loss_mlp": 0.01025674, "balance_loss_clip": 0.97131836, "balance_loss_mlp": 1.01864386, "epoch": 0.7849455900919858, "flos": 21179180332800.0, "grad_norm": 1.8109207755405394, "language_loss": 0.74947166, "learning_rate": 4.656880177986571e-07, "loss": 0.77139032, "num_input_tokens_seen": 140747665, "step": 6528, "time_per_iteration": 2.655012369155884 }, { "auxiliary_loss_clip": 0.01168511, "auxiliary_loss_mlp": 0.01027177, "balance_loss_clip": 0.97088742, "balance_loss_mlp": 1.01987815, "epoch": 0.7850658329826249, "flos": 19536482620800.0, "grad_norm": 1.9625963465251397, "language_loss": 0.81825292, "learning_rate": 4.6518845457771607e-07, "loss": 0.84020984, "num_input_tokens_seen": 140766525, "step": 6529, "time_per_iteration": 2.708463668823242 }, { "auxiliary_loss_clip": 0.0115903, "auxiliary_loss_mlp": 0.01122254, "balance_loss_clip": 1.0081718, "balance_loss_mlp": 0.0, "epoch": 0.7851860758732639, "flos": 12495334152960.0, "grad_norm": 1.8440640681330853, "language_loss": 0.79153305, "learning_rate": 4.646891241864652e-07, "loss": 0.8143459, "num_input_tokens_seen": 140785090, "step": 6530, "time_per_iteration": 2.657048463821411 }, { "auxiliary_loss_clip": 0.01163903, "auxiliary_loss_mlp": 0.01028564, "balance_loss_clip": 1.00855398, "balance_loss_mlp": 1.02114964, "epoch": 0.7853063187639031, "flos": 22960959505920.0, "grad_norm": 2.3184072280818304, "language_loss": 0.73527873, "learning_rate": 4.6419002670065397e-07, "loss": 0.75720346, "num_input_tokens_seen": 140804670, "step": 6531, "time_per_iteration": 2.6516385078430176 }, { "auxiliary_loss_clip": 0.01170273, "auxiliary_loss_mlp": 0.01029366, "balance_loss_clip": 0.93638623, "balance_loss_mlp": 1.02195072, "epoch": 0.7854265616545422, "flos": 17347260499200.0, "grad_norm": 2.0112925062551392, "language_loss": 0.86789024, "learning_rate": 4.6369116219599445e-07, "loss": 0.88988662, "num_input_tokens_seen": 140820655, "step": 6532, "time_per_iteration": 2.684767723083496 }, { "auxiliary_loss_clip": 0.01163482, "auxiliary_loss_mlp": 0.01026935, "balance_loss_clip": 0.93323761, "balance_loss_mlp": 1.02013707, "epoch": 0.7855468045451812, "flos": 23838293197440.0, "grad_norm": 3.8664016550164604, "language_loss": 0.79385853, "learning_rate": 4.631925307481637e-07, "loss": 0.81576276, "num_input_tokens_seen": 140840470, "step": 6533, "time_per_iteration": 2.6788833141326904 }, { "auxiliary_loss_clip": 0.01166192, "auxiliary_loss_mlp": 0.01022642, "balance_loss_clip": 0.9737345, "balance_loss_mlp": 1.01599956, "epoch": 0.7856670474358204, "flos": 25666792986240.0, "grad_norm": 2.0098980384913276, "language_loss": 0.76142603, "learning_rate": 4.6269413243280533e-07, "loss": 0.78331441, "num_input_tokens_seen": 140859890, "step": 6534, "time_per_iteration": 2.719014883041382 }, { "auxiliary_loss_clip": 0.01172778, "auxiliary_loss_mlp": 0.01025693, "balance_loss_clip": 0.97381032, "balance_loss_mlp": 1.01806986, "epoch": 0.7857872903264594, "flos": 18144656472960.0, "grad_norm": 2.25377912667196, "language_loss": 0.73753959, "learning_rate": 4.621959673255236e-07, "loss": 0.75952435, "num_input_tokens_seen": 140876190, "step": 6535, "time_per_iteration": 2.666167736053467 }, { "auxiliary_loss_clip": 0.01163239, "auxiliary_loss_mlp": 0.01024705, "balance_loss_clip": 0.89471811, "balance_loss_mlp": 1.01809514, "epoch": 0.7859075332170985, "flos": 14386138081920.0, "grad_norm": 2.2555624975429653, "language_loss": 0.90401566, "learning_rate": 4.6169803550189135e-07, "loss": 0.92589515, "num_input_tokens_seen": 140891885, "step": 6536, "time_per_iteration": 2.7639307975769043 }, { "auxiliary_loss_clip": 0.01158239, "auxiliary_loss_mlp": 0.01030286, "balance_loss_clip": 0.89567554, "balance_loss_mlp": 1.0223949, "epoch": 0.7860277761077377, "flos": 19864059678720.0, "grad_norm": 2.105992917466931, "language_loss": 0.77319098, "learning_rate": 4.6120033703744355e-07, "loss": 0.79507625, "num_input_tokens_seen": 140910780, "step": 6537, "time_per_iteration": 2.677001953125 }, { "auxiliary_loss_clip": 0.01154704, "auxiliary_loss_mlp": 0.01025336, "balance_loss_clip": 0.97020304, "balance_loss_mlp": 1.01847851, "epoch": 0.7861480189983767, "flos": 26396174557440.0, "grad_norm": 1.8146602846041604, "language_loss": 0.78242254, "learning_rate": 4.607028720076822e-07, "loss": 0.80422294, "num_input_tokens_seen": 140927460, "step": 6538, "time_per_iteration": 2.6624624729156494 }, { "auxiliary_loss_clip": 0.01168601, "auxiliary_loss_mlp": 0.0102656, "balance_loss_clip": 1.01048195, "balance_loss_mlp": 1.01924682, "epoch": 0.7862682618890158, "flos": 24236578177920.0, "grad_norm": 1.7799998250650668, "language_loss": 0.73160994, "learning_rate": 4.6020564048807074e-07, "loss": 0.75356162, "num_input_tokens_seen": 140945135, "step": 6539, "time_per_iteration": 2.6181328296661377 }, { "auxiliary_loss_clip": 0.01168527, "auxiliary_loss_mlp": 0.01027017, "balance_loss_clip": 1.00944018, "balance_loss_mlp": 1.02057672, "epoch": 0.7863885047796549, "flos": 47551508259840.0, "grad_norm": 1.953716793149751, "language_loss": 0.71995783, "learning_rate": 4.5970864255403883e-07, "loss": 0.7419132, "num_input_tokens_seen": 140966660, "step": 6540, "time_per_iteration": 2.8283326625823975 }, { "auxiliary_loss_clip": 0.01153684, "auxiliary_loss_mlp": 0.01027105, "balance_loss_clip": 1.00714111, "balance_loss_mlp": 1.02029169, "epoch": 0.786508747670294, "flos": 24389234979840.0, "grad_norm": 2.1033111575265346, "language_loss": 0.8218888, "learning_rate": 4.59211878280982e-07, "loss": 0.84369665, "num_input_tokens_seen": 140986175, "step": 6541, "time_per_iteration": 2.5936849117279053 }, { "auxiliary_loss_clip": 0.01167184, "auxiliary_loss_mlp": 0.0102863, "balance_loss_clip": 0.97166729, "balance_loss_mlp": 1.02146888, "epoch": 0.786628990560933, "flos": 18041234238720.0, "grad_norm": 2.2011970021965928, "language_loss": 0.69840956, "learning_rate": 4.587153477442578e-07, "loss": 0.72036773, "num_input_tokens_seen": 141002490, "step": 6542, "time_per_iteration": 2.660841703414917 }, { "auxiliary_loss_clip": 0.01171813, "auxiliary_loss_mlp": 0.01023634, "balance_loss_clip": 1.04927504, "balance_loss_mlp": 1.01571274, "epoch": 0.7867492334515722, "flos": 25848860048640.0, "grad_norm": 3.5390868550958667, "language_loss": 0.81231254, "learning_rate": 4.582190510191899e-07, "loss": 0.83426702, "num_input_tokens_seen": 141021150, "step": 6543, "time_per_iteration": 3.4251599311828613 }, { "auxiliary_loss_clip": 0.01162543, "auxiliary_loss_mlp": 0.01026798, "balance_loss_clip": 0.93438357, "balance_loss_mlp": 1.01961839, "epoch": 0.7868694763422113, "flos": 16580819070720.0, "grad_norm": 2.031678763225563, "language_loss": 0.87346172, "learning_rate": 4.5772298818106625e-07, "loss": 0.89535517, "num_input_tokens_seen": 141036940, "step": 6544, "time_per_iteration": 2.6526763439178467 }, { "auxiliary_loss_clip": 0.01176174, "auxiliary_loss_mlp": 0.01028879, "balance_loss_clip": 0.93646115, "balance_loss_mlp": 1.02134442, "epoch": 0.7869897192328503, "flos": 29386276272000.0, "grad_norm": 2.13374999493199, "language_loss": 0.71825147, "learning_rate": 4.572271593051384e-07, "loss": 0.74030203, "num_input_tokens_seen": 141054295, "step": 6545, "time_per_iteration": 2.7288169860839844 }, { "auxiliary_loss_clip": 0.01154074, "auxiliary_loss_mlp": 0.01023978, "balance_loss_clip": 0.89476573, "balance_loss_mlp": 1.01700974, "epoch": 0.7871099621234895, "flos": 17128923678720.0, "grad_norm": 1.5700970114577464, "language_loss": 0.77726394, "learning_rate": 4.567315644666245e-07, "loss": 0.79904449, "num_input_tokens_seen": 141073090, "step": 6546, "time_per_iteration": 4.533766269683838 }, { "auxiliary_loss_clip": 0.01159076, "auxiliary_loss_mlp": 0.01028807, "balance_loss_clip": 0.9346531, "balance_loss_mlp": 1.02185392, "epoch": 0.7872302050141285, "flos": 23440187784960.0, "grad_norm": 2.1346410553157935, "language_loss": 0.84722656, "learning_rate": 4.5623620374070507e-07, "loss": 0.8691054, "num_input_tokens_seen": 141092405, "step": 6547, "time_per_iteration": 2.655207872390747 }, { "auxiliary_loss_clip": 0.01066102, "auxiliary_loss_mlp": 0.00999914, "balance_loss_clip": 0.89729548, "balance_loss_mlp": 0.99830478, "epoch": 0.7873504479047676, "flos": 65959752689280.0, "grad_norm": 0.7620758724970804, "language_loss": 0.58405292, "learning_rate": 4.557410772025263e-07, "loss": 0.60471308, "num_input_tokens_seen": 141154355, "step": 6548, "time_per_iteration": 3.3585045337677 }, { "auxiliary_loss_clip": 0.01162201, "auxiliary_loss_mlp": 0.01029743, "balance_loss_clip": 0.97049499, "balance_loss_mlp": 1.02269769, "epoch": 0.7874706907954068, "flos": 23258336204160.0, "grad_norm": 1.903867621075791, "language_loss": 0.66050541, "learning_rate": 4.5524618492719803e-07, "loss": 0.68242478, "num_input_tokens_seen": 141173575, "step": 6549, "time_per_iteration": 3.7023766040802 }, { "auxiliary_loss_clip": 0.01167011, "auxiliary_loss_mlp": 0.01024329, "balance_loss_clip": 1.00881195, "balance_loss_mlp": 1.01787066, "epoch": 0.7875909336860458, "flos": 28767786963840.0, "grad_norm": 1.498659976872526, "language_loss": 0.78976071, "learning_rate": 4.54751526989795e-07, "loss": 0.81167406, "num_input_tokens_seen": 141195415, "step": 6550, "time_per_iteration": 2.7266979217529297 }, { "auxiliary_loss_clip": 0.01167379, "auxiliary_loss_mlp": 0.01027683, "balance_loss_clip": 1.00863683, "balance_loss_mlp": 1.02067065, "epoch": 0.7877111765766849, "flos": 18697286194560.0, "grad_norm": 1.9233049276859338, "language_loss": 0.79234463, "learning_rate": 4.5425710346535775e-07, "loss": 0.81429523, "num_input_tokens_seen": 141213360, "step": 6551, "time_per_iteration": 2.592759132385254 }, { "auxiliary_loss_clip": 0.01168979, "auxiliary_loss_mlp": 0.01029975, "balance_loss_clip": 1.00907636, "balance_loss_mlp": 1.02268577, "epoch": 0.787831419467324, "flos": 27592968833280.0, "grad_norm": 2.144721732998236, "language_loss": 0.81613505, "learning_rate": 4.537629144288877e-07, "loss": 0.83812451, "num_input_tokens_seen": 141230815, "step": 6552, "time_per_iteration": 2.591217041015625 }, { "auxiliary_loss_clip": 0.0117003, "auxiliary_loss_mlp": 0.01023566, "balance_loss_clip": 0.89244771, "balance_loss_mlp": 1.01660442, "epoch": 0.7879516623579631, "flos": 18150187167360.0, "grad_norm": 1.864222995641037, "language_loss": 0.74733305, "learning_rate": 4.5326895995535477e-07, "loss": 0.76926899, "num_input_tokens_seen": 141249715, "step": 6553, "time_per_iteration": 2.733327865600586 }, { "auxiliary_loss_clip": 0.011634, "auxiliary_loss_mlp": 0.01024738, "balance_loss_clip": 1.009022, "balance_loss_mlp": 1.01781166, "epoch": 0.7880719052486022, "flos": 20339193807360.0, "grad_norm": 3.0606260342430875, "language_loss": 0.84539866, "learning_rate": 4.527752401196907e-07, "loss": 0.86728001, "num_input_tokens_seen": 141267730, "step": 6554, "time_per_iteration": 2.6106808185577393 }, { "auxiliary_loss_clip": 0.01159042, "auxiliary_loss_mlp": 0.01027579, "balance_loss_clip": 0.97050852, "balance_loss_mlp": 1.02064085, "epoch": 0.7881921481392413, "flos": 21653237053440.0, "grad_norm": 1.6274342547247693, "language_loss": 0.66782463, "learning_rate": 4.5228175499679254e-07, "loss": 0.68969083, "num_input_tokens_seen": 141287315, "step": 6555, "time_per_iteration": 2.6975746154785156 }, { "auxiliary_loss_clip": 0.01065147, "auxiliary_loss_mlp": 0.01001033, "balance_loss_clip": 0.97315693, "balance_loss_mlp": 0.99944729, "epoch": 0.7883123910298804, "flos": 68565860058240.0, "grad_norm": 0.8166828564400855, "language_loss": 0.54527748, "learning_rate": 4.5178850466152174e-07, "loss": 0.56593931, "num_input_tokens_seen": 141346145, "step": 6556, "time_per_iteration": 3.254084348678589 }, { "auxiliary_loss_clip": 0.01155591, "auxiliary_loss_mlp": 0.01027315, "balance_loss_clip": 0.96721709, "balance_loss_mlp": 1.02053154, "epoch": 0.7884326339205194, "flos": 19318217627520.0, "grad_norm": 2.0961403756669954, "language_loss": 0.82126492, "learning_rate": 4.512954891887031e-07, "loss": 0.84309399, "num_input_tokens_seen": 141364445, "step": 6557, "time_per_iteration": 2.7418880462646484 }, { "auxiliary_loss_clip": 0.01161613, "auxiliary_loss_mlp": 0.01030592, "balance_loss_clip": 0.97218108, "balance_loss_mlp": 1.02312016, "epoch": 0.7885528768111585, "flos": 17784903807360.0, "grad_norm": 2.3728144722178928, "language_loss": 0.82940447, "learning_rate": 4.5080270865312806e-07, "loss": 0.85132647, "num_input_tokens_seen": 141381640, "step": 6558, "time_per_iteration": 2.5985403060913086 }, { "auxiliary_loss_clip": 0.01166928, "auxiliary_loss_mlp": 0.01028453, "balance_loss_clip": 1.01048684, "balance_loss_mlp": 1.02149713, "epoch": 0.7886731197017977, "flos": 18807639753600.0, "grad_norm": 2.0654940656966128, "language_loss": 0.7102496, "learning_rate": 4.5031016312954985e-07, "loss": 0.73220342, "num_input_tokens_seen": 141399955, "step": 6559, "time_per_iteration": 2.6435747146606445 }, { "auxiliary_loss_clip": 0.01177275, "auxiliary_loss_mlp": 0.01029136, "balance_loss_clip": 1.01333034, "balance_loss_mlp": 1.0217452, "epoch": 0.7887933625924367, "flos": 33365358126720.0, "grad_norm": 3.8204537790508803, "language_loss": 0.7438519, "learning_rate": 4.498178526926886e-07, "loss": 0.76591599, "num_input_tokens_seen": 141420820, "step": 6560, "time_per_iteration": 2.752187728881836 }, { "auxiliary_loss_clip": 0.01168676, "auxiliary_loss_mlp": 0.01024237, "balance_loss_clip": 1.05015683, "balance_loss_mlp": 1.01775241, "epoch": 0.7889136054830758, "flos": 17019360218880.0, "grad_norm": 2.049210002577828, "language_loss": 0.72437167, "learning_rate": 4.4932577741722635e-07, "loss": 0.74630082, "num_input_tokens_seen": 141439350, "step": 6561, "time_per_iteration": 2.540818214416504 }, { "auxiliary_loss_clip": 0.01161726, "auxiliary_loss_mlp": 0.01028471, "balance_loss_clip": 0.97034001, "balance_loss_mlp": 1.02122951, "epoch": 0.7890338483737149, "flos": 29424629018880.0, "grad_norm": 1.6313894554231891, "language_loss": 0.74159473, "learning_rate": 4.4883393737780985e-07, "loss": 0.76349664, "num_input_tokens_seen": 141460300, "step": 6562, "time_per_iteration": 2.7743563652038574 }, { "auxiliary_loss_clip": 0.01156839, "auxiliary_loss_mlp": 0.01025506, "balance_loss_clip": 1.0062021, "balance_loss_mlp": 1.01895893, "epoch": 0.789154091264354, "flos": 19971576063360.0, "grad_norm": 1.9639560141782182, "language_loss": 0.78717232, "learning_rate": 4.4834233264905254e-07, "loss": 0.80899572, "num_input_tokens_seen": 141477315, "step": 6563, "time_per_iteration": 2.64365816116333 }, { "auxiliary_loss_clip": 0.01155244, "auxiliary_loss_mlp": 0.01030373, "balance_loss_clip": 0.93095601, "balance_loss_mlp": 1.02353036, "epoch": 0.789274334154993, "flos": 14537825216640.0, "grad_norm": 2.152646381497799, "language_loss": 0.71263933, "learning_rate": 4.478509633055294e-07, "loss": 0.73449546, "num_input_tokens_seen": 141495025, "step": 6564, "time_per_iteration": 2.6539855003356934 }, { "auxiliary_loss_clip": 0.01167995, "auxiliary_loss_mlp": 0.01029002, "balance_loss_clip": 0.97046298, "balance_loss_mlp": 1.02131295, "epoch": 0.7893945770456322, "flos": 21827403123840.0, "grad_norm": 2.215951883636341, "language_loss": 0.80172092, "learning_rate": 4.473598294217813e-07, "loss": 0.82369089, "num_input_tokens_seen": 141510450, "step": 6565, "time_per_iteration": 2.6708598136901855 }, { "auxiliary_loss_clip": 0.01164546, "auxiliary_loss_mlp": 0.01022905, "balance_loss_clip": 1.01162732, "balance_loss_mlp": 1.01631284, "epoch": 0.7895148199362713, "flos": 20740639184640.0, "grad_norm": 2.103132046080869, "language_loss": 0.71706784, "learning_rate": 4.468689310723124e-07, "loss": 0.73894238, "num_input_tokens_seen": 141528265, "step": 6566, "time_per_iteration": 2.6311442852020264 }, { "auxiliary_loss_clip": 0.01166876, "auxiliary_loss_mlp": 0.01025197, "balance_loss_clip": 0.93190926, "balance_loss_mlp": 1.01802683, "epoch": 0.7896350628269103, "flos": 16690669839360.0, "grad_norm": 1.8397041431899528, "language_loss": 0.78526366, "learning_rate": 4.463782683315913e-07, "loss": 0.80718446, "num_input_tokens_seen": 141547270, "step": 6567, "time_per_iteration": 2.6357717514038086 }, { "auxiliary_loss_clip": 0.01164802, "auxiliary_loss_mlp": 0.01022164, "balance_loss_clip": 1.04809129, "balance_loss_mlp": 1.015697, "epoch": 0.7897553057175495, "flos": 22638374438400.0, "grad_norm": 1.6039651231033567, "language_loss": 0.73290682, "learning_rate": 4.458878412740523e-07, "loss": 0.75477648, "num_input_tokens_seen": 141566050, "step": 6568, "time_per_iteration": 2.628063201904297 }, { "auxiliary_loss_clip": 0.01165412, "auxiliary_loss_mlp": 0.0102437, "balance_loss_clip": 1.01077211, "balance_loss_mlp": 1.01733649, "epoch": 0.7898755486081885, "flos": 14537573821440.0, "grad_norm": 4.365343003926625, "language_loss": 0.77406073, "learning_rate": 4.453976499740919e-07, "loss": 0.79595852, "num_input_tokens_seen": 141583695, "step": 6569, "time_per_iteration": 3.544257640838623 }, { "auxiliary_loss_clip": 0.01165766, "auxiliary_loss_mlp": 0.01022731, "balance_loss_clip": 1.01194453, "balance_loss_mlp": 1.01534641, "epoch": 0.7899957914988276, "flos": 17238487138560.0, "grad_norm": 2.058441206214522, "language_loss": 0.77750182, "learning_rate": 4.4490769450607215e-07, "loss": 0.79938686, "num_input_tokens_seen": 141601320, "step": 6570, "time_per_iteration": 2.596633195877075 }, { "auxiliary_loss_clip": 0.01155833, "auxiliary_loss_mlp": 0.01027012, "balance_loss_clip": 0.92751449, "balance_loss_mlp": 1.0201931, "epoch": 0.7901160343894668, "flos": 41279351086080.0, "grad_norm": 2.012274596488211, "language_loss": 0.72912169, "learning_rate": 4.4441797494431845e-07, "loss": 0.75095022, "num_input_tokens_seen": 141623125, "step": 6571, "time_per_iteration": 2.903616428375244 }, { "auxiliary_loss_clip": 0.01165282, "auxiliary_loss_mlp": 0.01025713, "balance_loss_clip": 1.01066923, "balance_loss_mlp": 1.01885831, "epoch": 0.7902362772801058, "flos": 16837005847680.0, "grad_norm": 2.037990107263529, "language_loss": 0.77430558, "learning_rate": 4.439284913631207e-07, "loss": 0.79621553, "num_input_tokens_seen": 141640335, "step": 6572, "time_per_iteration": 3.637277126312256 }, { "auxiliary_loss_clip": 0.0116981, "auxiliary_loss_mlp": 0.01026007, "balance_loss_clip": 0.93639916, "balance_loss_mlp": 1.01869392, "epoch": 0.7903565201707449, "flos": 27125987091840.0, "grad_norm": 1.7812641453555786, "language_loss": 0.83646786, "learning_rate": 4.434392438367347e-07, "loss": 0.85842597, "num_input_tokens_seen": 141659760, "step": 6573, "time_per_iteration": 3.569746971130371 }, { "auxiliary_loss_clip": 0.01171847, "auxiliary_loss_mlp": 0.01022719, "balance_loss_clip": 1.00991511, "balance_loss_mlp": 1.0157243, "epoch": 0.790476763061384, "flos": 31025167142400.0, "grad_norm": 1.8016036358056842, "language_loss": 0.73984933, "learning_rate": 4.4295023243937677e-07, "loss": 0.76179498, "num_input_tokens_seen": 141679965, "step": 6574, "time_per_iteration": 2.7244699001312256 }, { "auxiliary_loss_clip": 0.01173798, "auxiliary_loss_mlp": 0.01027988, "balance_loss_clip": 1.01381099, "balance_loss_mlp": 1.02020967, "epoch": 0.7905970059520231, "flos": 22089084681600.0, "grad_norm": 1.5344936744945334, "language_loss": 0.80220944, "learning_rate": 4.4246145724523123e-07, "loss": 0.82422727, "num_input_tokens_seen": 141697710, "step": 6575, "time_per_iteration": 2.6531872749328613 }, { "auxiliary_loss_clip": 0.01165046, "auxiliary_loss_mlp": 0.01026408, "balance_loss_clip": 0.93592602, "balance_loss_mlp": 1.01920772, "epoch": 0.7907172488426621, "flos": 20558141159040.0, "grad_norm": 2.1654608779794033, "language_loss": 0.77404189, "learning_rate": 4.41972918328444e-07, "loss": 0.79595649, "num_input_tokens_seen": 141715145, "step": 6576, "time_per_iteration": 3.6055850982666016 }, { "auxiliary_loss_clip": 0.01165329, "auxiliary_loss_mlp": 0.01027658, "balance_loss_clip": 1.01133811, "balance_loss_mlp": 1.02043951, "epoch": 0.7908374917333013, "flos": 30081542901120.0, "grad_norm": 1.8939989315777086, "language_loss": 0.77316296, "learning_rate": 4.4148461576312646e-07, "loss": 0.79509276, "num_input_tokens_seen": 141734810, "step": 6577, "time_per_iteration": 2.6666600704193115 }, { "auxiliary_loss_clip": 0.01169888, "auxiliary_loss_mlp": 0.01023395, "balance_loss_clip": 1.01202655, "balance_loss_mlp": 1.01666868, "epoch": 0.7909577346239404, "flos": 20996359084800.0, "grad_norm": 1.4250821195004977, "language_loss": 0.74539894, "learning_rate": 4.4099654962335343e-07, "loss": 0.76733172, "num_input_tokens_seen": 141755260, "step": 6578, "time_per_iteration": 2.6938910484313965 }, { "auxiliary_loss_clip": 0.01174629, "auxiliary_loss_mlp": 0.01031083, "balance_loss_clip": 0.97485363, "balance_loss_mlp": 1.02348387, "epoch": 0.7910779775145794, "flos": 26247935128320.0, "grad_norm": 1.7531448787243167, "language_loss": 0.74749172, "learning_rate": 4.405087199831636e-07, "loss": 0.76954883, "num_input_tokens_seen": 141775500, "step": 6579, "time_per_iteration": 2.7391626834869385 }, { "auxiliary_loss_clip": 0.01168273, "auxiliary_loss_mlp": 0.01122192, "balance_loss_clip": 0.97182035, "balance_loss_mlp": 0.0, "epoch": 0.7911982204052186, "flos": 22564434291840.0, "grad_norm": 2.261673620182817, "language_loss": 0.66881466, "learning_rate": 4.400211269165619e-07, "loss": 0.69171929, "num_input_tokens_seen": 141791955, "step": 6580, "time_per_iteration": 2.767219305038452 }, { "auxiliary_loss_clip": 0.01169732, "auxiliary_loss_mlp": 0.01026843, "balance_loss_clip": 1.05161309, "balance_loss_mlp": 1.02002466, "epoch": 0.7913184632958576, "flos": 23112538899840.0, "grad_norm": 1.435811557751292, "language_loss": 0.76912642, "learning_rate": 4.3953377049751416e-07, "loss": 0.79109216, "num_input_tokens_seen": 141812380, "step": 6581, "time_per_iteration": 2.659355401992798 }, { "auxiliary_loss_clip": 0.01171443, "auxiliary_loss_mlp": 0.01024213, "balance_loss_clip": 0.97371894, "balance_loss_mlp": 1.01663744, "epoch": 0.7914387061864967, "flos": 12311758719360.0, "grad_norm": 2.2428919747503495, "language_loss": 0.77396476, "learning_rate": 4.390466507999537e-07, "loss": 0.79592133, "num_input_tokens_seen": 141828130, "step": 6582, "time_per_iteration": 2.665325403213501 }, { "auxiliary_loss_clip": 0.0115881, "auxiliary_loss_mlp": 0.01021184, "balance_loss_clip": 0.93068099, "balance_loss_mlp": 1.01428223, "epoch": 0.7915589490771359, "flos": 17603267708160.0, "grad_norm": 2.1342699753447962, "language_loss": 0.75581646, "learning_rate": 4.385597678977748e-07, "loss": 0.77761638, "num_input_tokens_seen": 141846965, "step": 6583, "time_per_iteration": 2.7532379627227783 }, { "auxiliary_loss_clip": 0.01164024, "auxiliary_loss_mlp": 0.01024389, "balance_loss_clip": 0.96943599, "balance_loss_mlp": 1.01703966, "epoch": 0.7916791919677749, "flos": 25591272641280.0, "grad_norm": 1.6209411077192009, "language_loss": 0.75359279, "learning_rate": 4.3807312186483726e-07, "loss": 0.77547693, "num_input_tokens_seen": 141867685, "step": 6584, "time_per_iteration": 2.77107834815979 }, { "auxiliary_loss_clip": 0.01165777, "auxiliary_loss_mlp": 0.0102346, "balance_loss_clip": 1.01393533, "balance_loss_mlp": 1.01673055, "epoch": 0.791799434858414, "flos": 18844340474880.0, "grad_norm": 2.2372300962555554, "language_loss": 0.78208649, "learning_rate": 4.375867127749655e-07, "loss": 0.80397892, "num_input_tokens_seen": 141885960, "step": 6585, "time_per_iteration": 2.6960372924804688 }, { "auxiliary_loss_clip": 0.01165603, "auxiliary_loss_mlp": 0.01023221, "balance_loss_clip": 0.93531346, "balance_loss_mlp": 1.01602709, "epoch": 0.7919196777490531, "flos": 25812015672960.0, "grad_norm": 1.8958159106531245, "language_loss": 0.6716693, "learning_rate": 4.3710054070194744e-07, "loss": 0.69355756, "num_input_tokens_seen": 141905655, "step": 6586, "time_per_iteration": 2.788851737976074 }, { "auxiliary_loss_clip": 0.01170542, "auxiliary_loss_mlp": 0.01122772, "balance_loss_clip": 1.04841018, "balance_loss_mlp": 0.0, "epoch": 0.7920399206396922, "flos": 11947624594560.0, "grad_norm": 2.656840549277873, "language_loss": 0.66288555, "learning_rate": 4.3661460571953455e-07, "loss": 0.68581873, "num_input_tokens_seen": 141922390, "step": 6587, "time_per_iteration": 2.6571731567382812 }, { "auxiliary_loss_clip": 0.01166585, "auxiliary_loss_mlp": 0.0102016, "balance_loss_clip": 1.00748587, "balance_loss_mlp": 1.01344872, "epoch": 0.7921601635303313, "flos": 21579907438080.0, "grad_norm": 1.7020739317762215, "language_loss": 0.68833196, "learning_rate": 4.36128907901443e-07, "loss": 0.71019948, "num_input_tokens_seen": 141941985, "step": 6588, "time_per_iteration": 2.758697748184204 }, { "auxiliary_loss_clip": 0.01164964, "auxiliary_loss_mlp": 0.0102628, "balance_loss_clip": 0.93329686, "balance_loss_mlp": 1.01921105, "epoch": 0.7922804064209703, "flos": 18113989236480.0, "grad_norm": 2.1315633758046886, "language_loss": 0.7248795, "learning_rate": 4.356434473213519e-07, "loss": 0.74679196, "num_input_tokens_seen": 141959435, "step": 6589, "time_per_iteration": 2.6614747047424316 }, { "auxiliary_loss_clip": 0.01162179, "auxiliary_loss_mlp": 0.01025292, "balance_loss_clip": 0.97245663, "balance_loss_mlp": 1.01827669, "epoch": 0.7924006493116095, "flos": 21652806090240.0, "grad_norm": 1.6397013459163916, "language_loss": 0.79963148, "learning_rate": 4.351582240529068e-07, "loss": 0.82150626, "num_input_tokens_seen": 141980265, "step": 6590, "time_per_iteration": 2.686518669128418 }, { "auxiliary_loss_clip": 0.01068425, "auxiliary_loss_mlp": 0.01001559, "balance_loss_clip": 0.9348805, "balance_loss_mlp": 0.99993819, "epoch": 0.7925208922022485, "flos": 64242755694720.0, "grad_norm": 0.6998078994001393, "language_loss": 0.58213747, "learning_rate": 4.346732381697149e-07, "loss": 0.60283732, "num_input_tokens_seen": 142044395, "step": 6591, "time_per_iteration": 3.355914831161499 }, { "auxiliary_loss_clip": 0.01162916, "auxiliary_loss_mlp": 0.01026232, "balance_loss_clip": 0.97467959, "balance_loss_mlp": 1.01880252, "epoch": 0.7926411350928876, "flos": 16941541403520.0, "grad_norm": 1.735924010086351, "language_loss": 0.81118876, "learning_rate": 4.3418848974534825e-07, "loss": 0.83308023, "num_input_tokens_seen": 142061335, "step": 6592, "time_per_iteration": 2.6832900047302246 }, { "auxiliary_loss_clip": 0.01168642, "auxiliary_loss_mlp": 0.01027931, "balance_loss_clip": 0.93333739, "balance_loss_mlp": 1.02079618, "epoch": 0.7927613779835267, "flos": 34459987144320.0, "grad_norm": 1.7787595573125388, "language_loss": 0.68713284, "learning_rate": 4.3370397885334276e-07, "loss": 0.70909858, "num_input_tokens_seen": 142081965, "step": 6593, "time_per_iteration": 2.8167617321014404 }, { "auxiliary_loss_clip": 0.01160945, "auxiliary_loss_mlp": 0.01029111, "balance_loss_clip": 1.01062655, "balance_loss_mlp": 1.0221014, "epoch": 0.7928816208741658, "flos": 18951174501120.0, "grad_norm": 1.9077950919354163, "language_loss": 0.75349271, "learning_rate": 4.3321970556719777e-07, "loss": 0.77539325, "num_input_tokens_seen": 142100260, "step": 6594, "time_per_iteration": 2.649728298187256 }, { "auxiliary_loss_clip": 0.01168999, "auxiliary_loss_mlp": 0.0103095, "balance_loss_clip": 1.04921055, "balance_loss_mlp": 1.02422094, "epoch": 0.7930018637648049, "flos": 18623022825600.0, "grad_norm": 2.2627093211913385, "language_loss": 0.72326589, "learning_rate": 4.3273566996037856e-07, "loss": 0.74526536, "num_input_tokens_seen": 142116955, "step": 6595, "time_per_iteration": 3.4156532287597656 }, { "auxiliary_loss_clip": 0.0116598, "auxiliary_loss_mlp": 0.01024914, "balance_loss_clip": 0.9727121, "balance_loss_mlp": 1.01833987, "epoch": 0.793122106655444, "flos": 24530650824960.0, "grad_norm": 2.1439730364921523, "language_loss": 0.80437267, "learning_rate": 4.322518721063113e-07, "loss": 0.82628161, "num_input_tokens_seen": 142135505, "step": 6596, "time_per_iteration": 2.690807819366455 }, { "auxiliary_loss_clip": 0.01167271, "auxiliary_loss_mlp": 0.01032318, "balance_loss_clip": 1.01230025, "balance_loss_mlp": 1.02457845, "epoch": 0.7932423495460831, "flos": 34421203434240.0, "grad_norm": 1.7716820866618939, "language_loss": 0.70474476, "learning_rate": 4.3176831207838906e-07, "loss": 0.72674066, "num_input_tokens_seen": 142158915, "step": 6597, "time_per_iteration": 2.7766101360321045 }, { "auxiliary_loss_clip": 0.01166204, "auxiliary_loss_mlp": 0.01022582, "balance_loss_clip": 1.01214409, "balance_loss_mlp": 1.01567698, "epoch": 0.7933625924367221, "flos": 26980333441920.0, "grad_norm": 1.681652992501929, "language_loss": 0.74428988, "learning_rate": 4.3128498994996685e-07, "loss": 0.76617777, "num_input_tokens_seen": 142178390, "step": 6598, "time_per_iteration": 3.6730916500091553 }, { "auxiliary_loss_clip": 0.01174091, "auxiliary_loss_mlp": 0.01024767, "balance_loss_clip": 1.0116905, "balance_loss_mlp": 1.01736426, "epoch": 0.7934828353273613, "flos": 29568630643200.0, "grad_norm": 1.911151014610954, "language_loss": 0.71189725, "learning_rate": 4.308019057943646e-07, "loss": 0.73388588, "num_input_tokens_seen": 142200115, "step": 6599, "time_per_iteration": 3.6363918781280518 }, { "auxiliary_loss_clip": 0.01173548, "auxiliary_loss_mlp": 0.01032982, "balance_loss_clip": 0.8967731, "balance_loss_mlp": 1.02551985, "epoch": 0.7936030782180004, "flos": 28615381557120.0, "grad_norm": 1.79934550307467, "language_loss": 0.74593663, "learning_rate": 4.3031905968486535e-07, "loss": 0.76800191, "num_input_tokens_seen": 142220945, "step": 6600, "time_per_iteration": 2.7986552715301514 }, { "auxiliary_loss_clip": 0.01164188, "auxiliary_loss_mlp": 0.01027068, "balance_loss_clip": 0.89997488, "balance_loss_mlp": 1.02057445, "epoch": 0.7937233211086394, "flos": 16392574869120.0, "grad_norm": 2.0858815354438476, "language_loss": 0.68734765, "learning_rate": 4.298364516947162e-07, "loss": 0.70926017, "num_input_tokens_seen": 142238175, "step": 6601, "time_per_iteration": 3.5809648036956787 }, { "auxiliary_loss_clip": 0.01158051, "auxiliary_loss_mlp": 0.01024247, "balance_loss_clip": 0.89338362, "balance_loss_mlp": 1.01716328, "epoch": 0.7938435639992786, "flos": 22013420682240.0, "grad_norm": 1.8761681154081, "language_loss": 0.66088414, "learning_rate": 4.293540818971295e-07, "loss": 0.68270707, "num_input_tokens_seen": 142255980, "step": 6602, "time_per_iteration": 2.797194719314575 }, { "auxiliary_loss_clip": 0.01170935, "auxiliary_loss_mlp": 0.01027547, "balance_loss_clip": 1.00895762, "balance_loss_mlp": 1.01999474, "epoch": 0.7939638068899176, "flos": 22197032029440.0, "grad_norm": 1.9063647818133393, "language_loss": 0.76565158, "learning_rate": 4.2887195036527934e-07, "loss": 0.7876364, "num_input_tokens_seen": 142274785, "step": 6603, "time_per_iteration": 2.613011360168457 }, { "auxiliary_loss_clip": 0.01156326, "auxiliary_loss_mlp": 0.01029275, "balance_loss_clip": 1.00580692, "balance_loss_mlp": 1.02237606, "epoch": 0.7940840497805567, "flos": 17745186343680.0, "grad_norm": 2.3431633258167603, "language_loss": 0.73672539, "learning_rate": 4.28390057172306e-07, "loss": 0.7585814, "num_input_tokens_seen": 142291290, "step": 6604, "time_per_iteration": 2.5743629932403564 }, { "auxiliary_loss_clip": 0.01156862, "auxiliary_loss_mlp": 0.01028162, "balance_loss_clip": 0.93021291, "balance_loss_mlp": 1.02080679, "epoch": 0.7942042926711959, "flos": 23805435231360.0, "grad_norm": 2.0011201492355677, "language_loss": 0.72472739, "learning_rate": 4.279084023913111e-07, "loss": 0.74657768, "num_input_tokens_seen": 142309165, "step": 6605, "time_per_iteration": 2.719064712524414 }, { "auxiliary_loss_clip": 0.01167391, "auxiliary_loss_mlp": 0.01021782, "balance_loss_clip": 1.01170301, "balance_loss_mlp": 1.01513958, "epoch": 0.7943245355618349, "flos": 19244959839360.0, "grad_norm": 1.7037316684690165, "language_loss": 0.69102114, "learning_rate": 4.2742698609536096e-07, "loss": 0.71291292, "num_input_tokens_seen": 142327475, "step": 6606, "time_per_iteration": 2.6175365447998047 }, { "auxiliary_loss_clip": 0.01169825, "auxiliary_loss_mlp": 0.01028097, "balance_loss_clip": 0.97234923, "balance_loss_mlp": 1.02141261, "epoch": 0.794444778452474, "flos": 25007616547200.0, "grad_norm": 2.052493096646598, "language_loss": 0.78744745, "learning_rate": 4.2694580835748706e-07, "loss": 0.80942667, "num_input_tokens_seen": 142347335, "step": 6607, "time_per_iteration": 2.66918683052063 }, { "auxiliary_loss_clip": 0.01163201, "auxiliary_loss_mlp": 0.01024181, "balance_loss_clip": 0.97017229, "balance_loss_mlp": 1.01697779, "epoch": 0.7945650213431131, "flos": 23221491828480.0, "grad_norm": 2.063934771395369, "language_loss": 0.73760635, "learning_rate": 4.264648692506836e-07, "loss": 0.75948018, "num_input_tokens_seen": 142366125, "step": 6608, "time_per_iteration": 2.672050714492798 }, { "auxiliary_loss_clip": 0.01160006, "auxiliary_loss_mlp": 0.01025235, "balance_loss_clip": 0.97148144, "balance_loss_mlp": 1.01804054, "epoch": 0.7946852642337522, "flos": 26062887237120.0, "grad_norm": 2.269827507391957, "language_loss": 0.72336066, "learning_rate": 4.2598416884790824e-07, "loss": 0.74521303, "num_input_tokens_seen": 142385175, "step": 6609, "time_per_iteration": 2.709974527359009 }, { "auxiliary_loss_clip": 0.01173704, "auxiliary_loss_mlp": 0.01028419, "balance_loss_clip": 0.97046673, "balance_loss_mlp": 1.02104926, "epoch": 0.7948055071243912, "flos": 23769704177280.0, "grad_norm": 2.110031110953954, "language_loss": 0.80869323, "learning_rate": 4.255037072220828e-07, "loss": 0.83071446, "num_input_tokens_seen": 142406545, "step": 6610, "time_per_iteration": 2.6685116291046143 }, { "auxiliary_loss_clip": 0.0116388, "auxiliary_loss_mlp": 0.01024647, "balance_loss_clip": 1.04702294, "balance_loss_mlp": 1.01761675, "epoch": 0.7949257500150304, "flos": 21980814111360.0, "grad_norm": 1.7548096687039065, "language_loss": 0.71703756, "learning_rate": 4.2502348444609293e-07, "loss": 0.73892289, "num_input_tokens_seen": 142426165, "step": 6611, "time_per_iteration": 2.6427838802337646 }, { "auxiliary_loss_clip": 0.01158266, "auxiliary_loss_mlp": 0.01026817, "balance_loss_clip": 0.89179063, "balance_loss_mlp": 1.01997483, "epoch": 0.7950459929056695, "flos": 25774129802880.0, "grad_norm": 2.7330216884299916, "language_loss": 0.69601136, "learning_rate": 4.2454350059278844e-07, "loss": 0.71786219, "num_input_tokens_seen": 142447225, "step": 6612, "time_per_iteration": 2.7622573375701904 }, { "auxiliary_loss_clip": 0.0115675, "auxiliary_loss_mlp": 0.01027588, "balance_loss_clip": 0.96755803, "balance_loss_mlp": 1.02055502, "epoch": 0.7951662357963085, "flos": 22158068751360.0, "grad_norm": 1.8592138581777553, "language_loss": 0.84133089, "learning_rate": 4.240637557349824e-07, "loss": 0.86317432, "num_input_tokens_seen": 142464440, "step": 6613, "time_per_iteration": 2.7119674682617188 }, { "auxiliary_loss_clip": 0.01154067, "auxiliary_loss_mlp": 0.01023555, "balance_loss_clip": 0.97084111, "balance_loss_mlp": 1.01640201, "epoch": 0.7952864786869477, "flos": 24641938137600.0, "grad_norm": 2.1901483320272304, "language_loss": 0.66440701, "learning_rate": 4.235842499454516e-07, "loss": 0.68618321, "num_input_tokens_seen": 142484355, "step": 6614, "time_per_iteration": 2.745274782180786 }, { "auxiliary_loss_clip": 0.01166803, "auxiliary_loss_mlp": 0.0102834, "balance_loss_clip": 0.9728241, "balance_loss_mlp": 1.02170873, "epoch": 0.7954067215775867, "flos": 21830922656640.0, "grad_norm": 1.6410933117429058, "language_loss": 0.83166349, "learning_rate": 4.2310498329693687e-07, "loss": 0.85361493, "num_input_tokens_seen": 142505255, "step": 6615, "time_per_iteration": 2.7335257530212402 }, { "auxiliary_loss_clip": 0.01171295, "auxiliary_loss_mlp": 0.01023899, "balance_loss_clip": 1.01086259, "balance_loss_mlp": 1.01623058, "epoch": 0.7955269644682258, "flos": 24060652341120.0, "grad_norm": 1.5679589684051565, "language_loss": 0.80715811, "learning_rate": 4.2262595586214164e-07, "loss": 0.82911003, "num_input_tokens_seen": 142526350, "step": 6616, "time_per_iteration": 2.6736526489257812 }, { "auxiliary_loss_clip": 0.01171885, "auxiliary_loss_mlp": 0.01030991, "balance_loss_clip": 1.01127148, "balance_loss_mlp": 1.02338576, "epoch": 0.795647207358865, "flos": 25010741030400.0, "grad_norm": 1.7790171210239847, "language_loss": 0.76492614, "learning_rate": 4.221471677137358e-07, "loss": 0.78695488, "num_input_tokens_seen": 142547165, "step": 6617, "time_per_iteration": 2.710193395614624 }, { "auxiliary_loss_clip": 0.01153298, "auxiliary_loss_mlp": 0.01027271, "balance_loss_clip": 0.97116148, "balance_loss_mlp": 1.02038932, "epoch": 0.795767450249504, "flos": 14648358343680.0, "grad_norm": 1.5017985669923928, "language_loss": 0.70227575, "learning_rate": 4.216686189243492e-07, "loss": 0.72408146, "num_input_tokens_seen": 142565955, "step": 6618, "time_per_iteration": 2.630669593811035 }, { "auxiliary_loss_clip": 0.01160164, "auxiliary_loss_mlp": 0.01022, "balance_loss_clip": 0.93338418, "balance_loss_mlp": 1.01483846, "epoch": 0.7958876931401431, "flos": 18547897530240.0, "grad_norm": 1.9739284581539729, "language_loss": 0.7292012, "learning_rate": 4.211903095665785e-07, "loss": 0.75102282, "num_input_tokens_seen": 142585340, "step": 6619, "time_per_iteration": 2.7223665714263916 }, { "auxiliary_loss_clip": 0.01158953, "auxiliary_loss_mlp": 0.01028074, "balance_loss_clip": 1.00747681, "balance_loss_mlp": 1.02105308, "epoch": 0.7960079360307821, "flos": 21543960902400.0, "grad_norm": 1.7595456511000573, "language_loss": 0.74883831, "learning_rate": 4.2071223971298277e-07, "loss": 0.77070856, "num_input_tokens_seen": 142602525, "step": 6620, "time_per_iteration": 2.6126291751861572 }, { "auxiliary_loss_clip": 0.01166425, "auxiliary_loss_mlp": 0.01026563, "balance_loss_clip": 1.00819468, "balance_loss_mlp": 1.01955056, "epoch": 0.7961281789214213, "flos": 25481745095040.0, "grad_norm": 1.8837505616793533, "language_loss": 0.61320782, "learning_rate": 4.2023440943608433e-07, "loss": 0.63513768, "num_input_tokens_seen": 142622490, "step": 6621, "time_per_iteration": 3.5050368309020996 }, { "auxiliary_loss_clip": 0.01164964, "auxiliary_loss_mlp": 0.01025409, "balance_loss_clip": 1.00844359, "balance_loss_mlp": 1.01903701, "epoch": 0.7962484218120603, "flos": 21944436612480.0, "grad_norm": 1.507873052169503, "language_loss": 0.78226382, "learning_rate": 4.1975681880837023e-07, "loss": 0.80416751, "num_input_tokens_seen": 142642495, "step": 6622, "time_per_iteration": 2.720940351486206 }, { "auxiliary_loss_clip": 0.01155633, "auxiliary_loss_mlp": 0.01026696, "balance_loss_clip": 0.92892241, "balance_loss_mlp": 1.01987123, "epoch": 0.7963686647026994, "flos": 18876264687360.0, "grad_norm": 1.7421960820088846, "language_loss": 0.82340342, "learning_rate": 4.192794679022895e-07, "loss": 0.84522676, "num_input_tokens_seen": 142660820, "step": 6623, "time_per_iteration": 2.637407064437866 }, { "auxiliary_loss_clip": 0.01166062, "auxiliary_loss_mlp": 0.01021657, "balance_loss_clip": 1.00903523, "balance_loss_mlp": 1.01490355, "epoch": 0.7964889075933386, "flos": 29716582763520.0, "grad_norm": 1.7669472472626073, "language_loss": 0.71995085, "learning_rate": 4.1880235679025743e-07, "loss": 0.74182808, "num_input_tokens_seen": 142680915, "step": 6624, "time_per_iteration": 3.7265303134918213 }, { "auxiliary_loss_clip": 0.01161435, "auxiliary_loss_mlp": 0.01024298, "balance_loss_clip": 0.85620928, "balance_loss_mlp": 1.01685596, "epoch": 0.7966091504839776, "flos": 29491458272640.0, "grad_norm": 1.9251147197406175, "language_loss": 0.63878363, "learning_rate": 4.1832548554464986e-07, "loss": 0.66064095, "num_input_tokens_seen": 142699210, "step": 6625, "time_per_iteration": 3.7138264179229736 }, { "auxiliary_loss_clip": 0.01059036, "auxiliary_loss_mlp": 0.01003495, "balance_loss_clip": 0.97163975, "balance_loss_mlp": 1.00185001, "epoch": 0.7967293933746167, "flos": 67288697101440.0, "grad_norm": 0.7443259258273837, "language_loss": 0.58803809, "learning_rate": 4.178488542378098e-07, "loss": 0.60866344, "num_input_tokens_seen": 142756790, "step": 6626, "time_per_iteration": 3.1010262966156006 }, { "auxiliary_loss_clip": 0.01171973, "auxiliary_loss_mlp": 0.01024542, "balance_loss_clip": 1.04884207, "balance_loss_mlp": 1.01656127, "epoch": 0.7968496362652558, "flos": 25554679660800.0, "grad_norm": 1.7725554090952644, "language_loss": 0.88689578, "learning_rate": 4.173724629420401e-07, "loss": 0.90886092, "num_input_tokens_seen": 142778150, "step": 6627, "time_per_iteration": 2.621213436126709 }, { "auxiliary_loss_clip": 0.01170623, "auxiliary_loss_mlp": 0.01025679, "balance_loss_clip": 0.97288406, "balance_loss_mlp": 1.01765025, "epoch": 0.7969698791558949, "flos": 14501088581760.0, "grad_norm": 2.99437673494826, "language_loss": 0.68797493, "learning_rate": 4.168963117296087e-07, "loss": 0.70993799, "num_input_tokens_seen": 142795485, "step": 6628, "time_per_iteration": 3.6006922721862793 }, { "auxiliary_loss_clip": 0.01168995, "auxiliary_loss_mlp": 0.01025023, "balance_loss_clip": 1.05008698, "balance_loss_mlp": 1.0178678, "epoch": 0.797090122046534, "flos": 22127545169280.0, "grad_norm": 2.261359572264553, "language_loss": 0.75792146, "learning_rate": 4.1642040067274876e-07, "loss": 0.77986157, "num_input_tokens_seen": 142815155, "step": 6629, "time_per_iteration": 2.6368231773376465 }, { "auxiliary_loss_clip": 0.01169802, "auxiliary_loss_mlp": 0.0102838, "balance_loss_clip": 0.97243953, "balance_loss_mlp": 1.02170694, "epoch": 0.7972103649371731, "flos": 19897671830400.0, "grad_norm": 3.0242847056233644, "language_loss": 0.72706193, "learning_rate": 4.1594472984365493e-07, "loss": 0.7490437, "num_input_tokens_seen": 142833840, "step": 6630, "time_per_iteration": 2.719367504119873 }, { "auxiliary_loss_clip": 0.01167288, "auxiliary_loss_mlp": 0.01030661, "balance_loss_clip": 1.01280951, "balance_loss_mlp": 1.02334428, "epoch": 0.7973306078278122, "flos": 36058621847040.0, "grad_norm": 2.11030454985203, "language_loss": 0.77495003, "learning_rate": 4.154692993144862e-07, "loss": 0.7969296, "num_input_tokens_seen": 142853610, "step": 6631, "time_per_iteration": 2.7558059692382812 }, { "auxiliary_loss_clip": 0.01169212, "auxiliary_loss_mlp": 0.01122485, "balance_loss_clip": 1.04910398, "balance_loss_mlp": 0.0, "epoch": 0.7974508507184512, "flos": 21360600950400.0, "grad_norm": 2.023467870122603, "language_loss": 0.71701908, "learning_rate": 4.1499410915736476e-07, "loss": 0.73993611, "num_input_tokens_seen": 142872540, "step": 6632, "time_per_iteration": 2.6376378536224365 }, { "auxiliary_loss_clip": 0.01065124, "auxiliary_loss_mlp": 0.01001243, "balance_loss_clip": 0.97284508, "balance_loss_mlp": 0.99960941, "epoch": 0.7975710936090904, "flos": 68253115317120.0, "grad_norm": 0.7782471071193168, "language_loss": 0.64318955, "learning_rate": 4.145191594443762e-07, "loss": 0.66385317, "num_input_tokens_seen": 142936895, "step": 6633, "time_per_iteration": 3.3235673904418945 }, { "auxiliary_loss_clip": 0.01161062, "auxiliary_loss_mlp": 0.01025178, "balance_loss_clip": 0.93412066, "balance_loss_mlp": 1.01753378, "epoch": 0.7976913364997295, "flos": 22492433479680.0, "grad_norm": 1.6660955836263895, "language_loss": 0.70466912, "learning_rate": 4.140444502475713e-07, "loss": 0.72653151, "num_input_tokens_seen": 142956445, "step": 6634, "time_per_iteration": 2.7402284145355225 }, { "auxiliary_loss_clip": 0.01162383, "auxiliary_loss_mlp": 0.01023888, "balance_loss_clip": 1.00810277, "balance_loss_mlp": 1.01650023, "epoch": 0.7978115793903685, "flos": 15263220378240.0, "grad_norm": 6.097605387190584, "language_loss": 0.69833803, "learning_rate": 4.1356998163896216e-07, "loss": 0.72020078, "num_input_tokens_seen": 142973495, "step": 6635, "time_per_iteration": 2.6169393062591553 }, { "auxiliary_loss_clip": 0.01170061, "auxiliary_loss_mlp": 0.01026435, "balance_loss_clip": 0.93393219, "balance_loss_mlp": 1.01925278, "epoch": 0.7979318222810077, "flos": 19719232041600.0, "grad_norm": 1.949719205235015, "language_loss": 0.74952507, "learning_rate": 4.130957536905255e-07, "loss": 0.77148998, "num_input_tokens_seen": 142991510, "step": 6636, "time_per_iteration": 2.7691726684570312 }, { "auxiliary_loss_clip": 0.01175893, "auxiliary_loss_mlp": 0.01027486, "balance_loss_clip": 0.97335649, "balance_loss_mlp": 1.02008367, "epoch": 0.7980520651716467, "flos": 15560273854080.0, "grad_norm": 2.7983277182610258, "language_loss": 0.7126627, "learning_rate": 4.1262176647420134e-07, "loss": 0.73469651, "num_input_tokens_seen": 143009675, "step": 6637, "time_per_iteration": 2.6686131954193115 }, { "auxiliary_loss_clip": 0.01171852, "auxiliary_loss_mlp": 0.01023889, "balance_loss_clip": 0.97262472, "balance_loss_mlp": 1.01741862, "epoch": 0.7981723080622858, "flos": 22309432663680.0, "grad_norm": 1.6921938277982265, "language_loss": 0.79678082, "learning_rate": 4.121480200618923e-07, "loss": 0.81873828, "num_input_tokens_seen": 143029330, "step": 6638, "time_per_iteration": 2.628260612487793 }, { "auxiliary_loss_clip": 0.01158312, "auxiliary_loss_mlp": 0.01026356, "balance_loss_clip": 0.97146028, "balance_loss_mlp": 1.0189476, "epoch": 0.798292550952925, "flos": 22929573997440.0, "grad_norm": 1.7841493311015217, "language_loss": 0.80060828, "learning_rate": 4.116745145254674e-07, "loss": 0.82245493, "num_input_tokens_seen": 143048865, "step": 6639, "time_per_iteration": 2.7094192504882812 }, { "auxiliary_loss_clip": 0.01062279, "auxiliary_loss_mlp": 0.01003648, "balance_loss_clip": 0.93716264, "balance_loss_mlp": 1.00195491, "epoch": 0.798412793843564, "flos": 64497936890880.0, "grad_norm": 0.7681381037675021, "language_loss": 0.58080113, "learning_rate": 4.1120124993675476e-07, "loss": 0.60146034, "num_input_tokens_seen": 143113295, "step": 6640, "time_per_iteration": 3.2440171241760254 }, { "auxiliary_loss_clip": 0.01172653, "auxiliary_loss_mlp": 0.01028667, "balance_loss_clip": 0.9714694, "balance_loss_mlp": 1.02093291, "epoch": 0.7985330367342031, "flos": 13586910514560.0, "grad_norm": 3.4077068753378867, "language_loss": 0.62193471, "learning_rate": 4.107282263675498e-07, "loss": 0.64394796, "num_input_tokens_seen": 143130965, "step": 6641, "time_per_iteration": 2.702986240386963 }, { "auxiliary_loss_clip": 0.01066966, "auxiliary_loss_mlp": 0.01116099, "balance_loss_clip": 0.93801415, "balance_loss_mlp": 0.0, "epoch": 0.7986532796248422, "flos": 67698797656320.0, "grad_norm": 0.8631809625309379, "language_loss": 0.52551186, "learning_rate": 4.1025544388960907e-07, "loss": 0.54734242, "num_input_tokens_seen": 143192005, "step": 6642, "time_per_iteration": 3.2151694297790527 }, { "auxiliary_loss_clip": 0.01163386, "auxiliary_loss_mlp": 0.01028554, "balance_loss_clip": 1.00987637, "balance_loss_mlp": 1.02171433, "epoch": 0.7987735225154813, "flos": 22455373622400.0, "grad_norm": 2.171917626060502, "language_loss": 0.71614361, "learning_rate": 4.097829025746538e-07, "loss": 0.73806304, "num_input_tokens_seen": 143213550, "step": 6643, "time_per_iteration": 2.7020928859710693 }, { "auxiliary_loss_clip": 0.01064841, "auxiliary_loss_mlp": 0.0100181, "balance_loss_clip": 0.97362602, "balance_loss_mlp": 1.00023639, "epoch": 0.7988937654061203, "flos": 68864098682880.0, "grad_norm": 0.6636167266474358, "language_loss": 0.61036718, "learning_rate": 4.0931060249436757e-07, "loss": 0.63103366, "num_input_tokens_seen": 143277390, "step": 6644, "time_per_iteration": 3.210146427154541 }, { "auxiliary_loss_clip": 0.01165102, "auxiliary_loss_mlp": 0.01027773, "balance_loss_clip": 1.01138008, "balance_loss_mlp": 1.02068341, "epoch": 0.7990140082967595, "flos": 20806893820800.0, "grad_norm": 2.343781273293228, "language_loss": 0.69362664, "learning_rate": 4.088385437203978e-07, "loss": 0.71555537, "num_input_tokens_seen": 143294400, "step": 6645, "time_per_iteration": 2.662916660308838 }, { "auxiliary_loss_clip": 0.011689, "auxiliary_loss_mlp": 0.01022768, "balance_loss_clip": 1.04717088, "balance_loss_mlp": 1.01547194, "epoch": 0.7991342511873986, "flos": 18985289443200.0, "grad_norm": 1.9674634653589176, "language_loss": 0.77441847, "learning_rate": 4.083667263243564e-07, "loss": 0.7963351, "num_input_tokens_seen": 143312745, "step": 6646, "time_per_iteration": 3.3858907222747803 }, { "auxiliary_loss_clip": 0.0116862, "auxiliary_loss_mlp": 0.01030183, "balance_loss_clip": 1.01349306, "balance_loss_mlp": 1.02275038, "epoch": 0.7992544940780376, "flos": 20816805974400.0, "grad_norm": 1.614260872679887, "language_loss": 0.71846527, "learning_rate": 4.0789515037781653e-07, "loss": 0.7404533, "num_input_tokens_seen": 143333470, "step": 6647, "time_per_iteration": 2.713832378387451 }, { "auxiliary_loss_clip": 0.01171413, "auxiliary_loss_mlp": 0.01021964, "balance_loss_clip": 1.01019478, "balance_loss_mlp": 1.01461768, "epoch": 0.7993747369686768, "flos": 12640772321280.0, "grad_norm": 1.6386910108539334, "language_loss": 0.82386976, "learning_rate": 4.0742381595231755e-07, "loss": 0.8458035, "num_input_tokens_seen": 143350195, "step": 6648, "time_per_iteration": 2.608400583267212 }, { "auxiliary_loss_clip": 0.01169324, "auxiliary_loss_mlp": 0.01023817, "balance_loss_clip": 0.93356073, "balance_loss_mlp": 1.01693237, "epoch": 0.7994949798593158, "flos": 20078769225600.0, "grad_norm": 1.5010394443692616, "language_loss": 0.78463978, "learning_rate": 4.06952723119359e-07, "loss": 0.80657119, "num_input_tokens_seen": 143370070, "step": 6649, "time_per_iteration": 2.697458267211914 }, { "auxiliary_loss_clip": 0.01156636, "auxiliary_loss_mlp": 0.01023635, "balance_loss_clip": 0.97141117, "balance_loss_mlp": 1.01670897, "epoch": 0.7996152227499549, "flos": 38654209509120.0, "grad_norm": 2.924453323757419, "language_loss": 0.67358446, "learning_rate": 4.0648187195040504e-07, "loss": 0.69538718, "num_input_tokens_seen": 143392275, "step": 6650, "time_per_iteration": 2.8376846313476562 }, { "auxiliary_loss_clip": 0.01058715, "auxiliary_loss_mlp": 0.0100311, "balance_loss_clip": 0.97117114, "balance_loss_mlp": 1.00152433, "epoch": 0.799735465640594, "flos": 70243821947520.0, "grad_norm": 0.8162518125262407, "language_loss": 0.67724097, "learning_rate": 4.060112625168848e-07, "loss": 0.69785923, "num_input_tokens_seen": 143457385, "step": 6651, "time_per_iteration": 5.598546028137207 }, { "auxiliary_loss_clip": 0.01168773, "auxiliary_loss_mlp": 0.01033431, "balance_loss_clip": 1.04910576, "balance_loss_mlp": 1.02599239, "epoch": 0.7998557085312331, "flos": 24240995550720.0, "grad_norm": 1.8078327862235222, "language_loss": 0.74085724, "learning_rate": 4.055408948901886e-07, "loss": 0.76287919, "num_input_tokens_seen": 143478785, "step": 6652, "time_per_iteration": 2.642275094985962 }, { "auxiliary_loss_clip": 0.01169499, "auxiliary_loss_mlp": 0.01027828, "balance_loss_clip": 1.00956678, "balance_loss_mlp": 1.02029443, "epoch": 0.7999759514218722, "flos": 27564025449600.0, "grad_norm": 1.737633706267267, "language_loss": 0.71576107, "learning_rate": 4.050707691416708e-07, "loss": 0.73773432, "num_input_tokens_seen": 143500095, "step": 6653, "time_per_iteration": 2.700253963470459 }, { "auxiliary_loss_clip": 0.01058452, "auxiliary_loss_mlp": 0.0100172, "balance_loss_clip": 0.97112864, "balance_loss_mlp": 1.00005078, "epoch": 0.8000961943125112, "flos": 67337428878720.0, "grad_norm": 0.7067607961393236, "language_loss": 0.5982495, "learning_rate": 4.046008853426495e-07, "loss": 0.61885124, "num_input_tokens_seen": 143563410, "step": 6654, "time_per_iteration": 4.095434665679932 }, { "auxiliary_loss_clip": 0.01156269, "auxiliary_loss_mlp": 0.01026867, "balance_loss_clip": 0.93056822, "balance_loss_mlp": 1.01940489, "epoch": 0.8002164372031504, "flos": 28733815676160.0, "grad_norm": 1.7239748522236884, "language_loss": 0.6216507, "learning_rate": 4.0413124356440464e-07, "loss": 0.64348209, "num_input_tokens_seen": 143587455, "step": 6655, "time_per_iteration": 2.783257484436035 }, { "auxiliary_loss_clip": 0.01166296, "auxiliary_loss_mlp": 0.01025666, "balance_loss_clip": 0.89519483, "balance_loss_mlp": 1.01844239, "epoch": 0.8003366800937894, "flos": 17639429725440.0, "grad_norm": 1.8908485271652917, "language_loss": 0.82274163, "learning_rate": 4.0366184387818223e-07, "loss": 0.84466124, "num_input_tokens_seen": 143605915, "step": 6656, "time_per_iteration": 2.7230944633483887 }, { "auxiliary_loss_clip": 0.01174501, "auxiliary_loss_mlp": 0.01029978, "balance_loss_clip": 1.05030191, "balance_loss_mlp": 1.02249718, "epoch": 0.8004569229844285, "flos": 25995303797760.0, "grad_norm": 1.7635125200075445, "language_loss": 0.85022676, "learning_rate": 4.0319268635518797e-07, "loss": 0.8722716, "num_input_tokens_seen": 143626490, "step": 6657, "time_per_iteration": 249.08684515953064 }, { "auxiliary_loss_clip": 0.01166888, "auxiliary_loss_mlp": 0.01025472, "balance_loss_clip": 1.01004171, "balance_loss_mlp": 1.01860881, "epoch": 0.8005771658750677, "flos": 20812352688000.0, "grad_norm": 1.607390689911882, "language_loss": 0.75130737, "learning_rate": 4.027237710665943e-07, "loss": 0.77323103, "num_input_tokens_seen": 143644955, "step": 6658, "time_per_iteration": 2.681439161300659 }, { "auxiliary_loss_clip": 0.01166729, "auxiliary_loss_mlp": 0.0101831, "balance_loss_clip": 0.9301936, "balance_loss_mlp": 1.01101422, "epoch": 0.8006974087657067, "flos": 25812626204160.0, "grad_norm": 1.7286641799811648, "language_loss": 0.69258893, "learning_rate": 4.022550980835344e-07, "loss": 0.71443927, "num_input_tokens_seen": 143667200, "step": 6659, "time_per_iteration": 2.8724026679992676 }, { "auxiliary_loss_clip": 0.01161432, "auxiliary_loss_mlp": 0.01023797, "balance_loss_clip": 0.93063819, "balance_loss_mlp": 1.01663876, "epoch": 0.8008176516563458, "flos": 17164690646400.0, "grad_norm": 2.037297213538561, "language_loss": 0.79110581, "learning_rate": 4.017866674771051e-07, "loss": 0.81295812, "num_input_tokens_seen": 143684685, "step": 6660, "time_per_iteration": 2.7092056274414062 }, { "auxiliary_loss_clip": 0.0115391, "auxiliary_loss_mlp": 0.01021466, "balance_loss_clip": 0.89094251, "balance_loss_mlp": 1.01448917, "epoch": 0.8009378945469849, "flos": 24207311571840.0, "grad_norm": 1.7807262294788695, "language_loss": 0.74093586, "learning_rate": 4.013184793183688e-07, "loss": 0.76268959, "num_input_tokens_seen": 143706780, "step": 6661, "time_per_iteration": 2.7729671001434326 }, { "auxiliary_loss_clip": 0.01163819, "auxiliary_loss_mlp": 0.01026339, "balance_loss_clip": 1.00730407, "balance_loss_mlp": 1.01934731, "epoch": 0.801058137437624, "flos": 19787318271360.0, "grad_norm": 1.7092703055983405, "language_loss": 0.72696221, "learning_rate": 4.008505336783472e-07, "loss": 0.7488637, "num_input_tokens_seen": 143724505, "step": 6662, "time_per_iteration": 2.7266480922698975 }, { "auxiliary_loss_clip": 0.01154653, "auxiliary_loss_mlp": 0.01028991, "balance_loss_clip": 1.00668776, "balance_loss_mlp": 1.02244306, "epoch": 0.801178380328263, "flos": 18659400324480.0, "grad_norm": 2.0548922073301714, "language_loss": 0.80590451, "learning_rate": 4.003828306280284e-07, "loss": 0.82774091, "num_input_tokens_seen": 143742180, "step": 6663, "time_per_iteration": 2.669433832168579 }, { "auxiliary_loss_clip": 0.01168587, "auxiliary_loss_mlp": 0.01029438, "balance_loss_clip": 1.01026571, "balance_loss_mlp": 1.0227325, "epoch": 0.8012986232189022, "flos": 15706573948800.0, "grad_norm": 1.7123619604687215, "language_loss": 0.78012979, "learning_rate": 3.999153702383626e-07, "loss": 0.80211008, "num_input_tokens_seen": 143760070, "step": 6664, "time_per_iteration": 2.7747952938079834 }, { "auxiliary_loss_clip": 0.01170372, "auxiliary_loss_mlp": 0.01025853, "balance_loss_clip": 1.00919461, "balance_loss_mlp": 1.01879025, "epoch": 0.8014188661095413, "flos": 28584139703040.0, "grad_norm": 1.6874933843651985, "language_loss": 0.73921382, "learning_rate": 3.9944815258026263e-07, "loss": 0.76117611, "num_input_tokens_seen": 143781890, "step": 6665, "time_per_iteration": 2.7159814834594727 }, { "auxiliary_loss_clip": 0.01171738, "auxiliary_loss_mlp": 0.01024088, "balance_loss_clip": 1.01060748, "balance_loss_mlp": 1.01658964, "epoch": 0.8015391090001803, "flos": 29310360877440.0, "grad_norm": 1.6039904872683817, "language_loss": 0.82961285, "learning_rate": 3.989811777246057e-07, "loss": 0.85157108, "num_input_tokens_seen": 143802060, "step": 6666, "time_per_iteration": 2.706937551498413 }, { "auxiliary_loss_clip": 0.01061278, "auxiliary_loss_mlp": 0.01002347, "balance_loss_clip": 1.00965095, "balance_loss_mlp": 1.00066662, "epoch": 0.8016593518908195, "flos": 70397340675840.0, "grad_norm": 0.8758292555905414, "language_loss": 0.66251791, "learning_rate": 3.985144457422305e-07, "loss": 0.68315423, "num_input_tokens_seen": 143856345, "step": 6667, "time_per_iteration": 3.1428170204162598 }, { "auxiliary_loss_clip": 0.01168171, "auxiliary_loss_mlp": 0.01026081, "balance_loss_clip": 1.04839373, "balance_loss_mlp": 1.01895177, "epoch": 0.8017795947814585, "flos": 26026114688640.0, "grad_norm": 1.7830700516648605, "language_loss": 0.76438427, "learning_rate": 3.9804795670394096e-07, "loss": 0.78632683, "num_input_tokens_seen": 143876470, "step": 6668, "time_per_iteration": 2.6442933082580566 }, { "auxiliary_loss_clip": 0.01155338, "auxiliary_loss_mlp": 0.01024782, "balance_loss_clip": 0.96838522, "balance_loss_mlp": 1.01793599, "epoch": 0.8018998376720976, "flos": 22087181260800.0, "grad_norm": 1.6281054479759762, "language_loss": 0.70554513, "learning_rate": 3.975817106805022e-07, "loss": 0.7273463, "num_input_tokens_seen": 143895170, "step": 6669, "time_per_iteration": 2.7146663665771484 }, { "auxiliary_loss_clip": 0.01163994, "auxiliary_loss_mlp": 0.0102514, "balance_loss_clip": 0.93349159, "balance_loss_mlp": 1.01823783, "epoch": 0.8020200805627368, "flos": 34568545023360.0, "grad_norm": 1.7875655546713796, "language_loss": 0.64938217, "learning_rate": 3.97115707742645e-07, "loss": 0.67127347, "num_input_tokens_seen": 143915845, "step": 6670, "time_per_iteration": 2.7519052028656006 }, { "auxiliary_loss_clip": 0.01166801, "auxiliary_loss_mlp": 0.01023534, "balance_loss_clip": 0.97344983, "balance_loss_mlp": 1.01608634, "epoch": 0.8021403234533758, "flos": 20120354196480.0, "grad_norm": 1.8754701693336862, "language_loss": 0.64907664, "learning_rate": 3.966499479610599e-07, "loss": 0.67097998, "num_input_tokens_seen": 143933940, "step": 6671, "time_per_iteration": 2.7593581676483154 }, { "auxiliary_loss_clip": 0.01164874, "auxiliary_loss_mlp": 0.01028471, "balance_loss_clip": 0.93627775, "balance_loss_mlp": 1.02193797, "epoch": 0.8022605663440149, "flos": 27746200252800.0, "grad_norm": 2.1385768456286107, "language_loss": 0.64825428, "learning_rate": 3.9618443140640225e-07, "loss": 0.67018771, "num_input_tokens_seen": 143952850, "step": 6672, "time_per_iteration": 3.6643426418304443 }, { "auxiliary_loss_clip": 0.01066686, "auxiliary_loss_mlp": 0.01003209, "balance_loss_clip": 0.86105883, "balance_loss_mlp": 1.00152838, "epoch": 0.802380809234654, "flos": 60244998768000.0, "grad_norm": 0.6818387961768673, "language_loss": 0.51430964, "learning_rate": 3.957191581492918e-07, "loss": 0.53500867, "num_input_tokens_seen": 144013610, "step": 6673, "time_per_iteration": 3.3500208854675293 }, { "auxiliary_loss_clip": 0.01158453, "auxiliary_loss_mlp": 0.01027284, "balance_loss_clip": 0.97044456, "balance_loss_mlp": 1.02018499, "epoch": 0.8025010521252931, "flos": 15080722352640.0, "grad_norm": 4.088369216329761, "language_loss": 0.71264476, "learning_rate": 3.952541282603097e-07, "loss": 0.73450208, "num_input_tokens_seen": 144028715, "step": 6674, "time_per_iteration": 2.6936709880828857 }, { "auxiliary_loss_clip": 0.01165116, "auxiliary_loss_mlp": 0.01022355, "balance_loss_clip": 1.01010275, "balance_loss_mlp": 1.01496696, "epoch": 0.8026212950159322, "flos": 22163527618560.0, "grad_norm": 2.8452946240699024, "language_loss": 0.83455169, "learning_rate": 3.9478934181000013e-07, "loss": 0.85642636, "num_input_tokens_seen": 144048740, "step": 6675, "time_per_iteration": 2.6812679767608643 }, { "auxiliary_loss_clip": 0.01170098, "auxiliary_loss_mlp": 0.01029865, "balance_loss_clip": 1.04698324, "balance_loss_mlp": 1.02221739, "epoch": 0.8027415379065713, "flos": 17675986792320.0, "grad_norm": 3.3228667197168997, "language_loss": 0.84434724, "learning_rate": 3.943247988688714e-07, "loss": 0.86634684, "num_input_tokens_seen": 144067435, "step": 6676, "time_per_iteration": 3.6183021068573 }, { "auxiliary_loss_clip": 0.01168491, "auxiliary_loss_mlp": 0.01025702, "balance_loss_clip": 1.00939584, "balance_loss_mlp": 1.01929116, "epoch": 0.8028617807972104, "flos": 21979593048960.0, "grad_norm": 1.6784917221101814, "language_loss": 0.7217201, "learning_rate": 3.938604995073933e-07, "loss": 0.743662, "num_input_tokens_seen": 144085905, "step": 6677, "time_per_iteration": 2.630826473236084 }, { "auxiliary_loss_clip": 0.01168623, "auxiliary_loss_mlp": 0.0102795, "balance_loss_clip": 0.97133392, "balance_loss_mlp": 1.02071404, "epoch": 0.8029820236878494, "flos": 26428457905920.0, "grad_norm": 2.599277773219105, "language_loss": 0.65183222, "learning_rate": 3.9339644379600157e-07, "loss": 0.67379797, "num_input_tokens_seen": 144105735, "step": 6678, "time_per_iteration": 3.91355562210083 }, { "auxiliary_loss_clip": 0.01171414, "auxiliary_loss_mlp": 0.01032795, "balance_loss_clip": 1.01292002, "balance_loss_mlp": 1.02580309, "epoch": 0.8031022665784886, "flos": 17676489582720.0, "grad_norm": 1.83208940648015, "language_loss": 0.71009618, "learning_rate": 3.929326318050907e-07, "loss": 0.73213828, "num_input_tokens_seen": 144123405, "step": 6679, "time_per_iteration": 2.728790044784546 }, { "auxiliary_loss_clip": 0.01165983, "auxiliary_loss_mlp": 0.01023588, "balance_loss_clip": 1.04763603, "balance_loss_mlp": 1.01652157, "epoch": 0.8032225094691277, "flos": 15450279431040.0, "grad_norm": 1.8340936224207953, "language_loss": 0.7871052, "learning_rate": 3.924690636050225e-07, "loss": 0.80900091, "num_input_tokens_seen": 144140815, "step": 6680, "time_per_iteration": 3.568702459335327 }, { "auxiliary_loss_clip": 0.0116796, "auxiliary_loss_mlp": 0.01027561, "balance_loss_clip": 1.01045561, "balance_loss_mlp": 1.02029538, "epoch": 0.8033427523597667, "flos": 26179202453760.0, "grad_norm": 2.4280659934907542, "language_loss": 0.72776282, "learning_rate": 3.9200573926611915e-07, "loss": 0.74971801, "num_input_tokens_seen": 144162230, "step": 6681, "time_per_iteration": 2.843255043029785 }, { "auxiliary_loss_clip": 0.01167377, "auxiliary_loss_mlp": 0.01028141, "balance_loss_clip": 1.01220596, "balance_loss_mlp": 1.02062464, "epoch": 0.8034629952504058, "flos": 21324905809920.0, "grad_norm": 1.965232543046391, "language_loss": 0.73348677, "learning_rate": 3.9154265885866613e-07, "loss": 0.7554419, "num_input_tokens_seen": 144181540, "step": 6682, "time_per_iteration": 2.6522648334503174 }, { "auxiliary_loss_clip": 0.01166409, "auxiliary_loss_mlp": 0.01023888, "balance_loss_clip": 1.01029181, "balance_loss_mlp": 1.01648533, "epoch": 0.8035832381410449, "flos": 21651585027840.0, "grad_norm": 7.911464510010074, "language_loss": 0.74978495, "learning_rate": 3.9107982245291394e-07, "loss": 0.77168792, "num_input_tokens_seen": 144199665, "step": 6683, "time_per_iteration": 2.6385769844055176 }, { "auxiliary_loss_clip": 0.01167231, "auxiliary_loss_mlp": 0.01024377, "balance_loss_clip": 0.93584436, "balance_loss_mlp": 1.01658106, "epoch": 0.803703481031684, "flos": 20518818744960.0, "grad_norm": 1.8162667127950662, "language_loss": 0.77558559, "learning_rate": 3.9061723011907245e-07, "loss": 0.79750168, "num_input_tokens_seen": 144219020, "step": 6684, "time_per_iteration": 2.7017829418182373 }, { "auxiliary_loss_clip": 0.01163106, "auxiliary_loss_mlp": 0.01025559, "balance_loss_clip": 0.97098982, "balance_loss_mlp": 1.01779544, "epoch": 0.803823723922323, "flos": 22854807838080.0, "grad_norm": 1.850360966515731, "language_loss": 0.79428172, "learning_rate": 3.901548819273179e-07, "loss": 0.81616831, "num_input_tokens_seen": 144239035, "step": 6685, "time_per_iteration": 2.676772356033325 }, { "auxiliary_loss_clip": 0.01170974, "auxiliary_loss_mlp": 0.01024347, "balance_loss_clip": 1.0131762, "balance_loss_mlp": 1.01737344, "epoch": 0.8039439668129622, "flos": 21362145235200.0, "grad_norm": 1.9385405696818618, "language_loss": 0.6916315, "learning_rate": 3.896927779477881e-07, "loss": 0.71358472, "num_input_tokens_seen": 144258295, "step": 6686, "time_per_iteration": 2.6400535106658936 }, { "auxiliary_loss_clip": 0.01165078, "auxiliary_loss_mlp": 0.01029363, "balance_loss_clip": 0.93266773, "balance_loss_mlp": 1.02222276, "epoch": 0.8040642097036013, "flos": 23802382575360.0, "grad_norm": 1.9573836757641176, "language_loss": 0.66993386, "learning_rate": 3.892309182505833e-07, "loss": 0.69187826, "num_input_tokens_seen": 144276110, "step": 6687, "time_per_iteration": 2.7388315200805664 }, { "auxiliary_loss_clip": 0.01167806, "auxiliary_loss_mlp": 0.01029108, "balance_loss_clip": 1.04712844, "balance_loss_mlp": 1.02221477, "epoch": 0.8041844525942403, "flos": 25922046009600.0, "grad_norm": 2.0161439130598646, "language_loss": 0.85798454, "learning_rate": 3.887693029057675e-07, "loss": 0.87995374, "num_input_tokens_seen": 144295620, "step": 6688, "time_per_iteration": 2.6451570987701416 }, { "auxiliary_loss_clip": 0.01168381, "auxiliary_loss_mlp": 0.01023198, "balance_loss_clip": 0.97254097, "balance_loss_mlp": 1.01663852, "epoch": 0.8043046954848795, "flos": 25191120153600.0, "grad_norm": 1.6338643589332176, "language_loss": 0.81047243, "learning_rate": 3.8830793198336684e-07, "loss": 0.83238816, "num_input_tokens_seen": 144315210, "step": 6689, "time_per_iteration": 2.6828465461730957 }, { "auxiliary_loss_clip": 0.01174318, "auxiliary_loss_mlp": 0.01029928, "balance_loss_clip": 1.01055598, "balance_loss_mlp": 1.02305532, "epoch": 0.8044249383755185, "flos": 41719185123840.0, "grad_norm": 1.6349496596993518, "language_loss": 0.70463049, "learning_rate": 3.878468055533721e-07, "loss": 0.72667295, "num_input_tokens_seen": 144337750, "step": 6690, "time_per_iteration": 2.7874815464019775 }, { "auxiliary_loss_clip": 0.0117322, "auxiliary_loss_mlp": 0.01033115, "balance_loss_clip": 0.93552631, "balance_loss_mlp": 1.02518129, "epoch": 0.8045451812661576, "flos": 20631434860800.0, "grad_norm": 2.5301100228110287, "language_loss": 0.84962165, "learning_rate": 3.8738592368573464e-07, "loss": 0.87168509, "num_input_tokens_seen": 144355305, "step": 6691, "time_per_iteration": 2.6692583560943604 }, { "auxiliary_loss_clip": 0.01158184, "auxiliary_loss_mlp": 0.01023641, "balance_loss_clip": 0.93425983, "balance_loss_mlp": 1.01637566, "epoch": 0.8046654241567968, "flos": 29711806254720.0, "grad_norm": 2.097222613331695, "language_loss": 0.8783884, "learning_rate": 3.8692528645037137e-07, "loss": 0.90020663, "num_input_tokens_seen": 144374485, "step": 6692, "time_per_iteration": 2.7222201824188232 }, { "auxiliary_loss_clip": 0.0116938, "auxiliary_loss_mlp": 0.01026609, "balance_loss_clip": 1.05016208, "balance_loss_mlp": 1.01951599, "epoch": 0.8047856670474358, "flos": 17671389851520.0, "grad_norm": 2.1261191646204995, "language_loss": 0.77541512, "learning_rate": 3.8646489391715907e-07, "loss": 0.79737496, "num_input_tokens_seen": 144388780, "step": 6693, "time_per_iteration": 2.602065324783325 }, { "auxiliary_loss_clip": 0.01171332, "auxiliary_loss_mlp": 0.0102883, "balance_loss_clip": 0.97490412, "balance_loss_mlp": 1.02084565, "epoch": 0.8049059099380749, "flos": 17120699464320.0, "grad_norm": 2.7837108146919514, "language_loss": 0.88056421, "learning_rate": 3.8600474615593903e-07, "loss": 0.90256584, "num_input_tokens_seen": 144403395, "step": 6694, "time_per_iteration": 2.687243700027466 }, { "auxiliary_loss_clip": 0.01068417, "auxiliary_loss_mlp": 0.01003205, "balance_loss_clip": 0.89827645, "balance_loss_mlp": 1.00160778, "epoch": 0.805026152828714, "flos": 62212903240320.0, "grad_norm": 0.7951856518868844, "language_loss": 0.59730244, "learning_rate": 3.8554484323651605e-07, "loss": 0.61801869, "num_input_tokens_seen": 144465265, "step": 6695, "time_per_iteration": 3.3152999877929688 }, { "auxiliary_loss_clip": 0.01165716, "auxiliary_loss_mlp": 0.01122733, "balance_loss_clip": 1.01022148, "balance_loss_mlp": 0.0, "epoch": 0.8051463957193531, "flos": 21688608971520.0, "grad_norm": 1.5093112623964748, "language_loss": 0.79282254, "learning_rate": 3.85085185228657e-07, "loss": 0.81570703, "num_input_tokens_seen": 144484235, "step": 6696, "time_per_iteration": 2.6598074436187744 }, { "auxiliary_loss_clip": 0.01158309, "auxiliary_loss_mlp": 0.01028508, "balance_loss_clip": 0.97032475, "balance_loss_mlp": 1.02052093, "epoch": 0.8052666386099921, "flos": 32051458535040.0, "grad_norm": 1.716485859912331, "language_loss": 0.7275492, "learning_rate": 3.8462577220209114e-07, "loss": 0.74941736, "num_input_tokens_seen": 144504610, "step": 6697, "time_per_iteration": 2.8324873447418213 }, { "auxiliary_loss_clip": 0.01060428, "auxiliary_loss_mlp": 0.01002415, "balance_loss_clip": 1.00902891, "balance_loss_mlp": 1.00081778, "epoch": 0.8053868815006313, "flos": 67157875768320.0, "grad_norm": 0.7171793244461608, "language_loss": 0.59077126, "learning_rate": 3.8416660422651127e-07, "loss": 0.61139965, "num_input_tokens_seen": 144574260, "step": 6698, "time_per_iteration": 4.142330646514893 }, { "auxiliary_loss_clip": 0.01166166, "auxiliary_loss_mlp": 0.01025171, "balance_loss_clip": 0.93170428, "balance_loss_mlp": 1.01795328, "epoch": 0.8055071243912704, "flos": 23837000307840.0, "grad_norm": 2.0038529608957005, "language_loss": 0.68153334, "learning_rate": 3.837076813715723e-07, "loss": 0.70344675, "num_input_tokens_seen": 144594145, "step": 6699, "time_per_iteration": 2.772702693939209 }, { "auxiliary_loss_clip": 0.01161166, "auxiliary_loss_mlp": 0.0102817, "balance_loss_clip": 0.93073702, "balance_loss_mlp": 1.02001023, "epoch": 0.8056273672819094, "flos": 21324510760320.0, "grad_norm": 2.1853139984770795, "language_loss": 0.75025558, "learning_rate": 3.832490037068941e-07, "loss": 0.77214897, "num_input_tokens_seen": 144612935, "step": 6700, "time_per_iteration": 2.726444959640503 }, { "auxiliary_loss_clip": 0.01155945, "auxiliary_loss_mlp": 0.01022914, "balance_loss_clip": 0.85505617, "balance_loss_mlp": 1.01554072, "epoch": 0.8057476101725486, "flos": 25768383626880.0, "grad_norm": 1.8257292620104357, "language_loss": 0.76238036, "learning_rate": 3.827905713020554e-07, "loss": 0.78416896, "num_input_tokens_seen": 144630580, "step": 6701, "time_per_iteration": 3.8654799461364746 }, { "auxiliary_loss_clip": 0.01165432, "auxiliary_loss_mlp": 0.01024992, "balance_loss_clip": 0.93107551, "balance_loss_mlp": 1.01695156, "epoch": 0.8058678530631876, "flos": 24535283679360.0, "grad_norm": 1.9251594730853974, "language_loss": 0.68737966, "learning_rate": 3.823323842266017e-07, "loss": 0.70928395, "num_input_tokens_seen": 144649975, "step": 6702, "time_per_iteration": 2.775090217590332 }, { "auxiliary_loss_clip": 0.01169632, "auxiliary_loss_mlp": 0.01024007, "balance_loss_clip": 1.0096426, "balance_loss_mlp": 1.01645541, "epoch": 0.8059880959538267, "flos": 24753728240640.0, "grad_norm": 2.4141683992722895, "language_loss": 0.73159993, "learning_rate": 3.818744425500393e-07, "loss": 0.75353628, "num_input_tokens_seen": 144667990, "step": 6703, "time_per_iteration": 3.668740749359131 }, { "auxiliary_loss_clip": 0.01158256, "auxiliary_loss_mlp": 0.01026572, "balance_loss_clip": 0.9323585, "balance_loss_mlp": 1.01884687, "epoch": 0.8061083388444659, "flos": 22196349671040.0, "grad_norm": 1.8994471660281353, "language_loss": 0.80738932, "learning_rate": 3.8141674634183675e-07, "loss": 0.82923758, "num_input_tokens_seen": 144687020, "step": 6704, "time_per_iteration": 2.7395918369293213 }, { "auxiliary_loss_clip": 0.01160584, "auxiliary_loss_mlp": 0.01021207, "balance_loss_clip": 0.89636195, "balance_loss_mlp": 1.01468909, "epoch": 0.8062285817351049, "flos": 30044195735040.0, "grad_norm": 1.767543248526123, "language_loss": 0.66680831, "learning_rate": 3.809592956714278e-07, "loss": 0.68862623, "num_input_tokens_seen": 144710255, "step": 6705, "time_per_iteration": 2.7560136318206787 }, { "auxiliary_loss_clip": 0.01171508, "auxiliary_loss_mlp": 0.010246, "balance_loss_clip": 1.01190138, "balance_loss_mlp": 1.01770425, "epoch": 0.806348824625744, "flos": 22782591544320.0, "grad_norm": 1.8918969180347558, "language_loss": 0.74925685, "learning_rate": 3.805020906082057e-07, "loss": 0.77121794, "num_input_tokens_seen": 144728830, "step": 6706, "time_per_iteration": 3.6259443759918213 }, { "auxiliary_loss_clip": 0.01170299, "auxiliary_loss_mlp": 0.01029054, "balance_loss_clip": 0.97186816, "balance_loss_mlp": 1.02127552, "epoch": 0.8064690675163831, "flos": 23404600385280.0, "grad_norm": 2.3914714550114247, "language_loss": 0.80649167, "learning_rate": 3.8004513122152917e-07, "loss": 0.82848519, "num_input_tokens_seen": 144747140, "step": 6707, "time_per_iteration": 2.6593005657196045 }, { "auxiliary_loss_clip": 0.01159059, "auxiliary_loss_mlp": 0.01030894, "balance_loss_clip": 0.97420585, "balance_loss_mlp": 1.02405405, "epoch": 0.8065893104070222, "flos": 24060903736320.0, "grad_norm": 1.7795132134507583, "language_loss": 0.67309856, "learning_rate": 3.79588417580718e-07, "loss": 0.69499803, "num_input_tokens_seen": 144765250, "step": 6708, "time_per_iteration": 2.7127256393432617 }, { "auxiliary_loss_clip": 0.01167981, "auxiliary_loss_mlp": 0.01031082, "balance_loss_clip": 1.00911653, "balance_loss_mlp": 1.0243082, "epoch": 0.8067095532976613, "flos": 22305410340480.0, "grad_norm": 1.862553871485776, "language_loss": 0.76402879, "learning_rate": 3.791319497550558e-07, "loss": 0.78601944, "num_input_tokens_seen": 144783080, "step": 6709, "time_per_iteration": 2.690441370010376 }, { "auxiliary_loss_clip": 0.01170535, "auxiliary_loss_mlp": 0.01122451, "balance_loss_clip": 0.93398505, "balance_loss_mlp": 0.0, "epoch": 0.8068297961883004, "flos": 17129498296320.0, "grad_norm": 1.9755177823276675, "language_loss": 0.70780993, "learning_rate": 3.78675727813788e-07, "loss": 0.73073983, "num_input_tokens_seen": 144800645, "step": 6710, "time_per_iteration": 2.667433261871338 }, { "auxiliary_loss_clip": 0.01165585, "auxiliary_loss_mlp": 0.01027732, "balance_loss_clip": 0.9722389, "balance_loss_mlp": 1.020612, "epoch": 0.8069500390789395, "flos": 22018843635840.0, "grad_norm": 1.7222567331670702, "language_loss": 0.73616767, "learning_rate": 3.782197518261225e-07, "loss": 0.75810087, "num_input_tokens_seen": 144820085, "step": 6711, "time_per_iteration": 2.666490077972412 }, { "auxiliary_loss_clip": 0.01170312, "auxiliary_loss_mlp": 0.01027021, "balance_loss_clip": 0.97224063, "balance_loss_mlp": 1.01983309, "epoch": 0.8070702819695785, "flos": 19244241567360.0, "grad_norm": 1.9654732870675307, "language_loss": 0.95232993, "learning_rate": 3.777640218612319e-07, "loss": 0.97430325, "num_input_tokens_seen": 144838070, "step": 6712, "time_per_iteration": 2.6989409923553467 }, { "auxiliary_loss_clip": 0.01163156, "auxiliary_loss_mlp": 0.0102789, "balance_loss_clip": 1.01116228, "balance_loss_mlp": 1.02094018, "epoch": 0.8071905248602176, "flos": 21544320038400.0, "grad_norm": 2.225253272861545, "language_loss": 0.71917987, "learning_rate": 3.773085379882488e-07, "loss": 0.74109036, "num_input_tokens_seen": 144857125, "step": 6713, "time_per_iteration": 2.609726905822754 }, { "auxiliary_loss_clip": 0.01165579, "auxiliary_loss_mlp": 0.01122812, "balance_loss_clip": 1.00742054, "balance_loss_mlp": 0.0, "epoch": 0.8073107677508568, "flos": 37268309105280.0, "grad_norm": 2.6893022720600106, "language_loss": 0.76009321, "learning_rate": 3.768533002762715e-07, "loss": 0.7829771, "num_input_tokens_seen": 144880660, "step": 6714, "time_per_iteration": 2.8002028465270996 }, { "auxiliary_loss_clip": 0.01165155, "auxiliary_loss_mlp": 0.01023853, "balance_loss_clip": 0.96961927, "balance_loss_mlp": 1.01706457, "epoch": 0.8074310106414958, "flos": 28366269759360.0, "grad_norm": 1.6537910408695495, "language_loss": 0.77247906, "learning_rate": 3.763983087943572e-07, "loss": 0.79436916, "num_input_tokens_seen": 144900050, "step": 6715, "time_per_iteration": 2.7326111793518066 }, { "auxiliary_loss_clip": 0.01155208, "auxiliary_loss_mlp": 0.01122343, "balance_loss_clip": 1.00637352, "balance_loss_mlp": 0.0, "epoch": 0.8075512535321349, "flos": 24281646768000.0, "grad_norm": 1.5526467257290273, "language_loss": 0.80976403, "learning_rate": 3.759435636115282e-07, "loss": 0.83253962, "num_input_tokens_seen": 144920835, "step": 6716, "time_per_iteration": 2.7231218814849854 }, { "auxiliary_loss_clip": 0.01160868, "auxiliary_loss_mlp": 0.01122476, "balance_loss_clip": 0.85936522, "balance_loss_mlp": 0.0, "epoch": 0.807671496422774, "flos": 26030855283840.0, "grad_norm": 1.6994872682158237, "language_loss": 0.72993159, "learning_rate": 3.7548906479676967e-07, "loss": 0.75276506, "num_input_tokens_seen": 144940430, "step": 6717, "time_per_iteration": 2.855098247528076 }, { "auxiliary_loss_clip": 0.01168572, "auxiliary_loss_mlp": 0.01027868, "balance_loss_clip": 1.00709128, "balance_loss_mlp": 1.02030969, "epoch": 0.8077917393134131, "flos": 23730740899200.0, "grad_norm": 1.6405519705333327, "language_loss": 0.71619892, "learning_rate": 3.7503481241902855e-07, "loss": 0.73816329, "num_input_tokens_seen": 144960405, "step": 6718, "time_per_iteration": 2.6666877269744873 }, { "auxiliary_loss_clip": 0.01163634, "auxiliary_loss_mlp": 0.01122259, "balance_loss_clip": 0.96991992, "balance_loss_mlp": 0.0, "epoch": 0.8079119822040521, "flos": 18402028398720.0, "grad_norm": 1.8203127166934556, "language_loss": 0.79975998, "learning_rate": 3.745808065472145e-07, "loss": 0.82261896, "num_input_tokens_seen": 144977700, "step": 6719, "time_per_iteration": 2.788407802581787 }, { "auxiliary_loss_clip": 0.01166681, "auxiliary_loss_mlp": 0.01032907, "balance_loss_clip": 1.01420498, "balance_loss_mlp": 1.02596879, "epoch": 0.8080322250946913, "flos": 23621787970560.0, "grad_norm": 1.4678544294700084, "language_loss": 0.76331449, "learning_rate": 3.741270472501994e-07, "loss": 0.78531039, "num_input_tokens_seen": 144998340, "step": 6720, "time_per_iteration": 2.7083935737609863 }, { "auxiliary_loss_clip": 0.01165166, "auxiliary_loss_mlp": 0.01019257, "balance_loss_clip": 0.97424251, "balance_loss_mlp": 1.01260495, "epoch": 0.8081524679853304, "flos": 22820692896000.0, "grad_norm": 1.6940440003831214, "language_loss": 0.72632623, "learning_rate": 3.736735345968183e-07, "loss": 0.7481705, "num_input_tokens_seen": 145017950, "step": 6721, "time_per_iteration": 2.6562983989715576 }, { "auxiliary_loss_clip": 0.01169401, "auxiliary_loss_mlp": 0.01020717, "balance_loss_clip": 1.01100731, "balance_loss_mlp": 1.01421666, "epoch": 0.8082727108759694, "flos": 17640004343040.0, "grad_norm": 1.6364604962140805, "language_loss": 0.78835231, "learning_rate": 3.7322026865586986e-07, "loss": 0.8102535, "num_input_tokens_seen": 145036985, "step": 6722, "time_per_iteration": 2.6506693363189697 }, { "auxiliary_loss_clip": 0.01176776, "auxiliary_loss_mlp": 0.01028563, "balance_loss_clip": 1.01301193, "balance_loss_mlp": 1.02133894, "epoch": 0.8083929537666086, "flos": 25958172113280.0, "grad_norm": 2.1218305299388134, "language_loss": 0.7293188, "learning_rate": 3.7276724949611206e-07, "loss": 0.75137216, "num_input_tokens_seen": 145057095, "step": 6723, "time_per_iteration": 2.6885998249053955 }, { "auxiliary_loss_clip": 0.01167571, "auxiliary_loss_mlp": 0.01031123, "balance_loss_clip": 0.97195137, "balance_loss_mlp": 1.02348804, "epoch": 0.8085131966572476, "flos": 27089178629760.0, "grad_norm": 1.7899963971686093, "language_loss": 0.75017858, "learning_rate": 3.723144771862694e-07, "loss": 0.77216554, "num_input_tokens_seen": 145077735, "step": 6724, "time_per_iteration": 3.7103676795959473 }, { "auxiliary_loss_clip": 0.01168349, "auxiliary_loss_mlp": 0.01022286, "balance_loss_clip": 0.93177795, "balance_loss_mlp": 1.01527095, "epoch": 0.8086334395478867, "flos": 23988543788160.0, "grad_norm": 1.7207730733900237, "language_loss": 0.77002096, "learning_rate": 3.718619517950263e-07, "loss": 0.79192734, "num_input_tokens_seen": 145098330, "step": 6725, "time_per_iteration": 2.812008857727051 }, { "auxiliary_loss_clip": 0.01170077, "auxiliary_loss_mlp": 0.01024671, "balance_loss_clip": 1.05218029, "balance_loss_mlp": 1.01775122, "epoch": 0.8087536824385259, "flos": 20405879406720.0, "grad_norm": 1.8531402938443342, "language_loss": 0.76527965, "learning_rate": 3.714096733910301e-07, "loss": 0.78722709, "num_input_tokens_seen": 145115855, "step": 6726, "time_per_iteration": 2.841285467147827 }, { "auxiliary_loss_clip": 0.011774, "auxiliary_loss_mlp": 0.01031617, "balance_loss_clip": 1.01252639, "balance_loss_mlp": 1.02339745, "epoch": 0.8088739253291649, "flos": 25919639798400.0, "grad_norm": 2.9973602871560083, "language_loss": 0.70630723, "learning_rate": 3.709576420428926e-07, "loss": 0.72839737, "num_input_tokens_seen": 145136655, "step": 6727, "time_per_iteration": 3.8322057723999023 }, { "auxiliary_loss_clip": 0.01166853, "auxiliary_loss_mlp": 0.01026512, "balance_loss_clip": 0.97000933, "balance_loss_mlp": 1.01939559, "epoch": 0.808994168219804, "flos": 28402072640640.0, "grad_norm": 2.002874380931932, "language_loss": 0.73311222, "learning_rate": 3.7050585781918463e-07, "loss": 0.75504589, "num_input_tokens_seen": 145156955, "step": 6728, "time_per_iteration": 2.7949678897857666 }, { "auxiliary_loss_clip": 0.01170592, "auxiliary_loss_mlp": 0.01024718, "balance_loss_clip": 1.00978303, "balance_loss_mlp": 1.01681185, "epoch": 0.8091144111104431, "flos": 17421056991360.0, "grad_norm": 2.428672867341611, "language_loss": 0.68617433, "learning_rate": 3.700543207884428e-07, "loss": 0.70812738, "num_input_tokens_seen": 145173865, "step": 6729, "time_per_iteration": 2.7767210006713867 }, { "auxiliary_loss_clip": 0.01166201, "auxiliary_loss_mlp": 0.01024654, "balance_loss_clip": 1.01153779, "balance_loss_mlp": 1.01774883, "epoch": 0.8092346540010822, "flos": 32153803361280.0, "grad_norm": 1.7151160876199074, "language_loss": 0.71097273, "learning_rate": 3.6960303101916466e-07, "loss": 0.73288125, "num_input_tokens_seen": 145193780, "step": 6730, "time_per_iteration": 3.718794822692871 }, { "auxiliary_loss_clip": 0.01060543, "auxiliary_loss_mlp": 0.01115745, "balance_loss_clip": 1.00898409, "balance_loss_mlp": 0.0, "epoch": 0.8093548968917212, "flos": 58035093390720.0, "grad_norm": 0.7430094257910287, "language_loss": 0.55604953, "learning_rate": 3.6915198857981047e-07, "loss": 0.57781243, "num_input_tokens_seen": 145258980, "step": 6731, "time_per_iteration": 3.228944778442383 }, { "auxiliary_loss_clip": 0.01164117, "auxiliary_loss_mlp": 0.01031833, "balance_loss_clip": 0.93381917, "balance_loss_mlp": 1.02487373, "epoch": 0.8094751397823604, "flos": 27381599251200.0, "grad_norm": 1.6593387470828946, "language_loss": 0.68035394, "learning_rate": 3.687011935388027e-07, "loss": 0.70231342, "num_input_tokens_seen": 145281875, "step": 6732, "time_per_iteration": 3.676525831222534 }, { "auxiliary_loss_clip": 0.01166421, "auxiliary_loss_mlp": 0.01022439, "balance_loss_clip": 1.01000237, "balance_loss_mlp": 1.0154233, "epoch": 0.8095953826729995, "flos": 24061083304320.0, "grad_norm": 1.850273284991831, "language_loss": 0.7293191, "learning_rate": 3.6825064596452646e-07, "loss": 0.75120771, "num_input_tokens_seen": 145302220, "step": 6733, "time_per_iteration": 2.6629679203033447 }, { "auxiliary_loss_clip": 0.01166711, "auxiliary_loss_mlp": 0.0102423, "balance_loss_clip": 1.00946808, "balance_loss_mlp": 1.01756668, "epoch": 0.8097156255636385, "flos": 23951412103680.0, "grad_norm": 1.699539282736508, "language_loss": 0.70608854, "learning_rate": 3.678003459253305e-07, "loss": 0.72799802, "num_input_tokens_seen": 145323070, "step": 6734, "time_per_iteration": 2.684231996536255 }, { "auxiliary_loss_clip": 0.01164135, "auxiliary_loss_mlp": 0.01027561, "balance_loss_clip": 0.93369138, "balance_loss_mlp": 1.02069449, "epoch": 0.8098358684542777, "flos": 21799142098560.0, "grad_norm": 1.815981511826913, "language_loss": 0.74360418, "learning_rate": 3.673502934895236e-07, "loss": 0.76552117, "num_input_tokens_seen": 145342575, "step": 6735, "time_per_iteration": 2.7457919120788574 }, { "auxiliary_loss_clip": 0.01060282, "auxiliary_loss_mlp": 0.01001745, "balance_loss_clip": 1.00891495, "balance_loss_mlp": 1.0001595, "epoch": 0.8099561113449167, "flos": 68809515966720.0, "grad_norm": 0.7066402151735466, "language_loss": 0.57999814, "learning_rate": 3.669004887253802e-07, "loss": 0.60061836, "num_input_tokens_seen": 145408865, "step": 6736, "time_per_iteration": 3.3316733837127686 }, { "auxiliary_loss_clip": 0.01171157, "auxiliary_loss_mlp": 0.01028196, "balance_loss_clip": 0.97441125, "balance_loss_mlp": 1.02169645, "epoch": 0.8100763542355558, "flos": 23586056916480.0, "grad_norm": 1.5738377491889755, "language_loss": 0.78842735, "learning_rate": 3.664509317011335e-07, "loss": 0.81042087, "num_input_tokens_seen": 145429200, "step": 6737, "time_per_iteration": 2.741650342941284 }, { "auxiliary_loss_clip": 0.01167876, "auxiliary_loss_mlp": 0.01024704, "balance_loss_clip": 1.01277614, "balance_loss_mlp": 1.01750422, "epoch": 0.810196597126195, "flos": 31650408207360.0, "grad_norm": 1.7945582637146353, "language_loss": 0.74165225, "learning_rate": 3.6600162248498134e-07, "loss": 0.76357806, "num_input_tokens_seen": 145452830, "step": 6738, "time_per_iteration": 2.777409791946411 }, { "auxiliary_loss_clip": 0.0114218, "auxiliary_loss_mlp": 0.01022593, "balance_loss_clip": 0.85351163, "balance_loss_mlp": 1.01614118, "epoch": 0.810316840016834, "flos": 24900459298560.0, "grad_norm": 1.7303999900962486, "language_loss": 0.75998694, "learning_rate": 3.6555256114508426e-07, "loss": 0.78163463, "num_input_tokens_seen": 145472625, "step": 6739, "time_per_iteration": 2.823646306991577 }, { "auxiliary_loss_clip": 0.01162491, "auxiliary_loss_mlp": 0.01026595, "balance_loss_clip": 0.96692044, "balance_loss_mlp": 1.01978469, "epoch": 0.8104370829074731, "flos": 27965003950080.0, "grad_norm": 1.8122774016739207, "language_loss": 0.73216093, "learning_rate": 3.651037477495642e-07, "loss": 0.75405174, "num_input_tokens_seen": 145494075, "step": 6740, "time_per_iteration": 2.7223169803619385 }, { "auxiliary_loss_clip": 0.01167091, "auxiliary_loss_mlp": 0.01027154, "balance_loss_clip": 1.04630888, "balance_loss_mlp": 1.01966715, "epoch": 0.8105573257981122, "flos": 24640752988800.0, "grad_norm": 1.809856459264776, "language_loss": 0.68161559, "learning_rate": 3.6465518236650584e-07, "loss": 0.70355809, "num_input_tokens_seen": 145514220, "step": 6741, "time_per_iteration": 2.633488178253174 }, { "auxiliary_loss_clip": 0.01160257, "auxiliary_loss_mlp": 0.01025937, "balance_loss_clip": 0.93080831, "balance_loss_mlp": 1.01880264, "epoch": 0.8106775686887513, "flos": 26358935132160.0, "grad_norm": 1.8550387992686854, "language_loss": 0.78207839, "learning_rate": 3.642068650639558e-07, "loss": 0.8039403, "num_input_tokens_seen": 145533965, "step": 6742, "time_per_iteration": 2.7522175312042236 }, { "auxiliary_loss_clip": 0.0115382, "auxiliary_loss_mlp": 0.01024446, "balance_loss_clip": 0.96587348, "balance_loss_mlp": 1.01763594, "epoch": 0.8107978115793903, "flos": 27271892136960.0, "grad_norm": 1.7296896921069205, "language_loss": 0.64262438, "learning_rate": 3.6375879590992334e-07, "loss": 0.66440701, "num_input_tokens_seen": 145554310, "step": 6743, "time_per_iteration": 2.7426490783691406 }, { "auxiliary_loss_clip": 0.01161261, "auxiliary_loss_mlp": 0.01024971, "balance_loss_clip": 0.97185349, "balance_loss_mlp": 1.01765418, "epoch": 0.8109180544700295, "flos": 24934322845440.0, "grad_norm": 1.6785299441216637, "language_loss": 0.80848086, "learning_rate": 3.6331097497238173e-07, "loss": 0.83034319, "num_input_tokens_seen": 145573755, "step": 6744, "time_per_iteration": 2.6685824394226074 }, { "auxiliary_loss_clip": 0.01161836, "auxiliary_loss_mlp": 0.0102743, "balance_loss_clip": 0.93112516, "balance_loss_mlp": 1.02062571, "epoch": 0.8110382973606686, "flos": 21105383840640.0, "grad_norm": 2.2768324881118476, "language_loss": 0.79795885, "learning_rate": 3.628634023192627e-07, "loss": 0.81985152, "num_input_tokens_seen": 145594000, "step": 6745, "time_per_iteration": 2.733832597732544 }, { "auxiliary_loss_clip": 0.01168554, "auxiliary_loss_mlp": 0.01025768, "balance_loss_clip": 1.00965452, "balance_loss_mlp": 1.0178411, "epoch": 0.8111585402513076, "flos": 15414081500160.0, "grad_norm": 2.2145337864173493, "language_loss": 0.75340796, "learning_rate": 3.624160780184644e-07, "loss": 0.77535117, "num_input_tokens_seen": 145611215, "step": 6746, "time_per_iteration": 2.586454391479492 }, { "auxiliary_loss_clip": 0.01158084, "auxiliary_loss_mlp": 0.01023514, "balance_loss_clip": 0.96898699, "balance_loss_mlp": 1.01612937, "epoch": 0.8112787831419467, "flos": 24095736950400.0, "grad_norm": 1.769352561746613, "language_loss": 0.74516439, "learning_rate": 3.6196900213784496e-07, "loss": 0.76698041, "num_input_tokens_seen": 145630530, "step": 6747, "time_per_iteration": 2.751544952392578 }, { "auxiliary_loss_clip": 0.01167709, "auxiliary_loss_mlp": 0.01025835, "balance_loss_clip": 1.0099597, "balance_loss_mlp": 1.01934731, "epoch": 0.8113990260325858, "flos": 20483374999680.0, "grad_norm": 1.8319557928780401, "language_loss": 0.86868721, "learning_rate": 3.6152217474522527e-07, "loss": 0.89062262, "num_input_tokens_seen": 145647345, "step": 6748, "time_per_iteration": 2.6261563301086426 }, { "auxiliary_loss_clip": 0.01168212, "auxiliary_loss_mlp": 0.01024977, "balance_loss_clip": 1.01252019, "balance_loss_mlp": 1.01802993, "epoch": 0.8115192689232249, "flos": 24901141656960.0, "grad_norm": 1.8617908185283607, "language_loss": 0.72397798, "learning_rate": 3.6107559590838975e-07, "loss": 0.74590987, "num_input_tokens_seen": 145666330, "step": 6749, "time_per_iteration": 2.6344716548919678 }, { "auxiliary_loss_clip": 0.01155735, "auxiliary_loss_mlp": 0.01029483, "balance_loss_clip": 0.85388339, "balance_loss_mlp": 1.0223155, "epoch": 0.811639511813864, "flos": 24057204635520.0, "grad_norm": 2.1703213965881476, "language_loss": 0.66129321, "learning_rate": 3.606292656950822e-07, "loss": 0.6831454, "num_input_tokens_seen": 145684740, "step": 6750, "time_per_iteration": 3.7488105297088623 }, { "auxiliary_loss_clip": 0.011582, "auxiliary_loss_mlp": 0.01026181, "balance_loss_clip": 0.96784222, "balance_loss_mlp": 1.01847434, "epoch": 0.8117597547045031, "flos": 23185150243200.0, "grad_norm": 2.1960470198239848, "language_loss": 0.86564165, "learning_rate": 3.601831841730121e-07, "loss": 0.88748544, "num_input_tokens_seen": 145702660, "step": 6751, "time_per_iteration": 2.6780142784118652 }, { "auxiliary_loss_clip": 0.01168208, "auxiliary_loss_mlp": 0.01024718, "balance_loss_clip": 1.01154673, "balance_loss_mlp": 1.01761925, "epoch": 0.8118799975951422, "flos": 23040250778880.0, "grad_norm": 1.781926872310133, "language_loss": 0.72888017, "learning_rate": 3.5973735140984916e-07, "loss": 0.75080943, "num_input_tokens_seen": 145722830, "step": 6752, "time_per_iteration": 2.702434539794922 }, { "auxiliary_loss_clip": 0.0115924, "auxiliary_loss_mlp": 0.01122591, "balance_loss_clip": 0.89270461, "balance_loss_mlp": 0.0, "epoch": 0.8120002404857812, "flos": 24639962889600.0, "grad_norm": 2.1486226949935063, "language_loss": 0.79347479, "learning_rate": 3.5929176747322607e-07, "loss": 0.81629312, "num_input_tokens_seen": 145741935, "step": 6753, "time_per_iteration": 3.8436806201934814 }, { "auxiliary_loss_clip": 0.01068675, "auxiliary_loss_mlp": 0.0100227, "balance_loss_clip": 0.9349761, "balance_loss_mlp": 1.00058901, "epoch": 0.8121204833764204, "flos": 57415742156160.0, "grad_norm": 0.8103684919255477, "language_loss": 0.56282985, "learning_rate": 3.588464324307372e-07, "loss": 0.58353931, "num_input_tokens_seen": 145805560, "step": 6754, "time_per_iteration": 3.3665242195129395 }, { "auxiliary_loss_clip": 0.01166611, "auxiliary_loss_mlp": 0.01025085, "balance_loss_clip": 1.00774157, "balance_loss_mlp": 1.01820064, "epoch": 0.8122407262670595, "flos": 19464589549440.0, "grad_norm": 5.971296251549588, "language_loss": 0.75501835, "learning_rate": 3.584013463499391e-07, "loss": 0.77693534, "num_input_tokens_seen": 145824180, "step": 6755, "time_per_iteration": 3.7537357807159424 }, { "auxiliary_loss_clip": 0.01066945, "auxiliary_loss_mlp": 0.01002392, "balance_loss_clip": 0.9358753, "balance_loss_mlp": 1.00083029, "epoch": 0.8123609691576985, "flos": 56425325472000.0, "grad_norm": 0.7470544702810701, "language_loss": 0.64475209, "learning_rate": 3.579565092983521e-07, "loss": 0.66544539, "num_input_tokens_seen": 145885300, "step": 6756, "time_per_iteration": 3.2175889015197754 }, { "auxiliary_loss_clip": 0.01167209, "auxiliary_loss_mlp": 0.01022337, "balance_loss_clip": 1.04811156, "balance_loss_mlp": 1.01539278, "epoch": 0.8124812120483377, "flos": 20631973564800.0, "grad_norm": 1.9025801443963357, "language_loss": 0.8380636, "learning_rate": 3.575119213434565e-07, "loss": 0.85995907, "num_input_tokens_seen": 145903815, "step": 6757, "time_per_iteration": 2.672542095184326 }, { "auxiliary_loss_clip": 0.01166551, "auxiliary_loss_mlp": 0.01024582, "balance_loss_clip": 1.01254749, "balance_loss_mlp": 1.01737833, "epoch": 0.8126014549389767, "flos": 22492397566080.0, "grad_norm": 1.6503448193001446, "language_loss": 0.81730461, "learning_rate": 3.5706758255269765e-07, "loss": 0.83921593, "num_input_tokens_seen": 145922270, "step": 6758, "time_per_iteration": 3.776110887527466 }, { "auxiliary_loss_clip": 0.01170976, "auxiliary_loss_mlp": 0.01024652, "balance_loss_clip": 0.97328973, "balance_loss_mlp": 1.01763082, "epoch": 0.8127216978296158, "flos": 23287961946240.0, "grad_norm": 1.5205537426651505, "language_loss": 0.69384325, "learning_rate": 3.566234929934795e-07, "loss": 0.71579951, "num_input_tokens_seen": 145941470, "step": 6759, "time_per_iteration": 2.7277328968048096 }, { "auxiliary_loss_clip": 0.01168166, "auxiliary_loss_mlp": 0.01022457, "balance_loss_clip": 1.01278663, "balance_loss_mlp": 1.01519978, "epoch": 0.812841940720255, "flos": 25154994049920.0, "grad_norm": 1.5108519425330618, "language_loss": 0.71693587, "learning_rate": 3.561796527331706e-07, "loss": 0.73884207, "num_input_tokens_seen": 145963145, "step": 6760, "time_per_iteration": 2.7254841327667236 }, { "auxiliary_loss_clip": 0.01165585, "auxiliary_loss_mlp": 0.0102008, "balance_loss_clip": 0.93446851, "balance_loss_mlp": 1.01279676, "epoch": 0.812962183610894, "flos": 26648446752000.0, "grad_norm": 1.8685555480322291, "language_loss": 0.7720207, "learning_rate": 3.5573606183910163e-07, "loss": 0.7938773, "num_input_tokens_seen": 145983150, "step": 6761, "time_per_iteration": 2.862213611602783 }, { "auxiliary_loss_clip": 0.01171463, "auxiliary_loss_mlp": 0.01021533, "balance_loss_clip": 1.00774646, "balance_loss_mlp": 1.01436841, "epoch": 0.8130824265015331, "flos": 24966965329920.0, "grad_norm": 1.7324918678432193, "language_loss": 0.78442979, "learning_rate": 3.5529272037856493e-07, "loss": 0.80635983, "num_input_tokens_seen": 146001365, "step": 6762, "time_per_iteration": 2.7321131229400635 }, { "auxiliary_loss_clip": 0.0107897, "auxiliary_loss_mlp": 0.01002888, "balance_loss_clip": 0.82405841, "balance_loss_mlp": 1.00112391, "epoch": 0.8132026693921722, "flos": 67622918175360.0, "grad_norm": 0.7073576842956038, "language_loss": 0.53850973, "learning_rate": 3.548496284188149e-07, "loss": 0.55932832, "num_input_tokens_seen": 146061570, "step": 6763, "time_per_iteration": 3.392505168914795 }, { "auxiliary_loss_clip": 0.01157404, "auxiliary_loss_mlp": 0.01025546, "balance_loss_clip": 0.89628494, "balance_loss_mlp": 1.01878631, "epoch": 0.8133229122828113, "flos": 19495149045120.0, "grad_norm": 1.8055977402258256, "language_loss": 0.79200077, "learning_rate": 3.544067860270681e-07, "loss": 0.81383026, "num_input_tokens_seen": 146079145, "step": 6764, "time_per_iteration": 2.75618314743042 }, { "auxiliary_loss_clip": 0.01171014, "auxiliary_loss_mlp": 0.01025225, "balance_loss_clip": 0.935583, "balance_loss_mlp": 1.01788783, "epoch": 0.8134431551734503, "flos": 20668135582080.0, "grad_norm": 1.6622291730444294, "language_loss": 0.70847112, "learning_rate": 3.539641932705029e-07, "loss": 0.73043352, "num_input_tokens_seen": 146097625, "step": 6765, "time_per_iteration": 2.765577793121338 }, { "auxiliary_loss_clip": 0.01172692, "auxiliary_loss_mlp": 0.01027559, "balance_loss_clip": 1.04935193, "balance_loss_mlp": 1.01999569, "epoch": 0.8135633980640895, "flos": 21507332008320.0, "grad_norm": 2.2418769109441428, "language_loss": 0.76949096, "learning_rate": 3.53521850216262e-07, "loss": 0.79149354, "num_input_tokens_seen": 146117195, "step": 6766, "time_per_iteration": 2.784675359725952 }, { "auxiliary_loss_clip": 0.01168581, "auxiliary_loss_mlp": 0.0102622, "balance_loss_clip": 1.04840446, "balance_loss_mlp": 1.01884687, "epoch": 0.8136836409547286, "flos": 20554442058240.0, "grad_norm": 1.7925038244567306, "language_loss": 0.76692253, "learning_rate": 3.530797569314461e-07, "loss": 0.78887051, "num_input_tokens_seen": 146136220, "step": 6767, "time_per_iteration": 2.65797758102417 }, { "auxiliary_loss_clip": 0.01168668, "auxiliary_loss_mlp": 0.01025331, "balance_loss_clip": 1.04889703, "balance_loss_mlp": 1.0182023, "epoch": 0.8138038838453676, "flos": 20299045380480.0, "grad_norm": 2.9046360496756627, "language_loss": 0.77604139, "learning_rate": 3.5263791348312235e-07, "loss": 0.79798138, "num_input_tokens_seen": 146155415, "step": 6768, "time_per_iteration": 2.7295093536376953 }, { "auxiliary_loss_clip": 0.0116501, "auxiliary_loss_mlp": 0.01023096, "balance_loss_clip": 0.97010374, "balance_loss_mlp": 1.01593411, "epoch": 0.8139241267360068, "flos": 29789840551680.0, "grad_norm": 4.86135525009975, "language_loss": 0.70392853, "learning_rate": 3.521963199383171e-07, "loss": 0.72580957, "num_input_tokens_seen": 146178370, "step": 6769, "time_per_iteration": 2.755000591278076 }, { "auxiliary_loss_clip": 0.01160193, "auxiliary_loss_mlp": 0.01023559, "balance_loss_clip": 0.89316833, "balance_loss_mlp": 1.01608467, "epoch": 0.8140443696266458, "flos": 19713270384000.0, "grad_norm": 2.392095819835577, "language_loss": 0.77085483, "learning_rate": 3.517549763640197e-07, "loss": 0.79269236, "num_input_tokens_seen": 146196010, "step": 6770, "time_per_iteration": 2.773329257965088 }, { "auxiliary_loss_clip": 0.01164786, "auxiliary_loss_mlp": 0.01122051, "balance_loss_clip": 1.01233613, "balance_loss_mlp": 0.0, "epoch": 0.8141646125172849, "flos": 27160568910720.0, "grad_norm": 2.0351958583879672, "language_loss": 0.71344054, "learning_rate": 3.513138828271829e-07, "loss": 0.73630893, "num_input_tokens_seen": 146215880, "step": 6771, "time_per_iteration": 2.707880973815918 }, { "auxiliary_loss_clip": 0.01159771, "auxiliary_loss_mlp": 0.01023986, "balance_loss_clip": 0.93202209, "balance_loss_mlp": 1.01677346, "epoch": 0.8142848554079241, "flos": 39673102700160.0, "grad_norm": 1.9237565298366737, "language_loss": 0.70382607, "learning_rate": 3.508730393947179e-07, "loss": 0.7256636, "num_input_tokens_seen": 146239135, "step": 6772, "time_per_iteration": 2.875016927719116 }, { "auxiliary_loss_clip": 0.01157412, "auxiliary_loss_mlp": 0.01021327, "balance_loss_clip": 0.9314872, "balance_loss_mlp": 1.0147016, "epoch": 0.8144050982985631, "flos": 22237288197120.0, "grad_norm": 1.6968385585342232, "language_loss": 0.72017533, "learning_rate": 3.504324461335024e-07, "loss": 0.74196267, "num_input_tokens_seen": 146259245, "step": 6773, "time_per_iteration": 2.7457525730133057 }, { "auxiliary_loss_clip": 0.01155272, "auxiliary_loss_mlp": 0.01031505, "balance_loss_clip": 0.89352071, "balance_loss_mlp": 1.02385807, "epoch": 0.8145253411892022, "flos": 23038239617280.0, "grad_norm": 1.6930381876213516, "language_loss": 0.88208842, "learning_rate": 3.499921031103732e-07, "loss": 0.90395617, "num_input_tokens_seen": 146280015, "step": 6774, "time_per_iteration": 2.8781838417053223 }, { "auxiliary_loss_clip": 0.01173466, "auxiliary_loss_mlp": 0.01023852, "balance_loss_clip": 0.93113971, "balance_loss_mlp": 1.01669359, "epoch": 0.8146455840798413, "flos": 24827668387200.0, "grad_norm": 1.7702322312319132, "language_loss": 0.78264773, "learning_rate": 3.4955201039212987e-07, "loss": 0.80462098, "num_input_tokens_seen": 146300935, "step": 6775, "time_per_iteration": 2.7453866004943848 }, { "auxiliary_loss_clip": 0.01171273, "auxiliary_loss_mlp": 0.01027065, "balance_loss_clip": 1.00989723, "balance_loss_mlp": 1.01983833, "epoch": 0.8147658269704804, "flos": 19974520978560.0, "grad_norm": 1.730166335682321, "language_loss": 0.65162992, "learning_rate": 3.4911216804553465e-07, "loss": 0.67361331, "num_input_tokens_seen": 146319835, "step": 6776, "time_per_iteration": 3.4719507694244385 }, { "auxiliary_loss_clip": 0.01162368, "auxiliary_loss_mlp": 0.01035129, "balance_loss_clip": 0.97037327, "balance_loss_mlp": 1.02740455, "epoch": 0.8148860698611194, "flos": 21178031097600.0, "grad_norm": 2.81354032218232, "language_loss": 0.70475703, "learning_rate": 3.4867257613731017e-07, "loss": 0.72673202, "num_input_tokens_seen": 146339030, "step": 6777, "time_per_iteration": 2.6998963356018066 }, { "auxiliary_loss_clip": 0.01167746, "auxiliary_loss_mlp": 0.01028057, "balance_loss_clip": 0.97190118, "balance_loss_mlp": 1.02114296, "epoch": 0.8150063127517585, "flos": 19606903234560.0, "grad_norm": 1.676117606377304, "language_loss": 0.85937911, "learning_rate": 3.4823323473414343e-07, "loss": 0.88133717, "num_input_tokens_seen": 146358550, "step": 6778, "time_per_iteration": 2.6733970642089844 }, { "auxiliary_loss_clip": 0.01171513, "auxiliary_loss_mlp": 0.01030293, "balance_loss_clip": 0.93291533, "balance_loss_mlp": 1.02246451, "epoch": 0.8151265556423977, "flos": 22638374438400.0, "grad_norm": 3.966985113470793, "language_loss": 0.76162934, "learning_rate": 3.477941439026812e-07, "loss": 0.78364742, "num_input_tokens_seen": 146376770, "step": 6779, "time_per_iteration": 3.741150379180908 }, { "auxiliary_loss_clip": 0.01165703, "auxiliary_loss_mlp": 0.01023075, "balance_loss_clip": 0.97217602, "balance_loss_mlp": 1.01629448, "epoch": 0.8152467985330367, "flos": 17968048277760.0, "grad_norm": 1.8417191434933802, "language_loss": 0.7302357, "learning_rate": 3.473553037095349e-07, "loss": 0.75212348, "num_input_tokens_seen": 146395795, "step": 6780, "time_per_iteration": 2.6061089038848877 }, { "auxiliary_loss_clip": 0.01157358, "auxiliary_loss_mlp": 0.01025217, "balance_loss_clip": 0.96922421, "balance_loss_mlp": 1.01863694, "epoch": 0.8153670414236758, "flos": 24969012405120.0, "grad_norm": 1.9135106406031808, "language_loss": 0.83465534, "learning_rate": 3.469167142212743e-07, "loss": 0.85648108, "num_input_tokens_seen": 146417640, "step": 6781, "time_per_iteration": 2.748922109603882 }, { "auxiliary_loss_clip": 0.01168406, "auxiliary_loss_mlp": 0.01029452, "balance_loss_clip": 1.01009083, "balance_loss_mlp": 1.02153111, "epoch": 0.8154872843143149, "flos": 31066069754880.0, "grad_norm": 2.3732176664538494, "language_loss": 0.63275492, "learning_rate": 3.4647837550443337e-07, "loss": 0.65473348, "num_input_tokens_seen": 146436205, "step": 6782, "time_per_iteration": 3.724184274673462 }, { "auxiliary_loss_clip": 0.01164439, "auxiliary_loss_mlp": 0.01023476, "balance_loss_clip": 0.93192911, "balance_loss_mlp": 1.01627314, "epoch": 0.815607527204954, "flos": 19391654983680.0, "grad_norm": 1.8493092959311093, "language_loss": 0.74659455, "learning_rate": 3.460402876255086e-07, "loss": 0.76847374, "num_input_tokens_seen": 146453595, "step": 6783, "time_per_iteration": 2.7428853511810303 }, { "auxiliary_loss_clip": 0.01171309, "auxiliary_loss_mlp": 0.01024419, "balance_loss_clip": 1.01093936, "balance_loss_mlp": 1.01720715, "epoch": 0.815727770095593, "flos": 26140418743680.0, "grad_norm": 2.261552826693949, "language_loss": 0.71666878, "learning_rate": 3.456024506509574e-07, "loss": 0.73862612, "num_input_tokens_seen": 146474515, "step": 6784, "time_per_iteration": 3.5940537452697754 }, { "auxiliary_loss_clip": 0.01169345, "auxiliary_loss_mlp": 0.01122497, "balance_loss_clip": 1.01282585, "balance_loss_mlp": 0.0, "epoch": 0.8158480129862322, "flos": 25337527989120.0, "grad_norm": 1.5253810597962931, "language_loss": 0.73960471, "learning_rate": 3.4516486464719873e-07, "loss": 0.76252317, "num_input_tokens_seen": 146493905, "step": 6785, "time_per_iteration": 2.6840500831604004 }, { "auxiliary_loss_clip": 0.01156526, "auxiliary_loss_mlp": 0.01024066, "balance_loss_clip": 0.89409983, "balance_loss_mlp": 1.01698208, "epoch": 0.8159682558768713, "flos": 34423645559040.0, "grad_norm": 1.5662576903572605, "language_loss": 0.62229311, "learning_rate": 3.4472752968061445e-07, "loss": 0.64409906, "num_input_tokens_seen": 146518335, "step": 6786, "time_per_iteration": 2.8451220989227295 }, { "auxiliary_loss_clip": 0.01168029, "auxiliary_loss_mlp": 0.01024207, "balance_loss_clip": 1.00959349, "balance_loss_mlp": 1.01698911, "epoch": 0.8160884987675103, "flos": 18653223185280.0, "grad_norm": 2.0710229511203586, "language_loss": 0.73799181, "learning_rate": 3.442904458175475e-07, "loss": 0.75991416, "num_input_tokens_seen": 146535655, "step": 6787, "time_per_iteration": 2.6027843952178955 }, { "auxiliary_loss_clip": 0.01163743, "auxiliary_loss_mlp": 0.01028407, "balance_loss_clip": 1.0086242, "balance_loss_mlp": 1.02126002, "epoch": 0.8162087416581495, "flos": 31430527102080.0, "grad_norm": 1.5019576184588146, "language_loss": 0.76089507, "learning_rate": 3.438536131243044e-07, "loss": 0.78281665, "num_input_tokens_seen": 146556815, "step": 6788, "time_per_iteration": 2.7161431312561035 }, { "auxiliary_loss_clip": 0.01169021, "auxiliary_loss_mlp": 0.01022718, "balance_loss_clip": 0.97086751, "balance_loss_mlp": 1.01502323, "epoch": 0.8163289845487885, "flos": 37593910915200.0, "grad_norm": 2.0840281721853002, "language_loss": 0.62180728, "learning_rate": 3.434170316671503e-07, "loss": 0.64372468, "num_input_tokens_seen": 146581845, "step": 6789, "time_per_iteration": 2.7980823516845703 }, { "auxiliary_loss_clip": 0.01161792, "auxiliary_loss_mlp": 0.01024, "balance_loss_clip": 0.93678868, "balance_loss_mlp": 1.01705563, "epoch": 0.8164492274394276, "flos": 13953989554560.0, "grad_norm": 2.235224925740119, "language_loss": 0.89590716, "learning_rate": 3.4298070151231583e-07, "loss": 0.91776514, "num_input_tokens_seen": 146597245, "step": 6790, "time_per_iteration": 2.968857526779175 }, { "auxiliary_loss_clip": 0.0116985, "auxiliary_loss_mlp": 0.01025914, "balance_loss_clip": 0.9712801, "balance_loss_mlp": 1.01884234, "epoch": 0.8165694703300668, "flos": 28986554747520.0, "grad_norm": 2.4121496706501673, "language_loss": 0.59750229, "learning_rate": 3.425446227259916e-07, "loss": 0.61945999, "num_input_tokens_seen": 146618210, "step": 6791, "time_per_iteration": 2.7386744022369385 }, { "auxiliary_loss_clip": 0.0116485, "auxiliary_loss_mlp": 0.01028307, "balance_loss_clip": 0.96946025, "balance_loss_mlp": 1.02137232, "epoch": 0.8166897132207058, "flos": 25118365155840.0, "grad_norm": 1.7797539560768543, "language_loss": 0.82284397, "learning_rate": 3.421087953743296e-07, "loss": 0.84477556, "num_input_tokens_seen": 146637975, "step": 6792, "time_per_iteration": 2.694910764694214 }, { "auxiliary_loss_clip": 0.01164395, "auxiliary_loss_mlp": 0.01025288, "balance_loss_clip": 1.0066309, "balance_loss_mlp": 1.01780725, "epoch": 0.8168099561113449, "flos": 23148593176320.0, "grad_norm": 2.003650987807286, "language_loss": 0.80269927, "learning_rate": 3.416732195234464e-07, "loss": 0.82459611, "num_input_tokens_seen": 146658030, "step": 6793, "time_per_iteration": 2.705275297164917 }, { "auxiliary_loss_clip": 0.0117032, "auxiliary_loss_mlp": 0.01023452, "balance_loss_clip": 1.01040125, "balance_loss_mlp": 1.01662111, "epoch": 0.816930199001984, "flos": 18407666833920.0, "grad_norm": 1.4631155413674697, "language_loss": 0.79262835, "learning_rate": 3.4123789523941613e-07, "loss": 0.81456602, "num_input_tokens_seen": 146677855, "step": 6794, "time_per_iteration": 2.6936464309692383 }, { "auxiliary_loss_clip": 0.01158736, "auxiliary_loss_mlp": 0.0103056, "balance_loss_clip": 1.00619066, "balance_loss_mlp": 1.02340722, "epoch": 0.8170504418926231, "flos": 21251324799360.0, "grad_norm": 1.5322099257360045, "language_loss": 0.63399065, "learning_rate": 3.4080282258827884e-07, "loss": 0.65588361, "num_input_tokens_seen": 146696230, "step": 6795, "time_per_iteration": 2.810299873352051 }, { "auxiliary_loss_clip": 0.01169806, "auxiliary_loss_mlp": 0.01027478, "balance_loss_clip": 1.01089001, "balance_loss_mlp": 1.02006269, "epoch": 0.8171706847832622, "flos": 19099234362240.0, "grad_norm": 1.971587925797797, "language_loss": 0.72492158, "learning_rate": 3.403680016360342e-07, "loss": 0.74689436, "num_input_tokens_seen": 146714835, "step": 6796, "time_per_iteration": 2.6838529109954834 }, { "auxiliary_loss_clip": 0.01161416, "auxiliary_loss_mlp": 0.01023077, "balance_loss_clip": 1.01144099, "balance_loss_mlp": 1.01565588, "epoch": 0.8172909276739013, "flos": 21470128496640.0, "grad_norm": 1.4531091555098385, "language_loss": 0.67540509, "learning_rate": 3.3993343244864403e-07, "loss": 0.69725001, "num_input_tokens_seen": 146734425, "step": 6797, "time_per_iteration": 2.674117088317871 }, { "auxiliary_loss_clip": 0.01164906, "auxiliary_loss_mlp": 0.01020103, "balance_loss_clip": 1.00974, "balance_loss_mlp": 1.01344216, "epoch": 0.8174111705645404, "flos": 27599792417280.0, "grad_norm": 3.877060258436539, "language_loss": 0.73039901, "learning_rate": 3.394991150920323e-07, "loss": 0.75224912, "num_input_tokens_seen": 146757545, "step": 6798, "time_per_iteration": 2.782921552658081 }, { "auxiliary_loss_clip": 0.01163279, "auxiliary_loss_mlp": 0.01123409, "balance_loss_clip": 0.89339471, "balance_loss_mlp": 0.0, "epoch": 0.8175314134551794, "flos": 14064594508800.0, "grad_norm": 17.481572589516535, "language_loss": 0.74303573, "learning_rate": 3.3906504963208396e-07, "loss": 0.76590264, "num_input_tokens_seen": 146774240, "step": 6799, "time_per_iteration": 2.7623250484466553 }, { "auxiliary_loss_clip": 0.01159283, "auxiliary_loss_mlp": 0.01025933, "balance_loss_clip": 0.89708382, "balance_loss_mlp": 1.01862884, "epoch": 0.8176516563458186, "flos": 22708076780160.0, "grad_norm": 1.717586216529441, "language_loss": 0.66559702, "learning_rate": 3.3863123613464774e-07, "loss": 0.68744916, "num_input_tokens_seen": 146793140, "step": 6800, "time_per_iteration": 2.7629919052124023 }, { "auxiliary_loss_clip": 0.01164836, "auxiliary_loss_mlp": 0.01024511, "balance_loss_clip": 0.96727502, "balance_loss_mlp": 1.01770711, "epoch": 0.8177718992364577, "flos": 21945406279680.0, "grad_norm": 4.209230718229438, "language_loss": 0.75205654, "learning_rate": 3.381976746655317e-07, "loss": 0.77394998, "num_input_tokens_seen": 146812895, "step": 6801, "time_per_iteration": 2.9145302772521973 }, { "auxiliary_loss_clip": 0.01159116, "auxiliary_loss_mlp": 0.01024544, "balance_loss_clip": 0.89727211, "balance_loss_mlp": 1.01799369, "epoch": 0.8178921421270967, "flos": 22017443005440.0, "grad_norm": 2.531987944887198, "language_loss": 0.67236632, "learning_rate": 3.3776436529050756e-07, "loss": 0.6942029, "num_input_tokens_seen": 146832445, "step": 6802, "time_per_iteration": 3.7219769954681396 }, { "auxiliary_loss_clip": 0.01163741, "auxiliary_loss_mlp": 0.01027836, "balance_loss_clip": 1.04681551, "balance_loss_mlp": 1.02081418, "epoch": 0.8180123850177359, "flos": 33183111496320.0, "grad_norm": 1.992957085338644, "language_loss": 0.72426009, "learning_rate": 3.373313080753073e-07, "loss": 0.74617589, "num_input_tokens_seen": 146856505, "step": 6803, "time_per_iteration": 2.7083470821380615 }, { "auxiliary_loss_clip": 0.01158057, "auxiliary_loss_mlp": 0.01025729, "balance_loss_clip": 1.00596261, "balance_loss_mlp": 1.01882124, "epoch": 0.8181326279083749, "flos": 22091167670400.0, "grad_norm": 1.4663206581177832, "language_loss": 0.77671719, "learning_rate": 3.3689850308562527e-07, "loss": 0.79855508, "num_input_tokens_seen": 146876950, "step": 6804, "time_per_iteration": 2.7929482460021973 }, { "auxiliary_loss_clip": 0.01158635, "auxiliary_loss_mlp": 0.01029173, "balance_loss_clip": 0.89825296, "balance_loss_mlp": 1.02209473, "epoch": 0.818252870799014, "flos": 15705747936000.0, "grad_norm": 1.7860153061977977, "language_loss": 0.77613431, "learning_rate": 3.364659503871183e-07, "loss": 0.79801238, "num_input_tokens_seen": 146894885, "step": 6805, "time_per_iteration": 2.699820041656494 }, { "auxiliary_loss_clip": 0.01157554, "auxiliary_loss_mlp": 0.01023323, "balance_loss_clip": 0.92973858, "balance_loss_mlp": 1.01638794, "epoch": 0.8183731136896532, "flos": 18770687637120.0, "grad_norm": 2.352740183405966, "language_loss": 0.83855319, "learning_rate": 3.3603365004540417e-07, "loss": 0.86036193, "num_input_tokens_seen": 146913180, "step": 6806, "time_per_iteration": 3.703303575515747 }, { "auxiliary_loss_clip": 0.01169238, "auxiliary_loss_mlp": 0.01025575, "balance_loss_clip": 1.05014706, "balance_loss_mlp": 1.01788592, "epoch": 0.8184933565802922, "flos": 26541792293760.0, "grad_norm": 1.9060575451753734, "language_loss": 0.76762927, "learning_rate": 3.356016021260624e-07, "loss": 0.78957736, "num_input_tokens_seen": 146933510, "step": 6807, "time_per_iteration": 2.6831319332122803 }, { "auxiliary_loss_clip": 0.01167468, "auxiliary_loss_mlp": 0.01027553, "balance_loss_clip": 1.01107228, "balance_loss_mlp": 1.02054906, "epoch": 0.8186135994709313, "flos": 17530117660800.0, "grad_norm": 2.42773924243776, "language_loss": 0.65522408, "learning_rate": 3.35169806694634e-07, "loss": 0.67717427, "num_input_tokens_seen": 146951760, "step": 6808, "time_per_iteration": 3.6631925106048584 }, { "auxiliary_loss_clip": 0.01069533, "auxiliary_loss_mlp": 0.01002591, "balance_loss_clip": 0.90021741, "balance_loss_mlp": 1.00091004, "epoch": 0.8187338423615703, "flos": 63480300675840.0, "grad_norm": 0.7255594889088292, "language_loss": 0.60717273, "learning_rate": 3.3473826381662186e-07, "loss": 0.62789404, "num_input_tokens_seen": 147022900, "step": 6809, "time_per_iteration": 3.3618812561035156 }, { "auxiliary_loss_clip": 0.01162154, "auxiliary_loss_mlp": 0.01030728, "balance_loss_clip": 1.01003373, "balance_loss_mlp": 1.02385581, "epoch": 0.8188540852522095, "flos": 17529974006400.0, "grad_norm": 1.7818076790865163, "language_loss": 0.81653714, "learning_rate": 3.3430697355749216e-07, "loss": 0.83846593, "num_input_tokens_seen": 147040590, "step": 6810, "time_per_iteration": 3.6107685565948486 }, { "auxiliary_loss_clip": 0.01153354, "auxiliary_loss_mlp": 0.01025804, "balance_loss_clip": 0.89210749, "balance_loss_mlp": 1.0185802, "epoch": 0.8189743281428485, "flos": 14392530702720.0, "grad_norm": 1.990130691489747, "language_loss": 0.75125808, "learning_rate": 3.3387593598266907e-07, "loss": 0.77304965, "num_input_tokens_seen": 147057200, "step": 6811, "time_per_iteration": 2.753795623779297 }, { "auxiliary_loss_clip": 0.01151959, "auxiliary_loss_mlp": 0.01022546, "balance_loss_clip": 0.92804718, "balance_loss_mlp": 1.0150125, "epoch": 0.8190945710334876, "flos": 25080479285760.0, "grad_norm": 1.7775608811260197, "language_loss": 0.78010571, "learning_rate": 3.3344515115754225e-07, "loss": 0.8018508, "num_input_tokens_seen": 147076180, "step": 6812, "time_per_iteration": 2.831819534301758 }, { "auxiliary_loss_clip": 0.01168747, "auxiliary_loss_mlp": 0.01030903, "balance_loss_clip": 0.93076581, "balance_loss_mlp": 1.02375603, "epoch": 0.8192148139241268, "flos": 21507152440320.0, "grad_norm": 2.3838694415506603, "language_loss": 0.79349232, "learning_rate": 3.33014619147461e-07, "loss": 0.81548887, "num_input_tokens_seen": 147094205, "step": 6813, "time_per_iteration": 2.7097620964050293 }, { "auxiliary_loss_clip": 0.01165861, "auxiliary_loss_mlp": 0.01032426, "balance_loss_clip": 0.97278827, "balance_loss_mlp": 1.02478433, "epoch": 0.8193350568147658, "flos": 23952166289280.0, "grad_norm": 1.9071331705237566, "language_loss": 0.71735018, "learning_rate": 3.325843400177362e-07, "loss": 0.73933303, "num_input_tokens_seen": 147115545, "step": 6814, "time_per_iteration": 2.6543478965759277 }, { "auxiliary_loss_clip": 0.01169304, "auxiliary_loss_mlp": 0.01122778, "balance_loss_clip": 1.0094049, "balance_loss_mlp": 0.0, "epoch": 0.8194552997054049, "flos": 20559469962240.0, "grad_norm": 1.7765924496810208, "language_loss": 0.73363602, "learning_rate": 3.32154313833642e-07, "loss": 0.75655687, "num_input_tokens_seen": 147135700, "step": 6815, "time_per_iteration": 2.6643333435058594 }, { "auxiliary_loss_clip": 0.01170339, "auxiliary_loss_mlp": 0.0102555, "balance_loss_clip": 1.04832232, "balance_loss_mlp": 1.01782477, "epoch": 0.819575542596044, "flos": 26031753123840.0, "grad_norm": 2.08791787732675, "language_loss": 0.59438872, "learning_rate": 3.3172454066041164e-07, "loss": 0.61634761, "num_input_tokens_seen": 147155205, "step": 6816, "time_per_iteration": 2.638617515563965 }, { "auxiliary_loss_clip": 0.0116313, "auxiliary_loss_mlp": 0.01122386, "balance_loss_clip": 0.85852295, "balance_loss_mlp": 0.0, "epoch": 0.8196957854866831, "flos": 29096944220160.0, "grad_norm": 1.865250666233546, "language_loss": 0.7601617, "learning_rate": 3.3129502056324234e-07, "loss": 0.78301686, "num_input_tokens_seen": 147176570, "step": 6817, "time_per_iteration": 2.8279426097869873 }, { "auxiliary_loss_clip": 0.0107979, "auxiliary_loss_mlp": 0.01002792, "balance_loss_clip": 0.78944921, "balance_loss_mlp": 1.00115907, "epoch": 0.8198160283773221, "flos": 69033631898880.0, "grad_norm": 0.8150589838388447, "language_loss": 0.59800112, "learning_rate": 3.3086575360729165e-07, "loss": 0.61882699, "num_input_tokens_seen": 147234105, "step": 6818, "time_per_iteration": 3.4000723361968994 }, { "auxiliary_loss_clip": 0.01162201, "auxiliary_loss_mlp": 0.01030519, "balance_loss_clip": 0.97064292, "balance_loss_mlp": 1.02311003, "epoch": 0.8199362712679613, "flos": 16618058496000.0, "grad_norm": 1.6855717169022524, "language_loss": 0.71232426, "learning_rate": 3.3043673985767906e-07, "loss": 0.7342515, "num_input_tokens_seen": 147253170, "step": 6819, "time_per_iteration": 3.0863020420074463 }, { "auxiliary_loss_clip": 0.0115035, "auxiliary_loss_mlp": 0.01029336, "balance_loss_clip": 0.92856222, "balance_loss_mlp": 1.02245116, "epoch": 0.8200565141586004, "flos": 21757664868480.0, "grad_norm": 1.7173790261896347, "language_loss": 0.77586037, "learning_rate": 3.3000797937948564e-07, "loss": 0.79765725, "num_input_tokens_seen": 147271465, "step": 6820, "time_per_iteration": 2.7552661895751953 }, { "auxiliary_loss_clip": 0.01069383, "auxiliary_loss_mlp": 0.01001162, "balance_loss_clip": 0.89824235, "balance_loss_mlp": 0.99955225, "epoch": 0.8201767570492394, "flos": 69807112392960.0, "grad_norm": 0.9444845372083067, "language_loss": 0.65053797, "learning_rate": 3.295794722377534e-07, "loss": 0.67124343, "num_input_tokens_seen": 147335070, "step": 6821, "time_per_iteration": 3.3264245986938477 }, { "auxiliary_loss_clip": 0.0116472, "auxiliary_loss_mlp": 0.01023646, "balance_loss_clip": 1.04719925, "balance_loss_mlp": 1.01707196, "epoch": 0.8202969999398786, "flos": 23111892455040.0, "grad_norm": 1.663312014941701, "language_loss": 0.80126357, "learning_rate": 3.291512184974876e-07, "loss": 0.82314724, "num_input_tokens_seen": 147355460, "step": 6822, "time_per_iteration": 2.9758918285369873 }, { "auxiliary_loss_clip": 0.01159127, "auxiliary_loss_mlp": 0.01027065, "balance_loss_clip": 0.96758986, "balance_loss_mlp": 1.01956689, "epoch": 0.8204172428305176, "flos": 28220616109440.0, "grad_norm": 1.5857603028033804, "language_loss": 0.66560233, "learning_rate": 3.2872321822365346e-07, "loss": 0.68746424, "num_input_tokens_seen": 147375675, "step": 6823, "time_per_iteration": 2.9314990043640137 }, { "auxiliary_loss_clip": 0.01165306, "auxiliary_loss_mlp": 0.01025404, "balance_loss_clip": 1.0104413, "balance_loss_mlp": 1.01864183, "epoch": 0.8205374857211567, "flos": 20887011106560.0, "grad_norm": 1.8746843037438818, "language_loss": 0.73487878, "learning_rate": 3.282954714811783e-07, "loss": 0.75678587, "num_input_tokens_seen": 147394580, "step": 6824, "time_per_iteration": 2.948791980743408 }, { "auxiliary_loss_clip": 0.01147846, "auxiliary_loss_mlp": 0.01029187, "balance_loss_clip": 0.9659161, "balance_loss_mlp": 1.02173901, "epoch": 0.8206577286117959, "flos": 13152140294400.0, "grad_norm": 2.1049193490698297, "language_loss": 0.70967484, "learning_rate": 3.2786797833495093e-07, "loss": 0.73144519, "num_input_tokens_seen": 147409935, "step": 6825, "time_per_iteration": 2.725911855697632 }, { "auxiliary_loss_clip": 0.01166204, "auxiliary_loss_mlp": 0.01024605, "balance_loss_clip": 1.04737794, "balance_loss_mlp": 1.01805162, "epoch": 0.8207779715024349, "flos": 25265634917760.0, "grad_norm": 1.9570349845334474, "language_loss": 0.72669661, "learning_rate": 3.274407388498213e-07, "loss": 0.74860477, "num_input_tokens_seen": 147428065, "step": 6826, "time_per_iteration": 2.62790584564209 }, { "auxiliary_loss_clip": 0.01156494, "auxiliary_loss_mlp": 0.01027595, "balance_loss_clip": 0.93096161, "balance_loss_mlp": 1.02075851, "epoch": 0.820898214393074, "flos": 19610243199360.0, "grad_norm": 1.7311388355196329, "language_loss": 0.74228305, "learning_rate": 3.270137530906021e-07, "loss": 0.76412392, "num_input_tokens_seen": 147447300, "step": 6827, "time_per_iteration": 2.759638786315918 }, { "auxiliary_loss_clip": 0.0115495, "auxiliary_loss_mlp": 0.01025254, "balance_loss_clip": 0.89550018, "balance_loss_mlp": 1.01899838, "epoch": 0.8210184572837131, "flos": 15596615439360.0, "grad_norm": 1.930563659296063, "language_loss": 0.83308464, "learning_rate": 3.265870211220665e-07, "loss": 0.85488665, "num_input_tokens_seen": 147465135, "step": 6828, "time_per_iteration": 3.7314133644104004 }, { "auxiliary_loss_clip": 0.01157323, "auxiliary_loss_mlp": 0.01027541, "balance_loss_clip": 0.93126941, "balance_loss_mlp": 1.01956642, "epoch": 0.8211387001743522, "flos": 20813932886400.0, "grad_norm": 2.2098225385176136, "language_loss": 0.81857717, "learning_rate": 3.2616054300894934e-07, "loss": 0.84042579, "num_input_tokens_seen": 147484585, "step": 6829, "time_per_iteration": 2.706939220428467 }, { "auxiliary_loss_clip": 0.01158131, "auxiliary_loss_mlp": 0.01028952, "balance_loss_clip": 0.97257859, "balance_loss_mlp": 1.02144814, "epoch": 0.8212589430649913, "flos": 27704579368320.0, "grad_norm": 2.0386896277712974, "language_loss": 0.8419978, "learning_rate": 3.2573431881594693e-07, "loss": 0.86386859, "num_input_tokens_seen": 147504130, "step": 6830, "time_per_iteration": 2.7344582080841064 }, { "auxiliary_loss_clip": 0.01156251, "auxiliary_loss_mlp": 0.01025048, "balance_loss_clip": 0.85241902, "balance_loss_mlp": 1.01786542, "epoch": 0.8213791859556304, "flos": 22455625017600.0, "grad_norm": 2.0340784186783183, "language_loss": 0.65748966, "learning_rate": 3.2530834860771663e-07, "loss": 0.67930269, "num_input_tokens_seen": 147523510, "step": 6831, "time_per_iteration": 2.804988384246826 }, { "auxiliary_loss_clip": 0.01166193, "auxiliary_loss_mlp": 0.01024091, "balance_loss_clip": 1.00936508, "balance_loss_mlp": 1.01661026, "epoch": 0.8214994288462695, "flos": 16654471908480.0, "grad_norm": 1.938654302762223, "language_loss": 0.74445575, "learning_rate": 3.248826324488794e-07, "loss": 0.76635861, "num_input_tokens_seen": 147540805, "step": 6832, "time_per_iteration": 3.5802340507507324 }, { "auxiliary_loss_clip": 0.01169962, "auxiliary_loss_mlp": 0.01026168, "balance_loss_clip": 1.05166817, "balance_loss_mlp": 1.01930141, "epoch": 0.8216196717369085, "flos": 25221787390080.0, "grad_norm": 1.7651502052939771, "language_loss": 0.87958181, "learning_rate": 3.244571704040138e-07, "loss": 0.90154314, "num_input_tokens_seen": 147560965, "step": 6833, "time_per_iteration": 2.6617510318756104 }, { "auxiliary_loss_clip": 0.01160943, "auxiliary_loss_mlp": 0.01027979, "balance_loss_clip": 1.00703788, "balance_loss_mlp": 1.02075815, "epoch": 0.8217399146275477, "flos": 25371930240000.0, "grad_norm": 1.8836193534809273, "language_loss": 0.73446119, "learning_rate": 3.2403196253766374e-07, "loss": 0.7563504, "num_input_tokens_seen": 147580045, "step": 6834, "time_per_iteration": 3.7840442657470703 }, { "auxiliary_loss_clip": 0.01164361, "auxiliary_loss_mlp": 0.01032118, "balance_loss_clip": 1.01000178, "balance_loss_mlp": 1.02337122, "epoch": 0.8218601575181868, "flos": 25629625388160.0, "grad_norm": 2.150578938797219, "language_loss": 0.78899318, "learning_rate": 3.2360700891433254e-07, "loss": 0.81095803, "num_input_tokens_seen": 147599070, "step": 6835, "time_per_iteration": 2.7177841663360596 }, { "auxiliary_loss_clip": 0.01068391, "auxiliary_loss_mlp": 0.01000511, "balance_loss_clip": 0.85940629, "balance_loss_mlp": 0.99887735, "epoch": 0.8219804004088258, "flos": 67660229427840.0, "grad_norm": 0.8436003592374693, "language_loss": 0.57326305, "learning_rate": 3.231823095984847e-07, "loss": 0.59395206, "num_input_tokens_seen": 147653710, "step": 6836, "time_per_iteration": 4.131007671356201 }, { "auxiliary_loss_clip": 0.01163947, "auxiliary_loss_mlp": 0.01022125, "balance_loss_clip": 0.97094142, "balance_loss_mlp": 1.01549101, "epoch": 0.822100643299465, "flos": 19464266327040.0, "grad_norm": 2.0379872311972824, "language_loss": 0.75938946, "learning_rate": 3.2275786465454814e-07, "loss": 0.78125018, "num_input_tokens_seen": 147670360, "step": 6837, "time_per_iteration": 2.6475400924682617 }, { "auxiliary_loss_clip": 0.01160257, "auxiliary_loss_mlp": 0.01021596, "balance_loss_clip": 0.93099058, "balance_loss_mlp": 1.01502776, "epoch": 0.822220886190104, "flos": 24681368292480.0, "grad_norm": 2.2077914797594578, "language_loss": 0.75682992, "learning_rate": 3.2233367414690917e-07, "loss": 0.7786485, "num_input_tokens_seen": 147692550, "step": 6838, "time_per_iteration": 2.769158124923706 }, { "auxiliary_loss_clip": 0.01158056, "auxiliary_loss_mlp": 0.010204, "balance_loss_clip": 0.93004501, "balance_loss_mlp": 1.01409948, "epoch": 0.8223411290807431, "flos": 27819062991360.0, "grad_norm": 2.066998667298523, "language_loss": 0.85068864, "learning_rate": 3.219097381399183e-07, "loss": 0.87247318, "num_input_tokens_seen": 147709725, "step": 6839, "time_per_iteration": 2.7663543224334717 }, { "auxiliary_loss_clip": 0.0117305, "auxiliary_loss_mlp": 0.01031932, "balance_loss_clip": 0.97302115, "balance_loss_mlp": 1.02467275, "epoch": 0.8224613719713821, "flos": 23218546913280.0, "grad_norm": 1.6888601629021611, "language_loss": 0.81143892, "learning_rate": 3.2148605669788584e-07, "loss": 0.83348876, "num_input_tokens_seen": 147729615, "step": 6840, "time_per_iteration": 2.806370973587036 }, { "auxiliary_loss_clip": 0.01167677, "auxiliary_loss_mlp": 0.01027718, "balance_loss_clip": 0.97272003, "balance_loss_mlp": 1.02060437, "epoch": 0.8225816148620213, "flos": 15706250726400.0, "grad_norm": 2.188214402218407, "language_loss": 0.77189803, "learning_rate": 3.2106262988508405e-07, "loss": 0.79385197, "num_input_tokens_seen": 147747665, "step": 6841, "time_per_iteration": 2.665611743927002 }, { "auxiliary_loss_clip": 0.01167682, "auxiliary_loss_mlp": 0.01028339, "balance_loss_clip": 0.9723224, "balance_loss_mlp": 1.02106392, "epoch": 0.8227018577526604, "flos": 18515111391360.0, "grad_norm": 1.9604645474498046, "language_loss": 0.74005377, "learning_rate": 3.206394577657465e-07, "loss": 0.76201391, "num_input_tokens_seen": 147765445, "step": 6842, "time_per_iteration": 2.720409870147705 }, { "auxiliary_loss_clip": 0.01169989, "auxiliary_loss_mlp": 0.0102664, "balance_loss_clip": 1.0090487, "balance_loss_mlp": 1.01896834, "epoch": 0.8228221006432994, "flos": 22236785406720.0, "grad_norm": 2.278990379646486, "language_loss": 0.72586548, "learning_rate": 3.202165404040675e-07, "loss": 0.74783182, "num_input_tokens_seen": 147783365, "step": 6843, "time_per_iteration": 2.7489936351776123 }, { "auxiliary_loss_clip": 0.01160784, "auxiliary_loss_mlp": 0.01031479, "balance_loss_clip": 0.85593855, "balance_loss_mlp": 1.02415371, "epoch": 0.8229423435339386, "flos": 24097532630400.0, "grad_norm": 1.850339530461547, "language_loss": 0.74743307, "learning_rate": 3.1979387786420396e-07, "loss": 0.76935577, "num_input_tokens_seen": 147803605, "step": 6844, "time_per_iteration": 2.974929094314575 }, { "auxiliary_loss_clip": 0.01161558, "auxiliary_loss_mlp": 0.0102308, "balance_loss_clip": 0.96658951, "balance_loss_mlp": 1.01606476, "epoch": 0.8230625864245776, "flos": 23878549365120.0, "grad_norm": 1.7372933048288812, "language_loss": 0.8236506, "learning_rate": 3.1937147021027346e-07, "loss": 0.84549701, "num_input_tokens_seen": 147822060, "step": 6845, "time_per_iteration": 2.9894938468933105 }, { "auxiliary_loss_clip": 0.01165706, "auxiliary_loss_mlp": 0.01028234, "balance_loss_clip": 1.00952601, "balance_loss_mlp": 1.02132273, "epoch": 0.8231828293152167, "flos": 16581106379520.0, "grad_norm": 2.526836097624073, "language_loss": 0.76207644, "learning_rate": 3.189493175063547e-07, "loss": 0.78401583, "num_input_tokens_seen": 147839295, "step": 6846, "time_per_iteration": 2.766641139984131 }, { "auxiliary_loss_clip": 0.01170188, "auxiliary_loss_mlp": 0.01025733, "balance_loss_clip": 0.9750452, "balance_loss_mlp": 1.01835656, "epoch": 0.8233030722058559, "flos": 18880071528960.0, "grad_norm": 1.9822682034200594, "language_loss": 0.67435551, "learning_rate": 3.1852741981648776e-07, "loss": 0.69631469, "num_input_tokens_seen": 147857945, "step": 6847, "time_per_iteration": 2.6632843017578125 }, { "auxiliary_loss_clip": 0.01152584, "auxiliary_loss_mlp": 0.01025166, "balance_loss_clip": 0.93206441, "balance_loss_mlp": 1.01826072, "epoch": 0.8234233150964949, "flos": 28439024757120.0, "grad_norm": 3.8299305209952523, "language_loss": 0.69933176, "learning_rate": 3.1810577720467404e-07, "loss": 0.72110921, "num_input_tokens_seen": 147879675, "step": 6848, "time_per_iteration": 2.7303237915039062 }, { "auxiliary_loss_clip": 0.01169031, "auxiliary_loss_mlp": 0.01030094, "balance_loss_clip": 0.97058594, "balance_loss_mlp": 1.02259851, "epoch": 0.823543557987134, "flos": 33765941577600.0, "grad_norm": 1.5294109743080533, "language_loss": 0.56595123, "learning_rate": 3.176843897348769e-07, "loss": 0.58794248, "num_input_tokens_seen": 147902870, "step": 6849, "time_per_iteration": 2.765042543411255 }, { "auxiliary_loss_clip": 0.01158583, "auxiliary_loss_mlp": 0.010278, "balance_loss_clip": 0.96943021, "balance_loss_mlp": 1.02057886, "epoch": 0.8236638008777731, "flos": 17092366611840.0, "grad_norm": 2.170056796948341, "language_loss": 0.75542843, "learning_rate": 3.1726325747102034e-07, "loss": 0.77729231, "num_input_tokens_seen": 147921245, "step": 6850, "time_per_iteration": 2.6870856285095215 }, { "auxiliary_loss_clip": 0.01152374, "auxiliary_loss_mlp": 0.01031378, "balance_loss_clip": 0.88969678, "balance_loss_mlp": 1.02391589, "epoch": 0.8237840437684122, "flos": 61639982334720.0, "grad_norm": 1.5350907761708596, "language_loss": 0.64106905, "learning_rate": 3.1684238047698974e-07, "loss": 0.66290659, "num_input_tokens_seen": 147949515, "step": 6851, "time_per_iteration": 3.2484447956085205 }, { "auxiliary_loss_clip": 0.01167839, "auxiliary_loss_mlp": 0.0102425, "balance_loss_clip": 0.97082293, "balance_loss_mlp": 1.0169009, "epoch": 0.8239042866590512, "flos": 27309023821440.0, "grad_norm": 2.1213038706929437, "language_loss": 0.53068155, "learning_rate": 3.1642175881663155e-07, "loss": 0.55260241, "num_input_tokens_seen": 147969245, "step": 6852, "time_per_iteration": 2.7163872718811035 }, { "auxiliary_loss_clip": 0.01165123, "auxiliary_loss_mlp": 0.01024673, "balance_loss_clip": 1.04595757, "balance_loss_mlp": 1.01774085, "epoch": 0.8240245295496904, "flos": 21726351187200.0, "grad_norm": 2.0447308152236285, "language_loss": 0.83779812, "learning_rate": 3.160013925537537e-07, "loss": 0.85969603, "num_input_tokens_seen": 147990080, "step": 6853, "time_per_iteration": 2.597080707550049 }, { "auxiliary_loss_clip": 0.01163976, "auxiliary_loss_mlp": 0.0102551, "balance_loss_clip": 0.93189716, "balance_loss_mlp": 1.01811337, "epoch": 0.8241447724403295, "flos": 20009318279040.0, "grad_norm": 2.2950018434325523, "language_loss": 0.75753522, "learning_rate": 3.155812817521266e-07, "loss": 0.77943015, "num_input_tokens_seen": 148010455, "step": 6854, "time_per_iteration": 3.6947062015533447 }, { "auxiliary_loss_clip": 0.011703, "auxiliary_loss_mlp": 0.01028244, "balance_loss_clip": 0.97495383, "balance_loss_mlp": 1.02071619, "epoch": 0.8242650153309685, "flos": 22272983337600.0, "grad_norm": 2.0418279302569813, "language_loss": 0.77980906, "learning_rate": 3.151614264754787e-07, "loss": 0.80179453, "num_input_tokens_seen": 148028400, "step": 6855, "time_per_iteration": 2.678457498550415 }, { "auxiliary_loss_clip": 0.01168112, "auxiliary_loss_mlp": 0.01025602, "balance_loss_clip": 1.04721689, "balance_loss_mlp": 1.01808274, "epoch": 0.8243852582216077, "flos": 22309971367680.0, "grad_norm": 2.253670850017354, "language_loss": 0.79164678, "learning_rate": 3.147418267875035e-07, "loss": 0.81358391, "num_input_tokens_seen": 148046530, "step": 6856, "time_per_iteration": 2.603030204772949 }, { "auxiliary_loss_clip": 0.01153564, "auxiliary_loss_mlp": 0.01122316, "balance_loss_clip": 0.85193717, "balance_loss_mlp": 0.0, "epoch": 0.8245055011122467, "flos": 24645421756800.0, "grad_norm": 1.9272232366440307, "language_loss": 0.65895176, "learning_rate": 3.1432248275185315e-07, "loss": 0.6817106, "num_input_tokens_seen": 148067040, "step": 6857, "time_per_iteration": 2.8094584941864014 }, { "auxiliary_loss_clip": 0.01165339, "auxiliary_loss_mlp": 0.01024604, "balance_loss_clip": 1.01155519, "balance_loss_mlp": 1.01723444, "epoch": 0.8246257440028858, "flos": 17487275713920.0, "grad_norm": 1.9137665569795694, "language_loss": 0.77247, "learning_rate": 3.139033944321412e-07, "loss": 0.7943694, "num_input_tokens_seen": 148084400, "step": 6858, "time_per_iteration": 3.501753807067871 }, { "auxiliary_loss_clip": 0.01169542, "auxiliary_loss_mlp": 0.01029316, "balance_loss_clip": 1.00976813, "balance_loss_mlp": 1.02199697, "epoch": 0.824745986893525, "flos": 25010130499200.0, "grad_norm": 1.931855794161472, "language_loss": 0.79110539, "learning_rate": 3.1348456189194507e-07, "loss": 0.81309402, "num_input_tokens_seen": 148104860, "step": 6859, "time_per_iteration": 2.6714742183685303 }, { "auxiliary_loss_clip": 0.01150444, "auxiliary_loss_mlp": 0.01021557, "balance_loss_clip": 0.92867273, "balance_loss_mlp": 1.01426721, "epoch": 0.824866229784164, "flos": 18772698798720.0, "grad_norm": 1.6599606433707916, "language_loss": 0.82878447, "learning_rate": 3.1306598519479876e-07, "loss": 0.85050452, "num_input_tokens_seen": 148124680, "step": 6860, "time_per_iteration": 2.695391893386841 }, { "auxiliary_loss_clip": 0.01163423, "auxiliary_loss_mlp": 0.01022713, "balance_loss_clip": 0.97185254, "balance_loss_mlp": 1.01604927, "epoch": 0.8249864726748031, "flos": 23842171866240.0, "grad_norm": 1.6010106887633817, "language_loss": 0.78328466, "learning_rate": 3.1264766440420177e-07, "loss": 0.80514604, "num_input_tokens_seen": 148147150, "step": 6861, "time_per_iteration": 3.7498133182525635 }, { "auxiliary_loss_clip": 0.01163629, "auxiliary_loss_mlp": 0.01028697, "balance_loss_clip": 1.00981379, "balance_loss_mlp": 1.02136564, "epoch": 0.8251067155654422, "flos": 20303103617280.0, "grad_norm": 1.844119457201194, "language_loss": 0.69282389, "learning_rate": 3.122295995836124e-07, "loss": 0.71474719, "num_input_tokens_seen": 148167020, "step": 6862, "time_per_iteration": 3.604957103729248 }, { "auxiliary_loss_clip": 0.01165929, "auxiliary_loss_mlp": 0.01023503, "balance_loss_clip": 1.00582278, "balance_loss_mlp": 1.01595712, "epoch": 0.8252269584560813, "flos": 25009699536000.0, "grad_norm": 1.7176847915514364, "language_loss": 0.77391583, "learning_rate": 3.118117907964508e-07, "loss": 0.79581016, "num_input_tokens_seen": 148188965, "step": 6863, "time_per_iteration": 2.683751344680786 }, { "auxiliary_loss_clip": 0.01169496, "auxiliary_loss_mlp": 0.01025273, "balance_loss_clip": 0.93298364, "balance_loss_mlp": 1.01841807, "epoch": 0.8253472013467203, "flos": 17128564542720.0, "grad_norm": 1.8606788708907211, "language_loss": 0.80588341, "learning_rate": 3.1139423810609856e-07, "loss": 0.82783103, "num_input_tokens_seen": 148205660, "step": 6864, "time_per_iteration": 2.7299447059631348 }, { "auxiliary_loss_clip": 0.01166501, "auxiliary_loss_mlp": 0.01028379, "balance_loss_clip": 1.04599953, "balance_loss_mlp": 1.02087462, "epoch": 0.8254674442373595, "flos": 22414794232320.0, "grad_norm": 2.5651125860783806, "language_loss": 0.74921668, "learning_rate": 3.1097694157589714e-07, "loss": 0.77116549, "num_input_tokens_seen": 148225545, "step": 6865, "time_per_iteration": 2.5234556198120117 }, { "auxiliary_loss_clip": 0.01163567, "auxiliary_loss_mlp": 0.01029138, "balance_loss_clip": 1.01029289, "balance_loss_mlp": 1.02168787, "epoch": 0.8255876871279986, "flos": 24786765774720.0, "grad_norm": 2.677663304926065, "language_loss": 0.76294023, "learning_rate": 3.105599012691511e-07, "loss": 0.78486729, "num_input_tokens_seen": 148243975, "step": 6866, "time_per_iteration": 2.59053635597229 }, { "auxiliary_loss_clip": 0.01162278, "auxiliary_loss_mlp": 0.01024236, "balance_loss_clip": 1.00924015, "balance_loss_mlp": 1.01754868, "epoch": 0.8257079300186376, "flos": 27455431656960.0, "grad_norm": 1.3750332683352247, "language_loss": 0.82048094, "learning_rate": 3.101431172491249e-07, "loss": 0.84234613, "num_input_tokens_seen": 148265520, "step": 6867, "time_per_iteration": 2.592646360397339 }, { "auxiliary_loss_clip": 0.01166077, "auxiliary_loss_mlp": 0.01123004, "balance_loss_clip": 0.92994034, "balance_loss_mlp": 0.0, "epoch": 0.8258281729092768, "flos": 16471866142080.0, "grad_norm": 2.299050283563028, "language_loss": 0.71820122, "learning_rate": 3.097265895790444e-07, "loss": 0.74109209, "num_input_tokens_seen": 148283730, "step": 6868, "time_per_iteration": 2.6984972953796387 }, { "auxiliary_loss_clip": 0.01161083, "auxiliary_loss_mlp": 0.0102281, "balance_loss_clip": 0.93183672, "balance_loss_mlp": 1.01577616, "epoch": 0.8259484157999158, "flos": 21433822824960.0, "grad_norm": 2.966013988201879, "language_loss": 0.83284134, "learning_rate": 3.093103183220962e-07, "loss": 0.8546803, "num_input_tokens_seen": 148303775, "step": 6869, "time_per_iteration": 2.6699018478393555 }, { "auxiliary_loss_clip": 0.01063706, "auxiliary_loss_mlp": 0.01002026, "balance_loss_clip": 0.9724803, "balance_loss_mlp": 1.00046432, "epoch": 0.8260686586905549, "flos": 58322342453760.0, "grad_norm": 0.8344527081467745, "language_loss": 0.59473085, "learning_rate": 3.0889430354142796e-07, "loss": 0.61538815, "num_input_tokens_seen": 148365285, "step": 6870, "time_per_iteration": 3.282468795776367 }, { "auxiliary_loss_clip": 0.01165421, "auxiliary_loss_mlp": 0.0102965, "balance_loss_clip": 0.92962611, "balance_loss_mlp": 1.0220449, "epoch": 0.826188901581194, "flos": 27527288814720.0, "grad_norm": 2.2000654726784896, "language_loss": 0.69942105, "learning_rate": 3.084785453001497e-07, "loss": 0.72137177, "num_input_tokens_seen": 148386200, "step": 6871, "time_per_iteration": 2.923171043395996 }, { "auxiliary_loss_clip": 0.01165396, "auxiliary_loss_mlp": 0.01122399, "balance_loss_clip": 0.97215801, "balance_loss_mlp": 0.0, "epoch": 0.8263091444718331, "flos": 23696051339520.0, "grad_norm": 2.159334367199896, "language_loss": 0.81995696, "learning_rate": 3.080630436613314e-07, "loss": 0.84283489, "num_input_tokens_seen": 148403970, "step": 6872, "time_per_iteration": 2.7636003494262695 }, { "auxiliary_loss_clip": 0.0115605, "auxiliary_loss_mlp": 0.01021216, "balance_loss_clip": 1.00740194, "balance_loss_mlp": 1.01405156, "epoch": 0.8264293873624722, "flos": 17165157523200.0, "grad_norm": 1.869588272424611, "language_loss": 0.8567245, "learning_rate": 3.076477986880039e-07, "loss": 0.87849712, "num_input_tokens_seen": 148421765, "step": 6873, "time_per_iteration": 2.854491710662842 }, { "auxiliary_loss_clip": 0.01164405, "auxiliary_loss_mlp": 0.01031091, "balance_loss_clip": 0.97085923, "balance_loss_mlp": 1.02404535, "epoch": 0.8265496302531112, "flos": 24098645952000.0, "grad_norm": 2.039568750416951, "language_loss": 0.69620943, "learning_rate": 3.0723281044315986e-07, "loss": 0.71816438, "num_input_tokens_seen": 148443720, "step": 6874, "time_per_iteration": 2.7282135486602783 }, { "auxiliary_loss_clip": 0.01161102, "auxiliary_loss_mlp": 0.01025255, "balance_loss_clip": 1.04422474, "balance_loss_mlp": 1.01852882, "epoch": 0.8266698731437504, "flos": 14099894599680.0, "grad_norm": 1.8586391510066842, "language_loss": 0.76402241, "learning_rate": 3.068180789897521e-07, "loss": 0.78588593, "num_input_tokens_seen": 148462130, "step": 6875, "time_per_iteration": 2.5966646671295166 }, { "auxiliary_loss_clip": 0.01169546, "auxiliary_loss_mlp": 0.01027019, "balance_loss_clip": 1.00903606, "balance_loss_mlp": 1.01945782, "epoch": 0.8267901160343895, "flos": 30777563715840.0, "grad_norm": 1.7124324678229943, "language_loss": 0.81599629, "learning_rate": 3.064036043906966e-07, "loss": 0.83796203, "num_input_tokens_seen": 148485570, "step": 6876, "time_per_iteration": 2.8183908462524414 }, { "auxiliary_loss_clip": 0.01169432, "auxiliary_loss_mlp": 0.01028158, "balance_loss_clip": 0.93182206, "balance_loss_mlp": 1.02039146, "epoch": 0.8269103589250285, "flos": 40624915242240.0, "grad_norm": 1.9478553429131786, "language_loss": 0.67988342, "learning_rate": 3.059893867088668e-07, "loss": 0.70185935, "num_input_tokens_seen": 148509715, "step": 6877, "time_per_iteration": 2.917203903198242 }, { "auxiliary_loss_clip": 0.0116672, "auxiliary_loss_mlp": 0.01026075, "balance_loss_clip": 1.0115459, "balance_loss_mlp": 1.01900005, "epoch": 0.8270306018156677, "flos": 30263645877120.0, "grad_norm": 1.8188598663040971, "language_loss": 0.67125309, "learning_rate": 3.055754260071004e-07, "loss": 0.69318104, "num_input_tokens_seen": 148532010, "step": 6878, "time_per_iteration": 2.6900057792663574 }, { "auxiliary_loss_clip": 0.01166638, "auxiliary_loss_mlp": 0.01026507, "balance_loss_clip": 1.00939417, "balance_loss_mlp": 1.01965809, "epoch": 0.8271508447063067, "flos": 25226599812480.0, "grad_norm": 3.6583442506308455, "language_loss": 0.7339462, "learning_rate": 3.051617223481948e-07, "loss": 0.75587767, "num_input_tokens_seen": 148553330, "step": 6879, "time_per_iteration": 2.6331300735473633 }, { "auxiliary_loss_clip": 0.01173285, "auxiliary_loss_mlp": 0.01032917, "balance_loss_clip": 0.93258196, "balance_loss_mlp": 1.025213, "epoch": 0.8272710875969458, "flos": 17566602900480.0, "grad_norm": 1.8805694986052899, "language_loss": 0.75026327, "learning_rate": 3.047482757949078e-07, "loss": 0.77232528, "num_input_tokens_seen": 148570960, "step": 6880, "time_per_iteration": 3.564208745956421 }, { "auxiliary_loss_clip": 0.01159861, "auxiliary_loss_mlp": 0.01122265, "balance_loss_clip": 0.9321208, "balance_loss_mlp": 0.0, "epoch": 0.827391330487585, "flos": 19755465886080.0, "grad_norm": 1.890788567019902, "language_loss": 0.85588008, "learning_rate": 3.043350864099605e-07, "loss": 0.87870133, "num_input_tokens_seen": 148589520, "step": 6881, "time_per_iteration": 2.689788341522217 }, { "auxiliary_loss_clip": 0.01167035, "auxiliary_loss_mlp": 0.01032498, "balance_loss_clip": 1.00728846, "balance_loss_mlp": 1.02591181, "epoch": 0.827511573378224, "flos": 16835174254080.0, "grad_norm": 2.323421581755916, "language_loss": 0.80439579, "learning_rate": 3.039221542560315e-07, "loss": 0.8263911, "num_input_tokens_seen": 148606085, "step": 6882, "time_per_iteration": 2.6297338008880615 }, { "auxiliary_loss_clip": 0.01163102, "auxiliary_loss_mlp": 0.01021197, "balance_loss_clip": 1.00923133, "balance_loss_mlp": 1.01383924, "epoch": 0.8276318162688631, "flos": 18369242259840.0, "grad_norm": 1.7607516919573756, "language_loss": 0.73308563, "learning_rate": 3.0350947939576356e-07, "loss": 0.75492859, "num_input_tokens_seen": 148625240, "step": 6883, "time_per_iteration": 2.5961201190948486 }, { "auxiliary_loss_clip": 0.01169477, "auxiliary_loss_mlp": 0.01037441, "balance_loss_clip": 1.00927067, "balance_loss_mlp": 1.0297724, "epoch": 0.8277520591595022, "flos": 19352691705600.0, "grad_norm": 1.6612638146343603, "language_loss": 0.72473335, "learning_rate": 3.0309706189175876e-07, "loss": 0.74680257, "num_input_tokens_seen": 148645075, "step": 6884, "time_per_iteration": 3.578657627105713 }, { "auxiliary_loss_clip": 0.01065587, "auxiliary_loss_mlp": 0.01003297, "balance_loss_clip": 0.93379891, "balance_loss_mlp": 1.001652, "epoch": 0.8278723020501413, "flos": 67918858329600.0, "grad_norm": 0.7564190065149581, "language_loss": 0.5739212, "learning_rate": 3.0268490180658045e-07, "loss": 0.5946101, "num_input_tokens_seen": 148707855, "step": 6885, "time_per_iteration": 3.211219549179077 }, { "auxiliary_loss_clip": 0.01168462, "auxiliary_loss_mlp": 0.0102611, "balance_loss_clip": 1.04903936, "balance_loss_mlp": 1.01917148, "epoch": 0.8279925449407803, "flos": 18185738653440.0, "grad_norm": 2.5265368884427875, "language_loss": 0.79443443, "learning_rate": 3.0227299920275305e-07, "loss": 0.81638014, "num_input_tokens_seen": 148724170, "step": 6886, "time_per_iteration": 2.5769684314727783 }, { "auxiliary_loss_clip": 0.01167865, "auxiliary_loss_mlp": 0.01030115, "balance_loss_clip": 0.9347471, "balance_loss_mlp": 1.02169847, "epoch": 0.8281127878314195, "flos": 20631434860800.0, "grad_norm": 2.352250094465441, "language_loss": 0.86019528, "learning_rate": 3.018613541427613e-07, "loss": 0.88217509, "num_input_tokens_seen": 148743690, "step": 6887, "time_per_iteration": 3.6690845489501953 }, { "auxiliary_loss_clip": 0.01165082, "auxiliary_loss_mlp": 0.01027004, "balance_loss_clip": 1.04540634, "balance_loss_mlp": 1.02010179, "epoch": 0.8282330307220586, "flos": 18004282122240.0, "grad_norm": 1.6120745901370817, "language_loss": 0.73707181, "learning_rate": 3.0144996668905243e-07, "loss": 0.75899267, "num_input_tokens_seen": 148761070, "step": 6888, "time_per_iteration": 3.565905809402466 }, { "auxiliary_loss_clip": 0.01157522, "auxiliary_loss_mlp": 0.01122409, "balance_loss_clip": 0.85109508, "balance_loss_mlp": 0.0, "epoch": 0.8283532736126976, "flos": 20084120352000.0, "grad_norm": 1.8742546574945294, "language_loss": 0.81920266, "learning_rate": 3.010388369040331e-07, "loss": 0.84200203, "num_input_tokens_seen": 148779730, "step": 6889, "time_per_iteration": 2.8016302585601807 }, { "auxiliary_loss_clip": 0.01168579, "auxiliary_loss_mlp": 0.01029929, "balance_loss_clip": 1.01064229, "balance_loss_mlp": 1.02305055, "epoch": 0.8284735165033368, "flos": 31868421805440.0, "grad_norm": 1.4816389602814033, "language_loss": 0.82935715, "learning_rate": 3.0062796485007156e-07, "loss": 0.85134226, "num_input_tokens_seen": 148800670, "step": 6890, "time_per_iteration": 2.735419750213623 }, { "auxiliary_loss_clip": 0.01169817, "auxiliary_loss_mlp": 0.0112252, "balance_loss_clip": 1.04834342, "balance_loss_mlp": 0.0, "epoch": 0.8285937593939758, "flos": 26651319840000.0, "grad_norm": 4.051524791830979, "language_loss": 0.65424454, "learning_rate": 3.002173505894965e-07, "loss": 0.67716789, "num_input_tokens_seen": 148819820, "step": 6891, "time_per_iteration": 2.635115146636963 }, { "auxiliary_loss_clip": 0.01168584, "auxiliary_loss_mlp": 0.01026237, "balance_loss_clip": 1.00793195, "balance_loss_mlp": 1.01837516, "epoch": 0.8287140022846149, "flos": 20193683811840.0, "grad_norm": 2.3927225476070415, "language_loss": 0.62727904, "learning_rate": 2.998069941845973e-07, "loss": 0.64922726, "num_input_tokens_seen": 148838890, "step": 6892, "time_per_iteration": 2.7171828746795654 }, { "auxiliary_loss_clip": 0.01059708, "auxiliary_loss_mlp": 0.01001326, "balance_loss_clip": 1.00856471, "balance_loss_mlp": 0.99970478, "epoch": 0.8288342451752541, "flos": 70755980019840.0, "grad_norm": 0.7138564681768484, "language_loss": 0.57423723, "learning_rate": 2.993968956976258e-07, "loss": 0.59484762, "num_input_tokens_seen": 148906635, "step": 6893, "time_per_iteration": 3.323640823364258 }, { "auxiliary_loss_clip": 0.01174624, "auxiliary_loss_mlp": 0.0102683, "balance_loss_clip": 1.05014253, "balance_loss_mlp": 1.01880693, "epoch": 0.8289544880658931, "flos": 24572235795840.0, "grad_norm": 2.1661800318597613, "language_loss": 0.69969863, "learning_rate": 2.9898705519079313e-07, "loss": 0.72171313, "num_input_tokens_seen": 148925740, "step": 6894, "time_per_iteration": 2.7436702251434326 }, { "auxiliary_loss_clip": 0.01155758, "auxiliary_loss_mlp": 0.01024439, "balance_loss_clip": 0.9697836, "balance_loss_mlp": 1.01743877, "epoch": 0.8290747309565322, "flos": 22273378387200.0, "grad_norm": 1.690062104468688, "language_loss": 0.74884653, "learning_rate": 2.985774727262715e-07, "loss": 0.77064848, "num_input_tokens_seen": 148944585, "step": 6895, "time_per_iteration": 2.8664708137512207 }, { "auxiliary_loss_clip": 0.01163826, "auxiliary_loss_mlp": 0.01024178, "balance_loss_clip": 1.04626143, "balance_loss_mlp": 1.01774931, "epoch": 0.8291949738471713, "flos": 23255570856960.0, "grad_norm": 1.7091380301942924, "language_loss": 0.81729215, "learning_rate": 2.981681483661949e-07, "loss": 0.83917224, "num_input_tokens_seen": 148964170, "step": 6896, "time_per_iteration": 2.7452547550201416 }, { "auxiliary_loss_clip": 0.01168381, "auxiliary_loss_mlp": 0.01029456, "balance_loss_clip": 1.01211643, "balance_loss_mlp": 1.02253294, "epoch": 0.8293152167378104, "flos": 52555768185600.0, "grad_norm": 2.0237671984464316, "language_loss": 0.71065855, "learning_rate": 2.9775908217265633e-07, "loss": 0.73263693, "num_input_tokens_seen": 148989405, "step": 6897, "time_per_iteration": 2.921360492706299 }, { "auxiliary_loss_clip": 0.01068765, "auxiliary_loss_mlp": 0.01000656, "balance_loss_clip": 0.82377946, "balance_loss_mlp": 0.99871272, "epoch": 0.8294354596284494, "flos": 63356156294400.0, "grad_norm": 0.8307968120055068, "language_loss": 0.5038051, "learning_rate": 2.9735027420771253e-07, "loss": 0.5244993, "num_input_tokens_seen": 149049740, "step": 6898, "time_per_iteration": 3.3591644763946533 }, { "auxiliary_loss_clip": 0.0116061, "auxiliary_loss_mlp": 0.0102405, "balance_loss_clip": 0.973557, "balance_loss_mlp": 1.01761007, "epoch": 0.8295557025190886, "flos": 24827021942400.0, "grad_norm": 1.6441828139978416, "language_loss": 0.71433848, "learning_rate": 2.969417245333774e-07, "loss": 0.73618513, "num_input_tokens_seen": 149069120, "step": 6899, "time_per_iteration": 2.900179386138916 }, { "auxiliary_loss_clip": 0.01161319, "auxiliary_loss_mlp": 0.01021737, "balance_loss_clip": 0.93424892, "balance_loss_mlp": 1.01517427, "epoch": 0.8296759454097277, "flos": 25118580637440.0, "grad_norm": 2.290657190369938, "language_loss": 0.78427541, "learning_rate": 2.9653343321162915e-07, "loss": 0.80610597, "num_input_tokens_seen": 149088630, "step": 6900, "time_per_iteration": 2.751253366470337 }, { "auxiliary_loss_clip": 0.01165652, "auxiliary_loss_mlp": 0.01023726, "balance_loss_clip": 0.93472886, "balance_loss_mlp": 1.01611459, "epoch": 0.8297961883003667, "flos": 24132581326080.0, "grad_norm": 1.9345134998657483, "language_loss": 0.64967912, "learning_rate": 2.9612540030440446e-07, "loss": 0.67157292, "num_input_tokens_seen": 149109175, "step": 6901, "time_per_iteration": 2.743396759033203 }, { "auxiliary_loss_clip": 0.01063001, "auxiliary_loss_mlp": 0.01004851, "balance_loss_clip": 0.93437642, "balance_loss_mlp": 1.00321829, "epoch": 0.8299164311910058, "flos": 67446561375360.0, "grad_norm": 0.8634411097069931, "language_loss": 0.64113742, "learning_rate": 2.9571762587360206e-07, "loss": 0.66181594, "num_input_tokens_seen": 149165560, "step": 6902, "time_per_iteration": 3.190113067626953 }, { "auxiliary_loss_clip": 0.01156493, "auxiliary_loss_mlp": 0.0102578, "balance_loss_clip": 0.88901991, "balance_loss_mlp": 1.01881528, "epoch": 0.8300366740816449, "flos": 25228682801280.0, "grad_norm": 2.6801149191313898, "language_loss": 0.73908412, "learning_rate": 2.953101099810806e-07, "loss": 0.76090688, "num_input_tokens_seen": 149185165, "step": 6903, "time_per_iteration": 2.8451859951019287 }, { "auxiliary_loss_clip": 0.011653, "auxiliary_loss_mlp": 0.01025293, "balance_loss_clip": 1.01278353, "balance_loss_mlp": 1.01834941, "epoch": 0.830156916972284, "flos": 18041018757120.0, "grad_norm": 1.9000527016713544, "language_loss": 0.82433581, "learning_rate": 2.9490285268865965e-07, "loss": 0.84624171, "num_input_tokens_seen": 149202655, "step": 6904, "time_per_iteration": 2.7315709590911865 }, { "auxiliary_loss_clip": 0.01170644, "auxiliary_loss_mlp": 0.01029105, "balance_loss_clip": 1.01088786, "balance_loss_mlp": 1.02162182, "epoch": 0.830277159862923, "flos": 26322485806080.0, "grad_norm": 1.9614795371559568, "language_loss": 0.7963686, "learning_rate": 2.9449585405812085e-07, "loss": 0.81836611, "num_input_tokens_seen": 149220035, "step": 6905, "time_per_iteration": 3.7800893783569336 }, { "auxiliary_loss_clip": 0.01168555, "auxiliary_loss_mlp": 0.01025224, "balance_loss_clip": 0.93301207, "balance_loss_mlp": 1.01826859, "epoch": 0.8303974027535622, "flos": 19938861751680.0, "grad_norm": 2.2926331147990844, "language_loss": 0.73821819, "learning_rate": 2.940891141512043e-07, "loss": 0.76015604, "num_input_tokens_seen": 149238055, "step": 6906, "time_per_iteration": 2.6916520595550537 }, { "auxiliary_loss_clip": 0.01163308, "auxiliary_loss_mlp": 0.01030162, "balance_loss_clip": 0.97065294, "balance_loss_mlp": 1.02260447, "epoch": 0.8305176456442013, "flos": 17165552572800.0, "grad_norm": 3.1620423269924633, "language_loss": 0.7179184, "learning_rate": 2.9368263302961385e-07, "loss": 0.73985314, "num_input_tokens_seen": 149256755, "step": 6907, "time_per_iteration": 2.641402006149292 }, { "auxiliary_loss_clip": 0.01155825, "auxiliary_loss_mlp": 0.01024513, "balance_loss_clip": 0.85283107, "balance_loss_mlp": 1.0172565, "epoch": 0.8306378885348403, "flos": 25627614226560.0, "grad_norm": 2.3601630398595073, "language_loss": 0.79970747, "learning_rate": 2.9327641075501075e-07, "loss": 0.82151085, "num_input_tokens_seen": 149275745, "step": 6908, "time_per_iteration": 3.017786741256714 }, { "auxiliary_loss_clip": 0.01154938, "auxiliary_loss_mlp": 0.01028617, "balance_loss_clip": 0.96727002, "balance_loss_mlp": 1.02097559, "epoch": 0.8307581314254795, "flos": 33947864985600.0, "grad_norm": 2.018128223204553, "language_loss": 0.66103673, "learning_rate": 2.9287044738901866e-07, "loss": 0.6828723, "num_input_tokens_seen": 149293730, "step": 6909, "time_per_iteration": 2.8064491748809814 }, { "auxiliary_loss_clip": 0.01167657, "auxiliary_loss_mlp": 0.01122227, "balance_loss_clip": 1.00996065, "balance_loss_mlp": 0.0, "epoch": 0.8308783743161186, "flos": 17562724231680.0, "grad_norm": 1.9289748562063758, "language_loss": 0.91066182, "learning_rate": 2.9246474299322274e-07, "loss": 0.93356067, "num_input_tokens_seen": 149309290, "step": 6910, "time_per_iteration": 3.6332156658172607 }, { "auxiliary_loss_clip": 0.0106539, "auxiliary_loss_mlp": 0.0100118, "balance_loss_clip": 0.89688253, "balance_loss_mlp": 0.99951154, "epoch": 0.8309986172067576, "flos": 69412885649280.0, "grad_norm": 0.89592946249907, "language_loss": 0.63167799, "learning_rate": 2.920592976291678e-07, "loss": 0.65234369, "num_input_tokens_seen": 149366620, "step": 6911, "time_per_iteration": 3.2150051593780518 }, { "auxiliary_loss_clip": 0.01162919, "auxiliary_loss_mlp": 0.01023445, "balance_loss_clip": 1.00838423, "balance_loss_mlp": 1.01621807, "epoch": 0.8311188600973968, "flos": 22309755886080.0, "grad_norm": 2.0127168620690203, "language_loss": 0.80668604, "learning_rate": 2.916541113583595e-07, "loss": 0.82854974, "num_input_tokens_seen": 149385120, "step": 6912, "time_per_iteration": 2.7278287410736084 }, { "auxiliary_loss_clip": 0.01164899, "auxiliary_loss_mlp": 0.01023144, "balance_loss_clip": 0.93313384, "balance_loss_mlp": 1.01611042, "epoch": 0.8312391029880358, "flos": 18770077105920.0, "grad_norm": 2.4297887039586974, "language_loss": 0.66472888, "learning_rate": 2.912491842422642e-07, "loss": 0.68660939, "num_input_tokens_seen": 149402825, "step": 6913, "time_per_iteration": 4.011859655380249 }, { "auxiliary_loss_clip": 0.01167469, "auxiliary_loss_mlp": 0.01030218, "balance_loss_clip": 1.010535, "balance_loss_mlp": 1.02293396, "epoch": 0.8313593458786749, "flos": 20376648714240.0, "grad_norm": 1.815343041385873, "language_loss": 0.70845354, "learning_rate": 2.9084451634230857e-07, "loss": 0.73043036, "num_input_tokens_seen": 149422125, "step": 6914, "time_per_iteration": 4.282656669616699 }, { "auxiliary_loss_clip": 0.01160026, "auxiliary_loss_mlp": 0.01028561, "balance_loss_clip": 0.93110383, "balance_loss_mlp": 1.02126575, "epoch": 0.831479588769314, "flos": 32124069878400.0, "grad_norm": 3.113680117980468, "language_loss": 0.71605945, "learning_rate": 2.9044010771988125e-07, "loss": 0.73794538, "num_input_tokens_seen": 149441940, "step": 6915, "time_per_iteration": 2.8883283138275146 }, { "auxiliary_loss_clip": 0.01156207, "auxiliary_loss_mlp": 0.01026288, "balance_loss_clip": 0.96971542, "balance_loss_mlp": 1.01925206, "epoch": 0.8315998316599531, "flos": 45185929338240.0, "grad_norm": 1.8421974924667393, "language_loss": 0.71882278, "learning_rate": 2.900359584363303e-07, "loss": 0.74064773, "num_input_tokens_seen": 149465045, "step": 6916, "time_per_iteration": 2.881882667541504 }, { "auxiliary_loss_clip": 0.01158634, "auxiliary_loss_mlp": 0.01034874, "balance_loss_clip": 0.89579189, "balance_loss_mlp": 1.02667189, "epoch": 0.8317200745505922, "flos": 18363747479040.0, "grad_norm": 2.1051200848076608, "language_loss": 0.84298265, "learning_rate": 2.8963206855296494e-07, "loss": 0.86491764, "num_input_tokens_seen": 149481285, "step": 6917, "time_per_iteration": 2.705489158630371 }, { "auxiliary_loss_clip": 0.01168536, "auxiliary_loss_mlp": 0.01025431, "balance_loss_clip": 1.01111972, "balance_loss_mlp": 1.01863921, "epoch": 0.8318403174412313, "flos": 24206557386240.0, "grad_norm": 1.6337786573305975, "language_loss": 0.77024841, "learning_rate": 2.892284381310548e-07, "loss": 0.79218805, "num_input_tokens_seen": 149502700, "step": 6918, "time_per_iteration": 2.761610984802246 }, { "auxiliary_loss_clip": 0.01163162, "auxiliary_loss_mlp": 0.01028939, "balance_loss_clip": 0.97005355, "balance_loss_mlp": 1.02190304, "epoch": 0.8319605603318704, "flos": 22418780641920.0, "grad_norm": 2.3516617436131466, "language_loss": 0.72598374, "learning_rate": 2.888250672318302e-07, "loss": 0.74790472, "num_input_tokens_seen": 149520100, "step": 6919, "time_per_iteration": 2.7572128772735596 }, { "auxiliary_loss_clip": 0.0117064, "auxiliary_loss_mlp": 0.01029452, "balance_loss_clip": 1.04925632, "balance_loss_mlp": 1.02202523, "epoch": 0.8320808032225094, "flos": 37414501459200.0, "grad_norm": 3.6604311213425595, "language_loss": 0.69029522, "learning_rate": 2.884219559164831e-07, "loss": 0.71229613, "num_input_tokens_seen": 149543245, "step": 6920, "time_per_iteration": 2.8547301292419434 }, { "auxiliary_loss_clip": 0.01165728, "auxiliary_loss_mlp": 0.01024534, "balance_loss_clip": 1.00944138, "balance_loss_mlp": 1.01722956, "epoch": 0.8322010461131486, "flos": 12787395638400.0, "grad_norm": 1.842059009611426, "language_loss": 0.81476915, "learning_rate": 2.880191042461635e-07, "loss": 0.83667183, "num_input_tokens_seen": 149559185, "step": 6921, "time_per_iteration": 2.732365369796753 }, { "auxiliary_loss_clip": 0.0116768, "auxiliary_loss_mlp": 0.01026474, "balance_loss_clip": 0.89382422, "balance_loss_mlp": 1.01923776, "epoch": 0.8323212890037877, "flos": 15815455050240.0, "grad_norm": 1.998467410341376, "language_loss": 0.80068058, "learning_rate": 2.876165122819849e-07, "loss": 0.82262212, "num_input_tokens_seen": 149577165, "step": 6922, "time_per_iteration": 2.858466148376465 }, { "auxiliary_loss_clip": 0.0116423, "auxiliary_loss_mlp": 0.01026754, "balance_loss_clip": 1.04687059, "balance_loss_mlp": 1.01940179, "epoch": 0.8324415318944267, "flos": 21719276208000.0, "grad_norm": 1.6629906318139545, "language_loss": 0.79172051, "learning_rate": 2.872141800850201e-07, "loss": 0.81363034, "num_input_tokens_seen": 149594340, "step": 6923, "time_per_iteration": 2.669809579849243 }, { "auxiliary_loss_clip": 0.01167753, "auxiliary_loss_mlp": 0.01023135, "balance_loss_clip": 1.0478555, "balance_loss_mlp": 1.01642025, "epoch": 0.8325617747850659, "flos": 34198700636160.0, "grad_norm": 1.6719117703444133, "language_loss": 0.73240888, "learning_rate": 2.868121077163024e-07, "loss": 0.75431776, "num_input_tokens_seen": 149613895, "step": 6924, "time_per_iteration": 2.772437810897827 }, { "auxiliary_loss_clip": 0.0116757, "auxiliary_loss_mlp": 0.01034215, "balance_loss_clip": 1.00818264, "balance_loss_mlp": 1.02700889, "epoch": 0.8326820176757049, "flos": 18369457741440.0, "grad_norm": 1.7459658437752523, "language_loss": 0.72289842, "learning_rate": 2.864102952368257e-07, "loss": 0.74491632, "num_input_tokens_seen": 149631820, "step": 6925, "time_per_iteration": 2.563359498977661 }, { "auxiliary_loss_clip": 0.01144234, "auxiliary_loss_mlp": 0.01026768, "balance_loss_clip": 0.88731462, "balance_loss_mlp": 1.02006292, "epoch": 0.832802260566344, "flos": 35991325716480.0, "grad_norm": 1.2536385854791772, "language_loss": 0.5938707, "learning_rate": 2.860087427075444e-07, "loss": 0.61558068, "num_input_tokens_seen": 149656070, "step": 6926, "time_per_iteration": 2.8287510871887207 }, { "auxiliary_loss_clip": 0.01158677, "auxiliary_loss_mlp": 0.01025123, "balance_loss_clip": 0.96955854, "balance_loss_mlp": 1.01800644, "epoch": 0.8329225034569832, "flos": 14244434928000.0, "grad_norm": 2.295755338732394, "language_loss": 0.85915792, "learning_rate": 2.856074501893744e-07, "loss": 0.88099599, "num_input_tokens_seen": 149671270, "step": 6927, "time_per_iteration": 2.6019375324249268 }, { "auxiliary_loss_clip": 0.01170104, "auxiliary_loss_mlp": 0.01028163, "balance_loss_clip": 1.01121783, "balance_loss_mlp": 1.02051592, "epoch": 0.8330427463476222, "flos": 18077468083200.0, "grad_norm": 1.6759136568145026, "language_loss": 0.81317896, "learning_rate": 2.8520641774319054e-07, "loss": 0.83516163, "num_input_tokens_seen": 149689360, "step": 6928, "time_per_iteration": 2.540205717086792 }, { "auxiliary_loss_clip": 0.0116287, "auxiliary_loss_mlp": 0.0102268, "balance_loss_clip": 0.96566749, "balance_loss_mlp": 1.01541758, "epoch": 0.8331629892382613, "flos": 18040839189120.0, "grad_norm": 2.0860215740185004, "language_loss": 0.75714451, "learning_rate": 2.848056454298309e-07, "loss": 0.77900004, "num_input_tokens_seen": 149706685, "step": 6929, "time_per_iteration": 2.667271375656128 }, { "auxiliary_loss_clip": 0.01165125, "auxiliary_loss_mlp": 0.01024244, "balance_loss_clip": 0.97272289, "balance_loss_mlp": 1.01711524, "epoch": 0.8332832321289004, "flos": 17457398576640.0, "grad_norm": 2.134618527200697, "language_loss": 0.65306985, "learning_rate": 2.844051333100905e-07, "loss": 0.67496353, "num_input_tokens_seen": 149724230, "step": 6930, "time_per_iteration": 2.6334261894226074 }, { "auxiliary_loss_clip": 0.0116611, "auxiliary_loss_mlp": 0.01020799, "balance_loss_clip": 0.97426772, "balance_loss_mlp": 1.01441824, "epoch": 0.8334034750195395, "flos": 15084852416640.0, "grad_norm": 2.529604946297277, "language_loss": 0.83585811, "learning_rate": 2.840048814447269e-07, "loss": 0.85772717, "num_input_tokens_seen": 149742395, "step": 6931, "time_per_iteration": 2.708415985107422 }, { "auxiliary_loss_clip": 0.01152754, "auxiliary_loss_mlp": 0.01028413, "balance_loss_clip": 0.96763754, "balance_loss_mlp": 1.02117062, "epoch": 0.8335237179101785, "flos": 19427170556160.0, "grad_norm": 2.5581493927812184, "language_loss": 0.73707068, "learning_rate": 2.836048898944587e-07, "loss": 0.75888234, "num_input_tokens_seen": 149760820, "step": 6932, "time_per_iteration": 3.4744927883148193 }, { "auxiliary_loss_clip": 0.01164035, "auxiliary_loss_mlp": 0.01026201, "balance_loss_clip": 0.97107458, "balance_loss_mlp": 1.01941824, "epoch": 0.8336439608008177, "flos": 21762046327680.0, "grad_norm": 2.412447388550445, "language_loss": 0.71882755, "learning_rate": 2.832051587199642e-07, "loss": 0.74072993, "num_input_tokens_seen": 149778075, "step": 6933, "time_per_iteration": 2.6915292739868164 }, { "auxiliary_loss_clip": 0.01063329, "auxiliary_loss_mlp": 0.01004406, "balance_loss_clip": 0.97145879, "balance_loss_mlp": 1.00272489, "epoch": 0.8337642036914568, "flos": 59702783990400.0, "grad_norm": 0.7984941842039343, "language_loss": 0.57908356, "learning_rate": 2.828056879818821e-07, "loss": 0.59976089, "num_input_tokens_seen": 149837150, "step": 6934, "time_per_iteration": 3.1487820148468018 }, { "auxiliary_loss_clip": 0.01157617, "auxiliary_loss_mlp": 0.01023636, "balance_loss_clip": 0.92715782, "balance_loss_mlp": 1.01684153, "epoch": 0.8338844465820958, "flos": 27162185022720.0, "grad_norm": 2.2068764319830985, "language_loss": 0.82978106, "learning_rate": 2.824064777408117e-07, "loss": 0.85159361, "num_input_tokens_seen": 149856940, "step": 6935, "time_per_iteration": 2.7465109825134277 }, { "auxiliary_loss_clip": 0.0116867, "auxiliary_loss_mlp": 0.01028618, "balance_loss_clip": 1.01244962, "balance_loss_mlp": 1.02126908, "epoch": 0.8340046894727349, "flos": 30481264425600.0, "grad_norm": 1.6928614171249674, "language_loss": 0.75766534, "learning_rate": 2.8200752805731263e-07, "loss": 0.77963823, "num_input_tokens_seen": 149879930, "step": 6936, "time_per_iteration": 3.656949758529663 }, { "auxiliary_loss_clip": 0.01164907, "auxiliary_loss_mlp": 0.0103188, "balance_loss_clip": 1.00967896, "balance_loss_mlp": 1.02453017, "epoch": 0.834124932363374, "flos": 27126166659840.0, "grad_norm": 1.469678548675013, "language_loss": 0.80886412, "learning_rate": 2.8160883899190625e-07, "loss": 0.830832, "num_input_tokens_seen": 149903200, "step": 6937, "time_per_iteration": 2.6701529026031494 }, { "auxiliary_loss_clip": 0.01153952, "auxiliary_loss_mlp": 0.01022397, "balance_loss_clip": 0.93376994, "balance_loss_mlp": 1.01562858, "epoch": 0.8342451752540131, "flos": 24569865498240.0, "grad_norm": 2.6967999293331495, "language_loss": 0.732306, "learning_rate": 2.8121041060507234e-07, "loss": 0.75406945, "num_input_tokens_seen": 149922230, "step": 6938, "time_per_iteration": 3.6228396892547607 }, { "auxiliary_loss_clip": 0.01170729, "auxiliary_loss_mlp": 0.01028055, "balance_loss_clip": 1.00983334, "balance_loss_mlp": 1.02083659, "epoch": 0.8343654181446521, "flos": 26615085995520.0, "grad_norm": 1.5780173765125471, "language_loss": 0.71428758, "learning_rate": 2.808122429572528e-07, "loss": 0.73627543, "num_input_tokens_seen": 149942435, "step": 6939, "time_per_iteration": 2.6173038482666016 }, { "auxiliary_loss_clip": 0.01172558, "auxiliary_loss_mlp": 0.01024748, "balance_loss_clip": 0.93368566, "balance_loss_mlp": 1.01733887, "epoch": 0.8344856610352913, "flos": 20777268078720.0, "grad_norm": 2.597354314010813, "language_loss": 0.76259452, "learning_rate": 2.804143361088489e-07, "loss": 0.78456759, "num_input_tokens_seen": 149961615, "step": 6940, "time_per_iteration": 3.670867443084717 }, { "auxiliary_loss_clip": 0.01155848, "auxiliary_loss_mlp": 0.01022664, "balance_loss_clip": 0.96874988, "balance_loss_mlp": 1.01550269, "epoch": 0.8346059039259304, "flos": 26095960684800.0, "grad_norm": 2.149011298208255, "language_loss": 0.77786261, "learning_rate": 2.8001669012022277e-07, "loss": 0.79964775, "num_input_tokens_seen": 149979585, "step": 6941, "time_per_iteration": 2.7447738647460938 }, { "auxiliary_loss_clip": 0.01165595, "auxiliary_loss_mlp": 0.01027626, "balance_loss_clip": 1.01107168, "balance_loss_mlp": 1.02089703, "epoch": 0.8347261468165694, "flos": 29027708755200.0, "grad_norm": 1.5212697887396274, "language_loss": 0.69392943, "learning_rate": 2.7961930505169795e-07, "loss": 0.71586168, "num_input_tokens_seen": 150003830, "step": 6942, "time_per_iteration": 2.663911819458008 }, { "auxiliary_loss_clip": 0.0116889, "auxiliary_loss_mlp": 0.01122603, "balance_loss_clip": 1.01101494, "balance_loss_mlp": 0.0, "epoch": 0.8348463897072086, "flos": 26396461866240.0, "grad_norm": 1.9074027510129383, "language_loss": 0.76372874, "learning_rate": 2.792221809635558e-07, "loss": 0.78664362, "num_input_tokens_seen": 150024460, "step": 6943, "time_per_iteration": 2.6761653423309326 }, { "auxiliary_loss_clip": 0.01162233, "auxiliary_loss_mlp": 0.01030129, "balance_loss_clip": 0.81907409, "balance_loss_mlp": 1.02248156, "epoch": 0.8349666325978476, "flos": 23367720096000.0, "grad_norm": 1.824229645663038, "language_loss": 0.75072861, "learning_rate": 2.788253179160411e-07, "loss": 0.77265215, "num_input_tokens_seen": 150045620, "step": 6944, "time_per_iteration": 2.809164047241211 }, { "auxiliary_loss_clip": 0.01164117, "auxiliary_loss_mlp": 0.01025888, "balance_loss_clip": 0.97162855, "balance_loss_mlp": 1.01981401, "epoch": 0.8350868754884867, "flos": 12896528135040.0, "grad_norm": 1.7903888867384188, "language_loss": 0.64912325, "learning_rate": 2.7842871596935725e-07, "loss": 0.67102325, "num_input_tokens_seen": 150064135, "step": 6945, "time_per_iteration": 2.764599084854126 }, { "auxiliary_loss_clip": 0.01171871, "auxiliary_loss_mlp": 0.01022069, "balance_loss_clip": 1.00905085, "balance_loss_mlp": 1.0152086, "epoch": 0.8352071183791259, "flos": 26505522535680.0, "grad_norm": 2.720789186202166, "language_loss": 0.68975317, "learning_rate": 2.780323751836682e-07, "loss": 0.71169257, "num_input_tokens_seen": 150085350, "step": 6946, "time_per_iteration": 2.6230618953704834 }, { "auxiliary_loss_clip": 0.01163252, "auxiliary_loss_mlp": 0.01121963, "balance_loss_clip": 0.96942902, "balance_loss_mlp": 0.0, "epoch": 0.8353273612697649, "flos": 20668063754880.0, "grad_norm": 1.443731166865201, "language_loss": 0.78580844, "learning_rate": 2.7763629561909876e-07, "loss": 0.80866057, "num_input_tokens_seen": 150106180, "step": 6947, "time_per_iteration": 2.6400749683380127 }, { "auxiliary_loss_clip": 0.01163376, "auxiliary_loss_mlp": 0.01022407, "balance_loss_clip": 1.04605663, "balance_loss_mlp": 1.01564777, "epoch": 0.835447604160404, "flos": 19754137082880.0, "grad_norm": 1.8198447002877087, "language_loss": 0.76866043, "learning_rate": 2.772404773357335e-07, "loss": 0.79051828, "num_input_tokens_seen": 150125585, "step": 6948, "time_per_iteration": 2.5311644077301025 }, { "auxiliary_loss_clip": 0.01155016, "auxiliary_loss_mlp": 0.0102411, "balance_loss_clip": 0.93143511, "balance_loss_mlp": 1.01720476, "epoch": 0.8355678470510431, "flos": 23435842239360.0, "grad_norm": 1.8510724039393225, "language_loss": 0.78178191, "learning_rate": 2.7684492039361853e-07, "loss": 0.80357325, "num_input_tokens_seen": 150144810, "step": 6949, "time_per_iteration": 2.683535575866699 }, { "auxiliary_loss_clip": 0.01169349, "auxiliary_loss_mlp": 0.01026126, "balance_loss_clip": 1.04894805, "balance_loss_mlp": 1.01925933, "epoch": 0.8356880899416822, "flos": 21214588164480.0, "grad_norm": 1.6956904050738533, "language_loss": 0.83696765, "learning_rate": 2.764496248527586e-07, "loss": 0.85892242, "num_input_tokens_seen": 150163785, "step": 6950, "time_per_iteration": 2.554237127304077 }, { "auxiliary_loss_clip": 0.01171177, "auxiliary_loss_mlp": 0.01024586, "balance_loss_clip": 0.93163717, "balance_loss_mlp": 1.01686716, "epoch": 0.8358083328323213, "flos": 28037543466240.0, "grad_norm": 2.256600351794879, "language_loss": 0.78942919, "learning_rate": 2.760545907731211e-07, "loss": 0.81138682, "num_input_tokens_seen": 150184360, "step": 6951, "time_per_iteration": 2.723588466644287 }, { "auxiliary_loss_clip": 0.01165629, "auxiliary_loss_mlp": 0.01022675, "balance_loss_clip": 1.00776172, "balance_loss_mlp": 1.01547432, "epoch": 0.8359285757229604, "flos": 27783655159680.0, "grad_norm": 2.0307228617151667, "language_loss": 0.6762594, "learning_rate": 2.75659818214631e-07, "loss": 0.69814241, "num_input_tokens_seen": 150205465, "step": 6952, "time_per_iteration": 2.6733014583587646 }, { "auxiliary_loss_clip": 0.01167142, "auxiliary_loss_mlp": 0.01026696, "balance_loss_clip": 0.97067922, "balance_loss_mlp": 1.01920378, "epoch": 0.8360488186135995, "flos": 21435115714560.0, "grad_norm": 1.904171733065994, "language_loss": 0.77933848, "learning_rate": 2.752653072371749e-07, "loss": 0.8012768, "num_input_tokens_seen": 150224900, "step": 6953, "time_per_iteration": 2.6494553089141846 }, { "auxiliary_loss_clip": 0.01159279, "auxiliary_loss_mlp": 0.01024703, "balance_loss_clip": 0.93452501, "balance_loss_mlp": 1.01796532, "epoch": 0.8361690615042385, "flos": 27632327160960.0, "grad_norm": 1.9500347799172228, "language_loss": 0.74734104, "learning_rate": 2.7487105790060105e-07, "loss": 0.76918089, "num_input_tokens_seen": 150244310, "step": 6954, "time_per_iteration": 2.6982057094573975 }, { "auxiliary_loss_clip": 0.01167011, "auxiliary_loss_mlp": 0.01028792, "balance_loss_clip": 1.00803828, "balance_loss_mlp": 1.0222919, "epoch": 0.8362893043948777, "flos": 39202529598720.0, "grad_norm": 1.822129180547346, "language_loss": 0.6889599, "learning_rate": 2.7447707026471587e-07, "loss": 0.71091795, "num_input_tokens_seen": 150267285, "step": 6955, "time_per_iteration": 2.7196030616760254 }, { "auxiliary_loss_clip": 0.01161039, "auxiliary_loss_mlp": 0.01024358, "balance_loss_clip": 0.93081903, "balance_loss_mlp": 1.01779258, "epoch": 0.8364095472855168, "flos": 24785329230720.0, "grad_norm": 1.9628736627310865, "language_loss": 0.79850978, "learning_rate": 2.740833443892874e-07, "loss": 0.82036376, "num_input_tokens_seen": 150285455, "step": 6956, "time_per_iteration": 2.636723756790161 }, { "auxiliary_loss_clip": 0.01167278, "auxiliary_loss_mlp": 0.0102424, "balance_loss_clip": 0.97154111, "balance_loss_mlp": 1.01707268, "epoch": 0.8365297901761558, "flos": 22743412784640.0, "grad_norm": 1.677796071283692, "language_loss": 0.79628628, "learning_rate": 2.7368988033404327e-07, "loss": 0.81820142, "num_input_tokens_seen": 150302970, "step": 6957, "time_per_iteration": 3.78293514251709 }, { "auxiliary_loss_clip": 0.01163601, "auxiliary_loss_mlp": 0.01024735, "balance_loss_clip": 0.93204045, "balance_loss_mlp": 1.01768708, "epoch": 0.836650033066795, "flos": 28396003242240.0, "grad_norm": 1.5079746972860595, "language_loss": 0.84466863, "learning_rate": 2.732966781586712e-07, "loss": 0.86655194, "num_input_tokens_seen": 150322715, "step": 6958, "time_per_iteration": 2.737521171569824 }, { "auxiliary_loss_clip": 0.01156606, "auxiliary_loss_mlp": 0.0102629, "balance_loss_clip": 1.006019, "balance_loss_mlp": 1.01928997, "epoch": 0.836770275957434, "flos": 22236857233920.0, "grad_norm": 1.620490996111539, "language_loss": 0.6675036, "learning_rate": 2.729037379228205e-07, "loss": 0.68933254, "num_input_tokens_seen": 150342900, "step": 6959, "time_per_iteration": 2.585909128189087 }, { "auxiliary_loss_clip": 0.01167921, "auxiliary_loss_mlp": 0.01024697, "balance_loss_clip": 0.97548914, "balance_loss_mlp": 1.01755071, "epoch": 0.8368905188480731, "flos": 22491930689280.0, "grad_norm": 1.4719601851392554, "language_loss": 0.80256176, "learning_rate": 2.725110596860998e-07, "loss": 0.82448792, "num_input_tokens_seen": 150363580, "step": 6960, "time_per_iteration": 2.6330649852752686 }, { "auxiliary_loss_clip": 0.01159615, "auxiliary_loss_mlp": 0.01023984, "balance_loss_clip": 0.8956601, "balance_loss_mlp": 1.01677787, "epoch": 0.8370107617387123, "flos": 13370405287680.0, "grad_norm": 2.7349335503341483, "language_loss": 0.69813037, "learning_rate": 2.7211864350807776e-07, "loss": 0.71996641, "num_input_tokens_seen": 150381780, "step": 6961, "time_per_iteration": 2.623090982437134 }, { "auxiliary_loss_clip": 0.01167329, "auxiliary_loss_mlp": 0.01024526, "balance_loss_clip": 1.046749, "balance_loss_mlp": 1.01783299, "epoch": 0.8371310046293513, "flos": 25261289372160.0, "grad_norm": 1.701793342282133, "language_loss": 0.73784447, "learning_rate": 2.717264894482836e-07, "loss": 0.759763, "num_input_tokens_seen": 150402120, "step": 6962, "time_per_iteration": 3.549546241760254 }, { "auxiliary_loss_clip": 0.01169582, "auxiliary_loss_mlp": 0.0102467, "balance_loss_clip": 1.01086533, "balance_loss_mlp": 1.01744902, "epoch": 0.8372512475199904, "flos": 19792705311360.0, "grad_norm": 1.805778338478746, "language_loss": 0.80607051, "learning_rate": 2.7133459756620646e-07, "loss": 0.828013, "num_input_tokens_seen": 150419315, "step": 6963, "time_per_iteration": 2.598569393157959 }, { "auxiliary_loss_clip": 0.01158728, "auxiliary_loss_mlp": 0.01025424, "balance_loss_clip": 1.00738335, "balance_loss_mlp": 1.01810741, "epoch": 0.8373714904106295, "flos": 19391224020480.0, "grad_norm": 2.10357385703202, "language_loss": 0.73610061, "learning_rate": 2.7094296792129733e-07, "loss": 0.75794214, "num_input_tokens_seen": 150438915, "step": 6964, "time_per_iteration": 2.605057716369629 }, { "auxiliary_loss_clip": 0.01166413, "auxiliary_loss_mlp": 0.01025496, "balance_loss_clip": 1.00922835, "balance_loss_mlp": 1.01860619, "epoch": 0.8374917333012686, "flos": 14975935401600.0, "grad_norm": 1.6148255783043541, "language_loss": 0.75200206, "learning_rate": 2.7055160057296424e-07, "loss": 0.77392113, "num_input_tokens_seen": 150456155, "step": 6965, "time_per_iteration": 3.5301151275634766 }, { "auxiliary_loss_clip": 0.01163618, "auxiliary_loss_mlp": 0.01025847, "balance_loss_clip": 0.93293703, "balance_loss_mlp": 1.01834321, "epoch": 0.8376119761919076, "flos": 30331839847680.0, "grad_norm": 1.6929602382229778, "language_loss": 0.72466636, "learning_rate": 2.7016049558057896e-07, "loss": 0.74656105, "num_input_tokens_seen": 150478115, "step": 6966, "time_per_iteration": 3.7218334674835205 }, { "auxiliary_loss_clip": 0.0116583, "auxiliary_loss_mlp": 0.01029169, "balance_loss_clip": 1.01188874, "balance_loss_mlp": 1.02201009, "epoch": 0.8377322190825467, "flos": 29423336129280.0, "grad_norm": 1.8113240289463812, "language_loss": 0.70425975, "learning_rate": 2.6976965300347074e-07, "loss": 0.7262097, "num_input_tokens_seen": 150500725, "step": 6967, "time_per_iteration": 2.620643377304077 }, { "auxiliary_loss_clip": 0.01158275, "auxiliary_loss_mlp": 0.01029284, "balance_loss_clip": 0.96849626, "balance_loss_mlp": 1.02158904, "epoch": 0.8378524619731859, "flos": 26687086807680.0, "grad_norm": 2.6059876057628926, "language_loss": 0.69354528, "learning_rate": 2.693790729009309e-07, "loss": 0.71542084, "num_input_tokens_seen": 150522335, "step": 6968, "time_per_iteration": 2.6283771991729736 }, { "auxiliary_loss_clip": 0.01167034, "auxiliary_loss_mlp": 0.01023068, "balance_loss_clip": 0.97317326, "balance_loss_mlp": 1.01599884, "epoch": 0.8379727048638249, "flos": 20703866636160.0, "grad_norm": 1.7390399648275723, "language_loss": 0.88263422, "learning_rate": 2.6898875533220946e-07, "loss": 0.90453523, "num_input_tokens_seen": 150541640, "step": 6969, "time_per_iteration": 2.5908517837524414 }, { "auxiliary_loss_clip": 0.01163144, "auxiliary_loss_mlp": 0.01022059, "balance_loss_clip": 1.04830599, "balance_loss_mlp": 1.0153389, "epoch": 0.838092947754464, "flos": 20084084438400.0, "grad_norm": 1.6627710182951256, "language_loss": 0.81509781, "learning_rate": 2.685987003565171e-07, "loss": 0.83694988, "num_input_tokens_seen": 150559680, "step": 6970, "time_per_iteration": 2.550320863723755 }, { "auxiliary_loss_clip": 0.01159588, "auxiliary_loss_mlp": 0.01027616, "balance_loss_clip": 0.93636143, "balance_loss_mlp": 1.02056527, "epoch": 0.8382131906451031, "flos": 18113270964480.0, "grad_norm": 2.2818211069583874, "language_loss": 0.75449622, "learning_rate": 2.6820890803302566e-07, "loss": 0.77636826, "num_input_tokens_seen": 150575205, "step": 6971, "time_per_iteration": 2.616266965866089 }, { "auxiliary_loss_clip": 0.01170835, "auxiliary_loss_mlp": 0.01029095, "balance_loss_clip": 0.9763813, "balance_loss_mlp": 1.02232718, "epoch": 0.8383334335357422, "flos": 17092653920640.0, "grad_norm": 2.06048581199858, "language_loss": 0.81903303, "learning_rate": 2.6781937842086557e-07, "loss": 0.84103233, "num_input_tokens_seen": 150593995, "step": 6972, "time_per_iteration": 2.567434072494507 }, { "auxiliary_loss_clip": 0.01167662, "auxiliary_loss_mlp": 0.01026761, "balance_loss_clip": 1.00967038, "balance_loss_mlp": 1.01939344, "epoch": 0.8384536764263812, "flos": 20704728562560.0, "grad_norm": 1.8042720057726898, "language_loss": 0.67615116, "learning_rate": 2.6743011157912933e-07, "loss": 0.69809532, "num_input_tokens_seen": 150613715, "step": 6973, "time_per_iteration": 2.573303461074829 }, { "auxiliary_loss_clip": 0.0115578, "auxiliary_loss_mlp": 0.01029033, "balance_loss_clip": 0.88977909, "balance_loss_mlp": 1.02220225, "epoch": 0.8385739193170204, "flos": 28986842056320.0, "grad_norm": 1.7249301203225935, "language_loss": 0.6530031, "learning_rate": 2.6704110756686725e-07, "loss": 0.67485118, "num_input_tokens_seen": 150634540, "step": 6974, "time_per_iteration": 2.7543115615844727 }, { "auxiliary_loss_clip": 0.01159057, "auxiliary_loss_mlp": 0.01122722, "balance_loss_clip": 0.96792483, "balance_loss_mlp": 0.0, "epoch": 0.8386941622076595, "flos": 23438068882560.0, "grad_norm": 1.7272221245876331, "language_loss": 0.83919591, "learning_rate": 2.6665236644309085e-07, "loss": 0.8620137, "num_input_tokens_seen": 150654850, "step": 6975, "time_per_iteration": 2.6734609603881836 }, { "auxiliary_loss_clip": 0.0116517, "auxiliary_loss_mlp": 0.01029433, "balance_loss_clip": 1.0087285, "balance_loss_mlp": 1.02244985, "epoch": 0.8388144050982985, "flos": 23002724044800.0, "grad_norm": 1.7665492157204767, "language_loss": 0.79721516, "learning_rate": 2.662638882667727e-07, "loss": 0.81916118, "num_input_tokens_seen": 150673790, "step": 6976, "time_per_iteration": 2.6708199977874756 }, { "auxiliary_loss_clip": 0.01169039, "auxiliary_loss_mlp": 0.01032142, "balance_loss_clip": 1.0468421, "balance_loss_mlp": 1.02474451, "epoch": 0.8389346479889377, "flos": 24280353878400.0, "grad_norm": 2.516626763684293, "language_loss": 0.72829962, "learning_rate": 2.658756730968443e-07, "loss": 0.75031137, "num_input_tokens_seen": 150692255, "step": 6977, "time_per_iteration": 2.5968990325927734 }, { "auxiliary_loss_clip": 0.01169856, "auxiliary_loss_mlp": 0.01024004, "balance_loss_clip": 0.97463918, "balance_loss_mlp": 1.01736379, "epoch": 0.8390548908795767, "flos": 21215019127680.0, "grad_norm": 1.8330011687480292, "language_loss": 0.87770343, "learning_rate": 2.654877209921975e-07, "loss": 0.89964199, "num_input_tokens_seen": 150709790, "step": 6978, "time_per_iteration": 2.620774269104004 }, { "auxiliary_loss_clip": 0.01165546, "auxiliary_loss_mlp": 0.01027971, "balance_loss_clip": 0.89262259, "balance_loss_mlp": 1.01983523, "epoch": 0.8391751337702158, "flos": 35627299332480.0, "grad_norm": 2.8942675105998625, "language_loss": 0.62834644, "learning_rate": 2.651000320116843e-07, "loss": 0.65028167, "num_input_tokens_seen": 150730675, "step": 6979, "time_per_iteration": 2.7794625759124756 }, { "auxiliary_loss_clip": 0.01159159, "auxiliary_loss_mlp": 0.01123102, "balance_loss_clip": 0.93261856, "balance_loss_mlp": 0.0, "epoch": 0.839295376660855, "flos": 21325229032320.0, "grad_norm": 1.8411677817469903, "language_loss": 0.75852156, "learning_rate": 2.647126062141163e-07, "loss": 0.78134412, "num_input_tokens_seen": 150749750, "step": 6980, "time_per_iteration": 2.658600091934204 }, { "auxiliary_loss_clip": 0.01165543, "auxiliary_loss_mlp": 0.01022272, "balance_loss_clip": 0.96872097, "balance_loss_mlp": 1.01573968, "epoch": 0.839415619551494, "flos": 18442535961600.0, "grad_norm": 1.7017188247277764, "language_loss": 0.84143662, "learning_rate": 2.643254436582669e-07, "loss": 0.86331481, "num_input_tokens_seen": 150769240, "step": 6981, "time_per_iteration": 2.6157543659210205 }, { "auxiliary_loss_clip": 0.0116782, "auxiliary_loss_mlp": 0.01024371, "balance_loss_clip": 0.89582253, "balance_loss_mlp": 1.01766539, "epoch": 0.8395358624421331, "flos": 23221958705280.0, "grad_norm": 2.546471677954222, "language_loss": 0.8220582, "learning_rate": 2.6393854440286743e-07, "loss": 0.84398007, "num_input_tokens_seen": 150788410, "step": 6982, "time_per_iteration": 2.727090835571289 }, { "auxiliary_loss_clip": 0.01168798, "auxiliary_loss_mlp": 0.01025804, "balance_loss_clip": 1.05026042, "balance_loss_mlp": 1.01854753, "epoch": 0.8396561053327722, "flos": 24381657210240.0, "grad_norm": 2.0105573206588327, "language_loss": 0.70545357, "learning_rate": 2.6355190850661045e-07, "loss": 0.72739959, "num_input_tokens_seen": 150805245, "step": 6983, "time_per_iteration": 2.511021614074707 }, { "auxiliary_loss_clip": 0.01163977, "auxiliary_loss_mlp": 0.01025132, "balance_loss_clip": 0.97205764, "balance_loss_mlp": 1.01819086, "epoch": 0.8397763482234113, "flos": 22237755073920.0, "grad_norm": 3.4748428245931318, "language_loss": 0.86451048, "learning_rate": 2.631655360281486e-07, "loss": 0.88640153, "num_input_tokens_seen": 150824920, "step": 6984, "time_per_iteration": 3.3788342475891113 }, { "auxiliary_loss_clip": 0.01171594, "auxiliary_loss_mlp": 0.01122691, "balance_loss_clip": 1.00935984, "balance_loss_mlp": 0.0, "epoch": 0.8398965911140504, "flos": 22163743100160.0, "grad_norm": 1.827864628965577, "language_loss": 0.65538275, "learning_rate": 2.6277942702609323e-07, "loss": 0.67832553, "num_input_tokens_seen": 150844400, "step": 6985, "time_per_iteration": 2.6038243770599365 }, { "auxiliary_loss_clip": 0.01165333, "auxiliary_loss_mlp": 0.01026616, "balance_loss_clip": 0.93357968, "balance_loss_mlp": 1.01955867, "epoch": 0.8400168340046895, "flos": 21542775753600.0, "grad_norm": 1.8623917528180773, "language_loss": 0.87222219, "learning_rate": 2.623935815590186e-07, "loss": 0.89414167, "num_input_tokens_seen": 150862780, "step": 6986, "time_per_iteration": 2.7104740142822266 }, { "auxiliary_loss_clip": 0.01168709, "auxiliary_loss_mlp": 0.01025749, "balance_loss_clip": 0.97452134, "balance_loss_mlp": 1.0189904, "epoch": 0.8401370768953286, "flos": 22491966602880.0, "grad_norm": 1.6657010627505544, "language_loss": 0.80336905, "learning_rate": 2.6200799968545516e-07, "loss": 0.82531363, "num_input_tokens_seen": 150883075, "step": 6987, "time_per_iteration": 2.651154041290283 }, { "auxiliary_loss_clip": 0.01065118, "auxiliary_loss_mlp": 0.010032, "balance_loss_clip": 0.937105, "balance_loss_mlp": 1.00154328, "epoch": 0.8402573197859676, "flos": 59238890818560.0, "grad_norm": 0.8685381318027584, "language_loss": 0.5646019, "learning_rate": 2.616226814638969e-07, "loss": 0.58528507, "num_input_tokens_seen": 150948180, "step": 6988, "time_per_iteration": 4.251953125 }, { "auxiliary_loss_clip": 0.01168759, "auxiliary_loss_mlp": 0.01026146, "balance_loss_clip": 0.97304606, "balance_loss_mlp": 1.01918435, "epoch": 0.8403775626766068, "flos": 22674608282880.0, "grad_norm": 1.8409194001332205, "language_loss": 0.77635747, "learning_rate": 2.612376269527954e-07, "loss": 0.79830647, "num_input_tokens_seen": 150967885, "step": 6989, "time_per_iteration": 2.6032326221466064 }, { "auxiliary_loss_clip": 0.01163045, "auxiliary_loss_mlp": 0.01029591, "balance_loss_clip": 0.97165692, "balance_loss_mlp": 1.02210474, "epoch": 0.8404978055672458, "flos": 19609704495360.0, "grad_norm": 1.6541485121037218, "language_loss": 0.67519081, "learning_rate": 2.608528362105635e-07, "loss": 0.69711721, "num_input_tokens_seen": 150987255, "step": 6990, "time_per_iteration": 2.568554162979126 }, { "auxiliary_loss_clip": 0.01163869, "auxiliary_loss_mlp": 0.01023569, "balance_loss_clip": 0.93039715, "balance_loss_mlp": 1.01696491, "epoch": 0.8406180484578849, "flos": 27526929678720.0, "grad_norm": 1.700489982351643, "language_loss": 0.73568821, "learning_rate": 2.6046830929557374e-07, "loss": 0.75756258, "num_input_tokens_seen": 151006905, "step": 6991, "time_per_iteration": 3.655773639678955 }, { "auxiliary_loss_clip": 0.01159881, "auxiliary_loss_mlp": 0.01026216, "balance_loss_clip": 0.93399787, "balance_loss_mlp": 1.01899445, "epoch": 0.8407382913485241, "flos": 22127473342080.0, "grad_norm": 2.161056254900189, "language_loss": 0.84871399, "learning_rate": 2.6008404626615776e-07, "loss": 0.87057495, "num_input_tokens_seen": 151025405, "step": 6992, "time_per_iteration": 3.6442716121673584 }, { "auxiliary_loss_clip": 0.01169891, "auxiliary_loss_mlp": 0.01025993, "balance_loss_clip": 1.01169157, "balance_loss_mlp": 1.01914191, "epoch": 0.8408585342391631, "flos": 13918473982080.0, "grad_norm": 2.2149421917812733, "language_loss": 0.73314047, "learning_rate": 2.597000471806092e-07, "loss": 0.7550993, "num_input_tokens_seen": 151041970, "step": 6993, "time_per_iteration": 2.567681312561035 }, { "auxiliary_loss_clip": 0.01163916, "auxiliary_loss_mlp": 0.01030364, "balance_loss_clip": 0.97317362, "balance_loss_mlp": 1.02249575, "epoch": 0.8409787771298022, "flos": 20187865808640.0, "grad_norm": 2.2205958816317817, "language_loss": 0.73080826, "learning_rate": 2.593163120971793e-07, "loss": 0.75275105, "num_input_tokens_seen": 151060835, "step": 6994, "time_per_iteration": 2.5772855281829834 }, { "auxiliary_loss_clip": 0.01148286, "auxiliary_loss_mlp": 0.01024457, "balance_loss_clip": 0.88912773, "balance_loss_mlp": 1.01778746, "epoch": 0.8410990200204413, "flos": 23142523777920.0, "grad_norm": 1.9328255108833317, "language_loss": 0.68784076, "learning_rate": 2.5893284107408165e-07, "loss": 0.7095682, "num_input_tokens_seen": 151078205, "step": 6995, "time_per_iteration": 2.686781644821167 }, { "auxiliary_loss_clip": 0.01166709, "auxiliary_loss_mlp": 0.01029848, "balance_loss_clip": 0.89829004, "balance_loss_mlp": 1.02233231, "epoch": 0.8412192629110804, "flos": 24027219757440.0, "grad_norm": 1.7261892127069236, "language_loss": 0.77916646, "learning_rate": 2.5854963416948726e-07, "loss": 0.80113208, "num_input_tokens_seen": 151100470, "step": 6996, "time_per_iteration": 2.6893017292022705 }, { "auxiliary_loss_clip": 0.01157079, "auxiliary_loss_mlp": 0.01024596, "balance_loss_clip": 0.88753939, "balance_loss_mlp": 1.01774168, "epoch": 0.8413395058017195, "flos": 25591703604480.0, "grad_norm": 3.2627133886070068, "language_loss": 0.69372702, "learning_rate": 2.5816669144152816e-07, "loss": 0.71554381, "num_input_tokens_seen": 151121650, "step": 6997, "time_per_iteration": 2.7173776626586914 }, { "auxiliary_loss_clip": 0.01059441, "auxiliary_loss_mlp": 0.01000691, "balance_loss_clip": 1.00862765, "balance_loss_mlp": 0.99912971, "epoch": 0.8414597486923585, "flos": 63635396624640.0, "grad_norm": 0.8785261094226915, "language_loss": 0.66361654, "learning_rate": 2.5778401294829777e-07, "loss": 0.68421787, "num_input_tokens_seen": 151180390, "step": 6998, "time_per_iteration": 3.2009646892547607 }, { "auxiliary_loss_clip": 0.01164028, "auxiliary_loss_mlp": 0.01122161, "balance_loss_clip": 1.01005232, "balance_loss_mlp": 0.0, "epoch": 0.8415799915829977, "flos": 19098731571840.0, "grad_norm": 1.6418613101049777, "language_loss": 0.65065134, "learning_rate": 2.574015987478473e-07, "loss": 0.67351317, "num_input_tokens_seen": 151198520, "step": 6999, "time_per_iteration": 2.573272705078125 }, { "auxiliary_loss_clip": 0.01171015, "auxiliary_loss_mlp": 0.01030027, "balance_loss_clip": 0.9715904, "balance_loss_mlp": 1.02214086, "epoch": 0.8417002344736367, "flos": 19821612781440.0, "grad_norm": 3.4787010747688307, "language_loss": 0.86645114, "learning_rate": 2.570194488981887e-07, "loss": 0.88846159, "num_input_tokens_seen": 151215065, "step": 7000, "time_per_iteration": 2.7169039249420166 }, { "auxiliary_loss_clip": 0.01059546, "auxiliary_loss_mlp": 0.01001726, "balance_loss_clip": 1.00847697, "balance_loss_mlp": 1.00012827, "epoch": 0.8418204773642758, "flos": 62161516834560.0, "grad_norm": 0.8359417200835729, "language_loss": 0.60344094, "learning_rate": 2.566375634572939e-07, "loss": 0.62405366, "num_input_tokens_seen": 151275705, "step": 7001, "time_per_iteration": 3.150742292404175 }, { "auxiliary_loss_clip": 0.0116749, "auxiliary_loss_mlp": 0.01025992, "balance_loss_clip": 0.93057173, "balance_loss_mlp": 1.01905441, "epoch": 0.841940720254915, "flos": 17092905315840.0, "grad_norm": 1.734104140050404, "language_loss": 0.7641421, "learning_rate": 2.562559424830943e-07, "loss": 0.7860769, "num_input_tokens_seen": 151293665, "step": 7002, "time_per_iteration": 2.653679847717285 }, { "auxiliary_loss_clip": 0.01161282, "auxiliary_loss_mlp": 0.01024681, "balance_loss_clip": 0.9704982, "balance_loss_mlp": 1.01747525, "epoch": 0.842060963145554, "flos": 16283586026880.0, "grad_norm": 1.9546301643739052, "language_loss": 0.70787072, "learning_rate": 2.5587458603348256e-07, "loss": 0.72973031, "num_input_tokens_seen": 151310955, "step": 7003, "time_per_iteration": 2.5900583267211914 }, { "auxiliary_loss_clip": 0.01156761, "auxiliary_loss_mlp": 0.0102651, "balance_loss_clip": 0.93058079, "balance_loss_mlp": 1.01959932, "epoch": 0.8421812060361931, "flos": 21908238681600.0, "grad_norm": 2.020532084440526, "language_loss": 0.83960497, "learning_rate": 2.554934941663085e-07, "loss": 0.86143762, "num_input_tokens_seen": 151328490, "step": 7004, "time_per_iteration": 2.723597526550293 }, { "auxiliary_loss_clip": 0.01161314, "auxiliary_loss_mlp": 0.01026617, "balance_loss_clip": 0.9313904, "balance_loss_mlp": 1.01899922, "epoch": 0.8423014489268322, "flos": 27777693502080.0, "grad_norm": 1.8330524768010967, "language_loss": 0.73983604, "learning_rate": 2.5511266693938484e-07, "loss": 0.76171535, "num_input_tokens_seen": 151346950, "step": 7005, "time_per_iteration": 2.7233591079711914 }, { "auxiliary_loss_clip": 0.01163561, "auxiliary_loss_mlp": 0.01022089, "balance_loss_clip": 0.97320694, "balance_loss_mlp": 1.01481104, "epoch": 0.8424216918174713, "flos": 25117610970240.0, "grad_norm": 1.5391212155006555, "language_loss": 0.77736735, "learning_rate": 2.547321044104822e-07, "loss": 0.79922384, "num_input_tokens_seen": 151368445, "step": 7006, "time_per_iteration": 2.6771841049194336 }, { "auxiliary_loss_clip": 0.01171353, "auxiliary_loss_mlp": 0.01025832, "balance_loss_clip": 1.04934251, "balance_loss_mlp": 1.01792824, "epoch": 0.8425419347081103, "flos": 24748448941440.0, "grad_norm": 1.603532951873198, "language_loss": 0.76557612, "learning_rate": 2.5435180663733113e-07, "loss": 0.78754795, "num_input_tokens_seen": 151388745, "step": 7007, "time_per_iteration": 2.685519218444824 }, { "auxiliary_loss_clip": 0.01169326, "auxiliary_loss_mlp": 0.01020431, "balance_loss_clip": 0.89243662, "balance_loss_mlp": 1.01337957, "epoch": 0.8426621775987495, "flos": 24820916630400.0, "grad_norm": 3.445812409125151, "language_loss": 0.71954131, "learning_rate": 2.539717736776241e-07, "loss": 0.74143887, "num_input_tokens_seen": 151404970, "step": 7008, "time_per_iteration": 2.8170108795166016 }, { "auxiliary_loss_clip": 0.011624, "auxiliary_loss_mlp": 0.01024733, "balance_loss_clip": 1.00891435, "balance_loss_mlp": 1.0175066, "epoch": 0.8427824204893886, "flos": 23550074467200.0, "grad_norm": 1.5439020443591682, "language_loss": 0.76335537, "learning_rate": 2.535920055890097e-07, "loss": 0.7852267, "num_input_tokens_seen": 151426265, "step": 7009, "time_per_iteration": 2.598479747772217 }, { "auxiliary_loss_clip": 0.01157421, "auxiliary_loss_mlp": 0.01031288, "balance_loss_clip": 0.89312583, "balance_loss_mlp": 1.02333045, "epoch": 0.8429026633800276, "flos": 16143858120960.0, "grad_norm": 1.917126923547288, "language_loss": 0.64350426, "learning_rate": 2.5321250242910006e-07, "loss": 0.66539139, "num_input_tokens_seen": 151444180, "step": 7010, "time_per_iteration": 3.543980598449707 }, { "auxiliary_loss_clip": 0.01167118, "auxiliary_loss_mlp": 0.01023744, "balance_loss_clip": 1.04858994, "balance_loss_mlp": 1.01726544, "epoch": 0.8430229062706668, "flos": 22198540400640.0, "grad_norm": 1.6455415000984257, "language_loss": 0.86513132, "learning_rate": 2.5283326425546493e-07, "loss": 0.88703996, "num_input_tokens_seen": 151463290, "step": 7011, "time_per_iteration": 2.578388214111328 }, { "auxiliary_loss_clip": 0.01159287, "auxiliary_loss_mlp": 0.01021154, "balance_loss_clip": 0.93558979, "balance_loss_mlp": 1.015131, "epoch": 0.8431431491613058, "flos": 35330317683840.0, "grad_norm": 2.0071032941095064, "language_loss": 0.69227475, "learning_rate": 2.5245429112563443e-07, "loss": 0.71407914, "num_input_tokens_seen": 151483965, "step": 7012, "time_per_iteration": 2.7359890937805176 }, { "auxiliary_loss_clip": 0.01164633, "auxiliary_loss_mlp": 0.01026874, "balance_loss_clip": 1.00997353, "balance_loss_mlp": 1.0200671, "epoch": 0.8432633920519449, "flos": 25812374808960.0, "grad_norm": 1.716656708408737, "language_loss": 0.82084852, "learning_rate": 2.5207558309709865e-07, "loss": 0.8427636, "num_input_tokens_seen": 151503700, "step": 7013, "time_per_iteration": 2.6201748847961426 }, { "auxiliary_loss_clip": 0.01070838, "auxiliary_loss_mlp": 0.0111571, "balance_loss_clip": 0.89673948, "balance_loss_mlp": 0.0, "epoch": 0.8433836349425841, "flos": 64959531592320.0, "grad_norm": 0.6650030674184362, "language_loss": 0.56354737, "learning_rate": 2.516971402273065e-07, "loss": 0.58541286, "num_input_tokens_seen": 151569765, "step": 7014, "time_per_iteration": 4.148841142654419 }, { "auxiliary_loss_clip": 0.01162943, "auxiliary_loss_mlp": 0.01021881, "balance_loss_clip": 0.97018206, "balance_loss_mlp": 1.01436234, "epoch": 0.8435038778332231, "flos": 20229989483520.0, "grad_norm": 1.8685836508102796, "language_loss": 0.67414951, "learning_rate": 2.513189625736687e-07, "loss": 0.69599783, "num_input_tokens_seen": 151586660, "step": 7015, "time_per_iteration": 2.6589462757110596 }, { "auxiliary_loss_clip": 0.01167053, "auxiliary_loss_mlp": 0.01028933, "balance_loss_clip": 0.93172419, "balance_loss_mlp": 1.02163744, "epoch": 0.8436241207238622, "flos": 20992229020800.0, "grad_norm": 2.7120837005464438, "language_loss": 0.71530783, "learning_rate": 2.509410501935534e-07, "loss": 0.73726773, "num_input_tokens_seen": 151602295, "step": 7016, "time_per_iteration": 2.692288398742676 }, { "auxiliary_loss_clip": 0.01167857, "auxiliary_loss_mlp": 0.01026099, "balance_loss_clip": 0.97136462, "balance_loss_mlp": 1.0178771, "epoch": 0.8437443636145013, "flos": 14682257804160.0, "grad_norm": 2.769732800044904, "language_loss": 0.7501539, "learning_rate": 2.5056340314429116e-07, "loss": 0.77209342, "num_input_tokens_seen": 151619760, "step": 7017, "time_per_iteration": 3.561265707015991 }, { "auxiliary_loss_clip": 0.01163555, "auxiliary_loss_mlp": 0.01026848, "balance_loss_clip": 0.89181793, "balance_loss_mlp": 1.01901019, "epoch": 0.8438646065051404, "flos": 21608814908160.0, "grad_norm": 2.1650089821751513, "language_loss": 0.8041476, "learning_rate": 2.5018602148316904e-07, "loss": 0.82605159, "num_input_tokens_seen": 151635795, "step": 7018, "time_per_iteration": 3.647491931915283 }, { "auxiliary_loss_clip": 0.01166369, "auxiliary_loss_mlp": 0.01027246, "balance_loss_clip": 1.05022752, "balance_loss_mlp": 1.02020621, "epoch": 0.8439848493957794, "flos": 23289937194240.0, "grad_norm": 1.679815496281072, "language_loss": 0.80351621, "learning_rate": 2.498089052674359e-07, "loss": 0.82545239, "num_input_tokens_seen": 151653770, "step": 7019, "time_per_iteration": 2.6415042877197266 }, { "auxiliary_loss_clip": 0.01169458, "auxiliary_loss_mlp": 0.01036476, "balance_loss_clip": 1.01270103, "balance_loss_mlp": 1.02891779, "epoch": 0.8441050922864186, "flos": 19719339782400.0, "grad_norm": 1.9775585577184498, "language_loss": 0.74924183, "learning_rate": 2.494320545543007e-07, "loss": 0.77130121, "num_input_tokens_seen": 151673340, "step": 7020, "time_per_iteration": 2.612844228744507 }, { "auxiliary_loss_clip": 0.01172029, "auxiliary_loss_mlp": 0.01026118, "balance_loss_clip": 1.04835248, "balance_loss_mlp": 1.01870883, "epoch": 0.8442253351770577, "flos": 21835268202240.0, "grad_norm": 1.7476806112156285, "language_loss": 0.67028016, "learning_rate": 2.490554694009308e-07, "loss": 0.69226164, "num_input_tokens_seen": 151694205, "step": 7021, "time_per_iteration": 2.6073431968688965 }, { "auxiliary_loss_clip": 0.01169304, "auxiliary_loss_mlp": 0.01026926, "balance_loss_clip": 1.00890672, "balance_loss_mlp": 1.02040219, "epoch": 0.8443455780676967, "flos": 34346365447680.0, "grad_norm": 1.5633273721485916, "language_loss": 0.78271747, "learning_rate": 2.4867914986445426e-07, "loss": 0.80467975, "num_input_tokens_seen": 151716595, "step": 7022, "time_per_iteration": 2.6629106998443604 }, { "auxiliary_loss_clip": 0.01164429, "auxiliary_loss_mlp": 0.01027255, "balance_loss_clip": 0.96781647, "balance_loss_mlp": 1.02059174, "epoch": 0.8444658209583359, "flos": 48214599281280.0, "grad_norm": 1.887986886285108, "language_loss": 0.70781755, "learning_rate": 2.483030960019581e-07, "loss": 0.72973442, "num_input_tokens_seen": 151740525, "step": 7023, "time_per_iteration": 2.859995126724243 }, { "auxiliary_loss_clip": 0.01066735, "auxiliary_loss_mlp": 0.01001567, "balance_loss_clip": 0.85990167, "balance_loss_mlp": 0.9999221, "epoch": 0.8445860638489749, "flos": 68484773105280.0, "grad_norm": 0.7480337288504864, "language_loss": 0.55478275, "learning_rate": 2.479273078704891e-07, "loss": 0.5754658, "num_input_tokens_seen": 151793890, "step": 7024, "time_per_iteration": 3.0857834815979004 }, { "auxiliary_loss_clip": 0.01063981, "auxiliary_loss_mlp": 0.0100233, "balance_loss_clip": 0.86215639, "balance_loss_mlp": 1.00075662, "epoch": 0.844706306739614, "flos": 62833331882880.0, "grad_norm": 0.9252845387898442, "language_loss": 0.64802861, "learning_rate": 2.475517855270552e-07, "loss": 0.66869175, "num_input_tokens_seen": 151853970, "step": 7025, "time_per_iteration": 3.236563205718994 }, { "auxiliary_loss_clip": 0.01165635, "auxiliary_loss_mlp": 0.01025132, "balance_loss_clip": 1.0488342, "balance_loss_mlp": 1.01826251, "epoch": 0.8448265496302532, "flos": 14976114969600.0, "grad_norm": 1.7977197060986831, "language_loss": 0.72622979, "learning_rate": 2.4717652902862143e-07, "loss": 0.74813747, "num_input_tokens_seen": 151872945, "step": 7026, "time_per_iteration": 2.5328378677368164 }, { "auxiliary_loss_clip": 0.01170587, "auxiliary_loss_mlp": 0.01026339, "balance_loss_clip": 0.97194582, "balance_loss_mlp": 1.01933885, "epoch": 0.8449467925208922, "flos": 23441265192960.0, "grad_norm": 2.8088331455518247, "language_loss": 0.81466961, "learning_rate": 2.4680153843211495e-07, "loss": 0.83663893, "num_input_tokens_seen": 151892875, "step": 7027, "time_per_iteration": 2.6338915824890137 }, { "auxiliary_loss_clip": 0.01165083, "auxiliary_loss_mlp": 0.01027282, "balance_loss_clip": 0.97293377, "balance_loss_mlp": 1.01975441, "epoch": 0.8450670354115313, "flos": 22748045639040.0, "grad_norm": 1.6672464301706111, "language_loss": 0.72208643, "learning_rate": 2.464268137944212e-07, "loss": 0.74401009, "num_input_tokens_seen": 151914170, "step": 7028, "time_per_iteration": 2.7007522583007812 }, { "auxiliary_loss_clip": 0.01147864, "auxiliary_loss_mlp": 0.01026925, "balance_loss_clip": 0.89230955, "balance_loss_mlp": 1.01930165, "epoch": 0.8451872783021703, "flos": 29825571605760.0, "grad_norm": 1.8092908026973484, "language_loss": 0.78263259, "learning_rate": 2.46052355172385e-07, "loss": 0.80438054, "num_input_tokens_seen": 151932210, "step": 7029, "time_per_iteration": 2.740084648132324 }, { "auxiliary_loss_clip": 0.01169267, "auxiliary_loss_mlp": 0.01029785, "balance_loss_clip": 1.04860377, "balance_loss_mlp": 1.02194142, "epoch": 0.8453075211928095, "flos": 21870029589120.0, "grad_norm": 1.825806075991512, "language_loss": 0.74515188, "learning_rate": 2.456781626228128e-07, "loss": 0.76714236, "num_input_tokens_seen": 151951715, "step": 7030, "time_per_iteration": 2.6593308448791504 }, { "auxiliary_loss_clip": 0.0106956, "auxiliary_loss_mlp": 0.01115818, "balance_loss_clip": 0.86069429, "balance_loss_mlp": 0.0, "epoch": 0.8454277640834486, "flos": 58751869288320.0, "grad_norm": 0.9145481839492435, "language_loss": 0.6639142, "learning_rate": 2.453042362024675e-07, "loss": 0.68576789, "num_input_tokens_seen": 152004960, "step": 7031, "time_per_iteration": 3.2651350498199463 }, { "auxiliary_loss_clip": 0.01163745, "auxiliary_loss_mlp": 0.01022655, "balance_loss_clip": 1.04669237, "balance_loss_mlp": 1.01597905, "epoch": 0.8455480069740876, "flos": 27090076469760.0, "grad_norm": 1.566569183642653, "language_loss": 0.7329694, "learning_rate": 2.449305759680751e-07, "loss": 0.7548334, "num_input_tokens_seen": 152026285, "step": 7032, "time_per_iteration": 2.6644320487976074 }, { "auxiliary_loss_clip": 0.01158949, "auxiliary_loss_mlp": 0.01023947, "balance_loss_clip": 0.93401229, "balance_loss_mlp": 1.01676512, "epoch": 0.8456682498647268, "flos": 27198670262400.0, "grad_norm": 1.5176636123333211, "language_loss": 0.74938244, "learning_rate": 2.445571819763188e-07, "loss": 0.77121139, "num_input_tokens_seen": 152048585, "step": 7033, "time_per_iteration": 2.8888442516326904 }, { "auxiliary_loss_clip": 0.01166492, "auxiliary_loss_mlp": 0.01025355, "balance_loss_clip": 1.04812574, "balance_loss_mlp": 1.01840782, "epoch": 0.8457884927553658, "flos": 20631901737600.0, "grad_norm": 1.6289699660222339, "language_loss": 0.58568966, "learning_rate": 2.4418405428384227e-07, "loss": 0.60760814, "num_input_tokens_seen": 152068795, "step": 7034, "time_per_iteration": 2.798633575439453 }, { "auxiliary_loss_clip": 0.01164312, "auxiliary_loss_mlp": 0.01122836, "balance_loss_clip": 1.04660058, "balance_loss_mlp": 0.0, "epoch": 0.8459087356460049, "flos": 15299023259520.0, "grad_norm": 1.763310431574452, "language_loss": 0.71600735, "learning_rate": 2.4381119294724864e-07, "loss": 0.73887885, "num_input_tokens_seen": 152086240, "step": 7035, "time_per_iteration": 2.607142210006714 }, { "auxiliary_loss_clip": 0.01167098, "auxiliary_loss_mlp": 0.01029803, "balance_loss_clip": 1.0482595, "balance_loss_mlp": 1.02319598, "epoch": 0.846028978536644, "flos": 18843155326080.0, "grad_norm": 1.8370631454429338, "language_loss": 0.53867388, "learning_rate": 2.434385980231004e-07, "loss": 0.56064284, "num_input_tokens_seen": 152105080, "step": 7036, "time_per_iteration": 3.451992988586426 }, { "auxiliary_loss_clip": 0.01166899, "auxiliary_loss_mlp": 0.01029519, "balance_loss_clip": 1.01026487, "balance_loss_mlp": 1.02271867, "epoch": 0.8461492214272831, "flos": 52661740285440.0, "grad_norm": 1.4701086914199057, "language_loss": 0.65427738, "learning_rate": 2.4306626956792043e-07, "loss": 0.67624152, "num_input_tokens_seen": 152130025, "step": 7037, "time_per_iteration": 2.9009690284729004 }, { "auxiliary_loss_clip": 0.01164709, "auxiliary_loss_mlp": 0.01022891, "balance_loss_clip": 1.00780916, "balance_loss_mlp": 1.01638508, "epoch": 0.8462694643179222, "flos": 18588405093120.0, "grad_norm": 1.8594759175357265, "language_loss": 0.75489485, "learning_rate": 2.4269420763819017e-07, "loss": 0.77677083, "num_input_tokens_seen": 152148070, "step": 7038, "time_per_iteration": 2.567470073699951 }, { "auxiliary_loss_clip": 0.01163671, "auxiliary_loss_mlp": 0.01025071, "balance_loss_clip": 1.00978756, "balance_loss_mlp": 1.0178349, "epoch": 0.8463897072085613, "flos": 24387080163840.0, "grad_norm": 2.490762461703964, "language_loss": 0.83923918, "learning_rate": 2.4232241229035223e-07, "loss": 0.86112654, "num_input_tokens_seen": 152165825, "step": 7039, "time_per_iteration": 2.6674814224243164 }, { "auxiliary_loss_clip": 0.01063372, "auxiliary_loss_mlp": 0.01002135, "balance_loss_clip": 0.97183841, "balance_loss_mlp": 1.00053787, "epoch": 0.8465099500992004, "flos": 68702140258560.0, "grad_norm": 0.7686254594526672, "language_loss": 0.56823051, "learning_rate": 2.419508835808064e-07, "loss": 0.58888561, "num_input_tokens_seen": 152222380, "step": 7040, "time_per_iteration": 4.06515908241272 }, { "auxiliary_loss_clip": 0.01164167, "auxiliary_loss_mlp": 0.01023747, "balance_loss_clip": 0.97086668, "balance_loss_mlp": 1.01654708, "epoch": 0.8466301929898394, "flos": 13735724561280.0, "grad_norm": 1.8747564602522218, "language_loss": 0.62728262, "learning_rate": 2.415796215659134e-07, "loss": 0.64916176, "num_input_tokens_seen": 152239085, "step": 7041, "time_per_iteration": 2.6426708698272705 }, { "auxiliary_loss_clip": 0.01164236, "auxiliary_loss_mlp": 0.0103361, "balance_loss_clip": 0.92941672, "balance_loss_mlp": 1.02640963, "epoch": 0.8467504358804786, "flos": 19241260738560.0, "grad_norm": 1.8931096557462974, "language_loss": 0.76709664, "learning_rate": 2.412086263019939e-07, "loss": 0.78907514, "num_input_tokens_seen": 152257110, "step": 7042, "time_per_iteration": 2.682583808898926 }, { "auxiliary_loss_clip": 0.01163186, "auxiliary_loss_mlp": 0.01023828, "balance_loss_clip": 1.04866278, "balance_loss_mlp": 1.01743817, "epoch": 0.8468706787711177, "flos": 21324115710720.0, "grad_norm": 1.4977946885192988, "language_loss": 0.7972517, "learning_rate": 2.408378978453276e-07, "loss": 0.81912178, "num_input_tokens_seen": 152277230, "step": 7043, "time_per_iteration": 3.470402956008911 }, { "auxiliary_loss_clip": 0.01062787, "auxiliary_loss_mlp": 0.01001637, "balance_loss_clip": 0.97124612, "balance_loss_mlp": 0.99996775, "epoch": 0.8469909216617567, "flos": 64877439058560.0, "grad_norm": 0.8149062835577128, "language_loss": 0.63982129, "learning_rate": 2.404674362521533e-07, "loss": 0.66046554, "num_input_tokens_seen": 152335725, "step": 7044, "time_per_iteration": 4.1399595737457275 }, { "auxiliary_loss_clip": 0.01163749, "auxiliary_loss_mlp": 0.01022272, "balance_loss_clip": 1.01067543, "balance_loss_mlp": 1.01514924, "epoch": 0.8471111645523959, "flos": 19280583152640.0, "grad_norm": 2.313800957400373, "language_loss": 0.75035578, "learning_rate": 2.4009724157866997e-07, "loss": 0.77221596, "num_input_tokens_seen": 152352785, "step": 7045, "time_per_iteration": 2.6512868404388428 }, { "auxiliary_loss_clip": 0.01165148, "auxiliary_loss_mlp": 0.01020448, "balance_loss_clip": 1.04774618, "balance_loss_mlp": 1.01384044, "epoch": 0.8472314074430349, "flos": 22015826893440.0, "grad_norm": 3.220475189946779, "language_loss": 0.76249242, "learning_rate": 2.3972731388103564e-07, "loss": 0.78434837, "num_input_tokens_seen": 152371265, "step": 7046, "time_per_iteration": 2.5723791122436523 }, { "auxiliary_loss_clip": 0.01065682, "auxiliary_loss_mlp": 0.01000896, "balance_loss_clip": 0.82452846, "balance_loss_mlp": 0.99933398, "epoch": 0.847351650333674, "flos": 57882580243200.0, "grad_norm": 1.13152417204504, "language_loss": 0.62326765, "learning_rate": 2.393576532153687e-07, "loss": 0.64393342, "num_input_tokens_seen": 152435050, "step": 7047, "time_per_iteration": 3.4309065341949463 }, { "auxiliary_loss_clip": 0.01060378, "auxiliary_loss_mlp": 0.01001432, "balance_loss_clip": 0.97238255, "balance_loss_mlp": 0.99988234, "epoch": 0.8474718932243132, "flos": 41284238313600.0, "grad_norm": 0.9315963032920705, "language_loss": 0.57866561, "learning_rate": 2.389882596377453e-07, "loss": 0.5992837, "num_input_tokens_seen": 152489315, "step": 7048, "time_per_iteration": 3.3943445682525635 }, { "auxiliary_loss_clip": 0.01164499, "auxiliary_loss_mlp": 0.01026844, "balance_loss_clip": 1.04596472, "balance_loss_mlp": 1.02001047, "epoch": 0.8475921361149522, "flos": 38180906974080.0, "grad_norm": 1.6746379547653911, "language_loss": 0.76014113, "learning_rate": 2.386191332042031e-07, "loss": 0.78205454, "num_input_tokens_seen": 152511210, "step": 7049, "time_per_iteration": 2.6763916015625 }, { "auxiliary_loss_clip": 0.01170974, "auxiliary_loss_mlp": 0.0102778, "balance_loss_clip": 1.04936361, "balance_loss_mlp": 1.02109814, "epoch": 0.8477123790055913, "flos": 25375054723200.0, "grad_norm": 4.490137689617844, "language_loss": 0.73160934, "learning_rate": 2.3825027397073794e-07, "loss": 0.7535969, "num_input_tokens_seen": 152531685, "step": 7050, "time_per_iteration": 2.5964298248291016 }, { "auxiliary_loss_clip": 0.01164163, "auxiliary_loss_mlp": 0.01028734, "balance_loss_clip": 1.0116564, "balance_loss_mlp": 1.02186775, "epoch": 0.8478326218962304, "flos": 30225185389440.0, "grad_norm": 2.3853480502777114, "language_loss": 0.66555536, "learning_rate": 2.3788168199330515e-07, "loss": 0.68748426, "num_input_tokens_seen": 152553245, "step": 7051, "time_per_iteration": 2.6246073246002197 }, { "auxiliary_loss_clip": 0.011455, "auxiliary_loss_mlp": 0.01027752, "balance_loss_clip": 0.96563566, "balance_loss_mlp": 1.0207485, "epoch": 0.8479528647868695, "flos": 38213800853760.0, "grad_norm": 1.587776593257799, "language_loss": 0.72415131, "learning_rate": 2.3751335732782074e-07, "loss": 0.74588388, "num_input_tokens_seen": 152574505, "step": 7052, "time_per_iteration": 2.729092597961426 }, { "auxiliary_loss_clip": 0.01165671, "auxiliary_loss_mlp": 0.01025538, "balance_loss_clip": 1.012398, "balance_loss_mlp": 1.01798606, "epoch": 0.8480731076775085, "flos": 20957790856320.0, "grad_norm": 5.576795259408118, "language_loss": 0.79582298, "learning_rate": 2.371453000301582e-07, "loss": 0.81773502, "num_input_tokens_seen": 152593190, "step": 7053, "time_per_iteration": 2.578615427017212 }, { "auxiliary_loss_clip": 0.0116096, "auxiliary_loss_mlp": 0.01027122, "balance_loss_clip": 0.9332608, "balance_loss_mlp": 1.02020526, "epoch": 0.8481933505681477, "flos": 32596510487040.0, "grad_norm": 1.7608110758927256, "language_loss": 0.74199665, "learning_rate": 2.3677751015615222e-07, "loss": 0.76387751, "num_input_tokens_seen": 152615265, "step": 7054, "time_per_iteration": 2.7559070587158203 }, { "auxiliary_loss_clip": 0.01150118, "auxiliary_loss_mlp": 0.01029194, "balance_loss_clip": 0.96741658, "balance_loss_mlp": 1.02213693, "epoch": 0.8483135934587868, "flos": 20741177888640.0, "grad_norm": 1.7928345718867456, "language_loss": 0.85180163, "learning_rate": 2.3640998776159593e-07, "loss": 0.87359476, "num_input_tokens_seen": 152632770, "step": 7055, "time_per_iteration": 2.597360849380493 }, { "auxiliary_loss_clip": 0.0116571, "auxiliary_loss_mlp": 0.01020522, "balance_loss_clip": 0.97277069, "balance_loss_mlp": 1.01434684, "epoch": 0.8484338363494258, "flos": 21653057485440.0, "grad_norm": 1.6940952795190216, "language_loss": 0.81192613, "learning_rate": 2.3604273290224253e-07, "loss": 0.83378845, "num_input_tokens_seen": 152653485, "step": 7056, "time_per_iteration": 2.6592462062835693 }, { "auxiliary_loss_clip": 0.01164649, "auxiliary_loss_mlp": 0.01026922, "balance_loss_clip": 0.97263277, "balance_loss_mlp": 1.01909637, "epoch": 0.848554079240065, "flos": 15013964926080.0, "grad_norm": 1.8443905491302748, "language_loss": 0.74378705, "learning_rate": 2.356757456338039e-07, "loss": 0.76570278, "num_input_tokens_seen": 152670970, "step": 7057, "time_per_iteration": 2.6229827404022217 }, { "auxiliary_loss_clip": 0.01062387, "auxiliary_loss_mlp": 0.00999472, "balance_loss_clip": 0.93608725, "balance_loss_mlp": 0.99798203, "epoch": 0.848674322130704, "flos": 68060453742720.0, "grad_norm": 2.0582119719908905, "language_loss": 0.59123123, "learning_rate": 2.3530902601195147e-07, "loss": 0.61184978, "num_input_tokens_seen": 152739460, "step": 7058, "time_per_iteration": 3.2879021167755127 }, { "auxiliary_loss_clip": 0.01166659, "auxiliary_loss_mlp": 0.01027504, "balance_loss_clip": 1.01058877, "balance_loss_mlp": 1.0200479, "epoch": 0.8487945650213431, "flos": 18475788977280.0, "grad_norm": 2.0726051808935497, "language_loss": 0.79156566, "learning_rate": 2.34942574092317e-07, "loss": 0.81350732, "num_input_tokens_seen": 152754710, "step": 7059, "time_per_iteration": 2.657559633255005 }, { "auxiliary_loss_clip": 0.01170843, "auxiliary_loss_mlp": 0.01024898, "balance_loss_clip": 1.01012814, "balance_loss_mlp": 1.01767373, "epoch": 0.8489148079119821, "flos": 23473189405440.0, "grad_norm": 1.8537462313035031, "language_loss": 0.76565611, "learning_rate": 2.3457638993049045e-07, "loss": 0.78761351, "num_input_tokens_seen": 152772700, "step": 7060, "time_per_iteration": 2.656602621078491 }, { "auxiliary_loss_clip": 0.01166929, "auxiliary_loss_mlp": 0.01028857, "balance_loss_clip": 0.85927773, "balance_loss_mlp": 1.02095342, "epoch": 0.8490350508026213, "flos": 19937604775680.0, "grad_norm": 2.8743783543214505, "language_loss": 0.64031518, "learning_rate": 2.3421047358202252e-07, "loss": 0.66227305, "num_input_tokens_seen": 152791550, "step": 7061, "time_per_iteration": 2.757716417312622 }, { "auxiliary_loss_clip": 0.01168862, "auxiliary_loss_mlp": 0.01026478, "balance_loss_clip": 1.01075733, "balance_loss_mlp": 1.01939678, "epoch": 0.8491552936932604, "flos": 24279958828800.0, "grad_norm": 2.560201045854158, "language_loss": 0.8300274, "learning_rate": 2.3384482510242144e-07, "loss": 0.85198081, "num_input_tokens_seen": 152809410, "step": 7062, "time_per_iteration": 3.51556396484375 }, { "auxiliary_loss_clip": 0.01168745, "auxiliary_loss_mlp": 0.0102213, "balance_loss_clip": 1.04686403, "balance_loss_mlp": 1.01493001, "epoch": 0.8492755365838994, "flos": 22522526098560.0, "grad_norm": 1.8066455056193642, "language_loss": 0.77228224, "learning_rate": 2.3347944454715575e-07, "loss": 0.794191, "num_input_tokens_seen": 152825800, "step": 7063, "time_per_iteration": 2.5792782306671143 }, { "auxiliary_loss_clip": 0.01167759, "auxiliary_loss_mlp": 0.01027944, "balance_loss_clip": 1.04687452, "balance_loss_mlp": 1.020437, "epoch": 0.8493957794745386, "flos": 26980441182720.0, "grad_norm": 2.6573867773225146, "language_loss": 0.67354417, "learning_rate": 2.331143319716542e-07, "loss": 0.69550121, "num_input_tokens_seen": 152845330, "step": 7064, "time_per_iteration": 2.617441415786743 }, { "auxiliary_loss_clip": 0.01172172, "auxiliary_loss_mlp": 0.01025422, "balance_loss_clip": 0.93478703, "balance_loss_mlp": 1.01827812, "epoch": 0.8495160223651776, "flos": 29861985018240.0, "grad_norm": 2.078038105416487, "language_loss": 0.65826756, "learning_rate": 2.3274948743130363e-07, "loss": 0.68024349, "num_input_tokens_seen": 152865165, "step": 7065, "time_per_iteration": 2.7131454944610596 }, { "auxiliary_loss_clip": 0.01166899, "auxiliary_loss_mlp": 0.01026932, "balance_loss_clip": 1.04714155, "balance_loss_mlp": 1.02016687, "epoch": 0.8496362652558167, "flos": 23075443128960.0, "grad_norm": 1.551349725105174, "language_loss": 0.79165417, "learning_rate": 2.3238491098145085e-07, "loss": 0.81359243, "num_input_tokens_seen": 152884695, "step": 7066, "time_per_iteration": 3.5046935081481934 }, { "auxiliary_loss_clip": 0.01164598, "auxiliary_loss_mlp": 0.0102762, "balance_loss_clip": 1.00952697, "balance_loss_mlp": 1.02089655, "epoch": 0.8497565081464559, "flos": 14609107756800.0, "grad_norm": 2.2730894742568544, "language_loss": 0.73027259, "learning_rate": 2.3202060267740141e-07, "loss": 0.75219476, "num_input_tokens_seen": 152902220, "step": 7067, "time_per_iteration": 2.558929920196533 }, { "auxiliary_loss_clip": 0.01156368, "auxiliary_loss_mlp": 0.01024816, "balance_loss_clip": 0.89169008, "balance_loss_mlp": 1.01819372, "epoch": 0.8498767510370949, "flos": 21136446126720.0, "grad_norm": 2.0851816834969266, "language_loss": 0.77602726, "learning_rate": 2.3165656257442044e-07, "loss": 0.79783905, "num_input_tokens_seen": 152920740, "step": 7068, "time_per_iteration": 2.730067491531372 }, { "auxiliary_loss_clip": 0.01162971, "auxiliary_loss_mlp": 0.0102664, "balance_loss_clip": 1.01063073, "balance_loss_mlp": 1.02033639, "epoch": 0.849996993927734, "flos": 23654538195840.0, "grad_norm": 1.7104776409032523, "language_loss": 0.90219837, "learning_rate": 2.31292790727734e-07, "loss": 0.92409444, "num_input_tokens_seen": 152938305, "step": 7069, "time_per_iteration": 3.9098713397979736 }, { "auxiliary_loss_clip": 0.01163049, "auxiliary_loss_mlp": 0.01026939, "balance_loss_clip": 1.04590726, "balance_loss_mlp": 1.02000451, "epoch": 0.8501172368183731, "flos": 20558069331840.0, "grad_norm": 5.7599584421087, "language_loss": 0.80090749, "learning_rate": 2.3092928719252392e-07, "loss": 0.82280737, "num_input_tokens_seen": 152956705, "step": 7070, "time_per_iteration": 3.5852839946746826 }, { "auxiliary_loss_clip": 0.01164977, "auxiliary_loss_mlp": 0.01022878, "balance_loss_clip": 1.01027036, "balance_loss_mlp": 1.01624084, "epoch": 0.8502374797090122, "flos": 22272624201600.0, "grad_norm": 1.9687119967511213, "language_loss": 0.78093457, "learning_rate": 2.3056605202393475e-07, "loss": 0.80281311, "num_input_tokens_seen": 152974265, "step": 7071, "time_per_iteration": 2.6179163455963135 }, { "auxiliary_loss_clip": 0.01156476, "auxiliary_loss_mlp": 0.01123023, "balance_loss_clip": 1.00437474, "balance_loss_mlp": 0.0, "epoch": 0.8503577225996513, "flos": 23659817495040.0, "grad_norm": 1.831885191217578, "language_loss": 0.66598797, "learning_rate": 2.3020308527706888e-07, "loss": 0.68878299, "num_input_tokens_seen": 152993680, "step": 7072, "time_per_iteration": 2.615046501159668 }, { "auxiliary_loss_clip": 0.0116586, "auxiliary_loss_mlp": 0.01024957, "balance_loss_clip": 0.96847975, "balance_loss_mlp": 1.0182724, "epoch": 0.8504779654902904, "flos": 26758513002240.0, "grad_norm": 1.7555675351769455, "language_loss": 0.88738728, "learning_rate": 2.2984038700698715e-07, "loss": 0.90929544, "num_input_tokens_seen": 153012990, "step": 7073, "time_per_iteration": 2.769052743911743 }, { "auxiliary_loss_clip": 0.01162516, "auxiliary_loss_mlp": 0.01025708, "balance_loss_clip": 1.00980568, "balance_loss_mlp": 1.01898468, "epoch": 0.8505982083809295, "flos": 26468247196800.0, "grad_norm": 1.5152389361756418, "language_loss": 0.78895694, "learning_rate": 2.2947795726871222e-07, "loss": 0.81083918, "num_input_tokens_seen": 153034015, "step": 7074, "time_per_iteration": 2.63641357421875 }, { "auxiliary_loss_clip": 0.01169719, "auxiliary_loss_mlp": 0.01122301, "balance_loss_clip": 1.0158776, "balance_loss_mlp": 0.0, "epoch": 0.8507184512715685, "flos": 20303390926080.0, "grad_norm": 2.3522982010476503, "language_loss": 0.85887456, "learning_rate": 2.2911579611722253e-07, "loss": 0.88179469, "num_input_tokens_seen": 153053160, "step": 7075, "time_per_iteration": 2.6393754482269287 }, { "auxiliary_loss_clip": 0.01159525, "auxiliary_loss_mlp": 0.01023213, "balance_loss_clip": 0.96972847, "balance_loss_mlp": 1.01659131, "epoch": 0.8508386941622077, "flos": 19025186474880.0, "grad_norm": 1.6155605263294175, "language_loss": 0.87227589, "learning_rate": 2.2875390360745905e-07, "loss": 0.89410329, "num_input_tokens_seen": 153072565, "step": 7076, "time_per_iteration": 2.5866596698760986 }, { "auxiliary_loss_clip": 0.01171407, "auxiliary_loss_mlp": 0.01034737, "balance_loss_clip": 0.93224323, "balance_loss_mlp": 1.02685118, "epoch": 0.8509589370528468, "flos": 16433405654400.0, "grad_norm": 2.5435122611309096, "language_loss": 0.77412152, "learning_rate": 2.2839227979432008e-07, "loss": 0.79618299, "num_input_tokens_seen": 153090215, "step": 7077, "time_per_iteration": 2.6365954875946045 }, { "auxiliary_loss_clip": 0.01164935, "auxiliary_loss_mlp": 0.01022083, "balance_loss_clip": 0.97051859, "balance_loss_mlp": 1.01544046, "epoch": 0.8510791799434858, "flos": 18259714713600.0, "grad_norm": 1.799747766849792, "language_loss": 0.85147786, "learning_rate": 2.2803092473266373e-07, "loss": 0.87334806, "num_input_tokens_seen": 153107740, "step": 7078, "time_per_iteration": 2.612063407897949 }, { "auxiliary_loss_clip": 0.01168991, "auxiliary_loss_mlp": 0.01026302, "balance_loss_clip": 1.04967237, "balance_loss_mlp": 1.01942968, "epoch": 0.851199422834125, "flos": 23441372933760.0, "grad_norm": 2.2842580341695853, "language_loss": 0.86632562, "learning_rate": 2.2766983847730724e-07, "loss": 0.88827848, "num_input_tokens_seen": 153127410, "step": 7079, "time_per_iteration": 2.5803277492523193 }, { "auxiliary_loss_clip": 0.01173051, "auxiliary_loss_mlp": 0.01033085, "balance_loss_clip": 0.93296456, "balance_loss_mlp": 1.02585161, "epoch": 0.851319665724764, "flos": 16289404030080.0, "grad_norm": 1.8626401786830256, "language_loss": 0.66643202, "learning_rate": 2.2730902108302663e-07, "loss": 0.68849337, "num_input_tokens_seen": 153144325, "step": 7080, "time_per_iteration": 2.648569345474243 }, { "auxiliary_loss_clip": 0.01154619, "auxiliary_loss_mlp": 0.01030703, "balance_loss_clip": 0.96678007, "balance_loss_mlp": 1.02358663, "epoch": 0.8514399086154031, "flos": 18989347680000.0, "grad_norm": 1.8508161967309873, "language_loss": 0.68553066, "learning_rate": 2.269484726045583e-07, "loss": 0.70738387, "num_input_tokens_seen": 153163240, "step": 7081, "time_per_iteration": 2.608348846435547 }, { "auxiliary_loss_clip": 0.01165668, "auxiliary_loss_mlp": 0.01022493, "balance_loss_clip": 0.93417025, "balance_loss_mlp": 1.01536977, "epoch": 0.8515601515060423, "flos": 24571194301440.0, "grad_norm": 1.6975032106181815, "language_loss": 0.7910918, "learning_rate": 2.2658819309659672e-07, "loss": 0.8129735, "num_input_tokens_seen": 153183440, "step": 7082, "time_per_iteration": 2.7241203784942627 }, { "auxiliary_loss_clip": 0.01162891, "auxiliary_loss_mlp": 0.01021366, "balance_loss_clip": 0.9742012, "balance_loss_mlp": 1.01481819, "epoch": 0.8516803943966813, "flos": 19529443555200.0, "grad_norm": 2.2204259994331754, "language_loss": 0.84665453, "learning_rate": 2.2622818261379706e-07, "loss": 0.86849713, "num_input_tokens_seen": 153200460, "step": 7083, "time_per_iteration": 2.633740186691284 }, { "auxiliary_loss_clip": 0.01161513, "auxiliary_loss_mlp": 0.01029783, "balance_loss_clip": 0.97028232, "balance_loss_mlp": 1.02212644, "epoch": 0.8518006372873204, "flos": 20265792364800.0, "grad_norm": 1.6268926764126765, "language_loss": 0.74858922, "learning_rate": 2.2586844121077142e-07, "loss": 0.77050221, "num_input_tokens_seen": 153218970, "step": 7084, "time_per_iteration": 2.6707959175109863 }, { "auxiliary_loss_clip": 0.01161783, "auxiliary_loss_mlp": 0.01026104, "balance_loss_clip": 0.89213616, "balance_loss_mlp": 1.01876092, "epoch": 0.8519208801779595, "flos": 24133227770880.0, "grad_norm": 1.647914577968033, "language_loss": 0.72030962, "learning_rate": 2.2550896894209215e-07, "loss": 0.74218845, "num_input_tokens_seen": 153238485, "step": 7085, "time_per_iteration": 2.744957447052002 }, { "auxiliary_loss_clip": 0.01073478, "auxiliary_loss_mlp": 0.01000956, "balance_loss_clip": 0.82451499, "balance_loss_mlp": 0.99931043, "epoch": 0.8520411230685986, "flos": 63035223252480.0, "grad_norm": 0.6931542673078303, "language_loss": 0.56654179, "learning_rate": 2.2514976586229184e-07, "loss": 0.58728617, "num_input_tokens_seen": 153306430, "step": 7086, "time_per_iteration": 3.524487018585205 }, { "auxiliary_loss_clip": 0.01062845, "auxiliary_loss_mlp": 0.0100206, "balance_loss_clip": 0.97158283, "balance_loss_mlp": 1.00042713, "epoch": 0.8521613659592376, "flos": 65836865283840.0, "grad_norm": 0.7686797766620697, "language_loss": 0.54722321, "learning_rate": 2.247908320258609e-07, "loss": 0.56787229, "num_input_tokens_seen": 153366520, "step": 7087, "time_per_iteration": 3.412020444869995 }, { "auxiliary_loss_clip": 0.01157313, "auxiliary_loss_mlp": 0.01022782, "balance_loss_clip": 0.89678133, "balance_loss_mlp": 1.01508713, "epoch": 0.8522816088498768, "flos": 23112323418240.0, "grad_norm": 2.395323250200306, "language_loss": 0.79616642, "learning_rate": 2.2443216748724914e-07, "loss": 0.81796736, "num_input_tokens_seen": 153387230, "step": 7088, "time_per_iteration": 3.573500394821167 }, { "auxiliary_loss_clip": 0.01170147, "auxiliary_loss_mlp": 0.01122557, "balance_loss_clip": 1.01114511, "balance_loss_mlp": 0.0, "epoch": 0.8524018517405159, "flos": 31758140073600.0, "grad_norm": 1.82869259290222, "language_loss": 0.74276221, "learning_rate": 2.2407377230086588e-07, "loss": 0.76568919, "num_input_tokens_seen": 153409585, "step": 7089, "time_per_iteration": 2.748293399810791 }, { "auxiliary_loss_clip": 0.01165232, "auxiliary_loss_mlp": 0.01026429, "balance_loss_clip": 0.93653607, "balance_loss_mlp": 1.01946449, "epoch": 0.8525220946311549, "flos": 18690318956160.0, "grad_norm": 1.8512555567284814, "language_loss": 0.83416808, "learning_rate": 2.23715646521079e-07, "loss": 0.8560847, "num_input_tokens_seen": 153427105, "step": 7090, "time_per_iteration": 2.706824779510498 }, { "auxiliary_loss_clip": 0.01169536, "auxiliary_loss_mlp": 0.01122933, "balance_loss_clip": 1.0080899, "balance_loss_mlp": 0.0, "epoch": 0.852642337521794, "flos": 21793216354560.0, "grad_norm": 2.1824391267443795, "language_loss": 0.84302038, "learning_rate": 2.2335779020221724e-07, "loss": 0.8659451, "num_input_tokens_seen": 153443725, "step": 7091, "time_per_iteration": 2.643951177597046 }, { "auxiliary_loss_clip": 0.01073371, "auxiliary_loss_mlp": 0.01001412, "balance_loss_clip": 0.98598933, "balance_loss_mlp": 0.99979037, "epoch": 0.8527625804124331, "flos": 69040132260480.0, "grad_norm": 0.8569559107436281, "language_loss": 0.56484151, "learning_rate": 2.2300020339856497e-07, "loss": 0.58558929, "num_input_tokens_seen": 153506410, "step": 7092, "time_per_iteration": 3.2400670051574707 }, { "auxiliary_loss_clip": 0.0116022, "auxiliary_loss_mlp": 0.01027398, "balance_loss_clip": 0.96959984, "balance_loss_mlp": 1.01999247, "epoch": 0.8528828233030722, "flos": 26979399688320.0, "grad_norm": 2.0880748428346414, "language_loss": 0.78242505, "learning_rate": 2.2264288616436966e-07, "loss": 0.8043012, "num_input_tokens_seen": 153526665, "step": 7093, "time_per_iteration": 3.600187301635742 }, { "auxiliary_loss_clip": 0.01160049, "auxiliary_loss_mlp": 0.01026294, "balance_loss_clip": 0.97172487, "balance_loss_mlp": 1.01909685, "epoch": 0.8530030661937112, "flos": 17487598936320.0, "grad_norm": 2.053491124720334, "language_loss": 0.72320122, "learning_rate": 2.222858385538351e-07, "loss": 0.74506468, "num_input_tokens_seen": 153543465, "step": 7094, "time_per_iteration": 2.6170742511749268 }, { "auxiliary_loss_clip": 0.01163176, "auxiliary_loss_mlp": 0.01021231, "balance_loss_clip": 1.01040351, "balance_loss_mlp": 1.01473427, "epoch": 0.8531233090843504, "flos": 22160798184960.0, "grad_norm": 1.812978620591092, "language_loss": 0.68013465, "learning_rate": 2.2192906062112527e-07, "loss": 0.70197874, "num_input_tokens_seen": 153563340, "step": 7095, "time_per_iteration": 2.744903802871704 }, { "auxiliary_loss_clip": 0.01164746, "auxiliary_loss_mlp": 0.01021323, "balance_loss_clip": 1.0457902, "balance_loss_mlp": 1.01433432, "epoch": 0.8532435519749895, "flos": 37635388145280.0, "grad_norm": 2.0042195787716914, "language_loss": 0.70611674, "learning_rate": 2.2157255242036377e-07, "loss": 0.7279774, "num_input_tokens_seen": 153587005, "step": 7096, "time_per_iteration": 4.610821485519409 }, { "auxiliary_loss_clip": 0.0115707, "auxiliary_loss_mlp": 0.01025365, "balance_loss_clip": 0.93226588, "balance_loss_mlp": 1.0190351, "epoch": 0.8533637948656285, "flos": 21398163598080.0, "grad_norm": 1.5049999964645169, "language_loss": 0.74030733, "learning_rate": 2.2121631400563135e-07, "loss": 0.76213169, "num_input_tokens_seen": 153606835, "step": 7097, "time_per_iteration": 2.7123286724090576 }, { "auxiliary_loss_clip": 0.01058191, "auxiliary_loss_mlp": 0.01001896, "balance_loss_clip": 0.97192729, "balance_loss_mlp": 1.00035846, "epoch": 0.8534840377562677, "flos": 53345122490880.0, "grad_norm": 0.7630525208965675, "language_loss": 0.53009862, "learning_rate": 2.208603454309701e-07, "loss": 0.55069947, "num_input_tokens_seen": 153664925, "step": 7098, "time_per_iteration": 3.1590769290924072 }, { "auxiliary_loss_clip": 0.01160703, "auxiliary_loss_mlp": 0.01030327, "balance_loss_clip": 0.89432895, "balance_loss_mlp": 1.02310038, "epoch": 0.8536042806469067, "flos": 20814148368000.0, "grad_norm": 1.9606203493051924, "language_loss": 0.70945072, "learning_rate": 2.2050464675037994e-07, "loss": 0.73136103, "num_input_tokens_seen": 153683550, "step": 7099, "time_per_iteration": 2.6654739379882812 }, { "auxiliary_loss_clip": 0.01166043, "auxiliary_loss_mlp": 0.0102464, "balance_loss_clip": 0.97206795, "balance_loss_mlp": 1.01788986, "epoch": 0.8537245235375458, "flos": 24681368292480.0, "grad_norm": 1.8598724207505481, "language_loss": 0.73076415, "learning_rate": 2.2014921801782016e-07, "loss": 0.75267094, "num_input_tokens_seen": 153703040, "step": 7100, "time_per_iteration": 2.645030975341797 }, { "auxiliary_loss_clip": 0.01160465, "auxiliary_loss_mlp": 0.0103052, "balance_loss_clip": 0.9659071, "balance_loss_mlp": 1.0235405, "epoch": 0.853844766428185, "flos": 24384817607040.0, "grad_norm": 1.8711134544433823, "language_loss": 0.73867983, "learning_rate": 2.1979405928720872e-07, "loss": 0.76058972, "num_input_tokens_seen": 153722695, "step": 7101, "time_per_iteration": 2.672119140625 }, { "auxiliary_loss_clip": 0.01164986, "auxiliary_loss_mlp": 0.01026686, "balance_loss_clip": 0.96966934, "balance_loss_mlp": 1.01989985, "epoch": 0.853965009318824, "flos": 20955707867520.0, "grad_norm": 1.4485364535688912, "language_loss": 0.79243016, "learning_rate": 2.1943917061242257e-07, "loss": 0.81434685, "num_input_tokens_seen": 153742550, "step": 7102, "time_per_iteration": 2.6965179443359375 }, { "auxiliary_loss_clip": 0.01170953, "auxiliary_loss_mlp": 0.01123003, "balance_loss_clip": 1.00967932, "balance_loss_mlp": 0.0, "epoch": 0.8540852522094631, "flos": 24201816791040.0, "grad_norm": 1.7113021528330352, "language_loss": 0.66355354, "learning_rate": 2.1908455204729903e-07, "loss": 0.68649304, "num_input_tokens_seen": 153761700, "step": 7103, "time_per_iteration": 2.625295400619507 }, { "auxiliary_loss_clip": 0.01162318, "auxiliary_loss_mlp": 0.01027364, "balance_loss_clip": 0.96785259, "balance_loss_mlp": 1.02025914, "epoch": 0.8542054951001022, "flos": 25082921410560.0, "grad_norm": 2.0427518289258013, "language_loss": 0.78211856, "learning_rate": 2.1873020364563265e-07, "loss": 0.8040154, "num_input_tokens_seen": 153780765, "step": 7104, "time_per_iteration": 2.679164171218872 }, { "auxiliary_loss_clip": 0.01163345, "auxiliary_loss_mlp": 0.01021599, "balance_loss_clip": 1.0095439, "balance_loss_mlp": 1.01471186, "epoch": 0.8543257379907413, "flos": 24316551809280.0, "grad_norm": 2.570557298651503, "language_loss": 0.75913262, "learning_rate": 2.183761254611789e-07, "loss": 0.78098208, "num_input_tokens_seen": 153801090, "step": 7105, "time_per_iteration": 2.6197729110717773 }, { "auxiliary_loss_clip": 0.01166415, "auxiliary_loss_mlp": 0.0102418, "balance_loss_clip": 1.01116228, "balance_loss_mlp": 1.01731646, "epoch": 0.8544459808813804, "flos": 55286630467200.0, "grad_norm": 1.957359914766101, "language_loss": 0.70300716, "learning_rate": 2.1802231754764987e-07, "loss": 0.72491312, "num_input_tokens_seen": 153826530, "step": 7106, "time_per_iteration": 3.002825975418091 }, { "auxiliary_loss_clip": 0.01164942, "auxiliary_loss_mlp": 0.01025631, "balance_loss_clip": 0.96890211, "balance_loss_mlp": 1.01821315, "epoch": 0.8545662237720195, "flos": 25776248705280.0, "grad_norm": 1.9691379054888773, "language_loss": 0.76065999, "learning_rate": 2.17668779958718e-07, "loss": 0.78256577, "num_input_tokens_seen": 153849110, "step": 7107, "time_per_iteration": 2.677988290786743 }, { "auxiliary_loss_clip": 0.01168197, "auxiliary_loss_mlp": 0.01026095, "balance_loss_clip": 1.04852986, "balance_loss_mlp": 1.01885259, "epoch": 0.8546864666626586, "flos": 11108320427520.0, "grad_norm": 2.2664716389665696, "language_loss": 0.80383611, "learning_rate": 2.1731551274801553e-07, "loss": 0.82577902, "num_input_tokens_seen": 153865550, "step": 7108, "time_per_iteration": 2.5361454486846924 }, { "auxiliary_loss_clip": 0.01169045, "auxiliary_loss_mlp": 0.01023474, "balance_loss_clip": 0.9715482, "balance_loss_mlp": 1.01682806, "epoch": 0.8548067095532976, "flos": 25520169669120.0, "grad_norm": 2.266982137985291, "language_loss": 0.61744297, "learning_rate": 2.169625159691324e-07, "loss": 0.63936818, "num_input_tokens_seen": 153885425, "step": 7109, "time_per_iteration": 2.6838104724884033 }, { "auxiliary_loss_clip": 0.01167363, "auxiliary_loss_mlp": 0.01022919, "balance_loss_clip": 0.89214635, "balance_loss_mlp": 1.01535487, "epoch": 0.8549269524439368, "flos": 24717853532160.0, "grad_norm": 2.10133825344804, "language_loss": 0.7442314, "learning_rate": 2.1660978967561784e-07, "loss": 0.76613426, "num_input_tokens_seen": 153904760, "step": 7110, "time_per_iteration": 2.7241647243499756 }, { "auxiliary_loss_clip": 0.01165629, "auxiliary_loss_mlp": 0.0102735, "balance_loss_clip": 1.04669356, "balance_loss_mlp": 1.02079618, "epoch": 0.8550471953345758, "flos": 19825599191040.0, "grad_norm": 2.3270295261386247, "language_loss": 0.78946996, "learning_rate": 2.1625733392098035e-07, "loss": 0.81139976, "num_input_tokens_seen": 153920370, "step": 7111, "time_per_iteration": 2.5623114109039307 }, { "auxiliary_loss_clip": 0.01163609, "auxiliary_loss_mlp": 0.01022803, "balance_loss_clip": 1.04551482, "balance_loss_mlp": 1.01589525, "epoch": 0.8551674382252149, "flos": 22820441500800.0, "grad_norm": 3.0146839743685097, "language_loss": 0.79528117, "learning_rate": 2.159051487586867e-07, "loss": 0.81714529, "num_input_tokens_seen": 153940500, "step": 7112, "time_per_iteration": 2.646259307861328 }, { "auxiliary_loss_clip": 0.01171904, "auxiliary_loss_mlp": 0.01026957, "balance_loss_clip": 0.97464943, "balance_loss_mlp": 1.01952696, "epoch": 0.8552876811158541, "flos": 20631255292800.0, "grad_norm": 2.1478608907573333, "language_loss": 0.72234541, "learning_rate": 2.155532342421642e-07, "loss": 0.74433398, "num_input_tokens_seen": 153958500, "step": 7113, "time_per_iteration": 2.6136178970336914 }, { "auxiliary_loss_clip": 0.01167113, "auxiliary_loss_mlp": 0.01027033, "balance_loss_clip": 1.00881362, "balance_loss_mlp": 1.01940632, "epoch": 0.8554079240064931, "flos": 23112359331840.0, "grad_norm": 1.8208457879531508, "language_loss": 0.78419662, "learning_rate": 2.1520159042479636e-07, "loss": 0.80613804, "num_input_tokens_seen": 153976790, "step": 7114, "time_per_iteration": 3.3915750980377197 }, { "auxiliary_loss_clip": 0.01168499, "auxiliary_loss_mlp": 0.01027663, "balance_loss_clip": 1.01158905, "balance_loss_mlp": 1.02054358, "epoch": 0.8555281668971322, "flos": 22128047959680.0, "grad_norm": 2.1468185766946495, "language_loss": 0.71544302, "learning_rate": 2.148502173599287e-07, "loss": 0.73740464, "num_input_tokens_seen": 153994930, "step": 7115, "time_per_iteration": 2.5973691940307617 }, { "auxiliary_loss_clip": 0.01159595, "auxiliary_loss_mlp": 0.0102463, "balance_loss_clip": 0.97081137, "balance_loss_mlp": 1.01722109, "epoch": 0.8556484097877713, "flos": 31139040234240.0, "grad_norm": 1.7305912020972654, "language_loss": 0.6540525, "learning_rate": 2.1449911510086372e-07, "loss": 0.67589474, "num_input_tokens_seen": 154014400, "step": 7116, "time_per_iteration": 2.6994831562042236 }, { "auxiliary_loss_clip": 0.01163825, "auxiliary_loss_mlp": 0.01028718, "balance_loss_clip": 1.00849986, "balance_loss_mlp": 1.02185464, "epoch": 0.8557686526784104, "flos": 24316551809280.0, "grad_norm": 1.6713707991762699, "language_loss": 0.76938224, "learning_rate": 2.141482837008628e-07, "loss": 0.79130769, "num_input_tokens_seen": 154034940, "step": 7117, "time_per_iteration": 2.6962170600891113 }, { "auxiliary_loss_clip": 0.01156887, "auxiliary_loss_mlp": 0.01025806, "balance_loss_clip": 1.00869107, "balance_loss_mlp": 1.01873732, "epoch": 0.8558888955690495, "flos": 17712723427200.0, "grad_norm": 1.711370649030336, "language_loss": 0.72094798, "learning_rate": 2.1379772321314826e-07, "loss": 0.7427749, "num_input_tokens_seen": 154052985, "step": 7118, "time_per_iteration": 2.6512696743011475 }, { "auxiliary_loss_clip": 0.01153024, "auxiliary_loss_mlp": 0.01028944, "balance_loss_clip": 0.85716784, "balance_loss_mlp": 1.02153492, "epoch": 0.8560091384596886, "flos": 19171702051200.0, "grad_norm": 1.922795423468803, "language_loss": 0.8172965, "learning_rate": 2.1344743369089802e-07, "loss": 0.83911622, "num_input_tokens_seen": 154068765, "step": 7119, "time_per_iteration": 3.726722478866577 }, { "auxiliary_loss_clip": 0.0116976, "auxiliary_loss_mlp": 0.01026713, "balance_loss_clip": 0.97542858, "balance_loss_mlp": 1.01952434, "epoch": 0.8561293813503277, "flos": 23914855036800.0, "grad_norm": 1.6819253308771485, "language_loss": 0.82078791, "learning_rate": 2.130974151872522e-07, "loss": 0.84275264, "num_input_tokens_seen": 154089100, "step": 7120, "time_per_iteration": 2.7254974842071533 }, { "auxiliary_loss_clip": 0.01168904, "auxiliary_loss_mlp": 0.01025711, "balance_loss_clip": 0.93550456, "balance_loss_mlp": 1.01868963, "epoch": 0.8562496242409667, "flos": 22529206028160.0, "grad_norm": 1.7172103641970446, "language_loss": 0.78758228, "learning_rate": 2.1274766775530773e-07, "loss": 0.80952835, "num_input_tokens_seen": 154108965, "step": 7121, "time_per_iteration": 2.6196625232696533 }, { "auxiliary_loss_clip": 0.0116861, "auxiliary_loss_mlp": 0.0102364, "balance_loss_clip": 1.04730618, "balance_loss_mlp": 1.0162375, "epoch": 0.8563698671316058, "flos": 14712745472640.0, "grad_norm": 1.8913174482146013, "language_loss": 0.79261577, "learning_rate": 2.1239819144812077e-07, "loss": 0.8145383, "num_input_tokens_seen": 154123425, "step": 7122, "time_per_iteration": 4.408765554428101 }, { "auxiliary_loss_clip": 0.01151151, "auxiliary_loss_mlp": 0.01025786, "balance_loss_clip": 0.92904705, "balance_loss_mlp": 1.01899648, "epoch": 0.856490110022245, "flos": 39167768211840.0, "grad_norm": 1.698396961687031, "language_loss": 0.69828266, "learning_rate": 2.1204898631870716e-07, "loss": 0.720052, "num_input_tokens_seen": 154148315, "step": 7123, "time_per_iteration": 2.84976863861084 }, { "auxiliary_loss_clip": 0.01165256, "auxiliary_loss_mlp": 0.01023459, "balance_loss_clip": 0.97265953, "balance_loss_mlp": 1.01634216, "epoch": 0.856610352912884, "flos": 29059345658880.0, "grad_norm": 2.0897619535602625, "language_loss": 0.75965303, "learning_rate": 2.1170005242004006e-07, "loss": 0.78154016, "num_input_tokens_seen": 154169665, "step": 7124, "time_per_iteration": 2.801532745361328 }, { "auxiliary_loss_clip": 0.01168525, "auxiliary_loss_mlp": 0.01023065, "balance_loss_clip": 0.97058594, "balance_loss_mlp": 1.01666033, "epoch": 0.8567305958035231, "flos": 23878333883520.0, "grad_norm": 1.9399835543746753, "language_loss": 0.78113067, "learning_rate": 2.1135138980505384e-07, "loss": 0.80304658, "num_input_tokens_seen": 154190335, "step": 7125, "time_per_iteration": 2.663590908050537 }, { "auxiliary_loss_clip": 0.01161898, "auxiliary_loss_mlp": 0.01023307, "balance_loss_clip": 0.97322178, "balance_loss_mlp": 1.01665199, "epoch": 0.8568508386941622, "flos": 22200120599040.0, "grad_norm": 1.8492723960334723, "language_loss": 0.72298706, "learning_rate": 2.110029985266395e-07, "loss": 0.74483913, "num_input_tokens_seen": 154210040, "step": 7126, "time_per_iteration": 2.6704955101013184 }, { "auxiliary_loss_clip": 0.01169867, "auxiliary_loss_mlp": 0.0102019, "balance_loss_clip": 0.97146213, "balance_loss_mlp": 1.01355636, "epoch": 0.8569710815848013, "flos": 17307507121920.0, "grad_norm": 1.637801906845894, "language_loss": 0.74152547, "learning_rate": 2.1065487863764787e-07, "loss": 0.76342607, "num_input_tokens_seen": 154228385, "step": 7127, "time_per_iteration": 2.701488733291626 }, { "auxiliary_loss_clip": 0.01151431, "auxiliary_loss_mlp": 0.01025319, "balance_loss_clip": 0.88937199, "balance_loss_mlp": 1.01823199, "epoch": 0.8570913244754403, "flos": 23732285184000.0, "grad_norm": 1.5027925418667094, "language_loss": 0.85636663, "learning_rate": 2.1030703019088846e-07, "loss": 0.87813413, "num_input_tokens_seen": 154249015, "step": 7128, "time_per_iteration": 2.8829166889190674 }, { "auxiliary_loss_clip": 0.01160751, "auxiliary_loss_mlp": 0.01021361, "balance_loss_clip": 1.0090816, "balance_loss_mlp": 1.01478994, "epoch": 0.8572115673660795, "flos": 20048748433920.0, "grad_norm": 2.247122411105706, "language_loss": 0.70909047, "learning_rate": 2.099594532391291e-07, "loss": 0.73091161, "num_input_tokens_seen": 154267700, "step": 7129, "time_per_iteration": 2.711308240890503 }, { "auxiliary_loss_clip": 0.01154437, "auxiliary_loss_mlp": 0.01030166, "balance_loss_clip": 1.00690317, "balance_loss_mlp": 1.02309966, "epoch": 0.8573318102567186, "flos": 27160389342720.0, "grad_norm": 1.5137661012661088, "language_loss": 0.78860199, "learning_rate": 2.0961214783509806e-07, "loss": 0.81044805, "num_input_tokens_seen": 154290580, "step": 7130, "time_per_iteration": 2.685457944869995 }, { "auxiliary_loss_clip": 0.01166893, "auxiliary_loss_mlp": 0.01031068, "balance_loss_clip": 0.97021288, "balance_loss_mlp": 1.02432036, "epoch": 0.8574520531473576, "flos": 24936585402240.0, "grad_norm": 1.597372932829824, "language_loss": 0.74777013, "learning_rate": 2.0926511403148051e-07, "loss": 0.76974976, "num_input_tokens_seen": 154309545, "step": 7131, "time_per_iteration": 2.6743557453155518 }, { "auxiliary_loss_clip": 0.01171657, "auxiliary_loss_mlp": 0.01027314, "balance_loss_clip": 0.9335472, "balance_loss_mlp": 1.02080202, "epoch": 0.8575722960379968, "flos": 18771154513920.0, "grad_norm": 1.9394967876353773, "language_loss": 0.75719965, "learning_rate": 2.0891835188092143e-07, "loss": 0.77918935, "num_input_tokens_seen": 154326545, "step": 7132, "time_per_iteration": 2.6516969203948975 }, { "auxiliary_loss_clip": 0.01171633, "auxiliary_loss_mlp": 0.01023312, "balance_loss_clip": 0.93296921, "balance_loss_mlp": 1.01683652, "epoch": 0.8576925389286358, "flos": 22200300167040.0, "grad_norm": 2.62775056793039, "language_loss": 0.81519181, "learning_rate": 2.0857186143602434e-07, "loss": 0.83714128, "num_input_tokens_seen": 154345190, "step": 7133, "time_per_iteration": 2.6607816219329834 }, { "auxiliary_loss_clip": 0.01151329, "auxiliary_loss_mlp": 0.01027494, "balance_loss_clip": 0.92861384, "balance_loss_mlp": 1.02048481, "epoch": 0.8578127818192749, "flos": 22894345733760.0, "grad_norm": 2.306074368056061, "language_loss": 0.67680478, "learning_rate": 2.0822564274935094e-07, "loss": 0.69859302, "num_input_tokens_seen": 154364615, "step": 7134, "time_per_iteration": 2.681562900543213 }, { "auxiliary_loss_clip": 0.0116511, "auxiliary_loss_mlp": 0.01024283, "balance_loss_clip": 0.97200632, "balance_loss_mlp": 1.01682925, "epoch": 0.8579330247099141, "flos": 34824839541120.0, "grad_norm": 1.7205988704405044, "language_loss": 0.66663766, "learning_rate": 2.078796958734239e-07, "loss": 0.68853164, "num_input_tokens_seen": 154387335, "step": 7135, "time_per_iteration": 2.715153932571411 }, { "auxiliary_loss_clip": 0.01167907, "auxiliary_loss_mlp": 0.01021522, "balance_loss_clip": 1.01129913, "balance_loss_mlp": 1.01434875, "epoch": 0.8580532676005531, "flos": 19755681367680.0, "grad_norm": 1.7916501810029941, "language_loss": 0.74953103, "learning_rate": 2.0753402086072124e-07, "loss": 0.77142531, "num_input_tokens_seen": 154405965, "step": 7136, "time_per_iteration": 2.5750722885131836 }, { "auxiliary_loss_clip": 0.01172192, "auxiliary_loss_mlp": 0.01029435, "balance_loss_clip": 0.8191216, "balance_loss_mlp": 1.02207065, "epoch": 0.8581735104911922, "flos": 22739318634240.0, "grad_norm": 2.932080434468456, "language_loss": 0.75165218, "learning_rate": 2.071886177636828e-07, "loss": 0.77366853, "num_input_tokens_seen": 154422750, "step": 7137, "time_per_iteration": 2.760169267654419 }, { "auxiliary_loss_clip": 0.01160949, "auxiliary_loss_mlp": 0.01023787, "balance_loss_clip": 1.00942051, "balance_loss_mlp": 1.01672673, "epoch": 0.8582937533818313, "flos": 23149131880320.0, "grad_norm": 1.9065565431500002, "language_loss": 0.83366573, "learning_rate": 2.0684348663470575e-07, "loss": 0.8555131, "num_input_tokens_seen": 154442930, "step": 7138, "time_per_iteration": 2.772260904312134 }, { "auxiliary_loss_clip": 0.01160368, "auxiliary_loss_mlp": 0.01023676, "balance_loss_clip": 0.9681856, "balance_loss_mlp": 1.01626682, "epoch": 0.8584139962724704, "flos": 19498668577920.0, "grad_norm": 1.7245259128839132, "language_loss": 0.61667854, "learning_rate": 2.0649862752614555e-07, "loss": 0.63851899, "num_input_tokens_seen": 154461640, "step": 7139, "time_per_iteration": 2.6019158363342285 }, { "auxiliary_loss_clip": 0.01065144, "auxiliary_loss_mlp": 0.01000036, "balance_loss_clip": 0.93391061, "balance_loss_mlp": 0.9983905, "epoch": 0.8585342391631094, "flos": 71276577788160.0, "grad_norm": 0.7565232392404639, "language_loss": 0.57128, "learning_rate": 2.0615404049031838e-07, "loss": 0.59193182, "num_input_tokens_seen": 154518610, "step": 7140, "time_per_iteration": 4.146923065185547 }, { "auxiliary_loss_clip": 0.0116645, "auxiliary_loss_mlp": 0.01029573, "balance_loss_clip": 1.01061678, "balance_loss_mlp": 1.02196193, "epoch": 0.8586544820537486, "flos": 10815432929280.0, "grad_norm": 3.0188668113610673, "language_loss": 0.78229821, "learning_rate": 2.0580972557949616e-07, "loss": 0.80425841, "num_input_tokens_seen": 154533700, "step": 7141, "time_per_iteration": 2.7023537158966064 }, { "auxiliary_loss_clip": 0.0106325, "auxiliary_loss_mlp": 0.01002471, "balance_loss_clip": 0.97105187, "balance_loss_mlp": 1.00074244, "epoch": 0.8587747249443877, "flos": 64811184422400.0, "grad_norm": 0.8011484214265037, "language_loss": 0.54337788, "learning_rate": 2.054656828459125e-07, "loss": 0.56403512, "num_input_tokens_seen": 154597810, "step": 7142, "time_per_iteration": 3.2080705165863037 }, { "auxiliary_loss_clip": 0.01159955, "auxiliary_loss_mlp": 0.01027352, "balance_loss_clip": 0.89508832, "balance_loss_mlp": 1.019943, "epoch": 0.8588949678350267, "flos": 26834607964800.0, "grad_norm": 1.7056819186991703, "language_loss": 0.7727145, "learning_rate": 2.051219123417578e-07, "loss": 0.79458755, "num_input_tokens_seen": 154617870, "step": 7143, "time_per_iteration": 2.749969005584717 }, { "auxiliary_loss_clip": 0.01166625, "auxiliary_loss_mlp": 0.01026214, "balance_loss_clip": 1.04614031, "balance_loss_mlp": 1.01841724, "epoch": 0.8590152107256659, "flos": 26104256726400.0, "grad_norm": 2.2030623525628803, "language_loss": 0.59820843, "learning_rate": 2.0477841411918196e-07, "loss": 0.6201368, "num_input_tokens_seen": 154637395, "step": 7144, "time_per_iteration": 2.599276542663574 }, { "auxiliary_loss_clip": 0.0116031, "auxiliary_loss_mlp": 0.01024437, "balance_loss_clip": 1.00779307, "balance_loss_mlp": 1.01771951, "epoch": 0.859135453616305, "flos": 26140885620480.0, "grad_norm": 1.7922036757595903, "language_loss": 0.74630737, "learning_rate": 2.0443518823029326e-07, "loss": 0.76815486, "num_input_tokens_seen": 154657935, "step": 7145, "time_per_iteration": 3.7544806003570557 }, { "auxiliary_loss_clip": 0.01156669, "auxiliary_loss_mlp": 0.0102609, "balance_loss_clip": 0.93267667, "balance_loss_mlp": 1.01892877, "epoch": 0.859255696506944, "flos": 12969319046400.0, "grad_norm": 2.46075534952158, "language_loss": 0.76488352, "learning_rate": 2.0409223472715854e-07, "loss": 0.7867111, "num_input_tokens_seen": 154675080, "step": 7146, "time_per_iteration": 2.679151773452759 }, { "auxiliary_loss_clip": 0.01164664, "auxiliary_loss_mlp": 0.01122289, "balance_loss_clip": 0.93277287, "balance_loss_mlp": 0.0, "epoch": 0.8593759393975832, "flos": 18475753063680.0, "grad_norm": 1.7702272233034952, "language_loss": 0.74732268, "learning_rate": 2.0374955366180434e-07, "loss": 0.77019215, "num_input_tokens_seen": 154692720, "step": 7147, "time_per_iteration": 2.66875958442688 }, { "auxiliary_loss_clip": 0.01165654, "auxiliary_loss_mlp": 0.01033866, "balance_loss_clip": 0.93103397, "balance_loss_mlp": 1.02656722, "epoch": 0.8594961822882222, "flos": 22200156512640.0, "grad_norm": 1.62548125202387, "language_loss": 0.72444129, "learning_rate": 2.034071450862147e-07, "loss": 0.74643654, "num_input_tokens_seen": 154710190, "step": 7148, "time_per_iteration": 3.5956056118011475 }, { "auxiliary_loss_clip": 0.01163395, "auxiliary_loss_mlp": 0.01029994, "balance_loss_clip": 0.96772647, "balance_loss_mlp": 1.02198887, "epoch": 0.8596164251788613, "flos": 23294749616640.0, "grad_norm": 1.6788841730503468, "language_loss": 0.76965332, "learning_rate": 2.030650090523327e-07, "loss": 0.79158717, "num_input_tokens_seen": 154729380, "step": 7149, "time_per_iteration": 3.5570380687713623 }, { "auxiliary_loss_clip": 0.01156624, "auxiliary_loss_mlp": 0.01025207, "balance_loss_clip": 0.92935854, "balance_loss_mlp": 1.01845717, "epoch": 0.8597366680695004, "flos": 31649905416960.0, "grad_norm": 1.5906432203935992, "language_loss": 0.59158313, "learning_rate": 2.0272314561205995e-07, "loss": 0.61340153, "num_input_tokens_seen": 154749775, "step": 7150, "time_per_iteration": 2.7273998260498047 }, { "auxiliary_loss_clip": 0.01149783, "auxiliary_loss_mlp": 0.01021261, "balance_loss_clip": 0.92686409, "balance_loss_mlp": 1.01426673, "epoch": 0.8598569109601395, "flos": 21287738211840.0, "grad_norm": 2.2538066191169417, "language_loss": 0.72995245, "learning_rate": 2.023815548172567e-07, "loss": 0.75166297, "num_input_tokens_seen": 154769845, "step": 7151, "time_per_iteration": 2.7869362831115723 }, { "auxiliary_loss_clip": 0.01163685, "auxiliary_loss_mlp": 0.01023974, "balance_loss_clip": 1.00689626, "balance_loss_mlp": 1.01669002, "epoch": 0.8599771538507786, "flos": 25447809720960.0, "grad_norm": 1.5617971062777107, "language_loss": 0.66179192, "learning_rate": 2.0204023671974267e-07, "loss": 0.68366849, "num_input_tokens_seen": 154789230, "step": 7152, "time_per_iteration": 2.6545679569244385 }, { "auxiliary_loss_clip": 0.01162159, "auxiliary_loss_mlp": 0.01024843, "balance_loss_clip": 1.00844502, "balance_loss_mlp": 1.01770806, "epoch": 0.8600973967414177, "flos": 16723958768640.0, "grad_norm": 2.1252057929700072, "language_loss": 0.81085718, "learning_rate": 2.0169919137129532e-07, "loss": 0.83272719, "num_input_tokens_seen": 154807670, "step": 7153, "time_per_iteration": 2.659759044647217 }, { "auxiliary_loss_clip": 0.01168393, "auxiliary_loss_mlp": 0.0102602, "balance_loss_clip": 1.01186252, "balance_loss_mlp": 1.0184207, "epoch": 0.8602176396320568, "flos": 25227928615680.0, "grad_norm": 3.055727072129161, "language_loss": 0.71032351, "learning_rate": 2.013584188236508e-07, "loss": 0.73226762, "num_input_tokens_seen": 154825575, "step": 7154, "time_per_iteration": 2.6239116191864014 }, { "auxiliary_loss_clip": 0.01168769, "auxiliary_loss_mlp": 0.01022623, "balance_loss_clip": 1.04737115, "balance_loss_mlp": 1.01545227, "epoch": 0.8603378825226958, "flos": 20412236113920.0, "grad_norm": 1.5816829918795272, "language_loss": 0.79314965, "learning_rate": 2.0101791912850396e-07, "loss": 0.8150636, "num_input_tokens_seen": 154845115, "step": 7155, "time_per_iteration": 2.6242599487304688 }, { "auxiliary_loss_clip": 0.0116747, "auxiliary_loss_mlp": 0.01025731, "balance_loss_clip": 0.97322142, "balance_loss_mlp": 1.01817954, "epoch": 0.8604581254133349, "flos": 34930201109760.0, "grad_norm": 1.922816723987633, "language_loss": 0.63917375, "learning_rate": 2.006776923375082e-07, "loss": 0.66110575, "num_input_tokens_seen": 154866770, "step": 7156, "time_per_iteration": 2.7711844444274902 }, { "auxiliary_loss_clip": 0.01168531, "auxiliary_loss_mlp": 0.01024693, "balance_loss_clip": 1.04853284, "balance_loss_mlp": 1.01803541, "epoch": 0.860578368303974, "flos": 22596538072320.0, "grad_norm": 1.6190943305526697, "language_loss": 0.71404827, "learning_rate": 2.003377385022764e-07, "loss": 0.73598051, "num_input_tokens_seen": 154885595, "step": 7157, "time_per_iteration": 2.5789401531219482 }, { "auxiliary_loss_clip": 0.01165101, "auxiliary_loss_mlp": 0.01032369, "balance_loss_clip": 0.97101498, "balance_loss_mlp": 1.02565742, "epoch": 0.8606986111946131, "flos": 21324331192320.0, "grad_norm": 3.3984480152656427, "language_loss": 0.77121544, "learning_rate": 1.9999805767437826e-07, "loss": 0.79319018, "num_input_tokens_seen": 154904485, "step": 7158, "time_per_iteration": 2.6363699436187744 }, { "auxiliary_loss_clip": 0.01154271, "auxiliary_loss_mlp": 0.01017695, "balance_loss_clip": 0.96802449, "balance_loss_mlp": 1.01049733, "epoch": 0.8608188540852522, "flos": 28877206769280.0, "grad_norm": 1.6782689573500125, "language_loss": 0.71574515, "learning_rate": 1.9965864990534386e-07, "loss": 0.73746485, "num_input_tokens_seen": 154925010, "step": 7159, "time_per_iteration": 2.700345993041992 }, { "auxiliary_loss_clip": 0.01152499, "auxiliary_loss_mlp": 0.0102755, "balance_loss_clip": 0.92811888, "balance_loss_mlp": 1.02065396, "epoch": 0.8609390969758913, "flos": 29716187713920.0, "grad_norm": 1.589665159905405, "language_loss": 0.77457529, "learning_rate": 1.9931951524666092e-07, "loss": 0.79637575, "num_input_tokens_seen": 154946100, "step": 7160, "time_per_iteration": 2.7193312644958496 }, { "auxiliary_loss_clip": 0.01170456, "auxiliary_loss_mlp": 0.01122043, "balance_loss_clip": 1.01083624, "balance_loss_mlp": 0.0, "epoch": 0.8610593398665304, "flos": 21249349551360.0, "grad_norm": 1.672298849094127, "language_loss": 0.81292975, "learning_rate": 1.9898065374977534e-07, "loss": 0.83585477, "num_input_tokens_seen": 154966305, "step": 7161, "time_per_iteration": 2.6128122806549072 }, { "auxiliary_loss_clip": 0.01162139, "auxiliary_loss_mlp": 0.01021921, "balance_loss_clip": 0.9317013, "balance_loss_mlp": 1.01581144, "epoch": 0.8611795827571694, "flos": 14830102183680.0, "grad_norm": 1.8778178764530584, "language_loss": 0.73263568, "learning_rate": 1.9864206546609342e-07, "loss": 0.75447631, "num_input_tokens_seen": 154985145, "step": 7162, "time_per_iteration": 2.6360416412353516 }, { "auxiliary_loss_clip": 0.01162372, "auxiliary_loss_mlp": 0.01029271, "balance_loss_clip": 1.04449081, "balance_loss_mlp": 1.02218425, "epoch": 0.8612998256478086, "flos": 24243258107520.0, "grad_norm": 1.7894220985774634, "language_loss": 0.84446537, "learning_rate": 1.983037504469771e-07, "loss": 0.86638176, "num_input_tokens_seen": 155003855, "step": 7163, "time_per_iteration": 2.5955891609191895 }, { "auxiliary_loss_clip": 0.01166204, "auxiliary_loss_mlp": 0.01032596, "balance_loss_clip": 1.00994706, "balance_loss_mlp": 1.02582514, "epoch": 0.8614200685384477, "flos": 21252653602560.0, "grad_norm": 1.532344948963828, "language_loss": 0.66182649, "learning_rate": 1.9796570874374984e-07, "loss": 0.68381453, "num_input_tokens_seen": 155023960, "step": 7164, "time_per_iteration": 2.5973496437072754 }, { "auxiliary_loss_clip": 0.01165819, "auxiliary_loss_mlp": 0.01023809, "balance_loss_clip": 0.97074008, "balance_loss_mlp": 1.01709735, "epoch": 0.8615403114290867, "flos": 20007738080640.0, "grad_norm": 1.6773859765418409, "language_loss": 0.77633744, "learning_rate": 1.976279404076917e-07, "loss": 0.79823375, "num_input_tokens_seen": 155043360, "step": 7165, "time_per_iteration": 2.8281190395355225 }, { "auxiliary_loss_clip": 0.01164526, "auxiliary_loss_mlp": 0.0102186, "balance_loss_clip": 0.93580663, "balance_loss_mlp": 1.01501727, "epoch": 0.8616605543197259, "flos": 29789373674880.0, "grad_norm": 1.6331186117793872, "language_loss": 0.75938803, "learning_rate": 1.9729044549004193e-07, "loss": 0.78125191, "num_input_tokens_seen": 155064745, "step": 7166, "time_per_iteration": 3.551511764526367 }, { "auxiliary_loss_clip": 0.01164526, "auxiliary_loss_mlp": 0.01023914, "balance_loss_clip": 1.01170599, "balance_loss_mlp": 1.01682091, "epoch": 0.8617807972103649, "flos": 28911609020160.0, "grad_norm": 1.5460844711898005, "language_loss": 0.70260787, "learning_rate": 1.9695322404199822e-07, "loss": 0.72449231, "num_input_tokens_seen": 155086790, "step": 7167, "time_per_iteration": 2.6536812782287598 }, { "auxiliary_loss_clip": 0.01169141, "auxiliary_loss_mlp": 0.01030596, "balance_loss_clip": 0.97273546, "balance_loss_mlp": 1.02355051, "epoch": 0.861901040101004, "flos": 27673804391040.0, "grad_norm": 1.9047772344937972, "language_loss": 0.82390636, "learning_rate": 1.9661627611471654e-07, "loss": 0.84590375, "num_input_tokens_seen": 155106585, "step": 7168, "time_per_iteration": 2.682917594909668 }, { "auxiliary_loss_clip": 0.0117302, "auxiliary_loss_mlp": 0.01026893, "balance_loss_clip": 0.97093719, "balance_loss_mlp": 1.01947832, "epoch": 0.8620212829916432, "flos": 49748056755840.0, "grad_norm": 1.7302774670092613, "language_loss": 0.70017052, "learning_rate": 1.9627960175931246e-07, "loss": 0.72216964, "num_input_tokens_seen": 155131285, "step": 7169, "time_per_iteration": 2.9201080799102783 }, { "auxiliary_loss_clip": 0.01165546, "auxiliary_loss_mlp": 0.01023479, "balance_loss_clip": 1.01093733, "balance_loss_mlp": 1.0165168, "epoch": 0.8621415258822822, "flos": 21138672769920.0, "grad_norm": 1.7865899820090545, "language_loss": 0.74264491, "learning_rate": 1.9594320102685847e-07, "loss": 0.76453519, "num_input_tokens_seen": 155150555, "step": 7170, "time_per_iteration": 2.6201670169830322 }, { "auxiliary_loss_clip": 0.01153502, "auxiliary_loss_mlp": 0.01122368, "balance_loss_clip": 0.96864426, "balance_loss_mlp": 0.0, "epoch": 0.8622617687729213, "flos": 21689039934720.0, "grad_norm": 1.9684303630202433, "language_loss": 0.64198202, "learning_rate": 1.956070739683864e-07, "loss": 0.66474068, "num_input_tokens_seen": 155169890, "step": 7171, "time_per_iteration": 3.6135640144348145 }, { "auxiliary_loss_clip": 0.01144415, "auxiliary_loss_mlp": 0.01019086, "balance_loss_clip": 0.9287163, "balance_loss_mlp": 1.01289606, "epoch": 0.8623820116635604, "flos": 26250592734720.0, "grad_norm": 1.5186185415682445, "language_loss": 0.74158061, "learning_rate": 1.9527122063488678e-07, "loss": 0.76321566, "num_input_tokens_seen": 155191005, "step": 7172, "time_per_iteration": 2.9316577911376953 }, { "auxiliary_loss_clip": 0.01158667, "auxiliary_loss_mlp": 0.01026732, "balance_loss_clip": 0.96585727, "balance_loss_mlp": 1.01964533, "epoch": 0.8625022545541995, "flos": 19647554451840.0, "grad_norm": 1.587130863687591, "language_loss": 0.80374932, "learning_rate": 1.9493564107730755e-07, "loss": 0.82560337, "num_input_tokens_seen": 155211005, "step": 7173, "time_per_iteration": 2.720879554748535 }, { "auxiliary_loss_clip": 0.01155756, "auxiliary_loss_mlp": 0.01021659, "balance_loss_clip": 0.96744478, "balance_loss_mlp": 1.01477718, "epoch": 0.8626224974448385, "flos": 21908382336000.0, "grad_norm": 1.9002740991069982, "language_loss": 0.61035538, "learning_rate": 1.9460033534655684e-07, "loss": 0.63212949, "num_input_tokens_seen": 155230365, "step": 7174, "time_per_iteration": 3.7881367206573486 }, { "auxiliary_loss_clip": 0.0115491, "auxiliary_loss_mlp": 0.01022811, "balance_loss_clip": 0.96593034, "balance_loss_mlp": 1.01589406, "epoch": 0.8627427403354777, "flos": 23331198942720.0, "grad_norm": 1.6094896024782854, "language_loss": 0.84358859, "learning_rate": 1.9426530349349978e-07, "loss": 0.86536574, "num_input_tokens_seen": 155250815, "step": 7175, "time_per_iteration": 2.7006008625030518 }, { "auxiliary_loss_clip": 0.01163317, "auxiliary_loss_mlp": 0.01122042, "balance_loss_clip": 1.00850403, "balance_loss_mlp": 0.0, "epoch": 0.8628629832261168, "flos": 16362877299840.0, "grad_norm": 1.8240489485898232, "language_loss": 0.64829218, "learning_rate": 1.9393054556896038e-07, "loss": 0.6711458, "num_input_tokens_seen": 155268515, "step": 7176, "time_per_iteration": 2.6849162578582764 }, { "auxiliary_loss_clip": 0.01160565, "auxiliary_loss_mlp": 0.01025248, "balance_loss_clip": 0.93106943, "balance_loss_mlp": 1.01786304, "epoch": 0.8629832261167558, "flos": 28103941756800.0, "grad_norm": 2.383340822685766, "language_loss": 0.68849814, "learning_rate": 1.9359606162372133e-07, "loss": 0.7103563, "num_input_tokens_seen": 155290120, "step": 7177, "time_per_iteration": 2.7326676845550537 }, { "auxiliary_loss_clip": 0.0116533, "auxiliary_loss_mlp": 0.01022031, "balance_loss_clip": 1.04783881, "balance_loss_mlp": 1.01528704, "epoch": 0.863103469007395, "flos": 20230061310720.0, "grad_norm": 1.537332665940809, "language_loss": 0.70602727, "learning_rate": 1.9326185170852293e-07, "loss": 0.72790086, "num_input_tokens_seen": 155309085, "step": 7178, "time_per_iteration": 2.574253559112549 }, { "auxiliary_loss_clip": 0.0116271, "auxiliary_loss_mlp": 0.01033021, "balance_loss_clip": 1.00975132, "balance_loss_mlp": 1.02599978, "epoch": 0.863223711898034, "flos": 24498547044480.0, "grad_norm": 1.833020576755294, "language_loss": 0.72259599, "learning_rate": 1.9292791587406598e-07, "loss": 0.74455333, "num_input_tokens_seen": 155327945, "step": 7179, "time_per_iteration": 2.6439807415008545 }, { "auxiliary_loss_clip": 0.01160466, "auxiliary_loss_mlp": 0.01122401, "balance_loss_clip": 1.00650048, "balance_loss_mlp": 0.0, "epoch": 0.8633439547886731, "flos": 17675376261120.0, "grad_norm": 1.837462177627161, "language_loss": 0.86877763, "learning_rate": 1.9259425417100661e-07, "loss": 0.89160633, "num_input_tokens_seen": 155344060, "step": 7180, "time_per_iteration": 2.6147074699401855 }, { "auxiliary_loss_clip": 0.01150008, "auxiliary_loss_mlp": 0.01025788, "balance_loss_clip": 0.84832937, "balance_loss_mlp": 1.01884985, "epoch": 0.8634641976793123, "flos": 12895055677440.0, "grad_norm": 2.2674974063116373, "language_loss": 0.74526429, "learning_rate": 1.9226086664996234e-07, "loss": 0.76702225, "num_input_tokens_seen": 155362305, "step": 7181, "time_per_iteration": 2.73209810256958 }, { "auxiliary_loss_clip": 0.01166139, "auxiliary_loss_mlp": 0.01021926, "balance_loss_clip": 0.97180849, "balance_loss_mlp": 1.01478565, "epoch": 0.8635844405699513, "flos": 23878980328320.0, "grad_norm": 1.820449017154609, "language_loss": 0.7426002, "learning_rate": 1.9192775336150712e-07, "loss": 0.76448083, "num_input_tokens_seen": 155382605, "step": 7182, "time_per_iteration": 2.6411514282226562 }, { "auxiliary_loss_clip": 0.01059948, "auxiliary_loss_mlp": 0.01001469, "balance_loss_clip": 0.97191471, "balance_loss_mlp": 0.99984783, "epoch": 0.8637046834605904, "flos": 60453387521280.0, "grad_norm": 0.7711765361911551, "language_loss": 0.56365705, "learning_rate": 1.915949143561739e-07, "loss": 0.58427119, "num_input_tokens_seen": 155437280, "step": 7183, "time_per_iteration": 3.1608800888061523 }, { "auxiliary_loss_clip": 0.01167774, "auxiliary_loss_mlp": 0.0102949, "balance_loss_clip": 1.01170397, "balance_loss_mlp": 1.0224719, "epoch": 0.8638249263512295, "flos": 20558751690240.0, "grad_norm": 1.5764859833176026, "language_loss": 0.78006017, "learning_rate": 1.9126234968445498e-07, "loss": 0.80203283, "num_input_tokens_seen": 155456970, "step": 7184, "time_per_iteration": 2.590207576751709 }, { "auxiliary_loss_clip": 0.01165623, "auxiliary_loss_mlp": 0.01024257, "balance_loss_clip": 1.04627848, "balance_loss_mlp": 1.01695836, "epoch": 0.8639451692418686, "flos": 26615768353920.0, "grad_norm": 1.4203600309997735, "language_loss": 0.67843401, "learning_rate": 1.9093005939679884e-07, "loss": 0.70033288, "num_input_tokens_seen": 155478925, "step": 7185, "time_per_iteration": 2.7471678256988525 }, { "auxiliary_loss_clip": 0.01165831, "auxiliary_loss_mlp": 0.0101892, "balance_loss_clip": 1.01082969, "balance_loss_mlp": 1.01188326, "epoch": 0.8640654121325076, "flos": 15122450977920.0, "grad_norm": 1.8195741296823815, "language_loss": 0.76365221, "learning_rate": 1.9059804354361452e-07, "loss": 0.78549969, "num_input_tokens_seen": 155496700, "step": 7186, "time_per_iteration": 2.5941617488861084 }, { "auxiliary_loss_clip": 0.01154418, "auxiliary_loss_mlp": 0.01027619, "balance_loss_clip": 0.96683729, "balance_loss_mlp": 1.02032363, "epoch": 0.8641856550231467, "flos": 31869068250240.0, "grad_norm": 1.533098452529569, "language_loss": 0.70473641, "learning_rate": 1.902663021752684e-07, "loss": 0.72655672, "num_input_tokens_seen": 155518130, "step": 7187, "time_per_iteration": 2.7897074222564697 }, { "auxiliary_loss_clip": 0.01170383, "auxiliary_loss_mlp": 0.01026115, "balance_loss_clip": 1.04911804, "balance_loss_mlp": 1.01909375, "epoch": 0.8643058979137859, "flos": 14976545932800.0, "grad_norm": 2.28643715037375, "language_loss": 0.82261586, "learning_rate": 1.8993483534208556e-07, "loss": 0.84458083, "num_input_tokens_seen": 155537040, "step": 7188, "time_per_iteration": 2.576186180114746 }, { "auxiliary_loss_clip": 0.01159128, "auxiliary_loss_mlp": 0.01023204, "balance_loss_clip": 0.97140008, "balance_loss_mlp": 1.01623046, "epoch": 0.8644261408044249, "flos": 13115726881920.0, "grad_norm": 2.502962751282139, "language_loss": 0.74693799, "learning_rate": 1.8960364309434884e-07, "loss": 0.7687614, "num_input_tokens_seen": 155554535, "step": 7189, "time_per_iteration": 2.6198651790618896 }, { "auxiliary_loss_clip": 0.01152253, "auxiliary_loss_mlp": 0.01121692, "balance_loss_clip": 0.85262835, "balance_loss_mlp": 0.0, "epoch": 0.864546383695064, "flos": 20850920916480.0, "grad_norm": 1.675587438055266, "language_loss": 0.78175718, "learning_rate": 1.8927272548229967e-07, "loss": 0.80449665, "num_input_tokens_seen": 155574225, "step": 7190, "time_per_iteration": 2.7384541034698486 }, { "auxiliary_loss_clip": 0.01161528, "auxiliary_loss_mlp": 0.01031457, "balance_loss_clip": 0.89462471, "balance_loss_mlp": 1.02420354, "epoch": 0.8646666265857031, "flos": 21324582587520.0, "grad_norm": 1.5846917393297395, "language_loss": 0.82990795, "learning_rate": 1.8894208255613876e-07, "loss": 0.85183775, "num_input_tokens_seen": 155593540, "step": 7191, "time_per_iteration": 2.68365216255188 }, { "auxiliary_loss_clip": 0.01166239, "auxiliary_loss_mlp": 0.01025821, "balance_loss_clip": 1.04699028, "balance_loss_mlp": 1.01931262, "epoch": 0.8647868694763422, "flos": 19750833031680.0, "grad_norm": 1.8404798022363273, "language_loss": 0.77540278, "learning_rate": 1.8861171436602397e-07, "loss": 0.79732347, "num_input_tokens_seen": 155610655, "step": 7192, "time_per_iteration": 3.3656539916992188 }, { "auxiliary_loss_clip": 0.01168817, "auxiliary_loss_mlp": 0.01023332, "balance_loss_clip": 1.00994503, "balance_loss_mlp": 1.01635504, "epoch": 0.8649071123669813, "flos": 26176760328960.0, "grad_norm": 2.1155150638384557, "language_loss": 0.80401397, "learning_rate": 1.882816209620719e-07, "loss": 0.82593548, "num_input_tokens_seen": 155627365, "step": 7193, "time_per_iteration": 2.612881898880005 }, { "auxiliary_loss_clip": 0.01173459, "auxiliary_loss_mlp": 0.01028935, "balance_loss_clip": 0.9762947, "balance_loss_mlp": 1.02131748, "epoch": 0.8650273552576204, "flos": 20302888135680.0, "grad_norm": 1.8531618558737075, "language_loss": 0.76774353, "learning_rate": 1.8795180239435738e-07, "loss": 0.7897675, "num_input_tokens_seen": 155646220, "step": 7194, "time_per_iteration": 2.6234967708587646 }, { "auxiliary_loss_clip": 0.01171247, "auxiliary_loss_mlp": 0.01031889, "balance_loss_clip": 0.9720701, "balance_loss_mlp": 1.02470064, "epoch": 0.8651475981482595, "flos": 23951088881280.0, "grad_norm": 3.150229896697412, "language_loss": 0.7564981, "learning_rate": 1.8762225871291348e-07, "loss": 0.77852947, "num_input_tokens_seen": 155662095, "step": 7195, "time_per_iteration": 2.6234281063079834 }, { "auxiliary_loss_clip": 0.01166532, "auxiliary_loss_mlp": 0.01122339, "balance_loss_clip": 1.04691911, "balance_loss_mlp": 0.0, "epoch": 0.8652678410388985, "flos": 21684622561920.0, "grad_norm": 1.6082601357793935, "language_loss": 0.80842805, "learning_rate": 1.8729298996773201e-07, "loss": 0.83131683, "num_input_tokens_seen": 155680845, "step": 7196, "time_per_iteration": 3.6017541885375977 }, { "auxiliary_loss_clip": 0.01056751, "auxiliary_loss_mlp": 0.01002868, "balance_loss_clip": 0.97023773, "balance_loss_mlp": 1.00122249, "epoch": 0.8653880839295377, "flos": 65224660855680.0, "grad_norm": 0.8704535792367987, "language_loss": 0.6098938, "learning_rate": 1.8696399620876301e-07, "loss": 0.63048995, "num_input_tokens_seen": 155737875, "step": 7197, "time_per_iteration": 3.1566081047058105 }, { "auxiliary_loss_clip": 0.0115453, "auxiliary_loss_mlp": 0.01028315, "balance_loss_clip": 0.92694783, "balance_loss_mlp": 1.02076292, "epoch": 0.8655083268201768, "flos": 17749172753280.0, "grad_norm": 1.905574890825056, "language_loss": 0.78900814, "learning_rate": 1.866352774859141e-07, "loss": 0.81083655, "num_input_tokens_seen": 155753100, "step": 7198, "time_per_iteration": 2.689258337020874 }, { "auxiliary_loss_clip": 0.01164206, "auxiliary_loss_mlp": 0.01024757, "balance_loss_clip": 0.92969042, "balance_loss_mlp": 1.01778913, "epoch": 0.8656285697108158, "flos": 20703974376960.0, "grad_norm": 2.2730254025978684, "language_loss": 0.69013977, "learning_rate": 1.8630683384905188e-07, "loss": 0.7120294, "num_input_tokens_seen": 155772430, "step": 7199, "time_per_iteration": 2.729212999343872 }, { "auxiliary_loss_clip": 0.01166201, "auxiliary_loss_mlp": 0.01122799, "balance_loss_clip": 1.04808331, "balance_loss_mlp": 0.0, "epoch": 0.865748812601455, "flos": 18653833716480.0, "grad_norm": 1.7382100271190541, "language_loss": 0.88269973, "learning_rate": 1.8597866534800045e-07, "loss": 0.9055897, "num_input_tokens_seen": 155787545, "step": 7200, "time_per_iteration": 3.4791629314422607 }, { "auxiliary_loss_clip": 0.01168813, "auxiliary_loss_mlp": 0.01122156, "balance_loss_clip": 1.01071715, "balance_loss_mlp": 0.0, "epoch": 0.865869055492094, "flos": 70652554807680.0, "grad_norm": 1.7668532520030544, "language_loss": 0.74474764, "learning_rate": 1.8565077203254398e-07, "loss": 0.76765734, "num_input_tokens_seen": 155813005, "step": 7201, "time_per_iteration": 3.0835001468658447 }, { "auxiliary_loss_clip": 0.011622, "auxiliary_loss_mlp": 0.01028309, "balance_loss_clip": 0.93397653, "balance_loss_mlp": 1.02065623, "epoch": 0.8659892983827331, "flos": 17383961220480.0, "grad_norm": 2.3829045924425403, "language_loss": 0.72370422, "learning_rate": 1.8532315395242203e-07, "loss": 0.74560928, "num_input_tokens_seen": 155829455, "step": 7202, "time_per_iteration": 2.66111421585083 }, { "auxiliary_loss_clip": 0.01166932, "auxiliary_loss_mlp": 0.01021605, "balance_loss_clip": 0.93229347, "balance_loss_mlp": 1.01446176, "epoch": 0.8661095412733723, "flos": 17895221452800.0, "grad_norm": 1.9880348293219967, "language_loss": 0.72362578, "learning_rate": 1.849958111573353e-07, "loss": 0.74551117, "num_input_tokens_seen": 155848060, "step": 7203, "time_per_iteration": 2.7036378383636475 }, { "auxiliary_loss_clip": 0.01164008, "auxiliary_loss_mlp": 0.01021226, "balance_loss_clip": 1.04689467, "balance_loss_mlp": 1.0144074, "epoch": 0.8662297841640113, "flos": 18224163227520.0, "grad_norm": 1.5212976495724722, "language_loss": 0.6388526, "learning_rate": 1.8466874369694074e-07, "loss": 0.66070497, "num_input_tokens_seen": 155865755, "step": 7204, "time_per_iteration": 2.6182334423065186 }, { "auxiliary_loss_clip": 0.01160658, "auxiliary_loss_mlp": 0.01024939, "balance_loss_clip": 0.92871261, "balance_loss_mlp": 1.01832032, "epoch": 0.8663500270546504, "flos": 16362159027840.0, "grad_norm": 2.3343550218128697, "language_loss": 0.70032895, "learning_rate": 1.843419516208542e-07, "loss": 0.7221849, "num_input_tokens_seen": 155882680, "step": 7205, "time_per_iteration": 2.66542387008667 }, { "auxiliary_loss_clip": 0.01170248, "auxiliary_loss_mlp": 0.01028328, "balance_loss_clip": 1.01295972, "balance_loss_mlp": 1.02030563, "epoch": 0.8664702699452895, "flos": 17894431353600.0, "grad_norm": 1.9559929407807846, "language_loss": 0.79533154, "learning_rate": 1.8401543497865047e-07, "loss": 0.81731731, "num_input_tokens_seen": 155900680, "step": 7206, "time_per_iteration": 2.588927745819092 }, { "auxiliary_loss_clip": 0.01169389, "auxiliary_loss_mlp": 0.01122081, "balance_loss_clip": 1.00988007, "balance_loss_mlp": 0.0, "epoch": 0.8665905128359286, "flos": 30736373794560.0, "grad_norm": 2.354143824657775, "language_loss": 0.64166528, "learning_rate": 1.836891938198608e-07, "loss": 0.66457999, "num_input_tokens_seen": 155921105, "step": 7207, "time_per_iteration": 2.7166168689727783 }, { "auxiliary_loss_clip": 0.01164979, "auxiliary_loss_mlp": 0.01024526, "balance_loss_clip": 0.97189677, "balance_loss_mlp": 1.01697373, "epoch": 0.8667107557265676, "flos": 18656419495680.0, "grad_norm": 2.414713357104854, "language_loss": 0.71101749, "learning_rate": 1.8336322819397677e-07, "loss": 0.73291254, "num_input_tokens_seen": 155938640, "step": 7208, "time_per_iteration": 2.6721155643463135 }, { "auxiliary_loss_clip": 0.01167642, "auxiliary_loss_mlp": 0.01025494, "balance_loss_clip": 0.93254811, "balance_loss_mlp": 1.01830244, "epoch": 0.8668309986172068, "flos": 20083725302400.0, "grad_norm": 1.8756154784890728, "language_loss": 0.62241143, "learning_rate": 1.8303753815044654e-07, "loss": 0.64434278, "num_input_tokens_seen": 155957945, "step": 7209, "time_per_iteration": 2.6381940841674805 }, { "auxiliary_loss_clip": 0.01173558, "auxiliary_loss_mlp": 0.01025699, "balance_loss_clip": 0.97108305, "balance_loss_mlp": 1.0180012, "epoch": 0.8669512415078459, "flos": 21615099788160.0, "grad_norm": 3.776643417486623, "language_loss": 0.70073164, "learning_rate": 1.827121237386773e-07, "loss": 0.7227242, "num_input_tokens_seen": 155975390, "step": 7210, "time_per_iteration": 2.687178611755371 }, { "auxiliary_loss_clip": 0.01165687, "auxiliary_loss_mlp": 0.01026565, "balance_loss_clip": 0.97030109, "balance_loss_mlp": 1.01904845, "epoch": 0.8670714843984849, "flos": 17703601372800.0, "grad_norm": 2.477705238212912, "language_loss": 0.74751091, "learning_rate": 1.8238698500803374e-07, "loss": 0.76943338, "num_input_tokens_seen": 155988155, "step": 7211, "time_per_iteration": 2.6079068183898926 }, { "auxiliary_loss_clip": 0.01062585, "auxiliary_loss_mlp": 0.01003948, "balance_loss_clip": 0.97042531, "balance_loss_mlp": 1.00237465, "epoch": 0.8671917272891241, "flos": 60705483125760.0, "grad_norm": 0.7172644939825148, "language_loss": 0.56321871, "learning_rate": 1.820621220078391e-07, "loss": 0.583884, "num_input_tokens_seen": 156052065, "step": 7212, "time_per_iteration": 3.2450313568115234 }, { "auxiliary_loss_clip": 0.01163787, "auxiliary_loss_mlp": 0.01025237, "balance_loss_clip": 1.04500508, "balance_loss_mlp": 1.01822114, "epoch": 0.8673119701797631, "flos": 20451881750400.0, "grad_norm": 1.6071264729608863, "language_loss": 0.67727768, "learning_rate": 1.8173753478737553e-07, "loss": 0.69916797, "num_input_tokens_seen": 156072500, "step": 7213, "time_per_iteration": 2.5913898944854736 }, { "auxiliary_loss_clip": 0.01166712, "auxiliary_loss_mlp": 0.01020133, "balance_loss_clip": 1.04773128, "balance_loss_mlp": 1.01328754, "epoch": 0.8674322130704022, "flos": 19647410797440.0, "grad_norm": 1.9144959631518657, "language_loss": 0.79845858, "learning_rate": 1.8141322339588205e-07, "loss": 0.82032704, "num_input_tokens_seen": 156089840, "step": 7214, "time_per_iteration": 2.540355920791626 }, { "auxiliary_loss_clip": 0.01165948, "auxiliary_loss_mlp": 0.01022903, "balance_loss_clip": 1.0486505, "balance_loss_mlp": 1.01614356, "epoch": 0.8675524559610414, "flos": 26025001367040.0, "grad_norm": 2.421072442230197, "language_loss": 0.69939983, "learning_rate": 1.810891878825569e-07, "loss": 0.72128832, "num_input_tokens_seen": 156109815, "step": 7215, "time_per_iteration": 2.6127421855926514 }, { "auxiliary_loss_clip": 0.0116204, "auxiliary_loss_mlp": 0.01029483, "balance_loss_clip": 0.96902394, "balance_loss_mlp": 1.02215719, "epoch": 0.8676726988516804, "flos": 15049444584960.0, "grad_norm": 1.8782233638193424, "language_loss": 0.71469116, "learning_rate": 1.8076542829655561e-07, "loss": 0.73660648, "num_input_tokens_seen": 156128620, "step": 7216, "time_per_iteration": 2.5750253200531006 }, { "auxiliary_loss_clip": 0.01168038, "auxiliary_loss_mlp": 0.01024763, "balance_loss_clip": 0.97303045, "balance_loss_mlp": 1.01724708, "epoch": 0.8677929417423195, "flos": 16288111140480.0, "grad_norm": 3.475011364859161, "language_loss": 0.79367554, "learning_rate": 1.8044194468699203e-07, "loss": 0.81560349, "num_input_tokens_seen": 156145930, "step": 7217, "time_per_iteration": 2.622039556503296 }, { "auxiliary_loss_clip": 0.01163698, "auxiliary_loss_mlp": 0.01025771, "balance_loss_clip": 0.97273946, "balance_loss_mlp": 1.01856184, "epoch": 0.8679131846329585, "flos": 18844160906880.0, "grad_norm": 4.027660452961855, "language_loss": 0.75807959, "learning_rate": 1.8011873710293912e-07, "loss": 0.77997434, "num_input_tokens_seen": 156164435, "step": 7218, "time_per_iteration": 3.4913363456726074 }, { "auxiliary_loss_clip": 0.01164962, "auxiliary_loss_mlp": 0.01026437, "balance_loss_clip": 1.01036358, "balance_loss_mlp": 1.0195291, "epoch": 0.8680334275235977, "flos": 33620718890880.0, "grad_norm": 1.9724445444361414, "language_loss": 0.69468141, "learning_rate": 1.7979580559342677e-07, "loss": 0.71659541, "num_input_tokens_seen": 156185165, "step": 7219, "time_per_iteration": 2.7514469623565674 }, { "auxiliary_loss_clip": 0.01161243, "auxiliary_loss_mlp": 0.01027772, "balance_loss_clip": 0.97045231, "balance_loss_mlp": 1.02039266, "epoch": 0.8681536704142367, "flos": 24681152810880.0, "grad_norm": 1.690605112772439, "language_loss": 0.66423243, "learning_rate": 1.7947315020744358e-07, "loss": 0.6861226, "num_input_tokens_seen": 156206260, "step": 7220, "time_per_iteration": 2.687032699584961 }, { "auxiliary_loss_clip": 0.01163142, "auxiliary_loss_mlp": 0.01026368, "balance_loss_clip": 0.97003543, "balance_loss_mlp": 1.01985574, "epoch": 0.8682739133048758, "flos": 20011042131840.0, "grad_norm": 2.1454465297165117, "language_loss": 0.80463696, "learning_rate": 1.7915077099393594e-07, "loss": 0.82653207, "num_input_tokens_seen": 156222860, "step": 7221, "time_per_iteration": 2.6113786697387695 }, { "auxiliary_loss_clip": 0.01165607, "auxiliary_loss_mlp": 0.01027298, "balance_loss_clip": 1.00759816, "balance_loss_mlp": 1.01965356, "epoch": 0.868394156195515, "flos": 16654759217280.0, "grad_norm": 2.8653109231524154, "language_loss": 0.73009503, "learning_rate": 1.788286680018083e-07, "loss": 0.75202411, "num_input_tokens_seen": 156241570, "step": 7222, "time_per_iteration": 3.5320358276367188 }, { "auxiliary_loss_clip": 0.01167805, "auxiliary_loss_mlp": 0.01026498, "balance_loss_clip": 0.97133255, "balance_loss_mlp": 1.02013516, "epoch": 0.868514399086154, "flos": 28001381448960.0, "grad_norm": 1.4995711095864195, "language_loss": 0.72463471, "learning_rate": 1.7850684127992443e-07, "loss": 0.74657774, "num_input_tokens_seen": 156261315, "step": 7223, "time_per_iteration": 2.6795120239257812 }, { "auxiliary_loss_clip": 0.01163992, "auxiliary_loss_mlp": 0.01024103, "balance_loss_clip": 0.93320286, "balance_loss_mlp": 1.01745689, "epoch": 0.8686346419767931, "flos": 20084587228800.0, "grad_norm": 1.8832922703018216, "language_loss": 0.7054683, "learning_rate": 1.7818529087710378e-07, "loss": 0.72734928, "num_input_tokens_seen": 156281670, "step": 7224, "time_per_iteration": 2.6641812324523926 }, { "auxiliary_loss_clip": 0.01166253, "auxiliary_loss_mlp": 0.01122575, "balance_loss_clip": 1.00967932, "balance_loss_mlp": 0.0, "epoch": 0.8687548848674322, "flos": 18223516782720.0, "grad_norm": 1.677715940360335, "language_loss": 0.83805716, "learning_rate": 1.7786401684212637e-07, "loss": 0.86094546, "num_input_tokens_seen": 156300500, "step": 7225, "time_per_iteration": 2.57597279548645 }, { "auxiliary_loss_clip": 0.01069313, "auxiliary_loss_mlp": 0.01003949, "balance_loss_clip": 0.89920568, "balance_loss_mlp": 1.00237513, "epoch": 0.8688751277580713, "flos": 70457885049600.0, "grad_norm": 0.8437211050439556, "language_loss": 0.56013441, "learning_rate": 1.7754301922372883e-07, "loss": 0.58086705, "num_input_tokens_seen": 156350145, "step": 7226, "time_per_iteration": 4.931986331939697 }, { "auxiliary_loss_clip": 0.01168205, "auxiliary_loss_mlp": 0.01028156, "balance_loss_clip": 0.85530198, "balance_loss_mlp": 1.02090478, "epoch": 0.8689953706487104, "flos": 26906788344960.0, "grad_norm": 2.320494503022459, "language_loss": 0.80625165, "learning_rate": 1.7722229807060617e-07, "loss": 0.82821524, "num_input_tokens_seen": 156368725, "step": 7227, "time_per_iteration": 2.818499803543091 }, { "auxiliary_loss_clip": 0.01152002, "auxiliary_loss_mlp": 0.01028747, "balance_loss_clip": 0.92941713, "balance_loss_mlp": 1.02158868, "epoch": 0.8691156135393495, "flos": 34637385438720.0, "grad_norm": 1.9724912482431285, "language_loss": 0.81600434, "learning_rate": 1.7690185343141172e-07, "loss": 0.83781183, "num_input_tokens_seen": 156388640, "step": 7228, "time_per_iteration": 2.830515146255493 }, { "auxiliary_loss_clip": 0.01161785, "auxiliary_loss_mlp": 0.01019436, "balance_loss_clip": 0.96899474, "balance_loss_mlp": 1.01284397, "epoch": 0.8692358564299886, "flos": 18989814556800.0, "grad_norm": 2.4406781112564837, "language_loss": 0.69636482, "learning_rate": 1.7658168535475615e-07, "loss": 0.71817708, "num_input_tokens_seen": 156406425, "step": 7229, "time_per_iteration": 2.655310869216919 }, { "auxiliary_loss_clip": 0.01169873, "auxiliary_loss_mlp": 0.01027577, "balance_loss_clip": 0.97370255, "balance_loss_mlp": 1.02048099, "epoch": 0.8693560993206276, "flos": 30370839039360.0, "grad_norm": 4.964867023106709, "language_loss": 0.64244032, "learning_rate": 1.7626179388920948e-07, "loss": 0.66441482, "num_input_tokens_seen": 156427705, "step": 7230, "time_per_iteration": 2.8160130977630615 }, { "auxiliary_loss_clip": 0.01163987, "auxiliary_loss_mlp": 0.01122193, "balance_loss_clip": 0.97063911, "balance_loss_mlp": 0.0, "epoch": 0.8694763422112668, "flos": 27200430028800.0, "grad_norm": 1.6015821100485372, "language_loss": 0.80488449, "learning_rate": 1.7594217908329866e-07, "loss": 0.82774627, "num_input_tokens_seen": 156449890, "step": 7231, "time_per_iteration": 2.7528135776519775 }, { "auxiliary_loss_clip": 0.0115441, "auxiliary_loss_mlp": 0.01024977, "balance_loss_clip": 0.97008359, "balance_loss_mlp": 1.01846862, "epoch": 0.8695965851019059, "flos": 26139161767680.0, "grad_norm": 1.7663305407532732, "language_loss": 0.73828506, "learning_rate": 1.7562284098550895e-07, "loss": 0.76007891, "num_input_tokens_seen": 156469600, "step": 7232, "time_per_iteration": 2.6497576236724854 }, { "auxiliary_loss_clip": 0.01058321, "auxiliary_loss_mlp": 0.01003196, "balance_loss_clip": 0.93533766, "balance_loss_mlp": 1.00163424, "epoch": 0.8697168279925449, "flos": 67332616456320.0, "grad_norm": 0.8531718722501772, "language_loss": 0.62256891, "learning_rate": 1.753037796442838e-07, "loss": 0.64318407, "num_input_tokens_seen": 156529040, "step": 7233, "time_per_iteration": 3.1681466102600098 }, { "auxiliary_loss_clip": 0.01165357, "auxiliary_loss_mlp": 0.01026935, "balance_loss_clip": 1.04694164, "balance_loss_mlp": 1.02027416, "epoch": 0.8698370708831841, "flos": 19718693337600.0, "grad_norm": 2.0066434699071527, "language_loss": 0.75123888, "learning_rate": 1.74984995108024e-07, "loss": 0.77316177, "num_input_tokens_seen": 156546970, "step": 7234, "time_per_iteration": 2.616074323654175 }, { "auxiliary_loss_clip": 0.01166979, "auxiliary_loss_mlp": 0.01024355, "balance_loss_clip": 1.00957692, "balance_loss_mlp": 1.01738095, "epoch": 0.8699573137738231, "flos": 12859971068160.0, "grad_norm": 2.347514205903049, "language_loss": 0.83384812, "learning_rate": 1.7466648742508981e-07, "loss": 0.85576141, "num_input_tokens_seen": 156563155, "step": 7235, "time_per_iteration": 2.578315258026123 }, { "auxiliary_loss_clip": 0.01164613, "auxiliary_loss_mlp": 0.01025614, "balance_loss_clip": 0.97413242, "balance_loss_mlp": 1.01828599, "epoch": 0.8700775566644622, "flos": 17420733768960.0, "grad_norm": 2.8560791713653666, "language_loss": 0.84547335, "learning_rate": 1.7434825664379837e-07, "loss": 0.86737561, "num_input_tokens_seen": 156581660, "step": 7236, "time_per_iteration": 2.607670783996582 }, { "auxiliary_loss_clip": 0.01164676, "auxiliary_loss_mlp": 0.01027137, "balance_loss_clip": 1.00879049, "balance_loss_mlp": 1.01969266, "epoch": 0.8701977995551013, "flos": 13735221770880.0, "grad_norm": 6.544812322861209, "language_loss": 0.86043215, "learning_rate": 1.740303028124246e-07, "loss": 0.88235033, "num_input_tokens_seen": 156597720, "step": 7237, "time_per_iteration": 2.5868642330169678 }, { "auxiliary_loss_clip": 0.01150275, "auxiliary_loss_mlp": 0.0102498, "balance_loss_clip": 0.85314608, "balance_loss_mlp": 1.01788378, "epoch": 0.8703180424457404, "flos": 30555707362560.0, "grad_norm": 1.7776824030694782, "language_loss": 0.75738227, "learning_rate": 1.7371262597920212e-07, "loss": 0.77913475, "num_input_tokens_seen": 156619780, "step": 7238, "time_per_iteration": 2.838446617126465 }, { "auxiliary_loss_clip": 0.01160605, "auxiliary_loss_mlp": 0.01028613, "balance_loss_clip": 0.89599818, "balance_loss_mlp": 1.02199411, "epoch": 0.8704382853363795, "flos": 19608986223360.0, "grad_norm": 1.5488431573391506, "language_loss": 0.76397038, "learning_rate": 1.7339522619232195e-07, "loss": 0.78586257, "num_input_tokens_seen": 156638160, "step": 7239, "time_per_iteration": 2.655745506286621 }, { "auxiliary_loss_clip": 0.01171419, "auxiliary_loss_mlp": 0.01025152, "balance_loss_clip": 0.96918535, "balance_loss_mlp": 1.01786804, "epoch": 0.8705585282270186, "flos": 26613900846720.0, "grad_norm": 1.8568156827376798, "language_loss": 0.75804764, "learning_rate": 1.730781034999338e-07, "loss": 0.78001338, "num_input_tokens_seen": 156659740, "step": 7240, "time_per_iteration": 2.7016263008117676 }, { "auxiliary_loss_clip": 0.01166104, "auxiliary_loss_mlp": 0.01022172, "balance_loss_clip": 1.05064607, "balance_loss_mlp": 1.01573813, "epoch": 0.8706787711176577, "flos": 34090465979520.0, "grad_norm": 1.8581569717462314, "language_loss": 0.73492789, "learning_rate": 1.7276125795014497e-07, "loss": 0.75681067, "num_input_tokens_seen": 156678190, "step": 7241, "time_per_iteration": 2.794764518737793 }, { "auxiliary_loss_clip": 0.01164998, "auxiliary_loss_mlp": 0.01025585, "balance_loss_clip": 0.96850133, "balance_loss_mlp": 1.01804745, "epoch": 0.8707990140082967, "flos": 14611513968000.0, "grad_norm": 2.397287453617307, "language_loss": 0.67546797, "learning_rate": 1.7244468959102054e-07, "loss": 0.69737375, "num_input_tokens_seen": 156695245, "step": 7242, "time_per_iteration": 2.6717989444732666 }, { "auxiliary_loss_clip": 0.0116328, "auxiliary_loss_mlp": 0.0102758, "balance_loss_clip": 1.0088563, "balance_loss_mlp": 1.02039123, "epoch": 0.8709192568989359, "flos": 20084156265600.0, "grad_norm": 2.1632497953724408, "language_loss": 0.84972322, "learning_rate": 1.7212839847058348e-07, "loss": 0.87163174, "num_input_tokens_seen": 156710375, "step": 7243, "time_per_iteration": 2.6001100540161133 }, { "auxiliary_loss_clip": 0.01170957, "auxiliary_loss_mlp": 0.01023123, "balance_loss_clip": 0.85519326, "balance_loss_mlp": 1.01604545, "epoch": 0.871039499789575, "flos": 16727083251840.0, "grad_norm": 1.9028634722365994, "language_loss": 0.73623818, "learning_rate": 1.718123846368147e-07, "loss": 0.75817895, "num_input_tokens_seen": 156729420, "step": 7244, "time_per_iteration": 3.545804738998413 }, { "auxiliary_loss_clip": 0.01162646, "auxiliary_loss_mlp": 0.01122165, "balance_loss_clip": 0.97091937, "balance_loss_mlp": 0.0, "epoch": 0.871159742680214, "flos": 21068790860160.0, "grad_norm": 1.6735335304419956, "language_loss": 0.71325576, "learning_rate": 1.714966481376543e-07, "loss": 0.73610389, "num_input_tokens_seen": 156746100, "step": 7245, "time_per_iteration": 2.6042284965515137 }, { "auxiliary_loss_clip": 0.01164094, "auxiliary_loss_mlp": 0.01025862, "balance_loss_clip": 1.00901747, "balance_loss_mlp": 1.01863766, "epoch": 0.8712799855708532, "flos": 28256526731520.0, "grad_norm": 1.8258816362610895, "language_loss": 0.82556343, "learning_rate": 1.7118118902099797e-07, "loss": 0.84746301, "num_input_tokens_seen": 156764185, "step": 7246, "time_per_iteration": 2.6397194862365723 }, { "auxiliary_loss_clip": 0.01165071, "auxiliary_loss_mlp": 0.01025365, "balance_loss_clip": 1.00868368, "balance_loss_mlp": 1.01842141, "epoch": 0.8714002284614922, "flos": 22236677665920.0, "grad_norm": 1.6218821129830132, "language_loss": 0.80763173, "learning_rate": 1.7086600733470146e-07, "loss": 0.82953608, "num_input_tokens_seen": 156784855, "step": 7247, "time_per_iteration": 2.627126932144165 }, { "auxiliary_loss_clip": 0.01159441, "auxiliary_loss_mlp": 0.01025851, "balance_loss_clip": 1.00799906, "balance_loss_mlp": 1.0185132, "epoch": 0.8715204713521313, "flos": 21431919404160.0, "grad_norm": 1.6607054772424086, "language_loss": 0.76955438, "learning_rate": 1.7055110312657738e-07, "loss": 0.79140735, "num_input_tokens_seen": 156804350, "step": 7248, "time_per_iteration": 2.551815986633301 }, { "auxiliary_loss_clip": 0.01155764, "auxiliary_loss_mlp": 0.01026168, "balance_loss_clip": 0.96960688, "balance_loss_mlp": 1.01917922, "epoch": 0.8716407142427703, "flos": 23440439180160.0, "grad_norm": 2.49760370608401, "language_loss": 0.74573445, "learning_rate": 1.702364764443962e-07, "loss": 0.76755381, "num_input_tokens_seen": 156823425, "step": 7249, "time_per_iteration": 3.6033735275268555 }, { "auxiliary_loss_clip": 0.01152445, "auxiliary_loss_mlp": 0.01026933, "balance_loss_clip": 0.8532787, "balance_loss_mlp": 1.01998949, "epoch": 0.8717609571334095, "flos": 27958683156480.0, "grad_norm": 2.1070698850598824, "language_loss": 0.72604692, "learning_rate": 1.6992212733588685e-07, "loss": 0.74784076, "num_input_tokens_seen": 156843090, "step": 7250, "time_per_iteration": 2.7496461868286133 }, { "auxiliary_loss_clip": 0.01156757, "auxiliary_loss_mlp": 0.01027777, "balance_loss_clip": 0.96860451, "balance_loss_mlp": 1.0203743, "epoch": 0.8718812000240486, "flos": 25479482538240.0, "grad_norm": 1.8197176152980863, "language_loss": 0.74905169, "learning_rate": 1.6960805584873538e-07, "loss": 0.77089703, "num_input_tokens_seen": 156861090, "step": 7251, "time_per_iteration": 2.6783876419067383 }, { "auxiliary_loss_clip": 0.01160598, "auxiliary_loss_mlp": 0.01026118, "balance_loss_clip": 0.89159876, "balance_loss_mlp": 1.01933765, "epoch": 0.8720014429146876, "flos": 23403056100480.0, "grad_norm": 2.628937656088704, "language_loss": 0.78322965, "learning_rate": 1.6929426203058684e-07, "loss": 0.80509686, "num_input_tokens_seen": 156881515, "step": 7252, "time_per_iteration": 3.6893744468688965 }, { "auxiliary_loss_clip": 0.01169138, "auxiliary_loss_mlp": 0.01122995, "balance_loss_clip": 1.04590034, "balance_loss_mlp": 0.0, "epoch": 0.8721216858053268, "flos": 24352821567360.0, "grad_norm": 1.941957440504079, "language_loss": 0.79777247, "learning_rate": 1.689807459290431e-07, "loss": 0.82069379, "num_input_tokens_seen": 156900170, "step": 7253, "time_per_iteration": 3.4692001342773438 }, { "auxiliary_loss_clip": 0.01167459, "auxiliary_loss_mlp": 0.0102363, "balance_loss_clip": 0.9733367, "balance_loss_mlp": 1.01643014, "epoch": 0.8722419286959658, "flos": 33869687034240.0, "grad_norm": 2.900892926469116, "language_loss": 0.70742738, "learning_rate": 1.6866750759166437e-07, "loss": 0.72933823, "num_input_tokens_seen": 156920150, "step": 7254, "time_per_iteration": 2.7873783111572266 }, { "auxiliary_loss_clip": 0.01155828, "auxiliary_loss_mlp": 0.01027892, "balance_loss_clip": 0.92915642, "balance_loss_mlp": 1.02087915, "epoch": 0.8723621715866049, "flos": 18369385914240.0, "grad_norm": 2.2064867318233823, "language_loss": 0.77046353, "learning_rate": 1.6835454706596865e-07, "loss": 0.7923007, "num_input_tokens_seen": 156937980, "step": 7255, "time_per_iteration": 2.6296138763427734 }, { "auxiliary_loss_clip": 0.0116803, "auxiliary_loss_mlp": 0.01023666, "balance_loss_clip": 1.04893422, "balance_loss_mlp": 1.01713037, "epoch": 0.8724824144772441, "flos": 22013348855040.0, "grad_norm": 1.5808612342739634, "language_loss": 0.73967868, "learning_rate": 1.680418643994317e-07, "loss": 0.76159561, "num_input_tokens_seen": 156956550, "step": 7256, "time_per_iteration": 2.6200127601623535 }, { "auxiliary_loss_clip": 0.01058969, "auxiliary_loss_mlp": 0.0100202, "balance_loss_clip": 1.00811362, "balance_loss_mlp": 1.00039887, "epoch": 0.8726026573678831, "flos": 66698720213760.0, "grad_norm": 0.9537793900978883, "language_loss": 0.64549017, "learning_rate": 1.6772945963948738e-07, "loss": 0.66610008, "num_input_tokens_seen": 157014715, "step": 7257, "time_per_iteration": 3.2081480026245117 }, { "auxiliary_loss_clip": 0.01164308, "auxiliary_loss_mlp": 0.01028463, "balance_loss_clip": 0.97265249, "balance_loss_mlp": 1.02118254, "epoch": 0.8727229002585222, "flos": 13370908078080.0, "grad_norm": 2.0199473763494096, "language_loss": 0.76797277, "learning_rate": 1.6741733283352733e-07, "loss": 0.78990048, "num_input_tokens_seen": 157032320, "step": 7258, "time_per_iteration": 2.6157612800598145 }, { "auxiliary_loss_clip": 0.01167703, "auxiliary_loss_mlp": 0.01028711, "balance_loss_clip": 0.89519918, "balance_loss_mlp": 1.02118266, "epoch": 0.8728431431491613, "flos": 21796987282560.0, "grad_norm": 1.4749748646673149, "language_loss": 0.83780682, "learning_rate": 1.6710548402890102e-07, "loss": 0.85977101, "num_input_tokens_seen": 157052845, "step": 7259, "time_per_iteration": 2.6919400691986084 }, { "auxiliary_loss_clip": 0.01170476, "auxiliary_loss_mlp": 0.01025208, "balance_loss_clip": 1.04869521, "balance_loss_mlp": 1.01747131, "epoch": 0.8729633860398004, "flos": 36173823742080.0, "grad_norm": 2.793412101821808, "language_loss": 0.6661731, "learning_rate": 1.6679391327291527e-07, "loss": 0.6881299, "num_input_tokens_seen": 157074050, "step": 7260, "time_per_iteration": 2.6910345554351807 }, { "auxiliary_loss_clip": 0.01159295, "auxiliary_loss_mlp": 0.01027767, "balance_loss_clip": 0.96602982, "balance_loss_mlp": 1.02039969, "epoch": 0.8730836289304394, "flos": 16359680989440.0, "grad_norm": 2.744543137124113, "language_loss": 0.68449485, "learning_rate": 1.6648262061283492e-07, "loss": 0.70636547, "num_input_tokens_seen": 157089350, "step": 7261, "time_per_iteration": 2.6285183429718018 }, { "auxiliary_loss_clip": 0.01161908, "auxiliary_loss_mlp": 0.01021853, "balance_loss_clip": 0.92958403, "balance_loss_mlp": 1.01449823, "epoch": 0.8732038718210786, "flos": 21215126868480.0, "grad_norm": 2.010682438008242, "language_loss": 0.73549753, "learning_rate": 1.6617160609588353e-07, "loss": 0.75733519, "num_input_tokens_seen": 157108525, "step": 7262, "time_per_iteration": 2.668895959854126 }, { "auxiliary_loss_clip": 0.0116652, "auxiliary_loss_mlp": 0.01023295, "balance_loss_clip": 0.96976632, "balance_loss_mlp": 1.01638067, "epoch": 0.8733241147117177, "flos": 16610696208000.0, "grad_norm": 2.050640548561514, "language_loss": 0.71806121, "learning_rate": 1.6586086976924163e-07, "loss": 0.73995936, "num_input_tokens_seen": 157124025, "step": 7263, "time_per_iteration": 2.641554117202759 }, { "auxiliary_loss_clip": 0.01164297, "auxiliary_loss_mlp": 0.01022374, "balance_loss_clip": 1.00754952, "balance_loss_mlp": 1.01588631, "epoch": 0.8734443576023567, "flos": 20193935207040.0, "grad_norm": 2.2111491782846726, "language_loss": 0.78408372, "learning_rate": 1.6555041168004747e-07, "loss": 0.8059504, "num_input_tokens_seen": 157143345, "step": 7264, "time_per_iteration": 2.63655161857605 }, { "auxiliary_loss_clip": 0.01159227, "auxiliary_loss_mlp": 0.01027092, "balance_loss_clip": 0.97035396, "balance_loss_mlp": 1.02018654, "epoch": 0.8735646004929959, "flos": 18041162411520.0, "grad_norm": 3.749491845322591, "language_loss": 0.69103098, "learning_rate": 1.6524023187539715e-07, "loss": 0.71289414, "num_input_tokens_seen": 157161630, "step": 7265, "time_per_iteration": 2.727937698364258 }, { "auxiliary_loss_clip": 0.01162205, "auxiliary_loss_mlp": 0.01028019, "balance_loss_clip": 0.96909761, "balance_loss_mlp": 1.02114081, "epoch": 0.873684843383635, "flos": 20262344659200.0, "grad_norm": 1.80114311967172, "language_loss": 0.75187773, "learning_rate": 1.649303304023446e-07, "loss": 0.77378005, "num_input_tokens_seen": 157181385, "step": 7266, "time_per_iteration": 2.631484270095825 }, { "auxiliary_loss_clip": 0.01155922, "auxiliary_loss_mlp": 0.01027308, "balance_loss_clip": 0.93373752, "balance_loss_mlp": 1.02057862, "epoch": 0.873805086274274, "flos": 16947287579520.0, "grad_norm": 1.8730945595961792, "language_loss": 0.78558958, "learning_rate": 1.6462070730790246e-07, "loss": 0.80742192, "num_input_tokens_seen": 157200545, "step": 7267, "time_per_iteration": 2.6950933933258057 }, { "auxiliary_loss_clip": 0.0115441, "auxiliary_loss_mlp": 0.01024835, "balance_loss_clip": 0.96690404, "balance_loss_mlp": 1.01814461, "epoch": 0.8739253291649132, "flos": 18041270152320.0, "grad_norm": 2.5737610316502795, "language_loss": 0.78472471, "learning_rate": 1.6431136263903912e-07, "loss": 0.80651718, "num_input_tokens_seen": 157219545, "step": 7268, "time_per_iteration": 2.596085786819458 }, { "auxiliary_loss_clip": 0.01166478, "auxiliary_loss_mlp": 0.01122364, "balance_loss_clip": 1.00679243, "balance_loss_mlp": 0.0, "epoch": 0.8740455720555522, "flos": 21325085377920.0, "grad_norm": 1.809032858387605, "language_loss": 0.73382163, "learning_rate": 1.6400229644268282e-07, "loss": 0.75671005, "num_input_tokens_seen": 157237900, "step": 7269, "time_per_iteration": 2.6682913303375244 }, { "auxiliary_loss_clip": 0.0116466, "auxiliary_loss_mlp": 0.01031446, "balance_loss_clip": 0.93916947, "balance_loss_mlp": 1.02398682, "epoch": 0.8741658149461913, "flos": 15158684822400.0, "grad_norm": 1.9564825007942768, "language_loss": 0.80703962, "learning_rate": 1.6369350876571852e-07, "loss": 0.82900071, "num_input_tokens_seen": 157256055, "step": 7270, "time_per_iteration": 3.5806994438171387 }, { "auxiliary_loss_clip": 0.01156399, "auxiliary_loss_mlp": 0.01021276, "balance_loss_clip": 0.89170444, "balance_loss_mlp": 1.01454329, "epoch": 0.8742860578368304, "flos": 23039855729280.0, "grad_norm": 2.8822770250370517, "language_loss": 0.81433499, "learning_rate": 1.6338499965498874e-07, "loss": 0.83611172, "num_input_tokens_seen": 157274785, "step": 7271, "time_per_iteration": 2.7220616340637207 }, { "auxiliary_loss_clip": 0.01157893, "auxiliary_loss_mlp": 0.01024003, "balance_loss_clip": 0.93026757, "balance_loss_mlp": 1.01680303, "epoch": 0.8744063007274695, "flos": 28145347159680.0, "grad_norm": 1.4123433666810181, "language_loss": 0.77277684, "learning_rate": 1.630767691572943e-07, "loss": 0.79459584, "num_input_tokens_seen": 157294805, "step": 7272, "time_per_iteration": 2.9162447452545166 }, { "auxiliary_loss_clip": 0.01067379, "auxiliary_loss_mlp": 0.01001827, "balance_loss_clip": 0.93511641, "balance_loss_mlp": 1.0002532, "epoch": 0.8745265436181086, "flos": 64034076654720.0, "grad_norm": 0.7429162324851339, "language_loss": 0.53607708, "learning_rate": 1.6276881731939306e-07, "loss": 0.55676919, "num_input_tokens_seen": 157356695, "step": 7273, "time_per_iteration": 3.284099578857422 }, { "auxiliary_loss_clip": 0.01163565, "auxiliary_loss_mlp": 0.01026803, "balance_loss_clip": 1.01059294, "balance_loss_mlp": 1.02013361, "epoch": 0.8746467865087477, "flos": 28658618553600.0, "grad_norm": 1.7156054626714785, "language_loss": 0.7523495, "learning_rate": 1.6246114418800193e-07, "loss": 0.77425313, "num_input_tokens_seen": 157376975, "step": 7274, "time_per_iteration": 3.4691905975341797 }, { "auxiliary_loss_clip": 0.01158321, "auxiliary_loss_mlp": 0.01027527, "balance_loss_clip": 1.00757062, "balance_loss_mlp": 1.01998365, "epoch": 0.8747670293993868, "flos": 23985850268160.0, "grad_norm": 1.7058332348926641, "language_loss": 0.76507366, "learning_rate": 1.6215374980979423e-07, "loss": 0.78693211, "num_input_tokens_seen": 157397385, "step": 7275, "time_per_iteration": 2.7087225914001465 }, { "auxiliary_loss_clip": 0.01163857, "auxiliary_loss_mlp": 0.01030006, "balance_loss_clip": 1.01162672, "balance_loss_mlp": 1.02307987, "epoch": 0.8748872722900258, "flos": 45221624478720.0, "grad_norm": 3.198249636475753, "language_loss": 0.68817174, "learning_rate": 1.6184663423140133e-07, "loss": 0.71011043, "num_input_tokens_seen": 157417685, "step": 7276, "time_per_iteration": 2.778054714202881 }, { "auxiliary_loss_clip": 0.011647, "auxiliary_loss_mlp": 0.01029063, "balance_loss_clip": 0.89505804, "balance_loss_mlp": 1.02214313, "epoch": 0.875007515180665, "flos": 19754280737280.0, "grad_norm": 1.86648778687186, "language_loss": 0.64384842, "learning_rate": 1.615397974994126e-07, "loss": 0.66578615, "num_input_tokens_seen": 157435490, "step": 7277, "time_per_iteration": 2.7849748134613037 }, { "auxiliary_loss_clip": 0.01164288, "auxiliary_loss_mlp": 0.0102556, "balance_loss_clip": 1.04781294, "balance_loss_mlp": 1.01918805, "epoch": 0.875127758071304, "flos": 22710734386560.0, "grad_norm": 1.817888274262644, "language_loss": 0.80765992, "learning_rate": 1.6123323966037438e-07, "loss": 0.82955843, "num_input_tokens_seen": 157454010, "step": 7278, "time_per_iteration": 3.4806723594665527 }, { "auxiliary_loss_clip": 0.01168375, "auxiliary_loss_mlp": 0.01025351, "balance_loss_clip": 1.05015028, "balance_loss_mlp": 1.0183835, "epoch": 0.8752480009619431, "flos": 23403846199680.0, "grad_norm": 1.9562628212029012, "language_loss": 0.78525966, "learning_rate": 1.6092696076079216e-07, "loss": 0.80719692, "num_input_tokens_seen": 157472385, "step": 7279, "time_per_iteration": 2.576110363006592 }, { "auxiliary_loss_clip": 0.0114953, "auxiliary_loss_mlp": 0.01023102, "balance_loss_clip": 0.93070763, "balance_loss_mlp": 1.01667356, "epoch": 0.8753682438525822, "flos": 26213101914240.0, "grad_norm": 1.982877076897686, "language_loss": 0.73534149, "learning_rate": 1.6062096084712785e-07, "loss": 0.7570678, "num_input_tokens_seen": 157493735, "step": 7280, "time_per_iteration": 2.6824564933776855 }, { "auxiliary_loss_clip": 0.01151641, "auxiliary_loss_mlp": 0.01122542, "balance_loss_clip": 0.96710408, "balance_loss_mlp": 0.0, "epoch": 0.8754884867432213, "flos": 23326745656320.0, "grad_norm": 1.7469506262738672, "language_loss": 0.70408499, "learning_rate": 1.6031523996580098e-07, "loss": 0.72682679, "num_input_tokens_seen": 157511295, "step": 7281, "time_per_iteration": 2.6920571327209473 }, { "auxiliary_loss_clip": 0.01170752, "auxiliary_loss_mlp": 0.01024733, "balance_loss_clip": 0.9332149, "balance_loss_mlp": 1.01735377, "epoch": 0.8756087296338604, "flos": 12495226412160.0, "grad_norm": 1.9377385721917508, "language_loss": 0.66080785, "learning_rate": 1.6000979816318981e-07, "loss": 0.68276274, "num_input_tokens_seen": 157529760, "step": 7282, "time_per_iteration": 2.6650655269622803 }, { "auxiliary_loss_clip": 0.01164008, "auxiliary_loss_mlp": 0.01027467, "balance_loss_clip": 1.01074922, "balance_loss_mlp": 1.02063584, "epoch": 0.8757289725244994, "flos": 18952898353920.0, "grad_norm": 2.200800944213822, "language_loss": 0.74880999, "learning_rate": 1.5970463548562886e-07, "loss": 0.77072471, "num_input_tokens_seen": 157548915, "step": 7283, "time_per_iteration": 2.561868906021118 }, { "auxiliary_loss_clip": 0.01160068, "auxiliary_loss_mlp": 0.01024044, "balance_loss_clip": 0.9704994, "balance_loss_mlp": 1.01726389, "epoch": 0.8758492154151386, "flos": 25265958140160.0, "grad_norm": 1.6123880946828224, "language_loss": 0.71315956, "learning_rate": 1.5939975197941192e-07, "loss": 0.73500061, "num_input_tokens_seen": 157570570, "step": 7284, "time_per_iteration": 2.7231192588806152 }, { "auxiliary_loss_clip": 0.01067531, "auxiliary_loss_mlp": 0.01000782, "balance_loss_clip": 0.9351747, "balance_loss_mlp": 0.99914879, "epoch": 0.8759694583057777, "flos": 65571664193280.0, "grad_norm": 0.8318979335580811, "language_loss": 0.53437197, "learning_rate": 1.5909514769078892e-07, "loss": 0.55505514, "num_input_tokens_seen": 157635675, "step": 7285, "time_per_iteration": 3.282475709915161 }, { "auxiliary_loss_clip": 0.01156253, "auxiliary_loss_mlp": 0.01026027, "balance_loss_clip": 0.93570876, "balance_loss_mlp": 1.01918721, "epoch": 0.8760897011964167, "flos": 25446193608960.0, "grad_norm": 1.5388334472871716, "language_loss": 0.77592081, "learning_rate": 1.5879082266596867e-07, "loss": 0.79774356, "num_input_tokens_seen": 157657015, "step": 7286, "time_per_iteration": 2.7477834224700928 }, { "auxiliary_loss_clip": 0.01153681, "auxiliary_loss_mlp": 0.01023635, "balance_loss_clip": 0.96499807, "balance_loss_mlp": 1.01705456, "epoch": 0.8762099440870559, "flos": 28984830894720.0, "grad_norm": 1.887407710668317, "language_loss": 0.72109413, "learning_rate": 1.5848677695111645e-07, "loss": 0.74286723, "num_input_tokens_seen": 157678615, "step": 7287, "time_per_iteration": 2.7090113162994385 }, { "auxiliary_loss_clip": 0.01170467, "auxiliary_loss_mlp": 0.01023884, "balance_loss_clip": 0.93433511, "balance_loss_mlp": 1.01630187, "epoch": 0.8763301869776949, "flos": 21609461352960.0, "grad_norm": 2.5355701416598952, "language_loss": 0.69570905, "learning_rate": 1.5818301059235562e-07, "loss": 0.71765256, "num_input_tokens_seen": 157693790, "step": 7288, "time_per_iteration": 2.654324531555176 }, { "auxiliary_loss_clip": 0.0116786, "auxiliary_loss_mlp": 0.01027254, "balance_loss_clip": 0.97373486, "balance_loss_mlp": 1.01987505, "epoch": 0.876450429868334, "flos": 24644416176000.0, "grad_norm": 1.4425252552950623, "language_loss": 0.81589544, "learning_rate": 1.578795236357684e-07, "loss": 0.83784652, "num_input_tokens_seen": 157715255, "step": 7289, "time_per_iteration": 2.6672723293304443 }, { "auxiliary_loss_clip": 0.01163871, "auxiliary_loss_mlp": 0.01025317, "balance_loss_clip": 0.97247446, "balance_loss_mlp": 1.0182271, "epoch": 0.8765706727589732, "flos": 20260046188800.0, "grad_norm": 2.0372135365668975, "language_loss": 0.85584223, "learning_rate": 1.5757631612739218e-07, "loss": 0.87773418, "num_input_tokens_seen": 157728800, "step": 7290, "time_per_iteration": 2.6690449714660645 }, { "auxiliary_loss_clip": 0.01058901, "auxiliary_loss_mlp": 0.01000837, "balance_loss_clip": 1.00815344, "balance_loss_mlp": 0.99922746, "epoch": 0.8766909156496122, "flos": 71371165276800.0, "grad_norm": 0.9075183513866212, "language_loss": 0.61484528, "learning_rate": 1.572733881132242e-07, "loss": 0.63544273, "num_input_tokens_seen": 157789445, "step": 7291, "time_per_iteration": 3.2070860862731934 }, { "auxiliary_loss_clip": 0.01062905, "auxiliary_loss_mlp": 0.01002228, "balance_loss_clip": 0.89856255, "balance_loss_mlp": 1.00067842, "epoch": 0.8768111585402513, "flos": 69523490603520.0, "grad_norm": 0.7847211933498809, "language_loss": 0.58590555, "learning_rate": 1.5697073963921814e-07, "loss": 0.60655689, "num_input_tokens_seen": 157848685, "step": 7292, "time_per_iteration": 3.1366055011749268 }, { "auxiliary_loss_clip": 0.01166705, "auxiliary_loss_mlp": 0.01023745, "balance_loss_clip": 1.01034403, "balance_loss_mlp": 1.01639318, "epoch": 0.8769314014308904, "flos": 18838558385280.0, "grad_norm": 2.1586460276996036, "language_loss": 0.84813941, "learning_rate": 1.566683707512857e-07, "loss": 0.87004393, "num_input_tokens_seen": 157866360, "step": 7293, "time_per_iteration": 2.6361336708068848 }, { "auxiliary_loss_clip": 0.01158695, "auxiliary_loss_mlp": 0.01027929, "balance_loss_clip": 0.97012305, "balance_loss_mlp": 1.02106285, "epoch": 0.8770516443215295, "flos": 14976402278400.0, "grad_norm": 2.02937328704934, "language_loss": 0.7939868, "learning_rate": 1.5636628149529553e-07, "loss": 0.815853, "num_input_tokens_seen": 157884150, "step": 7294, "time_per_iteration": 2.6236720085144043 }, { "auxiliary_loss_clip": 0.01163089, "auxiliary_loss_mlp": 0.01024192, "balance_loss_clip": 0.97040099, "balance_loss_mlp": 1.01755846, "epoch": 0.8771718872121685, "flos": 31649654021760.0, "grad_norm": 2.0025988755916377, "language_loss": 0.79695779, "learning_rate": 1.560644719170743e-07, "loss": 0.81883061, "num_input_tokens_seen": 157905020, "step": 7295, "time_per_iteration": 2.733466386795044 }, { "auxiliary_loss_clip": 0.01159319, "auxiliary_loss_mlp": 0.01026149, "balance_loss_clip": 0.93002486, "balance_loss_mlp": 1.01841807, "epoch": 0.8772921301028077, "flos": 36095466222720.0, "grad_norm": 1.8416728392397803, "language_loss": 0.72231156, "learning_rate": 1.5576294206240692e-07, "loss": 0.74416625, "num_input_tokens_seen": 157924545, "step": 7296, "time_per_iteration": 2.7725672721862793 }, { "auxiliary_loss_clip": 0.01161639, "auxiliary_loss_mlp": 0.01025621, "balance_loss_clip": 0.97069705, "balance_loss_mlp": 1.01817644, "epoch": 0.8774123729934468, "flos": 57116961849600.0, "grad_norm": 1.958353511818494, "language_loss": 0.67509711, "learning_rate": 1.5546169197703507e-07, "loss": 0.69696975, "num_input_tokens_seen": 157950820, "step": 7297, "time_per_iteration": 4.105653285980225 }, { "auxiliary_loss_clip": 0.01164765, "auxiliary_loss_mlp": 0.01022074, "balance_loss_clip": 0.96782333, "balance_loss_mlp": 1.0153451, "epoch": 0.8775326158840858, "flos": 23914495900800.0, "grad_norm": 2.4837809498248475, "language_loss": 0.77164006, "learning_rate": 1.5516072170665774e-07, "loss": 0.79350853, "num_input_tokens_seen": 157968790, "step": 7298, "time_per_iteration": 2.7136073112487793 }, { "auxiliary_loss_clip": 0.01163978, "auxiliary_loss_mlp": 0.01020007, "balance_loss_clip": 1.00825906, "balance_loss_mlp": 1.01296496, "epoch": 0.877652858774725, "flos": 17123285243520.0, "grad_norm": 1.7864722043288532, "language_loss": 0.86546314, "learning_rate": 1.5486003129693214e-07, "loss": 0.88730299, "num_input_tokens_seen": 157986155, "step": 7299, "time_per_iteration": 2.6092448234558105 }, { "auxiliary_loss_clip": 0.01168761, "auxiliary_loss_mlp": 0.01022693, "balance_loss_clip": 1.0094018, "balance_loss_mlp": 1.01542759, "epoch": 0.877773101665364, "flos": 16508961912960.0, "grad_norm": 1.8296134305341216, "language_loss": 0.78096604, "learning_rate": 1.545596207934725e-07, "loss": 0.80288064, "num_input_tokens_seen": 158004640, "step": 7300, "time_per_iteration": 2.627180337905884 }, { "auxiliary_loss_clip": 0.01157014, "auxiliary_loss_mlp": 0.01028585, "balance_loss_clip": 0.96884364, "balance_loss_mlp": 1.02157235, "epoch": 0.8778933445560031, "flos": 22053209973120.0, "grad_norm": 1.9196836317779853, "language_loss": 0.77554131, "learning_rate": 1.5425949024185147e-07, "loss": 0.79739726, "num_input_tokens_seen": 158024665, "step": 7301, "time_per_iteration": 3.6882517337799072 }, { "auxiliary_loss_clip": 0.01163542, "auxiliary_loss_mlp": 0.01025911, "balance_loss_clip": 0.96887523, "balance_loss_mlp": 1.01909804, "epoch": 0.8780135874466423, "flos": 22564757514240.0, "grad_norm": 2.61760838901501, "language_loss": 0.67560911, "learning_rate": 1.5395963968759818e-07, "loss": 0.69750357, "num_input_tokens_seen": 158044940, "step": 7302, "time_per_iteration": 2.7434072494506836 }, { "auxiliary_loss_clip": 0.01164027, "auxiliary_loss_mlp": 0.01025523, "balance_loss_clip": 0.96863878, "balance_loss_mlp": 1.01849306, "epoch": 0.8781338303372813, "flos": 61531999073280.0, "grad_norm": 1.5223379029611621, "language_loss": 0.642564, "learning_rate": 1.536600691761998e-07, "loss": 0.66445947, "num_input_tokens_seen": 158070770, "step": 7303, "time_per_iteration": 3.047553300857544 }, { "auxiliary_loss_clip": 0.01164762, "auxiliary_loss_mlp": 0.01024804, "balance_loss_clip": 0.93315971, "balance_loss_mlp": 1.01738286, "epoch": 0.8782540732279204, "flos": 22674751937280.0, "grad_norm": 1.8686238267137365, "language_loss": 0.71358281, "learning_rate": 1.5336077875310084e-07, "loss": 0.73547846, "num_input_tokens_seen": 158089995, "step": 7304, "time_per_iteration": 4.483531951904297 }, { "auxiliary_loss_clip": 0.01166271, "auxiliary_loss_mlp": 0.01022921, "balance_loss_clip": 0.89281201, "balance_loss_mlp": 1.01620972, "epoch": 0.8783743161185595, "flos": 16070348937600.0, "grad_norm": 1.8762804703803644, "language_loss": 0.74128687, "learning_rate": 1.5306176846370321e-07, "loss": 0.76317877, "num_input_tokens_seen": 158108140, "step": 7305, "time_per_iteration": 2.6735382080078125 }, { "auxiliary_loss_clip": 0.01167841, "auxiliary_loss_mlp": 0.01031999, "balance_loss_clip": 0.96870208, "balance_loss_mlp": 1.02486765, "epoch": 0.8784945590091986, "flos": 26067879227520.0, "grad_norm": 3.6133498919524163, "language_loss": 0.73531419, "learning_rate": 1.5276303835336712e-07, "loss": 0.75731254, "num_input_tokens_seen": 158128680, "step": 7306, "time_per_iteration": 2.6517422199249268 }, { "auxiliary_loss_clip": 0.01063758, "auxiliary_loss_mlp": 0.01001385, "balance_loss_clip": 0.97146577, "balance_loss_mlp": 0.99971563, "epoch": 0.8786148018998376, "flos": 62720643939840.0, "grad_norm": 0.768696016656809, "language_loss": 0.53517008, "learning_rate": 1.524645884674094e-07, "loss": 0.55582154, "num_input_tokens_seen": 158185610, "step": 7307, "time_per_iteration": 3.181544542312622 }, { "auxiliary_loss_clip": 0.01168256, "auxiliary_loss_mlp": 0.01123342, "balance_loss_clip": 1.04681849, "balance_loss_mlp": 0.0, "epoch": 0.8787350447904768, "flos": 21652734263040.0, "grad_norm": 2.1069216850800725, "language_loss": 0.79070288, "learning_rate": 1.521664188511047e-07, "loss": 0.81361884, "num_input_tokens_seen": 158205635, "step": 7308, "time_per_iteration": 2.804313898086548 }, { "auxiliary_loss_clip": 0.01166001, "auxiliary_loss_mlp": 0.01122302, "balance_loss_clip": 0.97374064, "balance_loss_mlp": 0.0, "epoch": 0.8788552876811159, "flos": 25478476957440.0, "grad_norm": 1.9168618381466622, "language_loss": 0.80347437, "learning_rate": 1.518685295496851e-07, "loss": 0.82635736, "num_input_tokens_seen": 158223495, "step": 7309, "time_per_iteration": 2.6650218963623047 }, { "auxiliary_loss_clip": 0.01163213, "auxiliary_loss_mlp": 0.01029878, "balance_loss_clip": 1.00747514, "balance_loss_mlp": 1.022663, "epoch": 0.8789755305717549, "flos": 22310222762880.0, "grad_norm": 1.572517887409946, "language_loss": 0.85560453, "learning_rate": 1.5157092060833975e-07, "loss": 0.87753546, "num_input_tokens_seen": 158243145, "step": 7310, "time_per_iteration": 2.634571075439453 }, { "auxiliary_loss_clip": 0.01160204, "auxiliary_loss_mlp": 0.01025059, "balance_loss_clip": 0.96798587, "balance_loss_mlp": 1.01833844, "epoch": 0.879095773462394, "flos": 29310971408640.0, "grad_norm": 1.547490215766706, "language_loss": 0.65927672, "learning_rate": 1.5127359207221658e-07, "loss": 0.68112934, "num_input_tokens_seen": 158262625, "step": 7311, "time_per_iteration": 2.6802849769592285 }, { "auxiliary_loss_clip": 0.01144481, "auxiliary_loss_mlp": 0.01025311, "balance_loss_clip": 0.85004842, "balance_loss_mlp": 1.01729465, "epoch": 0.8792160163530331, "flos": 16690023394560.0, "grad_norm": 1.8089262424815102, "language_loss": 0.73153305, "learning_rate": 1.5097654398641923e-07, "loss": 0.75323105, "num_input_tokens_seen": 158280530, "step": 7312, "time_per_iteration": 2.7514615058898926 }, { "auxiliary_loss_clip": 0.01173199, "auxiliary_loss_mlp": 0.01029077, "balance_loss_clip": 1.01145077, "balance_loss_mlp": 1.02228165, "epoch": 0.8793362592436722, "flos": 24499301230080.0, "grad_norm": 1.3224130762022184, "language_loss": 0.7304377, "learning_rate": 1.5067977639601014e-07, "loss": 0.75246036, "num_input_tokens_seen": 158303290, "step": 7313, "time_per_iteration": 2.7027711868286133 }, { "auxiliary_loss_clip": 0.01164077, "auxiliary_loss_mlp": 0.01021329, "balance_loss_clip": 0.9720962, "balance_loss_mlp": 1.01456344, "epoch": 0.8794565021343113, "flos": 14538399834240.0, "grad_norm": 2.2594668211192936, "language_loss": 0.71031725, "learning_rate": 1.5038328934600864e-07, "loss": 0.73217124, "num_input_tokens_seen": 158319925, "step": 7314, "time_per_iteration": 2.6524407863616943 }, { "auxiliary_loss_clip": 0.01166219, "auxiliary_loss_mlp": 0.01024475, "balance_loss_clip": 0.97337568, "balance_loss_mlp": 1.01769757, "epoch": 0.8795767450249504, "flos": 39530286224640.0, "grad_norm": 2.0713530957988597, "language_loss": 0.69884419, "learning_rate": 1.5008708288139161e-07, "loss": 0.72075123, "num_input_tokens_seen": 158342285, "step": 7315, "time_per_iteration": 2.8181629180908203 }, { "auxiliary_loss_clip": 0.01167459, "auxiliary_loss_mlp": 0.01030805, "balance_loss_clip": 1.0111028, "balance_loss_mlp": 1.0237956, "epoch": 0.8796969879155895, "flos": 22960672197120.0, "grad_norm": 1.8445437714809725, "language_loss": 0.73281276, "learning_rate": 1.497911570470931e-07, "loss": 0.75479537, "num_input_tokens_seen": 158362290, "step": 7316, "time_per_iteration": 2.670579433441162 }, { "auxiliary_loss_clip": 0.0115169, "auxiliary_loss_mlp": 0.01025622, "balance_loss_clip": 0.93054217, "balance_loss_mlp": 1.01849008, "epoch": 0.8798172308062285, "flos": 28362427004160.0, "grad_norm": 1.6197849560003388, "language_loss": 0.85778075, "learning_rate": 1.494955118880048e-07, "loss": 0.87955391, "num_input_tokens_seen": 158383275, "step": 7317, "time_per_iteration": 2.7532198429107666 }, { "auxiliary_loss_clip": 0.01164016, "auxiliary_loss_mlp": 0.01024194, "balance_loss_clip": 1.00800824, "balance_loss_mlp": 1.01739597, "epoch": 0.8799374736968677, "flos": 23988974751360.0, "grad_norm": 1.6757361551916086, "language_loss": 0.72706372, "learning_rate": 1.4920014744897634e-07, "loss": 0.74894583, "num_input_tokens_seen": 158402690, "step": 7318, "time_per_iteration": 2.6112654209136963 }, { "auxiliary_loss_clip": 0.01156809, "auxiliary_loss_mlp": 0.01027609, "balance_loss_clip": 0.97174042, "balance_loss_mlp": 1.02051032, "epoch": 0.8800577165875068, "flos": 25630271832960.0, "grad_norm": 1.980082088596911, "language_loss": 0.86461306, "learning_rate": 1.4890506377481392e-07, "loss": 0.8864572, "num_input_tokens_seen": 158421780, "step": 7319, "time_per_iteration": 2.7662770748138428 }, { "auxiliary_loss_clip": 0.01160395, "auxiliary_loss_mlp": 0.01025421, "balance_loss_clip": 0.85672653, "balance_loss_mlp": 1.01862311, "epoch": 0.8801779594781458, "flos": 23440331439360.0, "grad_norm": 1.3862560348193242, "language_loss": 0.64117467, "learning_rate": 1.486102609102815e-07, "loss": 0.66303283, "num_input_tokens_seen": 158442330, "step": 7320, "time_per_iteration": 2.735856294631958 }, { "auxiliary_loss_clip": 0.01161213, "auxiliary_loss_mlp": 0.01023534, "balance_loss_clip": 0.97073042, "balance_loss_mlp": 1.01699877, "epoch": 0.880298202368785, "flos": 11508580656000.0, "grad_norm": 2.343141249550898, "language_loss": 0.85374999, "learning_rate": 1.483157389001004e-07, "loss": 0.87559742, "num_input_tokens_seen": 158459890, "step": 7321, "time_per_iteration": 2.5920047760009766 }, { "auxiliary_loss_clip": 0.01157532, "auxiliary_loss_mlp": 0.0102792, "balance_loss_clip": 0.96585274, "balance_loss_mlp": 1.01978374, "epoch": 0.880418445259424, "flos": 22671447886080.0, "grad_norm": 2.3936042307545358, "language_loss": 0.78837788, "learning_rate": 1.4802149778894933e-07, "loss": 0.8102324, "num_input_tokens_seen": 158478680, "step": 7322, "time_per_iteration": 2.6600735187530518 }, { "auxiliary_loss_clip": 0.0115157, "auxiliary_loss_mlp": 0.0101921, "balance_loss_clip": 1.00345969, "balance_loss_mlp": 1.0124929, "epoch": 0.8805386881500631, "flos": 20522158709760.0, "grad_norm": 1.658413085795564, "language_loss": 0.87408471, "learning_rate": 1.4772753762146484e-07, "loss": 0.89579254, "num_input_tokens_seen": 158497935, "step": 7323, "time_per_iteration": 3.45243763923645 }, { "auxiliary_loss_clip": 0.0115822, "auxiliary_loss_mlp": 0.01021316, "balance_loss_clip": 1.0063231, "balance_loss_mlp": 1.01420486, "epoch": 0.8806589310407023, "flos": 36538891620480.0, "grad_norm": 1.5094272285220374, "language_loss": 0.70477629, "learning_rate": 1.474338584422401e-07, "loss": 0.72657168, "num_input_tokens_seen": 158523145, "step": 7324, "time_per_iteration": 2.72623610496521 }, { "auxiliary_loss_clip": 0.01160848, "auxiliary_loss_mlp": 0.01023362, "balance_loss_clip": 1.00992227, "balance_loss_mlp": 1.01626337, "epoch": 0.8807791739313413, "flos": 23440187784960.0, "grad_norm": 1.5484514090439212, "language_loss": 0.75735784, "learning_rate": 1.4714046029582595e-07, "loss": 0.7791999, "num_input_tokens_seen": 158542210, "step": 7325, "time_per_iteration": 2.6268696784973145 }, { "auxiliary_loss_clip": 0.01165162, "auxiliary_loss_mlp": 0.01031988, "balance_loss_clip": 0.93168908, "balance_loss_mlp": 1.02528524, "epoch": 0.8808994168219804, "flos": 25956843310080.0, "grad_norm": 1.8396425809729118, "language_loss": 0.7571227, "learning_rate": 1.46847343226731e-07, "loss": 0.77909422, "num_input_tokens_seen": 158563250, "step": 7326, "time_per_iteration": 2.763134717941284 }, { "auxiliary_loss_clip": 0.01166818, "auxiliary_loss_mlp": 0.01023387, "balance_loss_clip": 1.00780654, "balance_loss_mlp": 1.01647305, "epoch": 0.8810196597126195, "flos": 17092079303040.0, "grad_norm": 2.9694113149261585, "language_loss": 0.69297099, "learning_rate": 1.465545072794203e-07, "loss": 0.71487308, "num_input_tokens_seen": 158581125, "step": 7327, "time_per_iteration": 3.6141676902770996 }, { "auxiliary_loss_clip": 0.01160504, "auxiliary_loss_mlp": 0.01022123, "balance_loss_clip": 0.8969714, "balance_loss_mlp": 1.01542902, "epoch": 0.8811399026032586, "flos": 23002831785600.0, "grad_norm": 1.4879136339524637, "language_loss": 0.75640202, "learning_rate": 1.4626195249831774e-07, "loss": 0.77822834, "num_input_tokens_seen": 158602025, "step": 7328, "time_per_iteration": 2.7304601669311523 }, { "auxiliary_loss_clip": 0.01160372, "auxiliary_loss_mlp": 0.01023613, "balance_loss_clip": 1.00603867, "balance_loss_mlp": 1.01689231, "epoch": 0.8812601454938976, "flos": 14463813242880.0, "grad_norm": 1.7467213625749352, "language_loss": 0.71777928, "learning_rate": 1.4596967892780244e-07, "loss": 0.73961908, "num_input_tokens_seen": 158618355, "step": 7329, "time_per_iteration": 3.368116855621338 }, { "auxiliary_loss_clip": 0.01163295, "auxiliary_loss_mlp": 0.01026371, "balance_loss_clip": 1.0464325, "balance_loss_mlp": 1.01982641, "epoch": 0.8813803883845368, "flos": 22493223578880.0, "grad_norm": 2.6574293605647488, "language_loss": 0.74637121, "learning_rate": 1.4567768661221314e-07, "loss": 0.76826787, "num_input_tokens_seen": 158638925, "step": 7330, "time_per_iteration": 3.452266216278076 }, { "auxiliary_loss_clip": 0.01168233, "auxiliary_loss_mlp": 0.0112263, "balance_loss_clip": 1.01067376, "balance_loss_mlp": 0.0, "epoch": 0.8815006312751759, "flos": 21506901045120.0, "grad_norm": 2.244232793921065, "language_loss": 0.74622333, "learning_rate": 1.4538597559584442e-07, "loss": 0.76913196, "num_input_tokens_seen": 158656715, "step": 7331, "time_per_iteration": 2.588355779647827 }, { "auxiliary_loss_clip": 0.01157681, "auxiliary_loss_mlp": 0.01029299, "balance_loss_clip": 0.96959448, "balance_loss_mlp": 1.02185464, "epoch": 0.8816208741658149, "flos": 22784566792320.0, "grad_norm": 2.1070092935868105, "language_loss": 0.78771526, "learning_rate": 1.4509454592294823e-07, "loss": 0.80958509, "num_input_tokens_seen": 158677200, "step": 7332, "time_per_iteration": 2.6094188690185547 }, { "auxiliary_loss_clip": 0.01165463, "auxiliary_loss_mlp": 0.01122715, "balance_loss_clip": 0.93374515, "balance_loss_mlp": 0.0, "epoch": 0.8817411170564541, "flos": 17779409026560.0, "grad_norm": 1.8221124288104087, "language_loss": 0.78745985, "learning_rate": 1.448033976377354e-07, "loss": 0.81034172, "num_input_tokens_seen": 158692185, "step": 7333, "time_per_iteration": 2.6553544998168945 }, { "auxiliary_loss_clip": 0.01167872, "auxiliary_loss_mlp": 0.01027707, "balance_loss_clip": 1.007936, "balance_loss_mlp": 1.02087331, "epoch": 0.8818613599470931, "flos": 18551812112640.0, "grad_norm": 1.885664928381305, "language_loss": 0.74559236, "learning_rate": 1.445125307843713e-07, "loss": 0.7675482, "num_input_tokens_seen": 158710410, "step": 7334, "time_per_iteration": 2.5658910274505615 }, { "auxiliary_loss_clip": 0.01161962, "auxiliary_loss_mlp": 0.01025635, "balance_loss_clip": 1.00969982, "balance_loss_mlp": 1.01933169, "epoch": 0.8819816028377322, "flos": 27599792417280.0, "grad_norm": 1.827058406375046, "language_loss": 0.75305694, "learning_rate": 1.442219454069813e-07, "loss": 0.77493286, "num_input_tokens_seen": 158731435, "step": 7335, "time_per_iteration": 2.677183151245117 }, { "auxiliary_loss_clip": 0.01164821, "auxiliary_loss_mlp": 0.01022344, "balance_loss_clip": 0.89395082, "balance_loss_mlp": 1.01551056, "epoch": 0.8821018457283714, "flos": 23404600385280.0, "grad_norm": 5.191015595482234, "language_loss": 0.65949678, "learning_rate": 1.4393164154964676e-07, "loss": 0.68136841, "num_input_tokens_seen": 158750965, "step": 7336, "time_per_iteration": 2.697948932647705 }, { "auxiliary_loss_clip": 0.01166786, "auxiliary_loss_mlp": 0.01027822, "balance_loss_clip": 1.01217997, "balance_loss_mlp": 1.0203476, "epoch": 0.8822220886190104, "flos": 29132459792640.0, "grad_norm": 2.507926396011683, "language_loss": 0.94160497, "learning_rate": 1.4364161925640649e-07, "loss": 0.96355104, "num_input_tokens_seen": 158772365, "step": 7337, "time_per_iteration": 2.6519038677215576 }, { "auxiliary_loss_clip": 0.01165411, "auxiliary_loss_mlp": 0.01027362, "balance_loss_clip": 1.04786968, "balance_loss_mlp": 1.0206176, "epoch": 0.8823423315096495, "flos": 20485422074880.0, "grad_norm": 1.767304068484276, "language_loss": 0.84806263, "learning_rate": 1.4335187857125663e-07, "loss": 0.86999035, "num_input_tokens_seen": 158791065, "step": 7338, "time_per_iteration": 2.6176931858062744 }, { "auxiliary_loss_clip": 0.011649, "auxiliary_loss_mlp": 0.0102378, "balance_loss_clip": 1.00822496, "balance_loss_mlp": 1.01710987, "epoch": 0.8824625744002886, "flos": 24206377818240.0, "grad_norm": 1.7355674564043486, "language_loss": 0.75630873, "learning_rate": 1.4306241953815023e-07, "loss": 0.7781955, "num_input_tokens_seen": 158812125, "step": 7339, "time_per_iteration": 2.6536266803741455 }, { "auxiliary_loss_clip": 0.01167365, "auxiliary_loss_mlp": 0.01027366, "balance_loss_clip": 1.00905871, "balance_loss_mlp": 1.02059758, "epoch": 0.8825828172909277, "flos": 24679500785280.0, "grad_norm": 1.8295482857446523, "language_loss": 0.70681369, "learning_rate": 1.4277324220099862e-07, "loss": 0.72876102, "num_input_tokens_seen": 158834035, "step": 7340, "time_per_iteration": 2.714266300201416 }, { "auxiliary_loss_clip": 0.01155242, "auxiliary_loss_mlp": 0.01025372, "balance_loss_clip": 0.92902422, "balance_loss_mlp": 1.01824355, "epoch": 0.8827030601815667, "flos": 22456163721600.0, "grad_norm": 1.8547026962358886, "language_loss": 0.74165285, "learning_rate": 1.4248434660366938e-07, "loss": 0.76345897, "num_input_tokens_seen": 158853510, "step": 7341, "time_per_iteration": 2.705111026763916 }, { "auxiliary_loss_clip": 0.01162561, "auxiliary_loss_mlp": 0.01025549, "balance_loss_clip": 0.97097868, "balance_loss_mlp": 1.01836395, "epoch": 0.8828233030722058, "flos": 19865639877120.0, "grad_norm": 1.8660396679626041, "language_loss": 0.7054773, "learning_rate": 1.4219573278998808e-07, "loss": 0.7273584, "num_input_tokens_seen": 158871970, "step": 7342, "time_per_iteration": 2.739351511001587 }, { "auxiliary_loss_clip": 0.01158137, "auxiliary_loss_mlp": 0.01026426, "balance_loss_clip": 0.96743393, "balance_loss_mlp": 1.01877856, "epoch": 0.882943545962845, "flos": 39347213581440.0, "grad_norm": 1.8540250422048077, "language_loss": 0.64675564, "learning_rate": 1.4190740080373685e-07, "loss": 0.66860127, "num_input_tokens_seen": 158892250, "step": 7343, "time_per_iteration": 2.8198585510253906 }, { "auxiliary_loss_clip": 0.01162386, "auxiliary_loss_mlp": 0.01027474, "balance_loss_clip": 0.8964864, "balance_loss_mlp": 1.01979685, "epoch": 0.883063788853484, "flos": 19054524908160.0, "grad_norm": 1.8538252040698278, "language_loss": 0.84078777, "learning_rate": 1.4161935068865538e-07, "loss": 0.8626864, "num_input_tokens_seen": 158907395, "step": 7344, "time_per_iteration": 2.7609119415283203 }, { "auxiliary_loss_clip": 0.01165843, "auxiliary_loss_mlp": 0.01023587, "balance_loss_clip": 1.04656076, "balance_loss_mlp": 1.01640141, "epoch": 0.8831840317441231, "flos": 18733196816640.0, "grad_norm": 1.9567913001221384, "language_loss": 0.75776142, "learning_rate": 1.4133158248844113e-07, "loss": 0.77965569, "num_input_tokens_seen": 158926300, "step": 7345, "time_per_iteration": 2.5620622634887695 }, { "auxiliary_loss_clip": 0.01165134, "auxiliary_loss_mlp": 0.01025176, "balance_loss_clip": 0.93116736, "balance_loss_mlp": 1.01772523, "epoch": 0.8833042746347622, "flos": 26827712553600.0, "grad_norm": 1.7884689281639656, "language_loss": 0.73214412, "learning_rate": 1.4104409624674785e-07, "loss": 0.75404727, "num_input_tokens_seen": 158946085, "step": 7346, "time_per_iteration": 2.6825873851776123 }, { "auxiliary_loss_clip": 0.01169233, "auxiliary_loss_mlp": 0.0103259, "balance_loss_clip": 1.01244462, "balance_loss_mlp": 1.02555656, "epoch": 0.8834245175254013, "flos": 26104077158400.0, "grad_norm": 1.8141794314171815, "language_loss": 0.78248525, "learning_rate": 1.407568920071873e-07, "loss": 0.8045035, "num_input_tokens_seen": 158964950, "step": 7347, "time_per_iteration": 2.666520833969116 }, { "auxiliary_loss_clip": 0.01172853, "auxiliary_loss_mlp": 0.01028194, "balance_loss_clip": 1.04979038, "balance_loss_mlp": 1.02039742, "epoch": 0.8835447604160404, "flos": 30629036977920.0, "grad_norm": 2.1266328749235743, "language_loss": 0.68198824, "learning_rate": 1.4046996981332782e-07, "loss": 0.70399868, "num_input_tokens_seen": 158984835, "step": 7348, "time_per_iteration": 2.648160696029663 }, { "auxiliary_loss_clip": 0.01165826, "auxiliary_loss_mlp": 0.01021308, "balance_loss_clip": 0.93222463, "balance_loss_mlp": 1.01402378, "epoch": 0.8836650033066795, "flos": 24718356322560.0, "grad_norm": 1.8307872216542966, "language_loss": 0.7799322, "learning_rate": 1.4018332970869516e-07, "loss": 0.80180353, "num_input_tokens_seen": 159002775, "step": 7349, "time_per_iteration": 3.476155996322632 }, { "auxiliary_loss_clip": 0.01159077, "auxiliary_loss_mlp": 0.01028672, "balance_loss_clip": 0.97209328, "balance_loss_mlp": 1.02126884, "epoch": 0.8837852461973186, "flos": 25413371556480.0, "grad_norm": 1.7493609917968451, "language_loss": 0.84915251, "learning_rate": 1.398969717367733e-07, "loss": 0.87102997, "num_input_tokens_seen": 159024100, "step": 7350, "time_per_iteration": 2.6710894107818604 }, { "auxiliary_loss_clip": 0.01161965, "auxiliary_loss_mlp": 0.01025715, "balance_loss_clip": 0.89744014, "balance_loss_mlp": 1.01875019, "epoch": 0.8839054890879576, "flos": 17822574195840.0, "grad_norm": 2.3954847564535626, "language_loss": 0.7632519, "learning_rate": 1.396108959410014e-07, "loss": 0.78512871, "num_input_tokens_seen": 159043315, "step": 7351, "time_per_iteration": 2.7326090335845947 }, { "auxiliary_loss_clip": 0.01164962, "auxiliary_loss_mlp": 0.01122862, "balance_loss_clip": 1.01050532, "balance_loss_mlp": 0.0, "epoch": 0.8840257319785968, "flos": 23769021818880.0, "grad_norm": 4.381879548386204, "language_loss": 0.81156605, "learning_rate": 1.3932510236477745e-07, "loss": 0.83444428, "num_input_tokens_seen": 159063985, "step": 7352, "time_per_iteration": 2.6230576038360596 }, { "auxiliary_loss_clip": 0.0116113, "auxiliary_loss_mlp": 0.01026733, "balance_loss_clip": 1.00553262, "balance_loss_mlp": 1.01904678, "epoch": 0.8841459748692359, "flos": 29059776622080.0, "grad_norm": 1.7612118990162737, "language_loss": 0.55727434, "learning_rate": 1.3903959105145636e-07, "loss": 0.579153, "num_input_tokens_seen": 159084475, "step": 7353, "time_per_iteration": 3.7262144088745117 }, { "auxiliary_loss_clip": 0.01166395, "auxiliary_loss_mlp": 0.01029468, "balance_loss_clip": 1.04801023, "balance_loss_mlp": 1.02227402, "epoch": 0.8842662177598749, "flos": 24311523905280.0, "grad_norm": 2.086325788908519, "language_loss": 0.83020407, "learning_rate": 1.387543620443492e-07, "loss": 0.85216272, "num_input_tokens_seen": 159101320, "step": 7354, "time_per_iteration": 2.5785064697265625 }, { "auxiliary_loss_clip": 0.0116649, "auxiliary_loss_mlp": 0.01029246, "balance_loss_clip": 1.04888093, "balance_loss_mlp": 1.02189422, "epoch": 0.8843864606505141, "flos": 25007867942400.0, "grad_norm": 2.022109409372343, "language_loss": 0.84211129, "learning_rate": 1.3846941538672606e-07, "loss": 0.86406863, "num_input_tokens_seen": 159120025, "step": 7355, "time_per_iteration": 3.562169075012207 }, { "auxiliary_loss_clip": 0.01167248, "auxiliary_loss_mlp": 0.01023766, "balance_loss_clip": 0.89795136, "balance_loss_mlp": 1.01694763, "epoch": 0.8845067035411531, "flos": 28183915388160.0, "grad_norm": 2.142883395119379, "language_loss": 0.80965376, "learning_rate": 1.3818475112181193e-07, "loss": 0.83156395, "num_input_tokens_seen": 159138820, "step": 7356, "time_per_iteration": 3.5864689350128174 }, { "auxiliary_loss_clip": 0.01163774, "auxiliary_loss_mlp": 0.01022394, "balance_loss_clip": 0.97037232, "balance_loss_mlp": 1.01606059, "epoch": 0.8846269464317922, "flos": 12853219311360.0, "grad_norm": 1.9502269085259785, "language_loss": 0.7957418, "learning_rate": 1.3790036929279091e-07, "loss": 0.81760347, "num_input_tokens_seen": 159155975, "step": 7357, "time_per_iteration": 2.593554973602295 }, { "auxiliary_loss_clip": 0.01169359, "auxiliary_loss_mlp": 0.01122073, "balance_loss_clip": 1.01041436, "balance_loss_mlp": 0.0, "epoch": 0.8847471893224313, "flos": 18624351628800.0, "grad_norm": 2.371566994486956, "language_loss": 0.58841491, "learning_rate": 1.3761626994280363e-07, "loss": 0.6113292, "num_input_tokens_seen": 159173445, "step": 7358, "time_per_iteration": 2.579714059829712 }, { "auxiliary_loss_clip": 0.01168298, "auxiliary_loss_mlp": 0.01021299, "balance_loss_clip": 0.93217826, "balance_loss_mlp": 1.01423264, "epoch": 0.8848674322130704, "flos": 35769433449600.0, "grad_norm": 1.637791784959654, "language_loss": 0.73501182, "learning_rate": 1.3733245311494735e-07, "loss": 0.75690782, "num_input_tokens_seen": 159196100, "step": 7359, "time_per_iteration": 2.758711576461792 }, { "auxiliary_loss_clip": 0.01166695, "auxiliary_loss_mlp": 0.01026092, "balance_loss_clip": 1.01059282, "balance_loss_mlp": 1.01958323, "epoch": 0.8849876751037095, "flos": 24243760897920.0, "grad_norm": 2.1574986548034762, "language_loss": 0.7043156, "learning_rate": 1.3704891885227676e-07, "loss": 0.72624344, "num_input_tokens_seen": 159216145, "step": 7360, "time_per_iteration": 2.582824945449829 }, { "auxiliary_loss_clip": 0.01159874, "auxiliary_loss_mlp": 0.01030802, "balance_loss_clip": 0.92915118, "balance_loss_mlp": 1.02345896, "epoch": 0.8851079179943486, "flos": 21500580251520.0, "grad_norm": 1.931899808997864, "language_loss": 0.77799594, "learning_rate": 1.367656671978037e-07, "loss": 0.79990268, "num_input_tokens_seen": 159233610, "step": 7361, "time_per_iteration": 2.622356414794922 }, { "auxiliary_loss_clip": 0.01168619, "auxiliary_loss_mlp": 0.01024006, "balance_loss_clip": 0.96886867, "balance_loss_mlp": 1.01700521, "epoch": 0.8852281608849877, "flos": 15300711198720.0, "grad_norm": 2.3289440175871094, "language_loss": 0.73474717, "learning_rate": 1.36482698194498e-07, "loss": 0.75667346, "num_input_tokens_seen": 159250155, "step": 7362, "time_per_iteration": 2.565768241882324 }, { "auxiliary_loss_clip": 0.01161856, "auxiliary_loss_mlp": 0.01027246, "balance_loss_clip": 0.96992892, "balance_loss_mlp": 1.02025783, "epoch": 0.8853484037756267, "flos": 23295719283840.0, "grad_norm": 1.8884053259697025, "language_loss": 0.71876514, "learning_rate": 1.3620001188528506e-07, "loss": 0.74065614, "num_input_tokens_seen": 159270875, "step": 7363, "time_per_iteration": 2.627440929412842 }, { "auxiliary_loss_clip": 0.01167005, "auxiliary_loss_mlp": 0.01025097, "balance_loss_clip": 1.00714242, "balance_loss_mlp": 1.01793826, "epoch": 0.8854686466662659, "flos": 25114773795840.0, "grad_norm": 2.5786181427644053, "language_loss": 0.74172372, "learning_rate": 1.3591760831304865e-07, "loss": 0.7636447, "num_input_tokens_seen": 159288565, "step": 7364, "time_per_iteration": 2.637256383895874 }, { "auxiliary_loss_clip": 0.01166577, "auxiliary_loss_mlp": 0.01026601, "balance_loss_clip": 1.04751766, "balance_loss_mlp": 1.01909637, "epoch": 0.885588889556905, "flos": 21390873137280.0, "grad_norm": 1.626235173047815, "language_loss": 0.7930662, "learning_rate": 1.356354875206287e-07, "loss": 0.81499797, "num_input_tokens_seen": 159306400, "step": 7365, "time_per_iteration": 2.576812744140625 }, { "auxiliary_loss_clip": 0.01160009, "auxiliary_loss_mlp": 0.01025289, "balance_loss_clip": 0.93383098, "balance_loss_mlp": 1.01889336, "epoch": 0.885709132447544, "flos": 26906752431360.0, "grad_norm": 1.9495436241501116, "language_loss": 0.69580704, "learning_rate": 1.3535364955082296e-07, "loss": 0.71766007, "num_input_tokens_seen": 159326250, "step": 7366, "time_per_iteration": 2.6712183952331543 }, { "auxiliary_loss_clip": 0.01164091, "auxiliary_loss_mlp": 0.01025338, "balance_loss_clip": 1.04684687, "balance_loss_mlp": 1.01846588, "epoch": 0.8858293753381832, "flos": 26103394800000.0, "grad_norm": 1.7999505421006932, "language_loss": 0.64948261, "learning_rate": 1.3507209444638613e-07, "loss": 0.67137694, "num_input_tokens_seen": 159348250, "step": 7367, "time_per_iteration": 2.6309142112731934 }, { "auxiliary_loss_clip": 0.0116581, "auxiliary_loss_mlp": 0.0102309, "balance_loss_clip": 1.01033854, "balance_loss_mlp": 1.01565719, "epoch": 0.8859496182288222, "flos": 23292810282240.0, "grad_norm": 1.8368746153656172, "language_loss": 0.74178857, "learning_rate": 1.347908222500298e-07, "loss": 0.76367754, "num_input_tokens_seen": 159368325, "step": 7368, "time_per_iteration": 2.580151081085205 }, { "auxiliary_loss_clip": 0.01149479, "auxiliary_loss_mlp": 0.01025957, "balance_loss_clip": 0.93110538, "balance_loss_mlp": 1.01922464, "epoch": 0.8860698611194613, "flos": 16872916469760.0, "grad_norm": 1.939015198641835, "language_loss": 0.69955158, "learning_rate": 1.3450983300442276e-07, "loss": 0.72130597, "num_input_tokens_seen": 159387555, "step": 7369, "time_per_iteration": 2.705045223236084 }, { "auxiliary_loss_clip": 0.01167175, "auxiliary_loss_mlp": 0.01021051, "balance_loss_clip": 1.00934672, "balance_loss_mlp": 1.01442325, "epoch": 0.8861901040101005, "flos": 24681404206080.0, "grad_norm": 3.132815001756222, "language_loss": 0.73672438, "learning_rate": 1.3422912675219068e-07, "loss": 0.75860655, "num_input_tokens_seen": 159407310, "step": 7370, "time_per_iteration": 2.6445887088775635 }, { "auxiliary_loss_clip": 0.0116465, "auxiliary_loss_mlp": 0.01027963, "balance_loss_clip": 1.0482372, "balance_loss_mlp": 1.02146029, "epoch": 0.8863103469007395, "flos": 24423026699520.0, "grad_norm": 1.4894142355648143, "language_loss": 0.79086447, "learning_rate": 1.339487035359166e-07, "loss": 0.81279057, "num_input_tokens_seen": 159427680, "step": 7371, "time_per_iteration": 2.6047513484954834 }, { "auxiliary_loss_clip": 0.0116432, "auxiliary_loss_mlp": 0.01121402, "balance_loss_clip": 0.97146076, "balance_loss_mlp": 0.0, "epoch": 0.8864305897913786, "flos": 22053964158720.0, "grad_norm": 1.514165761224923, "language_loss": 0.84725845, "learning_rate": 1.336685633981409e-07, "loss": 0.8701157, "num_input_tokens_seen": 159448765, "step": 7372, "time_per_iteration": 2.6834921836853027 }, { "auxiliary_loss_clip": 0.01166906, "auxiliary_loss_mlp": 0.01026165, "balance_loss_clip": 1.00809073, "balance_loss_mlp": 1.01909351, "epoch": 0.8865508326820177, "flos": 19099449843840.0, "grad_norm": 13.360265846522767, "language_loss": 0.75005686, "learning_rate": 1.333887063813597e-07, "loss": 0.77198756, "num_input_tokens_seen": 159466870, "step": 7373, "time_per_iteration": 2.625354290008545 }, { "auxiliary_loss_clip": 0.01165346, "auxiliary_loss_mlp": 0.01023772, "balance_loss_clip": 0.97048223, "balance_loss_mlp": 1.01714754, "epoch": 0.8866710755726568, "flos": 15414189240960.0, "grad_norm": 1.7281509420281362, "language_loss": 0.66360033, "learning_rate": 1.331091325280278e-07, "loss": 0.68549156, "num_input_tokens_seen": 159485840, "step": 7374, "time_per_iteration": 2.6122283935546875 }, { "auxiliary_loss_clip": 0.01156565, "auxiliary_loss_mlp": 0.01029278, "balance_loss_clip": 0.89425009, "balance_loss_mlp": 1.02158308, "epoch": 0.8867913184632958, "flos": 20083689388800.0, "grad_norm": 1.7674990430664679, "language_loss": 0.78587604, "learning_rate": 1.3282984188055625e-07, "loss": 0.80773443, "num_input_tokens_seen": 159505630, "step": 7375, "time_per_iteration": 2.7441091537475586 }, { "auxiliary_loss_clip": 0.01167012, "auxiliary_loss_mlp": 0.01027989, "balance_loss_clip": 1.04805636, "balance_loss_mlp": 1.0211997, "epoch": 0.8869115613539349, "flos": 23365852588800.0, "grad_norm": 1.8580324882846138, "language_loss": 0.79607469, "learning_rate": 1.3255083448131288e-07, "loss": 0.81802464, "num_input_tokens_seen": 159524675, "step": 7376, "time_per_iteration": 3.4118142127990723 }, { "auxiliary_loss_clip": 0.01168551, "auxiliary_loss_mlp": 0.01025504, "balance_loss_clip": 1.00835037, "balance_loss_mlp": 1.01861989, "epoch": 0.8870318042445741, "flos": 21286840371840.0, "grad_norm": 2.11332310360186, "language_loss": 0.78821814, "learning_rate": 1.3227211037262365e-07, "loss": 0.81015867, "num_input_tokens_seen": 159541915, "step": 7377, "time_per_iteration": 2.553889274597168 }, { "auxiliary_loss_clip": 0.0116499, "auxiliary_loss_mlp": 0.01024945, "balance_loss_clip": 0.89399302, "balance_loss_mlp": 1.01737499, "epoch": 0.8871520471352131, "flos": 20010862563840.0, "grad_norm": 1.9308419025420278, "language_loss": 0.85424334, "learning_rate": 1.319936695967696e-07, "loss": 0.87614262, "num_input_tokens_seen": 159559740, "step": 7378, "time_per_iteration": 2.677792549133301 }, { "auxiliary_loss_clip": 0.01171552, "auxiliary_loss_mlp": 0.01026438, "balance_loss_clip": 1.04731011, "balance_loss_mlp": 1.01882625, "epoch": 0.8872722900258522, "flos": 22601422321920.0, "grad_norm": 2.261615513204646, "language_loss": 0.81767529, "learning_rate": 1.3171551219599097e-07, "loss": 0.83965522, "num_input_tokens_seen": 159578265, "step": 7379, "time_per_iteration": 3.535935401916504 }, { "auxiliary_loss_clip": 0.0116751, "auxiliary_loss_mlp": 0.01022538, "balance_loss_clip": 1.04969454, "balance_loss_mlp": 1.01545954, "epoch": 0.8873925329164913, "flos": 22163276223360.0, "grad_norm": 2.1059827709399794, "language_loss": 0.78034478, "learning_rate": 1.3143763821248377e-07, "loss": 0.80224526, "num_input_tokens_seen": 159595350, "step": 7380, "time_per_iteration": 2.5680792331695557 }, { "auxiliary_loss_clip": 0.01163621, "auxiliary_loss_mlp": 0.01023453, "balance_loss_clip": 1.04699719, "balance_loss_mlp": 1.01674783, "epoch": 0.8875127758071304, "flos": 19208223204480.0, "grad_norm": 2.008896282899265, "language_loss": 0.72163248, "learning_rate": 1.3116004768840118e-07, "loss": 0.74350321, "num_input_tokens_seen": 159613725, "step": 7381, "time_per_iteration": 2.526439905166626 }, { "auxiliary_loss_clip": 0.01165978, "auxiliary_loss_mlp": 0.01025213, "balance_loss_clip": 1.04670906, "balance_loss_mlp": 1.01808476, "epoch": 0.8876330186977694, "flos": 18110900666880.0, "grad_norm": 1.6305005386775437, "language_loss": 0.74112421, "learning_rate": 1.3088274066585348e-07, "loss": 0.76303613, "num_input_tokens_seen": 159631335, "step": 7382, "time_per_iteration": 3.4182393550872803 }, { "auxiliary_loss_clip": 0.01167449, "auxiliary_loss_mlp": 0.01026508, "balance_loss_clip": 0.93021125, "balance_loss_mlp": 1.01875949, "epoch": 0.8877532615884086, "flos": 22009434272640.0, "grad_norm": 2.176828289628841, "language_loss": 0.90256405, "learning_rate": 1.3060571718690749e-07, "loss": 0.92450362, "num_input_tokens_seen": 159648830, "step": 7383, "time_per_iteration": 3.6745169162750244 }, { "auxiliary_loss_clip": 0.01066567, "auxiliary_loss_mlp": 0.01115651, "balance_loss_clip": 0.89687729, "balance_loss_mlp": 0.0, "epoch": 0.8878735044790477, "flos": 72136924346880.0, "grad_norm": 0.7488886116941545, "language_loss": 0.56929874, "learning_rate": 1.3032897729358805e-07, "loss": 0.5911209, "num_input_tokens_seen": 159709785, "step": 7384, "time_per_iteration": 3.275259494781494 }, { "auxiliary_loss_clip": 0.01145518, "auxiliary_loss_mlp": 0.01122715, "balance_loss_clip": 0.85214388, "balance_loss_mlp": 0.0, "epoch": 0.8879937473696867, "flos": 27526355061120.0, "grad_norm": 1.858715099217798, "language_loss": 0.79953575, "learning_rate": 1.3005252102787645e-07, "loss": 0.82221812, "num_input_tokens_seen": 159728725, "step": 7385, "time_per_iteration": 3.032892942428589 }, { "auxiliary_loss_clip": 0.01169954, "auxiliary_loss_mlp": 0.01024049, "balance_loss_clip": 1.01147985, "balance_loss_mlp": 1.01736462, "epoch": 0.8881139902603259, "flos": 22234091886720.0, "grad_norm": 1.8567855523981067, "language_loss": 0.73353958, "learning_rate": 1.297763484317105e-07, "loss": 0.75547957, "num_input_tokens_seen": 159747020, "step": 7386, "time_per_iteration": 2.7060327529907227 }, { "auxiliary_loss_clip": 0.01155249, "auxiliary_loss_mlp": 0.01122453, "balance_loss_clip": 0.89106578, "balance_loss_mlp": 0.0, "epoch": 0.888234233150965, "flos": 20299548170880.0, "grad_norm": 2.1183488319457253, "language_loss": 0.70596498, "learning_rate": 1.2950045954698551e-07, "loss": 0.728742, "num_input_tokens_seen": 159764855, "step": 7387, "time_per_iteration": 2.7141165733337402 }, { "auxiliary_loss_clip": 0.01151742, "auxiliary_loss_mlp": 0.0102395, "balance_loss_clip": 0.93088758, "balance_loss_mlp": 1.01668429, "epoch": 0.888354476041604, "flos": 18147996437760.0, "grad_norm": 1.807601038257503, "language_loss": 0.75598794, "learning_rate": 1.2922485441555343e-07, "loss": 0.77774489, "num_input_tokens_seen": 159783935, "step": 7388, "time_per_iteration": 2.6928350925445557 }, { "auxiliary_loss_clip": 0.01164867, "auxiliary_loss_mlp": 0.01021239, "balance_loss_clip": 1.04597056, "balance_loss_mlp": 1.01426554, "epoch": 0.8884747189322432, "flos": 22014282608640.0, "grad_norm": 2.3282610611213523, "language_loss": 0.81774342, "learning_rate": 1.2894953307922363e-07, "loss": 0.83960444, "num_input_tokens_seen": 159802895, "step": 7389, "time_per_iteration": 2.5746395587921143 }, { "auxiliary_loss_clip": 0.01155761, "auxiliary_loss_mlp": 0.01025524, "balance_loss_clip": 0.93208635, "balance_loss_mlp": 1.01867557, "epoch": 0.8885949618228822, "flos": 19786779567360.0, "grad_norm": 1.9490531269248434, "language_loss": 0.83791929, "learning_rate": 1.2867449557976208e-07, "loss": 0.85973215, "num_input_tokens_seen": 159820995, "step": 7390, "time_per_iteration": 2.705136299133301 }, { "auxiliary_loss_clip": 0.01166454, "auxiliary_loss_mlp": 0.0102467, "balance_loss_clip": 1.011693, "balance_loss_mlp": 1.01783371, "epoch": 0.8887152047135213, "flos": 20047599198720.0, "grad_norm": 2.1125260073390915, "language_loss": 0.75330108, "learning_rate": 1.283997419588916e-07, "loss": 0.77521229, "num_input_tokens_seen": 159840465, "step": 7391, "time_per_iteration": 2.6027979850769043 }, { "auxiliary_loss_clip": 0.01169522, "auxiliary_loss_mlp": 0.01026877, "balance_loss_clip": 1.00932074, "balance_loss_mlp": 1.02011788, "epoch": 0.8888354476041604, "flos": 18588117784320.0, "grad_norm": 1.820358253673094, "language_loss": 0.61509103, "learning_rate": 1.2812527225829216e-07, "loss": 0.63705504, "num_input_tokens_seen": 159858690, "step": 7392, "time_per_iteration": 2.600740432739258 }, { "auxiliary_loss_clip": 0.01173094, "auxiliary_loss_mlp": 0.01027091, "balance_loss_clip": 1.01083863, "balance_loss_mlp": 1.01900303, "epoch": 0.8889556904947995, "flos": 21689794120320.0, "grad_norm": 1.8423519918909608, "language_loss": 0.76598495, "learning_rate": 1.2785108651960052e-07, "loss": 0.78798676, "num_input_tokens_seen": 159880325, "step": 7393, "time_per_iteration": 2.6336188316345215 }, { "auxiliary_loss_clip": 0.01167473, "auxiliary_loss_mlp": 0.01030564, "balance_loss_clip": 1.00898004, "balance_loss_mlp": 1.02327991, "epoch": 0.8890759333854386, "flos": 27381204201600.0, "grad_norm": 1.9126704990968526, "language_loss": 0.80579495, "learning_rate": 1.2757718478441094e-07, "loss": 0.82777524, "num_input_tokens_seen": 159901070, "step": 7394, "time_per_iteration": 2.645183801651001 }, { "auxiliary_loss_clip": 0.01161034, "auxiliary_loss_mlp": 0.0102496, "balance_loss_clip": 0.96802348, "balance_loss_mlp": 1.01836765, "epoch": 0.8891961762760777, "flos": 24498834353280.0, "grad_norm": 1.677925309942129, "language_loss": 0.77471733, "learning_rate": 1.2730356709427302e-07, "loss": 0.79657722, "num_input_tokens_seen": 159919750, "step": 7395, "time_per_iteration": 2.70439076423645 }, { "auxiliary_loss_clip": 0.01165578, "auxiliary_loss_mlp": 0.01022523, "balance_loss_clip": 1.01143193, "balance_loss_mlp": 1.01528144, "epoch": 0.8893164191667168, "flos": 41499770895360.0, "grad_norm": 1.6818766955815936, "language_loss": 0.5958488, "learning_rate": 1.2703023349069542e-07, "loss": 0.61772978, "num_input_tokens_seen": 159944600, "step": 7396, "time_per_iteration": 2.7960524559020996 }, { "auxiliary_loss_clip": 0.01163325, "auxiliary_loss_mlp": 0.01023957, "balance_loss_clip": 1.01019979, "balance_loss_mlp": 1.0171622, "epoch": 0.8894366620573558, "flos": 33583623120000.0, "grad_norm": 1.70425530704488, "language_loss": 0.61660206, "learning_rate": 1.2675718401514223e-07, "loss": 0.63847494, "num_input_tokens_seen": 159968780, "step": 7397, "time_per_iteration": 2.7842600345611572 }, { "auxiliary_loss_clip": 0.01162088, "auxiliary_loss_mlp": 0.0102483, "balance_loss_clip": 0.97058809, "balance_loss_mlp": 1.01751077, "epoch": 0.889556904947995, "flos": 16909832672640.0, "grad_norm": 1.885667938191396, "language_loss": 0.74450582, "learning_rate": 1.264844187090346e-07, "loss": 0.76637501, "num_input_tokens_seen": 159985905, "step": 7398, "time_per_iteration": 2.7045373916625977 }, { "auxiliary_loss_clip": 0.01157627, "auxiliary_loss_mlp": 0.01028148, "balance_loss_clip": 0.96958596, "balance_loss_mlp": 1.02138567, "epoch": 0.889677147838634, "flos": 26030855283840.0, "grad_norm": 1.7409234299654046, "language_loss": 0.75307262, "learning_rate": 1.262119376137516e-07, "loss": 0.77493036, "num_input_tokens_seen": 160006965, "step": 7399, "time_per_iteration": 2.7672131061553955 }, { "auxiliary_loss_clip": 0.01152227, "auxiliary_loss_mlp": 0.01022856, "balance_loss_clip": 1.00580955, "balance_loss_mlp": 1.01620972, "epoch": 0.8897973907292731, "flos": 26468283110400.0, "grad_norm": 1.5250902144063092, "language_loss": 0.84913248, "learning_rate": 1.2593974077062707e-07, "loss": 0.87088329, "num_input_tokens_seen": 160028585, "step": 7400, "time_per_iteration": 2.658827543258667 }, { "auxiliary_loss_clip": 0.01155533, "auxiliary_loss_mlp": 0.01023457, "balance_loss_clip": 0.93194735, "balance_loss_mlp": 1.01583338, "epoch": 0.8899176336199123, "flos": 26249694894720.0, "grad_norm": 1.5454503212304773, "language_loss": 0.63620603, "learning_rate": 1.2566782822095423e-07, "loss": 0.65799594, "num_input_tokens_seen": 160048840, "step": 7401, "time_per_iteration": 2.848465919494629 }, { "auxiliary_loss_clip": 0.01169161, "auxiliary_loss_mlp": 0.01029778, "balance_loss_clip": 0.93505573, "balance_loss_mlp": 1.02234852, "epoch": 0.8900378765105513, "flos": 20811742156800.0, "grad_norm": 1.7752231932042326, "language_loss": 0.71231401, "learning_rate": 1.2539620000598162e-07, "loss": 0.73430347, "num_input_tokens_seen": 160068175, "step": 7402, "time_per_iteration": 3.5963668823242188 }, { "auxiliary_loss_clip": 0.01165292, "auxiliary_loss_mlp": 0.01025472, "balance_loss_clip": 1.0473628, "balance_loss_mlp": 1.0186559, "epoch": 0.8901581194011904, "flos": 16472333018880.0, "grad_norm": 1.8965066248052658, "language_loss": 0.796785, "learning_rate": 1.2512485616691492e-07, "loss": 0.81869268, "num_input_tokens_seen": 160085230, "step": 7403, "time_per_iteration": 2.5923895835876465 }, { "auxiliary_loss_clip": 0.01166422, "auxiliary_loss_mlp": 0.01027412, "balance_loss_clip": 0.93334079, "balance_loss_mlp": 1.0202471, "epoch": 0.8902783622918296, "flos": 35155253773440.0, "grad_norm": 1.4014222093469142, "language_loss": 0.8093729, "learning_rate": 1.2485379674491681e-07, "loss": 0.83131123, "num_input_tokens_seen": 160111425, "step": 7404, "time_per_iteration": 2.8650455474853516 }, { "auxiliary_loss_clip": 0.01164945, "auxiliary_loss_mlp": 0.01028704, "balance_loss_clip": 0.97275436, "balance_loss_mlp": 1.0217247, "epoch": 0.8903986051824686, "flos": 17201068145280.0, "grad_norm": 5.6145772236903335, "language_loss": 0.79193056, "learning_rate": 1.2458302178110657e-07, "loss": 0.81386703, "num_input_tokens_seen": 160129790, "step": 7405, "time_per_iteration": 3.553619384765625 }, { "auxiliary_loss_clip": 0.01151171, "auxiliary_loss_mlp": 0.0102126, "balance_loss_clip": 0.93106848, "balance_loss_mlp": 1.01498389, "epoch": 0.8905188480731077, "flos": 25483863997440.0, "grad_norm": 1.781912824474208, "language_loss": 0.82534683, "learning_rate": 1.2431253131656118e-07, "loss": 0.84707117, "num_input_tokens_seen": 160149265, "step": 7406, "time_per_iteration": 2.730072021484375 }, { "auxiliary_loss_clip": 0.01160347, "auxiliary_loss_mlp": 0.01024361, "balance_loss_clip": 0.97179925, "balance_loss_mlp": 1.01734865, "epoch": 0.8906390909637467, "flos": 23365888502400.0, "grad_norm": 2.3876711174565015, "language_loss": 0.76452643, "learning_rate": 1.240423253923133e-07, "loss": 0.7863735, "num_input_tokens_seen": 160168870, "step": 7407, "time_per_iteration": 2.6385321617126465 }, { "auxiliary_loss_clip": 0.01164658, "auxiliary_loss_mlp": 0.01030106, "balance_loss_clip": 1.00857472, "balance_loss_mlp": 1.02274227, "epoch": 0.8907593338543859, "flos": 21068790860160.0, "grad_norm": 3.3805610783369904, "language_loss": 0.6936487, "learning_rate": 1.237724040493533e-07, "loss": 0.71559632, "num_input_tokens_seen": 160187495, "step": 7408, "time_per_iteration": 3.6478829383850098 }, { "auxiliary_loss_clip": 0.01171597, "auxiliary_loss_mlp": 0.01032508, "balance_loss_clip": 1.04976106, "balance_loss_mlp": 1.02503371, "epoch": 0.8908795767450249, "flos": 21869562712320.0, "grad_norm": 5.612931447965669, "language_loss": 0.73322314, "learning_rate": 1.2350276732862773e-07, "loss": 0.75526422, "num_input_tokens_seen": 160208520, "step": 7409, "time_per_iteration": 3.521261215209961 }, { "auxiliary_loss_clip": 0.01063338, "auxiliary_loss_mlp": 0.01003469, "balance_loss_clip": 0.97130734, "balance_loss_mlp": 1.0018481, "epoch": 0.890999819635664, "flos": 66307869348480.0, "grad_norm": 0.8331057262060332, "language_loss": 0.56680202, "learning_rate": 1.2323341527103993e-07, "loss": 0.58747011, "num_input_tokens_seen": 160263720, "step": 7410, "time_per_iteration": 3.140091896057129 }, { "auxiliary_loss_clip": 0.01164508, "auxiliary_loss_mlp": 0.01022486, "balance_loss_clip": 1.04709983, "balance_loss_mlp": 1.01530957, "epoch": 0.8911200625263032, "flos": 26869908055680.0, "grad_norm": 2.0759906072106173, "language_loss": 0.85317445, "learning_rate": 1.2296434791745135e-07, "loss": 0.87504435, "num_input_tokens_seen": 160282170, "step": 7411, "time_per_iteration": 2.6479554176330566 }, { "auxiliary_loss_clip": 0.01169221, "auxiliary_loss_mlp": 0.01025166, "balance_loss_clip": 1.01058269, "balance_loss_mlp": 1.01773584, "epoch": 0.8912403054169422, "flos": 20885825957760.0, "grad_norm": 1.5526746079083533, "language_loss": 0.76740319, "learning_rate": 1.2269556530867875e-07, "loss": 0.78934705, "num_input_tokens_seen": 160300725, "step": 7412, "time_per_iteration": 2.6288821697235107 }, { "auxiliary_loss_clip": 0.01171587, "auxiliary_loss_mlp": 0.01031805, "balance_loss_clip": 1.04907882, "balance_loss_mlp": 1.02334702, "epoch": 0.8913605483075813, "flos": 27016567286400.0, "grad_norm": 1.8109594424633098, "language_loss": 0.8184166, "learning_rate": 1.2242706748549614e-07, "loss": 0.84045053, "num_input_tokens_seen": 160318720, "step": 7413, "time_per_iteration": 2.6517839431762695 }, { "auxiliary_loss_clip": 0.01161461, "auxiliary_loss_mlp": 0.01020055, "balance_loss_clip": 0.96637523, "balance_loss_mlp": 1.01342142, "epoch": 0.8914807911982204, "flos": 23621500661760.0, "grad_norm": 1.9480878904356338, "language_loss": 0.81989563, "learning_rate": 1.2215885448863473e-07, "loss": 0.84171081, "num_input_tokens_seen": 160339595, "step": 7414, "time_per_iteration": 2.706088066101074 }, { "auxiliary_loss_clip": 0.01164764, "auxiliary_loss_mlp": 0.01028126, "balance_loss_clip": 0.97235036, "balance_loss_mlp": 1.02115846, "epoch": 0.8916010340888595, "flos": 24462277286400.0, "grad_norm": 2.5975539653435415, "language_loss": 0.80650496, "learning_rate": 1.2189092635878152e-07, "loss": 0.82843381, "num_input_tokens_seen": 160361045, "step": 7415, "time_per_iteration": 2.6546168327331543 }, { "auxiliary_loss_clip": 0.01151422, "auxiliary_loss_mlp": 0.0102783, "balance_loss_clip": 0.93055129, "balance_loss_mlp": 1.0206356, "epoch": 0.8917212769794985, "flos": 21215773313280.0, "grad_norm": 1.5600632737872202, "language_loss": 0.77142084, "learning_rate": 1.216232831365822e-07, "loss": 0.79321331, "num_input_tokens_seen": 160379990, "step": 7416, "time_per_iteration": 2.72135853767395 }, { "auxiliary_loss_clip": 0.01171548, "auxiliary_loss_mlp": 0.01027195, "balance_loss_clip": 0.97236729, "balance_loss_mlp": 1.01986933, "epoch": 0.8918415198701377, "flos": 25513992529920.0, "grad_norm": 5.4577111321395755, "language_loss": 0.80632341, "learning_rate": 1.2135592486263678e-07, "loss": 0.82831079, "num_input_tokens_seen": 160399240, "step": 7417, "time_per_iteration": 2.6490442752838135 }, { "auxiliary_loss_clip": 0.01163539, "auxiliary_loss_mlp": 0.01026675, "balance_loss_clip": 0.97020823, "balance_loss_mlp": 1.01999044, "epoch": 0.8919617627607768, "flos": 37853006693760.0, "grad_norm": 1.700513633073312, "language_loss": 0.61028397, "learning_rate": 1.2108885157750415e-07, "loss": 0.63218606, "num_input_tokens_seen": 160421600, "step": 7418, "time_per_iteration": 2.815195083618164 }, { "auxiliary_loss_clip": 0.01162623, "auxiliary_loss_mlp": 0.0112203, "balance_loss_clip": 0.93626738, "balance_loss_mlp": 0.0, "epoch": 0.8920820056514158, "flos": 26213676531840.0, "grad_norm": 1.776752030748092, "language_loss": 0.80174887, "learning_rate": 1.2082206332169897e-07, "loss": 0.82459539, "num_input_tokens_seen": 160441695, "step": 7419, "time_per_iteration": 2.7090578079223633 }, { "auxiliary_loss_clip": 0.01159458, "auxiliary_loss_mlp": 0.01024307, "balance_loss_clip": 0.97131848, "balance_loss_mlp": 1.01709485, "epoch": 0.892202248542055, "flos": 17383135207680.0, "grad_norm": 2.373230289191325, "language_loss": 0.73404706, "learning_rate": 1.2055556013569225e-07, "loss": 0.75588471, "num_input_tokens_seen": 160457205, "step": 7420, "time_per_iteration": 2.700500011444092 }, { "auxiliary_loss_clip": 0.01165126, "auxiliary_loss_mlp": 0.01025994, "balance_loss_clip": 0.97004783, "balance_loss_mlp": 1.01846957, "epoch": 0.892322491432694, "flos": 21324223451520.0, "grad_norm": 1.6797076077967725, "language_loss": 0.82111609, "learning_rate": 1.2028934205991315e-07, "loss": 0.84302723, "num_input_tokens_seen": 160476525, "step": 7421, "time_per_iteration": 2.6684625148773193 }, { "auxiliary_loss_clip": 0.01164903, "auxiliary_loss_mlp": 0.01027258, "balance_loss_clip": 1.00912881, "balance_loss_mlp": 1.02057981, "epoch": 0.8924427343233331, "flos": 24029374573440.0, "grad_norm": 1.379451553379144, "language_loss": 0.76506531, "learning_rate": 1.2002340913474607e-07, "loss": 0.78698695, "num_input_tokens_seen": 160500160, "step": 7422, "time_per_iteration": 2.7119174003601074 }, { "auxiliary_loss_clip": 0.01167439, "auxiliary_loss_mlp": 0.01024766, "balance_loss_clip": 1.04711628, "balance_loss_mlp": 1.01755428, "epoch": 0.8925629772139723, "flos": 30008069631360.0, "grad_norm": 2.6509743745280274, "language_loss": 0.73999429, "learning_rate": 1.1975776140053317e-07, "loss": 0.76191634, "num_input_tokens_seen": 160520130, "step": 7423, "time_per_iteration": 2.669278860092163 }, { "auxiliary_loss_clip": 0.01161967, "auxiliary_loss_mlp": 0.01030329, "balance_loss_clip": 0.89459705, "balance_loss_mlp": 1.02241397, "epoch": 0.8926832201046113, "flos": 22601709630720.0, "grad_norm": 1.9829536118873539, "language_loss": 0.73387742, "learning_rate": 1.194923988975729e-07, "loss": 0.75580037, "num_input_tokens_seen": 160539730, "step": 7424, "time_per_iteration": 2.740239143371582 }, { "auxiliary_loss_clip": 0.01161805, "auxiliary_loss_mlp": 0.01022595, "balance_loss_clip": 0.9330042, "balance_loss_mlp": 1.01528716, "epoch": 0.8928034629952504, "flos": 13297722117120.0, "grad_norm": 2.3013481445528865, "language_loss": 0.73446363, "learning_rate": 1.192273216661206e-07, "loss": 0.7563076, "num_input_tokens_seen": 160557820, "step": 7425, "time_per_iteration": 2.6460049152374268 }, { "auxiliary_loss_clip": 0.01073229, "auxiliary_loss_mlp": 0.01000653, "balance_loss_clip": 0.82366753, "balance_loss_mlp": 0.99902022, "epoch": 0.8929237058858895, "flos": 54854556744960.0, "grad_norm": 0.85302078784982, "language_loss": 0.57502127, "learning_rate": 1.189625297463881e-07, "loss": 0.59576011, "num_input_tokens_seen": 160619510, "step": 7426, "time_per_iteration": 3.3624746799468994 }, { "auxiliary_loss_clip": 0.01154781, "auxiliary_loss_mlp": 0.01024202, "balance_loss_clip": 0.85269904, "balance_loss_mlp": 1.0174253, "epoch": 0.8930439487765286, "flos": 28883850785280.0, "grad_norm": 1.5781640397206365, "language_loss": 0.7943505, "learning_rate": 1.1869802317854394e-07, "loss": 0.81614029, "num_input_tokens_seen": 160643295, "step": 7427, "time_per_iteration": 3.885612726211548 }, { "auxiliary_loss_clip": 0.01164795, "auxiliary_loss_mlp": 0.01030278, "balance_loss_clip": 0.89370382, "balance_loss_mlp": 1.02315784, "epoch": 0.8931641916671677, "flos": 22419283432320.0, "grad_norm": 1.6468334475865711, "language_loss": 0.72164887, "learning_rate": 1.1843380200271425e-07, "loss": 0.74359959, "num_input_tokens_seen": 160662495, "step": 7428, "time_per_iteration": 2.742582321166992 }, { "auxiliary_loss_clip": 0.01156852, "auxiliary_loss_mlp": 0.01026127, "balance_loss_clip": 0.9320246, "balance_loss_mlp": 1.01880801, "epoch": 0.8932844345578068, "flos": 25843149786240.0, "grad_norm": 1.8975994760199184, "language_loss": 0.8041507, "learning_rate": 1.181698662589805e-07, "loss": 0.82598054, "num_input_tokens_seen": 160682080, "step": 7429, "time_per_iteration": 2.7576613426208496 }, { "auxiliary_loss_clip": 0.01166011, "auxiliary_loss_mlp": 0.01026374, "balance_loss_clip": 1.00852668, "balance_loss_mlp": 1.01935256, "epoch": 0.8934046774484459, "flos": 22925803069440.0, "grad_norm": 4.537728434820711, "language_loss": 0.76001155, "learning_rate": 1.1790621598738249e-07, "loss": 0.78193545, "num_input_tokens_seen": 160700395, "step": 7430, "time_per_iteration": 2.6696321964263916 }, { "auxiliary_loss_clip": 0.01165334, "auxiliary_loss_mlp": 0.01020052, "balance_loss_clip": 1.04915714, "balance_loss_mlp": 1.01399565, "epoch": 0.8935249203390849, "flos": 24462097718400.0, "grad_norm": 2.0015343213832226, "language_loss": 0.74887675, "learning_rate": 1.1764285122791461e-07, "loss": 0.77073061, "num_input_tokens_seen": 160721115, "step": 7431, "time_per_iteration": 3.51523756980896 }, { "auxiliary_loss_clip": 0.01164524, "auxiliary_loss_mlp": 0.01023764, "balance_loss_clip": 1.00775266, "balance_loss_mlp": 1.01675749, "epoch": 0.8936451632297241, "flos": 15742735966080.0, "grad_norm": 3.5374358331956874, "language_loss": 0.77167761, "learning_rate": 1.173797720205294e-07, "loss": 0.7935605, "num_input_tokens_seen": 160739150, "step": 7432, "time_per_iteration": 2.662196636199951 }, { "auxiliary_loss_clip": 0.01168409, "auxiliary_loss_mlp": 0.01026097, "balance_loss_clip": 1.01069438, "balance_loss_mlp": 1.01829433, "epoch": 0.8937654061203631, "flos": 35115500396160.0, "grad_norm": 3.289768703205141, "language_loss": 0.71877444, "learning_rate": 1.1711697840513602e-07, "loss": 0.7407195, "num_input_tokens_seen": 160758585, "step": 7433, "time_per_iteration": 2.755389451980591 }, { "auxiliary_loss_clip": 0.01156217, "auxiliary_loss_mlp": 0.01023408, "balance_loss_clip": 1.00633478, "balance_loss_mlp": 1.01672673, "epoch": 0.8938856490110022, "flos": 16107444708480.0, "grad_norm": 3.1663892180787268, "language_loss": 0.71070588, "learning_rate": 1.1685447042160012e-07, "loss": 0.73250216, "num_input_tokens_seen": 160776620, "step": 7434, "time_per_iteration": 3.4798786640167236 }, { "auxiliary_loss_clip": 0.01169133, "auxiliary_loss_mlp": 0.01027759, "balance_loss_clip": 1.0483954, "balance_loss_mlp": 1.02018952, "epoch": 0.8940058919016414, "flos": 20704189858560.0, "grad_norm": 9.8572257836287, "language_loss": 0.71588999, "learning_rate": 1.1659224810974367e-07, "loss": 0.73785889, "num_input_tokens_seen": 160796580, "step": 7435, "time_per_iteration": 3.590686798095703 }, { "auxiliary_loss_clip": 0.01163017, "auxiliary_loss_mlp": 0.010232, "balance_loss_clip": 0.97281969, "balance_loss_mlp": 1.01654267, "epoch": 0.8941261347922804, "flos": 25229041937280.0, "grad_norm": 1.3912701231233744, "language_loss": 0.68332541, "learning_rate": 1.1633031150934591e-07, "loss": 0.70518756, "num_input_tokens_seen": 160819610, "step": 7436, "time_per_iteration": 2.773472785949707 }, { "auxiliary_loss_clip": 0.01169286, "auxiliary_loss_mlp": 0.0102837, "balance_loss_clip": 1.0119319, "balance_loss_mlp": 1.02089524, "epoch": 0.8942463776829195, "flos": 19537236806400.0, "grad_norm": 2.0395478423559807, "language_loss": 0.79989612, "learning_rate": 1.1606866066014176e-07, "loss": 0.82187265, "num_input_tokens_seen": 160838660, "step": 7437, "time_per_iteration": 2.701533794403076 }, { "auxiliary_loss_clip": 0.01159402, "auxiliary_loss_mlp": 0.01022203, "balance_loss_clip": 0.93122119, "balance_loss_mlp": 1.01479375, "epoch": 0.8943666205735585, "flos": 22301567585280.0, "grad_norm": 5.188810080607118, "language_loss": 0.75117719, "learning_rate": 1.1580729560182434e-07, "loss": 0.77299321, "num_input_tokens_seen": 160854515, "step": 7438, "time_per_iteration": 2.6558244228363037 }, { "auxiliary_loss_clip": 0.01166195, "auxiliary_loss_mlp": 0.01122297, "balance_loss_clip": 1.04784846, "balance_loss_mlp": 0.0, "epoch": 0.8944868634641977, "flos": 18912893581440.0, "grad_norm": 1.8056718624267027, "language_loss": 0.70942259, "learning_rate": 1.1554621637404171e-07, "loss": 0.73230749, "num_input_tokens_seen": 160872605, "step": 7439, "time_per_iteration": 2.6373279094696045 }, { "auxiliary_loss_clip": 0.01166148, "auxiliary_loss_mlp": 0.01023967, "balance_loss_clip": 1.00759459, "balance_loss_mlp": 1.01728272, "epoch": 0.8946071063548368, "flos": 14460904241280.0, "grad_norm": 2.279887704379277, "language_loss": 0.61200005, "learning_rate": 1.1528542301639999e-07, "loss": 0.63390124, "num_input_tokens_seen": 160889395, "step": 7440, "time_per_iteration": 2.610978841781616 }, { "auxiliary_loss_clip": 0.01163485, "auxiliary_loss_mlp": 0.01025017, "balance_loss_clip": 0.93041217, "balance_loss_mlp": 1.01741791, "epoch": 0.8947273492454758, "flos": 20084084438400.0, "grad_norm": 2.3627618940188952, "language_loss": 0.82398367, "learning_rate": 1.1502491556846105e-07, "loss": 0.84586871, "num_input_tokens_seen": 160907890, "step": 7441, "time_per_iteration": 2.678744077682495 }, { "auxiliary_loss_clip": 0.01161845, "auxiliary_loss_mlp": 0.01022246, "balance_loss_clip": 0.97087693, "balance_loss_mlp": 1.01518297, "epoch": 0.894847592136115, "flos": 18550555136640.0, "grad_norm": 2.1186791690898374, "language_loss": 0.81147897, "learning_rate": 1.1476469406974331e-07, "loss": 0.8333199, "num_input_tokens_seen": 160923490, "step": 7442, "time_per_iteration": 2.754265785217285 }, { "auxiliary_loss_clip": 0.01166932, "auxiliary_loss_mlp": 0.01023659, "balance_loss_clip": 1.04827547, "balance_loss_mlp": 1.01658356, "epoch": 0.894967835026754, "flos": 23478468704640.0, "grad_norm": 1.624104296373448, "language_loss": 0.77179492, "learning_rate": 1.1450475855972341e-07, "loss": 0.79370081, "num_input_tokens_seen": 160944280, "step": 7443, "time_per_iteration": 2.604515790939331 }, { "auxiliary_loss_clip": 0.01159281, "auxiliary_loss_mlp": 0.01122275, "balance_loss_clip": 0.96741462, "balance_loss_mlp": 0.0, "epoch": 0.8950880779173931, "flos": 15188310564480.0, "grad_norm": 2.884856457003942, "language_loss": 0.7045849, "learning_rate": 1.1424510907783158e-07, "loss": 0.72740048, "num_input_tokens_seen": 160961560, "step": 7444, "time_per_iteration": 2.678764581680298 }, { "auxiliary_loss_clip": 0.01165679, "auxiliary_loss_mlp": 0.01022644, "balance_loss_clip": 0.96771532, "balance_loss_mlp": 1.01633191, "epoch": 0.8952083208080323, "flos": 22091957769600.0, "grad_norm": 1.592898595161629, "language_loss": 0.82787019, "learning_rate": 1.1398574566345787e-07, "loss": 0.84975338, "num_input_tokens_seen": 160982195, "step": 7445, "time_per_iteration": 2.6569571495056152 }, { "auxiliary_loss_clip": 0.011676, "auxiliary_loss_mlp": 0.01023729, "balance_loss_clip": 0.96866608, "balance_loss_mlp": 1.01644528, "epoch": 0.8953285636986713, "flos": 23254026572160.0, "grad_norm": 2.0950630219230373, "language_loss": 0.82341224, "learning_rate": 1.1372666835594702e-07, "loss": 0.84532547, "num_input_tokens_seen": 161000520, "step": 7446, "time_per_iteration": 2.6353065967559814 }, { "auxiliary_loss_clip": 0.01161937, "auxiliary_loss_mlp": 0.01022143, "balance_loss_clip": 0.97077608, "balance_loss_mlp": 1.01499379, "epoch": 0.8954488065893104, "flos": 16362661818240.0, "grad_norm": 1.8573186636696946, "language_loss": 0.71778029, "learning_rate": 1.1346787719460071e-07, "loss": 0.73962104, "num_input_tokens_seen": 161019405, "step": 7447, "time_per_iteration": 2.6455271244049072 }, { "auxiliary_loss_clip": 0.01159956, "auxiliary_loss_mlp": 0.01023626, "balance_loss_clip": 0.97026247, "balance_loss_mlp": 1.01707816, "epoch": 0.8955690494799495, "flos": 18257883120000.0, "grad_norm": 2.0451136315433156, "language_loss": 0.72444165, "learning_rate": 1.1320937221867732e-07, "loss": 0.74627751, "num_input_tokens_seen": 161036985, "step": 7448, "time_per_iteration": 2.658169746398926 }, { "auxiliary_loss_clip": 0.01162184, "auxiliary_loss_mlp": 0.01023382, "balance_loss_clip": 0.96892977, "balance_loss_mlp": 1.01725125, "epoch": 0.8956892923705886, "flos": 25447486498560.0, "grad_norm": 1.854578105163528, "language_loss": 0.79517341, "learning_rate": 1.1295115346739192e-07, "loss": 0.81702906, "num_input_tokens_seen": 161056985, "step": 7449, "time_per_iteration": 2.6627824306488037 }, { "auxiliary_loss_clip": 0.01165275, "auxiliary_loss_mlp": 0.01026462, "balance_loss_clip": 0.96966493, "balance_loss_mlp": 1.01917863, "epoch": 0.8958095352612276, "flos": 52661883939840.0, "grad_norm": 2.3782382789236696, "language_loss": 0.73500443, "learning_rate": 1.1269322097991629e-07, "loss": 0.75692177, "num_input_tokens_seen": 161080270, "step": 7450, "time_per_iteration": 2.909970998764038 }, { "auxiliary_loss_clip": 0.01169594, "auxiliary_loss_mlp": 0.01033298, "balance_loss_clip": 1.00987506, "balance_loss_mlp": 1.02543592, "epoch": 0.8959297781518668, "flos": 23186335392000.0, "grad_norm": 2.1529485043164667, "language_loss": 0.68042028, "learning_rate": 1.1243557479537846e-07, "loss": 0.7024492, "num_input_tokens_seen": 161100160, "step": 7451, "time_per_iteration": 2.650679111480713 }, { "auxiliary_loss_clip": 0.01164478, "auxiliary_loss_mlp": 0.01023388, "balance_loss_clip": 1.04506695, "balance_loss_mlp": 1.01643801, "epoch": 0.8960500210425059, "flos": 20334309557760.0, "grad_norm": 2.9025538447655115, "language_loss": 0.6858933, "learning_rate": 1.121782149528634e-07, "loss": 0.70777202, "num_input_tokens_seen": 161117260, "step": 7452, "time_per_iteration": 2.611185073852539 }, { "auxiliary_loss_clip": 0.01173046, "auxiliary_loss_mlp": 0.01026544, "balance_loss_clip": 0.9748553, "balance_loss_mlp": 1.01939154, "epoch": 0.8961702639331449, "flos": 19901694153600.0, "grad_norm": 1.9039446417256383, "language_loss": 0.78822184, "learning_rate": 1.1192114149141208e-07, "loss": 0.81021774, "num_input_tokens_seen": 161136895, "step": 7453, "time_per_iteration": 3.7174999713897705 }, { "auxiliary_loss_clip": 0.01166371, "auxiliary_loss_mlp": 0.01028478, "balance_loss_clip": 0.96898961, "balance_loss_mlp": 1.02078307, "epoch": 0.8962905068237841, "flos": 12896348567040.0, "grad_norm": 2.5722480741919833, "language_loss": 0.65569049, "learning_rate": 1.1166435445002197e-07, "loss": 0.67763889, "num_input_tokens_seen": 161154565, "step": 7454, "time_per_iteration": 2.705104351043701 }, { "auxiliary_loss_clip": 0.01171274, "auxiliary_loss_mlp": 0.0103107, "balance_loss_clip": 1.01220429, "balance_loss_mlp": 1.02391458, "epoch": 0.8964107497144231, "flos": 23440331439360.0, "grad_norm": 3.6009478566309014, "language_loss": 0.68320775, "learning_rate": 1.1140785386764818e-07, "loss": 0.70523113, "num_input_tokens_seen": 161173265, "step": 7455, "time_per_iteration": 2.706979990005493 }, { "auxiliary_loss_clip": 0.01155635, "auxiliary_loss_mlp": 0.01026132, "balance_loss_clip": 1.00629854, "balance_loss_mlp": 1.01880097, "epoch": 0.8965309926050622, "flos": 19500176949120.0, "grad_norm": 1.9564085218463758, "language_loss": 0.69820774, "learning_rate": 1.1115163978320153e-07, "loss": 0.7200253, "num_input_tokens_seen": 161191995, "step": 7456, "time_per_iteration": 2.677022695541382 }, { "auxiliary_loss_clip": 0.01170895, "auxiliary_loss_mlp": 0.01122546, "balance_loss_clip": 1.01045847, "balance_loss_mlp": 0.0, "epoch": 0.8966512354957014, "flos": 28658008022400.0, "grad_norm": 2.1483704339223113, "language_loss": 0.8244921, "learning_rate": 1.1089571223554917e-07, "loss": 0.84742647, "num_input_tokens_seen": 161212880, "step": 7457, "time_per_iteration": 3.6923766136169434 }, { "auxiliary_loss_clip": 0.01166904, "auxiliary_loss_mlp": 0.0102424, "balance_loss_clip": 1.00864792, "balance_loss_mlp": 1.01678038, "epoch": 0.8967714783863404, "flos": 23370916406400.0, "grad_norm": 1.9573017671744903, "language_loss": 0.85777009, "learning_rate": 1.1064007126351537e-07, "loss": 0.87968147, "num_input_tokens_seen": 161233595, "step": 7458, "time_per_iteration": 2.6717872619628906 }, { "auxiliary_loss_clip": 0.01157382, "auxiliary_loss_mlp": 0.01026371, "balance_loss_clip": 0.97033703, "balance_loss_mlp": 1.02001715, "epoch": 0.8968917212769795, "flos": 24535175938560.0, "grad_norm": 2.423424056706476, "language_loss": 0.75974298, "learning_rate": 1.1038471690588003e-07, "loss": 0.78158045, "num_input_tokens_seen": 161252740, "step": 7459, "time_per_iteration": 2.656130790710449 }, { "auxiliary_loss_clip": 0.01166917, "auxiliary_loss_mlp": 0.01028965, "balance_loss_clip": 0.89678776, "balance_loss_mlp": 1.02172899, "epoch": 0.8970119641676186, "flos": 23475416048640.0, "grad_norm": 1.8150104959406705, "language_loss": 0.80011535, "learning_rate": 1.1012964920138145e-07, "loss": 0.82207417, "num_input_tokens_seen": 161272325, "step": 7460, "time_per_iteration": 3.689608335494995 }, { "auxiliary_loss_clip": 0.01155696, "auxiliary_loss_mlp": 0.01028884, "balance_loss_clip": 0.96722227, "balance_loss_mlp": 1.02191651, "epoch": 0.8971322070582577, "flos": 24538192680960.0, "grad_norm": 1.6502646387991144, "language_loss": 0.75631011, "learning_rate": 1.0987486818871205e-07, "loss": 0.77815586, "num_input_tokens_seen": 161295915, "step": 7461, "time_per_iteration": 3.6418304443359375 }, { "auxiliary_loss_clip": 0.01165294, "auxiliary_loss_mlp": 0.01122576, "balance_loss_clip": 1.00916266, "balance_loss_mlp": 0.0, "epoch": 0.8972524499488967, "flos": 21797454159360.0, "grad_norm": 2.1141380281973587, "language_loss": 0.73380989, "learning_rate": 1.0962037390652245e-07, "loss": 0.75668859, "num_input_tokens_seen": 161314935, "step": 7462, "time_per_iteration": 2.6105470657348633 }, { "auxiliary_loss_clip": 0.01168611, "auxiliary_loss_mlp": 0.01029722, "balance_loss_clip": 0.97314161, "balance_loss_mlp": 1.0219171, "epoch": 0.8973726928395359, "flos": 21726243446400.0, "grad_norm": 1.6089775617632485, "language_loss": 0.7181288, "learning_rate": 1.0936616639341911e-07, "loss": 0.74011213, "num_input_tokens_seen": 161335225, "step": 7463, "time_per_iteration": 2.7258429527282715 }, { "auxiliary_loss_clip": 0.0106041, "auxiliary_loss_mlp": 0.01004157, "balance_loss_clip": 0.97359419, "balance_loss_mlp": 1.00251234, "epoch": 0.897492935730175, "flos": 53837100097920.0, "grad_norm": 0.7445046187974923, "language_loss": 0.54843557, "learning_rate": 1.0911224568796473e-07, "loss": 0.56908119, "num_input_tokens_seen": 161393420, "step": 7464, "time_per_iteration": 3.2516067028045654 }, { "auxiliary_loss_clip": 0.01163298, "auxiliary_loss_mlp": 0.01018207, "balance_loss_clip": 1.00957799, "balance_loss_mlp": 1.0113734, "epoch": 0.897613178620814, "flos": 18290346036480.0, "grad_norm": 1.993761841722228, "language_loss": 0.71122473, "learning_rate": 1.0885861182867984e-07, "loss": 0.7330398, "num_input_tokens_seen": 161411525, "step": 7465, "time_per_iteration": 2.6401901245117188 }, { "auxiliary_loss_clip": 0.0116565, "auxiliary_loss_mlp": 0.0102108, "balance_loss_clip": 0.96999007, "balance_loss_mlp": 1.01400518, "epoch": 0.8977334215114532, "flos": 32993718059520.0, "grad_norm": 1.6692203339012017, "language_loss": 0.70605284, "learning_rate": 1.0860526485403942e-07, "loss": 0.72792017, "num_input_tokens_seen": 161432800, "step": 7466, "time_per_iteration": 2.7388386726379395 }, { "auxiliary_loss_clip": 0.01166644, "auxiliary_loss_mlp": 0.01027384, "balance_loss_clip": 1.04883599, "balance_loss_mlp": 1.02050614, "epoch": 0.8978536644020922, "flos": 15195636938880.0, "grad_norm": 1.7800910707335593, "language_loss": 0.76867306, "learning_rate": 1.0835220480247675e-07, "loss": 0.79061341, "num_input_tokens_seen": 161451295, "step": 7467, "time_per_iteration": 2.6355669498443604 }, { "auxiliary_loss_clip": 0.0116534, "auxiliary_loss_mlp": 0.01024443, "balance_loss_clip": 0.972736, "balance_loss_mlp": 1.01698971, "epoch": 0.8979739072927313, "flos": 18004389863040.0, "grad_norm": 1.9347420452300272, "language_loss": 0.83532828, "learning_rate": 1.0809943171238067e-07, "loss": 0.85722607, "num_input_tokens_seen": 161469220, "step": 7468, "time_per_iteration": 2.6047346591949463 }, { "auxiliary_loss_clip": 0.01171586, "auxiliary_loss_mlp": 0.01027087, "balance_loss_clip": 0.97194183, "balance_loss_mlp": 1.01896894, "epoch": 0.8980941501833704, "flos": 22271546793600.0, "grad_norm": 2.173015319187495, "language_loss": 0.62947041, "learning_rate": 1.078469456220965e-07, "loss": 0.65145713, "num_input_tokens_seen": 161489375, "step": 7469, "time_per_iteration": 2.6522374153137207 }, { "auxiliary_loss_clip": 0.01167446, "auxiliary_loss_mlp": 0.01022665, "balance_loss_clip": 1.0089438, "balance_loss_mlp": 1.01529217, "epoch": 0.8982143930740095, "flos": 37560729726720.0, "grad_norm": 1.8763501070955628, "language_loss": 0.69406915, "learning_rate": 1.0759474656992606e-07, "loss": 0.71597028, "num_input_tokens_seen": 161512145, "step": 7470, "time_per_iteration": 2.7142250537872314 }, { "auxiliary_loss_clip": 0.01167242, "auxiliary_loss_mlp": 0.01030127, "balance_loss_clip": 0.96915406, "balance_loss_mlp": 1.02309942, "epoch": 0.8983346359646486, "flos": 18076893465600.0, "grad_norm": 2.177480668168622, "language_loss": 0.78331792, "learning_rate": 1.0734283459412785e-07, "loss": 0.80529165, "num_input_tokens_seen": 161528995, "step": 7471, "time_per_iteration": 2.632478952407837 }, { "auxiliary_loss_clip": 0.0116445, "auxiliary_loss_mlp": 0.0102637, "balance_loss_clip": 0.89496017, "balance_loss_mlp": 1.0185616, "epoch": 0.8984548788552876, "flos": 20558895344640.0, "grad_norm": 1.7058087659517354, "language_loss": 0.80740982, "learning_rate": 1.0709120973291707e-07, "loss": 0.82931805, "num_input_tokens_seen": 161548775, "step": 7472, "time_per_iteration": 2.6603939533233643 }, { "auxiliary_loss_clip": 0.01170034, "auxiliary_loss_mlp": 0.01030738, "balance_loss_clip": 1.04929137, "balance_loss_mlp": 1.02283466, "epoch": 0.8985751217459268, "flos": 17785442511360.0, "grad_norm": 1.9718712049293436, "language_loss": 0.77865279, "learning_rate": 1.0683987202446475e-07, "loss": 0.80066049, "num_input_tokens_seen": 161566960, "step": 7473, "time_per_iteration": 2.556337833404541 }, { "auxiliary_loss_clip": 0.01169926, "auxiliary_loss_mlp": 0.01025092, "balance_loss_clip": 1.00984955, "balance_loss_mlp": 1.01779008, "epoch": 0.8986953646365659, "flos": 21617003208960.0, "grad_norm": 1.905701323604986, "language_loss": 0.6965214, "learning_rate": 1.0658882150689862e-07, "loss": 0.71847153, "num_input_tokens_seen": 161585820, "step": 7474, "time_per_iteration": 2.6018354892730713 }, { "auxiliary_loss_clip": 0.01165042, "auxiliary_loss_mlp": 0.01028668, "balance_loss_clip": 0.93231708, "balance_loss_mlp": 1.02127123, "epoch": 0.8988156075272049, "flos": 14027355083520.0, "grad_norm": 3.0935386595049272, "language_loss": 0.7837804, "learning_rate": 1.0633805821830288e-07, "loss": 0.80571747, "num_input_tokens_seen": 161602505, "step": 7475, "time_per_iteration": 2.6690006256103516 }, { "auxiliary_loss_clip": 0.01165696, "auxiliary_loss_mlp": 0.01027299, "balance_loss_clip": 0.97110325, "balance_loss_mlp": 1.02010822, "epoch": 0.8989358504178441, "flos": 29059202004480.0, "grad_norm": 3.094894862898306, "language_loss": 0.83061194, "learning_rate": 1.0608758219671753e-07, "loss": 0.85254192, "num_input_tokens_seen": 161621545, "step": 7476, "time_per_iteration": 2.7193872928619385 }, { "auxiliary_loss_clip": 0.01172092, "auxiliary_loss_mlp": 0.0102941, "balance_loss_clip": 0.97204566, "balance_loss_mlp": 1.02207232, "epoch": 0.8990560933084831, "flos": 20230420446720.0, "grad_norm": 1.5893325547310937, "language_loss": 0.70728511, "learning_rate": 1.0583739348014065e-07, "loss": 0.72930014, "num_input_tokens_seen": 161642630, "step": 7477, "time_per_iteration": 2.6295359134674072 }, { "auxiliary_loss_clip": 0.01168068, "auxiliary_loss_mlp": 0.01026777, "balance_loss_clip": 1.0490303, "balance_loss_mlp": 1.02002978, "epoch": 0.8991763361991222, "flos": 25520672459520.0, "grad_norm": 1.9620953528870875, "language_loss": 0.84501326, "learning_rate": 1.0558749210652518e-07, "loss": 0.86696172, "num_input_tokens_seen": 161662560, "step": 7478, "time_per_iteration": 2.62461519241333 }, { "auxiliary_loss_clip": 0.01169089, "auxiliary_loss_mlp": 0.0102349, "balance_loss_clip": 0.93411219, "balance_loss_mlp": 1.01623356, "epoch": 0.8992965790897613, "flos": 25119191168640.0, "grad_norm": 2.1785322861064484, "language_loss": 0.85400552, "learning_rate": 1.053378781137808e-07, "loss": 0.87593132, "num_input_tokens_seen": 161683480, "step": 7479, "time_per_iteration": 3.538027286529541 }, { "auxiliary_loss_clip": 0.01169501, "auxiliary_loss_mlp": 0.01026641, "balance_loss_clip": 0.97168612, "balance_loss_mlp": 1.0191015, "epoch": 0.8994168219804004, "flos": 16070815814400.0, "grad_norm": 1.8952423874824011, "language_loss": 0.77989089, "learning_rate": 1.0508855153977392e-07, "loss": 0.80185235, "num_input_tokens_seen": 161699945, "step": 7480, "time_per_iteration": 2.595660448074341 }, { "auxiliary_loss_clip": 0.01162834, "auxiliary_loss_mlp": 0.01023965, "balance_loss_clip": 1.006248, "balance_loss_mlp": 1.01709819, "epoch": 0.8995370648710395, "flos": 24825764966400.0, "grad_norm": 2.192971506056647, "language_loss": 0.66326189, "learning_rate": 1.0483951242232669e-07, "loss": 0.68512988, "num_input_tokens_seen": 161720420, "step": 7481, "time_per_iteration": 2.625572919845581 }, { "auxiliary_loss_clip": 0.01059059, "auxiliary_loss_mlp": 0.01003371, "balance_loss_clip": 1.00827122, "balance_loss_mlp": 1.00174928, "epoch": 0.8996573077616786, "flos": 63116238378240.0, "grad_norm": 1.0082626576208837, "language_loss": 0.57770741, "learning_rate": 1.0459076079921936e-07, "loss": 0.59833181, "num_input_tokens_seen": 161773080, "step": 7482, "time_per_iteration": 3.2983877658843994 }, { "auxiliary_loss_clip": 0.01155448, "auxiliary_loss_mlp": 0.0103164, "balance_loss_clip": 0.9698956, "balance_loss_mlp": 1.02428138, "epoch": 0.8997775506523177, "flos": 18219674027520.0, "grad_norm": 2.3336618491557966, "language_loss": 0.85273063, "learning_rate": 1.0434229670818618e-07, "loss": 0.87460148, "num_input_tokens_seen": 161789755, "step": 7483, "time_per_iteration": 3.5673534870147705 }, { "auxiliary_loss_clip": 0.01155699, "auxiliary_loss_mlp": 0.01023444, "balance_loss_clip": 0.96962142, "balance_loss_mlp": 1.01640821, "epoch": 0.8998977935429567, "flos": 24166768095360.0, "grad_norm": 1.6990030885754828, "language_loss": 0.79969418, "learning_rate": 1.0409412018691944e-07, "loss": 0.82148564, "num_input_tokens_seen": 161810220, "step": 7484, "time_per_iteration": 2.6703317165374756 }, { "auxiliary_loss_clip": 0.01161012, "auxiliary_loss_mlp": 0.01028739, "balance_loss_clip": 0.97204864, "balance_loss_mlp": 1.02137756, "epoch": 0.9000180364335959, "flos": 20773030273920.0, "grad_norm": 1.6998026706673823, "language_loss": 0.75185055, "learning_rate": 1.0384623127306724e-07, "loss": 0.77374804, "num_input_tokens_seen": 161827565, "step": 7485, "time_per_iteration": 2.7261574268341064 }, { "auxiliary_loss_clip": 0.01159118, "auxiliary_loss_mlp": 0.01026808, "balance_loss_clip": 0.93068564, "balance_loss_mlp": 1.02001953, "epoch": 0.900138279324235, "flos": 19205745166080.0, "grad_norm": 1.6740586505343502, "language_loss": 0.7947762, "learning_rate": 1.0359863000423397e-07, "loss": 0.81663549, "num_input_tokens_seen": 161845700, "step": 7486, "time_per_iteration": 3.597792387008667 }, { "auxiliary_loss_clip": 0.01167315, "auxiliary_loss_mlp": 0.01026636, "balance_loss_clip": 1.04728925, "balance_loss_mlp": 1.01953435, "epoch": 0.900258522214874, "flos": 28731158069760.0, "grad_norm": 1.6923867347034802, "language_loss": 0.71770388, "learning_rate": 1.0335131641798112e-07, "loss": 0.73964334, "num_input_tokens_seen": 161867660, "step": 7487, "time_per_iteration": 3.5722134113311768 }, { "auxiliary_loss_clip": 0.01063019, "auxiliary_loss_mlp": 0.01003743, "balance_loss_clip": 0.93469489, "balance_loss_mlp": 1.0020386, "epoch": 0.9003787651055132, "flos": 58280685655680.0, "grad_norm": 0.8311551438961264, "language_loss": 0.55663514, "learning_rate": 1.0310429055182512e-07, "loss": 0.57730275, "num_input_tokens_seen": 161921980, "step": 7488, "time_per_iteration": 3.1225790977478027 }, { "auxiliary_loss_clip": 0.01167468, "auxiliary_loss_mlp": 0.01024771, "balance_loss_clip": 0.93335634, "balance_loss_mlp": 1.01754105, "epoch": 0.9004990079961522, "flos": 25556475340800.0, "grad_norm": 1.743363290953299, "language_loss": 0.74089217, "learning_rate": 1.0285755244324024e-07, "loss": 0.76281458, "num_input_tokens_seen": 161942725, "step": 7489, "time_per_iteration": 2.7406373023986816 }, { "auxiliary_loss_clip": 0.01166042, "auxiliary_loss_mlp": 0.01122291, "balance_loss_clip": 0.96979463, "balance_loss_mlp": 0.0, "epoch": 0.9006192508867913, "flos": 23335185352320.0, "grad_norm": 1.441804880430036, "language_loss": 0.68941474, "learning_rate": 1.0261110212965629e-07, "loss": 0.71229804, "num_input_tokens_seen": 161964520, "step": 7490, "time_per_iteration": 2.6964457035064697 }, { "auxiliary_loss_clip": 0.01163431, "auxiliary_loss_mlp": 0.01030224, "balance_loss_clip": 0.96948355, "balance_loss_mlp": 1.02348602, "epoch": 0.9007394937774305, "flos": 18040300485120.0, "grad_norm": 2.029064936768715, "language_loss": 0.78892887, "learning_rate": 1.023649396484596e-07, "loss": 0.8108654, "num_input_tokens_seen": 161983575, "step": 7491, "time_per_iteration": 2.6506357192993164 }, { "auxiliary_loss_clip": 0.01167226, "auxiliary_loss_mlp": 0.01024202, "balance_loss_clip": 1.04753745, "balance_loss_mlp": 1.0175885, "epoch": 0.9008597366680695, "flos": 43068456633600.0, "grad_norm": 2.777956186516845, "language_loss": 0.6767841, "learning_rate": 1.0211906503699275e-07, "loss": 0.6986984, "num_input_tokens_seen": 162006550, "step": 7492, "time_per_iteration": 2.756197690963745 }, { "auxiliary_loss_clip": 0.01169056, "auxiliary_loss_mlp": 0.01027866, "balance_loss_clip": 1.01183105, "balance_loss_mlp": 1.02070093, "epoch": 0.9009799795587086, "flos": 14939055112320.0, "grad_norm": 2.162210389344314, "language_loss": 0.82024181, "learning_rate": 1.0187347833255455e-07, "loss": 0.84221095, "num_input_tokens_seen": 162022455, "step": 7493, "time_per_iteration": 2.617975950241089 }, { "auxiliary_loss_clip": 0.01165163, "auxiliary_loss_mlp": 0.01026574, "balance_loss_clip": 1.04859412, "balance_loss_mlp": 1.01961184, "epoch": 0.9011002224493477, "flos": 21579584215680.0, "grad_norm": 1.7026169087434069, "language_loss": 0.79172975, "learning_rate": 1.0162817957240056e-07, "loss": 0.81364715, "num_input_tokens_seen": 162042350, "step": 7494, "time_per_iteration": 2.596271514892578 }, { "auxiliary_loss_clip": 0.01062665, "auxiliary_loss_mlp": 0.010019, "balance_loss_clip": 0.97163498, "balance_loss_mlp": 1.00023127, "epoch": 0.9012204653399868, "flos": 71166367883520.0, "grad_norm": 0.8791844046577423, "language_loss": 0.63107306, "learning_rate": 1.0138316879374253e-07, "loss": 0.65171868, "num_input_tokens_seen": 162111640, "step": 7495, "time_per_iteration": 3.317721128463745 }, { "auxiliary_loss_clip": 0.01166098, "auxiliary_loss_mlp": 0.01023605, "balance_loss_clip": 0.97262025, "balance_loss_mlp": 1.01662493, "epoch": 0.9013407082306258, "flos": 15594963413760.0, "grad_norm": 2.1260512569632226, "language_loss": 0.74600232, "learning_rate": 1.0113844603374833e-07, "loss": 0.76789927, "num_input_tokens_seen": 162128165, "step": 7496, "time_per_iteration": 2.582359552383423 }, { "auxiliary_loss_clip": 0.01163864, "auxiliary_loss_mlp": 0.01031978, "balance_loss_clip": 0.96944231, "balance_loss_mlp": 1.02414, "epoch": 0.901460951121265, "flos": 15049157276160.0, "grad_norm": 2.50957176234986, "language_loss": 0.72105229, "learning_rate": 1.0089401132954178e-07, "loss": 0.7430107, "num_input_tokens_seen": 162146145, "step": 7497, "time_per_iteration": 2.737253189086914 }, { "auxiliary_loss_clip": 0.01167416, "auxiliary_loss_mlp": 0.01026469, "balance_loss_clip": 0.97305429, "balance_loss_mlp": 1.02008522, "epoch": 0.9015811940119041, "flos": 22236857233920.0, "grad_norm": 1.6689923702430685, "language_loss": 0.71996868, "learning_rate": 1.006498647182037e-07, "loss": 0.74190748, "num_input_tokens_seen": 162164800, "step": 7498, "time_per_iteration": 2.6370906829833984 }, { "auxiliary_loss_clip": 0.0115502, "auxiliary_loss_mlp": 0.01028303, "balance_loss_clip": 0.85326302, "balance_loss_mlp": 1.02066803, "epoch": 0.9017014369025431, "flos": 24973824827520.0, "grad_norm": 2.081170329479492, "language_loss": 0.71567333, "learning_rate": 1.004060062367713e-07, "loss": 0.73750657, "num_input_tokens_seen": 162185895, "step": 7499, "time_per_iteration": 2.8005599975585938 }, { "auxiliary_loss_clip": 0.01168223, "auxiliary_loss_mlp": 0.01030078, "balance_loss_clip": 1.00964999, "balance_loss_mlp": 1.02256799, "epoch": 0.9018216797931822, "flos": 18114168804480.0, "grad_norm": 1.7161279501640767, "language_loss": 0.69162548, "learning_rate": 1.0016243592223728e-07, "loss": 0.7136085, "num_input_tokens_seen": 162206295, "step": 7500, "time_per_iteration": 2.6142873764038086 }, { "auxiliary_loss_clip": 0.01155859, "auxiliary_loss_mlp": 0.01023679, "balance_loss_clip": 0.85565215, "balance_loss_mlp": 1.01691425, "epoch": 0.9019419226838213, "flos": 37268452759680.0, "grad_norm": 1.9330938704181793, "language_loss": 0.65822923, "learning_rate": 9.991915381155114e-08, "loss": 0.68002468, "num_input_tokens_seen": 162229275, "step": 7501, "time_per_iteration": 2.9257051944732666 }, { "auxiliary_loss_clip": 0.01171385, "auxiliary_loss_mlp": 0.01022667, "balance_loss_clip": 1.01164806, "balance_loss_mlp": 1.01549673, "epoch": 0.9020621655744604, "flos": 23441121538560.0, "grad_norm": 2.0844028199762974, "language_loss": 0.74557137, "learning_rate": 9.967615994161871e-08, "loss": 0.76751196, "num_input_tokens_seen": 162248935, "step": 7502, "time_per_iteration": 2.7666807174682617 }, { "auxiliary_loss_clip": 0.01163564, "auxiliary_loss_mlp": 0.01024618, "balance_loss_clip": 1.04532361, "balance_loss_mlp": 1.01765037, "epoch": 0.9021824084650995, "flos": 22857465444480.0, "grad_norm": 1.698420325543636, "language_loss": 0.78496581, "learning_rate": 9.943345434930161e-08, "loss": 0.80684763, "num_input_tokens_seen": 162269185, "step": 7503, "time_per_iteration": 2.5838377475738525 }, { "auxiliary_loss_clip": 0.01161223, "auxiliary_loss_mlp": 0.01024622, "balance_loss_clip": 0.93378168, "balance_loss_mlp": 1.01777375, "epoch": 0.9023026513557386, "flos": 22127581082880.0, "grad_norm": 2.6295084745119373, "language_loss": 0.69268823, "learning_rate": 9.919103707141885e-08, "loss": 0.71454668, "num_input_tokens_seen": 162288065, "step": 7504, "time_per_iteration": 2.6616365909576416 }, { "auxiliary_loss_clip": 0.0116624, "auxiliary_loss_mlp": 0.01027997, "balance_loss_clip": 1.00923896, "balance_loss_mlp": 1.02045667, "epoch": 0.9024228942463777, "flos": 24199087357440.0, "grad_norm": 2.2738059769816448, "language_loss": 0.76299524, "learning_rate": 9.89489081447441e-08, "loss": 0.78493762, "num_input_tokens_seen": 162305265, "step": 7505, "time_per_iteration": 3.6969480514526367 }, { "auxiliary_loss_clip": 0.01162724, "auxiliary_loss_mlp": 0.01021877, "balance_loss_clip": 0.96911246, "balance_loss_mlp": 1.01467121, "epoch": 0.9025431371370167, "flos": 25008262992000.0, "grad_norm": 1.9076483983222132, "language_loss": 0.82876289, "learning_rate": 9.870706760600844e-08, "loss": 0.85060894, "num_input_tokens_seen": 162325215, "step": 7506, "time_per_iteration": 2.724283218383789 }, { "auxiliary_loss_clip": 0.01169889, "auxiliary_loss_mlp": 0.01029596, "balance_loss_clip": 0.90017104, "balance_loss_mlp": 1.02282202, "epoch": 0.9026633800276559, "flos": 18952862440320.0, "grad_norm": 1.8925364888777505, "language_loss": 0.72847795, "learning_rate": 9.846551549189918e-08, "loss": 0.75047284, "num_input_tokens_seen": 162344820, "step": 7507, "time_per_iteration": 2.748673439025879 }, { "auxiliary_loss_clip": 0.01164195, "auxiliary_loss_mlp": 0.0103115, "balance_loss_clip": 0.97310734, "balance_loss_mlp": 1.02344275, "epoch": 0.902783622918295, "flos": 32416059536640.0, "grad_norm": 1.8905994370020898, "language_loss": 0.68914175, "learning_rate": 9.822425183905902e-08, "loss": 0.71109521, "num_input_tokens_seen": 162365345, "step": 7508, "time_per_iteration": 2.808478355407715 }, { "auxiliary_loss_clip": 0.01066578, "auxiliary_loss_mlp": 0.01002541, "balance_loss_clip": 0.89836317, "balance_loss_mlp": 1.00084841, "epoch": 0.902903865808934, "flos": 63717453244800.0, "grad_norm": 0.9362518796013728, "language_loss": 0.75345635, "learning_rate": 9.798327668408823e-08, "loss": 0.77414751, "num_input_tokens_seen": 162426980, "step": 7509, "time_per_iteration": 4.27722954750061 }, { "auxiliary_loss_clip": 0.01170639, "auxiliary_loss_mlp": 0.01028764, "balance_loss_clip": 1.04851162, "balance_loss_mlp": 1.02138543, "epoch": 0.9030241086995732, "flos": 23804034600960.0, "grad_norm": 2.0434776037153766, "language_loss": 0.6864723, "learning_rate": 9.774259006354158e-08, "loss": 0.70846635, "num_input_tokens_seen": 162447050, "step": 7510, "time_per_iteration": 2.6393849849700928 }, { "auxiliary_loss_clip": 0.0116706, "auxiliary_loss_mlp": 0.01027278, "balance_loss_clip": 0.96981204, "balance_loss_mlp": 1.02064967, "epoch": 0.9031443515902122, "flos": 26395887248640.0, "grad_norm": 1.7434026240919902, "language_loss": 0.76633829, "learning_rate": 9.750219201393184e-08, "loss": 0.78828168, "num_input_tokens_seen": 162467015, "step": 7511, "time_per_iteration": 3.652996778488159 }, { "auxiliary_loss_clip": 0.01164688, "auxiliary_loss_mlp": 0.01029771, "balance_loss_clip": 1.00803936, "balance_loss_mlp": 1.02301145, "epoch": 0.9032645944808513, "flos": 24939350749440.0, "grad_norm": 1.7418643937352378, "language_loss": 0.77620941, "learning_rate": 9.726208257172697e-08, "loss": 0.798154, "num_input_tokens_seen": 162488710, "step": 7512, "time_per_iteration": 2.7097222805023193 }, { "auxiliary_loss_clip": 0.01165611, "auxiliary_loss_mlp": 0.01021189, "balance_loss_clip": 1.04780602, "balance_loss_mlp": 1.01443911, "epoch": 0.9033848373714904, "flos": 21178821196800.0, "grad_norm": 2.0277038883377156, "language_loss": 0.74902344, "learning_rate": 9.702226177335115e-08, "loss": 0.77089149, "num_input_tokens_seen": 162507205, "step": 7513, "time_per_iteration": 3.4361002445220947 }, { "auxiliary_loss_clip": 0.01166782, "auxiliary_loss_mlp": 0.01026137, "balance_loss_clip": 0.97301316, "balance_loss_mlp": 1.0192318, "epoch": 0.9035050802621295, "flos": 26286359702400.0, "grad_norm": 1.626042422214569, "language_loss": 0.72743118, "learning_rate": 9.67827296551853e-08, "loss": 0.74936038, "num_input_tokens_seen": 162528490, "step": 7514, "time_per_iteration": 2.6860318183898926 }, { "auxiliary_loss_clip": 0.01151886, "auxiliary_loss_mlp": 0.01121853, "balance_loss_clip": 0.96669066, "balance_loss_mlp": 0.0, "epoch": 0.9036253231527686, "flos": 24204546224640.0, "grad_norm": 2.0739346627530506, "language_loss": 0.68407691, "learning_rate": 9.65434862535659e-08, "loss": 0.70681429, "num_input_tokens_seen": 162547860, "step": 7515, "time_per_iteration": 2.6750733852386475 }, { "auxiliary_loss_clip": 0.01168583, "auxiliary_loss_mlp": 0.01028207, "balance_loss_clip": 0.9710201, "balance_loss_mlp": 1.02119732, "epoch": 0.9037455660434077, "flos": 18072655660800.0, "grad_norm": 2.5134743343106862, "language_loss": 0.6474576, "learning_rate": 9.630453160478635e-08, "loss": 0.66942555, "num_input_tokens_seen": 162563215, "step": 7516, "time_per_iteration": 2.614799976348877 }, { "auxiliary_loss_clip": 0.01158872, "auxiliary_loss_mlp": 0.01028858, "balance_loss_clip": 0.89122832, "balance_loss_mlp": 1.02184868, "epoch": 0.9038658089340468, "flos": 24060795995520.0, "grad_norm": 1.6971200642328266, "language_loss": 0.82346284, "learning_rate": 9.60658657450959e-08, "loss": 0.84534013, "num_input_tokens_seen": 162583515, "step": 7517, "time_per_iteration": 2.7208242416381836 }, { "auxiliary_loss_clip": 0.01152126, "auxiliary_loss_mlp": 0.01022232, "balance_loss_clip": 0.96805364, "balance_loss_mlp": 1.01546383, "epoch": 0.9039860518246858, "flos": 21834298535040.0, "grad_norm": 1.5432657226163575, "language_loss": 0.79424232, "learning_rate": 9.582748871069979e-08, "loss": 0.81598592, "num_input_tokens_seen": 162602955, "step": 7518, "time_per_iteration": 2.687452554702759 }, { "auxiliary_loss_clip": 0.01165268, "auxiliary_loss_mlp": 0.01122137, "balance_loss_clip": 0.96907759, "balance_loss_mlp": 0.0, "epoch": 0.904106294715325, "flos": 26614870513920.0, "grad_norm": 3.1635815775002993, "language_loss": 0.83354592, "learning_rate": 9.558940053775954e-08, "loss": 0.85641998, "num_input_tokens_seen": 162621595, "step": 7519, "time_per_iteration": 2.7192165851593018 }, { "auxiliary_loss_clip": 0.01165804, "auxiliary_loss_mlp": 0.01028712, "balance_loss_clip": 1.01012373, "balance_loss_mlp": 1.02169919, "epoch": 0.904226537605964, "flos": 17785693906560.0, "grad_norm": 1.8436969540403556, "language_loss": 0.67840636, "learning_rate": 9.535160126239294e-08, "loss": 0.70035154, "num_input_tokens_seen": 162638220, "step": 7520, "time_per_iteration": 2.5727622509002686 }, { "auxiliary_loss_clip": 0.0116605, "auxiliary_loss_mlp": 0.0102343, "balance_loss_clip": 1.01195168, "balance_loss_mlp": 1.01632822, "epoch": 0.9043467804966031, "flos": 24790428961920.0, "grad_norm": 1.4404092743613952, "language_loss": 0.70641959, "learning_rate": 9.511409092067424e-08, "loss": 0.7283144, "num_input_tokens_seen": 162658575, "step": 7521, "time_per_iteration": 2.697760581970215 }, { "auxiliary_loss_clip": 0.01164126, "auxiliary_loss_mlp": 0.01022692, "balance_loss_clip": 0.9722985, "balance_loss_mlp": 1.0152061, "epoch": 0.9044670233872423, "flos": 22632125472000.0, "grad_norm": 1.6765950622650794, "language_loss": 0.67325395, "learning_rate": 9.487686954863327e-08, "loss": 0.69512218, "num_input_tokens_seen": 162678295, "step": 7522, "time_per_iteration": 2.6631407737731934 }, { "auxiliary_loss_clip": 0.011648, "auxiliary_loss_mlp": 0.01020601, "balance_loss_clip": 1.01004195, "balance_loss_mlp": 1.01341224, "epoch": 0.9045872662778813, "flos": 23771320289280.0, "grad_norm": 2.3646668681964207, "language_loss": 0.77280986, "learning_rate": 9.46399371822566e-08, "loss": 0.79466391, "num_input_tokens_seen": 162698070, "step": 7523, "time_per_iteration": 2.707524299621582 }, { "auxiliary_loss_clip": 0.01169537, "auxiliary_loss_mlp": 0.01026503, "balance_loss_clip": 1.04915261, "balance_loss_mlp": 1.01964271, "epoch": 0.9047075091685204, "flos": 15191039998080.0, "grad_norm": 4.035112464120941, "language_loss": 0.72280586, "learning_rate": 9.440329385748657e-08, "loss": 0.74476624, "num_input_tokens_seen": 162715140, "step": 7524, "time_per_iteration": 2.55295991897583 }, { "auxiliary_loss_clip": 0.01164456, "auxiliary_loss_mlp": 0.01020866, "balance_loss_clip": 0.93435729, "balance_loss_mlp": 1.0147028, "epoch": 0.9048277520591596, "flos": 18003707504640.0, "grad_norm": 1.7494840903411994, "language_loss": 0.70754743, "learning_rate": 9.416693961022137e-08, "loss": 0.72940058, "num_input_tokens_seen": 162733390, "step": 7525, "time_per_iteration": 2.7428178787231445 }, { "auxiliary_loss_clip": 0.01143652, "auxiliary_loss_mlp": 0.01029753, "balance_loss_clip": 0.85178024, "balance_loss_mlp": 1.02273798, "epoch": 0.9049479949497986, "flos": 21872471713920.0, "grad_norm": 1.7601806193077196, "language_loss": 0.77201551, "learning_rate": 9.393087447631654e-08, "loss": 0.79374957, "num_input_tokens_seen": 162751670, "step": 7526, "time_per_iteration": 2.750436544418335 }, { "auxiliary_loss_clip": 0.0116533, "auxiliary_loss_mlp": 0.01024527, "balance_loss_clip": 0.97008538, "balance_loss_mlp": 1.01766026, "epoch": 0.9050682378404377, "flos": 20773928113920.0, "grad_norm": 1.5428126842915961, "language_loss": 0.72453707, "learning_rate": 9.36950984915823e-08, "loss": 0.74643564, "num_input_tokens_seen": 162770025, "step": 7527, "time_per_iteration": 2.625576972961426 }, { "auxiliary_loss_clip": 0.01167628, "auxiliary_loss_mlp": 0.01027014, "balance_loss_clip": 1.04838777, "balance_loss_mlp": 1.02006376, "epoch": 0.9051884807310768, "flos": 21580015178880.0, "grad_norm": 1.773470023073589, "language_loss": 0.68962866, "learning_rate": 9.345961169178607e-08, "loss": 0.71157509, "num_input_tokens_seen": 162789710, "step": 7528, "time_per_iteration": 2.578794240951538 }, { "auxiliary_loss_clip": 0.01156452, "auxiliary_loss_mlp": 0.01028155, "balance_loss_clip": 0.93588454, "balance_loss_mlp": 1.02100837, "epoch": 0.9053087236217159, "flos": 21908059113600.0, "grad_norm": 1.4248570281016506, "language_loss": 0.72899002, "learning_rate": 9.322441411265081e-08, "loss": 0.75083613, "num_input_tokens_seen": 162810695, "step": 7529, "time_per_iteration": 2.649142265319824 }, { "auxiliary_loss_clip": 0.01158585, "auxiliary_loss_mlp": 0.01024902, "balance_loss_clip": 0.97114158, "balance_loss_mlp": 1.01790154, "epoch": 0.9054289665123549, "flos": 17055809544960.0, "grad_norm": 1.876365105625734, "language_loss": 0.73180521, "learning_rate": 9.298950578985554e-08, "loss": 0.75364006, "num_input_tokens_seen": 162827770, "step": 7530, "time_per_iteration": 2.648735761642456 }, { "auxiliary_loss_clip": 0.01165561, "auxiliary_loss_mlp": 0.01122853, "balance_loss_clip": 1.01153505, "balance_loss_mlp": 0.0, "epoch": 0.905549209402994, "flos": 20777268078720.0, "grad_norm": 1.7215555789907115, "language_loss": 0.70762056, "learning_rate": 9.275488675903665e-08, "loss": 0.73050469, "num_input_tokens_seen": 162846715, "step": 7531, "time_per_iteration": 3.513004779815674 }, { "auxiliary_loss_clip": 0.01160521, "auxiliary_loss_mlp": 0.01021022, "balance_loss_clip": 0.89440441, "balance_loss_mlp": 1.013798, "epoch": 0.9056694522936332, "flos": 21686813291520.0, "grad_norm": 2.276371744241441, "language_loss": 0.73863471, "learning_rate": 9.252055705578454e-08, "loss": 0.76045012, "num_input_tokens_seen": 162866215, "step": 7532, "time_per_iteration": 2.687514543533325 }, { "auxiliary_loss_clip": 0.01165954, "auxiliary_loss_mlp": 0.01028141, "balance_loss_clip": 1.00904918, "balance_loss_mlp": 1.0216229, "epoch": 0.9057896951842722, "flos": 29569133433600.0, "grad_norm": 1.6239354319768464, "language_loss": 0.72120309, "learning_rate": 9.228651671564747e-08, "loss": 0.74314404, "num_input_tokens_seen": 162888245, "step": 7533, "time_per_iteration": 2.6388721466064453 }, { "auxiliary_loss_clip": 0.01163113, "auxiliary_loss_mlp": 0.01027859, "balance_loss_clip": 0.89718038, "balance_loss_mlp": 1.02123404, "epoch": 0.9059099380749113, "flos": 27892248952320.0, "grad_norm": 1.4507869231742343, "language_loss": 0.77666366, "learning_rate": 9.205276577412901e-08, "loss": 0.79857337, "num_input_tokens_seen": 162911025, "step": 7534, "time_per_iteration": 2.745758533477783 }, { "auxiliary_loss_clip": 0.01169919, "auxiliary_loss_mlp": 0.01122691, "balance_loss_clip": 0.96986353, "balance_loss_mlp": 0.0, "epoch": 0.9060301809655504, "flos": 17748993185280.0, "grad_norm": 2.509815579138281, "language_loss": 0.77073956, "learning_rate": 9.181930426668905e-08, "loss": 0.79366571, "num_input_tokens_seen": 162927820, "step": 7535, "time_per_iteration": 2.6027896404266357 }, { "auxiliary_loss_clip": 0.01159536, "auxiliary_loss_mlp": 0.01020304, "balance_loss_clip": 0.89456987, "balance_loss_mlp": 1.01335454, "epoch": 0.9061504238561895, "flos": 31759432963200.0, "grad_norm": 1.5451631273686222, "language_loss": 0.67778552, "learning_rate": 9.158613222874346e-08, "loss": 0.69958389, "num_input_tokens_seen": 162949445, "step": 7536, "time_per_iteration": 3.752373218536377 }, { "auxiliary_loss_clip": 0.01162104, "auxiliary_loss_mlp": 0.01027933, "balance_loss_clip": 0.97061992, "balance_loss_mlp": 1.02088797, "epoch": 0.9062706667468285, "flos": 20048066075520.0, "grad_norm": 1.6294045008664286, "language_loss": 0.81785864, "learning_rate": 9.135324969566394e-08, "loss": 0.83975899, "num_input_tokens_seen": 162968945, "step": 7537, "time_per_iteration": 3.500149965286255 }, { "auxiliary_loss_clip": 0.01172923, "auxiliary_loss_mlp": 0.01023664, "balance_loss_clip": 1.01260459, "balance_loss_mlp": 1.01676452, "epoch": 0.9063909096374677, "flos": 18437292576000.0, "grad_norm": 2.236129482765967, "language_loss": 0.75399327, "learning_rate": 9.112065670277913e-08, "loss": 0.77595913, "num_input_tokens_seen": 162985310, "step": 7538, "time_per_iteration": 3.5791831016540527 }, { "auxiliary_loss_clip": 0.01159396, "auxiliary_loss_mlp": 0.01025849, "balance_loss_clip": 0.96867824, "balance_loss_mlp": 1.01921153, "epoch": 0.9065111525281068, "flos": 33547353361920.0, "grad_norm": 1.9535340492091537, "language_loss": 0.73037422, "learning_rate": 9.088835328537303e-08, "loss": 0.75222665, "num_input_tokens_seen": 163006900, "step": 7539, "time_per_iteration": 2.758603572845459 }, { "auxiliary_loss_clip": 0.01170462, "auxiliary_loss_mlp": 0.01032181, "balance_loss_clip": 0.97294462, "balance_loss_mlp": 1.02478456, "epoch": 0.9066313954187458, "flos": 23367863750400.0, "grad_norm": 4.576745942618813, "language_loss": 0.71472275, "learning_rate": 9.065633947868568e-08, "loss": 0.73674917, "num_input_tokens_seen": 163026505, "step": 7540, "time_per_iteration": 2.7207274436950684 }, { "auxiliary_loss_clip": 0.01160873, "auxiliary_loss_mlp": 0.01122008, "balance_loss_clip": 0.9348104, "balance_loss_mlp": 0.0, "epoch": 0.906751638309385, "flos": 26249623067520.0, "grad_norm": 2.726474420806786, "language_loss": 0.79765731, "learning_rate": 9.042461531791379e-08, "loss": 0.82048619, "num_input_tokens_seen": 163044925, "step": 7541, "time_per_iteration": 2.7174689769744873 }, { "auxiliary_loss_clip": 0.01162115, "auxiliary_loss_mlp": 0.01020998, "balance_loss_clip": 1.04597187, "balance_loss_mlp": 1.01430178, "epoch": 0.906871881200024, "flos": 16544477485440.0, "grad_norm": 1.5491948954788566, "language_loss": 0.78186512, "learning_rate": 9.019318083820903e-08, "loss": 0.80369622, "num_input_tokens_seen": 163063505, "step": 7542, "time_per_iteration": 2.5881412029266357 }, { "auxiliary_loss_clip": 0.01164318, "auxiliary_loss_mlp": 0.01025869, "balance_loss_clip": 1.00864828, "balance_loss_mlp": 1.01854062, "epoch": 0.9069921240906631, "flos": 24605129675520.0, "grad_norm": 1.5130904830939187, "language_loss": 0.84859049, "learning_rate": 8.996203607468045e-08, "loss": 0.87049234, "num_input_tokens_seen": 163082505, "step": 7543, "time_per_iteration": 2.7825775146484375 }, { "auxiliary_loss_clip": 0.01158797, "auxiliary_loss_mlp": 0.01022091, "balance_loss_clip": 1.00606704, "balance_loss_mlp": 1.01509976, "epoch": 0.9071123669813023, "flos": 25374731500800.0, "grad_norm": 3.093404391961825, "language_loss": 0.75318056, "learning_rate": 8.973118106239241e-08, "loss": 0.77498943, "num_input_tokens_seen": 163105110, "step": 7544, "time_per_iteration": 2.668793201446533 }, { "auxiliary_loss_clip": 0.01155395, "auxiliary_loss_mlp": 0.01026943, "balance_loss_clip": 0.85043907, "balance_loss_mlp": 1.01973939, "epoch": 0.9072326098719413, "flos": 26725798690560.0, "grad_norm": 1.889247154608036, "language_loss": 0.94571054, "learning_rate": 8.95006158363656e-08, "loss": 0.96753389, "num_input_tokens_seen": 163125295, "step": 7545, "time_per_iteration": 2.837505340576172 }, { "auxiliary_loss_clip": 0.01168312, "auxiliary_loss_mlp": 0.01030741, "balance_loss_clip": 1.0113821, "balance_loss_mlp": 1.02284312, "epoch": 0.9073528527625804, "flos": 23878800760320.0, "grad_norm": 1.654777016233181, "language_loss": 0.77171522, "learning_rate": 8.9270340431576e-08, "loss": 0.79370576, "num_input_tokens_seen": 163144385, "step": 7546, "time_per_iteration": 2.718472480773926 }, { "auxiliary_loss_clip": 0.01166048, "auxiliary_loss_mlp": 0.01023535, "balance_loss_clip": 1.00741982, "balance_loss_mlp": 1.01635253, "epoch": 0.9074730956532195, "flos": 37852144767360.0, "grad_norm": 2.711903864361529, "language_loss": 0.73272467, "learning_rate": 8.904035488295658e-08, "loss": 0.75462055, "num_input_tokens_seen": 163163885, "step": 7547, "time_per_iteration": 2.8696024417877197 }, { "auxiliary_loss_clip": 0.01062919, "auxiliary_loss_mlp": 0.01115798, "balance_loss_clip": 0.9720962, "balance_loss_mlp": 0.0, "epoch": 0.9075933385438586, "flos": 65173307385600.0, "grad_norm": 0.6733337747637349, "language_loss": 0.53324437, "learning_rate": 8.881065922539632e-08, "loss": 0.55503154, "num_input_tokens_seen": 163224325, "step": 7548, "time_per_iteration": 3.174959897994995 }, { "auxiliary_loss_clip": 0.01158065, "auxiliary_loss_mlp": 0.01026553, "balance_loss_clip": 0.93311334, "balance_loss_mlp": 1.01996088, "epoch": 0.9077135814344977, "flos": 19931571290880.0, "grad_norm": 1.6402525939784482, "language_loss": 0.73327327, "learning_rate": 8.85812534937389e-08, "loss": 0.75511944, "num_input_tokens_seen": 163242425, "step": 7549, "time_per_iteration": 2.7218358516693115 }, { "auxiliary_loss_clip": 0.01173172, "auxiliary_loss_mlp": 0.01025429, "balance_loss_clip": 1.01153767, "balance_loss_mlp": 1.01814508, "epoch": 0.9078338243251368, "flos": 17529650784000.0, "grad_norm": 3.00904216790287, "language_loss": 0.67384171, "learning_rate": 8.835213772278583e-08, "loss": 0.69582772, "num_input_tokens_seen": 163259280, "step": 7550, "time_per_iteration": 2.561352014541626 }, { "auxiliary_loss_clip": 0.01156785, "auxiliary_loss_mlp": 0.01020372, "balance_loss_clip": 0.93410301, "balance_loss_mlp": 1.01345158, "epoch": 0.9079540672157759, "flos": 28803410277120.0, "grad_norm": 1.7964071341506411, "language_loss": 0.78686082, "learning_rate": 8.812331194729373e-08, "loss": 0.80863237, "num_input_tokens_seen": 163278925, "step": 7551, "time_per_iteration": 2.722450017929077 }, { "auxiliary_loss_clip": 0.01169474, "auxiliary_loss_mlp": 0.01031943, "balance_loss_clip": 1.05030429, "balance_loss_mlp": 1.02424848, "epoch": 0.9080743101064149, "flos": 23513840622720.0, "grad_norm": 1.8240850266198978, "language_loss": 0.72098559, "learning_rate": 8.789477620197461e-08, "loss": 0.74299979, "num_input_tokens_seen": 163298450, "step": 7552, "time_per_iteration": 2.639409303665161 }, { "auxiliary_loss_clip": 0.01163494, "auxiliary_loss_mlp": 0.01026971, "balance_loss_clip": 0.97029465, "balance_loss_mlp": 1.01994061, "epoch": 0.9081945529970541, "flos": 22778102344320.0, "grad_norm": 2.0418273255371293, "language_loss": 0.79366785, "learning_rate": 8.766653052149831e-08, "loss": 0.8155725, "num_input_tokens_seen": 163313635, "step": 7553, "time_per_iteration": 2.732257843017578 }, { "auxiliary_loss_clip": 0.01165943, "auxiliary_loss_mlp": 0.01024512, "balance_loss_clip": 0.97203159, "balance_loss_mlp": 1.01767564, "epoch": 0.9083147958876931, "flos": 18873714821760.0, "grad_norm": 1.9849238702550587, "language_loss": 0.74137485, "learning_rate": 8.743857494048823e-08, "loss": 0.76327944, "num_input_tokens_seen": 163330450, "step": 7554, "time_per_iteration": 2.609273672103882 }, { "auxiliary_loss_clip": 0.01161965, "auxiliary_loss_mlp": 0.01019931, "balance_loss_clip": 0.93214405, "balance_loss_mlp": 1.01298082, "epoch": 0.9084350387783322, "flos": 18909374048640.0, "grad_norm": 1.8287178344053527, "language_loss": 0.62693989, "learning_rate": 8.721090949352605e-08, "loss": 0.64875883, "num_input_tokens_seen": 163346690, "step": 7555, "time_per_iteration": 2.6863179206848145 }, { "auxiliary_loss_clip": 0.01178132, "auxiliary_loss_mlp": 0.01027735, "balance_loss_clip": 1.01367164, "balance_loss_mlp": 1.02005208, "epoch": 0.9085552816689714, "flos": 20595488325120.0, "grad_norm": 1.790696230589839, "language_loss": 0.72820854, "learning_rate": 8.698353421514793e-08, "loss": 0.75026721, "num_input_tokens_seen": 163365065, "step": 7556, "time_per_iteration": 2.606003999710083 }, { "auxiliary_loss_clip": 0.01164313, "auxiliary_loss_mlp": 0.01028759, "balance_loss_clip": 1.01088142, "balance_loss_mlp": 1.02165425, "epoch": 0.9086755245596104, "flos": 18113163223680.0, "grad_norm": 2.1792676766098897, "language_loss": 0.80291796, "learning_rate": 8.67564491398467e-08, "loss": 0.82484871, "num_input_tokens_seen": 163382070, "step": 7557, "time_per_iteration": 3.395994186401367 }, { "auxiliary_loss_clip": 0.01166308, "auxiliary_loss_mlp": 0.01029085, "balance_loss_clip": 1.00827646, "balance_loss_mlp": 1.02181959, "epoch": 0.9087957674502495, "flos": 19129793857920.0, "grad_norm": 2.0619807020872214, "language_loss": 0.73378491, "learning_rate": 8.652965430207104e-08, "loss": 0.75573885, "num_input_tokens_seen": 163399975, "step": 7558, "time_per_iteration": 2.581925630569458 }, { "auxiliary_loss_clip": 0.01167194, "auxiliary_loss_mlp": 0.01022956, "balance_loss_clip": 1.00786877, "balance_loss_mlp": 1.01586604, "epoch": 0.9089160103408886, "flos": 18109930999680.0, "grad_norm": 1.9392588570694416, "language_loss": 0.6524784, "learning_rate": 8.630314973622521e-08, "loss": 0.67437989, "num_input_tokens_seen": 163417520, "step": 7559, "time_per_iteration": 2.5644826889038086 }, { "auxiliary_loss_clip": 0.01161686, "auxiliary_loss_mlp": 0.01030072, "balance_loss_clip": 1.00930929, "balance_loss_mlp": 1.02336669, "epoch": 0.9090362532315277, "flos": 33364855336320.0, "grad_norm": 1.7915095828224064, "language_loss": 0.70888662, "learning_rate": 8.607693547666995e-08, "loss": 0.7308042, "num_input_tokens_seen": 163440060, "step": 7560, "time_per_iteration": 2.715796709060669 }, { "auxiliary_loss_clip": 0.01067442, "auxiliary_loss_mlp": 0.0100235, "balance_loss_clip": 0.89689022, "balance_loss_mlp": 1.00060964, "epoch": 0.9091564961221668, "flos": 71480585082240.0, "grad_norm": 0.8915120916881671, "language_loss": 0.57989049, "learning_rate": 8.585101155772201e-08, "loss": 0.60058844, "num_input_tokens_seen": 163502180, "step": 7561, "time_per_iteration": 3.314460039138794 }, { "auxiliary_loss_clip": 0.01152853, "auxiliary_loss_mlp": 0.01024932, "balance_loss_clip": 0.96596527, "balance_loss_mlp": 1.01747, "epoch": 0.9092767390128058, "flos": 24712574232960.0, "grad_norm": 2.6067107664490052, "language_loss": 0.68402255, "learning_rate": 8.562537801365377e-08, "loss": 0.70580041, "num_input_tokens_seen": 163521915, "step": 7562, "time_per_iteration": 3.52685546875 }, { "auxiliary_loss_clip": 0.01170341, "auxiliary_loss_mlp": 0.01032656, "balance_loss_clip": 1.04890037, "balance_loss_mlp": 1.02508032, "epoch": 0.909396981903445, "flos": 23586487879680.0, "grad_norm": 1.7492287391728238, "language_loss": 0.70102644, "learning_rate": 8.540003487869362e-08, "loss": 0.72305632, "num_input_tokens_seen": 163543585, "step": 7563, "time_per_iteration": 2.5350630283355713 }, { "auxiliary_loss_clip": 0.01149951, "auxiliary_loss_mlp": 0.01021891, "balance_loss_clip": 0.93123496, "balance_loss_mlp": 1.01547146, "epoch": 0.909517224794084, "flos": 23404169422080.0, "grad_norm": 1.8178149691864978, "language_loss": 0.79515558, "learning_rate": 8.517498218702557e-08, "loss": 0.81687397, "num_input_tokens_seen": 163561515, "step": 7564, "time_per_iteration": 3.7358922958374023 }, { "auxiliary_loss_clip": 0.0115677, "auxiliary_loss_mlp": 0.01024811, "balance_loss_clip": 0.92947018, "balance_loss_mlp": 1.01776576, "epoch": 0.9096374676847231, "flos": 19208618254080.0, "grad_norm": 1.6658717224459039, "language_loss": 0.70090771, "learning_rate": 8.49502199727905e-08, "loss": 0.7227236, "num_input_tokens_seen": 163579540, "step": 7565, "time_per_iteration": 3.401956796646118 }, { "auxiliary_loss_clip": 0.01156871, "auxiliary_loss_mlp": 0.01026541, "balance_loss_clip": 1.00527453, "balance_loss_mlp": 1.01931739, "epoch": 0.9097577105753623, "flos": 33292495388160.0, "grad_norm": 2.0625805469537584, "language_loss": 0.66461778, "learning_rate": 8.472574827008428e-08, "loss": 0.68645191, "num_input_tokens_seen": 163600425, "step": 7566, "time_per_iteration": 2.6848998069763184 }, { "auxiliary_loss_clip": 0.01165269, "auxiliary_loss_mlp": 0.01024828, "balance_loss_clip": 1.00872386, "balance_loss_mlp": 1.01800382, "epoch": 0.9098779534660013, "flos": 21906443001600.0, "grad_norm": 1.789111851189495, "language_loss": 0.84416747, "learning_rate": 8.450156711295942e-08, "loss": 0.86606842, "num_input_tokens_seen": 163620595, "step": 7567, "time_per_iteration": 2.6429905891418457 }, { "auxiliary_loss_clip": 0.01163585, "auxiliary_loss_mlp": 0.01026514, "balance_loss_clip": 0.97453648, "balance_loss_mlp": 1.01961827, "epoch": 0.9099981963566404, "flos": 25730354102400.0, "grad_norm": 2.058212293662253, "language_loss": 0.86198425, "learning_rate": 8.427767653542383e-08, "loss": 0.88388526, "num_input_tokens_seen": 163635765, "step": 7568, "time_per_iteration": 2.634324073791504 }, { "auxiliary_loss_clip": 0.01156386, "auxiliary_loss_mlp": 0.01021623, "balance_loss_clip": 0.89257413, "balance_loss_mlp": 1.01490283, "epoch": 0.9101184392472795, "flos": 21069437304960.0, "grad_norm": 1.8562450281861493, "language_loss": 0.70129645, "learning_rate": 8.405407657144125e-08, "loss": 0.72307652, "num_input_tokens_seen": 163654925, "step": 7569, "time_per_iteration": 2.675980567932129 }, { "auxiliary_loss_clip": 0.01155048, "auxiliary_loss_mlp": 0.01024141, "balance_loss_clip": 0.96811628, "balance_loss_mlp": 1.01720548, "epoch": 0.9102386821379186, "flos": 24752614919040.0, "grad_norm": 1.7434831015660825, "language_loss": 0.72440916, "learning_rate": 8.383076725493232e-08, "loss": 0.74620104, "num_input_tokens_seen": 163672245, "step": 7570, "time_per_iteration": 2.671285390853882 }, { "auxiliary_loss_clip": 0.01164399, "auxiliary_loss_mlp": 0.01030136, "balance_loss_clip": 1.00753522, "balance_loss_mlp": 1.02283764, "epoch": 0.9103589250285576, "flos": 22562818179840.0, "grad_norm": 2.2751755235630013, "language_loss": 0.67944396, "learning_rate": 8.360774861977216e-08, "loss": 0.70138925, "num_input_tokens_seen": 163691365, "step": 7571, "time_per_iteration": 2.6061835289001465 }, { "auxiliary_loss_clip": 0.01161192, "auxiliary_loss_mlp": 0.01028343, "balance_loss_clip": 0.96721178, "balance_loss_mlp": 1.02167296, "epoch": 0.9104791679191968, "flos": 25373474524800.0, "grad_norm": 1.7422285910733941, "language_loss": 0.74548733, "learning_rate": 8.338502069979281e-08, "loss": 0.76738262, "num_input_tokens_seen": 163711675, "step": 7572, "time_per_iteration": 2.6946005821228027 }, { "auxiliary_loss_clip": 0.01166537, "auxiliary_loss_mlp": 0.01028245, "balance_loss_clip": 1.00748301, "balance_loss_mlp": 1.02099061, "epoch": 0.9105994108098359, "flos": 14426681558400.0, "grad_norm": 2.8902992611103113, "language_loss": 0.79654205, "learning_rate": 8.316258352878214e-08, "loss": 0.81848991, "num_input_tokens_seen": 163728095, "step": 7573, "time_per_iteration": 2.53898024559021 }, { "auxiliary_loss_clip": 0.01170589, "auxiliary_loss_mlp": 0.01027411, "balance_loss_clip": 1.00863659, "balance_loss_mlp": 1.01979923, "epoch": 0.9107196537004749, "flos": 26718292748160.0, "grad_norm": 1.7188051683426258, "language_loss": 0.70793879, "learning_rate": 8.294043714048338e-08, "loss": 0.72991872, "num_input_tokens_seen": 163747175, "step": 7574, "time_per_iteration": 2.758598566055298 }, { "auxiliary_loss_clip": 0.01065469, "auxiliary_loss_mlp": 0.01002416, "balance_loss_clip": 0.93390065, "balance_loss_mlp": 1.00072312, "epoch": 0.9108398965911141, "flos": 66532634703360.0, "grad_norm": 0.7656151398965229, "language_loss": 0.60578299, "learning_rate": 8.271858156859624e-08, "loss": 0.62646174, "num_input_tokens_seen": 163812545, "step": 7575, "time_per_iteration": 3.25911808013916 }, { "auxiliary_loss_clip": 0.01165776, "auxiliary_loss_mlp": 0.01023084, "balance_loss_clip": 1.04746306, "balance_loss_mlp": 1.0165689, "epoch": 0.9109601394817531, "flos": 25411073086080.0, "grad_norm": 1.5919384079362342, "language_loss": 0.73761129, "learning_rate": 8.249701684677557e-08, "loss": 0.75949991, "num_input_tokens_seen": 163833870, "step": 7576, "time_per_iteration": 2.579782485961914 }, { "auxiliary_loss_clip": 0.01168679, "auxiliary_loss_mlp": 0.01029655, "balance_loss_clip": 1.01258802, "balance_loss_mlp": 1.0228157, "epoch": 0.9110803823723922, "flos": 22747794243840.0, "grad_norm": 1.6950922810796432, "language_loss": 0.80976903, "learning_rate": 8.227574300863294e-08, "loss": 0.83175242, "num_input_tokens_seen": 163854040, "step": 7577, "time_per_iteration": 2.6143922805786133 }, { "auxiliary_loss_clip": 0.01169992, "auxiliary_loss_mlp": 0.01029211, "balance_loss_clip": 0.97242671, "balance_loss_mlp": 1.02169466, "epoch": 0.9112006252630314, "flos": 48469924131840.0, "grad_norm": 1.75828785133821, "language_loss": 0.6970427, "learning_rate": 8.205476008773548e-08, "loss": 0.71903473, "num_input_tokens_seen": 163878040, "step": 7578, "time_per_iteration": 2.8518333435058594 }, { "auxiliary_loss_clip": 0.01158723, "auxiliary_loss_mlp": 0.01031117, "balance_loss_clip": 0.93419671, "balance_loss_mlp": 1.02431011, "epoch": 0.9113208681536704, "flos": 30009649829760.0, "grad_norm": 3.7303931879527257, "language_loss": 0.82566857, "learning_rate": 8.183406811760596e-08, "loss": 0.84756696, "num_input_tokens_seen": 163897770, "step": 7579, "time_per_iteration": 2.76501202583313 }, { "auxiliary_loss_clip": 0.01145043, "auxiliary_loss_mlp": 0.01027835, "balance_loss_clip": 0.92796385, "balance_loss_mlp": 1.02038145, "epoch": 0.9114411110443095, "flos": 25594971742080.0, "grad_norm": 1.3956842637280213, "language_loss": 0.74134803, "learning_rate": 8.161366713172313e-08, "loss": 0.76307672, "num_input_tokens_seen": 163920160, "step": 7580, "time_per_iteration": 2.6929771900177 }, { "auxiliary_loss_clip": 0.01165683, "auxiliary_loss_mlp": 0.01028027, "balance_loss_clip": 0.93099844, "balance_loss_mlp": 1.02009058, "epoch": 0.9115613539349486, "flos": 18399729928320.0, "grad_norm": 2.501692195731526, "language_loss": 0.84435254, "learning_rate": 8.139355716352137e-08, "loss": 0.86628962, "num_input_tokens_seen": 163935000, "step": 7581, "time_per_iteration": 2.6221582889556885 }, { "auxiliary_loss_clip": 0.01166567, "auxiliary_loss_mlp": 0.01022923, "balance_loss_clip": 0.96910179, "balance_loss_mlp": 1.01550567, "epoch": 0.9116815968255877, "flos": 21726171619200.0, "grad_norm": 1.4651681195676298, "language_loss": 0.69982666, "learning_rate": 8.117373824639196e-08, "loss": 0.72172153, "num_input_tokens_seen": 163955265, "step": 7582, "time_per_iteration": 2.759945869445801 }, { "auxiliary_loss_clip": 0.01059124, "auxiliary_loss_mlp": 0.01002903, "balance_loss_clip": 1.00826812, "balance_loss_mlp": 1.00129402, "epoch": 0.9118018397162267, "flos": 65363526835200.0, "grad_norm": 0.7279858957601352, "language_loss": 0.59324276, "learning_rate": 8.095421041368067e-08, "loss": 0.61386299, "num_input_tokens_seen": 164014680, "step": 7583, "time_per_iteration": 3.15122389793396 }, { "auxiliary_loss_clip": 0.01164158, "auxiliary_loss_mlp": 0.01122242, "balance_loss_clip": 0.9728564, "balance_loss_mlp": 0.0, "epoch": 0.9119220826068659, "flos": 20922885815040.0, "grad_norm": 1.833623979332811, "language_loss": 0.70824343, "learning_rate": 8.073497369868999e-08, "loss": 0.73110747, "num_input_tokens_seen": 164033140, "step": 7584, "time_per_iteration": 3.4837803840637207 }, { "auxiliary_loss_clip": 0.0117081, "auxiliary_loss_mlp": 0.01025143, "balance_loss_clip": 0.97074127, "balance_loss_mlp": 1.01804984, "epoch": 0.912042325497505, "flos": 28366449327360.0, "grad_norm": 1.6066577552447452, "language_loss": 0.75620902, "learning_rate": 8.051602813467772e-08, "loss": 0.77816856, "num_input_tokens_seen": 164054995, "step": 7585, "time_per_iteration": 2.6933419704437256 }, { "auxiliary_loss_clip": 0.01167734, "auxiliary_loss_mlp": 0.01025018, "balance_loss_clip": 1.00998235, "balance_loss_mlp": 1.01851821, "epoch": 0.912162568388144, "flos": 17566782468480.0, "grad_norm": 1.8096314792879724, "language_loss": 0.71316898, "learning_rate": 8.029737375485756e-08, "loss": 0.73509645, "num_input_tokens_seen": 164074225, "step": 7586, "time_per_iteration": 2.6714823246002197 }, { "auxiliary_loss_clip": 0.01168801, "auxiliary_loss_mlp": 0.01025324, "balance_loss_clip": 1.0487231, "balance_loss_mlp": 1.01808214, "epoch": 0.9122828112787832, "flos": 19827897661440.0, "grad_norm": 1.6699019564594932, "language_loss": 0.72277814, "learning_rate": 8.007901059239986e-08, "loss": 0.74471939, "num_input_tokens_seen": 164093505, "step": 7587, "time_per_iteration": 2.5628912448883057 }, { "auxiliary_loss_clip": 0.01162363, "auxiliary_loss_mlp": 0.01025058, "balance_loss_clip": 0.96865189, "balance_loss_mlp": 1.018332, "epoch": 0.9124030541694222, "flos": 20813789232000.0, "grad_norm": 1.70017568562462, "language_loss": 0.80107415, "learning_rate": 7.986093868042964e-08, "loss": 0.82294834, "num_input_tokens_seen": 164113750, "step": 7588, "time_per_iteration": 3.5657434463500977 }, { "auxiliary_loss_clip": 0.0116412, "auxiliary_loss_mlp": 0.01027301, "balance_loss_clip": 1.00855446, "balance_loss_mlp": 1.02118862, "epoch": 0.9125232970600613, "flos": 25192305302400.0, "grad_norm": 2.10646536986411, "language_loss": 0.67644602, "learning_rate": 7.964315805202826e-08, "loss": 0.6983602, "num_input_tokens_seen": 164134330, "step": 7589, "time_per_iteration": 2.702023983001709 }, { "auxiliary_loss_clip": 0.01170035, "auxiliary_loss_mlp": 0.01026844, "balance_loss_clip": 0.97398138, "balance_loss_mlp": 1.01953697, "epoch": 0.9126435399507005, "flos": 19719591177600.0, "grad_norm": 1.731975891101715, "language_loss": 0.7326436, "learning_rate": 7.942566874023304e-08, "loss": 0.75461245, "num_input_tokens_seen": 164153515, "step": 7590, "time_per_iteration": 3.638927936553955 }, { "auxiliary_loss_clip": 0.01158775, "auxiliary_loss_mlp": 0.0102243, "balance_loss_clip": 0.96753293, "balance_loss_mlp": 1.01532817, "epoch": 0.9127637828413395, "flos": 19573614305280.0, "grad_norm": 2.0793404777336306, "language_loss": 0.6975103, "learning_rate": 7.920847077803649e-08, "loss": 0.71932232, "num_input_tokens_seen": 164171305, "step": 7591, "time_per_iteration": 3.474865198135376 }, { "auxiliary_loss_clip": 0.01145755, "auxiliary_loss_mlp": 0.01026073, "balance_loss_clip": 0.88746721, "balance_loss_mlp": 1.01879525, "epoch": 0.9128840257319786, "flos": 20230635928320.0, "grad_norm": 1.7776204614742481, "language_loss": 0.81731886, "learning_rate": 7.899156419838826e-08, "loss": 0.83903718, "num_input_tokens_seen": 164190275, "step": 7592, "time_per_iteration": 2.656567096710205 }, { "auxiliary_loss_clip": 0.01158406, "auxiliary_loss_mlp": 0.01020788, "balance_loss_clip": 0.93159795, "balance_loss_mlp": 1.01422238, "epoch": 0.9130042686226177, "flos": 24858658846080.0, "grad_norm": 7.507721883213795, "language_loss": 0.65429533, "learning_rate": 7.87749490341918e-08, "loss": 0.67608726, "num_input_tokens_seen": 164210550, "step": 7593, "time_per_iteration": 2.744656801223755 }, { "auxiliary_loss_clip": 0.01170451, "auxiliary_loss_mlp": 0.01025231, "balance_loss_clip": 1.04941988, "balance_loss_mlp": 1.01804304, "epoch": 0.9131245115132568, "flos": 23581747284480.0, "grad_norm": 1.924736733438426, "language_loss": 0.83304232, "learning_rate": 7.855862531830836e-08, "loss": 0.85499918, "num_input_tokens_seen": 164226660, "step": 7594, "time_per_iteration": 2.5641579627990723 }, { "auxiliary_loss_clip": 0.01163742, "auxiliary_loss_mlp": 0.01030253, "balance_loss_clip": 1.00878811, "balance_loss_mlp": 1.02244425, "epoch": 0.9132447544038959, "flos": 19931607204480.0, "grad_norm": 1.6124210639633143, "language_loss": 0.72411317, "learning_rate": 7.834259308355373e-08, "loss": 0.7460531, "num_input_tokens_seen": 164245425, "step": 7595, "time_per_iteration": 2.7026121616363525 }, { "auxiliary_loss_clip": 0.01141648, "auxiliary_loss_mlp": 0.01023293, "balance_loss_clip": 0.85327846, "balance_loss_mlp": 1.01625371, "epoch": 0.9133649972945349, "flos": 21981747864960.0, "grad_norm": 4.068800513509963, "language_loss": 0.74847978, "learning_rate": 7.812685236269989e-08, "loss": 0.7701292, "num_input_tokens_seen": 164264085, "step": 7596, "time_per_iteration": 2.9190902709960938 }, { "auxiliary_loss_clip": 0.01065803, "auxiliary_loss_mlp": 0.01003586, "balance_loss_clip": 0.89866185, "balance_loss_mlp": 1.00201213, "epoch": 0.9134852401851741, "flos": 71240523511680.0, "grad_norm": 0.7953684003816095, "language_loss": 0.58694589, "learning_rate": 7.791140318847445e-08, "loss": 0.60763979, "num_input_tokens_seen": 164322220, "step": 7597, "time_per_iteration": 3.2619524002075195 }, { "auxiliary_loss_clip": 0.01161883, "auxiliary_loss_mlp": 0.01019632, "balance_loss_clip": 0.97379518, "balance_loss_mlp": 1.0129298, "epoch": 0.9136054830758131, "flos": 23626923615360.0, "grad_norm": 1.725195159473633, "language_loss": 0.80291986, "learning_rate": 7.769624559356081e-08, "loss": 0.82473499, "num_input_tokens_seen": 164345615, "step": 7598, "time_per_iteration": 2.727447271347046 }, { "auxiliary_loss_clip": 0.01163956, "auxiliary_loss_mlp": 0.01028205, "balance_loss_clip": 1.00872397, "balance_loss_mlp": 1.02091861, "epoch": 0.9137257259664522, "flos": 23438858981760.0, "grad_norm": 2.623681125713789, "language_loss": 0.75282043, "learning_rate": 7.748137961059842e-08, "loss": 0.77474207, "num_input_tokens_seen": 164359595, "step": 7599, "time_per_iteration": 2.60652494430542 }, { "auxiliary_loss_clip": 0.0116331, "auxiliary_loss_mlp": 0.01026521, "balance_loss_clip": 1.04742277, "balance_loss_mlp": 1.0200181, "epoch": 0.9138459688570914, "flos": 19127854523520.0, "grad_norm": 2.176308401991881, "language_loss": 0.65605724, "learning_rate": 7.726680527218211e-08, "loss": 0.67795557, "num_input_tokens_seen": 164376635, "step": 7600, "time_per_iteration": 2.548328161239624 }, { "auxiliary_loss_clip": 0.01164834, "auxiliary_loss_mlp": 0.01022147, "balance_loss_clip": 1.04446697, "balance_loss_mlp": 1.01474357, "epoch": 0.9139662117477304, "flos": 46281240714240.0, "grad_norm": 1.6670725193379992, "language_loss": 0.75407064, "learning_rate": 7.70525226108627e-08, "loss": 0.77594048, "num_input_tokens_seen": 164400305, "step": 7601, "time_per_iteration": 2.873410224914551 }, { "auxiliary_loss_clip": 0.0117146, "auxiliary_loss_mlp": 0.01030041, "balance_loss_clip": 1.01385927, "balance_loss_mlp": 1.02326357, "epoch": 0.9140864546383695, "flos": 22273198819200.0, "grad_norm": 1.6678783598040037, "language_loss": 0.79753816, "learning_rate": 7.683853165914666e-08, "loss": 0.8195532, "num_input_tokens_seen": 164418075, "step": 7602, "time_per_iteration": 2.64096999168396 }, { "auxiliary_loss_clip": 0.01163749, "auxiliary_loss_mlp": 0.01022206, "balance_loss_clip": 0.89524317, "balance_loss_mlp": 1.01543832, "epoch": 0.9142066975290086, "flos": 17530009920000.0, "grad_norm": 1.6840726993215118, "language_loss": 0.77063453, "learning_rate": 7.662483244949602e-08, "loss": 0.79249418, "num_input_tokens_seen": 164435335, "step": 7603, "time_per_iteration": 2.6910741329193115 }, { "auxiliary_loss_clip": 0.01159329, "auxiliary_loss_mlp": 0.01030522, "balance_loss_clip": 0.93400812, "balance_loss_mlp": 1.02385199, "epoch": 0.9143269404196477, "flos": 17712148809600.0, "grad_norm": 2.2036714742175914, "language_loss": 0.80526233, "learning_rate": 7.641142501432951e-08, "loss": 0.82716084, "num_input_tokens_seen": 164451530, "step": 7604, "time_per_iteration": 2.6279609203338623 }, { "auxiliary_loss_clip": 0.01157868, "auxiliary_loss_mlp": 0.01024814, "balance_loss_clip": 0.96937108, "balance_loss_mlp": 1.01802242, "epoch": 0.9144471833102867, "flos": 33323414019840.0, "grad_norm": 1.6902076072632513, "language_loss": 0.73658609, "learning_rate": 7.619830938602013e-08, "loss": 0.75841296, "num_input_tokens_seen": 164472755, "step": 7605, "time_per_iteration": 2.815375804901123 }, { "auxiliary_loss_clip": 0.01158775, "auxiliary_loss_mlp": 0.01029056, "balance_loss_clip": 1.0080862, "balance_loss_mlp": 1.02190638, "epoch": 0.9145674262009259, "flos": 21068970428160.0, "grad_norm": 2.49792034929163, "language_loss": 0.82668948, "learning_rate": 7.598548559689777e-08, "loss": 0.84856772, "num_input_tokens_seen": 164491155, "step": 7606, "time_per_iteration": 2.6243371963500977 }, { "auxiliary_loss_clip": 0.01155613, "auxiliary_loss_mlp": 0.01028258, "balance_loss_clip": 0.93179393, "balance_loss_mlp": 1.02105737, "epoch": 0.914687669091565, "flos": 16800269212800.0, "grad_norm": 2.1672683197097298, "language_loss": 0.80881613, "learning_rate": 7.577295367924751e-08, "loss": 0.83065486, "num_input_tokens_seen": 164507555, "step": 7607, "time_per_iteration": 2.6429762840270996 }, { "auxiliary_loss_clip": 0.01171144, "auxiliary_loss_mlp": 0.01032962, "balance_loss_clip": 0.97416878, "balance_loss_mlp": 1.02565432, "epoch": 0.914807911982204, "flos": 25773627012480.0, "grad_norm": 1.706649361586258, "language_loss": 0.82255089, "learning_rate": 7.556071366531002e-08, "loss": 0.84459192, "num_input_tokens_seen": 164528525, "step": 7608, "time_per_iteration": 2.7818572521209717 }, { "auxiliary_loss_clip": 0.01165885, "auxiliary_loss_mlp": 0.01027689, "balance_loss_clip": 1.0110333, "balance_loss_mlp": 1.02032232, "epoch": 0.9149281548728432, "flos": 19208043636480.0, "grad_norm": 1.778688589403677, "language_loss": 0.7924825, "learning_rate": 7.53487655872822e-08, "loss": 0.8144182, "num_input_tokens_seen": 164547695, "step": 7609, "time_per_iteration": 3.575336217880249 }, { "auxiliary_loss_clip": 0.01165094, "auxiliary_loss_mlp": 0.01026429, "balance_loss_clip": 0.89241016, "balance_loss_mlp": 1.01937151, "epoch": 0.9150483977634822, "flos": 26870554500480.0, "grad_norm": 1.7228140945349133, "language_loss": 0.73783875, "learning_rate": 7.513710947731656e-08, "loss": 0.75975394, "num_input_tokens_seen": 164568905, "step": 7610, "time_per_iteration": 2.730393648147583 }, { "auxiliary_loss_clip": 0.01155251, "auxiliary_loss_mlp": 0.01025211, "balance_loss_clip": 0.97013867, "balance_loss_mlp": 1.01818347, "epoch": 0.9151686406541213, "flos": 21908956953600.0, "grad_norm": 1.9044886158308798, "language_loss": 0.84903705, "learning_rate": 7.492574536752095e-08, "loss": 0.87084162, "num_input_tokens_seen": 164588895, "step": 7611, "time_per_iteration": 2.7240231037139893 }, { "auxiliary_loss_clip": 0.01160835, "auxiliary_loss_mlp": 0.01026419, "balance_loss_clip": 1.00954568, "balance_loss_mlp": 1.01988673, "epoch": 0.9152888835447605, "flos": 27308556944640.0, "grad_norm": 1.8528624505119977, "language_loss": 0.78244913, "learning_rate": 7.471467328995907e-08, "loss": 0.80432171, "num_input_tokens_seen": 164607705, "step": 7612, "time_per_iteration": 2.6533987522125244 }, { "auxiliary_loss_clip": 0.01163831, "auxiliary_loss_mlp": 0.01026064, "balance_loss_clip": 0.77802211, "balance_loss_mlp": 1.01877999, "epoch": 0.9154091264353995, "flos": 13370728510080.0, "grad_norm": 2.623145079305023, "language_loss": 0.60538113, "learning_rate": 7.450389327665018e-08, "loss": 0.62728, "num_input_tokens_seen": 164625540, "step": 7613, "time_per_iteration": 2.884411096572876 }, { "auxiliary_loss_clip": 0.0117129, "auxiliary_loss_mlp": 0.01028381, "balance_loss_clip": 0.9393003, "balance_loss_mlp": 1.02053428, "epoch": 0.9155293693260386, "flos": 20193037367040.0, "grad_norm": 2.309975847648276, "language_loss": 0.67764986, "learning_rate": 7.429340535957029e-08, "loss": 0.69964659, "num_input_tokens_seen": 164640735, "step": 7614, "time_per_iteration": 4.1302196979522705 }, { "auxiliary_loss_clip": 0.01164863, "auxiliary_loss_mlp": 0.01024714, "balance_loss_clip": 0.96978068, "balance_loss_mlp": 1.01783848, "epoch": 0.9156496122166777, "flos": 19354990176000.0, "grad_norm": 2.143693303101544, "language_loss": 0.7082119, "learning_rate": 7.40832095706494e-08, "loss": 0.73010767, "num_input_tokens_seen": 164657430, "step": 7615, "time_per_iteration": 2.6449122428894043 }, { "auxiliary_loss_clip": 0.01166725, "auxiliary_loss_mlp": 0.01026763, "balance_loss_clip": 0.93168956, "balance_loss_mlp": 1.01980674, "epoch": 0.9157698551073168, "flos": 21107287261440.0, "grad_norm": 1.823728614880124, "language_loss": 0.80475795, "learning_rate": 7.387330594177443e-08, "loss": 0.82669282, "num_input_tokens_seen": 164679505, "step": 7616, "time_per_iteration": 3.715432643890381 }, { "auxiliary_loss_clip": 0.0115685, "auxiliary_loss_mlp": 0.01022052, "balance_loss_clip": 0.93272221, "balance_loss_mlp": 1.01526916, "epoch": 0.9158900979979558, "flos": 25193167228800.0, "grad_norm": 1.7252667439758278, "language_loss": 0.79171038, "learning_rate": 7.366369450478749e-08, "loss": 0.81349939, "num_input_tokens_seen": 164700615, "step": 7617, "time_per_iteration": 3.864339828491211 }, { "auxiliary_loss_clip": 0.01155901, "auxiliary_loss_mlp": 0.01031141, "balance_loss_clip": 0.93052244, "balance_loss_mlp": 1.02401483, "epoch": 0.916010340888595, "flos": 30146648302080.0, "grad_norm": 1.8995162466045663, "language_loss": 0.66405272, "learning_rate": 7.345437529148646e-08, "loss": 0.6859231, "num_input_tokens_seen": 164719625, "step": 7618, "time_per_iteration": 2.785602569580078 }, { "auxiliary_loss_clip": 0.01165866, "auxiliary_loss_mlp": 0.01024364, "balance_loss_clip": 0.9328239, "balance_loss_mlp": 1.01724398, "epoch": 0.9161305837792341, "flos": 17091827907840.0, "grad_norm": 1.9982916068711838, "language_loss": 0.72843945, "learning_rate": 7.324534833362483e-08, "loss": 0.75034177, "num_input_tokens_seen": 164737200, "step": 7619, "time_per_iteration": 2.7809700965881348 }, { "auxiliary_loss_clip": 0.01163087, "auxiliary_loss_mlp": 0.01019944, "balance_loss_clip": 0.97131127, "balance_loss_mlp": 1.01344681, "epoch": 0.9162508266698731, "flos": 22893699288960.0, "grad_norm": 1.7470766953570367, "language_loss": 0.68358409, "learning_rate": 7.303661366291192e-08, "loss": 0.70541441, "num_input_tokens_seen": 164757870, "step": 7620, "time_per_iteration": 2.6936724185943604 }, { "auxiliary_loss_clip": 0.0115564, "auxiliary_loss_mlp": 0.01026691, "balance_loss_clip": 0.89153624, "balance_loss_mlp": 1.02016699, "epoch": 0.9163710695605123, "flos": 19974808287360.0, "grad_norm": 1.7790654859122068, "language_loss": 0.8164677, "learning_rate": 7.28281713110126e-08, "loss": 0.83829105, "num_input_tokens_seen": 164775945, "step": 7621, "time_per_iteration": 2.732235908508301 }, { "auxiliary_loss_clip": 0.011621, "auxiliary_loss_mlp": 0.01030003, "balance_loss_clip": 0.97243822, "balance_loss_mlp": 1.02327108, "epoch": 0.9164913124511513, "flos": 22783812606720.0, "grad_norm": 1.9463858961484168, "language_loss": 0.77469444, "learning_rate": 7.262002130954759e-08, "loss": 0.79661548, "num_input_tokens_seen": 164794400, "step": 7622, "time_per_iteration": 2.667100667953491 }, { "auxiliary_loss_clip": 0.01163267, "auxiliary_loss_mlp": 0.01029934, "balance_loss_clip": 0.89397776, "balance_loss_mlp": 1.02239418, "epoch": 0.9166115553417904, "flos": 24900854348160.0, "grad_norm": 1.5870055691493092, "language_loss": 0.78658545, "learning_rate": 7.241216369009296e-08, "loss": 0.80851746, "num_input_tokens_seen": 164814585, "step": 7623, "time_per_iteration": 2.77406644821167 }, { "auxiliary_loss_clip": 0.01165696, "auxiliary_loss_mlp": 0.01028023, "balance_loss_clip": 1.04555964, "balance_loss_mlp": 1.02096319, "epoch": 0.9167317982324296, "flos": 25702919089920.0, "grad_norm": 1.7145533870476641, "language_loss": 0.66287661, "learning_rate": 7.220459848418037e-08, "loss": 0.68481374, "num_input_tokens_seen": 164834660, "step": 7624, "time_per_iteration": 2.634188175201416 }, { "auxiliary_loss_clip": 0.01167073, "auxiliary_loss_mlp": 0.01028208, "balance_loss_clip": 1.0491761, "balance_loss_mlp": 1.02161241, "epoch": 0.9168520411230686, "flos": 15632813370240.0, "grad_norm": 1.7909072198667673, "language_loss": 0.79784954, "learning_rate": 7.199732572329708e-08, "loss": 0.81980234, "num_input_tokens_seen": 164852560, "step": 7625, "time_per_iteration": 2.6155595779418945 }, { "auxiliary_loss_clip": 0.0116372, "auxiliary_loss_mlp": 0.01029246, "balance_loss_clip": 0.93144459, "balance_loss_mlp": 1.02230823, "epoch": 0.9169722840137077, "flos": 30258151096320.0, "grad_norm": 2.0348363444356514, "language_loss": 0.76051402, "learning_rate": 7.179034543888684e-08, "loss": 0.78244364, "num_input_tokens_seen": 164872065, "step": 7626, "time_per_iteration": 2.77152681350708 }, { "auxiliary_loss_clip": 0.01165935, "auxiliary_loss_mlp": 0.01030022, "balance_loss_clip": 1.00758052, "balance_loss_mlp": 1.02300668, "epoch": 0.9170925269043467, "flos": 22491643380480.0, "grad_norm": 1.8435342686105798, "language_loss": 0.77425456, "learning_rate": 7.158365766234808e-08, "loss": 0.7962141, "num_input_tokens_seen": 164890915, "step": 7627, "time_per_iteration": 2.6068499088287354 }, { "auxiliary_loss_clip": 0.01151609, "auxiliary_loss_mlp": 0.01024801, "balance_loss_clip": 0.92764521, "balance_loss_mlp": 1.01726151, "epoch": 0.9172127697949859, "flos": 22893914770560.0, "grad_norm": 3.832943717198085, "language_loss": 0.72070754, "learning_rate": 7.137726242503527e-08, "loss": 0.74247164, "num_input_tokens_seen": 164909835, "step": 7628, "time_per_iteration": 2.7463440895080566 }, { "auxiliary_loss_clip": 0.01164904, "auxiliary_loss_mlp": 0.01122271, "balance_loss_clip": 1.00981057, "balance_loss_mlp": 0.0, "epoch": 0.917333012685625, "flos": 17451867882240.0, "grad_norm": 2.7948337964121968, "language_loss": 0.780779, "learning_rate": 7.11711597582585e-08, "loss": 0.80365074, "num_input_tokens_seen": 164927195, "step": 7629, "time_per_iteration": 2.6418511867523193 }, { "auxiliary_loss_clip": 0.01159818, "auxiliary_loss_mlp": 0.01024079, "balance_loss_clip": 0.92827499, "balance_loss_mlp": 1.01765954, "epoch": 0.917453255576264, "flos": 14318949692160.0, "grad_norm": 1.6419126874510808, "language_loss": 0.79906023, "learning_rate": 7.096534969328271e-08, "loss": 0.82089919, "num_input_tokens_seen": 164944640, "step": 7630, "time_per_iteration": 2.6735098361968994 }, { "auxiliary_loss_clip": 0.01166262, "auxiliary_loss_mlp": 0.01022858, "balance_loss_clip": 0.96865058, "balance_loss_mlp": 1.01660216, "epoch": 0.9175734984669032, "flos": 20741177888640.0, "grad_norm": 2.034613137608446, "language_loss": 0.84162426, "learning_rate": 7.075983226132987e-08, "loss": 0.8635155, "num_input_tokens_seen": 164963570, "step": 7631, "time_per_iteration": 2.699054002761841 }, { "auxiliary_loss_clip": 0.01170831, "auxiliary_loss_mlp": 0.01122681, "balance_loss_clip": 0.97158611, "balance_loss_mlp": 0.0, "epoch": 0.9176937413575422, "flos": 14830497233280.0, "grad_norm": 2.3850166403346225, "language_loss": 0.79534394, "learning_rate": 7.055460749357656e-08, "loss": 0.81827903, "num_input_tokens_seen": 164979850, "step": 7632, "time_per_iteration": 2.6173980236053467 }, { "auxiliary_loss_clip": 0.01162052, "auxiliary_loss_mlp": 0.01024157, "balance_loss_clip": 0.97213811, "balance_loss_mlp": 1.01710594, "epoch": 0.9178139842481813, "flos": 18474603828480.0, "grad_norm": 2.063095095958952, "language_loss": 0.7042731, "learning_rate": 7.034967542115521e-08, "loss": 0.72613513, "num_input_tokens_seen": 164998115, "step": 7633, "time_per_iteration": 2.6709537506103516 }, { "auxiliary_loss_clip": 0.01153218, "auxiliary_loss_mlp": 0.01122179, "balance_loss_clip": 1.00606036, "balance_loss_mlp": 0.0, "epoch": 0.9179342271388204, "flos": 20047455544320.0, "grad_norm": 2.1596007587626103, "language_loss": 0.75336945, "learning_rate": 7.014503607515388e-08, "loss": 0.77612346, "num_input_tokens_seen": 165017420, "step": 7634, "time_per_iteration": 2.615194797515869 }, { "auxiliary_loss_clip": 0.0116737, "auxiliary_loss_mlp": 0.01025802, "balance_loss_clip": 0.97436404, "balance_loss_mlp": 1.01887548, "epoch": 0.9180544700294595, "flos": 24676232647680.0, "grad_norm": 1.9899599962794687, "language_loss": 0.68419832, "learning_rate": 6.994068948661592e-08, "loss": 0.70613003, "num_input_tokens_seen": 165035575, "step": 7635, "time_per_iteration": 3.5644478797912598 }, { "auxiliary_loss_clip": 0.01167139, "auxiliary_loss_mlp": 0.01027661, "balance_loss_clip": 1.00989747, "balance_loss_mlp": 1.02025187, "epoch": 0.9181747129200986, "flos": 16727478301440.0, "grad_norm": 2.304406682851743, "language_loss": 0.76751447, "learning_rate": 6.973663568654142e-08, "loss": 0.78946245, "num_input_tokens_seen": 165053280, "step": 7636, "time_per_iteration": 2.571134090423584 }, { "auxiliary_loss_clip": 0.01166564, "auxiliary_loss_mlp": 0.01031992, "balance_loss_clip": 1.04742575, "balance_loss_mlp": 1.02526021, "epoch": 0.9182949558107377, "flos": 24271626873600.0, "grad_norm": 2.2259352777767263, "language_loss": 0.65448833, "learning_rate": 6.953287470588386e-08, "loss": 0.67647386, "num_input_tokens_seen": 165071235, "step": 7637, "time_per_iteration": 2.6826963424682617 }, { "auxiliary_loss_clip": 0.01167223, "auxiliary_loss_mlp": 0.01028913, "balance_loss_clip": 1.00744808, "balance_loss_mlp": 1.02179074, "epoch": 0.9184151987013768, "flos": 22082117443200.0, "grad_norm": 2.2461814900722556, "language_loss": 0.86110556, "learning_rate": 6.932940657555452e-08, "loss": 0.88306689, "num_input_tokens_seen": 165087365, "step": 7638, "time_per_iteration": 2.5911777019500732 }, { "auxiliary_loss_clip": 0.01163423, "auxiliary_loss_mlp": 0.01021885, "balance_loss_clip": 1.04829407, "balance_loss_mlp": 1.01569521, "epoch": 0.9185354415920158, "flos": 32166732257280.0, "grad_norm": 1.4357683340619976, "language_loss": 0.76464558, "learning_rate": 6.912623132641938e-08, "loss": 0.78649867, "num_input_tokens_seen": 165112455, "step": 7639, "time_per_iteration": 2.726858139038086 }, { "auxiliary_loss_clip": 0.01167117, "auxiliary_loss_mlp": 0.01027561, "balance_loss_clip": 0.97204924, "balance_loss_mlp": 1.01996732, "epoch": 0.918655684482655, "flos": 20997831542400.0, "grad_norm": 1.7499179460518335, "language_loss": 0.76793563, "learning_rate": 6.892334898929952e-08, "loss": 0.78988242, "num_input_tokens_seen": 165132700, "step": 7640, "time_per_iteration": 3.6821184158325195 }, { "auxiliary_loss_clip": 0.01157707, "auxiliary_loss_mlp": 0.01024896, "balance_loss_clip": 1.00808311, "balance_loss_mlp": 1.01800537, "epoch": 0.918775927373294, "flos": 15560704817280.0, "grad_norm": 1.789146965405217, "language_loss": 0.8450824, "learning_rate": 6.872075959497236e-08, "loss": 0.86690837, "num_input_tokens_seen": 165151475, "step": 7641, "time_per_iteration": 2.65177845954895 }, { "auxiliary_loss_clip": 0.0116945, "auxiliary_loss_mlp": 0.01025035, "balance_loss_clip": 1.00882196, "balance_loss_mlp": 1.01787663, "epoch": 0.9188961702639331, "flos": 29934057657600.0, "grad_norm": 1.9557475051097022, "language_loss": 0.82644892, "learning_rate": 6.85184631741702e-08, "loss": 0.8483938, "num_input_tokens_seen": 165172040, "step": 7642, "time_per_iteration": 4.486421346664429 }, { "auxiliary_loss_clip": 0.01162617, "auxiliary_loss_mlp": 0.01024841, "balance_loss_clip": 1.00813496, "balance_loss_mlp": 1.01737225, "epoch": 0.9190164131545723, "flos": 20701244943360.0, "grad_norm": 1.8182098950769345, "language_loss": 0.77764481, "learning_rate": 6.831645975758161e-08, "loss": 0.79951936, "num_input_tokens_seen": 165189980, "step": 7643, "time_per_iteration": 2.6022911071777344 }, { "auxiliary_loss_clip": 0.01155733, "auxiliary_loss_mlp": 0.01030862, "balance_loss_clip": 0.97035277, "balance_loss_mlp": 1.02345312, "epoch": 0.9191366560452113, "flos": 25629912696960.0, "grad_norm": 1.8969277798138913, "language_loss": 0.67246091, "learning_rate": 6.811474937585026e-08, "loss": 0.69432682, "num_input_tokens_seen": 165209770, "step": 7644, "time_per_iteration": 2.647847890853882 }, { "auxiliary_loss_clip": 0.01157646, "auxiliary_loss_mlp": 0.01028196, "balance_loss_clip": 0.93224835, "balance_loss_mlp": 1.02185154, "epoch": 0.9192568989358504, "flos": 21434325615360.0, "grad_norm": 1.5900100898254461, "language_loss": 0.7925657, "learning_rate": 6.79133320595755e-08, "loss": 0.8144241, "num_input_tokens_seen": 165229690, "step": 7645, "time_per_iteration": 2.704955816268921 }, { "auxiliary_loss_clip": 0.01169805, "auxiliary_loss_mlp": 0.01022787, "balance_loss_clip": 0.97499901, "balance_loss_mlp": 1.01615918, "epoch": 0.9193771418264896, "flos": 23185078416000.0, "grad_norm": 1.8841945301202878, "language_loss": 0.75265419, "learning_rate": 6.771220783931198e-08, "loss": 0.77458012, "num_input_tokens_seen": 165249850, "step": 7646, "time_per_iteration": 2.690849781036377 }, { "auxiliary_loss_clip": 0.01087711, "auxiliary_loss_mlp": 0.01116621, "balance_loss_clip": 0.76170421, "balance_loss_mlp": 0.0, "epoch": 0.9194973847171286, "flos": 70582963184640.0, "grad_norm": 1.3163899575221363, "language_loss": 0.64626813, "learning_rate": 6.751137674556994e-08, "loss": 0.66831142, "num_input_tokens_seen": 165310235, "step": 7647, "time_per_iteration": 3.8248817920684814 }, { "auxiliary_loss_clip": 0.01168574, "auxiliary_loss_mlp": 0.01023396, "balance_loss_clip": 1.00829315, "balance_loss_mlp": 1.01662517, "epoch": 0.9196176276077677, "flos": 14720682378240.0, "grad_norm": 1.9448554162534206, "language_loss": 0.77536988, "learning_rate": 6.731083880881572e-08, "loss": 0.79728955, "num_input_tokens_seen": 165326455, "step": 7648, "time_per_iteration": 3.7550644874572754 }, { "auxiliary_loss_clip": 0.01161922, "auxiliary_loss_mlp": 0.01024281, "balance_loss_clip": 0.97037685, "balance_loss_mlp": 1.01722407, "epoch": 0.9197378704984068, "flos": 23294893271040.0, "grad_norm": 2.0026847011555717, "language_loss": 0.81026316, "learning_rate": 6.711059405947072e-08, "loss": 0.83212519, "num_input_tokens_seen": 165344645, "step": 7649, "time_per_iteration": 2.6786344051361084 }, { "auxiliary_loss_clip": 0.01159938, "auxiliary_loss_mlp": 0.01024641, "balance_loss_clip": 0.93268281, "balance_loss_mlp": 1.01791453, "epoch": 0.9198581133890459, "flos": 20302564913280.0, "grad_norm": 1.960032839571581, "language_loss": 0.76855612, "learning_rate": 6.691064252791156e-08, "loss": 0.79040194, "num_input_tokens_seen": 165364120, "step": 7650, "time_per_iteration": 2.722126007080078 }, { "auxiliary_loss_clip": 0.01155891, "auxiliary_loss_mlp": 0.01026575, "balance_loss_clip": 0.89390302, "balance_loss_mlp": 1.01882601, "epoch": 0.9199783562796849, "flos": 17675663569920.0, "grad_norm": 2.0232520663996674, "language_loss": 0.78059196, "learning_rate": 6.67109842444713e-08, "loss": 0.80241656, "num_input_tokens_seen": 165383050, "step": 7651, "time_per_iteration": 2.732550621032715 }, { "auxiliary_loss_clip": 0.01167862, "auxiliary_loss_mlp": 0.0112307, "balance_loss_clip": 1.01240611, "balance_loss_mlp": 0.0, "epoch": 0.9200985991703241, "flos": 17676022705920.0, "grad_norm": 1.7660428862752542, "language_loss": 0.7625584, "learning_rate": 6.651161923943704e-08, "loss": 0.78546774, "num_input_tokens_seen": 165400955, "step": 7652, "time_per_iteration": 2.6898319721221924 }, { "auxiliary_loss_clip": 0.01160587, "auxiliary_loss_mlp": 0.01022387, "balance_loss_clip": 1.00811684, "balance_loss_mlp": 1.01524031, "epoch": 0.9202188420609632, "flos": 20996574566400.0, "grad_norm": 1.7079350057601457, "language_loss": 0.766271, "learning_rate": 6.631254754305326e-08, "loss": 0.78810072, "num_input_tokens_seen": 165420415, "step": 7653, "time_per_iteration": 2.582920789718628 }, { "auxiliary_loss_clip": 0.01168181, "auxiliary_loss_mlp": 0.01022431, "balance_loss_clip": 1.04664457, "balance_loss_mlp": 1.01540399, "epoch": 0.9203390849516022, "flos": 13918222586880.0, "grad_norm": 2.0748478983814467, "language_loss": 0.77964795, "learning_rate": 6.611376918551848e-08, "loss": 0.80155408, "num_input_tokens_seen": 165439200, "step": 7654, "time_per_iteration": 2.625716209411621 }, { "auxiliary_loss_clip": 0.01158589, "auxiliary_loss_mlp": 0.01122713, "balance_loss_clip": 0.92997491, "balance_loss_mlp": 0.0, "epoch": 0.9204593278422414, "flos": 21175912195200.0, "grad_norm": 1.9955884085953834, "language_loss": 0.79583514, "learning_rate": 6.591528419698744e-08, "loss": 0.81864816, "num_input_tokens_seen": 165458985, "step": 7655, "time_per_iteration": 2.627789258956909 }, { "auxiliary_loss_clip": 0.01164278, "auxiliary_loss_mlp": 0.01019988, "balance_loss_clip": 0.96786773, "balance_loss_mlp": 1.0132854, "epoch": 0.9205795707328804, "flos": 14501375890560.0, "grad_norm": 2.246904191169017, "language_loss": 0.83152533, "learning_rate": 6.571709260756986e-08, "loss": 0.85336804, "num_input_tokens_seen": 165475630, "step": 7656, "time_per_iteration": 2.647242546081543 }, { "auxiliary_loss_clip": 0.01169799, "auxiliary_loss_mlp": 0.01026954, "balance_loss_clip": 1.01216817, "balance_loss_mlp": 1.01982796, "epoch": 0.9206998136235195, "flos": 22417559579520.0, "grad_norm": 2.668107289026472, "language_loss": 0.76161695, "learning_rate": 6.551919444733122e-08, "loss": 0.78358448, "num_input_tokens_seen": 165493445, "step": 7657, "time_per_iteration": 2.600123405456543 }, { "auxiliary_loss_clip": 0.01164754, "auxiliary_loss_mlp": 0.0102729, "balance_loss_clip": 0.97272384, "balance_loss_mlp": 1.01986337, "epoch": 0.9208200565141585, "flos": 53358407544960.0, "grad_norm": 1.826671110580632, "language_loss": 0.65816164, "learning_rate": 6.53215897462931e-08, "loss": 0.68008208, "num_input_tokens_seen": 165517200, "step": 7658, "time_per_iteration": 2.9035143852233887 }, { "auxiliary_loss_clip": 0.01161074, "auxiliary_loss_mlp": 0.01027409, "balance_loss_clip": 1.00734186, "balance_loss_mlp": 1.02043557, "epoch": 0.9209402994047977, "flos": 30589139946240.0, "grad_norm": 1.863715694642821, "language_loss": 0.75049365, "learning_rate": 6.512427853443103e-08, "loss": 0.7723785, "num_input_tokens_seen": 165539280, "step": 7659, "time_per_iteration": 2.7038023471832275 }, { "auxiliary_loss_clip": 0.0116632, "auxiliary_loss_mlp": 0.01029529, "balance_loss_clip": 1.00920033, "balance_loss_mlp": 1.0226984, "epoch": 0.9210605422954368, "flos": 29132711187840.0, "grad_norm": 1.5247611127679632, "language_loss": 0.75511295, "learning_rate": 6.492726084167799e-08, "loss": 0.77707142, "num_input_tokens_seen": 165561395, "step": 7660, "time_per_iteration": 2.649099111557007 }, { "auxiliary_loss_clip": 0.01058987, "auxiliary_loss_mlp": 0.01001126, "balance_loss_clip": 1.00806665, "balance_loss_mlp": 0.99951667, "epoch": 0.9211807851860758, "flos": 54853838472960.0, "grad_norm": 1.1653711123536454, "language_loss": 0.57589489, "learning_rate": 6.473053669792072e-08, "loss": 0.59649599, "num_input_tokens_seen": 165616085, "step": 7661, "time_per_iteration": 3.9082627296447754 }, { "auxiliary_loss_clip": 0.01163191, "auxiliary_loss_mlp": 0.0102473, "balance_loss_clip": 1.00725925, "balance_loss_mlp": 1.01779211, "epoch": 0.921301028076715, "flos": 19201974238080.0, "grad_norm": 2.0096475158593963, "language_loss": 0.72886634, "learning_rate": 6.453410613300248e-08, "loss": 0.75074553, "num_input_tokens_seen": 165634015, "step": 7662, "time_per_iteration": 2.625314950942993 }, { "auxiliary_loss_clip": 0.01161177, "auxiliary_loss_mlp": 0.01030365, "balance_loss_clip": 0.85530102, "balance_loss_mlp": 1.02287269, "epoch": 0.921421270967354, "flos": 27526893765120.0, "grad_norm": 2.821136361594868, "language_loss": 0.58501297, "learning_rate": 6.43379691767214e-08, "loss": 0.60692841, "num_input_tokens_seen": 165653220, "step": 7663, "time_per_iteration": 2.783538579940796 }, { "auxiliary_loss_clip": 0.01069785, "auxiliary_loss_mlp": 0.0100384, "balance_loss_clip": 0.86045825, "balance_loss_mlp": 1.00214756, "epoch": 0.9215415138579931, "flos": 70209311955840.0, "grad_norm": 0.7399106166013997, "language_loss": 0.55166173, "learning_rate": 6.414212585883105e-08, "loss": 0.57239795, "num_input_tokens_seen": 165715850, "step": 7664, "time_per_iteration": 3.384188652038574 }, { "auxiliary_loss_clip": 0.01165047, "auxiliary_loss_mlp": 0.01025456, "balance_loss_clip": 0.97041011, "balance_loss_mlp": 1.01883388, "epoch": 0.9216617567486323, "flos": 35553107790720.0, "grad_norm": 1.9975979245883533, "language_loss": 0.69789708, "learning_rate": 6.394657620904143e-08, "loss": 0.71980214, "num_input_tokens_seen": 165738960, "step": 7665, "time_per_iteration": 3.2944581508636475 }, { "auxiliary_loss_clip": 0.01171372, "auxiliary_loss_mlp": 0.01027918, "balance_loss_clip": 1.04882264, "balance_loss_mlp": 1.01989532, "epoch": 0.9217819996392713, "flos": 29533330552320.0, "grad_norm": 1.8938260306500339, "language_loss": 0.71356416, "learning_rate": 6.375132025701657e-08, "loss": 0.73555702, "num_input_tokens_seen": 165761260, "step": 7666, "time_per_iteration": 3.6956827640533447 }, { "auxiliary_loss_clip": 0.01170305, "auxiliary_loss_mlp": 0.01032661, "balance_loss_clip": 1.04990101, "balance_loss_mlp": 1.02480483, "epoch": 0.9219022425299104, "flos": 14574669592320.0, "grad_norm": 2.6628230531566075, "language_loss": 0.68730295, "learning_rate": 6.355635803237724e-08, "loss": 0.70933259, "num_input_tokens_seen": 165776960, "step": 7667, "time_per_iteration": 2.5650038719177246 }, { "auxiliary_loss_clip": 0.01164597, "auxiliary_loss_mlp": 0.01024473, "balance_loss_clip": 1.01041436, "balance_loss_mlp": 1.01726353, "epoch": 0.9220224854205495, "flos": 18077503996800.0, "grad_norm": 2.3848419456608676, "language_loss": 0.79549122, "learning_rate": 6.336168956469867e-08, "loss": 0.81738198, "num_input_tokens_seen": 165795435, "step": 7668, "time_per_iteration": 3.588890314102173 }, { "auxiliary_loss_clip": 0.01152031, "auxiliary_loss_mlp": 0.01025177, "balance_loss_clip": 0.96854329, "balance_loss_mlp": 1.01854873, "epoch": 0.9221427283111886, "flos": 24790464875520.0, "grad_norm": 1.6910617628996287, "language_loss": 0.71553493, "learning_rate": 6.316731488351168e-08, "loss": 0.73730707, "num_input_tokens_seen": 165816625, "step": 7669, "time_per_iteration": 2.6488430500030518 }, { "auxiliary_loss_clip": 0.01165611, "auxiliary_loss_mlp": 0.01028685, "balance_loss_clip": 1.00939035, "balance_loss_mlp": 1.02173519, "epoch": 0.9222629712018277, "flos": 13845036625920.0, "grad_norm": 1.8561791957614961, "language_loss": 0.63453376, "learning_rate": 6.297323401830334e-08, "loss": 0.65647674, "num_input_tokens_seen": 165835410, "step": 7670, "time_per_iteration": 2.5671615600585938 }, { "auxiliary_loss_clip": 0.01168415, "auxiliary_loss_mlp": 0.01027975, "balance_loss_clip": 1.00931895, "balance_loss_mlp": 1.02117157, "epoch": 0.9223832140924668, "flos": 21616177196160.0, "grad_norm": 2.001164568300307, "language_loss": 0.69299912, "learning_rate": 6.277944699851523e-08, "loss": 0.71496308, "num_input_tokens_seen": 165854930, "step": 7671, "time_per_iteration": 2.5965890884399414 }, { "auxiliary_loss_clip": 0.01166344, "auxiliary_loss_mlp": 0.01025203, "balance_loss_clip": 1.04728508, "balance_loss_mlp": 1.01788068, "epoch": 0.9225034569831059, "flos": 21142084561920.0, "grad_norm": 1.820490698709031, "language_loss": 0.7349714, "learning_rate": 6.25859538535447e-08, "loss": 0.7568869, "num_input_tokens_seen": 165875725, "step": 7672, "time_per_iteration": 2.565706253051758 }, { "auxiliary_loss_clip": 0.01164664, "auxiliary_loss_mlp": 0.01020816, "balance_loss_clip": 0.97185409, "balance_loss_mlp": 1.01399159, "epoch": 0.9226236998737449, "flos": 12495046844160.0, "grad_norm": 2.529657002636531, "language_loss": 0.78655815, "learning_rate": 6.239275461274474e-08, "loss": 0.80841297, "num_input_tokens_seen": 165892100, "step": 7673, "time_per_iteration": 2.6340081691741943 }, { "auxiliary_loss_clip": 0.01167379, "auxiliary_loss_mlp": 0.01025502, "balance_loss_clip": 1.01042163, "balance_loss_mlp": 1.01892471, "epoch": 0.9227439427643841, "flos": 26214071581440.0, "grad_norm": 2.5752020958593262, "language_loss": 0.85844707, "learning_rate": 6.219984930542299e-08, "loss": 0.88037586, "num_input_tokens_seen": 165912840, "step": 7674, "time_per_iteration": 2.63224458694458 }, { "auxiliary_loss_clip": 0.01166719, "auxiliary_loss_mlp": 0.01023035, "balance_loss_clip": 1.00822592, "balance_loss_mlp": 1.01584053, "epoch": 0.9228641856550232, "flos": 17967581400960.0, "grad_norm": 2.500179014869186, "language_loss": 0.75653815, "learning_rate": 6.200723796084383e-08, "loss": 0.77843571, "num_input_tokens_seen": 165930935, "step": 7675, "time_per_iteration": 2.595547676086426 }, { "auxiliary_loss_clip": 0.01070226, "auxiliary_loss_mlp": 0.01003113, "balance_loss_clip": 0.89659792, "balance_loss_mlp": 1.00134873, "epoch": 0.9229844285456622, "flos": 70420609710720.0, "grad_norm": 0.7562748958238621, "language_loss": 0.63060796, "learning_rate": 6.181492060822546e-08, "loss": 0.65134132, "num_input_tokens_seen": 165991110, "step": 7676, "time_per_iteration": 3.2474186420440674 }, { "auxiliary_loss_clip": 0.01155913, "auxiliary_loss_mlp": 0.01024676, "balance_loss_clip": 0.89178276, "balance_loss_mlp": 1.01771462, "epoch": 0.9231046714363014, "flos": 17967832796160.0, "grad_norm": 2.06996813400915, "language_loss": 0.81479996, "learning_rate": 6.162289727674274e-08, "loss": 0.83660585, "num_input_tokens_seen": 166008790, "step": 7677, "time_per_iteration": 2.7241928577423096 }, { "auxiliary_loss_clip": 0.0116102, "auxiliary_loss_mlp": 0.01023628, "balance_loss_clip": 0.9310801, "balance_loss_mlp": 1.01692259, "epoch": 0.9232249143269404, "flos": 17858233422720.0, "grad_norm": 2.0821353674443883, "language_loss": 0.87726545, "learning_rate": 6.143116799552527e-08, "loss": 0.89911193, "num_input_tokens_seen": 166025035, "step": 7678, "time_per_iteration": 2.6898579597473145 }, { "auxiliary_loss_clip": 0.01169657, "auxiliary_loss_mlp": 0.01022805, "balance_loss_clip": 1.01067519, "balance_loss_mlp": 1.01554513, "epoch": 0.9233451572175795, "flos": 23404384903680.0, "grad_norm": 2.3275012576717846, "language_loss": 0.55630028, "learning_rate": 6.123973279365802e-08, "loss": 0.5782249, "num_input_tokens_seen": 166044010, "step": 7679, "time_per_iteration": 2.694977045059204 }, { "auxiliary_loss_clip": 0.01168763, "auxiliary_loss_mlp": 0.01025396, "balance_loss_clip": 1.01117229, "balance_loss_mlp": 1.01859188, "epoch": 0.9234654001082186, "flos": 17999326045440.0, "grad_norm": 2.0830575414530843, "language_loss": 0.77365339, "learning_rate": 6.10485917001824e-08, "loss": 0.79559499, "num_input_tokens_seen": 166061865, "step": 7680, "time_per_iteration": 2.6515467166900635 }, { "auxiliary_loss_clip": 0.01168083, "auxiliary_loss_mlp": 0.01022319, "balance_loss_clip": 0.97074342, "balance_loss_mlp": 1.01573586, "epoch": 0.9235856429988577, "flos": 24750747411840.0, "grad_norm": 1.5753014009729063, "language_loss": 0.81108093, "learning_rate": 6.085774474409322e-08, "loss": 0.83298492, "num_input_tokens_seen": 166082425, "step": 7681, "time_per_iteration": 2.709538698196411 }, { "auxiliary_loss_clip": 0.01164906, "auxiliary_loss_mlp": 0.01024373, "balance_loss_clip": 0.9740051, "balance_loss_mlp": 1.01753008, "epoch": 0.9237058858894968, "flos": 14099894599680.0, "grad_norm": 2.1120231463242884, "language_loss": 0.69665778, "learning_rate": 6.066719195434267e-08, "loss": 0.71855056, "num_input_tokens_seen": 166100225, "step": 7682, "time_per_iteration": 2.64255428314209 }, { "auxiliary_loss_clip": 0.01168449, "auxiliary_loss_mlp": 0.01027926, "balance_loss_clip": 1.01054084, "balance_loss_mlp": 1.02026105, "epoch": 0.9238261287801359, "flos": 28694529175680.0, "grad_norm": 2.3486918472418266, "language_loss": 0.66866237, "learning_rate": 6.047693335983717e-08, "loss": 0.69062614, "num_input_tokens_seen": 166122570, "step": 7683, "time_per_iteration": 2.651668071746826 }, { "auxiliary_loss_clip": 0.01167439, "auxiliary_loss_mlp": 0.01026363, "balance_loss_clip": 1.0086143, "balance_loss_mlp": 1.01889408, "epoch": 0.923946371670775, "flos": 23111856541440.0, "grad_norm": 2.41969467533508, "language_loss": 0.82812208, "learning_rate": 6.028696898943853e-08, "loss": 0.85006011, "num_input_tokens_seen": 166141630, "step": 7684, "time_per_iteration": 2.682593822479248 }, { "auxiliary_loss_clip": 0.01164744, "auxiliary_loss_mlp": 0.01122652, "balance_loss_clip": 0.96945173, "balance_loss_mlp": 0.0, "epoch": 0.924066614561414, "flos": 21867120587520.0, "grad_norm": 1.9355281184157715, "language_loss": 0.7080766, "learning_rate": 6.00972988719648e-08, "loss": 0.73095059, "num_input_tokens_seen": 166159865, "step": 7685, "time_per_iteration": 2.6555428504943848 }, { "auxiliary_loss_clip": 0.01165258, "auxiliary_loss_mlp": 0.01122805, "balance_loss_clip": 0.93204343, "balance_loss_mlp": 0.0, "epoch": 0.9241868574520532, "flos": 28511887495680.0, "grad_norm": 2.423931862247547, "language_loss": 0.70735955, "learning_rate": 5.990792303618807e-08, "loss": 0.73024017, "num_input_tokens_seen": 166179445, "step": 7686, "time_per_iteration": 2.7725181579589844 }, { "auxiliary_loss_clip": 0.01167647, "auxiliary_loss_mlp": 0.01021874, "balance_loss_clip": 0.93486297, "balance_loss_mlp": 1.01524019, "epoch": 0.9243071003426923, "flos": 30518324282880.0, "grad_norm": 1.652101958973146, "language_loss": 0.69434106, "learning_rate": 5.971884151083695e-08, "loss": 0.71623629, "num_input_tokens_seen": 166201855, "step": 7687, "time_per_iteration": 3.5820751190185547 }, { "auxiliary_loss_clip": 0.01163823, "auxiliary_loss_mlp": 0.01030501, "balance_loss_clip": 0.96984076, "balance_loss_mlp": 1.02359569, "epoch": 0.9244273432333313, "flos": 28658331244800.0, "grad_norm": 1.7575983354803457, "language_loss": 0.74237704, "learning_rate": 5.9530054324595124e-08, "loss": 0.76432031, "num_input_tokens_seen": 166221970, "step": 7688, "time_per_iteration": 2.7119503021240234 }, { "auxiliary_loss_clip": 0.01056033, "auxiliary_loss_mlp": 0.01115544, "balance_loss_clip": 0.96959394, "balance_loss_mlp": 0.0, "epoch": 0.9245475861239704, "flos": 66230589237120.0, "grad_norm": 0.7252974599942166, "language_loss": 0.57578504, "learning_rate": 5.934156150610103e-08, "loss": 0.5975008, "num_input_tokens_seen": 166279335, "step": 7689, "time_per_iteration": 3.2311389446258545 }, { "auxiliary_loss_clip": 0.01160424, "auxiliary_loss_mlp": 0.01026312, "balance_loss_clip": 0.96949196, "balance_loss_mlp": 1.01911163, "epoch": 0.9246678290146095, "flos": 24239918142720.0, "grad_norm": 3.397318434954599, "language_loss": 0.79159898, "learning_rate": 5.915336308394914e-08, "loss": 0.81346631, "num_input_tokens_seen": 166298170, "step": 7690, "time_per_iteration": 2.633563995361328 }, { "auxiliary_loss_clip": 0.01162073, "auxiliary_loss_mlp": 0.01024072, "balance_loss_clip": 1.00962067, "balance_loss_mlp": 1.01744652, "epoch": 0.9247880719052486, "flos": 18988808976000.0, "grad_norm": 1.5319784078917658, "language_loss": 0.76884407, "learning_rate": 5.89654590866886e-08, "loss": 0.7907055, "num_input_tokens_seen": 166317670, "step": 7691, "time_per_iteration": 2.6393370628356934 }, { "auxiliary_loss_clip": 0.01167489, "auxiliary_loss_mlp": 0.01025205, "balance_loss_clip": 0.85879278, "balance_loss_mlp": 1.01748633, "epoch": 0.9249083147958876, "flos": 24024095274240.0, "grad_norm": 1.7761743254624487, "language_loss": 0.88031888, "learning_rate": 5.877784954282483e-08, "loss": 0.90224582, "num_input_tokens_seen": 166337010, "step": 7692, "time_per_iteration": 2.702803373336792 }, { "auxiliary_loss_clip": 0.01169984, "auxiliary_loss_mlp": 0.01030522, "balance_loss_clip": 1.01023591, "balance_loss_mlp": 1.02273798, "epoch": 0.9250285576865268, "flos": 30773972355840.0, "grad_norm": 2.338066473260236, "language_loss": 0.72464299, "learning_rate": 5.8590534480817963e-08, "loss": 0.74664807, "num_input_tokens_seen": 166358735, "step": 7693, "time_per_iteration": 3.6398181915283203 }, { "auxiliary_loss_clip": 0.011682, "auxiliary_loss_mlp": 0.01023684, "balance_loss_clip": 1.04897261, "balance_loss_mlp": 1.01675224, "epoch": 0.9251488005771659, "flos": 10633581348480.0, "grad_norm": 2.0384272726484323, "language_loss": 0.72577918, "learning_rate": 5.840351392908349e-08, "loss": 0.74769807, "num_input_tokens_seen": 166374455, "step": 7694, "time_per_iteration": 3.555964708328247 }, { "auxiliary_loss_clip": 0.01168612, "auxiliary_loss_mlp": 0.01122147, "balance_loss_clip": 0.97002518, "balance_loss_mlp": 0.0, "epoch": 0.9252690434678049, "flos": 23586416052480.0, "grad_norm": 4.001097892398639, "language_loss": 0.70987743, "learning_rate": 5.821678791599205e-08, "loss": 0.73278499, "num_input_tokens_seen": 166393900, "step": 7695, "time_per_iteration": 3.5781068801879883 }, { "auxiliary_loss_clip": 0.01162885, "auxiliary_loss_mlp": 0.01023855, "balance_loss_clip": 0.97173512, "balance_loss_mlp": 1.01700628, "epoch": 0.9253892863584441, "flos": 21469158829440.0, "grad_norm": 1.6959510104822793, "language_loss": 0.80961931, "learning_rate": 5.803035646986965e-08, "loss": 0.8314867, "num_input_tokens_seen": 166413235, "step": 7696, "time_per_iteration": 2.6521058082580566 }, { "auxiliary_loss_clip": 0.01170915, "auxiliary_loss_mlp": 0.0103253, "balance_loss_clip": 1.04913557, "balance_loss_mlp": 1.0252049, "epoch": 0.9255095292490831, "flos": 17456680304640.0, "grad_norm": 2.123695322616834, "language_loss": 0.67556739, "learning_rate": 5.7844219618998766e-08, "loss": 0.69760191, "num_input_tokens_seen": 166427560, "step": 7697, "time_per_iteration": 2.5380892753601074 }, { "auxiliary_loss_clip": 0.01146082, "auxiliary_loss_mlp": 0.01026542, "balance_loss_clip": 0.92848754, "balance_loss_mlp": 1.01964283, "epoch": 0.9256297721397222, "flos": 24750675584640.0, "grad_norm": 1.7641686247584163, "language_loss": 0.71705198, "learning_rate": 5.765837739161505e-08, "loss": 0.73877823, "num_input_tokens_seen": 166446680, "step": 7698, "time_per_iteration": 2.668782949447632 }, { "auxiliary_loss_clip": 0.01161299, "auxiliary_loss_mlp": 0.01024902, "balance_loss_clip": 0.93108189, "balance_loss_mlp": 1.01849127, "epoch": 0.9257500150303614, "flos": 23112215677440.0, "grad_norm": 1.6768820963766486, "language_loss": 0.74287951, "learning_rate": 5.7472829815911504e-08, "loss": 0.76474148, "num_input_tokens_seen": 166465505, "step": 7699, "time_per_iteration": 2.660231351852417 }, { "auxiliary_loss_clip": 0.0115737, "auxiliary_loss_mlp": 0.01025541, "balance_loss_clip": 0.96982682, "balance_loss_mlp": 1.01821017, "epoch": 0.9258702579210004, "flos": 22564685687040.0, "grad_norm": 1.7560843842224938, "language_loss": 0.81490445, "learning_rate": 5.7287576920035164e-08, "loss": 0.83673352, "num_input_tokens_seen": 166484520, "step": 7700, "time_per_iteration": 2.6098849773406982 }, { "auxiliary_loss_clip": 0.01157935, "auxiliary_loss_mlp": 0.010235, "balance_loss_clip": 0.93304598, "balance_loss_mlp": 1.01689327, "epoch": 0.9259905008116395, "flos": 30004298703360.0, "grad_norm": 2.0815934151197424, "language_loss": 0.76689893, "learning_rate": 5.7102618732088435e-08, "loss": 0.78871334, "num_input_tokens_seen": 166503850, "step": 7701, "time_per_iteration": 2.7566468715667725 }, { "auxiliary_loss_clip": 0.0117074, "auxiliary_loss_mlp": 0.01026551, "balance_loss_clip": 0.97194642, "balance_loss_mlp": 1.01959813, "epoch": 0.9261107437022786, "flos": 24572128055040.0, "grad_norm": 1.6465974999627633, "language_loss": 0.74539065, "learning_rate": 5.6917955280130216e-08, "loss": 0.76736355, "num_input_tokens_seen": 166525330, "step": 7702, "time_per_iteration": 2.6716177463531494 }, { "auxiliary_loss_clip": 0.01162644, "auxiliary_loss_mlp": 0.01024202, "balance_loss_clip": 1.00964725, "balance_loss_mlp": 1.01745129, "epoch": 0.9262309865929177, "flos": 22018448586240.0, "grad_norm": 2.2012316193881505, "language_loss": 0.72142386, "learning_rate": 5.6733586592172755e-08, "loss": 0.74329233, "num_input_tokens_seen": 166544825, "step": 7703, "time_per_iteration": 2.6271321773529053 }, { "auxiliary_loss_clip": 0.01157092, "auxiliary_loss_mlp": 0.01121619, "balance_loss_clip": 0.96875048, "balance_loss_mlp": 0.0, "epoch": 0.9263512294835567, "flos": 20339481116160.0, "grad_norm": 2.2061938443053988, "language_loss": 0.79954493, "learning_rate": 5.6549512696185244e-08, "loss": 0.82233202, "num_input_tokens_seen": 166563325, "step": 7704, "time_per_iteration": 2.6311581134796143 }, { "auxiliary_loss_clip": 0.01165956, "auxiliary_loss_mlp": 0.01025228, "balance_loss_clip": 1.04883516, "balance_loss_mlp": 1.01822138, "epoch": 0.9264714723741959, "flos": 21215378263680.0, "grad_norm": 1.6677171587834276, "language_loss": 0.68061376, "learning_rate": 5.636573362009156e-08, "loss": 0.70252556, "num_input_tokens_seen": 166583385, "step": 7705, "time_per_iteration": 2.590041399002075 }, { "auxiliary_loss_clip": 0.01170698, "auxiliary_loss_mlp": 0.01027354, "balance_loss_clip": 1.04836774, "balance_loss_mlp": 1.02015948, "epoch": 0.926591715264835, "flos": 18004964480640.0, "grad_norm": 4.33372974063976, "language_loss": 0.76567817, "learning_rate": 5.618224939177074e-08, "loss": 0.78765875, "num_input_tokens_seen": 166601290, "step": 7706, "time_per_iteration": 2.6033947467803955 }, { "auxiliary_loss_clip": 0.0115349, "auxiliary_loss_mlp": 0.01025556, "balance_loss_clip": 0.96973264, "balance_loss_mlp": 1.01818871, "epoch": 0.926711958155474, "flos": 36167969825280.0, "grad_norm": 3.18306455270663, "language_loss": 0.70061505, "learning_rate": 5.599906003905719e-08, "loss": 0.72240549, "num_input_tokens_seen": 166623835, "step": 7707, "time_per_iteration": 2.850520133972168 }, { "auxiliary_loss_clip": 0.01164925, "auxiliary_loss_mlp": 0.01023596, "balance_loss_clip": 1.0120014, "balance_loss_mlp": 1.01653838, "epoch": 0.9268322010461132, "flos": 21032736583680.0, "grad_norm": 2.5872827229059494, "language_loss": 0.81714368, "learning_rate": 5.581616558974023e-08, "loss": 0.83902884, "num_input_tokens_seen": 166642400, "step": 7708, "time_per_iteration": 2.619040012359619 }, { "auxiliary_loss_clip": 0.01173186, "auxiliary_loss_mlp": 0.01122885, "balance_loss_clip": 1.01151645, "balance_loss_mlp": 0.0, "epoch": 0.9269524439367522, "flos": 22964838174720.0, "grad_norm": 1.7398057871908506, "language_loss": 0.78802013, "learning_rate": 5.5633566071565444e-08, "loss": 0.8109808, "num_input_tokens_seen": 166661640, "step": 7709, "time_per_iteration": 2.703259229660034 }, { "auxiliary_loss_clip": 0.01164582, "auxiliary_loss_mlp": 0.01023625, "balance_loss_clip": 0.85658473, "balance_loss_mlp": 1.0169909, "epoch": 0.9270726868273913, "flos": 41975551468800.0, "grad_norm": 1.9353221366620135, "language_loss": 0.70699888, "learning_rate": 5.5451261512232896e-08, "loss": 0.72888094, "num_input_tokens_seen": 166684320, "step": 7710, "time_per_iteration": 2.9258010387420654 }, { "auxiliary_loss_clip": 0.01168991, "auxiliary_loss_mlp": 0.01025645, "balance_loss_clip": 1.00745595, "balance_loss_mlp": 1.018507, "epoch": 0.9271929297180305, "flos": 19791771557760.0, "grad_norm": 1.949325611572555, "language_loss": 0.62995452, "learning_rate": 5.5269251939397576e-08, "loss": 0.65190089, "num_input_tokens_seen": 166703835, "step": 7711, "time_per_iteration": 2.6395578384399414 }, { "auxiliary_loss_clip": 0.0116203, "auxiliary_loss_mlp": 0.01024082, "balance_loss_clip": 0.92894596, "balance_loss_mlp": 1.01670313, "epoch": 0.9273131726086695, "flos": 19968343839360.0, "grad_norm": 2.036704685921344, "language_loss": 0.76399326, "learning_rate": 5.508753738067073e-08, "loss": 0.7858544, "num_input_tokens_seen": 166723375, "step": 7712, "time_per_iteration": 2.703270673751831 }, { "auxiliary_loss_clip": 0.01170134, "auxiliary_loss_mlp": 0.01021657, "balance_loss_clip": 1.01043594, "balance_loss_mlp": 1.01471353, "epoch": 0.9274334154993086, "flos": 23258587599360.0, "grad_norm": 1.8707674591241659, "language_loss": 0.79297727, "learning_rate": 5.4906117863617875e-08, "loss": 0.81489515, "num_input_tokens_seen": 166742760, "step": 7713, "time_per_iteration": 3.376819610595703 }, { "auxiliary_loss_clip": 0.01155338, "auxiliary_loss_mlp": 0.01021022, "balance_loss_clip": 0.92936337, "balance_loss_mlp": 1.01438475, "epoch": 0.9275536583899477, "flos": 31795343585280.0, "grad_norm": 1.7567953044333453, "language_loss": 0.78067267, "learning_rate": 5.4724993415760533e-08, "loss": 0.80243635, "num_input_tokens_seen": 166761115, "step": 7714, "time_per_iteration": 2.726440906524658 }, { "auxiliary_loss_clip": 0.01170457, "auxiliary_loss_mlp": 0.01122508, "balance_loss_clip": 0.93085539, "balance_loss_mlp": 0.0, "epoch": 0.9276739012805868, "flos": 18696998885760.0, "grad_norm": 2.194547462199194, "language_loss": 0.75094128, "learning_rate": 5.454416406457496e-08, "loss": 0.77387094, "num_input_tokens_seen": 166780210, "step": 7715, "time_per_iteration": 2.647730588912964 }, { "auxiliary_loss_clip": 0.01163447, "auxiliary_loss_mlp": 0.0102456, "balance_loss_clip": 1.0080719, "balance_loss_mlp": 1.01822066, "epoch": 0.9277941441712259, "flos": 13879079740800.0, "grad_norm": 2.8439823762672405, "language_loss": 0.73582262, "learning_rate": 5.436362983749299e-08, "loss": 0.75770265, "num_input_tokens_seen": 166795380, "step": 7716, "time_per_iteration": 2.611541748046875 }, { "auxiliary_loss_clip": 0.01158893, "auxiliary_loss_mlp": 0.01020772, "balance_loss_clip": 0.93456316, "balance_loss_mlp": 1.01407886, "epoch": 0.927914387061865, "flos": 23258659426560.0, "grad_norm": 1.8208767144728784, "language_loss": 0.64384997, "learning_rate": 5.418339076190137e-08, "loss": 0.66564667, "num_input_tokens_seen": 166814890, "step": 7717, "time_per_iteration": 2.6822924613952637 }, { "auxiliary_loss_clip": 0.01157289, "auxiliary_loss_mlp": 0.01026901, "balance_loss_clip": 0.9708457, "balance_loss_mlp": 1.01949239, "epoch": 0.9280346299525041, "flos": 18073733068800.0, "grad_norm": 1.7771939197581768, "language_loss": 0.88461411, "learning_rate": 5.400344686514202e-08, "loss": 0.90645605, "num_input_tokens_seen": 166832475, "step": 7718, "time_per_iteration": 2.7424683570861816 }, { "auxiliary_loss_clip": 0.01163729, "auxiliary_loss_mlp": 0.01024726, "balance_loss_clip": 1.00997996, "balance_loss_mlp": 1.018116, "epoch": 0.9281548728431431, "flos": 22342901160960.0, "grad_norm": 1.6693228667563504, "language_loss": 0.66971719, "learning_rate": 5.38237981745131e-08, "loss": 0.69160175, "num_input_tokens_seen": 166850590, "step": 7719, "time_per_iteration": 3.6298739910125732 }, { "auxiliary_loss_clip": 0.01167069, "auxiliary_loss_mlp": 0.01121864, "balance_loss_clip": 1.01026392, "balance_loss_mlp": 0.0, "epoch": 0.9282751157337822, "flos": 18843765857280.0, "grad_norm": 1.7133646488466778, "language_loss": 0.8137604, "learning_rate": 5.364444471726592e-08, "loss": 0.83664972, "num_input_tokens_seen": 166869795, "step": 7720, "time_per_iteration": 3.611258029937744 }, { "auxiliary_loss_clip": 0.01164074, "auxiliary_loss_mlp": 0.01027278, "balance_loss_clip": 1.00830019, "balance_loss_mlp": 1.01984203, "epoch": 0.9283953586244214, "flos": 25556834476800.0, "grad_norm": 1.9183803028256943, "language_loss": 0.80113935, "learning_rate": 5.346538652060939e-08, "loss": 0.82305288, "num_input_tokens_seen": 166891150, "step": 7721, "time_per_iteration": 2.700687885284424 }, { "auxiliary_loss_clip": 0.01161104, "auxiliary_loss_mlp": 0.01023818, "balance_loss_clip": 0.97127008, "balance_loss_mlp": 1.01732719, "epoch": 0.9285156015150604, "flos": 18223480869120.0, "grad_norm": 1.9953564747403476, "language_loss": 0.70123881, "learning_rate": 5.3286623611705994e-08, "loss": 0.72308803, "num_input_tokens_seen": 166909195, "step": 7722, "time_per_iteration": 2.6479907035827637 }, { "auxiliary_loss_clip": 0.01058863, "auxiliary_loss_mlp": 0.01001724, "balance_loss_clip": 1.00801873, "balance_loss_mlp": 1.00012636, "epoch": 0.9286358444056995, "flos": 66400017690240.0, "grad_norm": 0.8199544908270884, "language_loss": 0.60590541, "learning_rate": 5.3108156017673824e-08, "loss": 0.62651128, "num_input_tokens_seen": 166970955, "step": 7723, "time_per_iteration": 3.332338571548462 }, { "auxiliary_loss_clip": 0.01171168, "auxiliary_loss_mlp": 0.01024063, "balance_loss_clip": 0.97204876, "balance_loss_mlp": 1.01662493, "epoch": 0.9287560872963386, "flos": 22345630594560.0, "grad_norm": 1.773031791778122, "language_loss": 0.71550667, "learning_rate": 5.2929983765586775e-08, "loss": 0.73745894, "num_input_tokens_seen": 166989735, "step": 7724, "time_per_iteration": 2.624748706817627 }, { "auxiliary_loss_clip": 0.01165895, "auxiliary_loss_mlp": 0.01024908, "balance_loss_clip": 1.04857695, "balance_loss_mlp": 1.01760972, "epoch": 0.9288763301869777, "flos": 25700225569920.0, "grad_norm": 1.668357414422252, "language_loss": 0.62629306, "learning_rate": 5.275210688247278e-08, "loss": 0.64820105, "num_input_tokens_seen": 167010060, "step": 7725, "time_per_iteration": 2.5712647438049316 }, { "auxiliary_loss_clip": 0.01163279, "auxiliary_loss_mlp": 0.01021111, "balance_loss_clip": 0.89523661, "balance_loss_mlp": 1.01427436, "epoch": 0.9289965730776167, "flos": 12312046028160.0, "grad_norm": 1.9189821525493267, "language_loss": 0.85053629, "learning_rate": 5.257452539531604e-08, "loss": 0.87238014, "num_input_tokens_seen": 167027130, "step": 7726, "time_per_iteration": 2.65090274810791 }, { "auxiliary_loss_clip": 0.01164685, "auxiliary_loss_mlp": 0.01023981, "balance_loss_clip": 1.0080657, "balance_loss_mlp": 1.01703691, "epoch": 0.9291168159682559, "flos": 26685973486080.0, "grad_norm": 1.5172792170211369, "language_loss": 0.68287015, "learning_rate": 5.2397239331055445e-08, "loss": 0.70475686, "num_input_tokens_seen": 167049130, "step": 7727, "time_per_iteration": 2.624241352081299 }, { "auxiliary_loss_clip": 0.01159998, "auxiliary_loss_mlp": 0.0103176, "balance_loss_clip": 0.97160202, "balance_loss_mlp": 1.0248611, "epoch": 0.929237058858895, "flos": 14538256179840.0, "grad_norm": 2.5584562058634455, "language_loss": 0.81006622, "learning_rate": 5.2220248716585036e-08, "loss": 0.8319838, "num_input_tokens_seen": 167066810, "step": 7728, "time_per_iteration": 2.6616432666778564 }, { "auxiliary_loss_clip": 0.01157065, "auxiliary_loss_mlp": 0.01028074, "balance_loss_clip": 1.00708818, "balance_loss_mlp": 1.02071309, "epoch": 0.929357301749534, "flos": 23835456023040.0, "grad_norm": 2.232264875925813, "language_loss": 0.75291371, "learning_rate": 5.204355357875445e-08, "loss": 0.77476513, "num_input_tokens_seen": 167085155, "step": 7729, "time_per_iteration": 2.6309733390808105 }, { "auxiliary_loss_clip": 0.01158617, "auxiliary_loss_mlp": 0.01027531, "balance_loss_clip": 0.96874475, "balance_loss_mlp": 1.02054489, "epoch": 0.9294775446401732, "flos": 12969319046400.0, "grad_norm": 2.280949873580767, "language_loss": 0.70319307, "learning_rate": 5.1867153944367584e-08, "loss": 0.7250545, "num_input_tokens_seen": 167101545, "step": 7730, "time_per_iteration": 2.6720597743988037 }, { "auxiliary_loss_clip": 0.01169166, "auxiliary_loss_mlp": 0.01025262, "balance_loss_clip": 0.93301487, "balance_loss_mlp": 1.01798689, "epoch": 0.9295977875308122, "flos": 26211809024640.0, "grad_norm": 1.5887695166524742, "language_loss": 0.73349178, "learning_rate": 5.16910498401848e-08, "loss": 0.75543606, "num_input_tokens_seen": 167120995, "step": 7731, "time_per_iteration": 2.7048614025115967 }, { "auxiliary_loss_clip": 0.01165394, "auxiliary_loss_mlp": 0.0103095, "balance_loss_clip": 1.04890645, "balance_loss_mlp": 1.02376461, "epoch": 0.9297180304214513, "flos": 16472297105280.0, "grad_norm": 1.9602267769544317, "language_loss": 0.83367509, "learning_rate": 5.151524129292073e-08, "loss": 0.8556385, "num_input_tokens_seen": 167138890, "step": 7732, "time_per_iteration": 2.6767466068267822 }, { "auxiliary_loss_clip": 0.01165187, "auxiliary_loss_mlp": 0.01029527, "balance_loss_clip": 1.00893044, "balance_loss_mlp": 1.02268195, "epoch": 0.9298382733120905, "flos": 24060436859520.0, "grad_norm": 2.0537786841039267, "language_loss": 0.66338253, "learning_rate": 5.1339728329245155e-08, "loss": 0.68532968, "num_input_tokens_seen": 167159455, "step": 7733, "time_per_iteration": 2.5898971557617188 }, { "auxiliary_loss_clip": 0.01169404, "auxiliary_loss_mlp": 0.01025347, "balance_loss_clip": 1.04714656, "balance_loss_mlp": 1.01736307, "epoch": 0.9299585162027295, "flos": 22127652910080.0, "grad_norm": 1.9673140561544122, "language_loss": 0.79014027, "learning_rate": 5.116451097578367e-08, "loss": 0.81208777, "num_input_tokens_seen": 167178495, "step": 7734, "time_per_iteration": 2.663294792175293 }, { "auxiliary_loss_clip": 0.01159346, "auxiliary_loss_mlp": 0.01026619, "balance_loss_clip": 0.93140519, "balance_loss_mlp": 1.01993692, "epoch": 0.9300787590933686, "flos": 21471780522240.0, "grad_norm": 1.5611058120466916, "language_loss": 0.74182469, "learning_rate": 5.0989589259115895e-08, "loss": 0.76368427, "num_input_tokens_seen": 167199380, "step": 7735, "time_per_iteration": 2.732111930847168 }, { "auxiliary_loss_clip": 0.01159848, "auxiliary_loss_mlp": 0.01029607, "balance_loss_clip": 1.00629938, "balance_loss_mlp": 1.02212667, "epoch": 0.9301990019840077, "flos": 17779588594560.0, "grad_norm": 1.8099759996327351, "language_loss": 0.71364397, "learning_rate": 5.081496320577816e-08, "loss": 0.73553848, "num_input_tokens_seen": 167216500, "step": 7736, "time_per_iteration": 2.701341152191162 }, { "auxiliary_loss_clip": 0.0107503, "auxiliary_loss_mlp": 0.01001533, "balance_loss_clip": 0.947074, "balance_loss_mlp": 0.99982816, "epoch": 0.9303192448746468, "flos": 58896122307840.0, "grad_norm": 0.9112032228756429, "language_loss": 0.61257178, "learning_rate": 5.0640632842260835e-08, "loss": 0.63333738, "num_input_tokens_seen": 167276760, "step": 7737, "time_per_iteration": 3.296916961669922 }, { "auxiliary_loss_clip": 0.0116579, "auxiliary_loss_mlp": 0.01122331, "balance_loss_clip": 0.93717909, "balance_loss_mlp": 0.0, "epoch": 0.9304394877652858, "flos": 57663522172800.0, "grad_norm": 1.881447944531793, "language_loss": 0.72616142, "learning_rate": 5.0466598195009426e-08, "loss": 0.74904263, "num_input_tokens_seen": 167303630, "step": 7738, "time_per_iteration": 3.0766537189483643 }, { "auxiliary_loss_clip": 0.01167072, "auxiliary_loss_mlp": 0.01024816, "balance_loss_clip": 0.93407011, "balance_loss_mlp": 1.01808083, "epoch": 0.930559730655925, "flos": 20996143603200.0, "grad_norm": 3.5514274635127334, "language_loss": 0.70203257, "learning_rate": 5.0292859290425036e-08, "loss": 0.72395146, "num_input_tokens_seen": 167321500, "step": 7739, "time_per_iteration": 3.442328453063965 }, { "auxiliary_loss_clip": 0.01165303, "auxiliary_loss_mlp": 0.01024759, "balance_loss_clip": 1.04791999, "balance_loss_mlp": 1.01817226, "epoch": 0.9306799735465641, "flos": 23258264376960.0, "grad_norm": 2.019899973858902, "language_loss": 0.77857828, "learning_rate": 5.011941615486348e-08, "loss": 0.80047894, "num_input_tokens_seen": 167340615, "step": 7740, "time_per_iteration": 2.6336190700531006 }, { "auxiliary_loss_clip": 0.01164648, "auxiliary_loss_mlp": 0.01019376, "balance_loss_clip": 1.04692149, "balance_loss_mlp": 1.01289964, "epoch": 0.9308002164372031, "flos": 15231547560960.0, "grad_norm": 2.815828183285261, "language_loss": 0.84501988, "learning_rate": 4.994626881463659e-08, "loss": 0.86686009, "num_input_tokens_seen": 167356870, "step": 7741, "time_per_iteration": 2.525911331176758 }, { "auxiliary_loss_clip": 0.01142502, "auxiliary_loss_mlp": 0.01029033, "balance_loss_clip": 0.88998616, "balance_loss_mlp": 1.02253616, "epoch": 0.9309204593278423, "flos": 30847481539200.0, "grad_norm": 2.0043982672480523, "language_loss": 0.71174037, "learning_rate": 4.9773417296009814e-08, "loss": 0.73345578, "num_input_tokens_seen": 167378390, "step": 7742, "time_per_iteration": 2.781878709793091 }, { "auxiliary_loss_clip": 0.01170094, "auxiliary_loss_mlp": 0.01022644, "balance_loss_clip": 1.00916767, "balance_loss_mlp": 1.01539052, "epoch": 0.9310407022184813, "flos": 23037269950080.0, "grad_norm": 2.6277426381388613, "language_loss": 0.65635002, "learning_rate": 4.960086162520527e-08, "loss": 0.67827737, "num_input_tokens_seen": 167398480, "step": 7743, "time_per_iteration": 2.666161298751831 }, { "auxiliary_loss_clip": 0.01169067, "auxiliary_loss_mlp": 0.01024503, "balance_loss_clip": 0.89445704, "balance_loss_mlp": 1.01763654, "epoch": 0.9311609451091204, "flos": 22127976132480.0, "grad_norm": 1.9114890728692453, "language_loss": 0.82411194, "learning_rate": 4.942860182839936e-08, "loss": 0.84604758, "num_input_tokens_seen": 167416825, "step": 7744, "time_per_iteration": 3.703435182571411 }, { "auxiliary_loss_clip": 0.01162018, "auxiliary_loss_mlp": 0.01024807, "balance_loss_clip": 0.97055483, "balance_loss_mlp": 1.01747549, "epoch": 0.9312811879997596, "flos": 21099206701440.0, "grad_norm": 1.7418715238187092, "language_loss": 0.79493582, "learning_rate": 4.925663793172341e-08, "loss": 0.81680411, "num_input_tokens_seen": 167434785, "step": 7745, "time_per_iteration": 2.6057894229888916 }, { "auxiliary_loss_clip": 0.01060581, "auxiliary_loss_mlp": 0.01115574, "balance_loss_clip": 0.93577933, "balance_loss_mlp": 0.0, "epoch": 0.9314014308903986, "flos": 67148179096320.0, "grad_norm": 0.7961973243150018, "language_loss": 0.56482828, "learning_rate": 4.908496996126477e-08, "loss": 0.58658981, "num_input_tokens_seen": 167498245, "step": 7746, "time_per_iteration": 4.27347469329834 }, { "auxiliary_loss_clip": 0.01167852, "auxiliary_loss_mlp": 0.01027412, "balance_loss_clip": 1.01370347, "balance_loss_mlp": 1.02047682, "epoch": 0.9315216737810377, "flos": 22565583527040.0, "grad_norm": 1.647637558391886, "language_loss": 0.76347136, "learning_rate": 4.89135979430646e-08, "loss": 0.78542399, "num_input_tokens_seen": 167518290, "step": 7747, "time_per_iteration": 2.7043066024780273 }, { "auxiliary_loss_clip": 0.01167291, "auxiliary_loss_mlp": 0.01022736, "balance_loss_clip": 1.04856062, "balance_loss_mlp": 1.01610184, "epoch": 0.9316419166716768, "flos": 23984054588160.0, "grad_norm": 1.7526667919242067, "language_loss": 0.85316539, "learning_rate": 4.874252190312078e-08, "loss": 0.87506568, "num_input_tokens_seen": 167538675, "step": 7748, "time_per_iteration": 2.5512163639068604 }, { "auxiliary_loss_clip": 0.01168243, "auxiliary_loss_mlp": 0.01025685, "balance_loss_clip": 1.00895154, "balance_loss_mlp": 1.01873553, "epoch": 0.9317621595623159, "flos": 30230464688640.0, "grad_norm": 1.9708791151602356, "language_loss": 0.64521813, "learning_rate": 4.857174186738477e-08, "loss": 0.66715747, "num_input_tokens_seen": 167562025, "step": 7749, "time_per_iteration": 2.6922500133514404 }, { "auxiliary_loss_clip": 0.01170129, "auxiliary_loss_mlp": 0.01022861, "balance_loss_clip": 1.05092371, "balance_loss_mlp": 1.0158608, "epoch": 0.931882402452955, "flos": 15742735966080.0, "grad_norm": 2.5933086140787958, "language_loss": 0.73703593, "learning_rate": 4.840125786176408e-08, "loss": 0.75896585, "num_input_tokens_seen": 167578230, "step": 7750, "time_per_iteration": 2.5240390300750732 }, { "auxiliary_loss_clip": 0.01159976, "auxiliary_loss_mlp": 0.010228, "balance_loss_clip": 0.97187436, "balance_loss_mlp": 1.01629996, "epoch": 0.932002645343594, "flos": 28366521154560.0, "grad_norm": 1.8265322039737169, "language_loss": 0.7758401, "learning_rate": 4.823106991212067e-08, "loss": 0.79766786, "num_input_tokens_seen": 167597470, "step": 7751, "time_per_iteration": 2.754866361618042 }, { "auxiliary_loss_clip": 0.01166235, "auxiliary_loss_mlp": 0.01026701, "balance_loss_clip": 1.00941491, "balance_loss_mlp": 1.02009678, "epoch": 0.9321228882342332, "flos": 15341146934400.0, "grad_norm": 5.810471405494502, "language_loss": 0.83220422, "learning_rate": 4.806117804427212e-08, "loss": 0.85413361, "num_input_tokens_seen": 167615405, "step": 7752, "time_per_iteration": 2.5540857315063477 }, { "auxiliary_loss_clip": 0.01160884, "auxiliary_loss_mlp": 0.01033103, "balance_loss_clip": 1.00976789, "balance_loss_mlp": 1.02587295, "epoch": 0.9322431311248722, "flos": 17895365107200.0, "grad_norm": 2.0982008541511736, "language_loss": 0.63694358, "learning_rate": 4.7891582283990926e-08, "loss": 0.65888345, "num_input_tokens_seen": 167634130, "step": 7753, "time_per_iteration": 2.5995993614196777 }, { "auxiliary_loss_clip": 0.0116259, "auxiliary_loss_mlp": 0.01029708, "balance_loss_clip": 0.93085921, "balance_loss_mlp": 1.02253795, "epoch": 0.9323633740155113, "flos": 24169713010560.0, "grad_norm": 1.497984569124537, "language_loss": 0.72731316, "learning_rate": 4.772228265700473e-08, "loss": 0.74923611, "num_input_tokens_seen": 167654990, "step": 7754, "time_per_iteration": 2.6880786418914795 }, { "auxiliary_loss_clip": 0.01168492, "auxiliary_loss_mlp": 0.01028844, "balance_loss_clip": 1.01014924, "balance_loss_mlp": 1.02170944, "epoch": 0.9324836169061504, "flos": 15043482927360.0, "grad_norm": 2.3760238292061353, "language_loss": 0.75742137, "learning_rate": 4.75532791889961e-08, "loss": 0.77939475, "num_input_tokens_seen": 167671690, "step": 7755, "time_per_iteration": 2.585024118423462 }, { "auxiliary_loss_clip": 0.01163036, "auxiliary_loss_mlp": 0.01024019, "balance_loss_clip": 1.00815415, "balance_loss_mlp": 1.01744127, "epoch": 0.9326038597967895, "flos": 18624890332800.0, "grad_norm": 1.766855521955724, "language_loss": 0.65176964, "learning_rate": 4.738457190560252e-08, "loss": 0.67364019, "num_input_tokens_seen": 167690800, "step": 7756, "time_per_iteration": 2.587415933609009 }, { "auxiliary_loss_clip": 0.01165156, "auxiliary_loss_mlp": 0.01026299, "balance_loss_clip": 0.89815116, "balance_loss_mlp": 1.01951897, "epoch": 0.9327241026874286, "flos": 18952646958720.0, "grad_norm": 2.418089017629428, "language_loss": 0.79220819, "learning_rate": 4.721616083241664e-08, "loss": 0.8141228, "num_input_tokens_seen": 167709055, "step": 7757, "time_per_iteration": 2.6641552448272705 }, { "auxiliary_loss_clip": 0.01159737, "auxiliary_loss_mlp": 0.01030398, "balance_loss_clip": 1.00922132, "balance_loss_mlp": 1.02323961, "epoch": 0.9328443455780677, "flos": 29570282668800.0, "grad_norm": 1.846987759988745, "language_loss": 0.77361631, "learning_rate": 4.7048045994986684e-08, "loss": 0.79551768, "num_input_tokens_seen": 167729915, "step": 7758, "time_per_iteration": 2.6986286640167236 }, { "auxiliary_loss_clip": 0.01171609, "auxiliary_loss_mlp": 0.01022261, "balance_loss_clip": 1.01017356, "balance_loss_mlp": 1.01472151, "epoch": 0.9329645884687068, "flos": 30081722469120.0, "grad_norm": 2.416207049935834, "language_loss": 0.90973532, "learning_rate": 4.688022741881559e-08, "loss": 0.931674, "num_input_tokens_seen": 167750440, "step": 7759, "time_per_iteration": 2.659853219985962 }, { "auxiliary_loss_clip": 0.01160304, "auxiliary_loss_mlp": 0.01021411, "balance_loss_clip": 1.00761962, "balance_loss_mlp": 1.01431489, "epoch": 0.9330848313593458, "flos": 21867982513920.0, "grad_norm": 1.925556423665575, "language_loss": 0.74856079, "learning_rate": 4.671270512936076e-08, "loss": 0.77037793, "num_input_tokens_seen": 167769600, "step": 7760, "time_per_iteration": 2.6117734909057617 }, { "auxiliary_loss_clip": 0.01155226, "auxiliary_loss_mlp": 0.01024127, "balance_loss_clip": 0.93082631, "balance_loss_mlp": 1.01691508, "epoch": 0.933205074249985, "flos": 22127221946880.0, "grad_norm": 1.7297168489196075, "language_loss": 0.83079851, "learning_rate": 4.6545479152035884e-08, "loss": 0.85259205, "num_input_tokens_seen": 167788770, "step": 7761, "time_per_iteration": 2.671342611312866 }, { "auxiliary_loss_clip": 0.01167387, "auxiliary_loss_mlp": 0.01030612, "balance_loss_clip": 1.00984967, "balance_loss_mlp": 1.02353084, "epoch": 0.9333253171406241, "flos": 15341254675200.0, "grad_norm": 1.8939569863692451, "language_loss": 0.76079309, "learning_rate": 4.637854951220821e-08, "loss": 0.78277302, "num_input_tokens_seen": 167805555, "step": 7762, "time_per_iteration": 2.6226933002471924 }, { "auxiliary_loss_clip": 0.01158456, "auxiliary_loss_mlp": 0.01027029, "balance_loss_clip": 0.93212783, "balance_loss_mlp": 1.02041852, "epoch": 0.9334455600312631, "flos": 15706142985600.0, "grad_norm": 2.3668705876670204, "language_loss": 0.74647349, "learning_rate": 4.621191623520171e-08, "loss": 0.76832831, "num_input_tokens_seen": 167823985, "step": 7763, "time_per_iteration": 2.6877613067626953 }, { "auxiliary_loss_clip": 0.01175078, "auxiliary_loss_mlp": 0.01024515, "balance_loss_clip": 0.85785848, "balance_loss_mlp": 1.01680839, "epoch": 0.9335658029219023, "flos": 22163563532160.0, "grad_norm": 2.314177964251473, "language_loss": 0.84193611, "learning_rate": 4.604557934629372e-08, "loss": 0.86393201, "num_input_tokens_seen": 167843060, "step": 7764, "time_per_iteration": 2.9695184230804443 }, { "auxiliary_loss_clip": 0.01158351, "auxiliary_loss_mlp": 0.01025381, "balance_loss_clip": 0.9708581, "balance_loss_mlp": 1.01869607, "epoch": 0.9336860458125413, "flos": 20266833859200.0, "grad_norm": 1.9329839060839356, "language_loss": 0.8044287, "learning_rate": 4.587953887071805e-08, "loss": 0.82626605, "num_input_tokens_seen": 167862880, "step": 7765, "time_per_iteration": 2.8660106658935547 }, { "auxiliary_loss_clip": 0.01155802, "auxiliary_loss_mlp": 0.01026729, "balance_loss_clip": 0.96649945, "balance_loss_mlp": 1.01986539, "epoch": 0.9338062887031804, "flos": 20919689504640.0, "grad_norm": 1.687585492393405, "language_loss": 0.85788035, "learning_rate": 4.5713794833662554e-08, "loss": 0.87970567, "num_input_tokens_seen": 167882095, "step": 7766, "time_per_iteration": 3.536728858947754 }, { "auxiliary_loss_clip": 0.01168261, "auxiliary_loss_mlp": 0.01022276, "balance_loss_clip": 1.04773927, "balance_loss_mlp": 1.01505446, "epoch": 0.9339265315938196, "flos": 23221635482880.0, "grad_norm": 1.6867661359590258, "language_loss": 0.63062489, "learning_rate": 4.5548347260270236e-08, "loss": 0.65253025, "num_input_tokens_seen": 167901385, "step": 7767, "time_per_iteration": 2.6530635356903076 }, { "auxiliary_loss_clip": 0.01156726, "auxiliary_loss_mlp": 0.01025079, "balance_loss_clip": 0.93079537, "balance_loss_mlp": 1.01820087, "epoch": 0.9340467744844586, "flos": 22820261932800.0, "grad_norm": 1.6437865490103782, "language_loss": 0.69160771, "learning_rate": 4.538319617564012e-08, "loss": 0.71342576, "num_input_tokens_seen": 167920405, "step": 7768, "time_per_iteration": 2.6938042640686035 }, { "auxiliary_loss_clip": 0.01162153, "auxiliary_loss_mlp": 0.01023346, "balance_loss_clip": 0.96920693, "balance_loss_mlp": 1.01558506, "epoch": 0.9341670173750977, "flos": 23660428026240.0, "grad_norm": 1.9718654668842777, "language_loss": 0.74331677, "learning_rate": 4.521834160482485e-08, "loss": 0.76517177, "num_input_tokens_seen": 167939145, "step": 7769, "time_per_iteration": 2.7236268520355225 }, { "auxiliary_loss_clip": 0.01167942, "auxiliary_loss_mlp": 0.01027727, "balance_loss_clip": 1.00883031, "balance_loss_mlp": 1.02165353, "epoch": 0.9342872602657368, "flos": 24824256595200.0, "grad_norm": 1.5485063245261432, "language_loss": 0.82159245, "learning_rate": 4.5053783572832846e-08, "loss": 0.84354919, "num_input_tokens_seen": 167959325, "step": 7770, "time_per_iteration": 2.7069740295410156 }, { "auxiliary_loss_clip": 0.01165818, "auxiliary_loss_mlp": 0.01025118, "balance_loss_clip": 1.01061416, "balance_loss_mlp": 1.01791191, "epoch": 0.9344075031563759, "flos": 25771831332480.0, "grad_norm": 1.5393190618164507, "language_loss": 0.76182806, "learning_rate": 4.488952210462771e-08, "loss": 0.78373742, "num_input_tokens_seen": 167979530, "step": 7771, "time_per_iteration": 3.6962008476257324 }, { "auxiliary_loss_clip": 0.01166953, "auxiliary_loss_mlp": 0.01025805, "balance_loss_clip": 1.04930508, "balance_loss_mlp": 1.01887262, "epoch": 0.9345277460470149, "flos": 25551303782400.0, "grad_norm": 1.859724112403196, "language_loss": 0.86050856, "learning_rate": 4.4725557225127495e-08, "loss": 0.8824361, "num_input_tokens_seen": 167997870, "step": 7772, "time_per_iteration": 3.4419164657592773 }, { "auxiliary_loss_clip": 0.01166935, "auxiliary_loss_mlp": 0.01031885, "balance_loss_clip": 1.01024687, "balance_loss_mlp": 1.02515531, "epoch": 0.9346479889376541, "flos": 34313112432000.0, "grad_norm": 1.5470347656865153, "language_loss": 0.79118133, "learning_rate": 4.456188895920565e-08, "loss": 0.81316948, "num_input_tokens_seen": 168019625, "step": 7773, "time_per_iteration": 3.71828556060791 }, { "auxiliary_loss_clip": 0.01169158, "auxiliary_loss_mlp": 0.01022948, "balance_loss_clip": 1.04927373, "balance_loss_mlp": 1.01584888, "epoch": 0.9347682318282932, "flos": 19093739581440.0, "grad_norm": 1.848838763052677, "language_loss": 0.85165846, "learning_rate": 4.439851733169031e-08, "loss": 0.8735795, "num_input_tokens_seen": 168037415, "step": 7774, "time_per_iteration": 2.5436699390411377 }, { "auxiliary_loss_clip": 0.01162927, "auxiliary_loss_mlp": 0.01022788, "balance_loss_clip": 0.93212974, "balance_loss_mlp": 1.01586461, "epoch": 0.9348884747189322, "flos": 26249587153920.0, "grad_norm": 2.157980610212242, "language_loss": 0.68909556, "learning_rate": 4.4235442367365204e-08, "loss": 0.71095276, "num_input_tokens_seen": 168057725, "step": 7775, "time_per_iteration": 2.7506792545318604 }, { "auxiliary_loss_clip": 0.01156308, "auxiliary_loss_mlp": 0.01024764, "balance_loss_clip": 0.96591276, "balance_loss_mlp": 1.01773643, "epoch": 0.9350087176095714, "flos": 18333080242560.0, "grad_norm": 1.893466044278317, "language_loss": 0.79357028, "learning_rate": 4.4072664090968545e-08, "loss": 0.81538099, "num_input_tokens_seen": 168076110, "step": 7776, "time_per_iteration": 2.58796763420105 }, { "auxiliary_loss_clip": 0.0116293, "auxiliary_loss_mlp": 0.01022193, "balance_loss_clip": 0.96758687, "balance_loss_mlp": 1.01551437, "epoch": 0.9351289605002104, "flos": 19318253541120.0, "grad_norm": 1.9792935945376913, "language_loss": 0.84761083, "learning_rate": 4.391018252719347e-08, "loss": 0.86946207, "num_input_tokens_seen": 168095905, "step": 7777, "time_per_iteration": 2.7251839637756348 }, { "auxiliary_loss_clip": 0.01165529, "auxiliary_loss_mlp": 0.01032117, "balance_loss_clip": 0.97006404, "balance_loss_mlp": 1.02431464, "epoch": 0.9352492033908495, "flos": 18799990156800.0, "grad_norm": 1.775199905180169, "language_loss": 0.68975258, "learning_rate": 4.374799770068849e-08, "loss": 0.71172899, "num_input_tokens_seen": 168112580, "step": 7778, "time_per_iteration": 2.6325395107269287 }, { "auxiliary_loss_clip": 0.01160031, "auxiliary_loss_mlp": 0.01019868, "balance_loss_clip": 1.00805962, "balance_loss_mlp": 1.01312971, "epoch": 0.9353694462814887, "flos": 29530134241920.0, "grad_norm": 2.310542087278874, "language_loss": 0.74878132, "learning_rate": 4.358610963605658e-08, "loss": 0.77058041, "num_input_tokens_seen": 168133030, "step": 7779, "time_per_iteration": 2.658562421798706 }, { "auxiliary_loss_clip": 0.01168612, "auxiliary_loss_mlp": 0.01031421, "balance_loss_clip": 1.04888141, "balance_loss_mlp": 1.02386594, "epoch": 0.9354896891721277, "flos": 30665450390400.0, "grad_norm": 2.312291611508791, "language_loss": 0.6852448, "learning_rate": 4.342451835785677e-08, "loss": 0.70724511, "num_input_tokens_seen": 168153940, "step": 7780, "time_per_iteration": 2.648514747619629 }, { "auxiliary_loss_clip": 0.0116112, "auxiliary_loss_mlp": 0.01021251, "balance_loss_clip": 0.97087055, "balance_loss_mlp": 1.01489449, "epoch": 0.9356099320627668, "flos": 19463907191040.0, "grad_norm": 1.6532937724078773, "language_loss": 0.74930906, "learning_rate": 4.3263223890601665e-08, "loss": 0.77113283, "num_input_tokens_seen": 168172650, "step": 7781, "time_per_iteration": 2.6410388946533203 }, { "auxiliary_loss_clip": 0.01162805, "auxiliary_loss_mlp": 0.01122093, "balance_loss_clip": 1.01059222, "balance_loss_mlp": 0.0, "epoch": 0.9357301749534058, "flos": 19098156954240.0, "grad_norm": 1.6027721060267353, "language_loss": 0.79424727, "learning_rate": 4.31022262587597e-08, "loss": 0.81709617, "num_input_tokens_seen": 168191325, "step": 7782, "time_per_iteration": 2.591984510421753 }, { "auxiliary_loss_clip": 0.01164569, "auxiliary_loss_mlp": 0.01027851, "balance_loss_clip": 1.00989354, "balance_loss_mlp": 1.02039468, "epoch": 0.935850417844045, "flos": 23550361776000.0, "grad_norm": 1.676897680770456, "language_loss": 0.66121876, "learning_rate": 4.2941525486754225e-08, "loss": 0.68314296, "num_input_tokens_seen": 168211645, "step": 7783, "time_per_iteration": 2.6140687465667725 }, { "auxiliary_loss_clip": 0.01156921, "auxiliary_loss_mlp": 0.01026349, "balance_loss_clip": 0.93285424, "balance_loss_mlp": 1.01952136, "epoch": 0.935970660734684, "flos": 18588333265920.0, "grad_norm": 1.9031488776609622, "language_loss": 0.79394907, "learning_rate": 4.278112159896286e-08, "loss": 0.81578177, "num_input_tokens_seen": 168229485, "step": 7784, "time_per_iteration": 2.660292148590088 }, { "auxiliary_loss_clip": 0.01154878, "auxiliary_loss_mlp": 0.01026146, "balance_loss_clip": 0.96672082, "balance_loss_mlp": 1.01978362, "epoch": 0.9360909036253231, "flos": 20631255292800.0, "grad_norm": 1.8967227964363877, "language_loss": 0.67567748, "learning_rate": 4.2621014619719896e-08, "loss": 0.69748771, "num_input_tokens_seen": 168247250, "step": 7785, "time_per_iteration": 2.6271867752075195 }, { "auxiliary_loss_clip": 0.01064123, "auxiliary_loss_mlp": 0.01001944, "balance_loss_clip": 0.93489718, "balance_loss_mlp": 1.00026274, "epoch": 0.9362111465159623, "flos": 61791421052160.0, "grad_norm": 0.7913127780848326, "language_loss": 0.58698756, "learning_rate": 4.246120457331215e-08, "loss": 0.60764819, "num_input_tokens_seen": 168309425, "step": 7786, "time_per_iteration": 3.2305097579956055 }, { "auxiliary_loss_clip": 0.0115556, "auxiliary_loss_mlp": 0.01034594, "balance_loss_clip": 0.97089064, "balance_loss_mlp": 1.02717316, "epoch": 0.9363313894066013, "flos": 24170395368960.0, "grad_norm": 2.4827274154529815, "language_loss": 0.71994978, "learning_rate": 4.2301691483983325e-08, "loss": 0.74185133, "num_input_tokens_seen": 168329545, "step": 7787, "time_per_iteration": 2.6649961471557617 }, { "auxiliary_loss_clip": 0.01168199, "auxiliary_loss_mlp": 0.01030452, "balance_loss_clip": 1.00884235, "balance_loss_mlp": 1.02372861, "epoch": 0.9364516322972404, "flos": 20120354196480.0, "grad_norm": 1.8419000149762543, "language_loss": 0.75503027, "learning_rate": 4.214247537593163e-08, "loss": 0.77701688, "num_input_tokens_seen": 168348795, "step": 7788, "time_per_iteration": 2.5916740894317627 }, { "auxiliary_loss_clip": 0.01164507, "auxiliary_loss_mlp": 0.01025911, "balance_loss_clip": 0.97066694, "balance_loss_mlp": 1.01900315, "epoch": 0.9365718751878795, "flos": 20703758895360.0, "grad_norm": 2.248879808332816, "language_loss": 0.8058995, "learning_rate": 4.1983556273309293e-08, "loss": 0.82780373, "num_input_tokens_seen": 168367545, "step": 7789, "time_per_iteration": 2.6717753410339355 }, { "auxiliary_loss_clip": 0.01166701, "auxiliary_loss_mlp": 0.01029009, "balance_loss_clip": 1.04653788, "balance_loss_mlp": 1.02145672, "epoch": 0.9366921180785186, "flos": 18655270260480.0, "grad_norm": 2.5393897764823583, "language_loss": 0.68781656, "learning_rate": 4.182493420022526e-08, "loss": 0.70977366, "num_input_tokens_seen": 168383215, "step": 7790, "time_per_iteration": 2.535511016845703 }, { "auxiliary_loss_clip": 0.01165529, "auxiliary_loss_mlp": 0.01022815, "balance_loss_clip": 0.93292361, "balance_loss_mlp": 1.01616323, "epoch": 0.9368123609691577, "flos": 25774955815680.0, "grad_norm": 1.6578320278223078, "language_loss": 0.78684658, "learning_rate": 4.166660918074139e-08, "loss": 0.80873001, "num_input_tokens_seen": 168403120, "step": 7791, "time_per_iteration": 3.617368221282959 }, { "auxiliary_loss_clip": 0.01156622, "auxiliary_loss_mlp": 0.01029814, "balance_loss_clip": 0.93144029, "balance_loss_mlp": 1.02277136, "epoch": 0.9369326038597968, "flos": 25553386771200.0, "grad_norm": 1.3960087185850083, "language_loss": 0.73486209, "learning_rate": 4.15085812388758e-08, "loss": 0.7567265, "num_input_tokens_seen": 168425340, "step": 7792, "time_per_iteration": 2.7134695053100586 }, { "auxiliary_loss_clip": 0.01163922, "auxiliary_loss_mlp": 0.01024581, "balance_loss_clip": 0.97192973, "balance_loss_mlp": 1.01780701, "epoch": 0.9370528467504359, "flos": 23220019370880.0, "grad_norm": 1.6421447687808701, "language_loss": 0.78503817, "learning_rate": 4.135085039860153e-08, "loss": 0.80692315, "num_input_tokens_seen": 168444740, "step": 7793, "time_per_iteration": 2.6684987545013428 }, { "auxiliary_loss_clip": 0.01162561, "auxiliary_loss_mlp": 0.01022831, "balance_loss_clip": 0.97298902, "balance_loss_mlp": 1.01573241, "epoch": 0.9371730896410749, "flos": 24967468120320.0, "grad_norm": 2.2482173754882293, "language_loss": 0.78655416, "learning_rate": 4.1193416683845906e-08, "loss": 0.80840808, "num_input_tokens_seen": 168463670, "step": 7794, "time_per_iteration": 2.722346544265747 }, { "auxiliary_loss_clip": 0.01164323, "auxiliary_loss_mlp": 0.01019525, "balance_loss_clip": 0.93328369, "balance_loss_mlp": 1.01297116, "epoch": 0.9372933325317141, "flos": 15553091134080.0, "grad_norm": 2.657234319924327, "language_loss": 0.83145273, "learning_rate": 4.103628011849136e-08, "loss": 0.85329115, "num_input_tokens_seen": 168479030, "step": 7795, "time_per_iteration": 2.631204843521118 }, { "auxiliary_loss_clip": 0.0116908, "auxiliary_loss_mlp": 0.0102352, "balance_loss_clip": 0.97312099, "balance_loss_mlp": 1.01605701, "epoch": 0.9374135754223532, "flos": 21871861182720.0, "grad_norm": 2.1901745007975277, "language_loss": 0.75755161, "learning_rate": 4.0879440726375506e-08, "loss": 0.7794776, "num_input_tokens_seen": 168496815, "step": 7796, "time_per_iteration": 2.714148998260498 }, { "auxiliary_loss_clip": 0.01155666, "auxiliary_loss_mlp": 0.01026831, "balance_loss_clip": 0.9657169, "balance_loss_mlp": 1.01925778, "epoch": 0.9375338183129922, "flos": 22631048064000.0, "grad_norm": 2.893274401770568, "language_loss": 0.56776273, "learning_rate": 4.0722898531291074e-08, "loss": 0.58958769, "num_input_tokens_seen": 168514055, "step": 7797, "time_per_iteration": 3.619877576828003 }, { "auxiliary_loss_clip": 0.01172188, "auxiliary_loss_mlp": 0.01021431, "balance_loss_clip": 0.97166985, "balance_loss_mlp": 1.01402235, "epoch": 0.9376540612036314, "flos": 26104292640000.0, "grad_norm": 1.7256332685721625, "language_loss": 0.76841748, "learning_rate": 4.0566653556985295e-08, "loss": 0.79035366, "num_input_tokens_seen": 168534600, "step": 7798, "time_per_iteration": 4.490628719329834 }, { "auxiliary_loss_clip": 0.01163225, "auxiliary_loss_mlp": 0.01025811, "balance_loss_clip": 0.8180607, "balance_loss_mlp": 1.01807737, "epoch": 0.9377743040942704, "flos": 19717580016000.0, "grad_norm": 2.53499978054005, "language_loss": 0.82093811, "learning_rate": 4.0410705827159886e-08, "loss": 0.84282845, "num_input_tokens_seen": 168551895, "step": 7799, "time_per_iteration": 2.8585331439971924 }, { "auxiliary_loss_clip": 0.01157942, "auxiliary_loss_mlp": 0.01022122, "balance_loss_clip": 0.9664917, "balance_loss_mlp": 1.01510906, "epoch": 0.9378945469849095, "flos": 15267530010240.0, "grad_norm": 2.032001297343819, "language_loss": 0.71042955, "learning_rate": 4.0255055365472356e-08, "loss": 0.73223019, "num_input_tokens_seen": 168569990, "step": 7800, "time_per_iteration": 2.805222511291504 }, { "auxiliary_loss_clip": 0.01158718, "auxiliary_loss_mlp": 0.01027686, "balance_loss_clip": 0.85196114, "balance_loss_mlp": 1.02051544, "epoch": 0.9380147898755486, "flos": 20591394174720.0, "grad_norm": 2.1997457793782482, "language_loss": 0.74632466, "learning_rate": 4.009970219553471e-08, "loss": 0.76818871, "num_input_tokens_seen": 168586940, "step": 7801, "time_per_iteration": 2.697401285171509 }, { "auxiliary_loss_clip": 0.01169884, "auxiliary_loss_mlp": 0.01028609, "balance_loss_clip": 1.00899088, "balance_loss_mlp": 1.02087784, "epoch": 0.9381350327661877, "flos": 26281116316800.0, "grad_norm": 2.5243476018363684, "language_loss": 0.76928449, "learning_rate": 3.99446463409141e-08, "loss": 0.79126942, "num_input_tokens_seen": 168604795, "step": 7802, "time_per_iteration": 2.6767995357513428 }, { "auxiliary_loss_clip": 0.01169859, "auxiliary_loss_mlp": 0.01026195, "balance_loss_clip": 1.00765765, "balance_loss_mlp": 1.01875019, "epoch": 0.9382552756568268, "flos": 23586344225280.0, "grad_norm": 2.151774015542573, "language_loss": 0.69329512, "learning_rate": 3.978988782513215e-08, "loss": 0.71525562, "num_input_tokens_seen": 168622290, "step": 7803, "time_per_iteration": 2.589154005050659 }, { "auxiliary_loss_clip": 0.01169013, "auxiliary_loss_mlp": 0.01026562, "balance_loss_clip": 1.00876856, "balance_loss_mlp": 1.01967776, "epoch": 0.9383755185474659, "flos": 28438809275520.0, "grad_norm": 1.802032363465531, "language_loss": 0.76165199, "learning_rate": 3.963542667166586e-08, "loss": 0.78360772, "num_input_tokens_seen": 168642395, "step": 7804, "time_per_iteration": 2.6839101314544678 }, { "auxiliary_loss_clip": 0.01166727, "auxiliary_loss_mlp": 0.01029962, "balance_loss_clip": 0.93590391, "balance_loss_mlp": 1.02359939, "epoch": 0.938495761438105, "flos": 20449583280000.0, "grad_norm": 1.841980431488164, "language_loss": 0.68133789, "learning_rate": 3.9481262903946486e-08, "loss": 0.70330477, "num_input_tokens_seen": 168661840, "step": 7805, "time_per_iteration": 2.6395366191864014 }, { "auxiliary_loss_clip": 0.01073147, "auxiliary_loss_mlp": 0.01002248, "balance_loss_clip": 0.86084366, "balance_loss_mlp": 1.00045967, "epoch": 0.938616004328744, "flos": 69302711658240.0, "grad_norm": 0.771108848488092, "language_loss": 0.54515898, "learning_rate": 3.932739654536066e-08, "loss": 0.56591296, "num_input_tokens_seen": 168724540, "step": 7806, "time_per_iteration": 3.346752166748047 }, { "auxiliary_loss_clip": 0.01165354, "auxiliary_loss_mlp": 0.01025806, "balance_loss_clip": 1.00961566, "balance_loss_mlp": 1.01899362, "epoch": 0.9387362472193832, "flos": 18911636605440.0, "grad_norm": 2.40397351362345, "language_loss": 0.73776251, "learning_rate": 3.917382761925014e-08, "loss": 0.75967413, "num_input_tokens_seen": 168740375, "step": 7807, "time_per_iteration": 2.571418046951294 }, { "auxiliary_loss_clip": 0.01160508, "auxiliary_loss_mlp": 0.01025071, "balance_loss_clip": 1.00893092, "balance_loss_mlp": 1.01827633, "epoch": 0.9388564901100223, "flos": 26501967089280.0, "grad_norm": 1.623367604195172, "language_loss": 0.79253101, "learning_rate": 3.9020556148910754e-08, "loss": 0.81438684, "num_input_tokens_seen": 168759730, "step": 7808, "time_per_iteration": 2.664802074432373 }, { "auxiliary_loss_clip": 0.01066107, "auxiliary_loss_mlp": 0.01001214, "balance_loss_clip": 0.93371534, "balance_loss_mlp": 0.9996047, "epoch": 0.9389767330006613, "flos": 58941083157120.0, "grad_norm": 0.7185076120060846, "language_loss": 0.56810313, "learning_rate": 3.8867582157593895e-08, "loss": 0.58877629, "num_input_tokens_seen": 168813935, "step": 7809, "time_per_iteration": 3.0635218620300293 }, { "auxiliary_loss_clip": 0.01162595, "auxiliary_loss_mlp": 0.01026173, "balance_loss_clip": 1.01019645, "balance_loss_mlp": 1.01946473, "epoch": 0.9390969758913005, "flos": 31102554994560.0, "grad_norm": 1.6274213223516025, "language_loss": 0.76487637, "learning_rate": 3.871490566850544e-08, "loss": 0.78676403, "num_input_tokens_seen": 168838145, "step": 7810, "time_per_iteration": 2.6470046043395996 }, { "auxiliary_loss_clip": 0.01156534, "auxiliary_loss_mlp": 0.01027681, "balance_loss_clip": 0.96945906, "balance_loss_mlp": 1.02056384, "epoch": 0.9392172187819395, "flos": 22419391173120.0, "grad_norm": 1.4709034783754549, "language_loss": 0.70769745, "learning_rate": 3.856252670480642e-08, "loss": 0.72953957, "num_input_tokens_seen": 168856805, "step": 7811, "time_per_iteration": 2.718832492828369 }, { "auxiliary_loss_clip": 0.0115561, "auxiliary_loss_mlp": 0.01027362, "balance_loss_clip": 0.96594739, "balance_loss_mlp": 1.02028704, "epoch": 0.9393374616725786, "flos": 19719483436800.0, "grad_norm": 1.5972398173785265, "language_loss": 0.81317729, "learning_rate": 3.841044528961279e-08, "loss": 0.83500695, "num_input_tokens_seen": 168874600, "step": 7812, "time_per_iteration": 2.5853428840637207 }, { "auxiliary_loss_clip": 0.01167561, "auxiliary_loss_mlp": 0.01021039, "balance_loss_clip": 1.04567575, "balance_loss_mlp": 1.01417303, "epoch": 0.9394577045632178, "flos": 24170215800960.0, "grad_norm": 1.8155191927928451, "language_loss": 0.78677654, "learning_rate": 3.825866144599477e-08, "loss": 0.80866253, "num_input_tokens_seen": 168893655, "step": 7813, "time_per_iteration": 2.56266188621521 }, { "auxiliary_loss_clip": 0.01162958, "auxiliary_loss_mlp": 0.01023805, "balance_loss_clip": 0.96937364, "balance_loss_mlp": 1.01723361, "epoch": 0.9395779474538568, "flos": 19023929498880.0, "grad_norm": 1.9067061551064632, "language_loss": 0.75322509, "learning_rate": 3.8107175196978145e-08, "loss": 0.77509272, "num_input_tokens_seen": 168909960, "step": 7814, "time_per_iteration": 2.5842843055725098 }, { "auxiliary_loss_clip": 0.01163258, "auxiliary_loss_mlp": 0.01027962, "balance_loss_clip": 0.93444026, "balance_loss_mlp": 1.02121758, "epoch": 0.9396981903444959, "flos": 14319129260160.0, "grad_norm": 1.9479395993084998, "language_loss": 0.76866841, "learning_rate": 3.7955986565542996e-08, "loss": 0.79058063, "num_input_tokens_seen": 168928040, "step": 7815, "time_per_iteration": 2.7916672229766846 }, { "auxiliary_loss_clip": 0.01159014, "auxiliary_loss_mlp": 0.01024558, "balance_loss_clip": 0.93050909, "balance_loss_mlp": 1.01767969, "epoch": 0.9398184332351349, "flos": 34787564202240.0, "grad_norm": 2.0576683165083787, "language_loss": 0.68360639, "learning_rate": 3.780509557462497e-08, "loss": 0.70544213, "num_input_tokens_seen": 168948240, "step": 7816, "time_per_iteration": 2.7148497104644775 }, { "auxiliary_loss_clip": 0.01161523, "auxiliary_loss_mlp": 0.01025009, "balance_loss_clip": 0.96895105, "balance_loss_mlp": 1.01712966, "epoch": 0.9399386761257741, "flos": 25372253462400.0, "grad_norm": 1.6128489399543309, "language_loss": 0.75298244, "learning_rate": 3.765450224711375e-08, "loss": 0.77484775, "num_input_tokens_seen": 168968745, "step": 7817, "time_per_iteration": 3.775993824005127 }, { "auxiliary_loss_clip": 0.01162155, "auxiliary_loss_mlp": 0.01019422, "balance_loss_clip": 0.9725194, "balance_loss_mlp": 1.01294255, "epoch": 0.9400589190164131, "flos": 27304965584640.0, "grad_norm": 1.5972685293037732, "language_loss": 0.79779273, "learning_rate": 3.750420660585396e-08, "loss": 0.81960851, "num_input_tokens_seen": 168990685, "step": 7818, "time_per_iteration": 2.720743417739868 }, { "auxiliary_loss_clip": 0.01166983, "auxiliary_loss_mlp": 0.01022561, "balance_loss_clip": 1.04980636, "balance_loss_mlp": 1.01597726, "epoch": 0.9401791619070522, "flos": 23399859790080.0, "grad_norm": 1.9695610909169163, "language_loss": 0.7965377, "learning_rate": 3.735420867364603e-08, "loss": 0.81843311, "num_input_tokens_seen": 169011665, "step": 7819, "time_per_iteration": 2.55644154548645 }, { "auxiliary_loss_clip": 0.0114958, "auxiliary_loss_mlp": 0.01022045, "balance_loss_clip": 0.85187316, "balance_loss_mlp": 1.01531267, "epoch": 0.9402994047976914, "flos": 35881403120640.0, "grad_norm": 1.5615553140820069, "language_loss": 0.61515385, "learning_rate": 3.7204508473244186e-08, "loss": 0.63687009, "num_input_tokens_seen": 169035290, "step": 7820, "time_per_iteration": 2.8373003005981445 }, { "auxiliary_loss_clip": 0.01155125, "auxiliary_loss_mlp": 0.0102573, "balance_loss_clip": 0.81746602, "balance_loss_mlp": 1.01890576, "epoch": 0.9404196476883304, "flos": 22236821320320.0, "grad_norm": 1.4969747852978657, "language_loss": 0.69055212, "learning_rate": 3.7055106027357395e-08, "loss": 0.71236062, "num_input_tokens_seen": 169055155, "step": 7821, "time_per_iteration": 2.8915839195251465 }, { "auxiliary_loss_clip": 0.01163071, "auxiliary_loss_mlp": 0.01023695, "balance_loss_clip": 1.0095489, "balance_loss_mlp": 1.01670051, "epoch": 0.9405398905789695, "flos": 18915802583040.0, "grad_norm": 1.9404717097940904, "language_loss": 0.72071368, "learning_rate": 3.690600135865063e-08, "loss": 0.74258137, "num_input_tokens_seen": 169072080, "step": 7822, "time_per_iteration": 3.0550284385681152 }, { "auxiliary_loss_clip": 0.01070873, "auxiliary_loss_mlp": 0.01005477, "balance_loss_clip": 0.86027658, "balance_loss_mlp": 1.00384426, "epoch": 0.9406601334696086, "flos": 70274130048000.0, "grad_norm": 0.8158681166581297, "language_loss": 0.58122361, "learning_rate": 3.675719448974246e-08, "loss": 0.60198712, "num_input_tokens_seen": 169137170, "step": 7823, "time_per_iteration": 4.329629421234131 }, { "auxiliary_loss_clip": 0.01159949, "auxiliary_loss_mlp": 0.01122706, "balance_loss_clip": 0.89524472, "balance_loss_mlp": 0.0, "epoch": 0.9407803763602477, "flos": 22165071903360.0, "grad_norm": 1.8832517808951117, "language_loss": 0.60199833, "learning_rate": 3.6608685443207054e-08, "loss": 0.62482488, "num_input_tokens_seen": 169156320, "step": 7824, "time_per_iteration": 3.9973416328430176 }, { "auxiliary_loss_clip": 0.01163131, "auxiliary_loss_mlp": 0.01030847, "balance_loss_clip": 0.93197286, "balance_loss_mlp": 1.02407312, "epoch": 0.9409006192508867, "flos": 18879496911360.0, "grad_norm": 2.1132790531802796, "language_loss": 0.66850978, "learning_rate": 3.646047424157306e-08, "loss": 0.6904496, "num_input_tokens_seen": 169173295, "step": 7825, "time_per_iteration": 2.6747422218322754 }, { "auxiliary_loss_clip": 0.01164831, "auxiliary_loss_mlp": 0.01033856, "balance_loss_clip": 0.9715721, "balance_loss_mlp": 1.02648902, "epoch": 0.9410208621415259, "flos": 23368258800000.0, "grad_norm": 2.5397332308845715, "language_loss": 0.68308443, "learning_rate": 3.631256090732382e-08, "loss": 0.70507133, "num_input_tokens_seen": 169193755, "step": 7826, "time_per_iteration": 2.6651177406311035 }, { "auxiliary_loss_clip": 0.01164961, "auxiliary_loss_mlp": 0.0102308, "balance_loss_clip": 0.93464327, "balance_loss_mlp": 1.01624382, "epoch": 0.941141105032165, "flos": 22742227635840.0, "grad_norm": 1.6484309581346024, "language_loss": 0.82706821, "learning_rate": 3.6164945462897833e-08, "loss": 0.8489486, "num_input_tokens_seen": 169213045, "step": 7827, "time_per_iteration": 2.706915855407715 }, { "auxiliary_loss_clip": 0.01165715, "auxiliary_loss_mlp": 0.01122106, "balance_loss_clip": 1.01173735, "balance_loss_mlp": 0.0, "epoch": 0.941261347922804, "flos": 20704908130560.0, "grad_norm": 3.507756129179172, "language_loss": 0.75816625, "learning_rate": 3.6017627930687856e-08, "loss": 0.78104448, "num_input_tokens_seen": 169232870, "step": 7828, "time_per_iteration": 2.6351802349090576 }, { "auxiliary_loss_clip": 0.0115486, "auxiliary_loss_mlp": 0.01022103, "balance_loss_clip": 0.89036405, "balance_loss_mlp": 1.01550794, "epoch": 0.9413815908134432, "flos": 19421998997760.0, "grad_norm": 1.9125936580269745, "language_loss": 0.77021819, "learning_rate": 3.587060833304267e-08, "loss": 0.79198784, "num_input_tokens_seen": 169251060, "step": 7829, "time_per_iteration": 2.6703505516052246 }, { "auxiliary_loss_clip": 0.01169135, "auxiliary_loss_mlp": 0.01028189, "balance_loss_clip": 1.01080596, "balance_loss_mlp": 1.02105165, "epoch": 0.9415018337040822, "flos": 17493452853120.0, "grad_norm": 1.943936010331348, "language_loss": 0.63987005, "learning_rate": 3.5723886692264225e-08, "loss": 0.6618433, "num_input_tokens_seen": 169268600, "step": 7830, "time_per_iteration": 2.6124236583709717 }, { "auxiliary_loss_clip": 0.01159889, "auxiliary_loss_mlp": 0.01025377, "balance_loss_clip": 0.96910655, "balance_loss_mlp": 1.01845098, "epoch": 0.9416220765947213, "flos": 31831613343360.0, "grad_norm": 2.632415004186641, "language_loss": 0.6182158, "learning_rate": 3.557746303061071e-08, "loss": 0.64006841, "num_input_tokens_seen": 169290355, "step": 7831, "time_per_iteration": 2.723388195037842 }, { "auxiliary_loss_clip": 0.01160576, "auxiliary_loss_mlp": 0.01026071, "balance_loss_clip": 0.96849924, "balance_loss_mlp": 1.01875782, "epoch": 0.9417423194853605, "flos": 23511973115520.0, "grad_norm": 1.8930002267901536, "language_loss": 0.71843374, "learning_rate": 3.543133737029391e-08, "loss": 0.74030018, "num_input_tokens_seen": 169310865, "step": 7832, "time_per_iteration": 2.6724791526794434 }, { "auxiliary_loss_clip": 0.01167468, "auxiliary_loss_mlp": 0.01022842, "balance_loss_clip": 1.00901818, "balance_loss_mlp": 1.01600838, "epoch": 0.9418625623759995, "flos": 23915106432000.0, "grad_norm": 1.774614622687301, "language_loss": 0.68903184, "learning_rate": 3.5285509733481214e-08, "loss": 0.710935, "num_input_tokens_seen": 169330590, "step": 7833, "time_per_iteration": 2.6295223236083984 }, { "auxiliary_loss_clip": 0.01159866, "auxiliary_loss_mlp": 0.01030571, "balance_loss_clip": 1.00767636, "balance_loss_mlp": 1.02324009, "epoch": 0.9419828052666386, "flos": 18076965292800.0, "grad_norm": 1.6178470571034165, "language_loss": 0.76499957, "learning_rate": 3.513998014229469e-08, "loss": 0.78690392, "num_input_tokens_seen": 169349540, "step": 7834, "time_per_iteration": 2.5897202491760254 }, { "auxiliary_loss_clip": 0.01166906, "auxiliary_loss_mlp": 0.01022438, "balance_loss_clip": 0.97221863, "balance_loss_mlp": 1.01556277, "epoch": 0.9421030481572777, "flos": 17712328377600.0, "grad_norm": 2.7138797935211256, "language_loss": 0.86485535, "learning_rate": 3.499474861881069e-08, "loss": 0.88674879, "num_input_tokens_seen": 169366765, "step": 7835, "time_per_iteration": 2.6618404388427734 }, { "auxiliary_loss_clip": 0.01159672, "auxiliary_loss_mlp": 0.01028855, "balance_loss_clip": 0.85648882, "balance_loss_mlp": 1.02211642, "epoch": 0.9422232910479168, "flos": 20194114775040.0, "grad_norm": 7.808679699993672, "language_loss": 0.67840552, "learning_rate": 3.4849815185061136e-08, "loss": 0.7002908, "num_input_tokens_seen": 169386655, "step": 7836, "time_per_iteration": 2.7636830806732178 }, { "auxiliary_loss_clip": 0.01163882, "auxiliary_loss_mlp": 0.0102121, "balance_loss_clip": 1.00874567, "balance_loss_mlp": 1.01517463, "epoch": 0.9423435339385559, "flos": 18442571875200.0, "grad_norm": 2.011383216703443, "language_loss": 0.76066536, "learning_rate": 3.470517986303223e-08, "loss": 0.78251624, "num_input_tokens_seen": 169405640, "step": 7837, "time_per_iteration": 2.6139039993286133 }, { "auxiliary_loss_clip": 0.0116122, "auxiliary_loss_mlp": 0.01030093, "balance_loss_clip": 0.93406284, "balance_loss_mlp": 1.02329254, "epoch": 0.942463776829195, "flos": 20080636732800.0, "grad_norm": 1.588119327272424, "language_loss": 0.79280478, "learning_rate": 3.4560842674664856e-08, "loss": 0.81471789, "num_input_tokens_seen": 169424155, "step": 7838, "time_per_iteration": 2.6776206493377686 }, { "auxiliary_loss_clip": 0.0116359, "auxiliary_loss_mlp": 0.01022709, "balance_loss_clip": 1.00599504, "balance_loss_mlp": 1.01577687, "epoch": 0.9425840197198341, "flos": 22636255536000.0, "grad_norm": 1.808923350257961, "language_loss": 0.75194132, "learning_rate": 3.441680364185506e-08, "loss": 0.77380425, "num_input_tokens_seen": 169444025, "step": 7839, "time_per_iteration": 2.6313328742980957 }, { "auxiliary_loss_clip": 0.01168462, "auxiliary_loss_mlp": 0.0103026, "balance_loss_clip": 0.97438967, "balance_loss_mlp": 1.02263975, "epoch": 0.9427042626104731, "flos": 19937892084480.0, "grad_norm": 2.710918083294948, "language_loss": 0.74521774, "learning_rate": 3.427306278645314e-08, "loss": 0.767205, "num_input_tokens_seen": 169462480, "step": 7840, "time_per_iteration": 2.6454882621765137 }, { "auxiliary_loss_clip": 0.01161577, "auxiliary_loss_mlp": 0.0102274, "balance_loss_clip": 0.89374888, "balance_loss_mlp": 1.01609969, "epoch": 0.9428245055011123, "flos": 22856998567680.0, "grad_norm": 2.0291095461692765, "language_loss": 0.7293241, "learning_rate": 3.4129620130264767e-08, "loss": 0.75116724, "num_input_tokens_seen": 169480840, "step": 7841, "time_per_iteration": 2.690842628479004 }, { "auxiliary_loss_clip": 0.01169919, "auxiliary_loss_mlp": 0.01122111, "balance_loss_clip": 0.97378117, "balance_loss_mlp": 0.0, "epoch": 0.9429447483917514, "flos": 20951757371520.0, "grad_norm": 2.1083745803517058, "language_loss": 0.77644312, "learning_rate": 3.398647569505009e-08, "loss": 0.79936343, "num_input_tokens_seen": 169498265, "step": 7842, "time_per_iteration": 2.6103758811950684 }, { "auxiliary_loss_clip": 0.01167509, "auxiliary_loss_mlp": 0.01025603, "balance_loss_clip": 0.93192911, "balance_loss_mlp": 1.01850426, "epoch": 0.9430649912823904, "flos": 18843658116480.0, "grad_norm": 2.474447519483, "language_loss": 0.7514137, "learning_rate": 3.384362950252373e-08, "loss": 0.77334487, "num_input_tokens_seen": 169515235, "step": 7843, "time_per_iteration": 2.699050188064575 }, { "auxiliary_loss_clip": 0.01158017, "auxiliary_loss_mlp": 0.01022471, "balance_loss_clip": 0.96691078, "balance_loss_mlp": 1.0149281, "epoch": 0.9431852341730296, "flos": 32556038837760.0, "grad_norm": 1.866671214380235, "language_loss": 0.57171643, "learning_rate": 3.3701081574355473e-08, "loss": 0.59352136, "num_input_tokens_seen": 169537195, "step": 7844, "time_per_iteration": 3.530846357345581 }, { "auxiliary_loss_clip": 0.01066937, "auxiliary_loss_mlp": 0.01001596, "balance_loss_clip": 0.93404943, "balance_loss_mlp": 1.00002217, "epoch": 0.9433054770636686, "flos": 66904490252160.0, "grad_norm": 0.6382333058459316, "language_loss": 0.51676798, "learning_rate": 3.3558831932169796e-08, "loss": 0.53745329, "num_input_tokens_seen": 169605865, "step": 7845, "time_per_iteration": 3.318838357925415 }, { "auxiliary_loss_clip": 0.01163958, "auxiliary_loss_mlp": 0.01022587, "balance_loss_clip": 1.01024723, "balance_loss_mlp": 1.01525593, "epoch": 0.9434257199543077, "flos": 26140346916480.0, "grad_norm": 1.7705636075481759, "language_loss": 0.88614154, "learning_rate": 3.341688059754588e-08, "loss": 0.90800691, "num_input_tokens_seen": 169621520, "step": 7846, "time_per_iteration": 2.6570215225219727 }, { "auxiliary_loss_clip": 0.01168088, "auxiliary_loss_mlp": 0.01122405, "balance_loss_clip": 0.9315058, "balance_loss_mlp": 0.0, "epoch": 0.9435459628449467, "flos": 25003486483200.0, "grad_norm": 2.080336021812658, "language_loss": 0.78294653, "learning_rate": 3.327522759201762e-08, "loss": 0.80585146, "num_input_tokens_seen": 169641390, "step": 7847, "time_per_iteration": 2.748180866241455 }, { "auxiliary_loss_clip": 0.01163244, "auxiliary_loss_mlp": 0.010253, "balance_loss_clip": 0.93370575, "balance_loss_mlp": 1.01874018, "epoch": 0.9436662057355859, "flos": 22163240309760.0, "grad_norm": 2.1293473730382315, "language_loss": 0.66608065, "learning_rate": 3.313387293707359e-08, "loss": 0.68796605, "num_input_tokens_seen": 169660095, "step": 7848, "time_per_iteration": 2.713223457336426 }, { "auxiliary_loss_clip": 0.01159536, "auxiliary_loss_mlp": 0.01025027, "balance_loss_clip": 0.93321609, "balance_loss_mlp": 1.01730251, "epoch": 0.943786448626225, "flos": 20118522602880.0, "grad_norm": 1.6691423918796453, "language_loss": 0.68318951, "learning_rate": 3.29928166541571e-08, "loss": 0.70503521, "num_input_tokens_seen": 169679050, "step": 7849, "time_per_iteration": 3.5192737579345703 }, { "auxiliary_loss_clip": 0.01155979, "auxiliary_loss_mlp": 0.01023695, "balance_loss_clip": 0.97067803, "balance_loss_mlp": 1.01738274, "epoch": 0.943906691516864, "flos": 22090808534400.0, "grad_norm": 1.8070561625982906, "language_loss": 0.80167657, "learning_rate": 3.2852058764666346e-08, "loss": 0.82347327, "num_input_tokens_seen": 169698150, "step": 7850, "time_per_iteration": 3.61673903465271 }, { "auxiliary_loss_clip": 0.01155767, "auxiliary_loss_mlp": 0.01028869, "balance_loss_clip": 0.9363392, "balance_loss_mlp": 1.02223539, "epoch": 0.9440269344075032, "flos": 35298501212160.0, "grad_norm": 2.4331885808209637, "language_loss": 0.68414944, "learning_rate": 3.2711599289954264e-08, "loss": 0.7059958, "num_input_tokens_seen": 169722185, "step": 7851, "time_per_iteration": 3.796963930130005 }, { "auxiliary_loss_clip": 0.01160637, "auxiliary_loss_mlp": 0.01028921, "balance_loss_clip": 0.8555603, "balance_loss_mlp": 1.02196229, "epoch": 0.9441471772981422, "flos": 19238136255360.0, "grad_norm": 1.7371044603878278, "language_loss": 0.77721763, "learning_rate": 3.257143825132847e-08, "loss": 0.79911321, "num_input_tokens_seen": 169740355, "step": 7852, "time_per_iteration": 2.747098207473755 }, { "auxiliary_loss_clip": 0.01162432, "auxiliary_loss_mlp": 0.01024019, "balance_loss_clip": 0.96967018, "balance_loss_mlp": 1.0176084, "epoch": 0.9442674201887813, "flos": 25739799379200.0, "grad_norm": 1.9125694018209434, "language_loss": 0.75774151, "learning_rate": 3.243157567005106e-08, "loss": 0.77960604, "num_input_tokens_seen": 169758535, "step": 7853, "time_per_iteration": 2.6672568321228027 }, { "auxiliary_loss_clip": 0.01172127, "auxiliary_loss_mlp": 0.0102792, "balance_loss_clip": 1.05149174, "balance_loss_mlp": 1.02051401, "epoch": 0.9443876630794205, "flos": 15523321737600.0, "grad_norm": 4.324125356409329, "language_loss": 0.64096189, "learning_rate": 3.2292011567339296e-08, "loss": 0.66296238, "num_input_tokens_seen": 169776340, "step": 7854, "time_per_iteration": 2.5501744747161865 }, { "auxiliary_loss_clip": 0.01164339, "auxiliary_loss_mlp": 0.01122088, "balance_loss_clip": 1.00713944, "balance_loss_mlp": 0.0, "epoch": 0.9445079059700595, "flos": 13400821128960.0, "grad_norm": 2.0699707220207784, "language_loss": 0.55520713, "learning_rate": 3.21527459643649e-08, "loss": 0.57807142, "num_input_tokens_seen": 169793225, "step": 7855, "time_per_iteration": 2.5819737911224365 }, { "auxiliary_loss_clip": 0.0116698, "auxiliary_loss_mlp": 0.0102654, "balance_loss_clip": 1.01016068, "balance_loss_mlp": 1.01933992, "epoch": 0.9446281488606986, "flos": 23659242877440.0, "grad_norm": 1.9208047836821491, "language_loss": 0.74053359, "learning_rate": 3.2013778882254536e-08, "loss": 0.76246881, "num_input_tokens_seen": 169812020, "step": 7856, "time_per_iteration": 2.626762866973877 }, { "auxiliary_loss_clip": 0.01153589, "auxiliary_loss_mlp": 0.01028636, "balance_loss_clip": 1.00614142, "balance_loss_mlp": 1.02156067, "epoch": 0.9447483917513377, "flos": 25557337267200.0, "grad_norm": 1.8729994940171835, "language_loss": 0.76131725, "learning_rate": 3.1875110342088676e-08, "loss": 0.78313947, "num_input_tokens_seen": 169833470, "step": 7857, "time_per_iteration": 2.6202352046966553 }, { "auxiliary_loss_clip": 0.01157148, "auxiliary_loss_mlp": 0.01020983, "balance_loss_clip": 0.96889257, "balance_loss_mlp": 1.01398516, "epoch": 0.9448686346419768, "flos": 24535463247360.0, "grad_norm": 1.6801051523527664, "language_loss": 0.65328991, "learning_rate": 3.1736740364904035e-08, "loss": 0.67507124, "num_input_tokens_seen": 169854000, "step": 7858, "time_per_iteration": 2.6351828575134277 }, { "auxiliary_loss_clip": 0.01157906, "auxiliary_loss_mlp": 0.01122038, "balance_loss_clip": 0.89276791, "balance_loss_mlp": 0.0, "epoch": 0.9449888775326158, "flos": 14721256995840.0, "grad_norm": 2.113964980111409, "language_loss": 0.77539283, "learning_rate": 3.159866897169094e-08, "loss": 0.79819232, "num_input_tokens_seen": 169872200, "step": 7859, "time_per_iteration": 2.696394205093384 }, { "auxiliary_loss_clip": 0.01169721, "auxiliary_loss_mlp": 0.01026888, "balance_loss_clip": 0.93222594, "balance_loss_mlp": 1.0190444, "epoch": 0.945109120423255, "flos": 15447873219840.0, "grad_norm": 1.9241140277398836, "language_loss": 0.7578811, "learning_rate": 3.146089618339487e-08, "loss": 0.7798472, "num_input_tokens_seen": 169889055, "step": 7860, "time_per_iteration": 2.633875846862793 }, { "auxiliary_loss_clip": 0.01163305, "auxiliary_loss_mlp": 0.010249, "balance_loss_clip": 0.93253881, "balance_loss_mlp": 1.01752126, "epoch": 0.9452293633138941, "flos": 25448097029760.0, "grad_norm": 1.822902734949673, "language_loss": 0.68175679, "learning_rate": 3.132342202091554e-08, "loss": 0.70363879, "num_input_tokens_seen": 169909280, "step": 7861, "time_per_iteration": 2.7068533897399902 }, { "auxiliary_loss_clip": 0.01167682, "auxiliary_loss_mlp": 0.0102917, "balance_loss_clip": 1.04767454, "balance_loss_mlp": 1.02192187, "epoch": 0.9453496062045331, "flos": 21215342350080.0, "grad_norm": 1.9936136032342193, "language_loss": 0.68645227, "learning_rate": 3.1186246505107595e-08, "loss": 0.70842081, "num_input_tokens_seen": 169928420, "step": 7862, "time_per_iteration": 2.9060235023498535 }, { "auxiliary_loss_clip": 0.01165345, "auxiliary_loss_mlp": 0.01022597, "balance_loss_clip": 1.01150537, "balance_loss_mlp": 1.01518202, "epoch": 0.9454698490951723, "flos": 20010898477440.0, "grad_norm": 2.475599096586741, "language_loss": 0.83714229, "learning_rate": 3.104936965678084e-08, "loss": 0.85902172, "num_input_tokens_seen": 169946750, "step": 7863, "time_per_iteration": 2.5657248497009277 }, { "auxiliary_loss_clip": 0.01166113, "auxiliary_loss_mlp": 0.01026304, "balance_loss_clip": 1.00750256, "balance_loss_mlp": 1.01921988, "epoch": 0.9455900919858113, "flos": 21069652786560.0, "grad_norm": 1.7939546841571445, "language_loss": 0.81933236, "learning_rate": 3.091279149669956e-08, "loss": 0.8412565, "num_input_tokens_seen": 169965540, "step": 7864, "time_per_iteration": 2.6514134407043457 }, { "auxiliary_loss_clip": 0.01164839, "auxiliary_loss_mlp": 0.01122238, "balance_loss_clip": 1.00864911, "balance_loss_mlp": 0.0, "epoch": 0.9457103348764504, "flos": 20740854666240.0, "grad_norm": 1.8159979654339822, "language_loss": 0.73718518, "learning_rate": 3.0776512045581624e-08, "loss": 0.76005602, "num_input_tokens_seen": 169984330, "step": 7865, "time_per_iteration": 2.6358754634857178 }, { "auxiliary_loss_clip": 0.01156162, "auxiliary_loss_mlp": 0.01028279, "balance_loss_clip": 0.96993351, "balance_loss_mlp": 1.02126408, "epoch": 0.9458305777670896, "flos": 21428363957760.0, "grad_norm": 1.7700208416450287, "language_loss": 0.77587068, "learning_rate": 3.0640531324101384e-08, "loss": 0.79771513, "num_input_tokens_seen": 170002095, "step": 7866, "time_per_iteration": 2.6368179321289062 }, { "auxiliary_loss_clip": 0.01168646, "auxiliary_loss_mlp": 0.01027347, "balance_loss_clip": 1.01429033, "balance_loss_mlp": 1.0202179, "epoch": 0.9459508206577286, "flos": 20011185786240.0, "grad_norm": 1.6047924813074832, "language_loss": 0.75799525, "learning_rate": 3.0504849352886554e-08, "loss": 0.77995515, "num_input_tokens_seen": 170020240, "step": 7867, "time_per_iteration": 2.5879054069519043 }, { "auxiliary_loss_clip": 0.01164061, "auxiliary_loss_mlp": 0.01032351, "balance_loss_clip": 1.00946939, "balance_loss_mlp": 1.02501988, "epoch": 0.9460710635483677, "flos": 12166428291840.0, "grad_norm": 2.25285781033494, "language_loss": 0.71514195, "learning_rate": 3.036946615252023e-08, "loss": 0.73710608, "num_input_tokens_seen": 170035770, "step": 7868, "time_per_iteration": 2.604403495788574 }, { "auxiliary_loss_clip": 0.01173009, "auxiliary_loss_mlp": 0.01027072, "balance_loss_clip": 0.97288465, "balance_loss_mlp": 1.02014613, "epoch": 0.9461913064390068, "flos": 34276196229120.0, "grad_norm": 2.1939854752324255, "language_loss": 0.66548753, "learning_rate": 3.0234381743539984e-08, "loss": 0.68748832, "num_input_tokens_seen": 170053385, "step": 7869, "time_per_iteration": 3.5555689334869385 }, { "auxiliary_loss_clip": 0.01168581, "auxiliary_loss_mlp": 0.01029818, "balance_loss_clip": 0.97128296, "balance_loss_mlp": 1.02311504, "epoch": 0.9463115493296459, "flos": 19463763536640.0, "grad_norm": 3.708931348022523, "language_loss": 0.79975879, "learning_rate": 3.0099596146437863e-08, "loss": 0.82174277, "num_input_tokens_seen": 170070490, "step": 7870, "time_per_iteration": 2.656870126724243 }, { "auxiliary_loss_clip": 0.01058789, "auxiliary_loss_mlp": 0.01001363, "balance_loss_clip": 1.00795996, "balance_loss_mlp": 0.99977702, "epoch": 0.946431792220285, "flos": 70570824387840.0, "grad_norm": 0.7833842277656571, "language_loss": 0.60105145, "learning_rate": 2.996510938166086e-08, "loss": 0.62165296, "num_input_tokens_seen": 170133465, "step": 7871, "time_per_iteration": 3.242858409881592 }, { "auxiliary_loss_clip": 0.01164719, "auxiliary_loss_mlp": 0.01026566, "balance_loss_clip": 1.01025414, "balance_loss_mlp": 1.01988769, "epoch": 0.9465520351109241, "flos": 18947906363520.0, "grad_norm": 2.0715164212268333, "language_loss": 0.73430872, "learning_rate": 2.983092146960997e-08, "loss": 0.75622153, "num_input_tokens_seen": 170150810, "step": 7872, "time_per_iteration": 2.545348644256592 }, { "auxiliary_loss_clip": 0.01164494, "auxiliary_loss_mlp": 0.01026234, "balance_loss_clip": 0.97047132, "balance_loss_mlp": 1.01905215, "epoch": 0.9466722780015632, "flos": 19135647774720.0, "grad_norm": 2.1417865666632006, "language_loss": 0.79972547, "learning_rate": 2.9697032430642256e-08, "loss": 0.8216328, "num_input_tokens_seen": 170169025, "step": 7873, "time_per_iteration": 2.7256603240966797 }, { "auxiliary_loss_clip": 0.01164282, "auxiliary_loss_mlp": 0.01023577, "balance_loss_clip": 1.04821515, "balance_loss_mlp": 1.01725304, "epoch": 0.9467925208922022, "flos": 17237912520960.0, "grad_norm": 2.383711951565977, "language_loss": 0.73253727, "learning_rate": 2.9563442285067906e-08, "loss": 0.75441587, "num_input_tokens_seen": 170186070, "step": 7874, "time_per_iteration": 2.522993803024292 }, { "auxiliary_loss_clip": 0.01170138, "auxiliary_loss_mlp": 0.01031661, "balance_loss_clip": 1.01185715, "balance_loss_mlp": 1.02436268, "epoch": 0.9469127637828414, "flos": 29169016859520.0, "grad_norm": 1.9811642941040155, "language_loss": 0.79792261, "learning_rate": 2.943015105315294e-08, "loss": 0.81994069, "num_input_tokens_seen": 170206265, "step": 7875, "time_per_iteration": 3.6599960327148438 }, { "auxiliary_loss_clip": 0.01165747, "auxiliary_loss_mlp": 0.0103076, "balance_loss_clip": 0.89554286, "balance_loss_mlp": 1.0231427, "epoch": 0.9470330066734804, "flos": 26030460234240.0, "grad_norm": 2.564954420843524, "language_loss": 0.66225898, "learning_rate": 2.929715875511718e-08, "loss": 0.68422413, "num_input_tokens_seen": 170225300, "step": 7876, "time_per_iteration": 2.7261741161346436 }, { "auxiliary_loss_clip": 0.01163139, "auxiliary_loss_mlp": 0.01022775, "balance_loss_clip": 1.0047555, "balance_loss_mlp": 1.01580477, "epoch": 0.9471532495641195, "flos": 23440906056960.0, "grad_norm": 1.79275889038004, "language_loss": 0.70182729, "learning_rate": 2.9164465411135375e-08, "loss": 0.72368646, "num_input_tokens_seen": 170245070, "step": 7877, "time_per_iteration": 3.5480384826660156 }, { "auxiliary_loss_clip": 0.01167678, "auxiliary_loss_mlp": 0.01031066, "balance_loss_clip": 1.01251888, "balance_loss_mlp": 1.02427745, "epoch": 0.9472734924547586, "flos": 15815850099840.0, "grad_norm": 1.6987812989376085, "language_loss": 0.80766892, "learning_rate": 2.9032071041337426e-08, "loss": 0.82965636, "num_input_tokens_seen": 170263305, "step": 7878, "time_per_iteration": 3.5026586055755615 }, { "auxiliary_loss_clip": 0.01155467, "auxiliary_loss_mlp": 0.01024088, "balance_loss_clip": 0.97001255, "balance_loss_mlp": 1.01707518, "epoch": 0.9473937353453977, "flos": 11181793697280.0, "grad_norm": 2.76315090909936, "language_loss": 0.72439456, "learning_rate": 2.889997566580704e-08, "loss": 0.74619007, "num_input_tokens_seen": 170281460, "step": 7879, "time_per_iteration": 2.6600470542907715 }, { "auxiliary_loss_clip": 0.01168325, "auxiliary_loss_mlp": 0.01028587, "balance_loss_clip": 1.04706383, "balance_loss_mlp": 1.02126122, "epoch": 0.9475139782360368, "flos": 25775530433280.0, "grad_norm": 2.1528585984511386, "language_loss": 0.70211697, "learning_rate": 2.8768179304583086e-08, "loss": 0.72408611, "num_input_tokens_seen": 170303515, "step": 7880, "time_per_iteration": 2.6324424743652344 }, { "auxiliary_loss_clip": 0.01164823, "auxiliary_loss_mlp": 0.0102769, "balance_loss_clip": 0.93625891, "balance_loss_mlp": 1.02026367, "epoch": 0.9476342211266758, "flos": 22820046451200.0, "grad_norm": 1.5684865346826076, "language_loss": 0.73424131, "learning_rate": 2.8636681977659117e-08, "loss": 0.75616646, "num_input_tokens_seen": 170323165, "step": 7881, "time_per_iteration": 2.731877088546753 }, { "auxiliary_loss_clip": 0.0115886, "auxiliary_loss_mlp": 0.01025438, "balance_loss_clip": 0.89648032, "balance_loss_mlp": 1.01806819, "epoch": 0.947754464017315, "flos": 20193611984640.0, "grad_norm": 2.0730691025197934, "language_loss": 0.77848846, "learning_rate": 2.850548370498318e-08, "loss": 0.80033141, "num_input_tokens_seen": 170341005, "step": 7882, "time_per_iteration": 2.67396879196167 }, { "auxiliary_loss_clip": 0.01161956, "auxiliary_loss_mlp": 0.01022874, "balance_loss_clip": 1.00743639, "balance_loss_mlp": 1.01643419, "epoch": 0.9478747069079541, "flos": 24717925359360.0, "grad_norm": 1.48898281926179, "language_loss": 0.7153275, "learning_rate": 2.8374584506457798e-08, "loss": 0.73717582, "num_input_tokens_seen": 170362280, "step": 7883, "time_per_iteration": 2.686976432800293 }, { "auxiliary_loss_clip": 0.01160455, "auxiliary_loss_mlp": 0.01022511, "balance_loss_clip": 0.97067201, "balance_loss_mlp": 1.01571608, "epoch": 0.9479949497985931, "flos": 21361355136000.0, "grad_norm": 2.2298018923821417, "language_loss": 0.67590714, "learning_rate": 2.824398440193998e-08, "loss": 0.69773686, "num_input_tokens_seen": 170381080, "step": 7884, "time_per_iteration": 2.695977210998535 }, { "auxiliary_loss_clip": 0.01156458, "auxiliary_loss_mlp": 0.01026401, "balance_loss_clip": 0.89380455, "balance_loss_mlp": 1.01890516, "epoch": 0.9481151926892323, "flos": 18148606968960.0, "grad_norm": 2.001764279672521, "language_loss": 0.71683496, "learning_rate": 2.811368341124232e-08, "loss": 0.73866355, "num_input_tokens_seen": 170400150, "step": 7885, "time_per_iteration": 2.7783243656158447 }, { "auxiliary_loss_clip": 0.01163221, "auxiliary_loss_mlp": 0.01023765, "balance_loss_clip": 0.97132343, "balance_loss_mlp": 1.01726246, "epoch": 0.9482354355798713, "flos": 22128012046080.0, "grad_norm": 2.447758844672781, "language_loss": 0.68122137, "learning_rate": 2.7983681554131222e-08, "loss": 0.70309126, "num_input_tokens_seen": 170420410, "step": 7886, "time_per_iteration": 2.6308422088623047 }, { "auxiliary_loss_clip": 0.01162058, "auxiliary_loss_mlp": 0.01026477, "balance_loss_clip": 0.96984899, "balance_loss_mlp": 1.01894605, "epoch": 0.9483556784705104, "flos": 19063072344960.0, "grad_norm": 2.298237974173595, "language_loss": 0.70159686, "learning_rate": 2.7853978850327365e-08, "loss": 0.72348225, "num_input_tokens_seen": 170439580, "step": 7887, "time_per_iteration": 2.663376808166504 }, { "auxiliary_loss_clip": 0.01166006, "auxiliary_loss_mlp": 0.01020284, "balance_loss_clip": 0.93785954, "balance_loss_mlp": 1.01393652, "epoch": 0.9484759213611496, "flos": 25777110631680.0, "grad_norm": 1.7068739769347843, "language_loss": 0.87312734, "learning_rate": 2.7724575319507225e-08, "loss": 0.89499027, "num_input_tokens_seen": 170459290, "step": 7888, "time_per_iteration": 2.7077057361602783 }, { "auxiliary_loss_clip": 0.01160621, "auxiliary_loss_mlp": 0.01025559, "balance_loss_clip": 1.00457811, "balance_loss_mlp": 1.01918983, "epoch": 0.9485961642517886, "flos": 20667740532480.0, "grad_norm": 1.8259899611520571, "language_loss": 0.76820052, "learning_rate": 2.759547098130044e-08, "loss": 0.79006231, "num_input_tokens_seen": 170478020, "step": 7889, "time_per_iteration": 2.669081687927246 }, { "auxiliary_loss_clip": 0.01165882, "auxiliary_loss_mlp": 0.01026698, "balance_loss_clip": 1.04838562, "balance_loss_mlp": 1.01952195, "epoch": 0.9487164071424277, "flos": 22674069578880.0, "grad_norm": 1.7352058202011775, "language_loss": 0.76726437, "learning_rate": 2.746666585529267e-08, "loss": 0.78919017, "num_input_tokens_seen": 170498295, "step": 7890, "time_per_iteration": 2.7262308597564697 }, { "auxiliary_loss_clip": 0.01155932, "auxiliary_loss_mlp": 0.01026133, "balance_loss_clip": 1.00705612, "balance_loss_mlp": 1.01903701, "epoch": 0.9488366500330668, "flos": 38726461716480.0, "grad_norm": 2.0795507389191306, "language_loss": 0.74016583, "learning_rate": 2.73381599610234e-08, "loss": 0.76198649, "num_input_tokens_seen": 170518695, "step": 7891, "time_per_iteration": 2.756380319595337 }, { "auxiliary_loss_clip": 0.01156688, "auxiliary_loss_mlp": 0.01026872, "balance_loss_clip": 1.00585437, "balance_loss_mlp": 1.02035713, "epoch": 0.9489568929237059, "flos": 27890920149120.0, "grad_norm": 1.889471976185711, "language_loss": 0.71539783, "learning_rate": 2.7209953317987033e-08, "loss": 0.7372334, "num_input_tokens_seen": 170539735, "step": 7892, "time_per_iteration": 2.6505868434906006 }, { "auxiliary_loss_clip": 0.01166436, "auxiliary_loss_mlp": 0.01026917, "balance_loss_clip": 1.01035416, "balance_loss_mlp": 1.01969838, "epoch": 0.9490771358143449, "flos": 33580642291200.0, "grad_norm": 1.9656801622116833, "language_loss": 0.78061795, "learning_rate": 2.7082045945631793e-08, "loss": 0.80255145, "num_input_tokens_seen": 170561950, "step": 7893, "time_per_iteration": 2.756270170211792 }, { "auxiliary_loss_clip": 0.01151562, "auxiliary_loss_mlp": 0.01027081, "balance_loss_clip": 0.93113261, "balance_loss_mlp": 1.02030134, "epoch": 0.9491973787049841, "flos": 14793796512000.0, "grad_norm": 11.388086472271262, "language_loss": 0.69286788, "learning_rate": 2.6954437863361712e-08, "loss": 0.71465433, "num_input_tokens_seen": 170579865, "step": 7894, "time_per_iteration": 2.658958911895752 }, { "auxiliary_loss_clip": 0.01162605, "auxiliary_loss_mlp": 0.01022006, "balance_loss_clip": 0.85604966, "balance_loss_mlp": 1.01572359, "epoch": 0.9493176215956232, "flos": 25332535998720.0, "grad_norm": 1.8433228029783886, "language_loss": 0.70815742, "learning_rate": 2.6827129090534862e-08, "loss": 0.73000348, "num_input_tokens_seen": 170600165, "step": 7895, "time_per_iteration": 3.581725835800171 }, { "auxiliary_loss_clip": 0.01165978, "auxiliary_loss_mlp": 0.01026845, "balance_loss_clip": 0.97368848, "balance_loss_mlp": 1.01998758, "epoch": 0.9494378644862622, "flos": 21029971236480.0, "grad_norm": 1.8230206083924914, "language_loss": 0.77909255, "learning_rate": 2.670011964646335e-08, "loss": 0.8010208, "num_input_tokens_seen": 170618845, "step": 7896, "time_per_iteration": 2.632222890853882 }, { "auxiliary_loss_clip": 0.01161301, "auxiliary_loss_mlp": 0.01031655, "balance_loss_clip": 0.81163675, "balance_loss_mlp": 1.02395427, "epoch": 0.9495581073769014, "flos": 15195134148480.0, "grad_norm": 2.9715121591738947, "language_loss": 0.68176317, "learning_rate": 2.657340955041487e-08, "loss": 0.70369267, "num_input_tokens_seen": 170637620, "step": 7897, "time_per_iteration": 2.8814780712127686 }, { "auxiliary_loss_clip": 0.01167349, "auxiliary_loss_mlp": 0.01029908, "balance_loss_clip": 0.97381127, "balance_loss_mlp": 1.02266026, "epoch": 0.9496783502675404, "flos": 28616566705920.0, "grad_norm": 1.8163513388174457, "language_loss": 0.71324003, "learning_rate": 2.6446998821611167e-08, "loss": 0.73521256, "num_input_tokens_seen": 170657815, "step": 7898, "time_per_iteration": 2.8373050689697266 }, { "auxiliary_loss_clip": 0.01162439, "auxiliary_loss_mlp": 0.01031009, "balance_loss_clip": 0.89375389, "balance_loss_mlp": 1.02400589, "epoch": 0.9497985931581795, "flos": 14866874732160.0, "grad_norm": 2.8262505989129316, "language_loss": 0.7161057, "learning_rate": 2.6320887479228228e-08, "loss": 0.73804015, "num_input_tokens_seen": 170674415, "step": 7899, "time_per_iteration": 2.643317937850952 }, { "auxiliary_loss_clip": 0.0116671, "auxiliary_loss_mlp": 0.01024539, "balance_loss_clip": 0.97067422, "balance_loss_mlp": 1.01730871, "epoch": 0.9499188360488187, "flos": 27193319136000.0, "grad_norm": 2.0266141305408056, "language_loss": 0.72030252, "learning_rate": 2.619507554239786e-08, "loss": 0.74221492, "num_input_tokens_seen": 170692975, "step": 7900, "time_per_iteration": 3.6428062915802 }, { "auxiliary_loss_clip": 0.01164907, "auxiliary_loss_mlp": 0.01024537, "balance_loss_clip": 0.97009885, "balance_loss_mlp": 1.01728857, "epoch": 0.9500390789394577, "flos": 24316479982080.0, "grad_norm": 1.8764079852440294, "language_loss": 0.69958562, "learning_rate": 2.606956303020502e-08, "loss": 0.72148007, "num_input_tokens_seen": 170713780, "step": 7901, "time_per_iteration": 2.661123752593994 }, { "auxiliary_loss_clip": 0.01165473, "auxiliary_loss_mlp": 0.01026042, "balance_loss_clip": 1.01150417, "balance_loss_mlp": 1.01905322, "epoch": 0.9501593218300968, "flos": 14354752573440.0, "grad_norm": 1.7563273833974584, "language_loss": 0.84316874, "learning_rate": 2.5944349961690036e-08, "loss": 0.86508387, "num_input_tokens_seen": 170730800, "step": 7902, "time_per_iteration": 3.3823349475860596 }, { "auxiliary_loss_clip": 0.01163533, "auxiliary_loss_mlp": 0.0102363, "balance_loss_clip": 0.93287706, "balance_loss_mlp": 1.01646006, "epoch": 0.9502795647207359, "flos": 38728113742080.0, "grad_norm": 1.648850752980011, "language_loss": 0.7294879, "learning_rate": 2.581943635584749e-08, "loss": 0.75135946, "num_input_tokens_seen": 170753630, "step": 7903, "time_per_iteration": 3.740596055984497 }, { "auxiliary_loss_clip": 0.01153977, "auxiliary_loss_mlp": 0.01027133, "balance_loss_clip": 0.96926725, "balance_loss_mlp": 1.02048683, "epoch": 0.950399807611375, "flos": 40808023799040.0, "grad_norm": 1.4459911554406082, "language_loss": 0.64818698, "learning_rate": 2.569482223162689e-08, "loss": 0.66999811, "num_input_tokens_seen": 170777605, "step": 7904, "time_per_iteration": 2.8156728744506836 }, { "auxiliary_loss_clip": 0.01165937, "auxiliary_loss_mlp": 0.01027061, "balance_loss_clip": 1.00895751, "balance_loss_mlp": 1.01919305, "epoch": 0.950520050502014, "flos": 23440403266560.0, "grad_norm": 1.619353962087769, "language_loss": 0.72439975, "learning_rate": 2.5570507607932e-08, "loss": 0.74632978, "num_input_tokens_seen": 170797520, "step": 7905, "time_per_iteration": 2.6494545936584473 }, { "auxiliary_loss_clip": 0.0116768, "auxiliary_loss_mlp": 0.0102456, "balance_loss_clip": 1.0091114, "balance_loss_mlp": 1.01719332, "epoch": 0.9506402933926532, "flos": 17783718658560.0, "grad_norm": 3.0161229654128245, "language_loss": 0.63638598, "learning_rate": 2.54464925036213e-08, "loss": 0.65830839, "num_input_tokens_seen": 170814810, "step": 7906, "time_per_iteration": 2.6428706645965576 }, { "auxiliary_loss_clip": 0.01162872, "auxiliary_loss_mlp": 0.01020822, "balance_loss_clip": 1.00872087, "balance_loss_mlp": 1.0138247, "epoch": 0.9507605362832923, "flos": 32561928668160.0, "grad_norm": 1.765529953465917, "language_loss": 0.60310328, "learning_rate": 2.532277693750773e-08, "loss": 0.62494028, "num_input_tokens_seen": 170835735, "step": 7907, "time_per_iteration": 2.703974723815918 }, { "auxiliary_loss_clip": 0.01163782, "auxiliary_loss_mlp": 0.010269, "balance_loss_clip": 0.89946771, "balance_loss_mlp": 1.019557, "epoch": 0.9508807791739313, "flos": 19602054898560.0, "grad_norm": 1.9443127914223564, "language_loss": 0.76039869, "learning_rate": 2.5199360928358948e-08, "loss": 0.78230548, "num_input_tokens_seen": 170852970, "step": 7908, "time_per_iteration": 2.7798025608062744 }, { "auxiliary_loss_clip": 0.01153188, "auxiliary_loss_mlp": 0.01122169, "balance_loss_clip": 1.00644076, "balance_loss_mlp": 0.0, "epoch": 0.9510010220645704, "flos": 21471852349440.0, "grad_norm": 1.6524909997404214, "language_loss": 0.87143874, "learning_rate": 2.507624449489665e-08, "loss": 0.89419228, "num_input_tokens_seen": 170871600, "step": 7909, "time_per_iteration": 2.648045778274536 }, { "auxiliary_loss_clip": 0.0116444, "auxiliary_loss_mlp": 0.01025075, "balance_loss_clip": 0.97147995, "balance_loss_mlp": 1.01828957, "epoch": 0.9511212649552095, "flos": 18879999701760.0, "grad_norm": 1.8406683536684376, "language_loss": 0.64696693, "learning_rate": 2.495342765579811e-08, "loss": 0.6688621, "num_input_tokens_seen": 170890260, "step": 7910, "time_per_iteration": 2.6390087604522705 }, { "auxiliary_loss_clip": 0.01161991, "auxiliary_loss_mlp": 0.01024881, "balance_loss_clip": 0.8960644, "balance_loss_mlp": 1.01793671, "epoch": 0.9512415078458486, "flos": 20810521094400.0, "grad_norm": 1.6241632344935508, "language_loss": 0.70996165, "learning_rate": 2.4830910429693984e-08, "loss": 0.73183036, "num_input_tokens_seen": 170910220, "step": 7911, "time_per_iteration": 2.7086427211761475 }, { "auxiliary_loss_clip": 0.01164564, "auxiliary_loss_mlp": 0.01020733, "balance_loss_clip": 1.04632187, "balance_loss_mlp": 1.01401889, "epoch": 0.9513617507364877, "flos": 18369565482240.0, "grad_norm": 2.29397293672201, "language_loss": 0.79656959, "learning_rate": 2.470869283517052e-08, "loss": 0.8184225, "num_input_tokens_seen": 170928255, "step": 7912, "time_per_iteration": 2.53609299659729 }, { "auxiliary_loss_clip": 0.01154834, "auxiliary_loss_mlp": 0.01025327, "balance_loss_clip": 1.00536704, "balance_loss_mlp": 1.01914275, "epoch": 0.9514819936271268, "flos": 25010166412800.0, "grad_norm": 1.5637351626784048, "language_loss": 0.76936555, "learning_rate": 2.458677489076777e-08, "loss": 0.79116714, "num_input_tokens_seen": 170949265, "step": 7913, "time_per_iteration": 2.5988576412200928 }, { "auxiliary_loss_clip": 0.01153567, "auxiliary_loss_mlp": 0.01021917, "balance_loss_clip": 1.00555992, "balance_loss_mlp": 1.01544988, "epoch": 0.9516022365177659, "flos": 18662129758080.0, "grad_norm": 3.4671664120917405, "language_loss": 0.82755816, "learning_rate": 2.446515661498072e-08, "loss": 0.84931302, "num_input_tokens_seen": 170968595, "step": 7914, "time_per_iteration": 2.6148383617401123 }, { "auxiliary_loss_clip": 0.01160786, "auxiliary_loss_mlp": 0.01025636, "balance_loss_clip": 0.85575509, "balance_loss_mlp": 1.01846218, "epoch": 0.9517224794084049, "flos": 25372109808000.0, "grad_norm": 2.2245002505791627, "language_loss": 0.7435236, "learning_rate": 2.434383802625861e-08, "loss": 0.76538777, "num_input_tokens_seen": 170987550, "step": 7915, "time_per_iteration": 2.831868886947632 }, { "auxiliary_loss_clip": 0.0116387, "auxiliary_loss_mlp": 0.01025249, "balance_loss_clip": 0.93154949, "balance_loss_mlp": 1.01878428, "epoch": 0.9518427222990441, "flos": 21470918595840.0, "grad_norm": 1.8250127851158722, "language_loss": 0.73738176, "learning_rate": 2.4222819143005168e-08, "loss": 0.75927293, "num_input_tokens_seen": 171007145, "step": 7916, "time_per_iteration": 2.755688428878784 }, { "auxiliary_loss_clip": 0.01165201, "auxiliary_loss_mlp": 0.01019811, "balance_loss_clip": 1.0480895, "balance_loss_mlp": 1.01327264, "epoch": 0.9519629651896832, "flos": 21033634423680.0, "grad_norm": 1.796852686381444, "language_loss": 0.80897939, "learning_rate": 2.4102099983579706e-08, "loss": 0.8308295, "num_input_tokens_seen": 171026295, "step": 7917, "time_per_iteration": 2.716134786605835 }, { "auxiliary_loss_clip": 0.01164021, "auxiliary_loss_mlp": 0.01032098, "balance_loss_clip": 1.00852931, "balance_loss_mlp": 1.02514553, "epoch": 0.9520832080803222, "flos": 21689219502720.0, "grad_norm": 1.7274559251834345, "language_loss": 0.7723068, "learning_rate": 2.3981680566294236e-08, "loss": 0.79426801, "num_input_tokens_seen": 171045895, "step": 7918, "time_per_iteration": 2.697411298751831 }, { "auxiliary_loss_clip": 0.01165297, "auxiliary_loss_mlp": 0.01025702, "balance_loss_clip": 1.04859376, "balance_loss_mlp": 1.0190115, "epoch": 0.9522034509709614, "flos": 23145289125120.0, "grad_norm": 1.682284702489171, "language_loss": 0.73448032, "learning_rate": 2.3861560909416822e-08, "loss": 0.75639033, "num_input_tokens_seen": 171065445, "step": 7919, "time_per_iteration": 2.57793927192688 }, { "auxiliary_loss_clip": 0.01161288, "auxiliary_loss_mlp": 0.01021674, "balance_loss_clip": 0.89556056, "balance_loss_mlp": 1.01487947, "epoch": 0.9523236938616004, "flos": 24679428958080.0, "grad_norm": 1.7129398441276247, "language_loss": 0.82411575, "learning_rate": 2.3741741031169325e-08, "loss": 0.84594542, "num_input_tokens_seen": 171085015, "step": 7920, "time_per_iteration": 2.7264702320098877 }, { "auxiliary_loss_clip": 0.01155013, "auxiliary_loss_mlp": 0.0103024, "balance_loss_clip": 0.89084864, "balance_loss_mlp": 1.02327859, "epoch": 0.9524439367522395, "flos": 22672309812480.0, "grad_norm": 1.6698585650319209, "language_loss": 0.71718919, "learning_rate": 2.3622220949728544e-08, "loss": 0.73904169, "num_input_tokens_seen": 171103900, "step": 7921, "time_per_iteration": 3.5435211658477783 }, { "auxiliary_loss_clip": 0.01156615, "auxiliary_loss_mlp": 0.01028843, "balance_loss_clip": 1.00656974, "balance_loss_mlp": 1.02157688, "epoch": 0.9525641796428787, "flos": 34055525024640.0, "grad_norm": 2.3063016083820203, "language_loss": 0.61102855, "learning_rate": 2.3503000683225526e-08, "loss": 0.63288313, "num_input_tokens_seen": 171121615, "step": 7922, "time_per_iteration": 2.6871187686920166 }, { "auxiliary_loss_clip": 0.01165856, "auxiliary_loss_mlp": 0.01024799, "balance_loss_clip": 1.04644084, "balance_loss_mlp": 1.01739597, "epoch": 0.9526844225335177, "flos": 16727083251840.0, "grad_norm": 1.9141785130980655, "language_loss": 0.84404469, "learning_rate": 2.3384080249745585e-08, "loss": 0.86595118, "num_input_tokens_seen": 171139505, "step": 7923, "time_per_iteration": 2.5636794567108154 }, { "auxiliary_loss_clip": 0.0116604, "auxiliary_loss_mlp": 0.01026058, "balance_loss_clip": 0.89380115, "balance_loss_mlp": 1.01872969, "epoch": 0.9528046654241568, "flos": 36939367330560.0, "grad_norm": 2.7254135350464064, "language_loss": 0.83456707, "learning_rate": 2.3265459667329178e-08, "loss": 0.85648799, "num_input_tokens_seen": 171158995, "step": 7924, "time_per_iteration": 2.796412229537964 }, { "auxiliary_loss_clip": 0.01164245, "auxiliary_loss_mlp": 0.0102385, "balance_loss_clip": 0.97172534, "balance_loss_mlp": 1.01682258, "epoch": 0.9529249083147959, "flos": 18255010032000.0, "grad_norm": 2.1336377034638185, "language_loss": 0.8649475, "learning_rate": 2.31471389539708e-08, "loss": 0.88682848, "num_input_tokens_seen": 171176120, "step": 7925, "time_per_iteration": 2.6228413581848145 }, { "auxiliary_loss_clip": 0.01167174, "auxiliary_loss_mlp": 0.01121719, "balance_loss_clip": 1.01081622, "balance_loss_mlp": 0.0, "epoch": 0.953045151205435, "flos": 28658438985600.0, "grad_norm": 2.062682496597517, "language_loss": 0.72627336, "learning_rate": 2.3029118127619872e-08, "loss": 0.74916232, "num_input_tokens_seen": 171195835, "step": 7926, "time_per_iteration": 2.674788236618042 }, { "auxiliary_loss_clip": 0.011562, "auxiliary_loss_mlp": 0.01022927, "balance_loss_clip": 0.97042787, "balance_loss_mlp": 1.01621282, "epoch": 0.953165394096074, "flos": 21835232288640.0, "grad_norm": 2.10635334561346, "language_loss": 0.86674958, "learning_rate": 2.2911397206179628e-08, "loss": 0.88854086, "num_input_tokens_seen": 171212585, "step": 7927, "time_per_iteration": 4.063323736190796 }, { "auxiliary_loss_clip": 0.01166968, "auxiliary_loss_mlp": 0.010245, "balance_loss_clip": 1.04899025, "balance_loss_mlp": 1.01741278, "epoch": 0.9532856369867132, "flos": 19975059682560.0, "grad_norm": 1.837213850866575, "language_loss": 0.62750876, "learning_rate": 2.279397620750845e-08, "loss": 0.64942336, "num_input_tokens_seen": 171231630, "step": 7928, "time_per_iteration": 3.338597297668457 }, { "auxiliary_loss_clip": 0.01158128, "auxiliary_loss_mlp": 0.01026512, "balance_loss_clip": 0.96911305, "balance_loss_mlp": 1.01959229, "epoch": 0.9534058798773523, "flos": 15049588239360.0, "grad_norm": 1.940782302010755, "language_loss": 0.78803122, "learning_rate": 2.2676855149419195e-08, "loss": 0.80987763, "num_input_tokens_seen": 171248800, "step": 7929, "time_per_iteration": 3.5027289390563965 }, { "auxiliary_loss_clip": 0.01165891, "auxiliary_loss_mlp": 0.01025422, "balance_loss_clip": 0.97577018, "balance_loss_mlp": 1.01847196, "epoch": 0.9535261227679913, "flos": 17602800831360.0, "grad_norm": 2.636783440766199, "language_loss": 0.75549573, "learning_rate": 2.2560034049678988e-08, "loss": 0.77740884, "num_input_tokens_seen": 171263150, "step": 7930, "time_per_iteration": 2.6282880306243896 }, { "auxiliary_loss_clip": 0.01171304, "auxiliary_loss_mlp": 0.01025398, "balance_loss_clip": 1.05004334, "balance_loss_mlp": 1.01844478, "epoch": 0.9536463656586305, "flos": 23142954741120.0, "grad_norm": 1.6869601365597904, "language_loss": 0.75611234, "learning_rate": 2.2443512926008988e-08, "loss": 0.77807933, "num_input_tokens_seen": 171282480, "step": 7931, "time_per_iteration": 2.5523829460144043 }, { "auxiliary_loss_clip": 0.01166184, "auxiliary_loss_mlp": 0.01026797, "balance_loss_clip": 0.93229771, "balance_loss_mlp": 1.01973736, "epoch": 0.9537666085492695, "flos": 18625033987200.0, "grad_norm": 2.2338459257035996, "language_loss": 0.69528455, "learning_rate": 2.2327291796085946e-08, "loss": 0.71721435, "num_input_tokens_seen": 171300840, "step": 7932, "time_per_iteration": 2.6401474475860596 }, { "auxiliary_loss_clip": 0.01167634, "auxiliary_loss_mlp": 0.01026917, "balance_loss_clip": 1.04815495, "balance_loss_mlp": 1.01973438, "epoch": 0.9538868514399086, "flos": 18989347680000.0, "grad_norm": 2.37601757713046, "language_loss": 0.77101719, "learning_rate": 2.2211370677540197e-08, "loss": 0.79296267, "num_input_tokens_seen": 171317365, "step": 7933, "time_per_iteration": 2.5611658096313477 }, { "auxiliary_loss_clip": 0.01167369, "auxiliary_loss_mlp": 0.01025545, "balance_loss_clip": 1.04805899, "balance_loss_mlp": 1.0188694, "epoch": 0.9540070943305478, "flos": 16800556521600.0, "grad_norm": 2.3735185377230605, "language_loss": 0.7824223, "learning_rate": 2.2095749587957012e-08, "loss": 0.80435145, "num_input_tokens_seen": 171335270, "step": 7934, "time_per_iteration": 2.545482635498047 }, { "auxiliary_loss_clip": 0.01155168, "auxiliary_loss_mlp": 0.01028356, "balance_loss_clip": 0.96608496, "balance_loss_mlp": 1.02094126, "epoch": 0.9541273372211868, "flos": 20156911263360.0, "grad_norm": 1.9395453029360152, "language_loss": 0.69669378, "learning_rate": 2.1980428544876138e-08, "loss": 0.71852899, "num_input_tokens_seen": 171353910, "step": 7935, "time_per_iteration": 2.6766748428344727 }, { "auxiliary_loss_clip": 0.01149926, "auxiliary_loss_mlp": 0.0102687, "balance_loss_clip": 0.88770896, "balance_loss_mlp": 1.01911879, "epoch": 0.9542475801118259, "flos": 26725511381760.0, "grad_norm": 1.605147738779308, "language_loss": 0.73943472, "learning_rate": 2.1865407565791584e-08, "loss": 0.76120269, "num_input_tokens_seen": 171375480, "step": 7936, "time_per_iteration": 2.7444136142730713 }, { "auxiliary_loss_clip": 0.01161618, "auxiliary_loss_mlp": 0.01025382, "balance_loss_clip": 0.96840239, "balance_loss_mlp": 1.01767516, "epoch": 0.954367823002465, "flos": 23330911633920.0, "grad_norm": 2.0738705467260683, "language_loss": 0.7732774, "learning_rate": 2.175068666815183e-08, "loss": 0.79514736, "num_input_tokens_seen": 171396320, "step": 7937, "time_per_iteration": 2.6619813442230225 }, { "auxiliary_loss_clip": 0.01161871, "auxiliary_loss_mlp": 0.01030293, "balance_loss_clip": 0.93166256, "balance_loss_mlp": 1.02280664, "epoch": 0.9544880658931041, "flos": 14902713527040.0, "grad_norm": 1.9297682776669784, "language_loss": 0.78801143, "learning_rate": 2.163626586935985e-08, "loss": 0.80993313, "num_input_tokens_seen": 171412860, "step": 7938, "time_per_iteration": 2.7442359924316406 }, { "auxiliary_loss_clip": 0.01159927, "auxiliary_loss_mlp": 0.01029132, "balance_loss_clip": 1.00762355, "balance_loss_mlp": 1.02194989, "epoch": 0.9546083087837431, "flos": 29095902725760.0, "grad_norm": 2.023337697795824, "language_loss": 0.62746692, "learning_rate": 2.1522145186773755e-08, "loss": 0.6493575, "num_input_tokens_seen": 171431780, "step": 7939, "time_per_iteration": 2.6982197761535645 }, { "auxiliary_loss_clip": 0.01163873, "auxiliary_loss_mlp": 0.01020413, "balance_loss_clip": 0.97191638, "balance_loss_mlp": 1.01393688, "epoch": 0.9547285516743822, "flos": 21142335957120.0, "grad_norm": 2.0275823820423677, "language_loss": 0.85538101, "learning_rate": 2.140832463770481e-08, "loss": 0.87722385, "num_input_tokens_seen": 171450975, "step": 7940, "time_per_iteration": 2.671293020248413 }, { "auxiliary_loss_clip": 0.01166656, "auxiliary_loss_mlp": 0.01027181, "balance_loss_clip": 0.96952087, "balance_loss_mlp": 1.02037406, "epoch": 0.9548487945650214, "flos": 27490157130240.0, "grad_norm": 4.1249585642241655, "language_loss": 0.76014292, "learning_rate": 2.129480423941987e-08, "loss": 0.78208131, "num_input_tokens_seen": 171467645, "step": 7941, "time_per_iteration": 2.6775224208831787 }, { "auxiliary_loss_clip": 0.01166735, "auxiliary_loss_mlp": 0.01027303, "balance_loss_clip": 0.97137594, "balance_loss_mlp": 1.02041268, "epoch": 0.9549690374556604, "flos": 22273198819200.0, "grad_norm": 2.054284530825727, "language_loss": 0.80384141, "learning_rate": 2.1181584009140052e-08, "loss": 0.82578182, "num_input_tokens_seen": 171487185, "step": 7942, "time_per_iteration": 2.659395217895508 }, { "auxiliary_loss_clip": 0.01170782, "auxiliary_loss_mlp": 0.01024246, "balance_loss_clip": 0.93262196, "balance_loss_mlp": 1.01737928, "epoch": 0.9550892803462995, "flos": 17595294888960.0, "grad_norm": 2.37608653682618, "language_loss": 0.83568293, "learning_rate": 2.10686639640405e-08, "loss": 0.85763317, "num_input_tokens_seen": 171501275, "step": 7943, "time_per_iteration": 2.6572704315185547 }, { "auxiliary_loss_clip": 0.01168252, "auxiliary_loss_mlp": 0.01025922, "balance_loss_clip": 1.00857997, "balance_loss_mlp": 1.01888847, "epoch": 0.9552095232369386, "flos": 24353144789760.0, "grad_norm": 1.577189538922842, "language_loss": 0.81267846, "learning_rate": 2.0956044121251294e-08, "loss": 0.83462024, "num_input_tokens_seen": 171520060, "step": 7944, "time_per_iteration": 2.6563608646392822 }, { "auxiliary_loss_clip": 0.01165785, "auxiliary_loss_mlp": 0.01028447, "balance_loss_clip": 0.93495685, "balance_loss_mlp": 1.02116299, "epoch": 0.9553297661275777, "flos": 22746860490240.0, "grad_norm": 2.5690640525901642, "language_loss": 0.81079704, "learning_rate": 2.084372449785654e-08, "loss": 0.83273929, "num_input_tokens_seen": 171539895, "step": 7945, "time_per_iteration": 2.699491262435913 }, { "auxiliary_loss_clip": 0.01158516, "auxiliary_loss_mlp": 0.01021857, "balance_loss_clip": 0.96815777, "balance_loss_mlp": 1.01532769, "epoch": 0.9554500090182168, "flos": 15413866018560.0, "grad_norm": 1.726433242742184, "language_loss": 0.68965638, "learning_rate": 2.0731705110895282e-08, "loss": 0.71146011, "num_input_tokens_seen": 171557385, "step": 7946, "time_per_iteration": 3.5092523097991943 }, { "auxiliary_loss_clip": 0.01170766, "auxiliary_loss_mlp": 0.01028255, "balance_loss_clip": 1.01350594, "balance_loss_mlp": 1.02038407, "epoch": 0.9555702519088559, "flos": 23513517400320.0, "grad_norm": 3.9871097096882604, "language_loss": 0.86765397, "learning_rate": 2.0619985977360587e-08, "loss": 0.88964415, "num_input_tokens_seen": 171575705, "step": 7947, "time_per_iteration": 2.654512405395508 }, { "auxiliary_loss_clip": 0.01161448, "auxiliary_loss_mlp": 0.01026292, "balance_loss_clip": 0.89268512, "balance_loss_mlp": 1.01953316, "epoch": 0.955690494799495, "flos": 22962072827520.0, "grad_norm": 1.5699430452875809, "language_loss": 0.76929724, "learning_rate": 2.0508567114200237e-08, "loss": 0.79117465, "num_input_tokens_seen": 171595620, "step": 7948, "time_per_iteration": 2.76920485496521 }, { "auxiliary_loss_clip": 0.01166864, "auxiliary_loss_mlp": 0.01022322, "balance_loss_clip": 0.97014487, "balance_loss_mlp": 1.01570332, "epoch": 0.955810737690134, "flos": 26031250333440.0, "grad_norm": 1.9902935942554114, "language_loss": 0.78653669, "learning_rate": 2.0397448538316485e-08, "loss": 0.80842865, "num_input_tokens_seen": 171616660, "step": 7949, "time_per_iteration": 2.7458791732788086 }, { "auxiliary_loss_clip": 0.0115754, "auxiliary_loss_mlp": 0.01023202, "balance_loss_clip": 0.93159521, "balance_loss_mlp": 1.01669359, "epoch": 0.9559309805807732, "flos": 20849951249280.0, "grad_norm": 1.9400345546868816, "language_loss": 0.66583121, "learning_rate": 2.028663026656563e-08, "loss": 0.68763864, "num_input_tokens_seen": 171635515, "step": 7950, "time_per_iteration": 2.7005724906921387 }, { "auxiliary_loss_clip": 0.01163431, "auxiliary_loss_mlp": 0.01122092, "balance_loss_clip": 1.04700851, "balance_loss_mlp": 0.0, "epoch": 0.9560512234714122, "flos": 21578219498880.0, "grad_norm": 2.1507655944027375, "language_loss": 0.71557617, "learning_rate": 2.0176112315758885e-08, "loss": 0.73843133, "num_input_tokens_seen": 171653305, "step": 7951, "time_per_iteration": 2.675295114517212 }, { "auxiliary_loss_clip": 0.01168146, "auxiliary_loss_mlp": 0.01023658, "balance_loss_clip": 0.8955934, "balance_loss_mlp": 1.01634741, "epoch": 0.9561714663620513, "flos": 17450144029440.0, "grad_norm": 3.721441303683059, "language_loss": 0.69581419, "learning_rate": 2.0065894702661957e-08, "loss": 0.71773219, "num_input_tokens_seen": 171669980, "step": 7952, "time_per_iteration": 2.7014925479888916 }, { "auxiliary_loss_clip": 0.01152752, "auxiliary_loss_mlp": 0.01122676, "balance_loss_clip": 0.92900622, "balance_loss_mlp": 0.0, "epoch": 0.9562917092526905, "flos": 26098510550400.0, "grad_norm": 1.961380295446067, "language_loss": 0.77594185, "learning_rate": 1.9955977443994577e-08, "loss": 0.7986961, "num_input_tokens_seen": 171689970, "step": 7953, "time_per_iteration": 3.595404624938965 }, { "auxiliary_loss_clip": 0.01161604, "auxiliary_loss_mlp": 0.01021218, "balance_loss_clip": 0.96933198, "balance_loss_mlp": 1.01352954, "epoch": 0.9564119521433295, "flos": 24096742531200.0, "grad_norm": 1.9856932733106785, "language_loss": 0.62549984, "learning_rate": 1.9846360556430965e-08, "loss": 0.64732802, "num_input_tokens_seen": 171708270, "step": 7954, "time_per_iteration": 3.497791290283203 }, { "auxiliary_loss_clip": 0.01165155, "auxiliary_loss_mlp": 0.01023795, "balance_loss_clip": 1.0477773, "balance_loss_mlp": 1.01677406, "epoch": 0.9565321950339686, "flos": 32008903896960.0, "grad_norm": 2.5936991639227225, "language_loss": 0.61487931, "learning_rate": 1.973704405660004e-08, "loss": 0.63676882, "num_input_tokens_seen": 171729385, "step": 7955, "time_per_iteration": 3.6075572967529297 }, { "auxiliary_loss_clip": 0.01159736, "auxiliary_loss_mlp": 0.01023519, "balance_loss_clip": 0.85545897, "balance_loss_mlp": 1.01717377, "epoch": 0.9566524379246077, "flos": 23588642695680.0, "grad_norm": 1.522931710889325, "language_loss": 0.78430092, "learning_rate": 1.9628027961085203e-08, "loss": 0.80613345, "num_input_tokens_seen": 171752615, "step": 7956, "time_per_iteration": 2.8531715869903564 }, { "auxiliary_loss_clip": 0.01147302, "auxiliary_loss_mlp": 0.01028028, "balance_loss_clip": 0.92853391, "balance_loss_mlp": 1.02111125, "epoch": 0.9567726808152468, "flos": 38067716240640.0, "grad_norm": 1.7636797088202192, "language_loss": 0.83949071, "learning_rate": 1.9519312286423894e-08, "loss": 0.86124408, "num_input_tokens_seen": 171775810, "step": 7957, "time_per_iteration": 3.0439765453338623 }, { "auxiliary_loss_clip": 0.01164658, "auxiliary_loss_mlp": 0.01024351, "balance_loss_clip": 1.0112772, "balance_loss_mlp": 1.0176369, "epoch": 0.9568929237058859, "flos": 22744059229440.0, "grad_norm": 1.5556469725212634, "language_loss": 0.77656221, "learning_rate": 1.9410897049108255e-08, "loss": 0.79845232, "num_input_tokens_seen": 171795090, "step": 7958, "time_per_iteration": 2.591228723526001 }, { "auxiliary_loss_clip": 0.01173121, "auxiliary_loss_mlp": 0.01028111, "balance_loss_clip": 1.05073631, "balance_loss_mlp": 1.02094066, "epoch": 0.957013166596525, "flos": 23841633162240.0, "grad_norm": 2.9147698938499524, "language_loss": 0.9122647, "learning_rate": 1.9302782265584905e-08, "loss": 0.934277, "num_input_tokens_seen": 171815755, "step": 7959, "time_per_iteration": 2.6324830055236816 }, { "auxiliary_loss_clip": 0.01151024, "auxiliary_loss_mlp": 0.01025507, "balance_loss_clip": 0.89465225, "balance_loss_mlp": 1.01828003, "epoch": 0.9571334094871641, "flos": 17639286071040.0, "grad_norm": 2.360806784250362, "language_loss": 0.86650842, "learning_rate": 1.9194967952254282e-08, "loss": 0.88827378, "num_input_tokens_seen": 171834330, "step": 7960, "time_per_iteration": 2.628408670425415 }, { "auxiliary_loss_clip": 0.01165319, "auxiliary_loss_mlp": 0.01028019, "balance_loss_clip": 1.0114429, "balance_loss_mlp": 1.02079189, "epoch": 0.9572536523778031, "flos": 15369623441280.0, "grad_norm": 2.2174826769485274, "language_loss": 0.80570161, "learning_rate": 1.9087454125472635e-08, "loss": 0.82763499, "num_input_tokens_seen": 171848805, "step": 7961, "time_per_iteration": 2.568589687347412 }, { "auxiliary_loss_clip": 0.01167232, "auxiliary_loss_mlp": 0.01020962, "balance_loss_clip": 1.04772019, "balance_loss_mlp": 1.01433742, "epoch": 0.9573738952684423, "flos": 24969838417920.0, "grad_norm": 1.8513076881082235, "language_loss": 0.78275883, "learning_rate": 1.8980240801548696e-08, "loss": 0.80464077, "num_input_tokens_seen": 171867995, "step": 7962, "time_per_iteration": 2.5739781856536865 }, { "auxiliary_loss_clip": 0.01163644, "auxiliary_loss_mlp": 0.01023402, "balance_loss_clip": 0.97333056, "balance_loss_mlp": 1.01629734, "epoch": 0.9574941381590814, "flos": 25769461034880.0, "grad_norm": 1.6071001027192136, "language_loss": 0.7408874, "learning_rate": 1.8873327996747458e-08, "loss": 0.7627579, "num_input_tokens_seen": 171886495, "step": 7963, "time_per_iteration": 2.6741175651550293 }, { "auxiliary_loss_clip": 0.01166183, "auxiliary_loss_mlp": 0.01024817, "balance_loss_clip": 1.0074122, "balance_loss_mlp": 1.01775336, "epoch": 0.9576143810497204, "flos": 32307178435200.0, "grad_norm": 2.1279783423148255, "language_loss": 0.66082698, "learning_rate": 1.8766715727287053e-08, "loss": 0.68273699, "num_input_tokens_seen": 171908200, "step": 7964, "time_per_iteration": 2.7875685691833496 }, { "auxiliary_loss_clip": 0.01170533, "auxiliary_loss_mlp": 0.0112284, "balance_loss_clip": 1.00906694, "balance_loss_mlp": 0.0, "epoch": 0.9577346239403596, "flos": 27745733376000.0, "grad_norm": 2.3480944243697475, "language_loss": 0.79125869, "learning_rate": 1.8660404009340546e-08, "loss": 0.81419241, "num_input_tokens_seen": 171928650, "step": 7965, "time_per_iteration": 2.7065694332122803 }, { "auxiliary_loss_clip": 0.01063631, "auxiliary_loss_mlp": 0.01000523, "balance_loss_clip": 0.97136682, "balance_loss_mlp": 0.9988777, "epoch": 0.9578548668309986, "flos": 57468313710720.0, "grad_norm": 0.9189921556234129, "language_loss": 0.5953353, "learning_rate": 1.8554392859035485e-08, "loss": 0.61597687, "num_input_tokens_seen": 171986400, "step": 7966, "time_per_iteration": 3.1572768688201904 }, { "auxiliary_loss_clip": 0.01163668, "auxiliary_loss_mlp": 0.01031731, "balance_loss_clip": 0.8166852, "balance_loss_mlp": 1.02438736, "epoch": 0.9579751097216377, "flos": 19756040503680.0, "grad_norm": 1.6308373266544136, "language_loss": 0.78771985, "learning_rate": 1.8448682292453444e-08, "loss": 0.80967391, "num_input_tokens_seen": 172005475, "step": 7967, "time_per_iteration": 2.8733632564544678 }, { "auxiliary_loss_clip": 0.01166839, "auxiliary_loss_mlp": 0.0102697, "balance_loss_clip": 1.04855466, "balance_loss_mlp": 1.02004719, "epoch": 0.9580953526122769, "flos": 18041270152320.0, "grad_norm": 2.055947198029111, "language_loss": 0.65801632, "learning_rate": 1.8343272325631154e-08, "loss": 0.67995441, "num_input_tokens_seen": 172024420, "step": 7968, "time_per_iteration": 3.1393933296203613 }, { "auxiliary_loss_clip": 0.01166342, "auxiliary_loss_mlp": 0.01122931, "balance_loss_clip": 0.8200165, "balance_loss_mlp": 0.0, "epoch": 0.9582155955029159, "flos": 24270154416000.0, "grad_norm": 2.147825011340169, "language_loss": 0.78234017, "learning_rate": 1.8238162974558492e-08, "loss": 0.80523288, "num_input_tokens_seen": 172038350, "step": 7969, "time_per_iteration": 2.822631597518921 }, { "auxiliary_loss_clip": 0.0116267, "auxiliary_loss_mlp": 0.01025656, "balance_loss_clip": 0.97210062, "balance_loss_mlp": 1.01873589, "epoch": 0.958335838393555, "flos": 22783309816320.0, "grad_norm": 1.8009385395131188, "language_loss": 0.74678469, "learning_rate": 1.8133354255181144e-08, "loss": 0.76866794, "num_input_tokens_seen": 172058665, "step": 7970, "time_per_iteration": 2.731743097305298 }, { "auxiliary_loss_clip": 0.01156509, "auxiliary_loss_mlp": 0.01026729, "balance_loss_clip": 1.00518417, "balance_loss_mlp": 1.01986814, "epoch": 0.958456081284194, "flos": 16911484698240.0, "grad_norm": 1.6796639268049212, "language_loss": 0.74606764, "learning_rate": 1.802884618339795e-08, "loss": 0.76790005, "num_input_tokens_seen": 172077470, "step": 7971, "time_per_iteration": 2.5870583057403564 }, { "auxiliary_loss_clip": 0.01168409, "auxiliary_loss_mlp": 0.01023827, "balance_loss_clip": 1.01101995, "balance_loss_mlp": 1.01657057, "epoch": 0.9585763241748332, "flos": 19974951941760.0, "grad_norm": 2.324750673770578, "language_loss": 0.81106174, "learning_rate": 1.7924638775062894e-08, "loss": 0.83298409, "num_input_tokens_seen": 172096590, "step": 7972, "time_per_iteration": 3.4971234798431396 }, { "auxiliary_loss_clip": 0.01157393, "auxiliary_loss_mlp": 0.01029045, "balance_loss_clip": 0.93320179, "balance_loss_mlp": 1.02229178, "epoch": 0.9586965670654722, "flos": 21395649646080.0, "grad_norm": 2.257475345502514, "language_loss": 0.81628555, "learning_rate": 1.7820732045984444e-08, "loss": 0.83814996, "num_input_tokens_seen": 172116735, "step": 7973, "time_per_iteration": 2.7028493881225586 }, { "auxiliary_loss_clip": 0.01160115, "auxiliary_loss_mlp": 0.01026388, "balance_loss_clip": 1.00750089, "balance_loss_mlp": 1.01865137, "epoch": 0.9588168099561113, "flos": 21435115714560.0, "grad_norm": 1.961909374004028, "language_loss": 0.74211323, "learning_rate": 1.7717126011924655e-08, "loss": 0.7639783, "num_input_tokens_seen": 172138320, "step": 7974, "time_per_iteration": 2.657979726791382 }, { "auxiliary_loss_clip": 0.01151072, "auxiliary_loss_mlp": 0.01030336, "balance_loss_clip": 0.88962722, "balance_loss_mlp": 1.02324331, "epoch": 0.9589370528467505, "flos": 11763761852160.0, "grad_norm": 2.4819059954949876, "language_loss": 0.76221538, "learning_rate": 1.7613820688600957e-08, "loss": 0.78402948, "num_input_tokens_seen": 172154225, "step": 7975, "time_per_iteration": 2.6929609775543213 }, { "auxiliary_loss_clip": 0.01170597, "auxiliary_loss_mlp": 0.01025275, "balance_loss_clip": 0.97077662, "balance_loss_mlp": 1.01828909, "epoch": 0.9590572957373895, "flos": 23441516588160.0, "grad_norm": 2.8439932462305078, "language_loss": 0.78587532, "learning_rate": 1.7510816091684588e-08, "loss": 0.80783409, "num_input_tokens_seen": 172174150, "step": 7976, "time_per_iteration": 2.6246888637542725 }, { "auxiliary_loss_clip": 0.01164735, "auxiliary_loss_mlp": 0.01027525, "balance_loss_clip": 0.97192115, "balance_loss_mlp": 1.0202235, "epoch": 0.9591775386280286, "flos": 22528272274560.0, "grad_norm": 4.0348516554560305, "language_loss": 0.78376627, "learning_rate": 1.740811223680083e-08, "loss": 0.80568886, "num_input_tokens_seen": 172191005, "step": 7977, "time_per_iteration": 2.677550792694092 }, { "auxiliary_loss_clip": 0.01167077, "auxiliary_loss_mlp": 0.0102414, "balance_loss_clip": 1.04868484, "balance_loss_mlp": 1.01691878, "epoch": 0.9592977815186677, "flos": 18186959715840.0, "grad_norm": 2.325483185796491, "language_loss": 0.74277627, "learning_rate": 1.7305709139530334e-08, "loss": 0.76468843, "num_input_tokens_seen": 172209785, "step": 7978, "time_per_iteration": 3.504950523376465 }, { "auxiliary_loss_clip": 0.01158067, "auxiliary_loss_mlp": 0.01022794, "balance_loss_clip": 1.00709188, "balance_loss_mlp": 1.01543319, "epoch": 0.9594180244093068, "flos": 16537797555840.0, "grad_norm": 2.1559832167733615, "language_loss": 0.74682075, "learning_rate": 1.7203606815407334e-08, "loss": 0.76862931, "num_input_tokens_seen": 172224380, "step": 7979, "time_per_iteration": 2.606785297393799 }, { "auxiliary_loss_clip": 0.01169848, "auxiliary_loss_mlp": 0.01026466, "balance_loss_clip": 0.97400045, "balance_loss_mlp": 1.01928413, "epoch": 0.9595382672999458, "flos": 20554334317440.0, "grad_norm": 1.821289388162289, "language_loss": 0.79652154, "learning_rate": 1.7101805279920557e-08, "loss": 0.81848466, "num_input_tokens_seen": 172242540, "step": 7980, "time_per_iteration": 3.6080760955810547 }, { "auxiliary_loss_clip": 0.01167687, "auxiliary_loss_mlp": 0.01023428, "balance_loss_clip": 1.04881811, "balance_loss_mlp": 1.01612055, "epoch": 0.959658510190585, "flos": 22638266697600.0, "grad_norm": 2.5757618312991792, "language_loss": 0.81022811, "learning_rate": 1.7000304548513643e-08, "loss": 0.83213925, "num_input_tokens_seen": 172262645, "step": 7981, "time_per_iteration": 3.5933752059936523 }, { "auxiliary_loss_clip": 0.0115692, "auxiliary_loss_mlp": 0.01025771, "balance_loss_clip": 0.93062955, "balance_loss_mlp": 1.018273, "epoch": 0.9597787530812241, "flos": 19135252725120.0, "grad_norm": 1.8827786495834762, "language_loss": 0.82804972, "learning_rate": 1.6899104636583394e-08, "loss": 0.84987664, "num_input_tokens_seen": 172280695, "step": 7982, "time_per_iteration": 2.6592154502868652 }, { "auxiliary_loss_clip": 0.01062562, "auxiliary_loss_mlp": 0.0100105, "balance_loss_clip": 0.97071296, "balance_loss_mlp": 0.99945277, "epoch": 0.9598989959718631, "flos": 60098124055680.0, "grad_norm": 0.7334822845007387, "language_loss": 0.61981881, "learning_rate": 1.6798205559482638e-08, "loss": 0.64045489, "num_input_tokens_seen": 172343075, "step": 7983, "time_per_iteration": 3.3586838245391846 }, { "auxiliary_loss_clip": 0.01169124, "auxiliary_loss_mlp": 0.01025998, "balance_loss_clip": 0.93534315, "balance_loss_mlp": 1.01927149, "epoch": 0.9600192388625023, "flos": 20886795624960.0, "grad_norm": 1.750105531703065, "language_loss": 0.76517969, "learning_rate": 1.669760733251713e-08, "loss": 0.78713089, "num_input_tokens_seen": 172361950, "step": 7984, "time_per_iteration": 2.6811821460723877 }, { "auxiliary_loss_clip": 0.01167951, "auxiliary_loss_mlp": 0.0102482, "balance_loss_clip": 0.85718334, "balance_loss_mlp": 1.01844537, "epoch": 0.9601394817531413, "flos": 20445740524800.0, "grad_norm": 1.589327021308972, "language_loss": 0.82436723, "learning_rate": 1.659730997094755e-08, "loss": 0.84629488, "num_input_tokens_seen": 172380440, "step": 7985, "time_per_iteration": 2.706904172897339 }, { "auxiliary_loss_clip": 0.01155285, "auxiliary_loss_mlp": 0.01020524, "balance_loss_clip": 1.00712776, "balance_loss_mlp": 1.01342762, "epoch": 0.9602597246437804, "flos": 21507152440320.0, "grad_norm": 1.6302998411094805, "language_loss": 0.62339407, "learning_rate": 1.6497313489989283e-08, "loss": 0.64515215, "num_input_tokens_seen": 172400265, "step": 7986, "time_per_iteration": 2.6457910537719727 }, { "auxiliary_loss_clip": 0.01153647, "auxiliary_loss_mlp": 0.01022929, "balance_loss_clip": 0.88643688, "balance_loss_mlp": 1.01608086, "epoch": 0.9603799675344196, "flos": 29935099152000.0, "grad_norm": 3.5481680689319424, "language_loss": 0.7026397, "learning_rate": 1.639761790481131e-08, "loss": 0.72440541, "num_input_tokens_seen": 172421145, "step": 7987, "time_per_iteration": 2.8468244075775146 }, { "auxiliary_loss_clip": 0.01168601, "auxiliary_loss_mlp": 0.01028495, "balance_loss_clip": 1.00947726, "balance_loss_mlp": 1.02166688, "epoch": 0.9605002104250586, "flos": 28001525103360.0, "grad_norm": 2.0613762889478524, "language_loss": 0.78659999, "learning_rate": 1.6298223230537754e-08, "loss": 0.80857092, "num_input_tokens_seen": 172438945, "step": 7988, "time_per_iteration": 2.6476645469665527 }, { "auxiliary_loss_clip": 0.01163276, "auxiliary_loss_mlp": 0.01122615, "balance_loss_clip": 0.97022796, "balance_loss_mlp": 0.0, "epoch": 0.9606204533156977, "flos": 35590490870400.0, "grad_norm": 1.8063733915043418, "language_loss": 0.69473505, "learning_rate": 1.619912948224611e-08, "loss": 0.71759391, "num_input_tokens_seen": 172460150, "step": 7989, "time_per_iteration": 2.783867597579956 }, { "auxiliary_loss_clip": 0.01156099, "auxiliary_loss_mlp": 0.01032733, "balance_loss_clip": 0.93176192, "balance_loss_mlp": 1.02524042, "epoch": 0.9607406962063368, "flos": 26574614346240.0, "grad_norm": 2.445153881924808, "language_loss": 0.61128402, "learning_rate": 1.6100336674969682e-08, "loss": 0.63317227, "num_input_tokens_seen": 172478990, "step": 7990, "time_per_iteration": 2.827836275100708 }, { "auxiliary_loss_clip": 0.01164111, "auxiliary_loss_mlp": 0.01027551, "balance_loss_clip": 0.89318514, "balance_loss_mlp": 1.02063727, "epoch": 0.9608609390969759, "flos": 25331781813120.0, "grad_norm": 1.8121218463116768, "language_loss": 0.76594901, "learning_rate": 1.600184482369449e-08, "loss": 0.78786564, "num_input_tokens_seen": 172498905, "step": 7991, "time_per_iteration": 2.775156259536743 }, { "auxiliary_loss_clip": 0.01164653, "auxiliary_loss_mlp": 0.01031518, "balance_loss_clip": 0.9312799, "balance_loss_mlp": 1.02405596, "epoch": 0.960981181987615, "flos": 21069114082560.0, "grad_norm": 4.58812754347363, "language_loss": 0.88668501, "learning_rate": 1.5903653943362126e-08, "loss": 0.9086467, "num_input_tokens_seen": 172517900, "step": 7992, "time_per_iteration": 2.6962828636169434 }, { "auxiliary_loss_clip": 0.01165937, "auxiliary_loss_mlp": 0.01025007, "balance_loss_clip": 0.9710651, "balance_loss_mlp": 1.01811719, "epoch": 0.9611014248782541, "flos": 17823256554240.0, "grad_norm": 2.018808515100643, "language_loss": 0.76685894, "learning_rate": 1.580576404886802e-08, "loss": 0.78876835, "num_input_tokens_seen": 172536430, "step": 7993, "time_per_iteration": 2.704176902770996 }, { "auxiliary_loss_clip": 0.01165013, "auxiliary_loss_mlp": 0.01023477, "balance_loss_clip": 1.00952005, "balance_loss_mlp": 1.01746309, "epoch": 0.9612216677688932, "flos": 19354631040000.0, "grad_norm": 1.8687219509552375, "language_loss": 0.79723388, "learning_rate": 1.570817515506162e-08, "loss": 0.8191188, "num_input_tokens_seen": 172555120, "step": 7994, "time_per_iteration": 2.7532074451446533 }, { "auxiliary_loss_clip": 0.01165262, "auxiliary_loss_mlp": 0.01027149, "balance_loss_clip": 1.04847157, "balance_loss_mlp": 1.02031541, "epoch": 0.9613419106595322, "flos": 15808739207040.0, "grad_norm": 1.9856363622143716, "language_loss": 0.81554461, "learning_rate": 1.561088727674753e-08, "loss": 0.83746874, "num_input_tokens_seen": 172569330, "step": 7995, "time_per_iteration": 2.6372969150543213 }, { "auxiliary_loss_clip": 0.01175477, "auxiliary_loss_mlp": 0.01027114, "balance_loss_clip": 0.89461315, "balance_loss_mlp": 1.01910043, "epoch": 0.9614621535501714, "flos": 25702488126720.0, "grad_norm": 6.209481384776549, "language_loss": 0.70747483, "learning_rate": 1.551390042868417e-08, "loss": 0.72950077, "num_input_tokens_seen": 172591100, "step": 7996, "time_per_iteration": 2.813739538192749 }, { "auxiliary_loss_clip": 0.01166897, "auxiliary_loss_mlp": 0.01027869, "balance_loss_clip": 1.01028514, "balance_loss_mlp": 1.0213666, "epoch": 0.9615823964408104, "flos": 17819054663040.0, "grad_norm": 1.7373805790681054, "language_loss": 0.7066679, "learning_rate": 1.5417214625584207e-08, "loss": 0.72861564, "num_input_tokens_seen": 172608755, "step": 7997, "time_per_iteration": 2.593583822250366 }, { "auxiliary_loss_clip": 0.01156926, "auxiliary_loss_mlp": 0.01026142, "balance_loss_clip": 1.00667453, "balance_loss_mlp": 1.01922154, "epoch": 0.9617026393314495, "flos": 20190020624640.0, "grad_norm": 1.589126895440396, "language_loss": 0.85460031, "learning_rate": 1.5320829882114806e-08, "loss": 0.87643099, "num_input_tokens_seen": 172626830, "step": 7998, "time_per_iteration": 3.450883388519287 }, { "auxiliary_loss_clip": 0.01164778, "auxiliary_loss_mlp": 0.01025844, "balance_loss_clip": 1.04470789, "balance_loss_mlp": 1.01902568, "epoch": 0.9618228822220887, "flos": 20267013427200.0, "grad_norm": 1.8642689487993624, "language_loss": 0.79103661, "learning_rate": 1.5224746212897378e-08, "loss": 0.81294292, "num_input_tokens_seen": 172646125, "step": 7999, "time_per_iteration": 2.5658514499664307 }, { "auxiliary_loss_clip": 0.01161568, "auxiliary_loss_mlp": 0.01023277, "balance_loss_clip": 1.04588318, "balance_loss_mlp": 1.01644313, "epoch": 0.9619431251127277, "flos": 21031300039680.0, "grad_norm": 1.6167324517600334, "language_loss": 0.77018398, "learning_rate": 1.512896363250804e-08, "loss": 0.79203248, "num_input_tokens_seen": 172666235, "step": 8000, "time_per_iteration": 2.6222074031829834 }, { "auxiliary_loss_clip": 0.0116551, "auxiliary_loss_mlp": 0.01028765, "balance_loss_clip": 1.00744617, "balance_loss_mlp": 1.02194619, "epoch": 0.9620633680033668, "flos": 22382654538240.0, "grad_norm": 1.9651519402281952, "language_loss": 0.75600457, "learning_rate": 1.503348215547673e-08, "loss": 0.77794731, "num_input_tokens_seen": 172687325, "step": 8001, "time_per_iteration": 2.6518003940582275 }, { "auxiliary_loss_clip": 0.01163286, "auxiliary_loss_mlp": 0.01023819, "balance_loss_clip": 0.97210789, "balance_loss_mlp": 1.01687169, "epoch": 0.962183610894006, "flos": 18471730740480.0, "grad_norm": 1.6659928187684283, "language_loss": 0.80241263, "learning_rate": 1.4938301796288078e-08, "loss": 0.82428366, "num_input_tokens_seen": 172703895, "step": 8002, "time_per_iteration": 2.7431020736694336 }, { "auxiliary_loss_clip": 0.01168111, "auxiliary_loss_mlp": 0.01026111, "balance_loss_clip": 1.0483098, "balance_loss_mlp": 1.01894021, "epoch": 0.962303853784645, "flos": 18435245500800.0, "grad_norm": 2.5114690015520615, "language_loss": 0.82252097, "learning_rate": 1.4843422569380537e-08, "loss": 0.84446311, "num_input_tokens_seen": 172720650, "step": 8003, "time_per_iteration": 2.548247814178467 }, { "auxiliary_loss_clip": 0.01159329, "auxiliary_loss_mlp": 0.01027532, "balance_loss_clip": 0.89087629, "balance_loss_mlp": 1.02052248, "epoch": 0.9624240966752841, "flos": 26391074826240.0, "grad_norm": 1.6115924560553503, "language_loss": 0.82575917, "learning_rate": 1.4748844489147483e-08, "loss": 0.84762782, "num_input_tokens_seen": 172737640, "step": 8004, "time_per_iteration": 3.734238386154175 }, { "auxiliary_loss_clip": 0.01162795, "auxiliary_loss_mlp": 0.01024989, "balance_loss_clip": 0.969401, "balance_loss_mlp": 1.01804495, "epoch": 0.9625443395659231, "flos": 14647675985280.0, "grad_norm": 2.0110916823508975, "language_loss": 0.70900482, "learning_rate": 1.4654567569936326e-08, "loss": 0.73088264, "num_input_tokens_seen": 172755215, "step": 8005, "time_per_iteration": 2.644770383834839 }, { "auxiliary_loss_clip": 0.01154992, "auxiliary_loss_mlp": 0.01026463, "balance_loss_clip": 0.89414632, "balance_loss_mlp": 1.0195725, "epoch": 0.9626645824565623, "flos": 18367626147840.0, "grad_norm": 1.9949097789183092, "language_loss": 0.82999521, "learning_rate": 1.456059182604874e-08, "loss": 0.85180974, "num_input_tokens_seen": 172774020, "step": 8006, "time_per_iteration": 3.6339051723480225 }, { "auxiliary_loss_clip": 0.01167344, "auxiliary_loss_mlp": 0.01032559, "balance_loss_clip": 1.04795611, "balance_loss_mlp": 1.02551377, "epoch": 0.9627848253472013, "flos": 16580424021120.0, "grad_norm": 1.6880233597925276, "language_loss": 0.76446569, "learning_rate": 1.4466917271740653e-08, "loss": 0.78646469, "num_input_tokens_seen": 172792220, "step": 8007, "time_per_iteration": 2.5885584354400635 }, { "auxiliary_loss_clip": 0.01158083, "auxiliary_loss_mlp": 0.01030592, "balance_loss_clip": 0.96986222, "balance_loss_mlp": 1.02280712, "epoch": 0.9629050682378404, "flos": 20886867452160.0, "grad_norm": 2.7045406279902027, "language_loss": 0.67312276, "learning_rate": 1.4373543921222697e-08, "loss": 0.69500959, "num_input_tokens_seen": 172811805, "step": 8008, "time_per_iteration": 3.5385870933532715 }, { "auxiliary_loss_clip": 0.01164782, "auxiliary_loss_mlp": 0.01031607, "balance_loss_clip": 0.97200948, "balance_loss_mlp": 1.02426994, "epoch": 0.9630253111284796, "flos": 17019252478080.0, "grad_norm": 1.686730431732316, "language_loss": 0.7799536, "learning_rate": 1.428047178865932e-08, "loss": 0.80191749, "num_input_tokens_seen": 172828595, "step": 8009, "time_per_iteration": 2.635470151901245 }, { "auxiliary_loss_clip": 0.01159269, "auxiliary_loss_mlp": 0.01027106, "balance_loss_clip": 0.9670009, "balance_loss_mlp": 1.02045441, "epoch": 0.9631455540191186, "flos": 20338942412160.0, "grad_norm": 1.7458288968808906, "language_loss": 0.74509335, "learning_rate": 1.4187700888169451e-08, "loss": 0.76695716, "num_input_tokens_seen": 172847770, "step": 8010, "time_per_iteration": 2.653898000717163 }, { "auxiliary_loss_clip": 0.01062264, "auxiliary_loss_mlp": 0.01000615, "balance_loss_clip": 0.97216856, "balance_loss_mlp": 0.99904138, "epoch": 0.9632657969097577, "flos": 65956700033280.0, "grad_norm": 0.7628803729176076, "language_loss": 0.57027972, "learning_rate": 1.40952312338265e-08, "loss": 0.59090847, "num_input_tokens_seen": 172912415, "step": 8011, "time_per_iteration": 3.212756872177124 }, { "auxiliary_loss_clip": 0.01164018, "auxiliary_loss_mlp": 0.01027924, "balance_loss_clip": 0.93175912, "balance_loss_mlp": 1.0205394, "epoch": 0.9633860398003968, "flos": 44419523823360.0, "grad_norm": 3.3092557776795095, "language_loss": 0.68616205, "learning_rate": 1.4003062839657909e-08, "loss": 0.70808148, "num_input_tokens_seen": 172934895, "step": 8012, "time_per_iteration": 2.904552936553955 }, { "auxiliary_loss_clip": 0.01165464, "auxiliary_loss_mlp": 0.01025892, "balance_loss_clip": 0.93107665, "balance_loss_mlp": 1.01929116, "epoch": 0.9635062826910359, "flos": 24827704300800.0, "grad_norm": 1.6465876226137715, "language_loss": 0.79887903, "learning_rate": 1.391119571964583e-08, "loss": 0.82079262, "num_input_tokens_seen": 172955835, "step": 8013, "time_per_iteration": 2.691845178604126 }, { "auxiliary_loss_clip": 0.01164552, "auxiliary_loss_mlp": 0.01024514, "balance_loss_clip": 1.00944257, "balance_loss_mlp": 1.018291, "epoch": 0.9636265255816749, "flos": 15961360095360.0, "grad_norm": 1.7219340148677693, "language_loss": 0.72843552, "learning_rate": 1.3819629887726225e-08, "loss": 0.75032616, "num_input_tokens_seen": 172973925, "step": 8014, "time_per_iteration": 2.602200508117676 }, { "auxiliary_loss_clip": 0.01168171, "auxiliary_loss_mlp": 0.01025804, "balance_loss_clip": 0.97077477, "balance_loss_mlp": 1.01868987, "epoch": 0.9637467684723141, "flos": 22601781457920.0, "grad_norm": 1.7433909235091205, "language_loss": 0.76160681, "learning_rate": 1.3728365357789317e-08, "loss": 0.78354657, "num_input_tokens_seen": 172993290, "step": 8015, "time_per_iteration": 2.6817514896392822 }, { "auxiliary_loss_clip": 0.01145747, "auxiliary_loss_mlp": 0.01026585, "balance_loss_clip": 0.85238385, "balance_loss_mlp": 1.01937556, "epoch": 0.9638670113629532, "flos": 17565812801280.0, "grad_norm": 2.2331054936831762, "language_loss": 0.76432389, "learning_rate": 1.3637402143680254e-08, "loss": 0.78604722, "num_input_tokens_seen": 173008190, "step": 8016, "time_per_iteration": 2.718035936355591 }, { "auxiliary_loss_clip": 0.01069306, "auxiliary_loss_mlp": 0.01001062, "balance_loss_clip": 0.90052658, "balance_loss_mlp": 0.99947661, "epoch": 0.9639872542535922, "flos": 55072139379840.0, "grad_norm": 0.7326930090826578, "language_loss": 0.55084932, "learning_rate": 1.3546740259197998e-08, "loss": 0.57155299, "num_input_tokens_seen": 173061000, "step": 8017, "time_per_iteration": 3.2112441062927246 }, { "auxiliary_loss_clip": 0.01164991, "auxiliary_loss_mlp": 0.01027478, "balance_loss_clip": 0.97156233, "balance_loss_mlp": 1.02040327, "epoch": 0.9641074971442314, "flos": 24134484746880.0, "grad_norm": 2.362699240211662, "language_loss": 0.70492715, "learning_rate": 1.3456379718095989e-08, "loss": 0.72685188, "num_input_tokens_seen": 173081415, "step": 8018, "time_per_iteration": 2.7075164318084717 }, { "auxiliary_loss_clip": 0.01060282, "auxiliary_loss_mlp": 0.01001507, "balance_loss_clip": 0.93287486, "balance_loss_mlp": 0.99977845, "epoch": 0.9642277400348704, "flos": 66747416077440.0, "grad_norm": 0.8407150207237728, "language_loss": 0.62028605, "learning_rate": 1.3366320534081487e-08, "loss": 0.64090395, "num_input_tokens_seen": 173144095, "step": 8019, "time_per_iteration": 3.2609753608703613 }, { "auxiliary_loss_clip": 0.01163926, "auxiliary_loss_mlp": 0.01024395, "balance_loss_clip": 1.00905943, "balance_loss_mlp": 1.01720691, "epoch": 0.9643479829255095, "flos": 30920272450560.0, "grad_norm": 1.9663172446103097, "language_loss": 0.76026994, "learning_rate": 1.3276562720816675e-08, "loss": 0.78215319, "num_input_tokens_seen": 173165605, "step": 8020, "time_per_iteration": 2.7290422916412354 }, { "auxiliary_loss_clip": 0.01164561, "auxiliary_loss_mlp": 0.01027171, "balance_loss_clip": 1.04520082, "balance_loss_mlp": 1.01984596, "epoch": 0.9644682258161487, "flos": 20048245643520.0, "grad_norm": 2.505759809280163, "language_loss": 0.8286562, "learning_rate": 1.3187106291917549e-08, "loss": 0.85057354, "num_input_tokens_seen": 173182595, "step": 8021, "time_per_iteration": 2.6376500129699707 }, { "auxiliary_loss_clip": 0.01161014, "auxiliary_loss_mlp": 0.01023252, "balance_loss_clip": 1.00902426, "balance_loss_mlp": 1.01656389, "epoch": 0.9645884687067877, "flos": 21178713456000.0, "grad_norm": 1.877775889113735, "language_loss": 0.70581728, "learning_rate": 1.309795126095503e-08, "loss": 0.72765994, "num_input_tokens_seen": 173200895, "step": 8022, "time_per_iteration": 2.6138811111450195 }, { "auxiliary_loss_clip": 0.01153949, "auxiliary_loss_mlp": 0.01027227, "balance_loss_clip": 0.81517196, "balance_loss_mlp": 1.02043164, "epoch": 0.9647087115974268, "flos": 18945967029120.0, "grad_norm": 2.0493081236473962, "language_loss": 0.80883014, "learning_rate": 1.3009097641453192e-08, "loss": 0.83064187, "num_input_tokens_seen": 173218745, "step": 8023, "time_per_iteration": 2.86110258102417 }, { "auxiliary_loss_clip": 0.01165281, "auxiliary_loss_mlp": 0.01021385, "balance_loss_clip": 0.97170007, "balance_loss_mlp": 1.01444066, "epoch": 0.9648289544880659, "flos": 16545088016640.0, "grad_norm": 1.552906611045015, "language_loss": 0.75826395, "learning_rate": 1.2920545446891474e-08, "loss": 0.78013057, "num_input_tokens_seen": 173235465, "step": 8024, "time_per_iteration": 3.6357345581054688 }, { "auxiliary_loss_clip": 0.01169299, "auxiliary_loss_mlp": 0.01030719, "balance_loss_clip": 0.97423756, "balance_loss_mlp": 1.0237627, "epoch": 0.964949197378705, "flos": 24057527857920.0, "grad_norm": 1.6978305661712025, "language_loss": 0.70493048, "learning_rate": 1.2832294690703127e-08, "loss": 0.72693068, "num_input_tokens_seen": 173254440, "step": 8025, "time_per_iteration": 2.681908369064331 }, { "auxiliary_loss_clip": 0.01168993, "auxiliary_loss_mlp": 0.01028542, "balance_loss_clip": 1.01326561, "balance_loss_mlp": 1.02142859, "epoch": 0.965069440269344, "flos": 23365565280000.0, "grad_norm": 1.8330021945219688, "language_loss": 0.77334476, "learning_rate": 1.2744345386275668e-08, "loss": 0.79532003, "num_input_tokens_seen": 173273980, "step": 8026, "time_per_iteration": 2.6829845905303955 }, { "auxiliary_loss_clip": 0.01172047, "auxiliary_loss_mlp": 0.01023132, "balance_loss_clip": 0.97423303, "balance_loss_mlp": 1.01642609, "epoch": 0.9651896831599832, "flos": 25374875155200.0, "grad_norm": 2.5244170639802697, "language_loss": 0.78459626, "learning_rate": 1.265669754695109e-08, "loss": 0.806548, "num_input_tokens_seen": 173293550, "step": 8027, "time_per_iteration": 2.730926275253296 }, { "auxiliary_loss_clip": 0.01163695, "auxiliary_loss_mlp": 0.01024335, "balance_loss_clip": 0.85445333, "balance_loss_mlp": 1.01705682, "epoch": 0.9653099260506223, "flos": 22272875596800.0, "grad_norm": 1.7617214241174923, "language_loss": 0.81805217, "learning_rate": 1.2569351186025201e-08, "loss": 0.83993244, "num_input_tokens_seen": 173312005, "step": 8028, "time_per_iteration": 2.856912612915039 }, { "auxiliary_loss_clip": 0.01152173, "auxiliary_loss_mlp": 0.0102241, "balance_loss_clip": 0.93163669, "balance_loss_mlp": 1.01547837, "epoch": 0.9654301689412613, "flos": 26760847386240.0, "grad_norm": 1.571217008893138, "language_loss": 0.75263494, "learning_rate": 1.2482306316748737e-08, "loss": 0.7743808, "num_input_tokens_seen": 173332450, "step": 8029, "time_per_iteration": 2.8724544048309326 }, { "auxiliary_loss_clip": 0.01168519, "auxiliary_loss_mlp": 0.01021667, "balance_loss_clip": 1.00690055, "balance_loss_mlp": 1.01435995, "epoch": 0.9655504118319005, "flos": 17412689122560.0, "grad_norm": 2.712406605602428, "language_loss": 0.78232551, "learning_rate": 1.2395562952326021e-08, "loss": 0.80422741, "num_input_tokens_seen": 173349610, "step": 8030, "time_per_iteration": 3.8984131813049316 }, { "auxiliary_loss_clip": 0.01172084, "auxiliary_loss_mlp": 0.01033508, "balance_loss_clip": 0.97195059, "balance_loss_mlp": 1.0259949, "epoch": 0.9656706547225395, "flos": 22126970551680.0, "grad_norm": 1.9660110315658659, "language_loss": 0.81689483, "learning_rate": 1.2309121105916309e-08, "loss": 0.83895075, "num_input_tokens_seen": 173367900, "step": 8031, "time_per_iteration": 2.6804897785186768 }, { "auxiliary_loss_clip": 0.01169716, "auxiliary_loss_mlp": 0.01027012, "balance_loss_clip": 1.01079774, "balance_loss_mlp": 1.01898301, "epoch": 0.9657908976131786, "flos": 37049289926400.0, "grad_norm": 1.9319036673637342, "language_loss": 0.69209117, "learning_rate": 1.222298079063222e-08, "loss": 0.7140584, "num_input_tokens_seen": 173389040, "step": 8032, "time_per_iteration": 3.7031545639038086 }, { "auxiliary_loss_clip": 0.01165188, "auxiliary_loss_mlp": 0.0102395, "balance_loss_clip": 1.00948238, "balance_loss_mlp": 1.01741195, "epoch": 0.9659111405038178, "flos": 24389809597440.0, "grad_norm": 1.9655176964086716, "language_loss": 0.72621548, "learning_rate": 1.2137142019541524e-08, "loss": 0.7481069, "num_input_tokens_seen": 173407595, "step": 8033, "time_per_iteration": 2.631361961364746 }, { "auxiliary_loss_clip": 0.01172864, "auxiliary_loss_mlp": 0.01024505, "balance_loss_clip": 0.97238529, "balance_loss_mlp": 1.01741505, "epoch": 0.9660313833944568, "flos": 25009412227200.0, "grad_norm": 1.7713980684198756, "language_loss": 0.73436832, "learning_rate": 1.2051604805666027e-08, "loss": 0.75634199, "num_input_tokens_seen": 173424720, "step": 8034, "time_per_iteration": 3.6951820850372314 }, { "auxiliary_loss_clip": 0.01165425, "auxiliary_loss_mlp": 0.01121965, "balance_loss_clip": 1.04789662, "balance_loss_mlp": 0.0, "epoch": 0.9661516262850959, "flos": 11801575895040.0, "grad_norm": 5.966715532263929, "language_loss": 0.78419745, "learning_rate": 1.196636916198135e-08, "loss": 0.80707133, "num_input_tokens_seen": 173442260, "step": 8035, "time_per_iteration": 2.5641746520996094 }, { "auxiliary_loss_clip": 0.01168666, "auxiliary_loss_mlp": 0.01024551, "balance_loss_clip": 1.04758334, "balance_loss_mlp": 1.01758933, "epoch": 0.9662718691757349, "flos": 20047778766720.0, "grad_norm": 1.8029659716642532, "language_loss": 0.76941407, "learning_rate": 1.1881435101418036e-08, "loss": 0.79134619, "num_input_tokens_seen": 173461675, "step": 8036, "time_per_iteration": 2.6389894485473633 }, { "auxiliary_loss_clip": 0.01062477, "auxiliary_loss_mlp": 0.01001643, "balance_loss_clip": 0.93440735, "balance_loss_mlp": 1.0001049, "epoch": 0.9663921120663741, "flos": 68027703517440.0, "grad_norm": 0.7273379434810217, "language_loss": 0.65550792, "learning_rate": 1.1796802636860003e-08, "loss": 0.67614913, "num_input_tokens_seen": 173530205, "step": 8037, "time_per_iteration": 3.2602546215057373 }, { "auxiliary_loss_clip": 0.01169082, "auxiliary_loss_mlp": 0.01019631, "balance_loss_clip": 1.04835916, "balance_loss_mlp": 1.01288342, "epoch": 0.9665123549570132, "flos": 26322916769280.0, "grad_norm": 1.8956750826542534, "language_loss": 0.73617578, "learning_rate": 1.1712471781146316e-08, "loss": 0.75806296, "num_input_tokens_seen": 173549540, "step": 8038, "time_per_iteration": 2.670753002166748 }, { "auxiliary_loss_clip": 0.01164839, "auxiliary_loss_mlp": 0.01027201, "balance_loss_clip": 1.04677773, "balance_loss_mlp": 1.020293, "epoch": 0.9666325978476522, "flos": 43941121557120.0, "grad_norm": 2.0506880056992465, "language_loss": 0.67173976, "learning_rate": 1.1628442547069628e-08, "loss": 0.6936602, "num_input_tokens_seen": 173571740, "step": 8039, "time_per_iteration": 2.7649545669555664 }, { "auxiliary_loss_clip": 0.01167167, "auxiliary_loss_mlp": 0.01122575, "balance_loss_clip": 1.00823843, "balance_loss_mlp": 0.0, "epoch": 0.9667528407382914, "flos": 21543422198400.0, "grad_norm": 1.8775985644056856, "language_loss": 0.77592707, "learning_rate": 1.1544714947377521e-08, "loss": 0.79882443, "num_input_tokens_seen": 173589425, "step": 8040, "time_per_iteration": 2.6302521228790283 }, { "auxiliary_loss_clip": 0.01169451, "auxiliary_loss_mlp": 0.01023782, "balance_loss_clip": 1.04946053, "balance_loss_mlp": 1.01642632, "epoch": 0.9668730836289304, "flos": 23878585278720.0, "grad_norm": 2.9312816003802395, "language_loss": 0.70184267, "learning_rate": 1.1461288994770945e-08, "loss": 0.72377503, "num_input_tokens_seen": 173608500, "step": 8041, "time_per_iteration": 2.623030662536621 }, { "auxiliary_loss_clip": 0.01168167, "auxiliary_loss_mlp": 0.01026814, "balance_loss_clip": 1.04674828, "balance_loss_mlp": 1.01969683, "epoch": 0.9669933265195695, "flos": 28293011971200.0, "grad_norm": 1.8203514287027966, "language_loss": 0.77570999, "learning_rate": 1.1378164701906002e-08, "loss": 0.79765975, "num_input_tokens_seen": 173630265, "step": 8042, "time_per_iteration": 2.6581764221191406 }, { "auxiliary_loss_clip": 0.01169686, "auxiliary_loss_mlp": 0.01020796, "balance_loss_clip": 1.04932666, "balance_loss_mlp": 1.0135448, "epoch": 0.9671135694102087, "flos": 22454763091200.0, "grad_norm": 3.274484149888707, "language_loss": 0.666897, "learning_rate": 1.1295342081392156e-08, "loss": 0.68880177, "num_input_tokens_seen": 173649625, "step": 8043, "time_per_iteration": 2.645035743713379 }, { "auxiliary_loss_clip": 0.01169005, "auxiliary_loss_mlp": 0.01025572, "balance_loss_clip": 0.97130138, "balance_loss_mlp": 1.01895618, "epoch": 0.9672338123008477, "flos": 20155941596160.0, "grad_norm": 1.7106520148187574, "language_loss": 0.69005638, "learning_rate": 1.1212821145793804e-08, "loss": 0.71200216, "num_input_tokens_seen": 173669240, "step": 8044, "time_per_iteration": 2.688229560852051 }, { "auxiliary_loss_clip": 0.01161403, "auxiliary_loss_mlp": 0.01028418, "balance_loss_clip": 0.96906793, "balance_loss_mlp": 1.02162349, "epoch": 0.9673540551914868, "flos": 16977487939200.0, "grad_norm": 2.0537132175701225, "language_loss": 0.78892595, "learning_rate": 1.1130601907629156e-08, "loss": 0.81082416, "num_input_tokens_seen": 173686970, "step": 8045, "time_per_iteration": 2.666618824005127 }, { "auxiliary_loss_clip": 0.01062263, "auxiliary_loss_mlp": 0.01001903, "balance_loss_clip": 0.97042918, "balance_loss_mlp": 1.00013852, "epoch": 0.9674742980821259, "flos": 61892903952000.0, "grad_norm": 1.0044212892304918, "language_loss": 0.64910311, "learning_rate": 1.1048684379370899e-08, "loss": 0.66974473, "num_input_tokens_seen": 173747655, "step": 8046, "time_per_iteration": 3.1868321895599365 }, { "auxiliary_loss_clip": 0.01150991, "auxiliary_loss_mlp": 0.01023028, "balance_loss_clip": 0.96862739, "balance_loss_mlp": 1.01630187, "epoch": 0.967594540972765, "flos": 18697824898560.0, "grad_norm": 1.9325350729555373, "language_loss": 0.74374771, "learning_rate": 1.0967068573445759e-08, "loss": 0.76548791, "num_input_tokens_seen": 173765140, "step": 8047, "time_per_iteration": 2.721588611602783 }, { "auxiliary_loss_clip": 0.01159647, "auxiliary_loss_mlp": 0.01023958, "balance_loss_clip": 0.96913576, "balance_loss_mlp": 1.01654613, "epoch": 0.967714783863404, "flos": 20777411733120.0, "grad_norm": 2.022807053661025, "language_loss": 0.65147537, "learning_rate": 1.0885754502234945e-08, "loss": 0.67331141, "num_input_tokens_seen": 173784800, "step": 8048, "time_per_iteration": 2.700669527053833 }, { "auxiliary_loss_clip": 0.0116399, "auxiliary_loss_mlp": 0.01027892, "balance_loss_clip": 0.93404758, "balance_loss_mlp": 1.02102816, "epoch": 0.9678350267540432, "flos": 23185473465600.0, "grad_norm": 1.891984371845231, "language_loss": 0.77989209, "learning_rate": 1.08047421780737e-08, "loss": 0.80181092, "num_input_tokens_seen": 173803990, "step": 8049, "time_per_iteration": 2.6865522861480713 }, { "auxiliary_loss_clip": 0.01171966, "auxiliary_loss_mlp": 0.01122803, "balance_loss_clip": 0.97152704, "balance_loss_mlp": 0.0, "epoch": 0.9679552696446823, "flos": 21726063878400.0, "grad_norm": 2.1582788023851336, "language_loss": 0.73803645, "learning_rate": 1.0724031613251305e-08, "loss": 0.76098418, "num_input_tokens_seen": 173821890, "step": 8050, "time_per_iteration": 2.744878053665161 }, { "auxiliary_loss_clip": 0.01171607, "auxiliary_loss_mlp": 0.01030295, "balance_loss_clip": 1.00966907, "balance_loss_mlp": 1.0228591, "epoch": 0.9680755125353213, "flos": 26869046129280.0, "grad_norm": 2.1921253029188787, "language_loss": 0.66465485, "learning_rate": 1.0643622820011744e-08, "loss": 0.68667388, "num_input_tokens_seen": 173842945, "step": 8051, "time_per_iteration": 3.3640637397766113 }, { "auxiliary_loss_clip": 0.011692, "auxiliary_loss_mlp": 0.01025396, "balance_loss_clip": 1.04817986, "balance_loss_mlp": 1.01824379, "epoch": 0.9681957554259605, "flos": 28325008010880.0, "grad_norm": 4.858283258891556, "language_loss": 0.6789335, "learning_rate": 1.0563515810552814e-08, "loss": 0.70087945, "num_input_tokens_seen": 173859915, "step": 8052, "time_per_iteration": 2.6792633533477783 }, { "auxiliary_loss_clip": 0.01168292, "auxiliary_loss_mlp": 0.01028976, "balance_loss_clip": 1.04970622, "balance_loss_mlp": 1.02193379, "epoch": 0.9683159983165995, "flos": 20557674282240.0, "grad_norm": 1.5438303691099302, "language_loss": 0.73362601, "learning_rate": 1.0483710597026795e-08, "loss": 0.75559866, "num_input_tokens_seen": 173879775, "step": 8053, "time_per_iteration": 2.653642416000366 }, { "auxiliary_loss_clip": 0.01159929, "auxiliary_loss_mlp": 0.01023758, "balance_loss_clip": 0.9318127, "balance_loss_mlp": 1.0171417, "epoch": 0.9684362412072386, "flos": 24207958016640.0, "grad_norm": 2.0643506372303153, "language_loss": 0.74168611, "learning_rate": 1.0404207191540227e-08, "loss": 0.76352298, "num_input_tokens_seen": 173900230, "step": 8054, "time_per_iteration": 2.7449488639831543 }, { "auxiliary_loss_clip": 0.01163696, "auxiliary_loss_mlp": 0.01027101, "balance_loss_clip": 1.04585671, "balance_loss_mlp": 1.02009773, "epoch": 0.9685564840978778, "flos": 22346241125760.0, "grad_norm": 1.8167053861297857, "language_loss": 0.74496651, "learning_rate": 1.0325005606153236e-08, "loss": 0.76687455, "num_input_tokens_seen": 173919690, "step": 8055, "time_per_iteration": 2.612234354019165 }, { "auxiliary_loss_clip": 0.01163474, "auxiliary_loss_mlp": 0.01031872, "balance_loss_clip": 0.89278865, "balance_loss_mlp": 1.02485681, "epoch": 0.9686767269885168, "flos": 14386389477120.0, "grad_norm": 2.608042058766756, "language_loss": 0.79353642, "learning_rate": 1.0246105852881104e-08, "loss": 0.81548989, "num_input_tokens_seen": 173934790, "step": 8056, "time_per_iteration": 3.7222390174865723 }, { "auxiliary_loss_clip": 0.0117116, "auxiliary_loss_mlp": 0.01033742, "balance_loss_clip": 1.04898477, "balance_loss_mlp": 1.0264225, "epoch": 0.9687969698791559, "flos": 21287630471040.0, "grad_norm": 1.898228310766776, "language_loss": 0.78703928, "learning_rate": 1.0167507943692476e-08, "loss": 0.80908835, "num_input_tokens_seen": 173953875, "step": 8057, "time_per_iteration": 2.633263349533081 }, { "auxiliary_loss_clip": 0.01166545, "auxiliary_loss_mlp": 0.01033905, "balance_loss_clip": 1.01219988, "balance_loss_mlp": 1.026636, "epoch": 0.968917212769795, "flos": 19828328624640.0, "grad_norm": 1.938145476133956, "language_loss": 0.7160145, "learning_rate": 1.008921189051093e-08, "loss": 0.73801899, "num_input_tokens_seen": 173971220, "step": 8058, "time_per_iteration": 2.629352331161499 }, { "auxiliary_loss_clip": 0.01168282, "auxiliary_loss_mlp": 0.01023744, "balance_loss_clip": 1.04863667, "balance_loss_mlp": 1.0169462, "epoch": 0.9690374556604341, "flos": 21681749473920.0, "grad_norm": 2.0348446279259553, "language_loss": 0.77132118, "learning_rate": 1.0011217705213848e-08, "loss": 0.79324144, "num_input_tokens_seen": 173989095, "step": 8059, "time_per_iteration": 3.4781365394592285 }, { "auxiliary_loss_clip": 0.01162633, "auxiliary_loss_mlp": 0.01023021, "balance_loss_clip": 1.0090816, "balance_loss_mlp": 1.0162859, "epoch": 0.9691576985510731, "flos": 32635437851520.0, "grad_norm": 1.947747794509957, "language_loss": 0.74746341, "learning_rate": 9.933525399632658e-09, "loss": 0.76932001, "num_input_tokens_seen": 174007330, "step": 8060, "time_per_iteration": 3.5852794647216797 }, { "auxiliary_loss_clip": 0.0116573, "auxiliary_loss_mlp": 0.01021203, "balance_loss_clip": 0.9714421, "balance_loss_mlp": 1.01353204, "epoch": 0.9692779414417123, "flos": 35663174040960.0, "grad_norm": 1.6539227492695832, "language_loss": 0.65313327, "learning_rate": 9.856134985553488e-09, "loss": 0.67500263, "num_input_tokens_seen": 174027055, "step": 8061, "time_per_iteration": 2.748521327972412 }, { "auxiliary_loss_clip": 0.0116652, "auxiliary_loss_mlp": 0.01023789, "balance_loss_clip": 1.04784298, "balance_loss_mlp": 1.01705408, "epoch": 0.9693981843323514, "flos": 28366952117760.0, "grad_norm": 1.9629749174923372, "language_loss": 0.73653591, "learning_rate": 9.77904647471628e-09, "loss": 0.75843894, "num_input_tokens_seen": 174050235, "step": 8062, "time_per_iteration": 2.73752498626709 }, { "auxiliary_loss_clip": 0.01151844, "auxiliary_loss_mlp": 0.0102218, "balance_loss_clip": 0.89185643, "balance_loss_mlp": 1.01486373, "epoch": 0.9695184272229904, "flos": 23622865378560.0, "grad_norm": 1.7244587056431595, "language_loss": 0.73925906, "learning_rate": 9.702259878815454e-09, "loss": 0.76099932, "num_input_tokens_seen": 174070560, "step": 8063, "time_per_iteration": 2.7799220085144043 }, { "auxiliary_loss_clip": 0.01170443, "auxiliary_loss_mlp": 0.01026713, "balance_loss_clip": 1.01178098, "balance_loss_mlp": 1.0190593, "epoch": 0.9696386701136296, "flos": 23294677789440.0, "grad_norm": 2.0830804797038818, "language_loss": 0.74225569, "learning_rate": 9.625775209499254e-09, "loss": 0.76422727, "num_input_tokens_seen": 174090565, "step": 8064, "time_per_iteration": 2.7156453132629395 }, { "auxiliary_loss_clip": 0.01151227, "auxiliary_loss_mlp": 0.0101886, "balance_loss_clip": 0.92880988, "balance_loss_mlp": 1.01237798, "epoch": 0.9697589130042686, "flos": 15121876360320.0, "grad_norm": 2.142304210086046, "language_loss": 0.74020052, "learning_rate": 9.549592478370172e-09, "loss": 0.76190138, "num_input_tokens_seen": 174108745, "step": 8065, "time_per_iteration": 2.6548359394073486 }, { "auxiliary_loss_clip": 0.01167332, "auxiliary_loss_mlp": 0.0102293, "balance_loss_clip": 1.00803041, "balance_loss_mlp": 1.01635242, "epoch": 0.9698791558949077, "flos": 18879532824960.0, "grad_norm": 1.6262132093329835, "language_loss": 0.79267257, "learning_rate": 9.473711696985632e-09, "loss": 0.8145752, "num_input_tokens_seen": 174128075, "step": 8066, "time_per_iteration": 2.6352696418762207 }, { "auxiliary_loss_clip": 0.01164989, "auxiliary_loss_mlp": 0.01024694, "balance_loss_clip": 0.97088265, "balance_loss_mlp": 1.01718962, "epoch": 0.9699993987855468, "flos": 17931455297280.0, "grad_norm": 3.426823068048439, "language_loss": 0.76013875, "learning_rate": 9.398132876856201e-09, "loss": 0.78203559, "num_input_tokens_seen": 174147040, "step": 8067, "time_per_iteration": 2.5858941078186035 }, { "auxiliary_loss_clip": 0.010685, "auxiliary_loss_mlp": 0.01002398, "balance_loss_clip": 0.85986233, "balance_loss_mlp": 1.000705, "epoch": 0.9701196416761859, "flos": 67182186297600.0, "grad_norm": 0.7918418660527632, "language_loss": 0.60820603, "learning_rate": 9.322856029447379e-09, "loss": 0.62891495, "num_input_tokens_seen": 174208225, "step": 8068, "time_per_iteration": 3.1730079650878906 }, { "auxiliary_loss_clip": 0.01164377, "auxiliary_loss_mlp": 0.0102707, "balance_loss_clip": 1.04841053, "balance_loss_mlp": 1.02031088, "epoch": 0.970239884566825, "flos": 24277804012800.0, "grad_norm": 1.7645952310360746, "language_loss": 0.80175918, "learning_rate": 9.247881166178695e-09, "loss": 0.82367361, "num_input_tokens_seen": 174226935, "step": 8069, "time_per_iteration": 2.581282138824463 }, { "auxiliary_loss_clip": 0.01171731, "auxiliary_loss_mlp": 0.01024069, "balance_loss_clip": 0.93366784, "balance_loss_mlp": 1.01728582, "epoch": 0.970360127457464, "flos": 25301689194240.0, "grad_norm": 2.7709863552538496, "language_loss": 0.76359451, "learning_rate": 9.173208298423274e-09, "loss": 0.78555256, "num_input_tokens_seen": 174248140, "step": 8070, "time_per_iteration": 2.7355403900146484 }, { "auxiliary_loss_clip": 0.01162635, "auxiliary_loss_mlp": 0.01122642, "balance_loss_clip": 0.89566755, "balance_loss_mlp": 0.0, "epoch": 0.9704803703481032, "flos": 29572473398400.0, "grad_norm": 1.5122292040812995, "language_loss": 0.7586627, "learning_rate": 9.09883743750961e-09, "loss": 0.78151548, "num_input_tokens_seen": 174271030, "step": 8071, "time_per_iteration": 2.7832400798797607 }, { "auxiliary_loss_clip": 0.0116294, "auxiliary_loss_mlp": 0.01027182, "balance_loss_clip": 0.97058761, "balance_loss_mlp": 1.02030349, "epoch": 0.9706006132387422, "flos": 17380046638080.0, "grad_norm": 1.6573774772829877, "language_loss": 0.83894223, "learning_rate": 9.024768594719124e-09, "loss": 0.86084348, "num_input_tokens_seen": 174289410, "step": 8072, "time_per_iteration": 2.6429102420806885 }, { "auxiliary_loss_clip": 0.01163988, "auxiliary_loss_mlp": 0.01024278, "balance_loss_clip": 0.93133551, "balance_loss_mlp": 1.0174799, "epoch": 0.9707208561293813, "flos": 18186421011840.0, "grad_norm": 2.1562739710768017, "language_loss": 0.7271831, "learning_rate": 8.95100178128816e-09, "loss": 0.74906576, "num_input_tokens_seen": 174308550, "step": 8073, "time_per_iteration": 2.6259877681732178 }, { "auxiliary_loss_clip": 0.01163042, "auxiliary_loss_mlp": 0.01028468, "balance_loss_clip": 0.96979141, "balance_loss_mlp": 1.02145231, "epoch": 0.9708410990200205, "flos": 31248388212480.0, "grad_norm": 1.6958042106786102, "language_loss": 0.70145988, "learning_rate": 8.877537008407321e-09, "loss": 0.72337496, "num_input_tokens_seen": 174328600, "step": 8074, "time_per_iteration": 2.713186740875244 }, { "auxiliary_loss_clip": 0.01169191, "auxiliary_loss_mlp": 0.01028383, "balance_loss_clip": 0.97116017, "balance_loss_mlp": 1.02151918, "epoch": 0.9709613419106595, "flos": 30554450386560.0, "grad_norm": 1.6527662051078698, "language_loss": 0.68753743, "learning_rate": 8.804374287221028e-09, "loss": 0.70951319, "num_input_tokens_seen": 174349835, "step": 8075, "time_per_iteration": 2.891246795654297 }, { "auxiliary_loss_clip": 0.01150418, "auxiliary_loss_mlp": 0.01023245, "balance_loss_clip": 0.92733175, "balance_loss_mlp": 1.01636696, "epoch": 0.9710815848012986, "flos": 23730166281600.0, "grad_norm": 1.6026836752598832, "language_loss": 0.84663904, "learning_rate": 8.731513628827958e-09, "loss": 0.86837572, "num_input_tokens_seen": 174369200, "step": 8076, "time_per_iteration": 3.572213649749756 }, { "auxiliary_loss_clip": 0.01167717, "auxiliary_loss_mlp": 0.01030216, "balance_loss_clip": 1.0102191, "balance_loss_mlp": 1.02339172, "epoch": 0.9712018276919377, "flos": 23761875012480.0, "grad_norm": 2.2336005712341263, "language_loss": 0.8231734, "learning_rate": 8.658955044280825e-09, "loss": 0.84515274, "num_input_tokens_seen": 174388125, "step": 8077, "time_per_iteration": 2.6542553901672363 }, { "auxiliary_loss_clip": 0.01164589, "auxiliary_loss_mlp": 0.01025102, "balance_loss_clip": 1.00893986, "balance_loss_mlp": 1.01802707, "epoch": 0.9713220705825768, "flos": 23330983461120.0, "grad_norm": 1.579382220136877, "language_loss": 0.77592695, "learning_rate": 8.586698544587268e-09, "loss": 0.79782391, "num_input_tokens_seen": 174409735, "step": 8078, "time_per_iteration": 2.68255615234375 }, { "auxiliary_loss_clip": 0.01151294, "auxiliary_loss_mlp": 0.01027208, "balance_loss_clip": 0.96786064, "balance_loss_mlp": 1.02027631, "epoch": 0.9714423134732159, "flos": 22200946611840.0, "grad_norm": 1.7532419490252398, "language_loss": 0.73961949, "learning_rate": 8.514744140707853e-09, "loss": 0.76140451, "num_input_tokens_seen": 174428875, "step": 8079, "time_per_iteration": 2.7071070671081543 }, { "auxiliary_loss_clip": 0.0116421, "auxiliary_loss_mlp": 0.01027491, "balance_loss_clip": 1.04666936, "balance_loss_mlp": 1.0205977, "epoch": 0.971562556363855, "flos": 20229917656320.0, "grad_norm": 1.504312732017113, "language_loss": 0.76470417, "learning_rate": 8.443091843558515e-09, "loss": 0.78662121, "num_input_tokens_seen": 174447960, "step": 8080, "time_per_iteration": 2.6435186862945557 }, { "auxiliary_loss_clip": 0.01155722, "auxiliary_loss_mlp": 0.01021758, "balance_loss_clip": 0.96902597, "balance_loss_mlp": 1.01444483, "epoch": 0.9716827992544941, "flos": 24970197553920.0, "grad_norm": 2.422925840657499, "language_loss": 0.64387029, "learning_rate": 8.37174166400878e-09, "loss": 0.66564506, "num_input_tokens_seen": 174463535, "step": 8081, "time_per_iteration": 2.660297393798828 }, { "auxiliary_loss_clip": 0.01168878, "auxiliary_loss_mlp": 0.01027674, "balance_loss_clip": 1.05155194, "balance_loss_mlp": 1.02085209, "epoch": 0.9718030421451331, "flos": 24681476033280.0, "grad_norm": 1.8813138537449308, "language_loss": 0.85146821, "learning_rate": 8.300693612881992e-09, "loss": 0.87343377, "num_input_tokens_seen": 174483600, "step": 8082, "time_per_iteration": 3.5718588829040527 }, { "auxiliary_loss_clip": 0.01163928, "auxiliary_loss_mlp": 0.01122328, "balance_loss_clip": 1.00972128, "balance_loss_mlp": 0.0, "epoch": 0.9719232850357723, "flos": 22090700793600.0, "grad_norm": 1.9082494496561788, "language_loss": 0.8145383, "learning_rate": 8.22994770095664e-09, "loss": 0.83740085, "num_input_tokens_seen": 174502175, "step": 8083, "time_per_iteration": 2.689272880554199 }, { "auxiliary_loss_clip": 0.01167973, "auxiliary_loss_mlp": 0.0102466, "balance_loss_clip": 0.97628438, "balance_loss_mlp": 1.01768935, "epoch": 0.9720435279264114, "flos": 23656908493440.0, "grad_norm": 1.9724830490640735, "language_loss": 0.75260752, "learning_rate": 8.159503938964585e-09, "loss": 0.77453387, "num_input_tokens_seen": 174519495, "step": 8084, "time_per_iteration": 3.5796611309051514 }, { "auxiliary_loss_clip": 0.01152299, "auxiliary_loss_mlp": 0.01028574, "balance_loss_clip": 0.93102348, "balance_loss_mlp": 1.02203834, "epoch": 0.9721637708170504, "flos": 28365910623360.0, "grad_norm": 1.7569672118032869, "language_loss": 0.70318949, "learning_rate": 8.089362337592164e-09, "loss": 0.72499824, "num_input_tokens_seen": 174543120, "step": 8085, "time_per_iteration": 2.7706892490386963 }, { "auxiliary_loss_clip": 0.01160114, "auxiliary_loss_mlp": 0.01026475, "balance_loss_clip": 0.97198665, "balance_loss_mlp": 1.0194118, "epoch": 0.9722840137076896, "flos": 29130807767040.0, "grad_norm": 1.5240072213371356, "language_loss": 0.71847713, "learning_rate": 8.019522907479536e-09, "loss": 0.74034303, "num_input_tokens_seen": 174563480, "step": 8086, "time_per_iteration": 3.7434065341949463 }, { "auxiliary_loss_clip": 0.01169981, "auxiliary_loss_mlp": 0.01026924, "balance_loss_clip": 1.0122391, "balance_loss_mlp": 1.02041817, "epoch": 0.9724042565983286, "flos": 19243954258560.0, "grad_norm": 2.0111666047849988, "language_loss": 0.77185035, "learning_rate": 7.949985659221558e-09, "loss": 0.79381943, "num_input_tokens_seen": 174580745, "step": 8087, "time_per_iteration": 2.611487627029419 }, { "auxiliary_loss_clip": 0.01167342, "auxiliary_loss_mlp": 0.01023977, "balance_loss_clip": 0.97038054, "balance_loss_mlp": 1.0169313, "epoch": 0.9725244994889677, "flos": 23039676161280.0, "grad_norm": 2.0961880324250424, "language_loss": 0.78968304, "learning_rate": 7.880750603366904e-09, "loss": 0.81159627, "num_input_tokens_seen": 174599615, "step": 8088, "time_per_iteration": 2.709087371826172 }, { "auxiliary_loss_clip": 0.01172038, "auxiliary_loss_mlp": 0.01027808, "balance_loss_clip": 0.93221664, "balance_loss_mlp": 1.02046812, "epoch": 0.9726447423796069, "flos": 23367468700800.0, "grad_norm": 1.9241377705338119, "language_loss": 0.79655576, "learning_rate": 7.811817750418282e-09, "loss": 0.81855416, "num_input_tokens_seen": 174618375, "step": 8089, "time_per_iteration": 2.6777191162109375 }, { "auxiliary_loss_clip": 0.01162224, "auxiliary_loss_mlp": 0.01027936, "balance_loss_clip": 0.93515277, "balance_loss_mlp": 1.02036047, "epoch": 0.9727649852702459, "flos": 26541648639360.0, "grad_norm": 1.5498182434640115, "language_loss": 0.80041951, "learning_rate": 7.743187110833105e-09, "loss": 0.82232112, "num_input_tokens_seen": 174641135, "step": 8090, "time_per_iteration": 2.7629823684692383 }, { "auxiliary_loss_clip": 0.01164491, "auxiliary_loss_mlp": 0.0101983, "balance_loss_clip": 0.96846753, "balance_loss_mlp": 1.01299906, "epoch": 0.972885228160885, "flos": 20522338277760.0, "grad_norm": 1.5923765064624076, "language_loss": 0.80664897, "learning_rate": 7.674858695022602e-09, "loss": 0.82849222, "num_input_tokens_seen": 174659490, "step": 8091, "time_per_iteration": 2.752312421798706 }, { "auxiliary_loss_clip": 0.01169025, "auxiliary_loss_mlp": 0.01027721, "balance_loss_clip": 1.04846597, "balance_loss_mlp": 1.02046967, "epoch": 0.9730054710515241, "flos": 17566064196480.0, "grad_norm": 2.6078968119886015, "language_loss": 0.7684558, "learning_rate": 7.606832513351591e-09, "loss": 0.79042327, "num_input_tokens_seen": 174677440, "step": 8092, "time_per_iteration": 2.756263017654419 }, { "auxiliary_loss_clip": 0.01058625, "auxiliary_loss_mlp": 0.01115537, "balance_loss_clip": 1.00773883, "balance_loss_mlp": 0.0, "epoch": 0.9731257139421632, "flos": 68972010117120.0, "grad_norm": 0.8266263775524545, "language_loss": 0.63971663, "learning_rate": 7.539108576140264e-09, "loss": 0.66145825, "num_input_tokens_seen": 174741550, "step": 8093, "time_per_iteration": 3.2259814739227295 }, { "auxiliary_loss_clip": 0.01160125, "auxiliary_loss_mlp": 0.01025527, "balance_loss_clip": 0.89374483, "balance_loss_mlp": 1.01842785, "epoch": 0.9732459568328022, "flos": 18478841633280.0, "grad_norm": 1.828270014218134, "language_loss": 0.6998421, "learning_rate": 7.471686893661732e-09, "loss": 0.72169858, "num_input_tokens_seen": 174759845, "step": 8094, "time_per_iteration": 2.728980779647827 }, { "auxiliary_loss_clip": 0.01162123, "auxiliary_loss_mlp": 0.01024103, "balance_loss_clip": 0.972565, "balance_loss_mlp": 1.01744843, "epoch": 0.9733661997234414, "flos": 20883886623360.0, "grad_norm": 1.5875210166620684, "language_loss": 0.64041084, "learning_rate": 7.4045674761442636e-09, "loss": 0.66227311, "num_input_tokens_seen": 174777175, "step": 8095, "time_per_iteration": 2.6572353839874268 }, { "auxiliary_loss_clip": 0.01167118, "auxiliary_loss_mlp": 0.01122359, "balance_loss_clip": 1.04878676, "balance_loss_mlp": 0.0, "epoch": 0.9734864426140805, "flos": 23766795175680.0, "grad_norm": 3.1852769283661453, "language_loss": 0.74401414, "learning_rate": 7.337750333769488e-09, "loss": 0.76690888, "num_input_tokens_seen": 174796980, "step": 8096, "time_per_iteration": 2.579968214035034 }, { "auxiliary_loss_clip": 0.0116581, "auxiliary_loss_mlp": 0.0102453, "balance_loss_clip": 0.96716464, "balance_loss_mlp": 1.01703489, "epoch": 0.9736066855047195, "flos": 35042422176000.0, "grad_norm": 1.79728026564238, "language_loss": 0.7264505, "learning_rate": 7.2712354766737425e-09, "loss": 0.74835384, "num_input_tokens_seen": 174817310, "step": 8097, "time_per_iteration": 2.812026023864746 }, { "auxiliary_loss_clip": 0.01157497, "auxiliary_loss_mlp": 0.01031251, "balance_loss_clip": 0.93557626, "balance_loss_mlp": 1.0240804, "epoch": 0.9737269283953586, "flos": 20410620001920.0, "grad_norm": 1.4879590130287903, "language_loss": 0.80688721, "learning_rate": 7.2050229149469565e-09, "loss": 0.82877469, "num_input_tokens_seen": 174837320, "step": 8098, "time_per_iteration": 2.669309139251709 }, { "auxiliary_loss_clip": 0.01162912, "auxiliary_loss_mlp": 0.01021942, "balance_loss_clip": 0.92877054, "balance_loss_mlp": 1.01524234, "epoch": 0.9738471712859977, "flos": 28911680847360.0, "grad_norm": 1.806164666717163, "language_loss": 0.63622147, "learning_rate": 7.139112658633984e-09, "loss": 0.65807003, "num_input_tokens_seen": 174857470, "step": 8099, "time_per_iteration": 2.8545584678649902 }, { "auxiliary_loss_clip": 0.01159832, "auxiliary_loss_mlp": 0.01024637, "balance_loss_clip": 0.93365562, "balance_loss_mlp": 1.01720464, "epoch": 0.9739674141766368, "flos": 27782326356480.0, "grad_norm": 2.004598637347306, "language_loss": 0.70293105, "learning_rate": 7.073504717733048e-09, "loss": 0.72477579, "num_input_tokens_seen": 174877035, "step": 8100, "time_per_iteration": 2.717125415802002 }, { "auxiliary_loss_clip": 0.01071955, "auxiliary_loss_mlp": 0.010035, "balance_loss_clip": 0.86194968, "balance_loss_mlp": 1.00186634, "epoch": 0.9740876570672758, "flos": 68863057188480.0, "grad_norm": 0.7409797552107715, "language_loss": 0.57238936, "learning_rate": 7.008199102196855e-09, "loss": 0.59314388, "num_input_tokens_seen": 174938460, "step": 8101, "time_per_iteration": 3.282203197479248 }, { "auxiliary_loss_clip": 0.0105784, "auxiliary_loss_mlp": 0.01003582, "balance_loss_clip": 0.93535209, "balance_loss_mlp": 1.00210345, "epoch": 0.974207899957915, "flos": 58236622646400.0, "grad_norm": 0.8008466240583862, "language_loss": 0.59006679, "learning_rate": 6.9431958219321464e-09, "loss": 0.61068094, "num_input_tokens_seen": 174994625, "step": 8102, "time_per_iteration": 3.960994243621826 }, { "auxiliary_loss_clip": 0.01161766, "auxiliary_loss_mlp": 0.01024641, "balance_loss_clip": 0.9686594, "balance_loss_mlp": 1.01806021, "epoch": 0.9743281428485541, "flos": 22600057605120.0, "grad_norm": 1.5245818331902306, "language_loss": 0.77643871, "learning_rate": 6.878494886800146e-09, "loss": 0.79830277, "num_input_tokens_seen": 175015400, "step": 8103, "time_per_iteration": 2.663984775543213 }, { "auxiliary_loss_clip": 0.01167385, "auxiliary_loss_mlp": 0.01020092, "balance_loss_clip": 0.97276711, "balance_loss_mlp": 1.01336563, "epoch": 0.9744483857391931, "flos": 20008815488640.0, "grad_norm": 1.7824264228835964, "language_loss": 0.76167858, "learning_rate": 6.814096306615669e-09, "loss": 0.78355336, "num_input_tokens_seen": 175033540, "step": 8104, "time_per_iteration": 2.6597304344177246 }, { "auxiliary_loss_clip": 0.01170105, "auxiliary_loss_mlp": 0.01023829, "balance_loss_clip": 0.9695971, "balance_loss_mlp": 1.01645041, "epoch": 0.9745686286298323, "flos": 17675268520320.0, "grad_norm": 2.185135623756616, "language_loss": 0.6560607, "learning_rate": 6.750000091148011e-09, "loss": 0.67800009, "num_input_tokens_seen": 175050835, "step": 8105, "time_per_iteration": 2.7212259769439697 }, { "auxiliary_loss_clip": 0.01168946, "auxiliary_loss_mlp": 0.0102266, "balance_loss_clip": 1.04875469, "balance_loss_mlp": 1.01585579, "epoch": 0.9746888715204713, "flos": 29460252332160.0, "grad_norm": 1.909760507355269, "language_loss": 0.72705585, "learning_rate": 6.686206250120729e-09, "loss": 0.74897188, "num_input_tokens_seen": 175072330, "step": 8106, "time_per_iteration": 2.7532482147216797 }, { "auxiliary_loss_clip": 0.01168218, "auxiliary_loss_mlp": 0.01023766, "balance_loss_clip": 0.92976534, "balance_loss_mlp": 1.01610041, "epoch": 0.9748091144111104, "flos": 18479308510080.0, "grad_norm": 2.046033113248377, "language_loss": 0.74886537, "learning_rate": 6.622714793210749e-09, "loss": 0.77078521, "num_input_tokens_seen": 175091250, "step": 8107, "time_per_iteration": 2.787367820739746 }, { "auxiliary_loss_clip": 0.01170405, "auxiliary_loss_mlp": 0.01023538, "balance_loss_clip": 1.04913116, "balance_loss_mlp": 1.01710975, "epoch": 0.9749293573017496, "flos": 20665154753280.0, "grad_norm": 1.6567364055842952, "language_loss": 0.78694367, "learning_rate": 6.559525730050364e-09, "loss": 0.80888307, "num_input_tokens_seen": 175111350, "step": 8108, "time_per_iteration": 3.693730354309082 }, { "auxiliary_loss_clip": 0.01165655, "auxiliary_loss_mlp": 0.01029829, "balance_loss_clip": 0.93352973, "balance_loss_mlp": 1.02315021, "epoch": 0.9750496001923886, "flos": 18478590238080.0, "grad_norm": 1.8699346345468542, "language_loss": 0.76410902, "learning_rate": 6.496639070224574e-09, "loss": 0.78606391, "num_input_tokens_seen": 175129835, "step": 8109, "time_per_iteration": 2.6949989795684814 }, { "auxiliary_loss_clip": 0.01170428, "auxiliary_loss_mlp": 0.01021991, "balance_loss_clip": 1.01068461, "balance_loss_mlp": 1.01556814, "epoch": 0.9751698430830277, "flos": 19572967860480.0, "grad_norm": 2.716105981515384, "language_loss": 0.83558518, "learning_rate": 6.4340548232739714e-09, "loss": 0.85750937, "num_input_tokens_seen": 175146035, "step": 8110, "time_per_iteration": 2.6156206130981445 }, { "auxiliary_loss_clip": 0.0116726, "auxiliary_loss_mlp": 0.01021908, "balance_loss_clip": 0.93199599, "balance_loss_mlp": 1.01563478, "epoch": 0.9752900859736668, "flos": 23550325862400.0, "grad_norm": 2.135495772600366, "language_loss": 0.79235083, "learning_rate": 6.371772998692071e-09, "loss": 0.81424254, "num_input_tokens_seen": 175165290, "step": 8111, "time_per_iteration": 4.451690912246704 }, { "auxiliary_loss_clip": 0.01162785, "auxiliary_loss_mlp": 0.01025086, "balance_loss_clip": 0.92917007, "balance_loss_mlp": 1.01823735, "epoch": 0.9754103288643059, "flos": 20303211358080.0, "grad_norm": 2.3005055298909642, "language_loss": 0.6438235, "learning_rate": 6.309793605927094e-09, "loss": 0.66570228, "num_input_tokens_seen": 175183610, "step": 8112, "time_per_iteration": 2.7177956104278564 }, { "auxiliary_loss_clip": 0.01171375, "auxiliary_loss_mlp": 0.01026991, "balance_loss_clip": 0.97172701, "balance_loss_mlp": 1.02049136, "epoch": 0.975530571754945, "flos": 19350680544000.0, "grad_norm": 1.7972573590971834, "language_loss": 0.80044162, "learning_rate": 6.248116654381297e-09, "loss": 0.82242525, "num_input_tokens_seen": 175202080, "step": 8113, "time_per_iteration": 2.618938446044922 }, { "auxiliary_loss_clip": 0.0116663, "auxiliary_loss_mlp": 0.01023674, "balance_loss_clip": 0.96871614, "balance_loss_mlp": 1.01729941, "epoch": 0.9756508146455841, "flos": 23583399310080.0, "grad_norm": 1.8159620108085566, "language_loss": 0.72876692, "learning_rate": 6.186742153410751e-09, "loss": 0.75066996, "num_input_tokens_seen": 175221575, "step": 8114, "time_per_iteration": 2.696841239929199 }, { "auxiliary_loss_clip": 0.01161558, "auxiliary_loss_mlp": 0.01024961, "balance_loss_clip": 0.97043473, "balance_loss_mlp": 1.01751018, "epoch": 0.9757710575362232, "flos": 22966921163520.0, "grad_norm": 3.1758200784520443, "language_loss": 0.87355161, "learning_rate": 6.125670112326453e-09, "loss": 0.89541674, "num_input_tokens_seen": 175240835, "step": 8115, "time_per_iteration": 2.6698532104492188 }, { "auxiliary_loss_clip": 0.01164601, "auxiliary_loss_mlp": 0.01022435, "balance_loss_clip": 1.00679839, "balance_loss_mlp": 1.01531243, "epoch": 0.9758913004268622, "flos": 27966009530880.0, "grad_norm": 1.6567124978550032, "language_loss": 0.70291913, "learning_rate": 6.064900540392548e-09, "loss": 0.72478944, "num_input_tokens_seen": 175262930, "step": 8116, "time_per_iteration": 2.7082297801971436 }, { "auxiliary_loss_clip": 0.0115944, "auxiliary_loss_mlp": 0.01026111, "balance_loss_clip": 0.97136551, "balance_loss_mlp": 1.01959896, "epoch": 0.9760115433175014, "flos": 22200156512640.0, "grad_norm": 1.973323521564592, "language_loss": 0.78734583, "learning_rate": 6.0044334468278835e-09, "loss": 0.80920136, "num_input_tokens_seen": 175282275, "step": 8117, "time_per_iteration": 2.656855821609497 }, { "auxiliary_loss_clip": 0.01158916, "auxiliary_loss_mlp": 0.01022768, "balance_loss_clip": 0.89263618, "balance_loss_mlp": 1.01606512, "epoch": 0.9761317862081405, "flos": 26250736389120.0, "grad_norm": 2.7458327823478226, "language_loss": 0.71394432, "learning_rate": 5.944268840805345e-09, "loss": 0.73576117, "num_input_tokens_seen": 175303020, "step": 8118, "time_per_iteration": 2.767383098602295 }, { "auxiliary_loss_clip": 0.01157629, "auxiliary_loss_mlp": 0.01028038, "balance_loss_clip": 0.93099469, "balance_loss_mlp": 1.02146983, "epoch": 0.9762520290987795, "flos": 26575440359040.0, "grad_norm": 2.1943051586867717, "language_loss": 0.64432114, "learning_rate": 5.88440673145163e-09, "loss": 0.66617787, "num_input_tokens_seen": 175324070, "step": 8119, "time_per_iteration": 2.873309850692749 }, { "auxiliary_loss_clip": 0.01163692, "auxiliary_loss_mlp": 0.01027471, "balance_loss_clip": 1.0122571, "balance_loss_mlp": 1.02070343, "epoch": 0.9763722719894187, "flos": 18005036307840.0, "grad_norm": 2.413375308805847, "language_loss": 0.82572114, "learning_rate": 5.824847127848142e-09, "loss": 0.84763277, "num_input_tokens_seen": 175342595, "step": 8120, "time_per_iteration": 2.7870311737060547 }, { "auxiliary_loss_clip": 0.01167938, "auxiliary_loss_mlp": 0.01028601, "balance_loss_clip": 0.89713323, "balance_loss_mlp": 1.02115083, "epoch": 0.9764925148800577, "flos": 22455660931200.0, "grad_norm": 2.242279544813859, "language_loss": 0.78912342, "learning_rate": 5.765590039029433e-09, "loss": 0.81108886, "num_input_tokens_seen": 175361915, "step": 8121, "time_per_iteration": 2.7142696380615234 }, { "auxiliary_loss_clip": 0.01169321, "auxiliary_loss_mlp": 0.01027776, "balance_loss_clip": 1.05039036, "balance_loss_mlp": 1.0205462, "epoch": 0.9766127577706968, "flos": 36757084786560.0, "grad_norm": 1.526109626823739, "language_loss": 0.71069896, "learning_rate": 5.706635473985422e-09, "loss": 0.73266995, "num_input_tokens_seen": 175385785, "step": 8122, "time_per_iteration": 2.6970579624176025 }, { "auxiliary_loss_clip": 0.01163158, "auxiliary_loss_mlp": 0.01025653, "balance_loss_clip": 1.00813353, "balance_loss_mlp": 1.01863122, "epoch": 0.976733000661336, "flos": 22309971367680.0, "grad_norm": 1.6836927162934174, "language_loss": 0.85292476, "learning_rate": 5.6479834416591764e-09, "loss": 0.87481284, "num_input_tokens_seen": 175405145, "step": 8123, "time_per_iteration": 2.6137430667877197 }, { "auxiliary_loss_clip": 0.01163887, "auxiliary_loss_mlp": 0.01123336, "balance_loss_clip": 1.00971913, "balance_loss_mlp": 0.0, "epoch": 0.976853243551975, "flos": 25810938264960.0, "grad_norm": 2.9943148747684294, "language_loss": 0.68274844, "learning_rate": 5.589633950947803e-09, "loss": 0.70562077, "num_input_tokens_seen": 175422645, "step": 8124, "time_per_iteration": 2.6255173683166504 }, { "auxiliary_loss_clip": 0.01160533, "auxiliary_loss_mlp": 0.01029861, "balance_loss_clip": 0.97091621, "balance_loss_mlp": 1.022017, "epoch": 0.9769734864426141, "flos": 21397445326080.0, "grad_norm": 1.839013325264406, "language_loss": 0.69860864, "learning_rate": 5.5315870107035535e-09, "loss": 0.72051263, "num_input_tokens_seen": 175440695, "step": 8125, "time_per_iteration": 2.6707377433776855 }, { "auxiliary_loss_clip": 0.01163037, "auxiliary_loss_mlp": 0.01022369, "balance_loss_clip": 0.9721908, "balance_loss_mlp": 1.01552677, "epoch": 0.9770937293332532, "flos": 13990977584640.0, "grad_norm": 1.694359750613159, "language_loss": 0.78990602, "learning_rate": 5.473842629731607e-09, "loss": 0.81176007, "num_input_tokens_seen": 175459195, "step": 8126, "time_per_iteration": 2.5954716205596924 }, { "auxiliary_loss_clip": 0.01171324, "auxiliary_loss_mlp": 0.01122907, "balance_loss_clip": 0.97031599, "balance_loss_mlp": 0.0, "epoch": 0.9772139722238923, "flos": 17931994001280.0, "grad_norm": 4.829122125736299, "language_loss": 0.77767354, "learning_rate": 5.416400816792066e-09, "loss": 0.80061591, "num_input_tokens_seen": 175476710, "step": 8127, "time_per_iteration": 2.6486153602600098 }, { "auxiliary_loss_clip": 0.01162598, "auxiliary_loss_mlp": 0.01020649, "balance_loss_clip": 1.04522729, "balance_loss_mlp": 1.01389539, "epoch": 0.9773342151145313, "flos": 20446171488000.0, "grad_norm": 3.0662719353087926, "language_loss": 0.78622699, "learning_rate": 5.359261580598407e-09, "loss": 0.80805945, "num_input_tokens_seen": 175492550, "step": 8128, "time_per_iteration": 3.320242404937744 }, { "auxiliary_loss_clip": 0.01167014, "auxiliary_loss_mlp": 0.01025659, "balance_loss_clip": 1.00954926, "balance_loss_mlp": 1.01764786, "epoch": 0.9774544580051704, "flos": 11837306949120.0, "grad_norm": 2.400829004620666, "language_loss": 0.7790516, "learning_rate": 5.302424929819027e-09, "loss": 0.8009783, "num_input_tokens_seen": 175506560, "step": 8129, "time_per_iteration": 2.6165757179260254 }, { "auxiliary_loss_clip": 0.01164073, "auxiliary_loss_mlp": 0.010256, "balance_loss_clip": 1.00477481, "balance_loss_mlp": 1.01826823, "epoch": 0.9775747008958096, "flos": 13479932833920.0, "grad_norm": 2.1360964005713163, "language_loss": 0.73137248, "learning_rate": 5.24589087307592e-09, "loss": 0.7532692, "num_input_tokens_seen": 175524180, "step": 8130, "time_per_iteration": 2.5856869220733643 }, { "auxiliary_loss_clip": 0.01166996, "auxiliary_loss_mlp": 0.01023955, "balance_loss_clip": 1.04638553, "balance_loss_mlp": 1.01689816, "epoch": 0.9776949437864486, "flos": 59532314042880.0, "grad_norm": 1.6571100487530397, "language_loss": 0.64721429, "learning_rate": 5.189659418944891e-09, "loss": 0.66912383, "num_input_tokens_seen": 175554355, "step": 8131, "time_per_iteration": 3.025810956954956 }, { "auxiliary_loss_clip": 0.01168295, "auxiliary_loss_mlp": 0.01025964, "balance_loss_clip": 1.04957438, "balance_loss_mlp": 1.01879358, "epoch": 0.9778151866770877, "flos": 21178605715200.0, "grad_norm": 1.8837543619036201, "language_loss": 0.78377855, "learning_rate": 5.133730575956674e-09, "loss": 0.80572116, "num_input_tokens_seen": 175574025, "step": 8132, "time_per_iteration": 2.6156883239746094 }, { "auxiliary_loss_clip": 0.01165442, "auxiliary_loss_mlp": 0.01029161, "balance_loss_clip": 0.97121316, "balance_loss_mlp": 1.02256322, "epoch": 0.9779354295677268, "flos": 20886795624960.0, "grad_norm": 2.1067959931153837, "language_loss": 0.72352481, "learning_rate": 5.0781043525953696e-09, "loss": 0.74547088, "num_input_tokens_seen": 175592090, "step": 8133, "time_per_iteration": 2.648547887802124 }, { "auxiliary_loss_clip": 0.01159908, "auxiliary_loss_mlp": 0.01023666, "balance_loss_clip": 0.97197974, "balance_loss_mlp": 1.01682925, "epoch": 0.9780556724583659, "flos": 23440618748160.0, "grad_norm": 1.9282096386386702, "language_loss": 0.73666561, "learning_rate": 5.0227807572995605e-09, "loss": 0.75850141, "num_input_tokens_seen": 175614065, "step": 8134, "time_per_iteration": 3.6107053756713867 }, { "auxiliary_loss_clip": 0.01164849, "auxiliary_loss_mlp": 0.01022693, "balance_loss_clip": 0.96930945, "balance_loss_mlp": 1.01588583, "epoch": 0.9781759153490049, "flos": 20923244951040.0, "grad_norm": 2.2379583432011287, "language_loss": 0.67407012, "learning_rate": 4.967759798461646e-09, "loss": 0.69594556, "num_input_tokens_seen": 175632410, "step": 8135, "time_per_iteration": 2.6665444374084473 }, { "auxiliary_loss_clip": 0.01165819, "auxiliary_loss_mlp": 0.01021194, "balance_loss_clip": 1.04840815, "balance_loss_mlp": 1.01489127, "epoch": 0.9782961582396441, "flos": 28293191539200.0, "grad_norm": 1.9943840116426985, "language_loss": 0.74420112, "learning_rate": 4.913041484428282e-09, "loss": 0.7660712, "num_input_tokens_seen": 175652885, "step": 8136, "time_per_iteration": 3.562546968460083 }, { "auxiliary_loss_clip": 0.01166679, "auxiliary_loss_mlp": 0.01025933, "balance_loss_clip": 1.00948071, "balance_loss_mlp": 1.01935542, "epoch": 0.9784164011302832, "flos": 25552955808000.0, "grad_norm": 1.7101542308769377, "language_loss": 0.74201202, "learning_rate": 4.858625823500384e-09, "loss": 0.76393813, "num_input_tokens_seen": 175670585, "step": 8137, "time_per_iteration": 3.6405811309814453 }, { "auxiliary_loss_clip": 0.01169002, "auxiliary_loss_mlp": 0.0102843, "balance_loss_clip": 1.00949585, "balance_loss_mlp": 1.02158141, "epoch": 0.9785366440209222, "flos": 29965945956480.0, "grad_norm": 2.0855433871947637, "language_loss": 0.73520386, "learning_rate": 4.80451282393246e-09, "loss": 0.75717813, "num_input_tokens_seen": 175690570, "step": 8138, "time_per_iteration": 2.6625349521636963 }, { "auxiliary_loss_clip": 0.01168956, "auxiliary_loss_mlp": 0.01027774, "balance_loss_clip": 0.97324526, "balance_loss_mlp": 1.02062106, "epoch": 0.9786568869115614, "flos": 32343591847680.0, "grad_norm": 1.9378966038560415, "language_loss": 0.67298269, "learning_rate": 4.750702493933722e-09, "loss": 0.69494998, "num_input_tokens_seen": 175710455, "step": 8139, "time_per_iteration": 2.7426161766052246 }, { "auxiliary_loss_clip": 0.01167587, "auxiliary_loss_mlp": 0.01122611, "balance_loss_clip": 0.97394878, "balance_loss_mlp": 0.0, "epoch": 0.9787771298022004, "flos": 23331414424320.0, "grad_norm": 2.8630546375021626, "language_loss": 0.85507697, "learning_rate": 4.697194841666974e-09, "loss": 0.87797904, "num_input_tokens_seen": 175729380, "step": 8140, "time_per_iteration": 2.6757657527923584 }, { "auxiliary_loss_clip": 0.01167044, "auxiliary_loss_mlp": 0.01026755, "balance_loss_clip": 1.00888848, "balance_loss_mlp": 1.01925969, "epoch": 0.9788973726928395, "flos": 21468548298240.0, "grad_norm": 1.7937823719420452, "language_loss": 0.82119548, "learning_rate": 4.6439898752492764e-09, "loss": 0.84313351, "num_input_tokens_seen": 175749520, "step": 8141, "time_per_iteration": 2.707486867904663 }, { "auxiliary_loss_clip": 0.01062313, "auxiliary_loss_mlp": 0.01115698, "balance_loss_clip": 0.97113591, "balance_loss_mlp": 0.0, "epoch": 0.9790176155834787, "flos": 68897459439360.0, "grad_norm": 0.7572630503230952, "language_loss": 0.63751364, "learning_rate": 4.591087602751731e-09, "loss": 0.65929377, "num_input_tokens_seen": 175811380, "step": 8142, "time_per_iteration": 3.301461935043335 }, { "auxiliary_loss_clip": 0.01165573, "auxiliary_loss_mlp": 0.01025391, "balance_loss_clip": 1.00924134, "balance_loss_mlp": 1.01853609, "epoch": 0.9791378584741177, "flos": 21430877909760.0, "grad_norm": 4.120421731287104, "language_loss": 0.72106308, "learning_rate": 4.538488032199916e-09, "loss": 0.74297273, "num_input_tokens_seen": 175829480, "step": 8143, "time_per_iteration": 2.604741096496582 }, { "auxiliary_loss_clip": 0.01166569, "auxiliary_loss_mlp": 0.01021808, "balance_loss_clip": 1.00628805, "balance_loss_mlp": 1.01457465, "epoch": 0.9792581013647568, "flos": 20153032594560.0, "grad_norm": 3.702613060302514, "language_loss": 0.68813038, "learning_rate": 4.486191171572784e-09, "loss": 0.7100141, "num_input_tokens_seen": 175846750, "step": 8144, "time_per_iteration": 2.65435791015625 }, { "auxiliary_loss_clip": 0.01168879, "auxiliary_loss_mlp": 0.01026143, "balance_loss_clip": 1.01067412, "balance_loss_mlp": 1.01946723, "epoch": 0.9793783442553959, "flos": 23728191033600.0, "grad_norm": 1.481393553972857, "language_loss": 0.77624291, "learning_rate": 4.434197028803766e-09, "loss": 0.7981931, "num_input_tokens_seen": 175865975, "step": 8145, "time_per_iteration": 2.629558801651001 }, { "auxiliary_loss_clip": 0.01171111, "auxiliary_loss_mlp": 0.01030811, "balance_loss_clip": 0.93320906, "balance_loss_mlp": 1.02375114, "epoch": 0.979498587146035, "flos": 23038742407680.0, "grad_norm": 1.9093345084788187, "language_loss": 0.81931818, "learning_rate": 4.3825056117805514e-09, "loss": 0.84133744, "num_input_tokens_seen": 175881860, "step": 8146, "time_per_iteration": 2.697880506515503 }, { "auxiliary_loss_clip": 0.01167028, "auxiliary_loss_mlp": 0.01027267, "balance_loss_clip": 1.04648221, "balance_loss_mlp": 1.01979828, "epoch": 0.979618830036674, "flos": 14318841951360.0, "grad_norm": 2.025338455533955, "language_loss": 0.79238337, "learning_rate": 4.331116928344425e-09, "loss": 0.81432629, "num_input_tokens_seen": 175898175, "step": 8147, "time_per_iteration": 2.6505565643310547 }, { "auxiliary_loss_clip": 0.01168677, "auxiliary_loss_mlp": 0.01123063, "balance_loss_clip": 0.96999192, "balance_loss_mlp": 0.0, "epoch": 0.9797390729273132, "flos": 16727514215040.0, "grad_norm": 2.023404895728938, "language_loss": 0.62639284, "learning_rate": 4.28003098629115e-09, "loss": 0.64931023, "num_input_tokens_seen": 175914310, "step": 8148, "time_per_iteration": 2.663564920425415 }, { "auxiliary_loss_clip": 0.011561, "auxiliary_loss_mlp": 0.01020531, "balance_loss_clip": 0.92661983, "balance_loss_mlp": 1.01366723, "epoch": 0.9798593158179523, "flos": 24532661986560.0, "grad_norm": 1.929647192874403, "language_loss": 0.78566742, "learning_rate": 4.229247793370305e-09, "loss": 0.80743372, "num_input_tokens_seen": 175933435, "step": 8149, "time_per_iteration": 2.751235246658325 }, { "auxiliary_loss_clip": 0.01169583, "auxiliary_loss_mlp": 0.01025941, "balance_loss_clip": 1.04916787, "balance_loss_mlp": 1.01920903, "epoch": 0.9799795587085913, "flos": 27308808339840.0, "grad_norm": 1.6330598678052755, "language_loss": 0.7031129, "learning_rate": 4.178767357285951e-09, "loss": 0.72506815, "num_input_tokens_seen": 175955065, "step": 8150, "time_per_iteration": 2.6528451442718506 }, { "auxiliary_loss_clip": 0.011666, "auxiliary_loss_mlp": 0.0112203, "balance_loss_clip": 1.01000285, "balance_loss_mlp": 0.0, "epoch": 0.9800998015992305, "flos": 26286575184000.0, "grad_norm": 3.245026689391919, "language_loss": 0.71297359, "learning_rate": 4.128589685695516e-09, "loss": 0.73585987, "num_input_tokens_seen": 175975490, "step": 8151, "time_per_iteration": 2.7165842056274414 }, { "auxiliary_loss_clip": 0.01167865, "auxiliary_loss_mlp": 0.01021448, "balance_loss_clip": 1.04794693, "balance_loss_mlp": 1.01430154, "epoch": 0.9802200444898695, "flos": 16723635546240.0, "grad_norm": 2.5534364088707044, "language_loss": 0.84219044, "learning_rate": 4.078714786211135e-09, "loss": 0.86408353, "num_input_tokens_seen": 175991340, "step": 8152, "time_per_iteration": 2.568030834197998 }, { "auxiliary_loss_clip": 0.01163781, "auxiliary_loss_mlp": 0.01025578, "balance_loss_clip": 1.00844145, "balance_loss_mlp": 1.01887548, "epoch": 0.9803402873805086, "flos": 24900459298560.0, "grad_norm": 1.6987496750321902, "language_loss": 0.77022254, "learning_rate": 4.029142666398977e-09, "loss": 0.79211617, "num_input_tokens_seen": 176011505, "step": 8153, "time_per_iteration": 2.7390072345733643 }, { "auxiliary_loss_clip": 0.01164201, "auxiliary_loss_mlp": 0.0102377, "balance_loss_clip": 1.04807127, "balance_loss_mlp": 1.01703179, "epoch": 0.9804605302711478, "flos": 22564937082240.0, "grad_norm": 1.7482197778030912, "language_loss": 0.79866445, "learning_rate": 3.979873333778805e-09, "loss": 0.82054412, "num_input_tokens_seen": 176029680, "step": 8154, "time_per_iteration": 3.4092330932617188 }, { "auxiliary_loss_clip": 0.01170455, "auxiliary_loss_mlp": 0.01024232, "balance_loss_clip": 0.97215068, "balance_loss_mlp": 1.01709116, "epoch": 0.9805807731617868, "flos": 38905368382080.0, "grad_norm": 1.8178541523691594, "language_loss": 0.73803592, "learning_rate": 3.930906795824862e-09, "loss": 0.75998276, "num_input_tokens_seen": 176050355, "step": 8155, "time_per_iteration": 2.820831775665283 }, { "auxiliary_loss_clip": 0.01163087, "auxiliary_loss_mlp": 0.01027591, "balance_loss_clip": 1.0076524, "balance_loss_mlp": 1.02042103, "epoch": 0.9807010160524259, "flos": 17821999578240.0, "grad_norm": 1.874278176342814, "language_loss": 0.76320958, "learning_rate": 3.882243059965207e-09, "loss": 0.78511637, "num_input_tokens_seen": 176068070, "step": 8156, "time_per_iteration": 2.573911666870117 }, { "auxiliary_loss_clip": 0.01159268, "auxiliary_loss_mlp": 0.01020696, "balance_loss_clip": 1.00848699, "balance_loss_mlp": 1.01349592, "epoch": 0.980821258943065, "flos": 13552975140480.0, "grad_norm": 2.563272729463867, "language_loss": 0.65927613, "learning_rate": 3.833882133582156e-09, "loss": 0.68107575, "num_input_tokens_seen": 176083730, "step": 8157, "time_per_iteration": 2.582169771194458 }, { "auxiliary_loss_clip": 0.0116899, "auxiliary_loss_mlp": 0.01023868, "balance_loss_clip": 1.00895071, "balance_loss_mlp": 1.01714766, "epoch": 0.9809415018337041, "flos": 21689794120320.0, "grad_norm": 1.689590281961893, "language_loss": 0.78216875, "learning_rate": 3.785824024012285e-09, "loss": 0.80409729, "num_input_tokens_seen": 176102730, "step": 8158, "time_per_iteration": 2.5888102054595947 }, { "auxiliary_loss_clip": 0.01159842, "auxiliary_loss_mlp": 0.01030159, "balance_loss_clip": 0.97366858, "balance_loss_mlp": 1.02339709, "epoch": 0.9810617447243432, "flos": 23294857357440.0, "grad_norm": 1.48179076793852, "language_loss": 0.78147984, "learning_rate": 3.738068738545541e-09, "loss": 0.80337989, "num_input_tokens_seen": 176121815, "step": 8159, "time_per_iteration": 2.7053911685943604 }, { "auxiliary_loss_clip": 0.01170898, "auxiliary_loss_mlp": 0.01029546, "balance_loss_clip": 1.01075494, "balance_loss_mlp": 1.02239943, "epoch": 0.9811819876149822, "flos": 18332038748160.0, "grad_norm": 2.4317859547236926, "language_loss": 0.78623343, "learning_rate": 3.6906162844265733e-09, "loss": 0.80823791, "num_input_tokens_seen": 176138900, "step": 8160, "time_per_iteration": 3.5508615970611572 }, { "auxiliary_loss_clip": 0.01157351, "auxiliary_loss_mlp": 0.01021228, "balance_loss_clip": 0.96978664, "balance_loss_mlp": 1.01413822, "epoch": 0.9813022305056214, "flos": 22601961025920.0, "grad_norm": 1.7864472294496618, "language_loss": 0.70741808, "learning_rate": 3.643466668853845e-09, "loss": 0.72920394, "num_input_tokens_seen": 176156925, "step": 8161, "time_per_iteration": 2.6759402751922607 }, { "auxiliary_loss_clip": 0.01164744, "auxiliary_loss_mlp": 0.01023386, "balance_loss_clip": 0.97077763, "balance_loss_mlp": 1.01628757, "epoch": 0.9814224733962604, "flos": 25413335642880.0, "grad_norm": 1.8143301576447144, "language_loss": 0.75094438, "learning_rate": 3.59661989898008e-09, "loss": 0.77282566, "num_input_tokens_seen": 176177980, "step": 8162, "time_per_iteration": 3.614973545074463 }, { "auxiliary_loss_clip": 0.01158405, "auxiliary_loss_mlp": 0.01024252, "balance_loss_clip": 0.93394411, "balance_loss_mlp": 1.01757884, "epoch": 0.9815427162868995, "flos": 25007185584000.0, "grad_norm": 1.6501048388937838, "language_loss": 0.76883495, "learning_rate": 3.5500759819115934e-09, "loss": 0.79066151, "num_input_tokens_seen": 176198345, "step": 8163, "time_per_iteration": 2.7723307609558105 }, { "auxiliary_loss_clip": 0.01170147, "auxiliary_loss_mlp": 0.01031013, "balance_loss_clip": 1.05080843, "balance_loss_mlp": 1.02349067, "epoch": 0.9816629591775387, "flos": 20662604887680.0, "grad_norm": 7.173717552307603, "language_loss": 0.81482399, "learning_rate": 3.5038349247094034e-09, "loss": 0.8368355, "num_input_tokens_seen": 176215605, "step": 8164, "time_per_iteration": 3.5346248149871826 }, { "auxiliary_loss_clip": 0.01162213, "auxiliary_loss_mlp": 0.01028632, "balance_loss_clip": 0.96855932, "balance_loss_mlp": 1.02171755, "epoch": 0.9817832020681777, "flos": 17712220636800.0, "grad_norm": 2.2580338136000018, "language_loss": 0.77442944, "learning_rate": 3.4578967343878994e-09, "loss": 0.79633784, "num_input_tokens_seen": 176231810, "step": 8165, "time_per_iteration": 2.683598518371582 }, { "auxiliary_loss_clip": 0.01163607, "auxiliary_loss_mlp": 0.01024379, "balance_loss_clip": 0.97198486, "balance_loss_mlp": 1.01739979, "epoch": 0.9819034449588168, "flos": 22530032040960.0, "grad_norm": 1.824842908334017, "language_loss": 0.80933797, "learning_rate": 3.4122614179161733e-09, "loss": 0.83121777, "num_input_tokens_seen": 176251770, "step": 8166, "time_per_iteration": 2.63075852394104 }, { "auxiliary_loss_clip": 0.01150154, "auxiliary_loss_mlp": 0.01023359, "balance_loss_clip": 0.92979228, "balance_loss_mlp": 1.01655507, "epoch": 0.9820236878494559, "flos": 20011221699840.0, "grad_norm": 1.7299605806655824, "language_loss": 0.78166234, "learning_rate": 3.36692898221691e-09, "loss": 0.80339754, "num_input_tokens_seen": 176270135, "step": 8167, "time_per_iteration": 2.699319839477539 }, { "auxiliary_loss_clip": 0.01164879, "auxiliary_loss_mlp": 0.01026885, "balance_loss_clip": 1.00873613, "balance_loss_mlp": 1.01984, "epoch": 0.982143930740095, "flos": 18807316531200.0, "grad_norm": 2.2172905987668465, "language_loss": 0.73804921, "learning_rate": 3.3218994341668305e-09, "loss": 0.75996685, "num_input_tokens_seen": 176289065, "step": 8168, "time_per_iteration": 2.56595778465271 }, { "auxiliary_loss_clip": 0.01164914, "auxiliary_loss_mlp": 0.01027339, "balance_loss_clip": 1.04922903, "balance_loss_mlp": 1.02049017, "epoch": 0.982264173630734, "flos": 26578026138240.0, "grad_norm": 1.7105705858765587, "language_loss": 0.75444925, "learning_rate": 3.2771727805971373e-09, "loss": 0.77637178, "num_input_tokens_seen": 176310450, "step": 8169, "time_per_iteration": 2.6203699111938477 }, { "auxiliary_loss_clip": 0.01150674, "auxiliary_loss_mlp": 0.01025496, "balance_loss_clip": 0.88934731, "balance_loss_mlp": 1.01838541, "epoch": 0.9823844165213732, "flos": 22014462176640.0, "grad_norm": 1.8286788235234186, "language_loss": 0.770311, "learning_rate": 3.232749028292847e-09, "loss": 0.79207271, "num_input_tokens_seen": 176327415, "step": 8170, "time_per_iteration": 2.7095870971679688 }, { "auxiliary_loss_clip": 0.01166611, "auxiliary_loss_mlp": 0.01021485, "balance_loss_clip": 1.04568768, "balance_loss_mlp": 1.0144341, "epoch": 0.9825046594120123, "flos": 21908166854400.0, "grad_norm": 2.8104796804190335, "language_loss": 0.88312179, "learning_rate": 3.188628183992792e-09, "loss": 0.90500277, "num_input_tokens_seen": 176347680, "step": 8171, "time_per_iteration": 2.608445882797241 }, { "auxiliary_loss_clip": 0.0106248, "auxiliary_loss_mlp": 0.01003098, "balance_loss_clip": 0.97030401, "balance_loss_mlp": 1.00150013, "epoch": 0.9826249023026513, "flos": 59494610718720.0, "grad_norm": 0.7415613965064726, "language_loss": 0.6254316, "learning_rate": 3.1448102543902844e-09, "loss": 0.64608741, "num_input_tokens_seen": 176411595, "step": 8172, "time_per_iteration": 3.141789674758911 }, { "auxiliary_loss_clip": 0.01158269, "auxiliary_loss_mlp": 0.01024392, "balance_loss_clip": 0.97118151, "balance_loss_mlp": 1.0172987, "epoch": 0.9827451451932905, "flos": 16071031296000.0, "grad_norm": 1.9418094600586835, "language_loss": 0.675753, "learning_rate": 3.1012952461324515e-09, "loss": 0.69757962, "num_input_tokens_seen": 176430570, "step": 8173, "time_per_iteration": 2.6425588130950928 }, { "auxiliary_loss_clip": 0.011633, "auxiliary_loss_mlp": 0.01025895, "balance_loss_clip": 1.01131475, "balance_loss_mlp": 1.01917744, "epoch": 0.9828653880839295, "flos": 20262775622400.0, "grad_norm": 2.18278598546064, "language_loss": 0.73700356, "learning_rate": 3.0580831658204575e-09, "loss": 0.75889552, "num_input_tokens_seen": 176448150, "step": 8174, "time_per_iteration": 2.680410861968994 }, { "auxiliary_loss_clip": 0.01163435, "auxiliary_loss_mlp": 0.01022835, "balance_loss_clip": 1.00965333, "balance_loss_mlp": 1.01606429, "epoch": 0.9829856309745686, "flos": 21616141282560.0, "grad_norm": 1.7320181307790563, "language_loss": 0.78166652, "learning_rate": 3.015174020009281e-09, "loss": 0.80352914, "num_input_tokens_seen": 176467475, "step": 8175, "time_per_iteration": 2.749485492706299 }, { "auxiliary_loss_clip": 0.0116358, "auxiliary_loss_mlp": 0.01026614, "balance_loss_clip": 0.93053836, "balance_loss_mlp": 1.01943433, "epoch": 0.9831058738652078, "flos": 23764209396480.0, "grad_norm": 1.7119090458218353, "language_loss": 0.74898022, "learning_rate": 2.9725678152086043e-09, "loss": 0.77088213, "num_input_tokens_seen": 176486045, "step": 8176, "time_per_iteration": 2.6876025199890137 }, { "auxiliary_loss_clip": 0.01152754, "auxiliary_loss_mlp": 0.010213, "balance_loss_clip": 0.9696902, "balance_loss_mlp": 1.01430488, "epoch": 0.9832261167558468, "flos": 11320911072000.0, "grad_norm": 2.54283853642527, "language_loss": 0.82622671, "learning_rate": 2.930264557881257e-09, "loss": 0.84796727, "num_input_tokens_seen": 176501230, "step": 8177, "time_per_iteration": 2.65634822845459 }, { "auxiliary_loss_clip": 0.01058741, "auxiliary_loss_mlp": 0.01002315, "balance_loss_clip": 1.00798774, "balance_loss_mlp": 1.00074148, "epoch": 0.9833463596464859, "flos": 60000304343040.0, "grad_norm": 0.8494932635027969, "language_loss": 0.58214259, "learning_rate": 2.8882642544452163e-09, "loss": 0.60275316, "num_input_tokens_seen": 176565955, "step": 8178, "time_per_iteration": 3.1690173149108887 }, { "auxiliary_loss_clip": 0.01150935, "auxiliary_loss_mlp": 0.01025841, "balance_loss_clip": 0.96727121, "balance_loss_mlp": 1.01889133, "epoch": 0.983466602537125, "flos": 13626699805440.0, "grad_norm": 2.1043372251056214, "language_loss": 0.74822813, "learning_rate": 2.8465669112716083e-09, "loss": 0.76999587, "num_input_tokens_seen": 176583480, "step": 8179, "time_per_iteration": 2.6208813190460205 }, { "auxiliary_loss_clip": 0.01167075, "auxiliary_loss_mlp": 0.01122744, "balance_loss_clip": 1.00830758, "balance_loss_mlp": 0.0, "epoch": 0.9835868454277641, "flos": 22926844563840.0, "grad_norm": 1.7799596401075128, "language_loss": 0.76491362, "learning_rate": 2.8051725346858177e-09, "loss": 0.78781176, "num_input_tokens_seen": 176603740, "step": 8180, "time_per_iteration": 3.526820182800293 }, { "auxiliary_loss_clip": 0.01167068, "auxiliary_loss_mlp": 0.01028868, "balance_loss_clip": 1.04568422, "balance_loss_mlp": 1.02192998, "epoch": 0.9837070883184031, "flos": 27673409341440.0, "grad_norm": 1.8990979813177593, "language_loss": 0.71145165, "learning_rate": 2.7640811309674883e-09, "loss": 0.73341095, "num_input_tokens_seen": 176623240, "step": 8181, "time_per_iteration": 2.6557562351226807 }, { "auxiliary_loss_clip": 0.01154046, "auxiliary_loss_mlp": 0.01030727, "balance_loss_clip": 0.93293929, "balance_loss_mlp": 1.0238874, "epoch": 0.9838273312090423, "flos": 29241951425280.0, "grad_norm": 1.5609460251076386, "language_loss": 0.80854017, "learning_rate": 2.7232927063498557e-09, "loss": 0.83038795, "num_input_tokens_seen": 176643615, "step": 8182, "time_per_iteration": 2.781269073486328 }, { "auxiliary_loss_clip": 0.01169141, "auxiliary_loss_mlp": 0.01026037, "balance_loss_clip": 1.01027608, "balance_loss_mlp": 1.01927209, "epoch": 0.9839475740996814, "flos": 40110207304320.0, "grad_norm": 2.063786903351956, "language_loss": 0.69133008, "learning_rate": 2.682807267020859e-09, "loss": 0.71328187, "num_input_tokens_seen": 176666375, "step": 8183, "time_per_iteration": 2.823540449142456 }, { "auxiliary_loss_clip": 0.01164254, "auxiliary_loss_mlp": 0.01025446, "balance_loss_clip": 1.0095377, "balance_loss_mlp": 1.01820958, "epoch": 0.9840678169903204, "flos": 24169389788160.0, "grad_norm": 1.5502429894010983, "language_loss": 0.62295699, "learning_rate": 2.642624819121808e-09, "loss": 0.64485395, "num_input_tokens_seen": 176686525, "step": 8184, "time_per_iteration": 2.6842591762542725 }, { "auxiliary_loss_clip": 0.01162054, "auxiliary_loss_mlp": 0.01024355, "balance_loss_clip": 0.97179568, "balance_loss_mlp": 1.01758432, "epoch": 0.9841880598809596, "flos": 14684484447360.0, "grad_norm": 1.901343028587375, "language_loss": 0.61722374, "learning_rate": 2.6027453687487154e-09, "loss": 0.63908786, "num_input_tokens_seen": 176703615, "step": 8185, "time_per_iteration": 2.6788594722747803 }, { "auxiliary_loss_clip": 0.01164812, "auxiliary_loss_mlp": 0.0102727, "balance_loss_clip": 0.97219723, "balance_loss_mlp": 1.02029371, "epoch": 0.9843083027715986, "flos": 22344768668160.0, "grad_norm": 2.2242367889722314, "language_loss": 0.53783536, "learning_rate": 2.5631689219509643e-09, "loss": 0.55975616, "num_input_tokens_seen": 176722295, "step": 8186, "time_per_iteration": 3.712064027786255 }, { "auxiliary_loss_clip": 0.01163673, "auxiliary_loss_mlp": 0.01027759, "balance_loss_clip": 0.97163713, "balance_loss_mlp": 1.02086592, "epoch": 0.9844285456622377, "flos": 21800111765760.0, "grad_norm": 1.649056096534097, "language_loss": 0.83510727, "learning_rate": 2.523895484732197e-09, "loss": 0.85702163, "num_input_tokens_seen": 176741750, "step": 8187, "time_per_iteration": 2.7373480796813965 }, { "auxiliary_loss_clip": 0.01171117, "auxiliary_loss_mlp": 0.01026842, "balance_loss_clip": 1.00879776, "balance_loss_mlp": 1.01930249, "epoch": 0.9845487885528769, "flos": 18035380321920.0, "grad_norm": 1.7378963421379603, "language_loss": 0.74857569, "learning_rate": 2.4849250630505357e-09, "loss": 0.77055526, "num_input_tokens_seen": 176759995, "step": 8188, "time_per_iteration": 2.598417282104492 }, { "auxiliary_loss_clip": 0.01146804, "auxiliary_loss_mlp": 0.01026043, "balance_loss_clip": 0.81527245, "balance_loss_mlp": 1.01920605, "epoch": 0.9846690314435159, "flos": 25228610974080.0, "grad_norm": 1.866722613200292, "language_loss": 0.73395634, "learning_rate": 2.4462576628172528e-09, "loss": 0.75568485, "num_input_tokens_seen": 176778625, "step": 8189, "time_per_iteration": 3.681654930114746 }, { "auxiliary_loss_clip": 0.01163267, "auxiliary_loss_mlp": 0.01029601, "balance_loss_clip": 1.00990713, "balance_loss_mlp": 1.02305341, "epoch": 0.984789274334155, "flos": 18552171248640.0, "grad_norm": 2.1546661826644313, "language_loss": 0.73923218, "learning_rate": 2.407893289898766e-09, "loss": 0.76116085, "num_input_tokens_seen": 176797655, "step": 8190, "time_per_iteration": 3.608337163925171 }, { "auxiliary_loss_clip": 0.01153011, "auxiliary_loss_mlp": 0.01024959, "balance_loss_clip": 0.92983866, "balance_loss_mlp": 1.01755357, "epoch": 0.984909517224794, "flos": 27345437233920.0, "grad_norm": 1.9229693745617058, "language_loss": 0.83443362, "learning_rate": 2.3698319501144202e-09, "loss": 0.85621333, "num_input_tokens_seen": 176818640, "step": 8191, "time_per_iteration": 2.745122194290161 }, { "auxiliary_loss_clip": 0.01171343, "auxiliary_loss_mlp": 0.01027346, "balance_loss_clip": 1.00863004, "balance_loss_mlp": 1.01984823, "epoch": 0.9850297601154332, "flos": 18734058743040.0, "grad_norm": 1.8271470818525566, "language_loss": 0.7340405, "learning_rate": 2.3320736492382644e-09, "loss": 0.75602734, "num_input_tokens_seen": 176837475, "step": 8192, "time_per_iteration": 2.6689870357513428 }, { "auxiliary_loss_clip": 0.01163277, "auxiliary_loss_mlp": 0.01028205, "balance_loss_clip": 1.04740465, "balance_loss_mlp": 1.02103508, "epoch": 0.9851500030060723, "flos": 22308247514880.0, "grad_norm": 2.0285945021299945, "language_loss": 0.68075448, "learning_rate": 2.29461839299816e-09, "loss": 0.70266932, "num_input_tokens_seen": 176857190, "step": 8193, "time_per_iteration": 2.6325900554656982 }, { "auxiliary_loss_clip": 0.01165464, "auxiliary_loss_mlp": 0.01024759, "balance_loss_clip": 0.93245423, "balance_loss_mlp": 1.01820874, "epoch": 0.9852702458967113, "flos": 26353691746560.0, "grad_norm": 1.4730057497442366, "language_loss": 0.79714572, "learning_rate": 2.257466187076229e-09, "loss": 0.81904793, "num_input_tokens_seen": 176876395, "step": 8194, "time_per_iteration": 2.6994469165802 }, { "auxiliary_loss_clip": 0.01169614, "auxiliary_loss_mlp": 0.01122337, "balance_loss_clip": 1.00804877, "balance_loss_mlp": 0.0, "epoch": 0.9853904887873505, "flos": 20883599314560.0, "grad_norm": 2.4993629430274202, "language_loss": 0.7123946, "learning_rate": 2.2206170371081854e-09, "loss": 0.73531407, "num_input_tokens_seen": 176894980, "step": 8195, "time_per_iteration": 2.6357109546661377 }, { "auxiliary_loss_clip": 0.01164353, "auxiliary_loss_mlp": 0.01026902, "balance_loss_clip": 0.96943802, "balance_loss_mlp": 1.01971388, "epoch": 0.9855107316779895, "flos": 25263444188160.0, "grad_norm": 1.5552298892981118, "language_loss": 0.84756958, "learning_rate": 2.1840709486842247e-09, "loss": 0.8694821, "num_input_tokens_seen": 176914600, "step": 8196, "time_per_iteration": 2.6687307357788086 }, { "auxiliary_loss_clip": 0.01156862, "auxiliary_loss_mlp": 0.01027614, "balance_loss_clip": 0.96971029, "balance_loss_mlp": 1.01990104, "epoch": 0.9856309745686286, "flos": 19062102677760.0, "grad_norm": 2.7065607048201, "language_loss": 0.79518497, "learning_rate": 2.1478279273481335e-09, "loss": 0.81702971, "num_input_tokens_seen": 176933085, "step": 8197, "time_per_iteration": 2.715129852294922 }, { "auxiliary_loss_clip": 0.01162458, "auxiliary_loss_mlp": 0.01023903, "balance_loss_clip": 1.01001, "balance_loss_mlp": 1.01691711, "epoch": 0.9857512174592677, "flos": 34130758060800.0, "grad_norm": 2.2567945365641084, "language_loss": 0.79917186, "learning_rate": 2.1118879785981815e-09, "loss": 0.8210355, "num_input_tokens_seen": 176953225, "step": 8198, "time_per_iteration": 2.70352840423584 }, { "auxiliary_loss_clip": 0.01164033, "auxiliary_loss_mlp": 0.01027209, "balance_loss_clip": 0.97109139, "balance_loss_mlp": 1.02015185, "epoch": 0.9858714603499068, "flos": 25994693266560.0, "grad_norm": 1.7091882591842418, "language_loss": 0.79163408, "learning_rate": 2.0762511078862288e-09, "loss": 0.81354654, "num_input_tokens_seen": 176973570, "step": 8199, "time_per_iteration": 2.685194253921509 }, { "auxiliary_loss_clip": 0.0117225, "auxiliary_loss_mlp": 0.01021606, "balance_loss_clip": 0.97040164, "balance_loss_mlp": 1.01482916, "epoch": 0.9859917032405459, "flos": 23696230907520.0, "grad_norm": 1.69445443187095, "language_loss": 0.64897108, "learning_rate": 2.0409173206186183e-09, "loss": 0.67090964, "num_input_tokens_seen": 176992810, "step": 8200, "time_per_iteration": 2.64817476272583 }, { "auxiliary_loss_clip": 0.01163286, "auxiliary_loss_mlp": 0.01029847, "balance_loss_clip": 0.9361062, "balance_loss_mlp": 1.02317464, "epoch": 0.986111946131185, "flos": 19938287134080.0, "grad_norm": 1.918667659547718, "language_loss": 0.86981714, "learning_rate": 2.0058866221550617e-09, "loss": 0.89174843, "num_input_tokens_seen": 177011050, "step": 8201, "time_per_iteration": 2.6997830867767334 }, { "auxiliary_loss_clip": 0.01166972, "auxiliary_loss_mlp": 0.01021527, "balance_loss_clip": 1.047369, "balance_loss_mlp": 1.01384366, "epoch": 0.9862321890218241, "flos": 19828831415040.0, "grad_norm": 1.9916033640408135, "language_loss": 0.75041902, "learning_rate": 1.971159017809976e-09, "loss": 0.77230406, "num_input_tokens_seen": 177029340, "step": 8202, "time_per_iteration": 2.548466920852661 }, { "auxiliary_loss_clip": 0.01163527, "auxiliary_loss_mlp": 0.01026965, "balance_loss_clip": 1.00853944, "balance_loss_mlp": 1.01995254, "epoch": 0.9863524319124631, "flos": 21652051904640.0, "grad_norm": 2.085083510167226, "language_loss": 0.78104711, "learning_rate": 1.93673451285159e-09, "loss": 0.80295199, "num_input_tokens_seen": 177048390, "step": 8203, "time_per_iteration": 2.636059045791626 }, { "auxiliary_loss_clip": 0.01066294, "auxiliary_loss_mlp": 0.01002761, "balance_loss_clip": 0.9334684, "balance_loss_mlp": 1.00109243, "epoch": 0.9864726748031023, "flos": 52769977920000.0, "grad_norm": 0.7321858907032832, "language_loss": 0.56619668, "learning_rate": 1.9026131125019495e-09, "loss": 0.58688718, "num_input_tokens_seen": 177105760, "step": 8204, "time_per_iteration": 3.1120078563690186 }, { "auxiliary_loss_clip": 0.01160317, "auxiliary_loss_mlp": 0.01026038, "balance_loss_clip": 1.00886893, "balance_loss_mlp": 1.01871228, "epoch": 0.9865929176937414, "flos": 23364631526400.0, "grad_norm": 1.7372988190166059, "language_loss": 0.8688255, "learning_rate": 1.8687948219371363e-09, "loss": 0.89068907, "num_input_tokens_seen": 177124985, "step": 8205, "time_per_iteration": 2.6006641387939453 }, { "auxiliary_loss_clip": 0.01170083, "auxiliary_loss_mlp": 0.0102831, "balance_loss_clip": 1.04672253, "balance_loss_mlp": 1.02041268, "epoch": 0.9867131605843804, "flos": 21616679986560.0, "grad_norm": 2.3769855270689915, "language_loss": 0.88492054, "learning_rate": 1.835279646287491e-09, "loss": 0.9069044, "num_input_tokens_seen": 177142995, "step": 8206, "time_per_iteration": 3.4335427284240723 }, { "auxiliary_loss_clip": 0.0117447, "auxiliary_loss_mlp": 0.01031779, "balance_loss_clip": 1.01289022, "balance_loss_mlp": 1.02377975, "epoch": 0.9868334034750196, "flos": 22271403139200.0, "grad_norm": 1.6818689572368506, "language_loss": 0.76655954, "learning_rate": 1.8020675906371685e-09, "loss": 0.78862202, "num_input_tokens_seen": 177162390, "step": 8207, "time_per_iteration": 2.601515531539917 }, { "auxiliary_loss_clip": 0.0115668, "auxiliary_loss_mlp": 0.01028534, "balance_loss_clip": 0.89188087, "balance_loss_mlp": 1.02140808, "epoch": 0.9869536463656586, "flos": 25809573548160.0, "grad_norm": 1.8607880523218534, "language_loss": 0.74980062, "learning_rate": 1.7691586600243612e-09, "loss": 0.77165276, "num_input_tokens_seen": 177181290, "step": 8208, "time_per_iteration": 2.7305679321289062 }, { "auxiliary_loss_clip": 0.01163305, "auxiliary_loss_mlp": 0.01024138, "balance_loss_clip": 0.97269899, "balance_loss_mlp": 1.01710796, "epoch": 0.9870738892562977, "flos": 16398500613120.0, "grad_norm": 3.1392040983548424, "language_loss": 0.86650467, "learning_rate": 1.7365528594415202e-09, "loss": 0.8883791, "num_input_tokens_seen": 177195360, "step": 8209, "time_per_iteration": 2.665193796157837 }, { "auxiliary_loss_clip": 0.01171335, "auxiliary_loss_mlp": 0.01122801, "balance_loss_clip": 1.01055348, "balance_loss_mlp": 0.0, "epoch": 0.9871941321469369, "flos": 35481358373760.0, "grad_norm": 1.4971156534197114, "language_loss": 0.67441607, "learning_rate": 1.7042501938346888e-09, "loss": 0.69735742, "num_input_tokens_seen": 177218090, "step": 8210, "time_per_iteration": 2.745098114013672 }, { "auxiliary_loss_clip": 0.01148363, "auxiliary_loss_mlp": 0.0103101, "balance_loss_clip": 0.96561432, "balance_loss_mlp": 1.0244354, "epoch": 0.9873143750375759, "flos": 21434217874560.0, "grad_norm": 1.9648313248692308, "language_loss": 0.76225638, "learning_rate": 1.6722506681043913e-09, "loss": 0.78405011, "num_input_tokens_seen": 177237050, "step": 8211, "time_per_iteration": 2.6194443702697754 }, { "auxiliary_loss_clip": 0.01168157, "auxiliary_loss_mlp": 0.01026701, "balance_loss_clip": 0.9714433, "balance_loss_mlp": 1.01972747, "epoch": 0.987434617928215, "flos": 16326499800960.0, "grad_norm": 2.3175570625931794, "language_loss": 0.68974578, "learning_rate": 1.640554287104745e-09, "loss": 0.71169436, "num_input_tokens_seen": 177255325, "step": 8212, "time_per_iteration": 3.532392740249634 }, { "auxiliary_loss_clip": 0.0116346, "auxiliary_loss_mlp": 0.01023465, "balance_loss_clip": 0.93031132, "balance_loss_mlp": 1.01625872, "epoch": 0.9875548608188541, "flos": 17851984456320.0, "grad_norm": 2.188776800437161, "language_loss": 0.80243087, "learning_rate": 1.609161055644348e-09, "loss": 0.82430005, "num_input_tokens_seen": 177271250, "step": 8213, "time_per_iteration": 2.6352150440216064 }, { "auxiliary_loss_clip": 0.01171674, "auxiliary_loss_mlp": 0.01028808, "balance_loss_clip": 1.00790036, "balance_loss_mlp": 1.02147675, "epoch": 0.9876751037094932, "flos": 26132876887680.0, "grad_norm": 3.5896862233046223, "language_loss": 0.68473738, "learning_rate": 1.5780709784849467e-09, "loss": 0.70674217, "num_input_tokens_seen": 177288270, "step": 8214, "time_per_iteration": 2.6146035194396973 }, { "auxiliary_loss_clip": 0.01167474, "auxiliary_loss_mlp": 0.01030167, "balance_loss_clip": 0.86046875, "balance_loss_mlp": 1.0229671, "epoch": 0.9877953466001322, "flos": 15991344973440.0, "grad_norm": 2.18584540741048, "language_loss": 0.82274997, "learning_rate": 1.5472840603436565e-09, "loss": 0.84472638, "num_input_tokens_seen": 177305500, "step": 8215, "time_per_iteration": 3.5930306911468506 }, { "auxiliary_loss_clip": 0.01165785, "auxiliary_loss_mlp": 0.01026665, "balance_loss_clip": 0.97035706, "balance_loss_mlp": 1.01973629, "epoch": 0.9879155894907714, "flos": 18806777827200.0, "grad_norm": 1.8614482371846568, "language_loss": 0.78132188, "learning_rate": 1.5168003058900757e-09, "loss": 0.80324638, "num_input_tokens_seen": 177323500, "step": 8216, "time_per_iteration": 3.424506425857544 }, { "auxiliary_loss_clip": 0.01161313, "auxiliary_loss_mlp": 0.01020815, "balance_loss_clip": 0.93107378, "balance_loss_mlp": 1.01397216, "epoch": 0.9880358323814105, "flos": 22382044007040.0, "grad_norm": 1.7573643456126673, "language_loss": 0.9203018, "learning_rate": 1.4866197197491715e-09, "loss": 0.94212306, "num_input_tokens_seen": 177342860, "step": 8217, "time_per_iteration": 2.7045767307281494 }, { "auxiliary_loss_clip": 0.01170591, "auxiliary_loss_mlp": 0.01122716, "balance_loss_clip": 1.01053798, "balance_loss_mlp": 0.0, "epoch": 0.9881560752720495, "flos": 15668831733120.0, "grad_norm": 3.1035592898640028, "language_loss": 0.78340739, "learning_rate": 1.4567423064988371e-09, "loss": 0.80634046, "num_input_tokens_seen": 177360210, "step": 8218, "time_per_iteration": 2.6174421310424805 }, { "auxiliary_loss_clip": 0.01168669, "auxiliary_loss_mlp": 0.0103073, "balance_loss_clip": 1.04764247, "balance_loss_mlp": 1.02284408, "epoch": 0.9882763181626887, "flos": 21500113374720.0, "grad_norm": 1.8517406096340738, "language_loss": 0.78058422, "learning_rate": 1.4271680706718913e-09, "loss": 0.80257827, "num_input_tokens_seen": 177377885, "step": 8219, "time_per_iteration": 2.616461992263794 }, { "auxiliary_loss_clip": 0.01170901, "auxiliary_loss_mlp": 0.01031381, "balance_loss_clip": 1.01161218, "balance_loss_mlp": 1.02369261, "epoch": 0.9883965610533277, "flos": 28034598551040.0, "grad_norm": 1.597070790262158, "language_loss": 0.82322407, "learning_rate": 1.3978970167543013e-09, "loss": 0.84524691, "num_input_tokens_seen": 177398065, "step": 8220, "time_per_iteration": 2.7560555934906006 }, { "auxiliary_loss_clip": 0.011535, "auxiliary_loss_mlp": 0.01025431, "balance_loss_clip": 0.9691211, "balance_loss_mlp": 1.01792061, "epoch": 0.9885168039439668, "flos": 14098601710080.0, "grad_norm": 2.952155491127217, "language_loss": 0.77439725, "learning_rate": 1.3689291491867372e-09, "loss": 0.79618657, "num_input_tokens_seen": 177416380, "step": 8221, "time_per_iteration": 2.670081377029419 }, { "auxiliary_loss_clip": 0.01168481, "auxiliary_loss_mlp": 0.01026315, "balance_loss_clip": 1.04744077, "balance_loss_mlp": 1.01926124, "epoch": 0.988637046834606, "flos": 26432013352320.0, "grad_norm": 2.2832737791956794, "language_loss": 0.73944956, "learning_rate": 1.3402644723636836e-09, "loss": 0.76139754, "num_input_tokens_seen": 177438410, "step": 8222, "time_per_iteration": 2.6295359134674072 }, { "auxiliary_loss_clip": 0.01161518, "auxiliary_loss_mlp": 0.01028809, "balance_loss_clip": 0.97229278, "balance_loss_mlp": 1.02186489, "epoch": 0.988757289725245, "flos": 25229113764480.0, "grad_norm": 1.8575900518786608, "language_loss": 0.83525801, "learning_rate": 1.311902990633218e-09, "loss": 0.85716128, "num_input_tokens_seen": 177457375, "step": 8223, "time_per_iteration": 2.6460728645324707 }, { "auxiliary_loss_clip": 0.01153059, "auxiliary_loss_mlp": 0.01022868, "balance_loss_clip": 0.9653374, "balance_loss_mlp": 1.01580155, "epoch": 0.9888775326158841, "flos": 26359042872960.0, "grad_norm": 1.4897738714238438, "language_loss": 0.71141165, "learning_rate": 1.2838447082978987e-09, "loss": 0.73317087, "num_input_tokens_seen": 177478530, "step": 8224, "time_per_iteration": 2.714597463607788 }, { "auxiliary_loss_clip": 0.01161037, "auxiliary_loss_mlp": 0.01022596, "balance_loss_clip": 1.00730515, "balance_loss_mlp": 1.0155623, "epoch": 0.9889977755065231, "flos": 24316120846080.0, "grad_norm": 2.247156607069221, "language_loss": 0.82991594, "learning_rate": 1.2560896296143208e-09, "loss": 0.85175228, "num_input_tokens_seen": 177496995, "step": 8225, "time_per_iteration": 2.6249148845672607 }, { "auxiliary_loss_clip": 0.01165053, "auxiliary_loss_mlp": 0.01023821, "balance_loss_clip": 1.04653001, "balance_loss_mlp": 1.01662052, "epoch": 0.9891180183971623, "flos": 18951066760320.0, "grad_norm": 2.1270919612388766, "language_loss": 0.82668841, "learning_rate": 1.2286377587926722e-09, "loss": 0.84857714, "num_input_tokens_seen": 177513785, "step": 8226, "time_per_iteration": 2.5781993865966797 }, { "auxiliary_loss_clip": 0.01165373, "auxiliary_loss_mlp": 0.01026126, "balance_loss_clip": 1.04597449, "balance_loss_mlp": 1.01926589, "epoch": 0.9892382612878013, "flos": 26176580760960.0, "grad_norm": 10.178169987488825, "language_loss": 0.74887109, "learning_rate": 1.2014890999973992e-09, "loss": 0.77078605, "num_input_tokens_seen": 177530705, "step": 8227, "time_per_iteration": 2.6384437084198 }, { "auxiliary_loss_clip": 0.01165108, "auxiliary_loss_mlp": 0.01022481, "balance_loss_clip": 1.04716313, "balance_loss_mlp": 1.01591277, "epoch": 0.9893585041784404, "flos": 25449605400960.0, "grad_norm": 2.278629047595554, "language_loss": 0.7864511, "learning_rate": 1.1746436573472073e-09, "loss": 0.80832696, "num_input_tokens_seen": 177552440, "step": 8228, "time_per_iteration": 2.648125648498535 }, { "auxiliary_loss_clip": 0.01172237, "auxiliary_loss_mlp": 0.01025034, "balance_loss_clip": 0.97049117, "balance_loss_mlp": 1.01675165, "epoch": 0.9894787470690796, "flos": 20189302352640.0, "grad_norm": 2.270823755415503, "language_loss": 0.69404393, "learning_rate": 1.1481014349141726e-09, "loss": 0.71601659, "num_input_tokens_seen": 177569660, "step": 8229, "time_per_iteration": 2.6171257495880127 }, { "auxiliary_loss_clip": 0.01166572, "auxiliary_loss_mlp": 0.01032882, "balance_loss_clip": 0.97132611, "balance_loss_mlp": 1.02487731, "epoch": 0.9895989899597186, "flos": 24644308435200.0, "grad_norm": 1.777823052522792, "language_loss": 0.84526384, "learning_rate": 1.121862436724852e-09, "loss": 0.86725837, "num_input_tokens_seen": 177588500, "step": 8230, "time_per_iteration": 2.7098002433776855 }, { "auxiliary_loss_clip": 0.01165119, "auxiliary_loss_mlp": 0.01024912, "balance_loss_clip": 1.00993657, "balance_loss_mlp": 1.01785147, "epoch": 0.9897192328503577, "flos": 21799034357760.0, "grad_norm": 1.61499550243773, "language_loss": 0.70444483, "learning_rate": 1.0959266667598388e-09, "loss": 0.72634518, "num_input_tokens_seen": 177607315, "step": 8231, "time_per_iteration": 2.653416156768799 }, { "auxiliary_loss_clip": 0.01167352, "auxiliary_loss_mlp": 0.01027427, "balance_loss_clip": 0.93478996, "balance_loss_mlp": 1.01956546, "epoch": 0.9898394757409968, "flos": 21325229032320.0, "grad_norm": 1.7827331413802319, "language_loss": 0.74812347, "learning_rate": 1.0702941289533196e-09, "loss": 0.77007127, "num_input_tokens_seen": 177625990, "step": 8232, "time_per_iteration": 3.603902816772461 }, { "auxiliary_loss_clip": 0.01162853, "auxiliary_loss_mlp": 0.0102234, "balance_loss_clip": 0.93355876, "balance_loss_mlp": 1.01537204, "epoch": 0.9899597186316359, "flos": 18545024442240.0, "grad_norm": 2.2031046196323567, "language_loss": 0.88511872, "learning_rate": 1.0449648271939615e-09, "loss": 0.90697062, "num_input_tokens_seen": 177642335, "step": 8233, "time_per_iteration": 2.7465591430664062 }, { "auxiliary_loss_clip": 0.01169488, "auxiliary_loss_mlp": 0.01122649, "balance_loss_clip": 0.89784831, "balance_loss_mlp": 0.0, "epoch": 0.990079961522275, "flos": 23766723348480.0, "grad_norm": 1.5577576317856123, "language_loss": 0.72578382, "learning_rate": 1.0199387653240243e-09, "loss": 0.74870521, "num_input_tokens_seen": 177662025, "step": 8234, "time_per_iteration": 2.708718776702881 }, { "auxiliary_loss_clip": 0.01155583, "auxiliary_loss_mlp": 0.01025601, "balance_loss_clip": 0.96994478, "balance_loss_mlp": 1.01892281, "epoch": 0.9902002044129141, "flos": 16399182971520.0, "grad_norm": 1.4901203186762906, "language_loss": 0.7073639, "learning_rate": 9.952159471400267e-10, "loss": 0.72917581, "num_input_tokens_seen": 177679065, "step": 8235, "time_per_iteration": 2.6301419734954834 }, { "auxiliary_loss_clip": 0.01170141, "auxiliary_loss_mlp": 0.01121876, "balance_loss_clip": 1.01002026, "balance_loss_mlp": 0.0, "epoch": 0.9903204473035532, "flos": 22559657783040.0, "grad_norm": 1.7879019115321642, "language_loss": 0.84078157, "learning_rate": 9.707963763923022e-10, "loss": 0.8637017, "num_input_tokens_seen": 177698115, "step": 8236, "time_per_iteration": 2.6054420471191406 }, { "auxiliary_loss_clip": 0.01160366, "auxiliary_loss_mlp": 0.01019814, "balance_loss_clip": 0.96837068, "balance_loss_mlp": 1.01298356, "epoch": 0.9904406901941922, "flos": 16144001775360.0, "grad_norm": 1.7708341011907471, "language_loss": 0.79230142, "learning_rate": 9.466800567854427e-10, "loss": 0.81410325, "num_input_tokens_seen": 177716715, "step": 8237, "time_per_iteration": 3.575986862182617 }, { "auxiliary_loss_clip": 0.0115989, "auxiliary_loss_mlp": 0.0103243, "balance_loss_clip": 0.92857254, "balance_loss_mlp": 1.02497983, "epoch": 0.9905609330848314, "flos": 26651499408000.0, "grad_norm": 2.0810218148463937, "language_loss": 0.67751586, "learning_rate": 9.228669919778553e-10, "loss": 0.69943899, "num_input_tokens_seen": 177735640, "step": 8238, "time_per_iteration": 2.7038466930389404 }, { "auxiliary_loss_clip": 0.01158461, "auxiliary_loss_mlp": 0.0102252, "balance_loss_clip": 0.96981168, "balance_loss_mlp": 1.01599312, "epoch": 0.9906811759754705, "flos": 23111820627840.0, "grad_norm": 1.9884668622271795, "language_loss": 0.79242527, "learning_rate": 8.993571855817617e-10, "loss": 0.81423509, "num_input_tokens_seen": 177754470, "step": 8239, "time_per_iteration": 2.612330913543701 }, { "auxiliary_loss_clip": 0.01163658, "auxiliary_loss_mlp": 0.01024228, "balance_loss_clip": 1.00851536, "balance_loss_mlp": 1.01693821, "epoch": 0.9908014188661095, "flos": 22090593052800.0, "grad_norm": 1.9085927935660927, "language_loss": 0.75372213, "learning_rate": 8.761506411638642e-10, "loss": 0.77560097, "num_input_tokens_seen": 177773935, "step": 8240, "time_per_iteration": 3.522387981414795 }, { "auxiliary_loss_clip": 0.01167147, "auxiliary_loss_mlp": 0.01027113, "balance_loss_clip": 0.97285771, "balance_loss_mlp": 1.01988947, "epoch": 0.9909216617567487, "flos": 19242948677760.0, "grad_norm": 1.6225809575815189, "language_loss": 0.73577082, "learning_rate": 8.53247362244236e-10, "loss": 0.75771344, "num_input_tokens_seen": 177792745, "step": 8241, "time_per_iteration": 2.6042582988739014 }, { "auxiliary_loss_clip": 0.01164539, "auxiliary_loss_mlp": 0.01027821, "balance_loss_clip": 0.97168291, "balance_loss_mlp": 1.02074254, "epoch": 0.9910419046473877, "flos": 23621213352960.0, "grad_norm": 1.6807398672539027, "language_loss": 0.68364942, "learning_rate": 8.306473522976532e-10, "loss": 0.70557296, "num_input_tokens_seen": 177812150, "step": 8242, "time_per_iteration": 3.6563565731048584 }, { "auxiliary_loss_clip": 0.01167567, "auxiliary_loss_mlp": 0.01023653, "balance_loss_clip": 1.04879642, "balance_loss_mlp": 1.0168432, "epoch": 0.9911621475380268, "flos": 22711380831360.0, "grad_norm": 2.704819536065786, "language_loss": 0.71608627, "learning_rate": 8.083506147522623e-10, "loss": 0.73799849, "num_input_tokens_seen": 177831545, "step": 8243, "time_per_iteration": 2.60947322845459 }, { "auxiliary_loss_clip": 0.01157205, "auxiliary_loss_mlp": 0.0102154, "balance_loss_clip": 1.00696611, "balance_loss_mlp": 1.01487064, "epoch": 0.991282390428666, "flos": 13516956777600.0, "grad_norm": 2.0234260155167134, "language_loss": 0.84958065, "learning_rate": 7.863571529906909e-10, "loss": 0.87136805, "num_input_tokens_seen": 177847130, "step": 8244, "time_per_iteration": 2.5784993171691895 }, { "auxiliary_loss_clip": 0.01063389, "auxiliary_loss_mlp": 0.0100116, "balance_loss_clip": 0.97107029, "balance_loss_mlp": 0.99949139, "epoch": 0.991402633319305, "flos": 61830492071040.0, "grad_norm": 0.7564703817760566, "language_loss": 0.59676564, "learning_rate": 7.646669703489372e-10, "loss": 0.61741114, "num_input_tokens_seen": 177911440, "step": 8245, "time_per_iteration": 3.2861075401306152 }, { "auxiliary_loss_clip": 0.01164062, "auxiliary_loss_mlp": 0.01025884, "balance_loss_clip": 0.73982495, "balance_loss_mlp": 1.01842189, "epoch": 0.9915228762099441, "flos": 18770148933120.0, "grad_norm": 1.7130828504536875, "language_loss": 0.57189339, "learning_rate": 7.432800701177023e-10, "loss": 0.59379292, "num_input_tokens_seen": 177929440, "step": 8246, "time_per_iteration": 3.1485161781311035 }, { "auxiliary_loss_clip": 0.01065344, "auxiliary_loss_mlp": 0.01001063, "balance_loss_clip": 0.93404418, "balance_loss_mlp": 0.9994179, "epoch": 0.9916431191005832, "flos": 65936660244480.0, "grad_norm": 0.7957032020148154, "language_loss": 0.57797241, "learning_rate": 7.221964555415017e-10, "loss": 0.59863651, "num_input_tokens_seen": 177989100, "step": 8247, "time_per_iteration": 3.4020612239837646 }, { "auxiliary_loss_clip": 0.01163048, "auxiliary_loss_mlp": 0.01019926, "balance_loss_clip": 0.97169292, "balance_loss_mlp": 1.01302135, "epoch": 0.9917633619912223, "flos": 16581573256320.0, "grad_norm": 1.8431602599147534, "language_loss": 0.74960583, "learning_rate": 7.01416129818222e-10, "loss": 0.77143556, "num_input_tokens_seen": 178006720, "step": 8248, "time_per_iteration": 2.669233560562134 }, { "auxiliary_loss_clip": 0.01171942, "auxiliary_loss_mlp": 0.01023229, "balance_loss_clip": 0.93242675, "balance_loss_mlp": 1.01621628, "epoch": 0.9918836048818613, "flos": 25411108999680.0, "grad_norm": 1.911235950663936, "language_loss": 0.58028054, "learning_rate": 6.809390961006745e-10, "loss": 0.60223222, "num_input_tokens_seen": 178026850, "step": 8249, "time_per_iteration": 2.7503201961517334 }, { "auxiliary_loss_clip": 0.0116644, "auxiliary_loss_mlp": 0.01028113, "balance_loss_clip": 0.97200722, "balance_loss_mlp": 1.02110636, "epoch": 0.9920038477725005, "flos": 25046867134080.0, "grad_norm": 2.005528337030787, "language_loss": 0.68096292, "learning_rate": 6.607653574948191e-10, "loss": 0.70290852, "num_input_tokens_seen": 178047630, "step": 8250, "time_per_iteration": 2.6831257343292236 }, { "auxiliary_loss_clip": 0.01153762, "auxiliary_loss_mlp": 0.01023031, "balance_loss_clip": 1.0051105, "balance_loss_mlp": 1.01658225, "epoch": 0.9921240906631396, "flos": 21829773421440.0, "grad_norm": 1.6885799276890059, "language_loss": 0.81635046, "learning_rate": 6.408949170613187e-10, "loss": 0.83811843, "num_input_tokens_seen": 178066895, "step": 8251, "time_per_iteration": 2.6012227535247803 }, { "auxiliary_loss_clip": 0.01161896, "auxiliary_loss_mlp": 0.01022833, "balance_loss_clip": 0.97023892, "balance_loss_mlp": 1.01546037, "epoch": 0.9922443335537786, "flos": 24864225454080.0, "grad_norm": 1.681301787940304, "language_loss": 0.81844187, "learning_rate": 6.213277778144288e-10, "loss": 0.84028912, "num_input_tokens_seen": 178088540, "step": 8252, "time_per_iteration": 2.706312894821167 }, { "auxiliary_loss_clip": 0.01161513, "auxiliary_loss_mlp": 0.01028575, "balance_loss_clip": 0.85551476, "balance_loss_mlp": 1.02124941, "epoch": 0.9923645764444178, "flos": 21613088626560.0, "grad_norm": 2.2270588101626294, "language_loss": 0.67015737, "learning_rate": 6.020639427224416e-10, "loss": 0.69205827, "num_input_tokens_seen": 178106185, "step": 8253, "time_per_iteration": 2.7101423740386963 }, { "auxiliary_loss_clip": 0.0116563, "auxiliary_loss_mlp": 0.01029268, "balance_loss_clip": 0.97187614, "balance_loss_mlp": 1.02186489, "epoch": 0.9924848193350568, "flos": 25001798544000.0, "grad_norm": 1.8764539824710533, "language_loss": 0.72606409, "learning_rate": 5.831034147076864e-10, "loss": 0.74801302, "num_input_tokens_seen": 178123435, "step": 8254, "time_per_iteration": 2.7864668369293213 }, { "auxiliary_loss_clip": 0.01057972, "auxiliary_loss_mlp": 0.01004436, "balance_loss_clip": 0.97068322, "balance_loss_mlp": 1.00282657, "epoch": 0.9926050622256959, "flos": 68912543151360.0, "grad_norm": 0.6884892867539464, "language_loss": 0.55722958, "learning_rate": 5.644461966463065e-10, "loss": 0.57785362, "num_input_tokens_seen": 178191045, "step": 8255, "time_per_iteration": 3.244715452194214 }, { "auxiliary_loss_clip": 0.01163878, "auxiliary_loss_mlp": 0.01023095, "balance_loss_clip": 0.97245914, "balance_loss_mlp": 1.01628494, "epoch": 0.9927253051163349, "flos": 20923675914240.0, "grad_norm": 3.315161488012721, "language_loss": 0.75865501, "learning_rate": 5.460922913687049e-10, "loss": 0.78052473, "num_input_tokens_seen": 178210135, "step": 8256, "time_per_iteration": 2.64937162399292 }, { "auxiliary_loss_clip": 0.01155953, "auxiliary_loss_mlp": 0.01122945, "balance_loss_clip": 0.89071596, "balance_loss_mlp": 0.0, "epoch": 0.9928455480069741, "flos": 22308211601280.0, "grad_norm": 1.8954823982252829, "language_loss": 0.74850261, "learning_rate": 5.280417016593208e-10, "loss": 0.77129155, "num_input_tokens_seen": 178229925, "step": 8257, "time_per_iteration": 2.718994140625 }, { "auxiliary_loss_clip": 0.01163975, "auxiliary_loss_mlp": 0.01122061, "balance_loss_clip": 1.01218998, "balance_loss_mlp": 0.0, "epoch": 0.9929657908976132, "flos": 17383889393280.0, "grad_norm": 2.0762132268741293, "language_loss": 0.74586105, "learning_rate": 5.102944302559642e-10, "loss": 0.7687214, "num_input_tokens_seen": 178247420, "step": 8258, "time_per_iteration": 3.687988758087158 }, { "auxiliary_loss_clip": 0.01169413, "auxiliary_loss_mlp": 0.01029139, "balance_loss_clip": 0.85679269, "balance_loss_mlp": 1.02149189, "epoch": 0.9930860337882522, "flos": 22674680110080.0, "grad_norm": 2.228006070031152, "language_loss": 0.79290116, "learning_rate": 4.9285047985137e-10, "loss": 0.81488663, "num_input_tokens_seen": 178266840, "step": 8259, "time_per_iteration": 2.7914583683013916 }, { "auxiliary_loss_clip": 0.0116972, "auxiliary_loss_mlp": 0.01030747, "balance_loss_clip": 1.01058888, "balance_loss_mlp": 1.02383304, "epoch": 0.9932062766788914, "flos": 28147789284480.0, "grad_norm": 1.9372127172430165, "language_loss": 0.74339521, "learning_rate": 4.757098530916436e-10, "loss": 0.76539981, "num_input_tokens_seen": 178287285, "step": 8260, "time_per_iteration": 2.714470863342285 }, { "auxiliary_loss_clip": 0.01168944, "auxiliary_loss_mlp": 0.01025026, "balance_loss_clip": 1.01228142, "balance_loss_mlp": 1.01762319, "epoch": 0.9933265195695304, "flos": 20156659868160.0, "grad_norm": 5.0425449297855565, "language_loss": 0.77690071, "learning_rate": 4.5887255257670563e-10, "loss": 0.7988404, "num_input_tokens_seen": 178304325, "step": 8261, "time_per_iteration": 2.7028310298919678 }, { "auxiliary_loss_clip": 0.01166586, "auxiliary_loss_mlp": 0.01023613, "balance_loss_clip": 1.04761434, "balance_loss_mlp": 1.01627517, "epoch": 0.9934467624601695, "flos": 21362037494400.0, "grad_norm": 1.9093809129185608, "language_loss": 0.76250553, "learning_rate": 4.4233858086117906e-10, "loss": 0.7844075, "num_input_tokens_seen": 178322850, "step": 8262, "time_per_iteration": 2.5662460327148438 }, { "auxiliary_loss_clip": 0.01158855, "auxiliary_loss_mlp": 0.01031082, "balance_loss_clip": 0.89766383, "balance_loss_mlp": 1.02384043, "epoch": 0.9935670053508087, "flos": 19756040503680.0, "grad_norm": 1.9885576995013783, "language_loss": 0.67458075, "learning_rate": 4.261079404528356e-10, "loss": 0.69648015, "num_input_tokens_seen": 178342330, "step": 8263, "time_per_iteration": 3.774634838104248 }, { "auxiliary_loss_clip": 0.01161108, "auxiliary_loss_mlp": 0.01025133, "balance_loss_clip": 1.00827622, "balance_loss_mlp": 1.01774228, "epoch": 0.9936872482414477, "flos": 21978838863360.0, "grad_norm": 1.8351920369774257, "language_loss": 0.69112778, "learning_rate": 4.1018063381437205e-10, "loss": 0.71299016, "num_input_tokens_seen": 178362715, "step": 8264, "time_per_iteration": 2.599045991897583 }, { "auxiliary_loss_clip": 0.01059568, "auxiliary_loss_mlp": 0.01003573, "balance_loss_clip": 0.97302204, "balance_loss_mlp": 1.00198746, "epoch": 0.9938074911320868, "flos": 69810667839360.0, "grad_norm": 0.9332537443443354, "language_loss": 0.61181843, "learning_rate": 3.9455666336141167e-10, "loss": 0.63244987, "num_input_tokens_seen": 178426495, "step": 8265, "time_per_iteration": 3.2263739109039307 }, { "auxiliary_loss_clip": 0.01164087, "auxiliary_loss_mlp": 0.01029657, "balance_loss_clip": 1.04655552, "balance_loss_mlp": 1.02270985, "epoch": 0.9939277340227259, "flos": 15084170058240.0, "grad_norm": 2.6806733896434114, "language_loss": 0.83354062, "learning_rate": 3.7923603146450267e-10, "loss": 0.85547805, "num_input_tokens_seen": 178442555, "step": 8266, "time_per_iteration": 3.1562273502349854 }, { "auxiliary_loss_clip": 0.01160981, "auxiliary_loss_mlp": 0.01028874, "balance_loss_clip": 0.9301194, "balance_loss_mlp": 1.02150679, "epoch": 0.994047976913365, "flos": 17712364291200.0, "grad_norm": 2.400264006886677, "language_loss": 0.80939674, "learning_rate": 3.642187404473418e-10, "loss": 0.83129525, "num_input_tokens_seen": 178460715, "step": 8267, "time_per_iteration": 2.590139389038086 }, { "auxiliary_loss_clip": 0.01167325, "auxiliary_loss_mlp": 0.01024117, "balance_loss_clip": 1.01016951, "balance_loss_mlp": 1.01713705, "epoch": 0.994168219804004, "flos": 19171558396800.0, "grad_norm": 2.044912976286535, "language_loss": 0.85865539, "learning_rate": 3.495047925885508e-10, "loss": 0.88056982, "num_input_tokens_seen": 178479050, "step": 8268, "time_per_iteration": 3.6871304512023926 }, { "auxiliary_loss_clip": 0.0115719, "auxiliary_loss_mlp": 0.01031233, "balance_loss_clip": 0.96776879, "balance_loss_mlp": 1.02390182, "epoch": 0.9942884626946432, "flos": 17851589406720.0, "grad_norm": 1.9778840746965245, "language_loss": 0.82568908, "learning_rate": 3.350941901199e-10, "loss": 0.8475734, "num_input_tokens_seen": 178495970, "step": 8269, "time_per_iteration": 2.7049789428710938 }, { "auxiliary_loss_clip": 0.01168235, "auxiliary_loss_mlp": 0.01029033, "balance_loss_clip": 0.97086477, "balance_loss_mlp": 1.02193403, "epoch": 0.9944087055852823, "flos": 18796578364800.0, "grad_norm": 7.285446979034458, "language_loss": 0.83219236, "learning_rate": 3.2098693522764066e-10, "loss": 0.85416496, "num_input_tokens_seen": 178509170, "step": 8270, "time_per_iteration": 2.65335750579834 }, { "auxiliary_loss_clip": 0.01171396, "auxiliary_loss_mlp": 0.01122554, "balance_loss_clip": 0.97117901, "balance_loss_mlp": 0.0, "epoch": 0.9945289484759213, "flos": 20996969616000.0, "grad_norm": 1.9685696773811991, "language_loss": 0.81052542, "learning_rate": 3.071830300516165e-10, "loss": 0.83346486, "num_input_tokens_seen": 178527000, "step": 8271, "time_per_iteration": 2.6198856830596924 }, { "auxiliary_loss_clip": 0.01171879, "auxiliary_loss_mlp": 0.01029259, "balance_loss_clip": 1.00959897, "balance_loss_mlp": 1.02149558, "epoch": 0.9946491913665605, "flos": 14756952136320.0, "grad_norm": 2.27458559067366, "language_loss": 0.70926082, "learning_rate": 2.9368247668615234e-10, "loss": 0.73127222, "num_input_tokens_seen": 178545590, "step": 8272, "time_per_iteration": 2.541604518890381 }, { "auxiliary_loss_clip": 0.01173208, "auxiliary_loss_mlp": 0.01026789, "balance_loss_clip": 1.05087948, "balance_loss_mlp": 1.01887393, "epoch": 0.9947694342571995, "flos": 12669931186560.0, "grad_norm": 2.3944288405891703, "language_loss": 0.61391163, "learning_rate": 2.804852771789434e-10, "loss": 0.63591164, "num_input_tokens_seen": 178558890, "step": 8273, "time_per_iteration": 2.4770357608795166 }, { "auxiliary_loss_clip": 0.01166234, "auxiliary_loss_mlp": 0.01027715, "balance_loss_clip": 1.0475173, "balance_loss_mlp": 1.02113152, "epoch": 0.9948896771478386, "flos": 18843442634880.0, "grad_norm": 8.0034490302738, "language_loss": 0.55648458, "learning_rate": 2.675914335321661e-10, "loss": 0.5784241, "num_input_tokens_seen": 178577645, "step": 8274, "time_per_iteration": 2.473422050476074 }, { "auxiliary_loss_clip": 0.01172228, "auxiliary_loss_mlp": 0.01027953, "balance_loss_clip": 1.01023829, "balance_loss_mlp": 1.02045512, "epoch": 0.9950099200384778, "flos": 24900207903360.0, "grad_norm": 10.24225867979148, "language_loss": 0.79994702, "learning_rate": 2.550009477018111e-10, "loss": 0.82194883, "num_input_tokens_seen": 178596415, "step": 8275, "time_per_iteration": 2.588190793991089 }, { "auxiliary_loss_clip": 0.01164088, "auxiliary_loss_mlp": 0.01122874, "balance_loss_clip": 0.97204781, "balance_loss_mlp": 0.0, "epoch": 0.9951301629291168, "flos": 23733613987200.0, "grad_norm": 2.0658297320219945, "language_loss": 0.63019931, "learning_rate": 2.4271382159790634e-10, "loss": 0.65306902, "num_input_tokens_seen": 178613845, "step": 8276, "time_per_iteration": 2.7014319896698 }, { "auxiliary_loss_clip": 0.01172304, "auxiliary_loss_mlp": 0.01030418, "balance_loss_clip": 0.85686976, "balance_loss_mlp": 1.02282393, "epoch": 0.9952504058197559, "flos": 22236893147520.0, "grad_norm": 2.44105861741753, "language_loss": 0.85905492, "learning_rate": 2.3073005708429406e-10, "loss": 0.88108218, "num_input_tokens_seen": 178633490, "step": 8277, "time_per_iteration": 2.7684152126312256 }, { "auxiliary_loss_clip": 0.01159463, "auxiliary_loss_mlp": 0.01021622, "balance_loss_clip": 0.93410671, "balance_loss_mlp": 1.01443696, "epoch": 0.995370648710395, "flos": 21211032718080.0, "grad_norm": 1.6557812203111701, "language_loss": 0.71995556, "learning_rate": 2.190496559788535e-10, "loss": 0.74176645, "num_input_tokens_seen": 178651775, "step": 8278, "time_per_iteration": 2.6490910053253174 }, { "auxiliary_loss_clip": 0.01161099, "auxiliary_loss_mlp": 0.01023798, "balance_loss_clip": 0.9725979, "balance_loss_mlp": 1.01672876, "epoch": 0.9954908916010341, "flos": 14866731077760.0, "grad_norm": 2.6126214338611264, "language_loss": 0.76195955, "learning_rate": 2.0767262005372265e-10, "loss": 0.78380847, "num_input_tokens_seen": 178669290, "step": 8279, "time_per_iteration": 2.6183433532714844 }, { "auxiliary_loss_clip": 0.01168754, "auxiliary_loss_mlp": 0.01026571, "balance_loss_clip": 0.93105996, "balance_loss_mlp": 1.01917386, "epoch": 0.9956111344916732, "flos": 19208259118080.0, "grad_norm": 1.8827449172045831, "language_loss": 0.75090969, "learning_rate": 1.965989510346322e-10, "loss": 0.77286291, "num_input_tokens_seen": 178688410, "step": 8280, "time_per_iteration": 2.6546480655670166 }, { "auxiliary_loss_clip": 0.01157667, "auxiliary_loss_mlp": 0.01022562, "balance_loss_clip": 0.89442396, "balance_loss_mlp": 1.0152787, "epoch": 0.9957313773823123, "flos": 20047060494720.0, "grad_norm": 1.9607965581614577, "language_loss": 0.71075457, "learning_rate": 1.8582865060134955e-10, "loss": 0.73255688, "num_input_tokens_seen": 178706600, "step": 8281, "time_per_iteration": 2.6624186038970947 }, { "auxiliary_loss_clip": 0.01058651, "auxiliary_loss_mlp": 0.01002385, "balance_loss_clip": 1.00797868, "balance_loss_mlp": 1.00075173, "epoch": 0.9958516202729514, "flos": 57483253768320.0, "grad_norm": 0.7976533486050813, "language_loss": 0.5581919, "learning_rate": 1.7536172038790098e-10, "loss": 0.57880223, "num_input_tokens_seen": 178766910, "step": 8282, "time_per_iteration": 3.2308576107025146 }, { "auxiliary_loss_clip": 0.01166023, "auxiliary_loss_mlp": 0.0102784, "balance_loss_clip": 0.97149068, "balance_loss_mlp": 1.02041936, "epoch": 0.9959718631635904, "flos": 27782900974080.0, "grad_norm": 1.9891956612560886, "language_loss": 0.69088662, "learning_rate": 1.651981619819054e-10, "loss": 0.71282518, "num_input_tokens_seen": 178784060, "step": 8283, "time_per_iteration": 2.671172857284546 }, { "auxiliary_loss_clip": 0.01166803, "auxiliary_loss_mlp": 0.01032893, "balance_loss_clip": 0.89355141, "balance_loss_mlp": 1.02617288, "epoch": 0.9960921060542296, "flos": 24024095274240.0, "grad_norm": 2.268638413673641, "language_loss": 0.70954317, "learning_rate": 1.5533797692546257e-10, "loss": 0.73154014, "num_input_tokens_seen": 178802795, "step": 8284, "time_per_iteration": 3.5174410343170166 }, { "auxiliary_loss_clip": 0.0115803, "auxiliary_loss_mlp": 0.01025483, "balance_loss_clip": 1.0056107, "balance_loss_mlp": 1.01775837, "epoch": 0.9962123489448687, "flos": 18697393935360.0, "grad_norm": 1.862942456158555, "language_loss": 0.84098089, "learning_rate": 1.4578116671404296e-10, "loss": 0.86281598, "num_input_tokens_seen": 178821075, "step": 8285, "time_per_iteration": 2.5854506492614746 }, { "auxiliary_loss_clip": 0.01162479, "auxiliary_loss_mlp": 0.01026486, "balance_loss_clip": 1.01093185, "balance_loss_mlp": 1.01956296, "epoch": 0.9963325918355077, "flos": 20010754823040.0, "grad_norm": 1.9476591935110805, "language_loss": 0.71390468, "learning_rate": 1.3652773279759777e-10, "loss": 0.73579431, "num_input_tokens_seen": 178837725, "step": 8286, "time_per_iteration": 2.6518497467041016 }, { "auxiliary_loss_clip": 0.01164435, "auxiliary_loss_mlp": 0.01022921, "balance_loss_clip": 1.00955558, "balance_loss_mlp": 1.01549387, "epoch": 0.9964528347261468, "flos": 33108488991360.0, "grad_norm": 1.6776806857995483, "language_loss": 0.62930238, "learning_rate": 1.2757767657989305e-10, "loss": 0.65117586, "num_input_tokens_seen": 178861515, "step": 8287, "time_per_iteration": 2.727675437927246 }, { "auxiliary_loss_clip": 0.01164253, "auxiliary_loss_mlp": 0.01021404, "balance_loss_clip": 1.01061201, "balance_loss_mlp": 1.01437688, "epoch": 0.9965730776167859, "flos": 23109342589440.0, "grad_norm": 2.5937989129828862, "language_loss": 0.87135011, "learning_rate": 1.1893099941850948e-10, "loss": 0.89320666, "num_input_tokens_seen": 178880410, "step": 8288, "time_per_iteration": 2.793147087097168 }, { "auxiliary_loss_clip": 0.01169364, "auxiliary_loss_mlp": 0.01028816, "balance_loss_clip": 0.96937704, "balance_loss_mlp": 1.02162504, "epoch": 0.996693320507425, "flos": 22965843755520.0, "grad_norm": 7.223355164932529, "language_loss": 0.77658606, "learning_rate": 1.105877026252866e-10, "loss": 0.79856783, "num_input_tokens_seen": 178898740, "step": 8289, "time_per_iteration": 3.6463546752929688 }, { "auxiliary_loss_clip": 0.01169272, "auxiliary_loss_mlp": 0.01026352, "balance_loss_clip": 1.04861856, "balance_loss_mlp": 1.01918113, "epoch": 0.996813563398064, "flos": 13222740476160.0, "grad_norm": 1.9783157692104834, "language_loss": 0.71830761, "learning_rate": 1.0254778746565663e-10, "loss": 0.74026382, "num_input_tokens_seen": 178914015, "step": 8290, "time_per_iteration": 2.5482873916625977 }, { "auxiliary_loss_clip": 0.01161089, "auxiliary_loss_mlp": 0.01023115, "balance_loss_clip": 0.93421394, "balance_loss_mlp": 1.01642489, "epoch": 0.9969338062887032, "flos": 14647855553280.0, "grad_norm": 2.156409276759633, "language_loss": 0.72941518, "learning_rate": 9.481125515953259e-11, "loss": 0.75125718, "num_input_tokens_seen": 178932075, "step": 8291, "time_per_iteration": 2.7635416984558105 }, { "auxiliary_loss_clip": 0.01160989, "auxiliary_loss_mlp": 0.01024618, "balance_loss_clip": 0.8906526, "balance_loss_mlp": 1.01734638, "epoch": 0.9970540491793423, "flos": 25735741142400.0, "grad_norm": 2.066142949639567, "language_loss": 0.79821813, "learning_rate": 8.737810688064228e-11, "loss": 0.82007426, "num_input_tokens_seen": 178951910, "step": 8292, "time_per_iteration": 3.5747299194335938 }, { "auxiliary_loss_clip": 0.01158698, "auxiliary_loss_mlp": 0.01034874, "balance_loss_clip": 0.93348217, "balance_loss_mlp": 1.02721453, "epoch": 0.9971742920699813, "flos": 21470236237440.0, "grad_norm": 2.0145356268601473, "language_loss": 0.79387248, "learning_rate": 8.024834375608414e-11, "loss": 0.81580818, "num_input_tokens_seen": 178970500, "step": 8293, "time_per_iteration": 3.5206480026245117 }, { "auxiliary_loss_clip": 0.01058594, "auxiliary_loss_mlp": 0.01002147, "balance_loss_clip": 1.00798726, "balance_loss_mlp": 1.00051427, "epoch": 0.9972945349606205, "flos": 72211223629440.0, "grad_norm": 0.8309940912371861, "language_loss": 0.62838948, "learning_rate": 7.342196686788149e-11, "loss": 0.64899695, "num_input_tokens_seen": 179023665, "step": 8294, "time_per_iteration": 3.050913095474243 }, { "auxiliary_loss_clip": 0.01166783, "auxiliary_loss_mlp": 0.01028107, "balance_loss_clip": 0.97539806, "balance_loss_mlp": 1.02066278, "epoch": 0.9974147778512595, "flos": 19678293515520.0, "grad_norm": 2.7452692551170443, "language_loss": 0.68630064, "learning_rate": 6.689897725142834e-11, "loss": 0.70824957, "num_input_tokens_seen": 179043140, "step": 8295, "time_per_iteration": 2.6107382774353027 }, { "auxiliary_loss_clip": 0.01166343, "auxiliary_loss_mlp": 0.01026242, "balance_loss_clip": 0.97106886, "balance_loss_mlp": 1.01926816, "epoch": 0.9975350207418986, "flos": 15960821391360.0, "grad_norm": 3.051929357036058, "language_loss": 0.88430822, "learning_rate": 6.067937589615545e-11, "loss": 0.90623403, "num_input_tokens_seen": 179061215, "step": 8296, "time_per_iteration": 2.627230405807495 }, { "auxiliary_loss_clip": 0.0106568, "auxiliary_loss_mlp": 0.01003494, "balance_loss_clip": 0.93338722, "balance_loss_mlp": 1.00170636, "epoch": 0.9976552636325378, "flos": 59961879768960.0, "grad_norm": 0.7693867517206213, "language_loss": 0.57808965, "learning_rate": 5.476316374575241e-11, "loss": 0.59878135, "num_input_tokens_seen": 179124700, "step": 8297, "time_per_iteration": 3.187525749206543 }, { "auxiliary_loss_clip": 0.01169055, "auxiliary_loss_mlp": 0.01022752, "balance_loss_clip": 1.04888391, "balance_loss_mlp": 1.01533723, "epoch": 0.9977755065231768, "flos": 22487872452480.0, "grad_norm": 3.539751562039569, "language_loss": 0.72465587, "learning_rate": 4.9150341697723476e-11, "loss": 0.74657393, "num_input_tokens_seen": 179144590, "step": 8298, "time_per_iteration": 2.5834240913391113 }, { "auxiliary_loss_clip": 0.01163303, "auxiliary_loss_mlp": 0.01025213, "balance_loss_clip": 0.97350234, "balance_loss_mlp": 1.01768851, "epoch": 0.9978957494138159, "flos": 26030280666240.0, "grad_norm": 1.4660574793606551, "language_loss": 0.66502494, "learning_rate": 4.384091060338768e-11, "loss": 0.68691015, "num_input_tokens_seen": 179165060, "step": 8299, "time_per_iteration": 2.713655948638916 }, { "auxiliary_loss_clip": 0.01163229, "auxiliary_loss_mlp": 0.01021106, "balance_loss_clip": 1.00864458, "balance_loss_mlp": 1.01398349, "epoch": 0.998015992304455, "flos": 22637835734400.0, "grad_norm": 2.2003122855184363, "language_loss": 0.73798525, "learning_rate": 3.883487126810081e-11, "loss": 0.75982857, "num_input_tokens_seen": 179184320, "step": 8300, "time_per_iteration": 2.6131551265716553 }, { "auxiliary_loss_clip": 0.01154199, "auxiliary_loss_mlp": 0.01028109, "balance_loss_clip": 1.00640416, "balance_loss_mlp": 1.02112889, "epoch": 0.9981362351950941, "flos": 18223444955520.0, "grad_norm": 1.6649587389392542, "language_loss": 0.79165602, "learning_rate": 3.41322244516995e-11, "loss": 0.81347907, "num_input_tokens_seen": 179202265, "step": 8301, "time_per_iteration": 2.659653425216675 }, { "auxiliary_loss_clip": 0.01149129, "auxiliary_loss_mlp": 0.01027038, "balance_loss_clip": 0.89307189, "balance_loss_mlp": 1.01986718, "epoch": 0.9982564780857331, "flos": 33474095573760.0, "grad_norm": 1.5850859284159626, "language_loss": 0.632581, "learning_rate": 2.9732970866946925e-11, "loss": 0.65434259, "num_input_tokens_seen": 179222145, "step": 8302, "time_per_iteration": 2.8030622005462646 }, { "auxiliary_loss_clip": 0.01144733, "auxiliary_loss_mlp": 0.01026801, "balance_loss_clip": 0.92709476, "balance_loss_mlp": 1.01925516, "epoch": 0.9983767209763723, "flos": 15523465392000.0, "grad_norm": 2.264777289805609, "language_loss": 0.78013581, "learning_rate": 2.563711118175327e-11, "loss": 0.80185109, "num_input_tokens_seen": 179239030, "step": 8303, "time_per_iteration": 2.72644305229187 }, { "auxiliary_loss_clip": 0.01155415, "auxiliary_loss_mlp": 0.01028487, "balance_loss_clip": 0.9327054, "balance_loss_mlp": 1.02125061, "epoch": 0.9984969638670114, "flos": 19974377324160.0, "grad_norm": 1.7475372961672306, "language_loss": 0.8360945, "learning_rate": 2.184464601717728e-11, "loss": 0.85793352, "num_input_tokens_seen": 179257345, "step": 8304, "time_per_iteration": 2.691899538040161 }, { "auxiliary_loss_clip": 0.01173167, "auxiliary_loss_mlp": 0.01029492, "balance_loss_clip": 1.01287079, "balance_loss_mlp": 1.02192187, "epoch": 0.9986172067576504, "flos": 20375750874240.0, "grad_norm": 2.251128376837346, "language_loss": 0.7791574, "learning_rate": 1.8355575948758585e-11, "loss": 0.80118406, "num_input_tokens_seen": 179275330, "step": 8305, "time_per_iteration": 2.661508798599243 }, { "auxiliary_loss_clip": 0.01162052, "auxiliary_loss_mlp": 0.01029243, "balance_loss_clip": 0.96890157, "balance_loss_mlp": 1.0221324, "epoch": 0.9987374496482896, "flos": 23727903724800.0, "grad_norm": 3.2051449451667553, "language_loss": 0.73267746, "learning_rate": 1.5169901505407424e-11, "loss": 0.75459039, "num_input_tokens_seen": 179292395, "step": 8306, "time_per_iteration": 2.665259599685669 }, { "auxiliary_loss_clip": 0.01164183, "auxiliary_loss_mlp": 0.01023687, "balance_loss_clip": 0.97188544, "balance_loss_mlp": 1.01686227, "epoch": 0.9988576925389286, "flos": 25044029959680.0, "grad_norm": 1.7337527676020181, "language_loss": 0.73823845, "learning_rate": 1.228762317073695e-11, "loss": 0.76011717, "num_input_tokens_seen": 179311225, "step": 8307, "time_per_iteration": 2.691427230834961 }, { "auxiliary_loss_clip": 0.01166958, "auxiliary_loss_mlp": 0.01024188, "balance_loss_clip": 0.97242594, "balance_loss_mlp": 1.01704121, "epoch": 0.9989779354295677, "flos": 31285627637760.0, "grad_norm": 1.968924370125575, "language_loss": 0.79148412, "learning_rate": 9.70874138195299e-12, "loss": 0.81339556, "num_input_tokens_seen": 179333135, "step": 8308, "time_per_iteration": 2.72015643119812 }, { "auxiliary_loss_clip": 0.0116649, "auxiliary_loss_mlp": 0.01027871, "balance_loss_clip": 1.04672956, "balance_loss_mlp": 1.02067316, "epoch": 0.9990981783202069, "flos": 19573398823680.0, "grad_norm": 1.4721463305882783, "language_loss": 0.74422157, "learning_rate": 7.433256530076093e-12, "loss": 0.76616514, "num_input_tokens_seen": 179353090, "step": 8309, "time_per_iteration": 2.6334171295166016 }, { "auxiliary_loss_clip": 0.01165207, "auxiliary_loss_mlp": 0.01026637, "balance_loss_clip": 0.89153701, "balance_loss_mlp": 1.01980972, "epoch": 0.9992184212108459, "flos": 17199667514880.0, "grad_norm": 2.214245808027268, "language_loss": 0.75829619, "learning_rate": 5.46116896038562e-12, "loss": 0.78021467, "num_input_tokens_seen": 179367500, "step": 8310, "time_per_iteration": 3.638455867767334 }, { "auxiliary_loss_clip": 0.01162533, "auxiliary_loss_mlp": 0.01030692, "balance_loss_clip": 0.97097754, "balance_loss_mlp": 1.02375412, "epoch": 0.999338664101485, "flos": 46497853681920.0, "grad_norm": 2.024375291831616, "language_loss": 0.62008607, "learning_rate": 3.792478972197699e-12, "loss": 0.64201832, "num_input_tokens_seen": 179388085, "step": 8311, "time_per_iteration": 2.8497397899627686 }, { "auxiliary_loss_clip": 0.01165296, "auxiliary_loss_mlp": 0.01021938, "balance_loss_clip": 1.04668891, "balance_loss_mlp": 1.01542926, "epoch": 0.9994589069921241, "flos": 15158253859200.0, "grad_norm": 3.0324716757814993, "language_loss": 0.6996752, "learning_rate": 2.4271868181990895e-12, "loss": 0.72154748, "num_input_tokens_seen": 179405250, "step": 8312, "time_per_iteration": 2.584022283554077 }, { "auxiliary_loss_clip": 0.01165848, "auxiliary_loss_mlp": 0.01024764, "balance_loss_clip": 1.0080986, "balance_loss_mlp": 1.01797175, "epoch": 0.9995791498827632, "flos": 12531460256640.0, "grad_norm": 2.2966204659285996, "language_loss": 0.81380367, "learning_rate": 1.3652927060014973e-12, "loss": 0.83570981, "num_input_tokens_seen": 179420845, "step": 8313, "time_per_iteration": 2.5421054363250732 }, { "auxiliary_loss_clip": 0.01170085, "auxiliary_loss_mlp": 0.01027731, "balance_loss_clip": 0.93377125, "balance_loss_mlp": 1.02048278, "epoch": 0.9996993927734023, "flos": 19245175320960.0, "grad_norm": 1.865592856267681, "language_loss": 0.6368469, "learning_rate": 6.067967965872612e-13, "loss": 0.65882504, "num_input_tokens_seen": 179440455, "step": 8314, "time_per_iteration": 2.635129451751709 }, { "auxiliary_loss_clip": 0.01164071, "auxiliary_loss_mlp": 0.01026445, "balance_loss_clip": 0.93488008, "balance_loss_mlp": 1.01883674, "epoch": 0.9998196356640414, "flos": 62952804518400.0, "grad_norm": 1.523760286089498, "language_loss": 0.76875007, "learning_rate": 1.5169920497548615e-13, "loss": 0.79065526, "num_input_tokens_seen": 179465075, "step": 8315, "time_per_iteration": 3.923131227493286 }, { "auxiliary_loss_clip": 0.01117112, "auxiliary_loss_mlp": 0.01012033, "balance_loss_clip": 0.99012351, "balance_loss_mlp": 1.00749719, "epoch": 0.9999398785546805, "flos": 50922375073920.0, "grad_norm": 1.2048408670407436, "language_loss": 0.55020851, "learning_rate": 0.0, "loss": 0.57149994, "num_input_tokens_seen": 179513955, "step": 8316, "time_per_iteration": 3.1828629970550537 }, { "epoch": 0.9999398785546805, "num_input_tokens_seen": 179513955, "step": 8316, "total_flos": 6.996749092776837e+17, "train_loss": 0.7894440462679287, "train_runtime": 25495.1894, "train_samples_per_second": 13.047, "train_steps_per_second": 0.326 } ], "logging_steps": 1.0, "max_steps": 8316, "num_input_tokens_seen": 179513955, "num_train_epochs": 1, "save_steps": 1664, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.996749092776837e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }